Package org.apache.lucene.util.automaton

Examples of org.apache.lucene.util.automaton.Automaton


 
  // LUCENE-3849
  public void testStopwordsPosIncHole2() throws Exception {
    // use two stopfilters for testing here
    Directory dir = newDirectory();
    final Automaton secondSet = BasicAutomata.makeString("foobar");
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
View Full Code Here


        try {
            List<Term> terms = new ArrayList<Term>();
            Term onTerm = newFulltextTerm(token, fieldName);
            Terms t = MultiFields.getTerms(reader, onTerm.field());
            Automaton a = WildcardQuery.toAutomaton(onTerm);
            CompiledAutomaton ca = new CompiledAutomaton(a);
            TermsEnum te = ca.getTermsEnum(t);
            BytesRef text;
            while ((text = te.next()) != null) {
                terms.add(newFulltextTerm(text.utf8ToString(), fieldName));
View Full Code Here

        try {
            List<Term> terms = new ArrayList<Term>();
            Term onTerm = newFulltextTerm(token, fieldName);
            Terms t = MultiFields.getTerms(reader, onTerm.field());
            Automaton a = WildcardQuery.toAutomaton(onTerm);
            CompiledAutomaton ca = new CompiledAutomaton(a);
            TermsEnum te = ca.getTermsEnum(t);
            BytesRef text;
            while ((text = te.next()) != null) {
                terms.add(newFulltextTerm(text.utf8ToString(), fieldName));
View Full Code Here

      // NOTE: not great that we ask the suggester to give
      // us the "answer key" (ie maybe we have a bug in
      // suggester.toLevA ...) ... but testRandom2() fixes
      // this:
      Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));
      assertTrue(automaton.isDeterministic());
      // TODO: could be faster... but it's slowCompletor for a reason
      BytesRef spare = new BytesRef();
      for (TermFreq2 e : slowCompletor) {
        spare.copyChars(e.analyzedForm);
        Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
        for (IntsRef intsRef : finiteStrings) {
          State p = automaton.getInitialState();
          BytesRef ref = Util.toBytesRef(intsRef, spare);
          boolean added = false;
          for (int i = ref.offset; i < ref.length; i++) {
            State q = p.step(ref.bytes[i] & 0xff);
            if (q == null) {
View Full Code Here

  /** Pulls the graph (including {@link
   *  PositionLengthAttribute}) from the provided {@link
   *  TokenStream}, and creates the corresponding
   *  automaton where arcs are bytes from each term. */
  public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton a = new Automaton();
    boolean deterministic = true;

    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);

    final BytesRef term = termBytesAtt.getBytesRef();

    in.reset();

    // Only temporarily holds states ahead of our current
    // position:

    final RollingBuffer<Position> positions = new Positions();

    int pos = -1;
    Position posData = null;

    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      assert pos > -1 || posInc > 0;

      if (posInc > 0) {

        // New node:
        pos += posInc;

        posData = positions.get(pos);
        assert posData.leaving == null;

        if (posData.arriving == null) {
          // No token ever arrived to this position
          if (pos == 0) {
            // OK: this is the first token
            posData.leaving = a.getInitialState();
          } else {
            // This means there's a hole (eg, StopFilter
            // does this):
            posData.leaving = new State();
            addHoles(a.getInitialState(), positions, pos);
          }
        } else {
          posData.leaving = new State();
          posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
          if (posInc > 1) {
            // A token spanned over a hole; add holes
            // "under" it:
            addHoles(a.getInitialState(), positions, pos);
          }
        }
        positions.freeBefore(pos);
      } else {
        // note: this isn't necessarily true. its just that we aren't surely det.
        // we could optimize this further (e.g. buffer and sort synonyms at a position)
        // but thats probably overkill. this is cheap and dirty
        deterministic = false;
      }

      final int endPos = pos + posLengthAtt.getPositionLength();

      termBytesAtt.fillBytesRef();
      final BytesRef term2 = changeToken(term);
      final Position endPosData = positions.get(endPos);
      if (endPosData.arriving == null) {
        endPosData.arriving = new State();
      }

      State state = posData.leaving;
      for(int byteIDX=0;byteIDX<term2.length;byteIDX++) {
        final State nextState = byteIDX == term2.length-1 ? endPosData.arriving : new State();
        state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
        state = nextState;
      }
    }

    pos++;
    while (pos <= positions.getMaxPos()) {
      posData = positions.get(pos);
      if (posData.arriving != null) {
        posData.arriving.setAccept(true);
      }
      pos++;
    }

    //toDot(a);
    a.setDeterministic(deterministic);
    return a;
  }
View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = BasicAutomata.makeString("abc");
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("b", 3, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

      new Token[] {
        token("a", 1, 1),
        token("x", 0, 3),
        token("b", 3, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
    final Automaton expected = BasicOperations.union(a1, a2);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 1, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected =  join("abc", "def");

    //toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 2, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);

    final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));

    //toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.automaton.Automaton

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.