Examples of org.apache.lucene.util.automaton.Automaton

Package org.apache.lucene.util.automaton

Examples of org.apache.lucene.util.automaton.Automaton

org.apache.lucene.util.automaton.Automaton
Finite-state automaton with regular expression operations.
Class invariants:
- An automaton is either represented explicitly (with {@link State} and {@link Transition} objects) or with a singleton string (see {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton is known to accept exactly one string. (Implicitly, all states and transitions of an automaton are reachable from its initial state.)
- Automata are always reduced (see {@link #reduce()}) and have no transitions to dead states (see {@link #removeDeadTransitions()}).
- If an automaton is nondeterministic, then {@link #isDeterministic()} returns false (but the converse is not required).
- Automata provided as input to operations are generally assumed to be disjoint.
If the states or transitions are manipulated manually, the {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods should be used afterwards to restore representation invariants that are assumed by the built-in automata operations.

Note: This class has internal mutable state and is not thread safe. It is the caller's responsibility to ensure any necessary synchronization if you wish to use the same Automaton from multiple threads. In general it is instead recommended to use a {@link RunAutomaton} for multithreaded matching: it is immutable, thread safe, and much faster.
@lucene.experimental

  
  // LUCENE-3849
  public void testStopwordsPosIncHole2() throws Exception {
    // use two stopfilters for testing here
    Directory dir = newDirectory();
    final Automaton secondSet = BasicAutomata.makeString("foobar");
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);

View Full Code Here


        try {
            List<Term> terms = new ArrayList<Term>();
            Term onTerm = newFulltextTerm(token, fieldName);
            Terms t = MultiFields.getTerms(reader, onTerm.field());
            Automaton a = WildcardQuery.toAutomaton(onTerm);
            CompiledAutomaton ca = new CompiledAutomaton(a);
            TermsEnum te = ca.getTermsEnum(t);
            BytesRef text;
            while ((text = te.next()) != null) {
                terms.add(newFulltextTerm(text.utf8ToString(), fieldName));

View Full Code Here


        try {
            List<Term> terms = new ArrayList<Term>();
            Term onTerm = newFulltextTerm(token, fieldName);
            Terms t = MultiFields.getTerms(reader, onTerm.field());
            Automaton a = WildcardQuery.toAutomaton(onTerm);
            CompiledAutomaton ca = new CompiledAutomaton(a);
            TermsEnum te = ca.getTermsEnum(t);
            BytesRef text;
            while ((text = te.next()) != null) {
                terms.add(newFulltextTerm(text.utf8ToString(), fieldName));

View Full Code Here


      // NOTE: not great that we ask the suggester to give
      // us the "answer key" (ie maybe we have a bug in
      // suggester.toLevA ...) ... but testRandom2() fixes
      // this:
      Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));
      assertTrue(automaton.isDeterministic());
      // TODO: could be faster... but its slowCompletor for a reason
      BytesRef spare = new BytesRef();
      for (TermFreq2 e : slowCompletor) {
        spare.copyChars(e.analyzedForm);
        Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
        for (IntsRef intsRef : finiteStrings) {
          State p = automaton.getInitialState();
          BytesRef ref = Util.toBytesRef(intsRef, spare);
          boolean added = false;
          for (int i = ref.offset; i < ref.length; i++) {
            State q = p.step(ref.bytes[i] & 0xff);
            if (q == null) {

View Full Code Here

  /** Pulls the graph (including {@link
   *  PositionLengthAttribute}) from the provided {@link
   *  TokenStream}, and creates the corresponding
   *  automaton where arcs are bytes from each term. */
  public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton a = new Automaton();
    boolean deterministic = true;


    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);


    final BytesRef term = termBytesAtt.getBytesRef();


    in.reset();


    // Only temporarily holds states ahead of our current
    // position:


    final RollingBuffer<Position> positions = new Positions();


    int pos = -1;
    Position posData = null;


    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      assert pos > -1 || posInc > 0;


      if (posInc > 0) {


        // New node:
        pos += posInc;


        posData = positions.get(pos);
        assert posData.leaving == null;


        if (posData.arriving == null) {
          // No token ever arrived to this position
          if (pos == 0) {
            // OK: this is the first token
            posData.leaving = a.getInitialState();
          } else {
            // This means there's a hole (eg, StopFilter
            // does this):
            posData.leaving = new State();
            addHoles(a.getInitialState(), positions, pos);
          }
        } else {
          posData.leaving = new State();
          posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
          if (posInc > 1) {
            // A token spanned over a hole; add holes
            // "under" it:
            addHoles(a.getInitialState(), positions, pos);
          }
        }
        positions.freeBefore(pos);
      } else {
        // note: this isn't necessarily true. its just that we aren't surely det.
        // we could optimize this further (e.g. buffer and sort synonyms at a position)
        // but thats probably overkill. this is cheap and dirty
        deterministic = false;
      }


      final int endPos = pos + posLengthAtt.getPositionLength();


      termBytesAtt.fillBytesRef();
      final BytesRef term2 = changeToken(term);
      final Position endPosData = positions.get(endPos);
      if (endPosData.arriving == null) {
        endPosData.arriving = new State();
      }


      State state = posData.leaving;
      for(int byteIDX=0;byteIDX<term2.length;byteIDX++) {
        final State nextState = byteIDX == term2.length-1 ? endPosData.arriving : new State();
        state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
        state = nextState;
      }
    }


    pos++;
    while (pos <= positions.getMaxPos()) {
      posData = positions.get(pos);
      if (posData.arriving != null) {
        posData.arriving.setAccept(true);
      }
      pos++;
    }


    //toDot(a);
    a.setDeterministic(deterministic);
    return a;
  }

View Full Code Here


    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = BasicAutomata.makeString("abc");
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("b", 3, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

      new Token[] {
        token("a", 1, 1),
        token("x", 0, 3),
        token("b", 3, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b")); 
    final Automaton expected = BasicOperations.union(a1, a2);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 1, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected =  join("abc", "def");


    //toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 2, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);


    final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def"));


    //toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.util.automaton.Automaton

org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex

org.apache.jackrabbit.oak.plugins.index.lucene.LucenePropertyIndex

org.apache.lucene.analysis.core.TestDuelingAnalyzers

org.apache.lucene.analysis.TestGraphTokenizers

org.apache.lucene.analysis.TokenStreamToAutomaton

org.apache.lucene.index.TestIndexWriter

org.apache.lucene.index.TestTermsEnum

org.apache.lucene.search.FuzzyTermsEnum

org.apache.lucene.search.postingshighlight.MultiTermHighlighting

org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.