Examples of org.apache.lucene.util.automaton.Automaton

Package org.apache.lucene.util.automaton

Examples of org.apache.lucene.util.automaton.Automaton

org.apache.lucene.util.automaton.Automaton
Finite-state automaton with regular expression operations.
Class invariants:
- An automaton is either represented explicitly (with {@link State} and {@link Transition} objects) or with a singleton string (see {@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton is known to accept exactly one string. (Implicitly, all states and transitions of an automaton are reachable from its initial state.)
- Automata are always reduced (see {@link #reduce()}) and have no transitions to dead states (see {@link #removeDeadTransitions()}).
- If an automaton is nondeterministic, then {@link #isDeterministic()} returns false (but the converse is not required).
- Automata provided as input to operations are generally assumed to be disjoint.
If the states or transitions are manipulated manually, the {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methods should be used afterwards to restore representation invariants that are assumed by the built-in automata operations.

Note: This class has internal mutable state and is not thread safe. It is the caller's responsibility to ensure any necessary synchronization if you wish to use the same Automaton from multiple threads. In general it is instead recommended to use a {@link RunAutomaton} for multithreaded matching: it is immutable, thread safe, and much faster.
@lucene.experimental

  }


  @Override
  protected Automaton convertAutomaton(Automaton a) {
    if (unicodeAware) {
      Automaton utf8automaton = new UTF32ToUTF8().convert(a);
      BasicOperations.determinize(utf8automaton);
      return utf8automaton;
    } else {
      return a;
    }

View Full Code Here

    return tsta;
  }


  Automaton toLevenshteinAutomata(Automaton automaton) {
    final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
    Automaton subs[] = new Automaton[ref.size()];
    int upto = 0;
    for (IntsRef path : ref) {
      if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
        subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
        upto++;
      } else {
        Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
        int ints[] = new int[path.length-nonFuzzyPrefix];
        System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
        // TODO: maybe add alphaMin to LevenshteinAutomata,
        // and pass 1 instead of 0?  We probably don't want
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
        LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
        Automaton levAutomaton = lev.toAutomaton(maxEdits);
        Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
        combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
        subs[upto] = combined;
        upto++;
      }
    }


    if (subs.length == 0) {
      // automaton is empty, there is no accepted paths through it
      return BasicAutomata.makeEmpty(); // matches nothing
    } else if (subs.length == 1) {
      // no synonyms or anything: just a single path through the tokenstream
      return subs[0];
    } else {
      // multiple paths: this is really scary! is it slow?
      // maybe we should not do this and throw UOE?
      Automaton a = BasicOperations.union(Arrays.asList(subs));
      // TODO: we could call toLevenshteinAutomata() before det? 
      // this only happens if you have multiple paths anyway (e.g. synonyms)
      BasicOperations.determinize(a);


      return a;

View Full Code Here

    for (int i = 0; i <= 0x10FFFF; i++) {
      if (Character.isLetter(i)) {
        initial.addTransition(new Transition(i, i, accept));
      }
    }
    Automaton single = new Automaton(initial);
    single.reduce();
    Automaton repeat = BasicOperations.repeat(single);
    jvmLetter = new CharacterRunAutomaton(repeat);
  }

View Full Code Here

      }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {


      Automaton lookupAutomaton = toLookupAutomaton(key);


      final CharsRef spare = new CharsRef();


      //System.out.println("  now intersect exactFirst=" + exactFirst);

View Full Code Here

    return prefixPaths;
  }
  
  final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    // Analyze surface form:
    Automaton automaton = null;
    TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
    try {


      // Create corresponding automaton: labels are bytes
      // from each analyzed token, with byte 0 used as

View Full Code Here

  }


  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;
    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    try {
      automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
    } finally {
      IOUtils.closeWhileHandlingException(ts);

View Full Code Here


    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = BasicAutomata.makeString("abc");
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("b", 3, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

      new Token[] {
        token("a", 1, 1),
        token("x", 0, 3),
        token("b", 3, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b")); 
    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b")); 
    final Automaton expected = BasicOperations.union(a1, a2);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 1, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected =  join("abc", "def");


    //toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.util.automaton.Automaton

org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex

org.apache.jackrabbit.oak.plugins.index.lucene.LucenePropertyIndex

org.apache.lucene.analysis.core.TestDuelingAnalyzers

org.apache.lucene.analysis.TestGraphTokenizers

org.apache.lucene.analysis.TokenStreamToAutomaton

org.apache.lucene.index.TestIndexWriter

org.apache.lucene.index.TestTermsEnum

org.apache.lucene.search.FuzzyTermsEnum

org.apache.lucene.search.postingshighlight.MultiTermHighlighting

org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.