Examples of org.apache.lucene.util.automaton.Automaton

Package org.apache.lucene.util.automaton

Examples of org.apache.lucene.util.automaton.Automaton

org.apache.lucene.util.automaton.Automaton
Finite-state automaton with regular expression operations.
Class invariants:
- An automaton is either represented explicitly (with {@link State} and{@link Transition} objects) or with a singleton string (see{@link #getSingleton()} and {@link #expandSingleton()}) in case the automaton is known to accept exactly one string. (Implicitly, all states and transitions of an automaton are reachable from its initial state.)
- Automata are always reduced (see {@link #reduce()}) and have no transitions to dead states (see {@link #removeDeadTransitions()}).
- If an automaton is nondeterministic, then {@link #isDeterministic()}returns false (but the converse is not required).
- Automata provided as input to operations are generally assumed to be disjoint.
If the states or transitions are manipulated manually, the {@link #restoreInvariant()} and {@link #setDeterministic(boolean)} methodsshould be used afterwards to restore representation invariants that are assumed by the built-in automata operations.

Note: This class has internal mutable state and is not thread safe. It is the caller's responsibility to ensure any necessary synchronization if you wish to use the same Automaton from multiple threads. In general it is instead recommended to use a {@link RunAutomaton} for multithreaded matching: it is immutable, thread safe, and much faster.
@lucene.experimental

      }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {


      Automaton lookupAutomaton = toLookupAutomaton(key);


      final CharsRef spare = new CharsRef();


      //System.out.println("  now intersect exactFirst=" + exactFirst);

View Full Code Here

    return prefixPaths;
  }
  
  final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    // Analyze surface form:
    Automaton automaton = null;
    TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
    try {


      // Create corresponding automaton: labels are bytes
      // from each analyzed token, with byte 0 used as

View Full Code Here

  }


  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;
    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    try {
      automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
    } finally {
      IOUtils.closeWhileHandlingException(ts);

View Full Code Here

   * determinized)
   */
  public void testNFA() throws IOException {
    // accept this or three, the union is an NFA (two transitions for 't' from
    // initial state)
    Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"),
        BasicAutomata.makeString("three"));
    assertAutomatonHits(2, nfa);
  }

View Full Code Here

  /**
   * Test that rewriting to a prefix query works as expected, preserves
   * MultiTermQuery semantics.
   */
  public void testRewritePrefix() throws IOException {
    Automaton pfx = BasicAutomata.makeString("do");
    pfx.expandSingleton(); // expand singleton representation for testing
    Automaton prefixAutomaton = BasicOperations.concatenate(pfx, BasicAutomata
        .makeAnyString());
    AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
    assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum);
    assertEquals(3, automatonQueryNrHits(aq));

View Full Code Here

    DirectoryReader r = w.getReader();
    w.close();
    AtomicReader sub = getOnlySegmentReader(r);
    Terms terms = sub.fields().terms("field");


    Automaton automaton = new RegExp(".*d", RegExp.NONE).toAutomaton();
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    
    TermsEnum te;
    
    // should seek to startTerm
    te = terms.intersect(ca, new BytesRef("aad"));

View Full Code Here

    DirectoryReader r = w.getReader();
    w.close();
    AtomicReader sub = getOnlySegmentReader(r);
    Terms terms = sub.fields().terms("field");


    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();  // accept ALL
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    


    TermsEnum te = terms.intersect(ca, null);
    DocsEnum de;

View Full Code Here

      // From the random terms, pick some ratio and compile an
      // automaton:
      final Set<String> acceptTerms = new HashSet<String>();
      final TreeSet<BytesRef> sortedAcceptTerms = new TreeSet<BytesRef>();
      final double keepPct = random().nextDouble();
      Automaton a;
      if (iter == 0) {
        if (VERBOSE) {
          System.out.println("\nTEST: empty automaton");
        }
        a = BasicAutomata.makeEmpty();
      } else {
        if (VERBOSE) {
          System.out.println("\nTEST: keepPct=" + keepPct);
        }
        for (String s : terms) {
          final String s2;
          if (random().nextDouble() <= keepPct) {
            s2 = s;
          } else {
            s2 = getRandomString();
          }
          acceptTerms.add(s2);
          sortedAcceptTerms.add(new BytesRef(s2));
        }
        a = BasicAutomata.makeStringUnion(sortedAcceptTerms);
      }
      
      if (random().nextBoolean()) {
        if (VERBOSE) {
          System.out.println("TEST: reduce the automaton");
        }
        a.reduce();
      }


      final CompiledAutomaton c = new CompiledAutomaton(a, true, false);


      final BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.size()];
      final Set<BytesRef> acceptTermsSet = new HashSet<BytesRef>();
      int upto = 0;
      for(String s : acceptTerms) {
        final BytesRef b = new BytesRef(s);
        acceptTermsArray[upto++] = b;
        acceptTermsSet.add(b);
        assertTrue(accepts(c, b));
      }
      Arrays.sort(acceptTermsArray);


      if (VERBOSE) {
        System.out.println("\nTEST: accept terms (unicode order):");
        for(BytesRef t : acceptTermsArray) {
          System.out.println("  " + t.utf8ToString() + (termsSet.contains(t) ? " (exists)" : ""));
        }
        System.out.println(a.toDot());
      }


      for(int iter2=0;iter2<100;iter2++) {
        final BytesRef startTerm = acceptTermsArray.length == 0 || random().nextBoolean() ? null : acceptTermsArray[random().nextInt(acceptTermsArray.length)];

View Full Code Here

    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    w.close();
    AtomicReader sub = getOnlySegmentReader(r);
    Terms terms = sub.fields().terms("field");
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();    
    CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);    
    TermsEnum te = terms.intersect(ca, null);
    assertEquals("aaa", te.next().utf8ToString());
    assertEquals(0, te.docs(null, null, DocsEnum.FLAG_NONE).nextDoc());
    assertEquals("bbb", te.next().utf8ToString());

View Full Code Here

        }
        int termLength = termText.length;
        int prefixLength = Math.min(fq.getPrefixLength(), termLength);
        String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
        LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
        Automaton automaton = builder.toAutomaton(fq.getMaxEdits());
        if (prefixLength > 0) {
          Automaton prefix = BasicAutomata.makeString(UnicodeUtil.newString(termText, 0, prefixLength));
          automaton = BasicOperations.concatenate(prefix, automaton);
        }
        list.add(new CharacterRunAutomaton(automaton) {
          @Override
          public String toString() {

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.lucene.util.automaton.Automaton

org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndex

org.apache.jackrabbit.oak.plugins.index.lucene.LucenePropertyIndex

org.apache.lucene.analysis.core.TestDuelingAnalyzers

org.apache.lucene.analysis.TestGraphTokenizers

org.apache.lucene.analysis.TokenStreamToAutomaton

org.apache.lucene.index.TestIndexWriter

org.apache.lucene.index.TestTermsEnum

org.apache.lucene.search.FuzzyTermsEnum

org.apache.lucene.search.postingshighlight.MultiTermHighlighting

org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.