Package org.apache.lucene.util.automaton

Examples of org.apache.lucene.util.automaton.Automaton


  }

  @Override
  protected Automaton convertAutomaton(Automaton a) {
    if (unicodeAware) {
      Automaton utf8automaton = new UTF32ToUTF8().convert(a);
      BasicOperations.determinize(utf8automaton);
      return utf8automaton;
    } else {
      return a;
    }
View Full Code Here


    return tsta;
  }

  Automaton toLevenshteinAutomata(Automaton automaton) {
    final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
    Automaton subs[] = new Automaton[ref.size()];
    int upto = 0;
    for (IntsRef path : ref) {
      if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
        subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
        upto++;
      } else {
        Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
        int ints[] = new int[path.length-nonFuzzyPrefix];
        System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
        // TODO: maybe add alphaMin to LevenshteinAutomata,
        // and pass 1 instead of 0?  We probably don't want
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
        LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
        Automaton levAutomaton = lev.toAutomaton(maxEdits);
        Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
        combined.setDeterministic(true); // its like the special case in concatenate itself, except we cloneExpanded already
        subs[upto] = combined;
        upto++;
      }
    }

    if (subs.length == 0) {
      // automaton is empty: there are no accepted paths through it
      return BasicAutomata.makeEmpty(); // matches nothing
    } else if (subs.length == 1) {
      // no synonyms or anything: just a single path through the tokenstream
      return subs[0];
    } else {
      // multiple paths: this is really scary! is it slow?
      // maybe we should not do this and throw UOE?
      Automaton a = BasicOperations.union(Arrays.asList(subs));
      // TODO: we could call toLevenshteinAutomata() before det?
      // this only happens if you have multiple paths anyway (e.g. synonyms)
      BasicOperations.determinize(a);

      return a;
View Full Code Here

    for (int i = 0; i <= 0x10FFFF; i++) {
      if (Character.isLetter(i)) {
        initial.addTransition(new Transition(i, i, accept));
      }
    }
    Automaton single = new Automaton(initial);
    single.reduce();
    Automaton repeat = BasicOperations.repeat(single);
    jvmLetter = new CharacterRunAutomaton(repeat);
  }
View Full Code Here

      }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRef spare = new CharsRef();

      //System.out.println("  now intersect exactFirst=" + exactFirst);
   
View Full Code Here

    return prefixPaths;
  }
 
  final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    // Analyze surface form:
    Automaton automaton = null;
    TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
    try {

      // Create corresponding automaton: labels are bytes
      // from each analyzed token, with byte 0 used as
View Full Code Here

  }

  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;
    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    try {
      automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
    } finally {
      IOUtils.closeWhileHandlingException(ts);
View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = BasicAutomata.makeString("abc");
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("a", 1, 1),
        token("b", 3, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

      new Token[] {
        token("a", 1, 1),
        token("x", 0, 3),
        token("b", 3, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
    final Automaton expected = BasicOperations.union(a1, a2);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

    final TokenStream ts = new CannedTokenStream(
      new Token[] {
        token("abc", 1, 1),
        token("def", 1, 1),
      });
    final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts);
    final Automaton expected =  join("abc", "def");

    //toDot(actual);
    assertTrue(BasicOperations.sameLanguage(expected, actual));
  }
View Full Code Here

TOP

Related Classes of org.apache.lucene.util.automaton.Automaton

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.