Package org.apache.lucene.util.automaton

Examples of org.apache.lucene.util.automaton.Automaton
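
For orientation, here is a minimal, self-contained sketch of the mutable
Lucene 4.x automaton API (Automaton, State, Transition, BasicOperations) that
the excerpts below rely on; it is illustrative, not taken from the Lucene
sources:

  import org.apache.lucene.util.automaton.Automaton;
  import org.apache.lucene.util.automaton.BasicOperations;
  import org.apache.lucene.util.automaton.State;
  import org.apache.lucene.util.automaton.Transition;

  public class AutomatonHello {
    public static void main(String[] args) {
      // Build, by hand, an automaton that accepts exactly the string "ab":
      Automaton a = new Automaton();
      State start = a.getInitialState();
      State mid = new State();
      State end = new State();
      end.setAccept(true);
      start.addTransition(new Transition('a', mid));
      mid.addTransition(new Transition('b', end));
      a.setDeterministic(true); // at most one transition per label from each state

      System.out.println(BasicOperations.run(a, "ab")); // true
      System.out.println(BasicOperations.run(a, "ax")); // false
    }
  }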


  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
    ts.close();

    // TODO: we could use the end offset to "guess"
    // whether the final token was a partial token; this
    // would only be a heuristic ... but maybe an OK one.
    // ...
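
The excerpt closes the TokenStream unconditionally, so an exception from
toAutomaton() would leak the stream; a minimal variant of the same
construction using try-with-resources (hypothetical method name, assuming
Java 7+):

  final Automaton toLookupAutomatonSafe(final CharSequence key) throws IOException {
    // The stream is closed even if toAutomaton throws:
    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
      return getTokenStreamToAutomaton().toAutomaton(ts);
    }
  }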


      // NOTE: not great that we ask the suggester to give
      // us the "answer key" (ie maybe we have a bug in
      // suggester.toLevA ...) ... but testRandom2() fixes
      // this:
      Automaton automaton = suggester.toLevenshteinAutomata(suggester.toLookupAutomaton(analyzedKey));
      assertTrue(automaton.isDeterministic());
      // TODO: could be faster... but it's slowCompletor for a reason
      BytesRef spare = new BytesRef();
      for (TermFreq2 e : slowCompletor) {
        spare.copyChars(e.analyzedForm);
        Set<IntsRef> finiteStrings = suggester.toFiniteStrings(spare, tokenStreamToAutomaton);
        for (IntsRef intsRef : finiteStrings) {
          State p = automaton.getInitialState();
          BytesRef ref = Util.toBytesRef(intsRef, spare);
          boolean added = false;
          for (int i = ref.offset; i < ref.offset + ref.length; i++) { // iterate only the valid byte range
            State q = p.step(ref.bytes[i] & 0xff);
            if (q == null) {
            // ...
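
The loop above checks acceptance by stepping the deterministic byte automaton
one label at a time; isolated as a helper, that check looks roughly like this
(illustrative, not part of the test):

  // Returns true if the deterministic byte automaton accepts the given bytes:
  static boolean accepts(Automaton automaton, BytesRef ref) {
    State state = automaton.getInitialState();
    for (int i = 0; i < ref.length; i++) {
      state = state.step(ref.bytes[ref.offset + i] & 0xff);
      if (state == null) {
        return false; // no transition for this byte label
      }
    }
    return state.isAccept();
  }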

  /** Pulls the graph (including {@link
   *  PositionLengthAttribute}) from the provided {@link
   *  TokenStream}, and creates the corresponding
   *  automaton where arcs are bytes from each term. */
  public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton a = new Automaton();
    boolean deterministic = true;

    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

    final BytesRef term = termBytesAtt.getBytesRef();

    in.reset();

    // Only temporarily holds states ahead of our current
    // position:

    final RollingBuffer<Position> positions = new Positions();

    int pos = -1;
    Position posData = null;
    int maxOffset = 0;
    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (!preservePositionIncrements && posInc > 1) {
        posInc = 1;
      }
      assert pos > -1 || posInc > 0;

      if (posInc > 0) {

        // New node:
        pos += posInc;

        posData = positions.get(pos);
        assert posData.leaving == null;

        if (posData.arriving == null) {
          // No token ever arrived at this position
          if (pos == 0) {
            // OK: this is the first token
            posData.leaving = a.getInitialState();
          } else {
            // This means there's a hole (eg, StopFilter
            // does this):
            posData.leaving = new State();
            addHoles(a.getInitialState(), positions, pos);
          }
        } else {
          posData.leaving = new State();
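          // Tokens arriving at this position are joined to the tokens
          // leaving it through the reserved POS_SEP label: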
          posData.arriving.addTransition(new Transition(POS_SEP, posData.leaving));
          if (posInc > 1) {
            // A token spanned over a hole; add holes
            // "under" it:
            addHoles(a.getInitialState(), positions, pos);
          }
        }
        positions.freeBefore(pos);
      } else {
        // Note: the result isn't necessarily non-deterministic; we just can
        // no longer be sure it is deterministic.  We could optimize this
        // further (e.g., buffer and sort synonyms at a position), but that's
        // probably overkill; this is cheap and dirty.
        deterministic = false;
      }

      final int endPos = pos + posLengthAtt.getPositionLength();

      termBytesAtt.fillBytesRef();
      final BytesRef term2 = changeToken(term);
      final Position endPosData = positions.get(endPos);
      if (endPosData.arriving == null) {
        endPosData.arriving = new State();
      }

      State state = posData.leaving;
      for (int byteIDX = 0; byteIDX < term2.length; byteIDX++) {
        final State nextState = byteIDX == term2.length - 1 ? endPosData.arriving : new State();
        state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
        state = nextState;
      }

      maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
    }

    in.end();
    State endState = null;
    if (offsetAtt.endOffset() > maxOffset) {
      endState = new State();
      endState.setAccept(true);
    }

    pos++;
    while (pos <= positions.getMaxPos()) {
      posData = positions.get(pos);
      if (posData.arriving != null) {
        if (endState != null) {
          posData.arriving.addTransition(new Transition(POS_SEP, endState));
        } else {
          posData.arriving.setAccept(true);
        }
      }
      pos++;
    }

    //toDot(a);
    a.setDeterministic(deterministic);
    return a;
  }
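A typical way to drive toAutomaton(), sketched as a small helper (the empty
field name mirrors the excerpts above; the helper itself is illustrative):

  static Automaton analyzeToAutomaton(Analyzer analyzer, String text) throws IOException {
    TokenStreamToAutomaton ts2a = new TokenStreamToAutomaton();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
      // Arcs of the result are the bytes of each analyzed token, with the
      // reserved POS_SEP label between positions:
      return ts2a.toAutomaton(ts);
    }
  }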

    // ... factor is appropriate (eg, say a fuzzy match must be at
    // least 2X better weight than the non-fuzzy match to
    // "compete") ... in which case I think the wFST needs
    // to be log weights or something ...

    Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
    /*
      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
      w.write(levA.toDot());
      w.close();
      System.out.println("Wrote LevA to out.dot");
    */
    // ...

  @Override
  protected Automaton convertAutomaton(Automaton a) {
    if (unicodeAware) {
      Automaton utf8automaton = new UTF32ToUTF8().convert(a);
      BasicOperations.determinize(utf8automaton);
      return utf8automaton;
    } else {
      return a;
    }
  }
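
The UTF32-to-UTF8 conversion is what makes a code-point automaton usable
against byte terms; a sketch of running the converted automaton directly via
ByteRunAutomaton (the input string and method name are illustrative):

  static boolean matchesAsUtf8(String s, BytesRef term) {
    Automaton codePoints = BasicAutomata.makeString(s);
    Automaton utf8 = new UTF32ToUTF8().convert(codePoints); // byte-labeled
    BasicOperations.determinize(utf8);
    ByteRunAutomaton run = new ByteRunAutomaton(utf8);
    return run.run(term.bytes, term.offset, term.length);
  }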


  Automaton toLevenshteinAutomata(Automaton automaton) {
    final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
    Automaton subs[] = new Automaton[ref.size()];
    int upto = 0;
    for (IntsRef path : ref) {
      if (path.length <= nonFuzzyPrefix || path.length < minFuzzyLength) {
        subs[upto] = BasicAutomata.makeString(path.ints, path.offset, path.length);
        upto++;
      } else {
        Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, nonFuzzyPrefix);
        int ints[] = new int[path.length-nonFuzzyPrefix];
        System.arraycopy(path.ints, path.offset+nonFuzzyPrefix, ints, 0, ints.length);
        // TODO: maybe add alphaMin to LevenshteinAutomata,
        // and pass 1 instead of 0?  We probably don't want
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
        LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
        Automaton levAutomaton = lev.toAutomaton(maxEdits);
        Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton));
        combined.setDeterministic(true); // it's like the special case in concatenate itself, except we already cloneExpanded
        subs[upto] = combined;
        upto++;
      }
    }

    if (subs.length == 0) {
      // automaton is empty: there are no accepted paths through it
      return BasicAutomata.makeEmpty(); // matches nothing
    } else if (subs.length == 1) {
      // no synonyms or anything: just a single path through the tokenstream
      return subs[0];
    } else {
      // multiple paths: this is really scary! is it slow?
      // maybe we should not do this and throw UOE?
      Automaton a = BasicOperations.union(Arrays.asList(subs));
      // TODO: we could call toLevenshteinAutomata() before det?
      // this only happens if you have multiple paths anyway (e.g. synonyms)
      BasicOperations.determinize(a);

      return a;
    }
  }
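
For a single string the construction is much simpler; a sketch using the
LevenshteinAutomata API directly (maxEdits is capped at 2; "lucene" and
"lucnee" are illustrative):

  static boolean withinTwoEdits(String query) {
    // Accepts every string within 2 edits of "lucene"; transpositions count as one edit:
    LevenshteinAutomata lev = new LevenshteinAutomata("lucene", true);
    Automaton accept = lev.toAutomaton(2);
    return BasicOperations.run(accept, query); // e.g. true for "lucnee"
  }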

    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRef spare = new CharsRef();

      //System.out.println("  now intersect exactFirst=" + exactFirst);
   
      // ...

    TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());

    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    Automaton automaton = ts2a.toAutomaton(ts);
    ts.close();

    replaceSep(automaton);
    automaton = convertAutomaton(automaton);
    // ...



    private Term[] expandTerms(String field, String queryStr) throws XPathException {
        List<Term> termList = new ArrayList<>(8);
        Automaton automaton = WildcardQuery.toAutomaton(new Term(field, queryStr));
        CompiledAutomaton compiled = new CompiledAutomaton(automaton);
        IndexReader reader = null;
        try {
            reader = index.getReader();
            // ...
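
The CompiledAutomaton built above is typically intersected with the index's
term dictionary; a sketch of the enumeration that usually follows (assuming a
Lucene 4.x IndexReader; the helper name is illustrative):

  // Collect all indexed terms in `field` accepted by the automaton:
  static List<Term> matchingTerms(IndexReader reader, String field,
                                  CompiledAutomaton compiled) throws IOException {
    List<Term> result = new ArrayList<>();
    Terms terms = MultiFields.getTerms(reader, field);
    if (terms != null) {
      TermsEnum te = compiled.getTermsEnum(terms);
      BytesRef text;
      while ((text = te.next()) != null) {
        result.add(new Term(field, BytesRef.deepCopyOf(text)));
      }
    }
    return result;
  }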
