Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.CachingTokenFilter


        int maxNumFragmentsRequired = 2;
        String fragmentSeparator = "...";
        Scorer scorer = null;
        TokenStream tokenStream = null;
        if (mode == SPAN) {
          tokenStream = new CachingTokenFilter(analyzer.tokenStream(HighlighterTest.FIELD_NAME,
              new StringReader(text)));
          scorer = new SpanScorer(query, HighlighterTest.FIELD_NAME,
              (CachingTokenFilter) tokenStream);
        } else if (mode == STANDARD) {
          scorer = new QueryScorer(query);
View Full Code Here


   * @throws IOException
   */
  public Map getWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, String fieldName,
      IndexReader reader) throws IOException {
    this.fieldName = fieldName;
    this.cachedTokenFilter = new CachingTokenFilter(tokenStream);

    Map terms = new PositionCheckingMap();
    extract(query, terms);

    int totalNumDocs = reader.numDocs();
View Full Code Here

    }

    public static String bestFragmentHighlighted(String field, Query query, String text, Fragmenter fragmenter) {

        try {
            CachingTokenFilter tokenStream =
                new CachingTokenFilter(new DomainSearchAnalyzer().tokenStream(field, new StringReader(text)));

      QueryScorer scorer = new QueryScorer(query, field, "");
      scorer.init(tokenStream);
     
            Highlighter highlighter =
                new Highlighter(new DomainSearchFormatter(), scorer);
            highlighter.setTextFragmenter(fragmenter);
            tokenStream.reset();

            try {
                return highlighter.getBestFragments(tokenStream, text, 2, " ... ");

            } catch (InvalidTokenOffsetsException e) {
View Full Code Here

      String text = fieldNode.getTextAsString();
      String field = fieldNode.getFieldAsString();

      TokenStream source = this.analyzer.tokenStream(field, new StringReader(
          text));
      CachingTokenFilter buffer = new CachingTokenFilter(source);

      PositionIncrementAttribute posIncrAtt = null;
      int numTokens = 0;
      int positionCount = 0;
      boolean severalTokensAtSamePosition = false;

      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }

      try {

        while (buffer.incrementToken()) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt
              .getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;

          } else {
            severalTokensAtSamePosition = true;
          }

        }

      } catch (IOException e) {
        // ignore
      }

      try {
        // rewind the buffer stream
        buffer.reset();

        // close original stream - all tokens buffered
        source.close();
      } catch (IOException e) {
        // ignore
      }

      if (!buffer.hasAttribute(CharTermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }

      CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();

      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext;
          hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.toString();

        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }

        fieldNode.setText(term);

        return fieldNode;

      } else if (severalTokensAtSamePosition || !(node instanceof QuotedFieldQueryNode)) {
        if (positionCount == 1 || !(node instanceof QuotedFieldQueryNode)) {
          // no phrase query:
          LinkedList<QueryNode> children = new LinkedList<QueryNode>();

          for (int i = 0; i < numTokens; i++) {
            String term = null;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();

            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            children.add(new FieldQueryNode(field, term, -1, -1));

          }
          if (positionCount == 1)
            return new GroupQueryNode(
              new StandardBooleanQueryNode(children, true));
          else
            return new StandardBooleanQueryNode(children, false);

        } else {
          // phrase query:
          MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

          List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
          int position = -1;
          int i = 0;
          int termGroupCount = 0;
          for (; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }

            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            if (positionIncrement > 0 && multiTerms.size() > 0) {

              for (FieldQueryNode termNode : multiTerms) {

                if (this.positionIncrementsEnabled) {
                  termNode.setPositionIncrement(position);
                } else {
                  termNode.setPositionIncrement(termGroupCount);
                }

                mpq.add(termNode);

              }

              // Only increment once for each "group" of
              // terms that were in the same position:
              termGroupCount++;

              multiTerms.clear();

            }

            position += positionIncrement;
            multiTerms.add(new FieldQueryNode(field, term, -1, -1));

          }

          for (FieldQueryNode termNode : multiTerms) {

            if (this.positionIncrementsEnabled) {
              termNode.setPositionIncrement(position);

            } else {
              termNode.setPositionIncrement(termGroupCount);
            }

            mpq.add(termNode);

          }

          return mpq;

        }

      } else {

        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

        int position = -1;

        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;

          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();

            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
View Full Code Here

      source = analyzer.reusableTokenStream(field, new StringReader(queryText));
      source.reset();
    } catch (IOException e) {
      source = analyzer.tokenStream(field, new StringReader(queryText));
    }
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    TermAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;

    boolean success = false;
    try {
      buffer.reset();
      success = true;
    } catch (IOException e) {
      // success==false if we hit an exception
    }
    if (success) {
      if (buffer.hasAttribute(TermAttribute.class)) {
        termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
      }
    }

    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    boolean hasMoreTokens = false;
    if (termAtt != null) {
      try {
        hasMoreTokens = buffer.incrementToken();
        while (hasMoreTokens) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
          hasMoreTokens = buffer.incrementToken();
        }
      } catch (IOException e) {
        // ignore
      }
    }
    try {
      // rewind the buffer stream
      buffer.reset();

      // close original stream - all tokens buffered
      source.close();
    }
    catch (IOException e) {
      // ignore
    }

    if (numTokens == 0)
      return null;
    else if (numTokens == 1) {
      String term = null;
      try {
        boolean hasNext = buffer.incrementToken();
        assert hasNext == true;
        term = termAtt.term();
      } catch (IOException e) {
        // safe to ignore, because we know the number of tokens
      }
      return newTermQuery(new Term(field, term));
    } else {
      if (severalTokensAtSamePosition) {
        if (positionCount == 1) {
          // no phrase query:
          BooleanQuery q = newBooleanQuery(true);
          for (int i = 0; i < numTokens; i++) {
            String term = null;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.term();
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            Query currentQuery = newTermQuery(
                new Term(field, term));
            q.add(currentQuery, BooleanClause.Occur.SHOULD);
          }
          return q;
        }
        else {
          // phrase query:
          MultiPhraseQuery mpq = newMultiPhraseQuery();
          mpq.setSlop(phraseSlop);
          List multiTerms = new ArrayList();
          int position = -1;
          for (int i = 0; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.term();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            if (positionIncrement > 0 && multiTerms.size() > 0) {
              if (enablePositionIncrements) {
                mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
              } else {
                mpq.add((Term[])multiTerms.toArray(new Term[0]));
              }
              multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(new Term(field, term));
          }
          if (enablePositionIncrements) {
            mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
          } else {
            mpq.add((Term[])multiTerms.toArray(new Term[0]));
          }
          return mpq;
        }
      }
      else {
        PhraseQuery pq = newPhraseQuery();
        pq.setSlop(phraseSlop);
        int position = -1;


        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;

          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
View Full Code Here

      String text = fieldNode.getTextAsString();
      String field = fieldNode.getFieldAsString();

      TokenStream source = this.analyzer.tokenStream(field, new StringReader(
          text));
      CachingTokenFilter buffer = new CachingTokenFilter(source);

      PositionIncrementAttribute posIncrAtt = null;
      int numTokens = 0;
      int positionCount = 0;
      boolean severalTokensAtSamePosition = false;

      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = (PositionIncrementAttribute) buffer
            .getAttribute(PositionIncrementAttribute.class);
      }

      try {

        while (buffer.incrementToken()) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt
              .getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;

          } else {
            severalTokensAtSamePosition = true;
          }

        }

      } catch (IOException e) {
        // ignore
      }

      try {
        // rewind the buffer stream
        buffer.reset();

        // close original stream - all tokens buffered
        source.close();
      } catch (IOException e) {
        // ignore
      }

      if (!buffer.hasAttribute(TermAttribute.class)) {
        return new NoTokenFoundQueryNode();
      }

      TermAttribute termAtt = (TermAttribute) buffer
          .getAttribute(TermAttribute.class);

      if (numTokens == 0) {
        return new NoTokenFoundQueryNode();

      } else if (numTokens == 1) {
        String term = null;
        try {
          boolean hasNext;
          hasNext = buffer.incrementToken();
          assert hasNext == true;
          term = termAtt.term();

        } catch (IOException e) {
          // safe to ignore, because we know the number of tokens
        }

        fieldNode.setText(term);

        return fieldNode;

      } else if (severalTokensAtSamePosition) {
        if (positionCount == 1) {
          // no phrase query:
          LinkedList<QueryNode> children = new LinkedList<QueryNode>();

          for (int i = 0; i < numTokens; i++) {
            String term = null;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.term();

            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            children.add(new FieldQueryNode(field, term, -1, -1));

          }

          return new GroupQueryNode(new StandardBooleanQueryNode(children, true));

        } else {
          // phrase query:
          MultiPhraseQueryNode mpq = new MultiPhraseQueryNode();

          List<FieldQueryNode> multiTerms = new ArrayList<FieldQueryNode>();
          int position = -1;
          int i = 0;
          int termGroupCount = 0;
          for (; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.term();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }

            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            if (positionIncrement > 0 && multiTerms.size() > 0) {

              for (FieldQueryNode termNode : multiTerms) {

                if (this.positionIncrementsEnabled) {
                  termNode.setPositionIncrement(position);
                } else {
                  termNode.setPositionIncrement(termGroupCount);
                }

                mpq.add(termNode);

              }

              // Only increment once for each "group" of
              // terms that were in the same position:
              termGroupCount++;

              multiTerms.clear();

            }

            position += positionIncrement;
            multiTerms.add(new FieldQueryNode(field, term, -1, -1));

          }

          for (FieldQueryNode termNode : multiTerms) {

            if (this.positionIncrementsEnabled) {
              termNode.setPositionIncrement(position);

            } else {
              termNode.setPositionIncrement(termGroupCount);
            }

            mpq.add(termNode);

          }

          return mpq;

        }

      } else {

        TokenizedPhraseQueryNode pq = new TokenizedPhraseQueryNode();

        int position = -1;

        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;

          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.term();

            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
View Full Code Here

    return rv;
  }

  private IndexReader getReaderForField(String field) throws IOException {
    if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
      tokenStream = new CachingTokenFilter(tokenStream);
      cachedTokenStream = true;
    }
    IndexReader reader = (IndexReader) readers.get(field);
    if (reader == null) {
      MemoryIndex indexer = new MemoryIndex();
View Full Code Here

  public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer();
    IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
    Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();
View Full Code Here

   */
  protected final Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field, String queryText, boolean quoted, int phraseSlop) {
    assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
    // Use the analyzer to get all the tokens, and then build a TermQuery,
    // PhraseQuery, or nothing based on the term count
    CachingTokenFilter buffer = null;
    TermToBytesRefAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;
    boolean hasMoreTokens = false;   
   
    TokenStream source = null;
    try {
      source = analyzer.tokenStream(field, queryText);
      source.reset();
      buffer = new CachingTokenFilter(source);
      buffer.reset();

      if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }

      if (termAtt != null) {
        try {
          hasMoreTokens = buffer.incrementToken();
          while (hasMoreTokens) {
            numTokens++;
            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            if (positionIncrement != 0) {
              positionCount += positionIncrement;
            } else {
              severalTokensAtSamePosition = true;
            }
            hasMoreTokens = buffer.incrementToken();
          }
        } catch (IOException e) {
          // ignore
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Error analyzing query text", e);
    } finally {
      IOUtils.closeWhileHandlingException(source);
    }
   
    // rewind the buffer stream
    buffer.reset();

    BytesRef bytes = termAtt == null ? null : termAtt.getBytesRef();

    if (numTokens == 0)
      return null;
    else if (numTokens == 1) {
      try {
        boolean hasNext = buffer.incrementToken();
        assert hasNext == true;
        termAtt.fillBytesRef();
      } catch (IOException e) {
        // safe to ignore, because we know the number of tokens
      }
      return newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
    } else {
      if (severalTokensAtSamePosition || (!quoted)) {
        if (positionCount == 1 || (!quoted)) {
          // no phrase query:
         
          if (positionCount == 1) {
            // simple case: only one position, with synonyms
            BooleanQuery q = newBooleanQuery(true);
            for (int i = 0; i < numTokens; i++) {
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                termAtt.fillBytesRef();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              Query currentQuery = newTermQuery(
                  new Term(field, BytesRef.deepCopyOf(bytes)));
              q.add(currentQuery, BooleanClause.Occur.SHOULD);
            }
            return q;
          } else {
            // multiple positions
            BooleanQuery q = newBooleanQuery(false);
            Query currentQuery = null;
            for (int i = 0; i < numTokens; i++) {
              try {
                boolean hasNext = buffer.incrementToken();
                assert hasNext == true;
                termAtt.fillBytesRef();
              } catch (IOException e) {
                // safe to ignore, because we know the number of tokens
              }
              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
                if (!(currentQuery instanceof BooleanQuery)) {
                  Query t = currentQuery;
                  currentQuery = newBooleanQuery(true);
                  ((BooleanQuery)currentQuery).add(t, BooleanClause.Occur.SHOULD);
                }
                ((BooleanQuery)currentQuery).add(newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))), BooleanClause.Occur.SHOULD);
              } else {
                if (currentQuery != null) {
                  q.add(currentQuery, operator);
                }
                currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
              }
            }
            q.add(currentQuery, operator);
            return q;
          }
        } else {
          // phrase query:
          MultiPhraseQuery mpq = newMultiPhraseQuery();
          mpq.setSlop(phraseSlop);
          List<Term> multiTerms = new ArrayList<>();
          int position = -1;
          for (int i = 0; i < numTokens; i++) {
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              termAtt.fillBytesRef();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }

            if (positionIncrement > 0 && multiTerms.size() > 0) {
              if (enablePositionIncrements) {
                mpq.add(multiTerms.toArray(new Term[0]),position);
              } else {
                mpq.add(multiTerms.toArray(new Term[0]));
              }
              multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(new Term(field, BytesRef.deepCopyOf(bytes)));
          }
          if (enablePositionIncrements) {
            mpq.add(multiTerms.toArray(new Term[0]),position);
          } else {
            mpq.add(multiTerms.toArray(new Term[0]));
          }
          return mpq;
        }
      } else {
        PhraseQuery pq = newPhraseQuery();
        pq.setSlop(phraseSlop);
        int position = -1;

        for (int i = 0; i < numTokens; i++) {
          int positionIncrement = 1;

          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            termAtt.fillBytesRef();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
View Full Code Here

  public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    MockRAMDirectory dir = new MockRAMDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer();
    IndexWriter w = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
    Document doc = new Document();
    TokenStream stream = new CachingTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
    Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.CachingTokenFilter

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.