Package org.apache.lucene.search.highlight

Source Code of org.apache.lucene.search.highlight.TokenSources

/*
* Created on 28-Oct-2004
*/
package org.apache.lucene.search.highlight;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;

/**
 * Hides implementation issues associated with obtaining a TokenStream for use with
 * the highlighter - one can be obtained from TermFreqVectors with offsets and (optionally)
 * positions, or from an Analyzer re-parsing the stored content.
 */
public class TokenSources
{
  /**
   * A convenience method that tries to first get a TermPositionVector for the specified docId, then, falls back to
   * using the passed in {@link org.apache.lucene.document.Document} to retrieve the TokenStream.  This is useful when
   * you already have the document, but would prefer to use the vector first.
   * @param reader The {@link org.apache.lucene.index.IndexReader} to use to try and get the vector from
   * @param docId The docId to retrieve.
   * @param field The field to retrieve on the document
   * @param doc The document to fall back on
   * @param analyzer The analyzer to use for creating the TokenStream if the vector doesn't exist
   * @return The {@link org.apache.lucene.analysis.TokenStream} for the {@link org.apache.lucene.document.Fieldable} on the {@link org.apache.lucene.document.Document}
   * @throws IOException if there was an error loading
   */
  public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Document doc, Analyzer analyzer) throws IOException {
    TokenStream ts = null;

    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv instanceof TermPositionVector) {
      ts = getTokenStream((TermPositionVector) tfv);
    }
    // No token info stored, so fall back to analyzing raw content
    if (ts == null) {
      ts = getTokenStream(doc, field, analyzer);
    }
    return ts;
  }
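
  /*
   * A quick usage sketch (the field name "contents" is hypothetical): prefer this
   * overload when the Document has already been loaded, so the analyzer fallback
   * does not have to fetch it a second time.
   *
   *   Document doc = reader.document(docId);
   *   TokenStream ts = TokenSources.getAnyTokenStream(reader, docId, "contents", doc, analyzer);
   */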
  /**
   * A convenience method that tries a number of approaches to get a token stream.
   * The cost of discovering there are no term vectors in the index is minimal
   * (1000 invocations still register 0 ms), so this "lazy" (flexible?) approach
   * to coding is probably acceptable.
   * @param reader The {@link org.apache.lucene.index.IndexReader} to load the vector or document from
   * @param docId The id of the document to retrieve
   * @param field The field to get the token stream for
   * @param analyzer The analyzer to use if no term vector is stored
   * @return The {@link org.apache.lucene.analysis.TokenStream} for the field (an
   * IllegalArgumentException is thrown if the field is not stored)
   * @throws IOException if there was an error loading
   */
  public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException {
    TokenStream ts = null;

    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv instanceof TermPositionVector) {
      ts = getTokenStream((TermPositionVector) tfv);
    }
    // No token info stored, so fall back to analyzing raw content
    if (ts == null) {
      ts = getTokenStream(reader, docId, field, analyzer);
    }
    return ts;
  }
   
   
    public static TokenStream getTokenStream(TermPositionVector tpv) {
        // Assumes the worst and makes no assumptions about token position sequences.
        return getTokenStream(tpv, false);
    }
    /**
     * Low level api.
     * Returns a token stream or null if no offset info available in index.
     * This can be used to feed the highlighter with a pre-parsed token stream.
     *
     * In my tests the speeds to recreate 1000 token streams using this method are:
     * - with TermVector offset only data stored - 420 milliseconds
     * - with TermVector offset AND position data stored - 271 milliseconds
     *   (nb timings for TermVector with position data are based on a tokenizer with contiguous
     *   positions - no overlaps or gaps)
     * The cost of not using TermPositionVector to store
     * pre-parsed content and using an analyzer to re-parse the original content:
     * - reanalyzing the original content - 980 milliseconds
     *
     * The re-analyze timings will typically vary depending on -
     *   1) The complexity of the analyzer code (timings above were using a
     *      stemmer/lowercaser/stopword combo)
     *   2) The number of other fields (Lucene reads ALL fields off the disk
     *      when accessing just one document field - this can cost dear!)
     *   3) Use of compression on field storage - could be faster due to compression (less disk IO)
     *      or slower (more CPU burn) depending on the content.
     *
     * @param tpv The {@link org.apache.lucene.index.TermPositionVector} to convert into a token stream
     * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
     * to eke out the last drops of performance, set to true. If in doubt, set to false.
     */
    public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) {
        //an object used to iterate across an array of tokens
        class StoredTokenStream extends TokenStream {
          Token tokens[];
          int currentToken = 0;
          TermAttribute termAtt;
          OffsetAttribute offsetAtt;
   
          StoredTokenStream(Token tokens[]) {
            this.tokens = tokens;
            termAtt = addAttribute(TermAttribute.class);
            offsetAtt = addAttribute(OffsetAttribute.class);
          }
   
          @Override
          public boolean incrementToken() throws IOException {
            if (currentToken >= tokens.length) {
              return false;
            }
            Token token = tokens[currentToken++];
            termAtt.setTermBuffer(token.term());
            offsetAtt.setOffset(token.startOffset(), token.endOffset());
            return true;
          }
        }     
        //code to reconstruct the original sequence of Tokens
        String[] terms=tpv.getTerms();         
        int[] freq=tpv.getTermFrequencies();
        int totalTokens=0;

        for (int t = 0; t < freq.length; t++)
        {
            totalTokens+=freq[t];
        }
        Token tokensInOriginalOrder[]=new Token[totalTokens];
        ArrayList<Token> unsortedTokens = null;
        for (int t = 0; t < freq.length; t++)
        {
            TermVectorOffsetInfo[] offsets=tpv.getOffsets(t);
            if(offsets==null)
            {
                return null;
            }
           
            int[] pos=null;
            if(tokenPositionsGuaranteedContiguous)
            {
                //try get the token position info to speed up assembly of tokens into sorted sequence
                pos=tpv.getTermPositions(t);
            }
            if(pos==null)
            { 
                //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
                if(unsortedTokens==null)
                {
                    unsortedTokens=new ArrayList<Token>();
                }
                for (int tp = 0; tp < offsets.length; tp++)
                {
                  Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
                  token.setTermBuffer(terms[t]);
                  unsortedTokens.add(token);
                }
            }
            else
            {
                // We have positions stored and a guarantee that the token position information is contiguous.

                // This may be fast BUT it won't work if the tokenizer creates more than one token in the
                // same position or creates jumps in the position numbers - the code below would fail under
                // those circumstances.

                // Tokens stored with positions - use the position to index straight into the sorted array.
                for (int tp = 0; tp < pos.length; tp++)
                {
                  Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
                  tokensInOriginalOrder[pos[tp]] = token;
                }
            }
        }
        // If the field has been stored without position data we must perform a sort by start offset
        if (unsortedTokens != null) {
            tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens.size()]);
            Arrays.sort(tokensInOriginalOrder, new Comparator<Token>() {
                public int compare(Token t1, Token t2) {
                    if (t1.startOffset() > t2.startOffset())
                        return 1;
                    if (t1.startOffset() < t2.startOffset())
                        return -1;
                    return 0;
                }
            });
        }
        return new StoredTokenStream(tokensInOriginalOrder);
    }
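
    /*
     * A minimal sketch of choosing the tokenPositionsGuaranteedContiguous flag (the
     * field name "contents" is hypothetical). Analyzers that drop tokens (e.g. stopword
     * removal leaves gaps in the position numbers) or stack tokens (e.g. synonym
     * injection reuses a position) break the contiguity guarantee, so the safe default
     * is false:
     *
     *   TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(docId, "contents");
     *   TokenStream safe = TokenSources.getTokenStream(tpv, false); // always correct
     *   TokenStream fast = TokenSources.getTokenStream(tpv, true);  // only if positions are 0,1,2,...
     */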

    public static TokenStream getTokenStream(IndexReader reader, int docId, String field) throws IOException {
        TermFreqVector tfv = reader.getTermFreqVector(docId, field);
        if (tfv == null) {
            throw new IllegalArgumentException(field + " in doc #" + docId
                    + " does not have any term vector data stored");
        }
        if (tfv instanceof TermPositionVector) {
            TermPositionVector tpv = (TermPositionVector) tfv;
            return getTokenStream(tpv);
        }
        throw new IllegalArgumentException(field + " in doc #" + docId
                + " does not have any term position data stored");
    }
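
    /*
     * For the term-vector-based methods above to succeed, the field must have been
     * indexed with offsets (and ideally positions). A minimal indexing sketch, assuming
     * a hypothetical "contents" field:
     *
     *   doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED,
     *                     Field.TermVector.WITH_POSITIONS_OFFSETS));
     *
     * Fields indexed without term vectors fall through to the slower analyzer-based
     * methods below.
     */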

    // Convenience method
    public static TokenStream getTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException {
        Document doc = reader.document(docId);
        return getTokenStream(doc, field, analyzer);
    }
   
  public static TokenStream getTokenStream(Document doc, String field, Analyzer analyzer) {
    String contents = doc.get(field);
    if (contents == null) {
      throw new IllegalArgumentException("Field " + field + " in document is not stored and cannot be analyzed");
    }
    return getTokenStream(field, contents, analyzer);
  }
  // Convenience method
  public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer) {
    return analyzer.tokenStream(field, new StringReader(contents));
  }

}
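
// A minimal end-to-end sketch (assumptions: a stored "contents" field indexed as shown
// above, and the contrib highlighter's Highlighter/QueryScorer classes on the classpath):
//
//   Highlighter highlighter = new Highlighter(new QueryScorer(query));
//   Document doc = reader.document(docId);
//   TokenStream ts = TokenSources.getAnyTokenStream(reader, docId, "contents", doc, analyzer);
//   String fragment = highlighter.getBestFragment(ts, doc.get("contents"));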