Source Code of lucli.LuceneMethods$CountingCollector

package lucli;


/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;


import jline.ConsoleReader;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;


/**
 * Various methods that interact with Lucene and provide info about the 
 * index, search, etc. Parts adapted from Lucene demo.
 */
class LuceneMethods {


  private int numDocs; // document count, as last read from the index by info()
  private final FSDirectory indexName; // directory of this index
  private List<String> fields; // names of all fields, rebuilt by getFieldInfo()
  private List<String> indexedFields; // names of the indexed fields, rebuilt by getFieldInfo()
  private String fieldsArray[]; // the names in 'fields' as an array, filled in by initSearch()
  private Searcher searcher; // searcher most recently opened by initSearch() or explainQuery()
  private Query query; // most recently parsed query
  private String analyzerClassFQN = null; // Analyzer class, if NULL, use default Analyzer


  public LuceneMethods(String index) throws IOException {
    indexName = FSDirectory.open(new File(index));
    message("Lucene CLI. Using directory '" + indexName + "'. Type 'help' for instructions.");
  }


    private Analyzer createAnalyzer() {
        if (analyzerClassFQN == null) return new StandardAnalyzer(Version.LUCENE_CURRENT);
        try {
            return Class.forName(analyzerClassFQN).asSubclass(Analyzer.class).newInstance();
        } catch (ClassCastException cce) {
            message("Given class is not an Analyzer: " + analyzerClassFQN);
            return new StandardAnalyzer(Version.LUCENE_CURRENT);
        } catch (Exception e) {
            message("Unable to use Analyzer " + analyzerClassFQN);
            return new StandardAnalyzer(Version.LUCENE_CURRENT);
        }
    }




  public void info() throws java.io.IOException {
    IndexReader indexReader = IndexReader.open(indexName, true);




    getFieldInfo();
    numDocs = indexReader.numDocs();
    message("Index has " + numDocs + " documents ");
    message("All Fields:" + fields.toString());
    message("Indexed Fields:" + indexedFields.toString());


    if (IndexWriter.isLocked(indexName)) {
      message("Index is locked");
    }
    //IndexReader.getCurrentVersion(indexName);
    //System.out.println("Version:" + version);


    indexReader.close();
  }




  public void search(String queryString, boolean explain, boolean showTokens, ConsoleReader cr)
      throws java.io.IOException, org.apache.lucene.queryParser.ParseException {
    initSearch(queryString);
    int numHits = computeCount(query);
    message(numHits + " total matching documents");
    if (explain) {
      query = explainQuery(queryString);
    }


    final int HITS_PER_PAGE = 10;
    message("--------------------------------------");
    for (int start = 0; start < numHits; start += HITS_PER_PAGE) {
      int end = Math.min(numHits, start + HITS_PER_PAGE);
      ScoreDoc[] hits = search(query, end);
      for (int ii = start; ii < end; ii++) {
        Document doc = searcher.doc(hits[ii].doc);
        message("---------------- " + (ii + 1) + " score:" + hits[ii].score + "---------------------");
        printHit(doc);
        if (showTokens) {
          invertDocument(doc);
        }
        if (explain) {
          Explanation exp = searcher.explain(query, hits[ii].doc);
          message("Explanation:" + exp.toString());
        }
      }
      message("#################################################");


      if (numHits > end) {
        // TODO: don't let the input end up in the command line history
        queryString = cr.readLine("more (y/n) ? ");
        if (queryString.length() == 0 || queryString.charAt(0) == 'n')
          break;
      }
    }
    searcher.close();
  }


  /**
   * TODO: Allow user to specify what field(s) to display
   */
  private void printHit(Document doc) {
    for (int ii = 0; ii < fieldsArray.length; ii++) {
      String currField = fieldsArray[ii];
      String[] result = doc.getValues(currField);
      if (result != null) {
        for (int i = 0; i < result.length; i++) {
          message(currField + ":" + result[i]);
        }
      } else {
        message(currField + ": <not available>");
      }
    }
    //another option is to just do message(doc);
  }


    public void optimize() throws IOException {
    //open the index writer. False: don't create a new one
    IndexWriter indexWriter = new IndexWriter(indexName, new IndexWriterConfig(
        Version.LUCENE_CURRENT, createAnalyzer()).setOpenMode(
        OpenMode.APPEND));
    message("Starting to optimize index.");
    long start = System.currentTimeMillis();
    indexWriter.optimize();
    message("Done optimizing index. Took " + (System.currentTimeMillis() - start) + " msecs");
    indexWriter.close();
  }




    private Query explainQuery(String queryString) throws IOException, ParseException {


    searcher = new IndexSearcher(indexName, true);
    Analyzer analyzer = createAnalyzer();
    getFieldInfo();


    int arraySize = indexedFields.size();
    String indexedArray[] = new String[arraySize];
    for (int ii = 0; ii < arraySize; ii++) {
      indexedArray[ii] = indexedFields.get(ii);
    }
    MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_CURRENT, indexedArray, analyzer);
    query = parser.parse(queryString);
    message("Searching for: " + query.toString());
    return (query);


  }


  /**
   * TODO: Allow user to specify analyzer
   */
  private void initSearch(String queryString) throws IOException, ParseException {


    searcher = new IndexSearcher(indexName, true);
    Analyzer analyzer = createAnalyzer();
    getFieldInfo();


    int arraySize = fields.size();
    fieldsArray = new String[arraySize];
    for (int ii = 0; ii < arraySize; ii++) {
      fieldsArray[ii] = fields.get(ii);
    }
    MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_CURRENT, fieldsArray, analyzer);
    query = parser.parse(queryString);
    System.out.println("Searching for: " + query.toString());
  }
  
  final static class CountingCollector extends Collector {
    public int numHits = 0;
    
    @Override
    public void setScorer(Scorer scorer) throws IOException {}
    @Override
    public void collect(int doc) throws IOException {
      numHits++;
    }


    @Override
    public void setNextReader(IndexReader reader, int docBase) {}
    @Override
    public boolean acceptsDocsOutOfOrder() {
      return true;
    }    
  }
  
  private int computeCount(Query q) throws IOException {
    CountingCollector countingCollector = new CountingCollector();
    
    searcher.search(q, countingCollector);    
    return countingCollector.numHits;
  }


  public void count(String queryString) throws java.io.IOException, ParseException {
    initSearch(queryString);
    message(computeCount(query) + " total documents");
    searcher.close();
  }
  
  private ScoreDoc[] search(Query q, int numHits) throws IOException {
    return searcher.search(query, numHits).scoreDocs;
  }


  static public void message(String s) {
    System.out.println(s);
  }


  private void getFieldInfo() throws IOException {
    IndexReader indexReader = IndexReader.open(indexName, true);
    fields = new ArrayList<String>();
    indexedFields = new ArrayList<String>();


    //get the list of all field names
    for(String field : indexReader.getFieldNames(FieldOption.ALL)) {
      if (field != null && !field.equals(""))
        fields.add(field.toString());
    }
    //
    //get the list of indexed field names
    for(String field : indexReader.getFieldNames(FieldOption.INDEXED)) {
      if (field != null && !field.equals(""))
        indexedFields.add(field.toString());
    }
    indexReader.close();
  }




  // Copied from DocumentWriter
  // Tokenizes the fields of a document into Postings.
  private void invertDocument(Document doc)
    throws IOException {


    Map<String,Integer> tokenMap = new HashMap<String,Integer>();
    final int maxFieldLength = 10000;


    Analyzer analyzer = createAnalyzer();
    for (Fieldable field : doc.getFields()) {
      String fieldName = field.name();
      if (field.isIndexed()) {
        if (field.isTokenized()) {     // un-tokenized field
          Reader reader;        // find or make Reader
          if (field.readerValue() != null)
            reader = field.readerValue();
          else if (field.stringValue() != null)
            reader = new StringReader(field.stringValue());
          else
            throw new IllegalArgumentException
              ("field must have either String or Reader value");


          int position = 0;
          // Tokenize field and add to postingTable
          TokenStream stream = analyzer.tokenStream(fieldName, reader);
          CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
          PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
          
          try {
            while (stream.incrementToken()) {
              position += (posIncrAtt.getPositionIncrement() - 1);
              position++;
              String name = termAtt.toString();
              Integer Count = tokenMap.get(name);
              if (Count == null) { // not in there yet
                tokenMap.put(name, Integer.valueOf(1)); //first one
              } else {
                int count = Count.intValue();
                tokenMap.put(name, Integer.valueOf(count + 1));
              }
              if (position > maxFieldLength) break;
            }
          } finally {
            stream.close();
          }
        }


      }
    }
    Map.Entry<String,Integer>[] sortedHash = getSortedMapEntries(tokenMap);
    for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) {
      Map.Entry<String,Integer> currentEntry = sortedHash[ii];
      message((ii + 1) + ":" + currentEntry.getKey() + " " + currentEntry.getValue());
    }
  }




  /** Provides a list of the top terms of the index.
   *
   * @param field  - the name of the command or null for all of them.
   */
  public void terms(String field) throws IOException {
    TreeMap<String,Integer> termMap = new TreeMap<String,Integer>();
    IndexReader indexReader = IndexReader.open(indexName, true);
    TermEnum terms = indexReader.terms();
    while (terms.next()) {
      Term term = terms.term();
      //message(term.field() + ":" + term.text() + " freq:" + terms.docFreq());
      //if we're either not looking by field or we're matching the specific field
      if ((field == null) || field.equals(term.field()))
        termMap.put(term.field() + ":" + term.text(), Integer.valueOf((terms.docFreq())));
    }


    Iterator<String> termIterator = termMap.keySet().iterator();
    for (int ii = 0; termIterator.hasNext() && ii < 100; ii++) {
      String termDetails = termIterator.next();
      Integer termFreq = termMap.get(termDetails);
      message(termDetails + ": " + termFreq);
    }
    indexReader.close();
  }


  /** Sort Map values
   * @param m the map we're sorting
   * from http://developer.java.sun.com/developer/qow/archive/170/index.jsp
   */
  @SuppressWarnings("unchecked")
  public static <K,V extends Comparable<V>> Map.Entry<K,V>[]
    getSortedMapEntries(Map<K,V> m) {
    Set<Map.Entry<K, V>> set = m.entrySet();
    Map.Entry<K,V>[] entries =
       set.toArray(new Map.Entry[set.size()]);
    Arrays.sort(entries, new Comparator<Map.Entry<K,V>>() {
      public int compare(Map.Entry<K,V> o1, Map.Entry<K,V> o2) {
        V v1 = o1.getValue();
        V v2 = o2.getValue();
        return v2.compareTo(v1); //descending order
      }
    });
    return entries;
  }


    public void analyzer(String word) {
        if ("current".equals(word)) {
            String current = analyzerClassFQN == null ? "StandardAnalyzer" : analyzerClassFQN;
            message("The currently used Analyzer class is: " + current);
            return;
        }
        analyzerClassFQN = word;
        message("Switched to Analyzer class " + analyzerClassFQN);
    }
}
Source Code of lucli.LuceneMethods$CountingCollector

Related Classes of lucli.LuceneMethods$CountingCollector