Package com.esri.gpt.catalog.lucene.stats

Source Code of com.esri.gpt.catalog.lucene.stats.SingleFieldStats

/* See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* Esri Inc. licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.esri.gpt.catalog.lucene.stats;
import com.esri.gpt.framework.security.metadata.MetadataAcl;
import com.esri.gpt.framework.util.Val;

import java.io.IOException;
import java.io.PrintWriter;
import java.net.URLEncoder;
import java.util.List;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.OpenBitSet;

/**
* Provides statistics associated with a single field.
* <p/>
* Statistics are unavailable until collected.
* <p/>
* Statistics provided are:
* <ul>
*   <li>the number of documents considered during statistics collection</li>
*   <li>the number of documents that contain the field</li>
*   <li>the total number of terms indexed for this field across all documents</li>
*   <li>the list of frequencies per term for this field</li>
* </ul>
*/
public class SingleFieldStats extends Collectable {

  /** instance variables ====================================================== */
  private String               fieldName;
  private int                  maxRecords = 100;
  private int                  maxFrequency = -1;
  private int                  minFrequency = 1;
  private long                 numberOfDocsWithField = 0;
  private FrequencyAccumulator termAccumulator = new FrequencyAccumulator();
 
  /** constructors ============================================================ */
 
  /**
   * Construct with a supplied field name.
   * @param fieldName the field name
   */
  public SingleFieldStats(String fieldName) {
    super();
    this.fieldName = fieldName;
  }
 
  /**
   * Constructs with a supplied field name and min/max thresholds.
   * @param fieldName the field name
   * @param maxRecords the maximum number of records to return
   * @param minFrequency the minimum term frequency to consider
   * @param maxFrequency the maximum term frequency to consider
   */
  public SingleFieldStats(String fieldName, int maxRecords, int minFrequency, int maxFrequency) {
    super();
    this.fieldName = fieldName;
    if (maxRecords >= 0) {
      if (maxRecords > 10000) maxRecords = 10000;
      this.maxRecords = maxRecords;
    }
    if (minFrequency > 0) {
      this.minFrequency = minFrequency;
    }
    if (maxFrequency > 0) {
      this.maxFrequency = maxFrequency;
    }
  }
 
  /** properties  ============================================================= */
 
  /**
   * Gets the field name.
   * @return the field name
   */
  private String getFieldName() {
    return this.fieldName;
  }
 
  /**
   * Gets the number of documents containing this field.
   * @return the number of documents
   */
  private long getNumberOfDocsWithField() {
    return this.numberOfDocsWithField;
  }
 
  /**
   * Gets the list of frequencies per term for this field.
   * <br/>Each member will be named by term and counted by term frequency.
   * @return the term frequencies
   */
  private List<NamedFrequency> getTermFrequencies() {
    return this.termAccumulator.getFrequencies();
  }
 
  /**
   * Gets the total number of terms indexed for this field across all documents.
   * @return the total number of terms
   */
  private long getTotalNumberOfTerms() {
    return this.termAccumulator.getTotalFrequency();
  }
 
  /** methods ================================================================= */
 
  /**
   * Executes the collection of statistics.
   * @param request the active statistics request
   * @param reader the index reader
   * @throws IOException if an error occurs while communicating with the index
   */
  public void collectStats(StatsRequest request, IndexReader reader) throws IOException {
    long t1 = System.currentTimeMillis();
    TermEnum termEnum = null;
    TermDocs termDocs = null;

    try {
     
      OpenBitSet documentFilterBitSet = request.getDocumentFilterBitSet();
      OpenBitSet docsWithFieldBitSet = new OpenBitSet(reader.maxDoc());
      boolean isUnfiltered = (documentFilterBitSet == null);
      boolean checkMaxFreq = (this.maxFrequency > 0);
      boolean checkMinFreq = (this.minFrequency > 0);

      // return if there are no stats to collect
      if (this.determineNumberOfDocsConsidered(reader,documentFilterBitSet) <= 0) {
        return;
      } else if (!request.isFieldCollectable(this.fieldName)){
        return;
      }
     
      boolean checkTermDocs = true;
      if (isUnfiltered) {
        MetadataAcl acl = new MetadataAcl(request.getRequestContext());
        if (acl.isPolicyUnrestricted()) {
          if (this.getNumberOfDocsConsidered() > 25000) {
            checkTermDocs = false;
          }
        }
      }
     
      // accumulate term frequencies per field
      termEnum = reader.terms(new Term(this.fieldName));
      termDocs = reader.termDocs();
      do {
        Term term = termEnum.term();
        if (term != null && term.field().equals(this.fieldName)) {
         
          if (checkTermDocs) {
            termDocs.seek(term);
            long count = 0;
            while (termDocs.next()) {
              int docId = termDocs.doc();             
              boolean bSet = isUnfiltered || documentFilterBitSet.fastGet(docId);
              if (bSet) {
                docsWithFieldBitSet.fastSet(docId);
                count++;
              }
            }
            if ((!checkMaxFreq || (count <= this.maxFrequency)) &&
                (!checkMinFreq || (count >= this.minFrequency))) {
              this.termAccumulator.add(term.text(),count);
            }
           
          } else {
            long count = termEnum.docFreq();
            if ((!checkMaxFreq || (count <= this.maxFrequency)) &&
                (!checkMinFreq || (count >= this.minFrequency))) {
              this.termAccumulator.add(term.text(),count);
            }
          }
         
        } else {
          break;
        }
      } while (termEnum.next());
     
      // sort
      this.numberOfDocsWithField = docsWithFieldBitSet.cardinality();
      if (Val.chkStr(request.getSortBy()).equalsIgnoreCase("name")) {
        this.termAccumulator.sortByName();
      } else {
        this.termAccumulator.sortByFrequency();
      }
     
    } finally {
      try {if (termEnum != null) termEnum.close();} catch (Exception ef) {}
      try {if (termDocs != null) termDocs.close();} catch (Exception ef) {}
      this.setTimeMillis(System.currentTimeMillis() - t1);
    }  
   
    // print
    if (request.getResponseWriter() != null) {
      this.print(request);
    }
  
  }
 
  /**
   * Prints collected statistics.
   * @param request the active statistics request
   */
  private void print(StatsRequest request) throws IOException {
    PrintWriter writer = request.getResponseWriter();
    String baseUrl = request.getBaseStatsUrl();
    String baseQueryUrl = request.getBaseQueryUrl();
    String callbackUrl = baseUrl+"/fields?field="+this.fieldName;
   
    int max = this.maxRecords;
    int numToReturn = this.getTermFrequencies().size();
    if ((this.maxRecords >= 0) && (this.maxRecords < numToReturn)) {
      numToReturn = max;
    }
    String sMaxFreq = "none";
    if (this.maxFrequency > 0) {
      sMaxFreq = ""+this.maxFrequency;
    }
    
    /*
    writer.println("timeMillis="+this.getTimeMillis());
    writer.println("numberOfDocsConsidered="+this.getNumberOfDocsConsidered());
    writer.println("fieldName="+this.getFieldName());
    writer.println("numberOfDocsWithField="+this.getNumberOfDocsWithField());
    writer.println("totalNumberOfTerms="+getTotalNumberOfTerms());
   
    writer.println("....................");
    List<NamedFrequency> frequencies = this.getTermFrequencies();
    for (NamedFrequency frequency: frequencies) {
      writer.println("frequency="+frequency.getFrequency()+", term="+frequency.getName());
    }
    */
   
    if (request.getResponseFormat().equalsIgnoreCase("json")) {
      writer.println("{");
      writer.println("  \"field\": \""+Val.escapeStrForJson(this.fieldName)+"\",");
      writer.println("  \"documentsIndexed\": "+this.getNumberOfDocsConsidered()+",");
      //writer.println("  \"numberOfDocsWithField\": "+this.getNumberOfDocsWithField()+",");
      writer.println("  \"totalNumberOfTerms\": "+this.getTermFrequencies().size()+",");
      writer.println("  \"numberOfTermsListed\": "+numToReturn+",");
      writer.println("  \"minFrequencyConsidered\": "+this.minFrequency+",");
      writer.println("  \"maxFrequencyConsidered\": "+this.maxFrequency+",");
    
      writer.println("  \"terms\": [");
      List<NamedFrequency> frequencies = this.getTermFrequencies();
      int count = 0;
      if (numToReturn > 0) {
        for (NamedFrequency frequency: frequencies) {
          count++;
          boolean isLast = (count >= numToReturn);       
          writer.print("    {");
          writer.print("\"name\": \""+Val.escapeStrForJson(frequency.getName())+"\"");
          writer.print(", \"documents\": "+frequency.getFrequency());
          if (!isLast) {
            writer.println("},");
          } else {
            writer.println("}");
            break;
          }
        }
      }
      writer.println("  ]");
      writer.println("}");
      writer.flush();
     
    } else {
   
      // navigation
      writer.println("<p><a href=\""+baseUrl+"/fields\">Fields</a>");
      //writer.println("&nbsp;&nbsp;<a href=\""+baseUrl+"/terms\">Terms</a></p>");
     
      // summary
      writer.println("<p>");
      writer.println("<strong>Field:</strong>&nbsp;"+Val.escapeXmlForBrowser(this.fieldName));
      writer.println("<br/><strong>Documents indexed:</strong>&nbsp;"+this.getNumberOfDocsConsidered());
      //writer.println("<br/><strong>Documents containing field:</strong>&nbsp;"+this.getNumberOfDocsWithField());
      writer.println("<br/><strong>Total number of terms:</strong>&nbsp;"+this.getTermFrequencies().size());
      writer.println("<br/><strong>Number of terms listed:</strong>&nbsp;"+numToReturn);
      writer.println("<br/><strong>Minimum frequency considered:</strong>&nbsp;"+this.minFrequency);
      writer.println("<br/><strong>Maximum frequency considered:</strong>&nbsp;"+sMaxFreq);
      writer.println("</p>");
     
      // statistics table
      if (numToReturn > 0) {
        writer.println("<table border='1'>");
        writer.println("<thead><tr>");
        writer.println("<th><a href=\""+callbackUrl+"&amp;sortBy=name\">Term</a></th>");
        writer.println("<th><a href=\""+callbackUrl+"\">Documents</a></th>");
        writer.println("</tr></thead>");
        writer.println("<tbody>");
        List<NamedFrequency> frequencies = this.getTermFrequencies();
        int count = 0;
        for (NamedFrequency frequency: frequencies) {
          count++;
          boolean isLast = (count >= numToReturn);
          writer.print("<tr>");
         
          String encTerm = URLEncoder.encode(frequency.getName(),"UTF-8");
          //String escTerm = Val.escapeXmlForBrowser(frequency.getName());
          String escTerm = Val.escapeXmlForBrowser(encTerm);
          String href1 = baseUrl+"/terms?term="+escTerm;
          writer.print("<td>");
          //writer.print("<a href=\""+href1+"\">");
          writer.print(Val.escapeXmlForBrowser(frequency.getName()));
          //writer.print("</a>");
          writer.print("</td>");
         
          String q = this.fieldName+":"+QueryParser.escape(frequency.getName());
          String href2 = baseQueryUrl+"?f=html&searchText="+URLEncoder.encode(q,"UTF-8");
          writer.print("<td style=\"text-align:right;\">");
          writer.print("<a href=\""+Val.escapeXmlForBrowser(href2)+"\">");
          writer.print(frequency.getFrequency());
          writer.print("</a></td>");
             
          writer.println("</tr>");
          //writer.flush();
          if (isLast) break;
        }
      }
      writer.println("</tbody>");
      writer.println("</table>");
      writer.flush();
    }
  }
 
}
TOP

Related Classes of com.esri.gpt.catalog.lucene.stats.SingleFieldStats

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.