// Source Code of org.apache.nutch.summary.lucene.LuceneSummarizer

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.summary.lucene;


// JDK imports
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;


// Hadoop imports
import org.apache.hadoop.conf.Configuration;


// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.WeightedTerm;


// Nutch imports
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Summarizer;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.searcher.Summary.Ellipsis;
import org.apache.nutch.searcher.Summary.Fragment;
import org.apache.nutch.searcher.Summary.Highlight;




/** Implements hit summarization. */
public class LuceneSummarizer implements Summarizer {
  
  private final static String SEPARATOR = "###";
  private final static Formatter FORMATTER =
          new SimpleHTMLFormatter(SEPARATOR, SEPARATOR);


  /** Converts text to tokens. */
  private Analyzer analyzer = null;
  private Configuration conf = null;
  
  public LuceneSummarizer() { }
  
  private LuceneSummarizer(Configuration conf) {
    setConf(conf);
  }
  
  
  /* ----------------------------- *
   * <implementation:Configurable> *
   * ----------------------------- */
  
  public Configuration getConf() {
    return conf;
  }
  
  public void setConf(Configuration conf) {
    this.conf = conf;
    this.analyzer = new NutchDocumentAnalyzer(conf);
  }
  
  /* ------------------------------ *
   * </implementation:Configurable> *
   * ------------------------------ */
  
  
  /* --------------------------- *
   * <implementation:Summarizer> *
   * --------------------------- */
  
  public Summary getSummary(String text, Query query) {


    String[] terms = query.getTerms();
    WeightedTerm[] weighted = new WeightedTerm[terms.length];
    for (int i=0; i<terms.length; i++) {
      weighted[i] = new WeightedTerm(1.0f, terms[i]);
    }
    Highlighter highlighter = new Highlighter(FORMATTER, new QueryScorer(weighted));
    TokenStream tokens = analyzer.tokenStream("content", new StringReader(text));
    Summary summary = new Summary();
    try {
      // TODO : The max number of fragments (3) should be configurable
      String[] result = highlighter.getBestFragments(tokens, text, 3);
      for (int i=0; i<result.length; i++) {
        String[] parts = result[i].split(SEPARATOR);
        boolean highlight = false;
        for (int j=0; j<parts.length; j++) {
          if (highlight) {
            summary.add(new Highlight(parts[j]));
          } else {
            summary.add(new Fragment(parts[j]));
          }
          highlight = !highlight;
        }
        summary.add(new Ellipsis());
      }
      
      /* TODO MC  BUG resolved 0000029 - if query terms do not occur on text, an empty summary is returned. Now it sends the first tokens. */
      if (result==null || result.length==0) {
        tokens = analyzer.tokenStream("content", new StringReader(text));
              
        Token firstToken=null, lastToken=null;
        Token token=null;
        int maxLen=100; // the same as defined in SimpleFragmenter but it is private
        
        /*
        ArrayList<Token> titleTokens=new ArrayList<Token>();
        ArrayList<Token> textTokens=new ArrayList<Token>();
        boolean titleMatched=false;
        boolean hasMatched=false; // exit match after match title the first time             
        
        // remove title from text. compares pairs of text
        while ((titleMatched || !hasMatched) && (token=tokens.next())!=null) {
          
          if (token.type().equals("<WORD>")) {
          
            if (titleTokens.size()==0) {
              titleTokens.add(token);
            }
            else if (textTokens.size()<titleTokens.size()) {
              textTokens.add(token);
            }
          
            if (textTokens.size()==titleTokens.size()) {
              // compare
              titleMatched=true;
              for (int i=0;i<textTokens.size() && titleMatched;i++) {
                if (!textTokens.get(i).termText().equals(titleTokens.get(i).termText())) {
                  titleMatched=false;    
                }                
              }
              if (titleMatched) { // try to match a larger pattern
                titleTokens.add(textTokens.get(0));
                textTokens.remove(0);
                hasMatched=true;
              }
              else { // remove rest of title from text
                if (hasMatched) {
                  firstToken=textTokens.get(titleTokens.size()-2);                                  
                }
                else { // add one more token to title
                  titleTokens.add(textTokens.get(0));
                    textTokens.remove(0);
                }
              }
            }
          }        
        }
        
        if (textTokens.size()==0) {
          return summary;
        }
                              
        for (int i=0;i<textTokens.size() && textTokens.get(i).endOffset()-firstToken.startOffset()<maxLen;i++) {
          lastToken=textTokens.get(i);
        }
        */
                      
        // read tokens until maxLen
        while ((token=tokens.next())!=null) {        
          if (token.type().equals("<WORD>")) {
            if (firstToken==null) {
              firstToken=token;
            }
            else if (token.endOffset()-firstToken.startOffset()<maxLen) {          
              lastToken=token;                          
            }                    
            else {
              break;
            }
          }
        }        
        if (lastToken==null) {
          lastToken=firstToken;
        }
        
        summary.add(new Fragment(text.substring(firstToken.startOffset(), lastToken.endOffset())));
        summary.add(new Ellipsis());
      }
      /* TODO MC */
      
    } catch (Exception e) {
      // Nothing to do...
    }
    return summary;
  }


  /* ---------------------------- *
   * </implementation:Summarizer> *
   * ---------------------------- */
  
}
// Source Code of org.apache.nutch.summary.lucene.LuceneSummarizer

// Related Classes of org.apache.nutch.summary.lucene.LuceneSummarizer