Package org.apache.solr.handler

Source Code of org.apache.solr.handler.MoreLikeThisHandler$InterestingTerm

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.solr.handler;

import java.io.IOException;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.MoreLikeThisParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.MoreLikeThisParams.TermStyle;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SimpleFacets;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;

/**
* Solr MoreLikeThis --
*
* Return similar documents either based on a single document or based on posted text.
*
* @since solr 1.3
*/
public class MoreLikeThisHandler extends RequestHandlerBase 
{
  // Splits a field list on a single comma or a single space. Because each separator
  // character is matched on its own, adjacent separators (e.g. ", ") produce empty tokens.
  // Pattern is thread safe -- TODO? share this with general 'fl' param
  private static final Pattern splitList = Pattern.compile(",| ");
 
  /**
   * Initializes the handler by delegating to the superclass implementation;
   * no handler-specific setup is performed here.
   *
   * @param args initialization arguments, passed through unchanged
   */
  @Override
  public void init(NamedList args) {
    super.init(args);
  }

  @Override
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
  {
    SolrParams params = req.getParams();
    SolrIndexSearcher searcher = req.getSearcher();
   
   
    MoreLikeThisHelper mlt = new MoreLikeThisHelper( params, searcher );
    List<Query> filters = SolrPluginUtils.parseFilterQueries(req);
   
    // Hold on to the interesting terms if relevant
    TermStyle termStyle = TermStyle.get( params.get( MoreLikeThisParams.INTERESTING_TERMS ) );
    List<InterestingTerm> interesting = (termStyle == TermStyle.NONE )
      ? null : new ArrayList<InterestingTerm>( mlt.mlt.getMaxQueryTerms() );
   
    DocListAndSet mltDocs = null;
    String q = params.get( CommonParams.Q );
   
    // Parse Required Params
    // This will either have a single Reader or valid query
    Reader reader = null;
    try {
      if (q == null || q.trim().length() < 1) {
        Iterable<ContentStream> streams = req.getContentStreams();
        if (streams != null) {
          Iterator<ContentStream> iter = streams.iterator();
          if (iter.hasNext()) {
            reader = iter.next().getReader();
          }
          if (iter.hasNext()) {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                "MoreLikeThis does not support multiple ContentStreams");
          }
        }
      }

      // What fields do we need to return
      String fl = params.get(CommonParams.FL);
      int flags = 0;
      if (fl != null) {
        flags |= SolrPluginUtils.setReturnFields(fl, rsp);
      }

      int start = params.getInt(CommonParams.START, 0);
      int rows = params.getInt(CommonParams.ROWS, 10);

      // Find documents MoreLikeThis - either with a reader or a query
      // --------------------------------------------------------------------------------
      if (reader != null) {
        mltDocs = mlt.getMoreLikeThis(reader, start, rows, filters,
            interesting, flags);
      } else if (q != null) {
        // Matching options
        boolean includeMatch = params.getBool(MoreLikeThisParams.MATCH_INCLUDE,
            true);
        int matchOffset = params.getInt(MoreLikeThisParams.MATCH_OFFSET, 0);
        // Find the base match
        Query query = QueryParsing.parseQuery(q, params.get(CommonParams.DF),
            params, req.getSchema());
        DocList match = searcher.getDocList(query, null, null, matchOffset, 1,
            flags); // only get the first one...
        if (includeMatch) {
          rsp.add("match", match);
        }

        // This is an iterator, but we only handle the first match
        DocIterator iterator = match.iterator();
        if (iterator.hasNext()) {
          // do a MoreLikeThis query for each document in results
          int id = iterator.nextDoc();
          mltDocs = mlt.getMoreLikeThis(id, start, rows, filters, interesting,
              flags);
        }
      } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            "MoreLikeThis requires either a query (?q=) or text to find similar documents.");
      }

    } finally {
      if (reader != null) {
        reader.close();
      }
    }
   
    if( mltDocs == null ) {
      mltDocs = new DocListAndSet(); // avoid NPE
    }
    rsp.add( "response", mltDocs.docList );
   
 
    if( interesting != null ) {
      if( termStyle == TermStyle.DETAILS ) {
        NamedList<Float> it = new NamedList<Float>();
        for( InterestingTerm t : interesting ) {
          it.add( t.term.toString(), t.boost );
        }
        rsp.add( "interestingTerms", it );
      }
      else {
        List<String> it = new ArrayList<String>( interesting.size() );
        for( InterestingTerm t : interesting ) {
          it.add( t.term.text());
        }
        rsp.add( "interestingTerms", it );
      }
    }
   
    // maybe facet the results
    if (params.getBool(FacetParams.FACET,false)) {
      if( mltDocs.docSet == null ) {
        rsp.add( "facet_counts", null );
      }
      else {
        SimpleFacets f = new SimpleFacets(req, mltDocs.docSet, params );
        rsp.add( "facet_counts", f.getFacetCounts() );
      }
    }

    boolean dbg = req.getParams().getBool(CommonParams.DEBUG_QUERY, false);
    // Copied from StandardRequestHandler... perhaps it should be added to doStandardDebug?
    if (dbg) {
      try {
        NamedList<Object> dbgInfo = SolrPluginUtils.doStandardDebug(req, q, mlt.getRawMLTQuery(), mltDocs.docList);
        if (null != dbgInfo) {
          if (null != filters) {
            dbgInfo.add("filter_queries",req.getParams().getParams(CommonParams.FQ));
            List<String> fqs = new ArrayList<String>(filters.size());
            for (Query fq : filters) {
              fqs.add(QueryParsing.toString(fq, req.getSchema()));
            }
            dbgInfo.add("parsed_filter_queries",fqs);
          }
          rsp.add("debug", dbgInfo);
        }
      } catch (Exception e) {
        SolrException.logOnce(SolrCore.log, "Exception during debug", e);
        rsp.add("exception_during_debug", SolrException.toStr(e));
      }
    }
  }
 
  public static class InterestingTerm
  {
    public Term term;
    public float boost;
       
    public static Comparator<InterestingTerm> BOOST_ORDER = new Comparator<InterestingTerm>() {
      public int compare(InterestingTerm t1, InterestingTerm t2) {
        float d = t1.boost - t2.boost;
        if( d == 0 ) {
          return 0;
        }
        return (d>0)?1:-1;
      }
    };
  }
 
  /**
   * Helper class for MoreLikeThis that can be called from other request handlers
   */
  public static class MoreLikeThisHelper
  {
    // Searcher used to execute the generated MoreLikeThis queries
    final SolrIndexSearcher searcher;
    // Lucene MoreLikeThis instance, configured from the request parameters in the constructor
    final MoreLikeThis mlt;
    // Index reader obtained from the searcher
    final IndexReader reader;
    // Unique key field of the schema; used to exclude the source document from its own results
    final SchemaField uniqueKeyField;
    // True when faceting was requested; a DocSet is then collected along with the DocList
    final boolean needDocSet;
    // Per-field boosts parsed from the MoreLikeThisParams.QF parameter
    Map<String,Float> boostFields;
   
    public MoreLikeThisHelper( SolrParams params, SolrIndexSearcher searcher )
    {
      this.searcher = searcher;
      this.reader = searcher.getReader();
      this.uniqueKeyField = searcher.getSchema().getUniqueKeyField();
      this.needDocSet = params.getBool(FacetParams.FACET,false);
     
      SolrParams required = params.required();
      String[] fields = splitList.split( required.get(MoreLikeThisParams.SIMILARITY_FIELDS) );
      if( fields.length < 1 ) {
        throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
            "MoreLikeThis requires at least one similarity field: "+MoreLikeThisParams.SIMILARITY_FIELDS );
      }
     
      this.mlt = new MoreLikeThis( reader ); // TODO -- after LUCENE-896, we can use , searcher.getSimilarity() );
      mlt.setFieldNames(fields);
      mlt.setAnalyzer( searcher.getSchema().getAnalyzer() );
     
      // configurable params
      mlt.setMinTermFreq(       params.getInt(MoreLikeThisParams.MIN_TERM_FREQ,         MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
      mlt.setMinDocFreq(        params.getInt(MoreLikeThisParams.MIN_DOC_FREQ,          MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
      mlt.setMinWordLen(        params.getInt(MoreLikeThisParams.MIN_WORD_LEN,          MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
      mlt.setMaxWordLen(        params.getInt(MoreLikeThisParams.MAX_WORD_LEN,          MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
      mlt.setMaxQueryTerms(     params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS,       MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
      mlt.setMaxNumTokensParsed(params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
      mlt.setBoost(            params.getBool(MoreLikeThisParams.BOOST, false ) );
      boostFields = SolrPluginUtils.parseFieldBoosts(params.getParams(MoreLikeThisParams.QF));
    }
   
    // Queries built by the most recent getMoreLikeThis(...) call; each stays null until assigned.
    // Query returned by MoreLikeThis.like(...)
    private Query rawMLTQuery;
    // Result of getBoostedQuery(rawMLTQuery)
    private Query boostedMLTQuery;
    // boostedMLTQuery plus a MUST_NOT clause on the source document; only assigned by the
    // document-id variant of getMoreLikeThis
    private BooleanQuery realMLTQuery;
   
    /**
     * Returns the query obtained from MoreLikeThis.like(...) during the most recent
     * getMoreLikeThis call, or null if no such call has been made.
     */
    public Query getRawMLTQuery(){
      return rawMLTQuery;
    }
   
    /**
     * Returns the field-boosted query built during the most recent getMoreLikeThis call,
     * or null if no such call has been made.
     */
    public Query getBoostedMLTQuery(){
      return boostedMLTQuery;
    }
   
    /**
     * Returns the query that was actually executed by the document-id variant of
     * getMoreLikeThis (boosted query plus exclusion of the source document), or null if
     * that variant has not been called.
     */
    public Query getRealMLTQuery(){
      return realMLTQuery;
    }
   
    private Query getBoostedQuery(Query mltquery) {
      BooleanQuery boostedQuery = (BooleanQuery)mltquery.clone();
      if (boostFields.size() > 0) {
        List clauses = boostedQuery.clauses();
        for( Object o : clauses ) {
          TermQuery q = (TermQuery)((BooleanClause)o).getQuery();
          Float b = this.boostFields.get(q.getTerm().field());
          if (b != null) {
            q.setBoost(b*q.getBoost());
          }
        }
      }
      return boostedQuery;
    }
   
    public DocListAndSet getMoreLikeThis( int id, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
    {
      Document doc = reader.document(id);
      rawMLTQuery = mlt.like(id);
      boostedMLTQuery = getBoostedQuery( rawMLTQuery );
      if( terms != null ) {
        fillInterestingTermsFromMLTQuery( rawMLTQuery, terms );
      }

      // exclude current document from results
      realMLTQuery = new BooleanQuery();
      realMLTQuery.add(boostedMLTQuery, BooleanClause.Occur.MUST);
      realMLTQuery.add(
          new TermQuery(new Term(uniqueKeyField.getName(), uniqueKeyField.getType().storedToIndexed(doc.getFieldable(uniqueKeyField.getName())))),
            BooleanClause.Occur.MUST_NOT);
     
      DocListAndSet results = new DocListAndSet();
      if (this.needDocSet) {
        results = searcher.getDocListAndSet(realMLTQuery, filters, null, start, rows, flags);
      } else {
        results.docList = searcher.getDocList(realMLTQuery, filters, null, start, rows, flags);
      }
      return results;
    }

    public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
    {
      // analyzing with the first field: previous (stupid) behavior
      rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]);
      boostedMLTQuery = getBoostedQuery( rawMLTQuery );
      if( terms != null ) {
        fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );
      }
      DocListAndSet results = new DocListAndSet();
      if (this.needDocSet) {
        results = searcher.getDocListAndSet( boostedMLTQuery, filters, null, start, rows, flags);
      } else {
        results.docList = searcher.getDocList( boostedMLTQuery, filters, null, start, rows, flags);
      }
      return results;
    }

    @Deprecated
    public NamedList<DocList> getMoreLikeThese( DocList docs, int rows, int flags ) throws IOException
    {
      IndexSchema schema = searcher.getSchema();
      NamedList<DocList> mlt = new SimpleOrderedMap<DocList>();
      DocIterator iterator = docs.iterator();
      while( iterator.hasNext() ) {
        int id = iterator.nextDoc();
       
        DocListAndSet sim = getMoreLikeThis( id, 0, rows, null, null, flags );
        String name = schema.printableUniqueKey( reader.document( id ) );

        mlt.add(name, sim.docList);
      }
      return mlt;
    }
   
    private void fillInterestingTermsFromMLTQuery( Query query, List<InterestingTerm> terms )
    {
      List clauses = ((BooleanQuery)query).clauses();
      for( Object o : clauses ) {
        TermQuery q = (TermQuery)((BooleanClause)o).getQuery();
        InterestingTerm it = new InterestingTerm();
        it.boost = q.getBoost();
        it.term = q.getTerm();
        terms.add( it );
      }
      // alternatively we could use
      // mltquery.extractTerms( terms );
    }
   
    /**
     * Returns the underlying Lucene MoreLikeThis instance configured by the constructor.
     */
    public MoreLikeThis getMoreLikeThis()
    {
      return mlt;
    }
  }
 
 
  //////////////////////// SolrInfoMBeans methods //////////////////////

  /**
   * Returns the revision keyword string recorded for this source file.
   */
  @Override
  public String getVersion() {
    return "$Revision: 1164331 $";
  }

  /**
   * Returns a short human-readable description of this handler.
   */
  @Override
  public String getDescription() {
    return "Solr MoreLikeThis";
  }

  /**
   * Returns the id keyword string (file name, revision, date, author) recorded for this source file.
   */
  @Override
  public String getSourceId() {
    return "$Id: MoreLikeThisHandler.java 1164331 2011-09-02 02:14:28Z koji $";
  }

  /**
   * Returns the URL keyword string giving the repository location of this source file.
   */
  @Override
  public String getSource() {
    return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_3_5/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java $";
  }

  @Override
  public URL[] getDocs() {
    try {
      return new URL[] { new URL("http://wiki.apache.org/solr/MoreLikeThis") };
    }
    catch( MalformedURLException ex ) { return null; }
  }
}
TOP

Related Classes of org.apache.solr.handler.MoreLikeThisHandler$InterestingTerm

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.