Package com.tamingtext.classifier.mlt

Source Code of com.tamingtext.classifier.mlt.MoreLikeThisQueryTest

/*
* Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
*
*    Licensed under the Apache License, Version 2.0 (the "License");
*    you may not use this file except in compliance with the License.
*    You may obtain a copy of the License at
*
*        http://www.apache.org/licenses/LICENSE-2.0
*
*    Unless required by applicable law or agreed to in writing, software
*    distributed under the License is distributed on an "AS IS" BASIS,
*    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*    See the License for the specific language governing permissions and
*    limitations under the License.
* -------------------
* To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
* http://www.manning.com/ingersoll
*/

package com.tamingtext.classifier.mlt;

import java.io.File;
import java.io.FileReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;


public class MoreLikeThisQueryTest {
  int nGramSize;
  String inputPath;
  String modelPath;
  int maxResults;
  String categoryFieldName;
 
  @Before
  public void setup() {
    nGramSize = 1;
    inputPath = "src/test/resources/classifier/mlt/sample-input.txt";
    modelPath = "src/test/resources/classifier/mlt/sample-model";
    maxResults = 100;
    categoryFieldName = "category";
  }
  @Test
  public void testMoreLikeThisQuery() throws Exception {
    //<start id="lucene.examples.mlt.setup"/>
    Directory directory = FSDirectory.open(new File(modelPath));
   
    IndexReader indexReader = IndexReader.open(directory); //<co id="mlt.indexsetup"/>
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);

    Analyzer analyzer //<co id="mlt.analyzersetup"/>
      = new EnglishAnalyzer(Version.LUCENE_36);
   
    if (nGramSize > 1) { //<co id="mlt.ngramsetup"/>
      analyzer = new ShingleAnalyzerWrapper(analyzer, nGramSize,
              nGramSize);
    }
   
    MoreLikeThis moreLikeThis  = new MoreLikeThis(indexReader); //<co id="mlt.configure"/>
    moreLikeThis.setAnalyzer(analyzer);
    moreLikeThis.setFieldNames(new String[] {
      "content"
    });
   
/*<calloutlist>
<callout arearefs="mlt.indexsetup">Open Index</callout>
<callout arearefs="mlt.analyzersetup">Setup Analyzer</callout>
<callout arearefs="mlt.ngramsetup">Setup NGrams</callout>
<callout arearefs="mlt.configure">Create <classname>MoreLikeThis</classname></callout>
</calloutlist>*/
    //<end id="lucene.examples.mlt.setup"/>
   
    // for testing against the same corpus
    moreLikeThis.setMinTermFreq(1);
    moreLikeThis.setMinDocFreq(1);
   
    //<start id="lucene.examples.mlt.query"/>
    Reader reader = new FileReader(inputPath); //<co id="mlt.query"/>
    Query query = moreLikeThis.like(reader);

    TopDocs results
      = indexSearcher.search(query, maxResults); //<co id="mlt.search"/>
   
    HashMap<String, CategoryHits> categoryHash
      = new HashMap<String, CategoryHits>();
   
    for (ScoreDoc sd: results.scoreDocs) { //<co id="mlt.collect"/>
      Document d = indexReader.document(sd.doc);
      Fieldable f = d.getFieldable(categoryFieldName);
      String cat = f.stringValue();
      CategoryHits ch = categoryHash.get(cat);
      if (ch == null) {
        ch = new CategoryHits();
        ch.setLabel(cat);
        categoryHash.put(cat, ch);
      }
      ch.incrementScore(sd.score);
    }

    SortedSet<CategoryHits> sortedCats //<co id="mlt.rank"/>
      = new TreeSet<CategoryHits>(CategoryHits.byScoreComparator());
    sortedCats.addAll(categoryHash.values());
   
    for (CategoryHits c: sortedCats) { //<co id="mlt.display"/>
      System.out.println(
          c.getLabel() + "\t" + c.getScore());
    }
    /*<calloutlist>
    <callout arearefs="mlt.query">Create Query</callout>
    <callout arearefs="mlt.search">Perform Search</callout>
    <callout arearefs="mlt.collect">Collect Results</callout>
    <callout arearefs="mlt.rank">Rank Categories</callout>
    <callout arearefs="mlt.display">Display Categories</callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.query"/>
   
   
   
  }
}
TOP

Related Classes of com.tamingtext.classifier.mlt.MoreLikeThisQueryTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.