/*
* Copyright (c) 2009 Andrejs Jermakovics.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Andrejs Jermakovics - initial implementation
*/
package it.unibz.instasearch.indexing;
import it.unibz.instasearch.InstaSearchPlugin;
import java.io.IOException;
import java.util.Collection;
import java.util.Locale;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.eclipse.core.resources.IFile;
import org.eclipse.core.resources.IWorkspaceRoot;
import org.eclipse.core.runtime.IPath;
import org.eclipse.core.runtime.Path;
/**
 * A single search hit: wraps the Lucene {@link Document} together with its
 * document id and score, and lazily derives per-term statistics
 * (term frequencies and tf-idf scores) from the index when they are requested.
 */
public class SearchResultDoc {

	private final Document doc;
	private final int docId;
	private final float score;
	private final Directory indexDir;

	/** Term vector of the contents field; loaded from the index on first use, may stay null. */
	private TermFreqVector termFreqVector;
	/** tf-idf score per term, positions aligned with {@link #termFreqVector}; computed on first use. */
	private float[] termScoreVector;
	private int matchCount;

	/**
	 * @param dir index directory used to open readers when term statistics are requested
	 * @param doc the matched document
	 * @param docId id of the document in the index
	 * @param score search score of the document
	 */
	public SearchResultDoc(Directory dir, Document doc, int docId, float score) {
		this.indexDir = dir;
		this.docId = docId;
		this.doc = doc;
		this.score = score;
		this.matchCount = 0;
	}

	private String getFieldValue(Field field) {
		return doc.get(field.toString());
	}

	/**
	 * @return value of the file field of the document
	 */
	public String getFilePath() {
		return getFieldValue(Field.FILE);
	}

	/**
	 * @return value of the name field of the document
	 */
	public String getFileName() {
		return getFieldValue(Field.NAME);
	}

	/**
	 * @return value of the extension field of the document
	 */
	public String getFileExtension() {
		return getFieldValue(Field.EXT);
	}

	/**
	 * @return true if the document has a jar field whose value ends with ".jar" (case-insensitive)
	 */
	public boolean isInJar() {
		if( doc.getField(Field.JAR.toString()) == null )
			return false;

		String jarField = getFieldValue(Field.JAR);

		// the field can exist without a string value; treat that like "no jar"
		if( jarField == null || StorageIndexer.NO_VALUE.equals(jarField) )
			return false;

		return jarField.toLowerCase(Locale.ENGLISH).endsWith(".jar");
	}

	/**
	 * @return value of the jar field, or null if the document is not in a jar
	 */
	public String getJarName() {
		if( isInJar() )
			return getFieldValue(Field.JAR);

		return null;
	}

	/**
	 * @return path built from the project field of the document
	 */
	public IPath getProject() {
		return new Path(getFieldValue(Field.PROJ));
	}

	/**
	 * @return last segment of the project path
	 */
	public String getProjectName() {
		return getProject().lastSegment();
	}

	/**
	 * Returns the tf-idf scores of all terms of this document, computing and
	 * caching them on the first call.
	 *
	 * @return score vector aligned with the term frequency vector;
	 *         an empty array if the document has no term vector
	 * @throws IOException if the index cannot be read
	 */
	private float[] getTermScoreVector() throws IOException
	{
		if( termScoreVector == null ) {
			IndexReader reader = IndexReader.open(indexDir, true);
			try {
				if( termFreqVector == null )
					createFreqVect(reader);

				termScoreVector = createTermScoreVector(termFreqVector, reader);
			} finally {
				reader.close(); // release the reader even when reading the index fails
			}
		}

		return termScoreVector;
	}

	/**
	 * Returns the term frequency vector of the contents field, loading it from
	 * the index if it has not been loaded yet.
	 *
	 * @return the term frequency vector, or null if the index returned none for this document
	 * @throws IOException if the index cannot be read
	 */
	private TermFreqVector getTermFreqVector() throws IOException
	{
		if( termFreqVector == null ) {
			IndexReader reader = IndexReader.open(indexDir, true);
			try {
				createFreqVect(reader);
			} finally {
				reader.close(); // release the reader even when reading the index fails
			}
		}

		return termFreqVector;
	}

	private void createFreqVect(IndexReader reader) throws IOException
	{
		termFreqVector = reader.getTermFreqVector(docId, Field.CONTENTS.toString()); // obtain only when requested
	}

	/**
	 * Returns a vector of given term scores (tf-idf).
	 * The size of the vector is the number of terms in this document.
	 * The term positions in the vector are the same as in the term frequency vector.
	 * Positions of terms that are not among the given ones are left at 0.
	 *
	 * @param terms terms whose scores should be filled in
	 * @return TermScoreVector; an empty array if the document has no term vector
	 * @throws IOException if the index cannot be read
	 */
	public float[] getTermScoreVector(Collection<String> terms) throws IOException
	{
		float[] allTermScoreVect = getTermScoreVector();
		float[] termScoreVect = new float[allTermScoreVect.length];

		TermFreqVector freqVector = getTermFreqVector();
		if( freqVector == null )
			return termScoreVect; // no term vector available: nothing to look up

		for(String term: terms) {
			int idx = freqVector.indexOf(term); // does a binary search
			if( idx == -1 ) continue;

			termScoreVect[idx] = allTermScoreVect[idx];
		}

		return termScoreVect;
	}

	/**
	 * Returns the tf-idf score of a single term in this document.
	 *
	 * @param term term to look up
	 * @return score of the term, or 0 if the term does not occur or the document has no term vector
	 * @throws IOException if the index cannot be read
	 */
	public double getTermScore(String term) throws IOException
	{
		float[] allTermScoreVect = getTermScoreVector();

		TermFreqVector freqVector = getTermFreqVector();
		if( freqVector == null ) return 0;

		int idx = freqVector.indexOf(term); // does a binary search
		if( idx == -1 ) return 0;

		return allTermScoreVect[idx];
	}

	/**
	 * Computes tf * idf for every term of the given vector using
	 * {@code Searcher.SIMILARITY}.
	 *
	 * @param vect term frequency vector, may be null
	 * @param reader reader used to look up document frequencies
	 * @return one score per term, or an empty array if vect is null
	 * @throws IOException if the index cannot be read
	 */
	private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException
	{
		if( vect == null )
			return new float[0];

		int[] termFrequencies = vect.getTermFrequencies();
		String[] terms = vect.getTerms();
		float[] scores = new float[terms.length];

		int numDocs = reader.maxDoc();
		Similarity sim = Searcher.SIMILARITY;
		String contentsField = Field.CONTENTS.toString();

		for(int i = 0; i < terms.length; i++) {
			Term term = new Term(contentsField, terms[i]);

			float termFreq = sim.tf( termFrequencies[i] );
			float idf = sim.idf(reader.docFreq(term), numDocs);

			scores[i] = termFreq * idf;
		}

		return scores;
	}

	/**
	 * Resolves the workspace file of this document.
	 *
	 * @return null if the document is in a jar; otherwise the file obtained from
	 *         the workspace root for the stored path, falling back to a lookup by
	 *         location when the first handle has no raw location (the fallback
	 *         result is returned as is and may be null)
	 */
	public IFile getFile()
	{
		if( isInJar() ) return null;

		Path path = new Path(getFilePath());
		IWorkspaceRoot workspaceRoot = InstaSearchPlugin.getWorkspaceRoot();

		IFile file = workspaceRoot.getFile(path);

		if( file == null || file.getRawLocation() == null )
			file = workspaceRoot.getFileForLocation(path);

		return file;
	}

	/**
	 * @return the score
	 */
	public float getScore()
	{
		return score;
	}

	/**
	 * @return the doc
	 */
	public Document getDoc()
	{
		return doc;
	}

	/**
	 * @return the docId
	 */
	public int getDocId()
	{
		return docId;
	}

	/**
	 * @return the matchCount; 0 until {@link #computeMatchCount(IndexReader, Collection)} has set it
	 */
	public int getMatchCount()
	{
		return matchCount;
	}

	/**
	 * Computes match count as SUM( tf ) of all query terms in the document.
	 * Accesses the index thus affects performance.
	 * Leaves the match count unchanged if the document has no term vector.
	 *
	 * @param reader reader used to load the term vector if it is not loaded yet; not closed here
	 * @param queryTerms terms whose frequencies are summed
	 * @throws IOException if the index cannot be read
	 */
	public void computeMatchCount(IndexReader reader, Collection<String> queryTerms) throws IOException
	{
		if( termFreqVector == null )
			createFreqVect(reader);

		if( termFreqVector == null )
			return;

		int[] freqs = termFreqVector.getTermFrequencies();
		int freqSum = 0;

		for(String term: queryTerms) {
			int idx = termFreqVector.indexOf(term); // does a binary search
			if( idx == -1 ) continue;

			freqSum += freqs[idx];
		}

		matchCount = freqSum;
	}

	@Override
	public String toString() {
		return getFilePath();
	}
}