/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.jackrabbit.core.query.lucene;
import org.apache.jackrabbit.core.NodeId;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.search.Query;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Set;
import java.util.HashSet;
import java.util.Iterator;
/**
 * <code>DefaultXMLExcerpt</code> implements an {@link ExcerptProvider} that
 * builds an excerpt from the stored fulltext of a node and wraps matching
 * query terms in <code>&lt;highlight&gt;</code> elements.
 * <p>
 * The excerpt is only created by this class when the fulltext field is stored
 * in the index and a {@link TermPositionVector} is available for it. In all
 * other cases the work is delegated to a {@link SimpleExcerptProvider}.
 */
class DefaultXMLExcerpt implements ExcerptProvider {

    /**
     * Logger instance for this class.
     */
    private static final Logger log = LoggerFactory.getLogger(DefaultXMLExcerpt.class);

    /**
     * The search index.
     */
    private SearchIndex index;

    /**
     * The current query. Replaced by its rewritten form on the first call to
     * {@link #getExcerpt(NodeId, int, int)}.
     */
    private Query query;

    /**
     * Indicates whether the query is already rewritten.
     */
    private boolean rewritten = false;

    /**
     * {@inheritDoc}
     */
    public void init(Query query, SearchIndex index) throws IOException {
        this.index = index;
        this.query = query;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Returns <code>null</code> if no document with the UUID of
     * <code>id</code> is found in the index.
     */
    public String getExcerpt(NodeId id, int maxFragments, int maxFragmentSize)
            throws IOException {
        IndexReader reader = index.getIndexReader();
        try {
            // the query is rewritten lazily, once, against the first reader
            if (!rewritten) {
                query = query.rewrite(reader);
                rewritten = true;
            }

            // look up the lucene document for the node
            Term idTerm = new Term(FieldNames.UUID, id.getUUID().toString());
            TermDocs tDocs = reader.termDocs(idTerm);
            int docNumber;
            Document doc;
            try {
                if (!tDocs.next()) {
                    // node not found in index
                    return null;
                }
                docNumber = tDocs.doc();
                doc = reader.document(docNumber);
            } finally {
                tDocs.close();
            }

            Field[] fields = doc.getFields(FieldNames.FULLTEXT);
            // depending on the lucene version a missing field is reported
            // either as null or as an empty array; both mean that there is
            // no stored text that could be highlighted
            if (fields == null || fields.length == 0) {
                log.debug("Fulltext field not stored, using {}",
                        SimpleExcerptProvider.class.getName());
                return createSimpleExcerpt(id, maxFragments, maxFragmentSize);
            }

            String text = concatenateFulltext(fields);

            TermFreqVector tfv = reader.getTermFreqVector(
                    docNumber, FieldNames.FULLTEXT);
            if (tfv instanceof TermPositionVector) {
                return createExcerpt((TermPositionVector) tfv, text,
                        maxFragments, maxFragmentSize);
            }
            log.debug("No TermPositionVector on Fulltext field, using {}",
                    SimpleExcerptProvider.class.getName());
            return createSimpleExcerpt(id, maxFragments, maxFragmentSize);
        } finally {
            reader.close();
        }
    }

    /**
     * Delegates excerpt creation to a fresh {@link SimpleExcerptProvider}
     * that is initialized with the current query and index.
     *
     * @param id              the id of the node to create the excerpt for.
     * @param maxFragments    the maximum number of fragments to create.
     * @param maxFragmentSize the maximum number of characters in a fragment.
     * @return the excerpt as returned by the simple excerpt provider.
     * @throws IOException if an error occurs while creating the excerpt.
     */
    private String createSimpleExcerpt(NodeId id,
                                       int maxFragments,
                                       int maxFragmentSize)
            throws IOException {
        SimpleExcerptProvider exProvider = new SimpleExcerptProvider();
        exProvider.init(query, index);
        return exProvider.getExcerpt(id, maxFragments, maxFragmentSize);
    }

    /**
     * Joins the string values of the given fulltext fields into a single
     * text, separated by a single space character.
     *
     * @param fields the fulltext fields of a document.
     * @return the concatenated text.
     */
    private static String concatenateFulltext(Field[] fields) {
        StringBuffer text = new StringBuffer();
        String separator = "";
        for (int i = 0; i < fields.length; i++) {
            text.append(separator);
            text.append(fields[i].stringValue());
            // this is a hack! in general multiple fields with the same
            // name are handled properly, that is, offset and position is
            // calculated correctly. there is one case however where
            // the offset gets wrong:
            // if a term text ends with characters that are considered noise
            // then the offset of the next field will be off by the number
            // of noise characters.
            // therefore we delete noise characters at the end of the text
            for (int j = text.length() - 1; j >= 0; j--) {
                if (Character.isLetterOrDigit(text.charAt(j))) {
                    break;
                }
                text.deleteCharAt(j);
            }
            separator = " ";
        }
        return text.toString();
    }

    /**
     * Creates an excerpt for the given <code>text</code> using token offset
     * information provided by <code>tpv</code>.
     *
     * @param tpv             the term position vector for the fulltext field.
     * @param text            the original text.
     * @param maxFragments    the maximum number of fragments to create.
     * @param maxFragmentSize the maximum number of characters in a fragment.
     * @return the xml excerpt.
     * @throws IOException if an error occurs while creating the excerpt.
     */
    private String createExcerpt(TermPositionVector tpv,
                                 String text,
                                 int maxFragments,
                                 int maxFragmentSize)
            throws IOException {
        // NOTE(review): the highlighter is handed half of maxFragmentSize;
        // presumably its last argument is the amount of text surrounding a
        // match - confirm against DefaultHighlighter
        return DefaultHighlighter.highlight(tpv, getRelevantTerms(), text,
                "<highlight>", "</highlight>", maxFragments, maxFragmentSize / 2);
    }

    /**
     * Collects the terms of the current query that refer to fulltext
     * content. Terms on the {@link FieldNames#FULLTEXT} field are kept as
     * is, terms on a field whose name contains
     * {@link FieldNames#FULLTEXT_PREFIX} are mapped to the
     * {@link FieldNames#FULLTEXT} field. All other terms are dropped.
     *
     * @return a set of {@link Term}s on the fulltext field.
     */
    private Set getRelevantTerms() {
        Set extractedTerms = new HashSet();
        Set relevantTerms = new HashSet();
        query.extractTerms(extractedTerms);
        // only keep terms for fulltext fields
        for (Iterator it = extractedTerms.iterator(); it.hasNext(); ) {
            Term t = (Term) it.next();
            if (t.field().equals(FieldNames.FULLTEXT)) {
                relevantTerms.add(t);
            } else if (t.field().indexOf(FieldNames.FULLTEXT_PREFIX) != -1) {
                relevantTerms.add(new Term(FieldNames.FULLTEXT, t.text()));
            }
        }
        return relevantTerms;
    }
}