/*
* eXist Open Source Native XML Database
* Copyright (C) 2011 The eXist Project
* http://exist-db.org
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* $Id$
*/
package org.exist.indexing.lucene;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration;
import org.apache.lucene.search.*;
import org.apache.lucene.util.*;
import org.exist.collections.Collection;
import org.exist.dom.*;
import org.exist.indexing.*;
import org.exist.indexing.lucene.PlainTextHighlighter.Offset;
import org.exist.indexing.lucene.PlainTextIndexConfig.PlainTextDoc;
import org.exist.indexing.lucene.PlainTextIndexConfig.PlainTextField;
import org.exist.memtree.MemTreeBuilder;
import org.exist.memtree.NodeImpl;
import org.exist.numbering.NodeId;
import org.exist.security.PermissionDeniedException;
import org.exist.storage.*;
import org.exist.storage.btree.DBException;
import org.exist.storage.lock.Lock;
import org.exist.storage.txn.Txn;
import org.exist.util.ByteConversion;
import org.exist.util.DatabaseConfigurationException;
import org.exist.util.Occurrences;
import org.exist.xmldb.XmldbURI;
import org.exist.xquery.*;
import org.exist.xquery.value.IntegerValue;
import org.exist.xquery.value.NodeValue;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.util.*;
/**
* Class for handling all Lucene operations.
*
* @author Wolfgang Meier (wolfgang@exist-db.org)
* @author Dannes Wessels (dannes@exist-db.org)
*/
public class LuceneIndexWorker implements OrderedValuesIndex, QNamedKeysIndex {
// Query option names understood by setOptions().
public static final String OPTION_DEFAULT_OPERATOR = "default-operator";
public static final String OPTION_PHRASE_SLOP = "phrase-slop";
public static final String OPTION_LEADING_WILDCARD = "leading-wildcard";
public static final String OPTION_FILTER_REWRITE = "filter-rewrite";
// Value of OPTION_DEFAULT_OPERATOR selecting Lucene's OR operator.
public static final String DEFAULT_OPERATOR_OR = "or";
// Field type used for the node-id field: indexed and tokenized, but not
// stored, with norms and term vectors disabled.
public static final org.apache.lucene.document.FieldType TYPE_NODE_ID = new org.apache.lucene.document.FieldType();
static {
    TYPE_NODE_ID.setIndexed(true);
    TYPE_NODE_ID.setStored(false);
    TYPE_NODE_ID.setOmitNorms(true);
    TYPE_NODE_ID.setStoreTermVectors(false);
    TYPE_NODE_ID.setTokenized(true);
}
static final Logger LOG = Logger.getLogger(LuceneIndexWorker.class);
// The index instance this worker reads from and writes to.
protected LuceneIndex index;
// Cached match listener, lazily created and reused (see getMatchListener()).
private LuceneMatchListener matchListener = null;
// Translates XML query descriptions into Lucene queries.
private XMLToQuery queryTranslator;
private DBBroker broker;
// Document currently being indexed or removed.
private DocumentImpl currentDoc = null;
// Current StreamListener mode (STORE, REMOVE_ALL_NODES, ...).
private int mode = 0;
// Per-document copy of the collection's Lucene configuration.
private LuceneConfig config;
private Stack<TextExtractor> contentStack = null;
// Node ids queued for removal in REMOVE_SOME_NODES mode.
private Set<NodeId> nodesToRemove = null;
// Nodes queued for writing in STORE mode.
private List<PendingDoc> nodesToWrite = null;
// Lucene document being assembled for a non-XML (binary) resource.
private Document pendingDoc = null;
private int cachedNodesSize = 0;
// Flush threshold for cached node content, in bytes.
private int maxCachedNodesSize = 4096 * 1024;
// Analyzer assigned by the element-based query paths; see NOTE in query(Element ...).
private Analyzer analyzer;
// Lucene field names holding the eXist document id and document URI.
public static final String FIELD_DOC_ID = "docId";
public static final String FIELD_DOC_URI = "docUri";
// Shared scratch buffer for decoding node ids.
// NOTE(review): shared mutable state — presumably each worker is used by a
// single thread; confirm before concurrent use.
private final byte[] buf = new byte[1024];
/**
 * Create a worker bound to the given index and broker.
 *
 * @param parent the owning Lucene index
 * @param broker the database broker this worker operates with
 */
public LuceneIndexWorker(LuceneIndex parent, DBBroker broker) {
    this.broker = broker;
    this.index = parent;
    // query translator needs the index to resolve configuration/analyzers
    this.queryTranslator = new XMLToQuery(index);
}
/** @return the unique id of the Lucene index plugin. */
public String getIndexId() {
    return LuceneIndex.ID;
}
/** @return the human-readable name of the underlying index. */
public String getIndexName() {
    return index.getIndexName();
}
/** This worker performs no query rewriting; always returns null. */
public QueryRewriter getQueryRewriter(XQueryContext context) {
    return null;
}
/**
 * Parse the collection.xconf nodes into a {@link LuceneConfig} and keep it
 * as this worker's configuration.
 *
 * @return the parsed configuration object
 * @throws DatabaseConfigurationException if the configuration is invalid
 */
public Object configure(IndexController controller, NodeList configNodes, Map<String, String> namespaces) throws DatabaseConfigurationException {
    LOG.debug("Configuring lucene index...");
    final LuceneConfig parsedConfig = new LuceneConfig(configNodes, namespaces);
    this.config = parsedConfig;
    return parsedConfig;
}
/**
 * Flush pending work according to the current mode: write cached nodes,
 * remove the whole document, remove selected nodes, or remove the
 * plain-text (binary) index entries.
 */
public void flush() {
    final int currentMode = mode;
    if (currentMode == StreamListener.STORE) {
        write();
    } else if (currentMode == StreamListener.REMOVE_ALL_NODES) {
        removeDocument(currentDoc.getDocId());
    } else if (currentMode == StreamListener.REMOVE_SOME_NODES) {
        removeNodes();
    } else if (currentMode == StreamListener.REMOVE_BINARY) {
        removePlainTextIndexes();
    }
}
/** Set the current document without changing the mode (mode = UNKNOWN). */
public void setDocument(DocumentImpl document) {
    setDocument(document, StreamListener.UNKNOWN);
}
/**
 * Set the current document and mode, and load the Lucene configuration of
 * the document's collection (a private copy, so it can be modified safely).
 */
public void setDocument(DocumentImpl document, int newMode) {
    currentDoc = document;
    contentStack = null;
    final IndexSpec indexConf = document.getCollection().getIndexConfiguration(broker);
    if (indexConf != null) {
        final LuceneConfig shared = (LuceneConfig) indexConf.getCustomIndexSpec(LuceneIndex.ID);
        // There is only one LuceneConfig per db instance: copy it so we can
        // work with it safely. A null spec clears any previous config.
        config = shared == null ? null : new LuceneConfig(shared);
    }
    mode = newMode;
}
/**
 * Switch the worker's mode and (re)initialize the corresponding pending
 * node collections.
 */
public void setMode(int mode) {
    this.mode = mode;
    if (mode == StreamListener.STORE) {
        // reuse the list across documents to avoid reallocation
        if (nodesToWrite == null) {
            nodesToWrite = new ArrayList<>();
        } else {
            nodesToWrite.clear();
        }
        cachedNodesSize = 0;
    } else if (mode == StreamListener.REMOVE_SOME_NODES) {
        nodesToRemove = new TreeSet<>();
    }
}
/** @return the document currently being processed, or null. */
public DocumentImpl getDocument() {
    return currentDoc;
}
/** @return the current StreamListener mode. */
public int getMode() {
    return this.mode;
}
/**
 * Determine the top-most ancestor of {@code node} that needs to be reindexed
 * after an update, based on the configured index paths.
 *
 * Returns null if nothing needs reindexing (attribute node, no config, or
 * no configured path matches any ancestor path).
 */
public StoredNode getReindexRoot(StoredNode node, NodePath path, boolean insert, boolean includeSelf) {
    if (node.getNodeType() == Node.ATTRIBUTE_NODE)
        return null;
    if (config == null)
        return null;
    // First pass: check whether any ancestor path (including or excluding the
    // node itself, depending on includeSelf) matches a configured index path.
    NodePath p = new NodePath(path);
    boolean reindexRequired = false;
    if (node.getNodeType() == Node.ELEMENT_NODE && !includeSelf)
        p.removeLastComponent();
    for (int i = 0; i < p.length(); i++) {
        if (config.matches(p)) {
            reindexRequired = true;
            break;
        }
        p.removeLastComponent();
    }
    if (reindexRequired) {
        // Second pass: walk up from the node and remember the highest
        // ancestor whose path still matches the configuration.
        p = new NodePath(path);
        StoredNode topMost = null;
        StoredNode currentNode = node;
        if (currentNode.getNodeType() != Node.ELEMENT_NODE)
            currentNode = currentNode.getParentStoredNode();
        while (currentNode != null) {
            if (config.matches(p))
                topMost = currentNode;
            currentNode = currentNode.getParentStoredNode();
            p.removeLastComponent();
        }
        return topMost;
    }
    return null;
}
// Single stream listener feeding node events into this worker.
private StreamListener listener = new LuceneStreamListener();
/** @return the stream listener used to feed indexing events to this worker. */
public StreamListener getListener() {
    return listener;
}
/**
 * Return a match listener for highlighting if the node proxy carries at
 * least one match produced by this index; null otherwise. The listener
 * instance is cached per worker and reset for reuse.
 */
public MatchListener getMatchListener(DBBroker broker, NodeProxy proxy) {
    boolean needToFilter = false;
    Match nextMatch = proxy.getMatches();
    while (nextMatch != null) {
        // NOTE(review): identity comparison on the index-id string; this
        // appears to rely on both sides referencing the LuceneIndex.ID
        // constant — confirm before changing to equals().
        if (nextMatch.getIndexId() == LuceneIndex.ID) {
            needToFilter = true;
            break;
        }
        nextMatch = nextMatch.getNextMatch();
    }
    if (!needToFilter)
        return null;
    // lazily create one listener per worker and reuse it across calls
    if (matchListener == null)
        matchListener = new LuceneMatchListener(index, broker, proxy);
    else
        matchListener.reset(broker, proxy);
    return matchListener;
}
/**
 * Delete all index entries belonging to the given document id.
 * Always resets the mode to STORE afterwards.
 */
protected void removeDocument(int docId) {
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        // the doc id is stored prefix-coded, so encode it the same way
        final BytesRef idBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
        NumericUtils.intToPrefixCoded(docId, 0, idBytes);
        writer.deleteDocuments(new Term(FIELD_DOC_ID, idBytes));
    } catch (IOException e) {
        LOG.warn("Error while removing lucene index: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        mode = StreamListener.STORE;
    }
}
/**
 * Delete all plain-text (binary resource) index entries for the current
 * document, identified by its URI. Always resets the mode to STORE.
 */
protected void removePlainTextIndexes() {
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        // binary documents are keyed by URI, not by doc id
        final Term uriTerm = new Term(FIELD_DOC_URI, currentDoc.getURI().toString());
        writer.deleteDocuments(uriTerm);
    } catch (IOException e) {
        LOG.warn("Error while removing lucene index: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        mode = StreamListener.STORE;
    }
}
/**
 * Delete the index entries of every document in the given collection.
 * If {@code reindex} is set, the index is synced afterwards.
 */
public void removeCollection(Collection collection, DBBroker broker, boolean reindex) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("Removing collection " + collection.getURI());
    }
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        final Iterator<DocumentImpl> documents = collection.iterator(broker);
        while (documents.hasNext()) {
            final DocumentImpl doc = documents.next();
            // doc ids are stored prefix-coded; encode the same way for deletion
            final BytesRef idBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
            NumericUtils.intToPrefixCoded(doc.getDocId(), 0, idBytes);
            writer.deleteDocuments(new Term(FIELD_DOC_ID, idBytes));
        }
    } catch (IOException | PermissionDeniedException e) {
        LOG.error("Error while removing lucene index: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        if (reindex) {
            try {
                index.sync();
            } catch (DBException e) {
                LOG.warn("Exception during reindex: " + e.getMessage(), e);
            }
        }
        mode = StreamListener.STORE;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Collection removed.");
    }
}
/**
 * Remove specific nodes from the index. This method is used for node updates
 * and called from flush() if the worker is in {@link StreamListener#REMOVE_SOME_NODES}
 * mode. Clears {@code nodesToRemove} when done.
 */
protected void removeNodes() {
    if (nodesToRemove == null)
        return;
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        // prefix-coded doc-id term, matching the encoding used at index time
        BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
        NumericUtils.intToPrefixCoded(currentDoc.getDocId(), 0, bytes);
        Term dt = new Term(FIELD_DOC_ID, bytes);
        TermQuery tq = new TermQuery(dt);
        for (NodeId nodeId : nodesToRemove) {
            // store the node id: a 2-byte unit count followed by the
            // serialized id — the same layout decoded in readNodeId()
            int nodeIdLen = nodeId.size();
            byte[] data = new byte[nodeIdLen + 2];
            ByteConversion.shortToByte((short) nodeId.units(), data, 0);
            nodeId.serialize(data, 2);
            Term it = new Term(LuceneUtil.FIELD_NODE_ID, new BytesRef(data));
            TermQuery iq = new TermQuery(it);
            // delete only entries matching BOTH the document id and the node id
            BooleanQuery q = new BooleanQuery();
            q.add(tq, BooleanClause.Occur.MUST);
            q.add(iq, BooleanClause.Occur.MUST);
            writer.deleteDocuments(q);
        }
    } catch (IOException e) {
        LOG.warn("Error while deleting lucene index entries: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        nodesToRemove = null;
    }
}
/**
 * Decode the node id stored for a Lucene document: a 2-byte unit count
 * followed by the serialized id (the layout written in removeNodes()).
 * NOTE(review): reads through the shared {@code buf} scratch buffer, so this
 * is presumably not safe for concurrent use — confirm callers are
 * single-threaded.
 */
private NodeId readNodeId(int doc, BinaryDocValues nodeIdValues, BrokerPool pool) {
    BytesRef ref = new BytesRef(buf);
    nodeIdValues.get(doc, ref);
    int units = ByteConversion.byteToShort(ref.bytes, ref.offset);
    return pool.getNodeFactory().createFromData(units, ref.bytes, ref.offset + 2);
}
/**
* Query the index. Returns a node set containing all matching nodes. Each node
* in the node set has a {@link org.exist.indexing.lucene.LuceneIndexWorker.LuceneMatch}
* element attached, which stores the score and a link to the query which generated it.
*
* @param context current XQuery context
* @param contextId current context id, identify to track the position inside nested XPath predicates
* @param docs query will be restricted to documents in this set
* @param contextSet if specified, returned nodes will be descendants of the nodes in this set
* @param qnames query will be restricted to nodes with the qualified names given here
* @param queryStr a lucene query string
* @param axis which node is returned: the node in which a match was found or the corresponding ancestor
* from the contextSet
* @return node set containing all matching nodes
*
* @throws IOException
* @throws ParseException
*/
public NodeSet query(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
    List<QName> qnames, String queryStr, int axis, Properties options)
    throws IOException, ParseException, XPathException {
    // expand wildcard qnames to all qnames that have an index defined
    qnames = getDefinedIndexes(qnames);
    final NodeSet resultSet = new NewArrayNodeSet();
    final boolean returnAncestor = axis == NodeSet.ANCESTOR;
    IndexSearcher searcher = null;
    try {
        searcher = index.getSearcher();
        for (QName qname : qnames) {
            final String field = LuceneUtil.encodeQName(qname, index.getBrokerPool().getSymbols());
            final Analyzer fieldAnalyzer = getAnalyzer(null, qname, context.getBroker(), docs);
            final QueryParserWrapper parser = getQueryParser(field, fieldAnalyzer, docs);
            setOptions(options, parser.getConfiguration());
            final Query query = parser.parse(queryStr);
            // collect matches for this qname directly into resultSet
            searchAndProcess(contextId, qname, docs, contextSet, resultSet,
                returnAncestor, searcher, query, context.getWatchDog());
        }
    } finally {
        index.releaseSearcher(searcher);
    }
    return resultSet;
}
/**
 * Apply user-supplied query options (default operator, leading wildcard,
 * phrase slop, filter rewrite) to the given query parser configuration.
 *
 * @throws ParseException if the phrase-slop option is not a number
 */
protected void setOptions(Properties options, CommonQueryParserConfiguration parser) throws ParseException {
    if (options == null) {
        return;
    }
    final String operator = options.getProperty(OPTION_DEFAULT_OPERATOR);
    if (operator != null && parser instanceof QueryParserBase) {
        final QueryParserBase base = (QueryParserBase) parser;
        base.setDefaultOperator(DEFAULT_OPERATOR_OR.equals(operator)
            ? QueryParser.OR_OPERATOR
            : QueryParser.AND_OPERATOR);
    }
    final String leadingWildcard = options.getProperty(OPTION_LEADING_WILDCARD);
    if (leadingWildcard != null) {
        parser.setAllowLeadingWildcard("yes".equalsIgnoreCase(leadingWildcard));
    }
    final String slopOption = options.getProperty(OPTION_PHRASE_SLOP);
    if (slopOption != null) {
        try {
            parser.setPhraseSlop(Integer.parseInt(slopOption));
        } catch (NumberFormatException e) {
            throw new ParseException("value for option " + OPTION_PHRASE_SLOP + " needs to be a number");
        }
    }
    final String filterRewrite = options.getProperty(OPTION_FILTER_REWRITE);
    if (filterRewrite != null) {
        parser.setMultiTermRewriteMethod("yes".equalsIgnoreCase(filterRewrite)
            ? MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE
            : MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
    }
}
/**
* Query the index. Returns a node set containing all matching nodes. Each node
* in the node set has a {@link org.exist.indexing.lucene.LuceneIndexWorker.LuceneMatch}
* element attached, which stores the score and a link to the query which generated it.
*
* @param context current XQuery context
* @param contextId current context id, identify to track the position inside nested XPath predicates
* @param docs query will be restricted to documents in this set
* @param contextSet if specified, returned nodes will be descendants of the nodes in this set
* @param qnames query will be restricted to nodes with the qualified names given here
* @param queryRoot an XML representation of the query, see {@link XMLToQuery}.
* @param axis which node is returned: the node in which a match was found or the corresponding ancestor
* from the contextSet
* @return node set containing all matching nodes
*
* @throws IOException
* @throws ParseException
*/
public NodeSet query(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
    List<QName> qnames, Element queryRoot, int axis, Properties options)
    throws IOException, ParseException, XPathException {
    // expand wildcard qnames to all qnames that have an index defined
    qnames = getDefinedIndexes(qnames);
    NodeSet resultSet = new NewArrayNodeSet();
    boolean returnAncestor = axis == NodeSet.ANCESTOR;
    IndexSearcher searcher = null;
    try {
        searcher = index.getSearcher();
        for (QName qname : qnames) {
            String field = LuceneUtil.encodeQName(qname, index.getBrokerPool().getSymbols());
            // NOTE(review): assigns the instance field 'analyzer' (the
            // string-query variant uses a local instead) — confirm whether
            // other code depends on this side effect before localizing it.
            analyzer = getAnalyzer(null, qname, context.getBroker(), docs);
            Query query = queryTranslator.parse(field, queryRoot, analyzer, options);
            if (query != null) {
                searchAndProcess(contextId, qname, docs, contextSet, resultSet,
                    returnAncestor, searcher, query, context.getWatchDog());
            }
        }
    } finally {
        index.releaseSearcher(searcher);
    }
    return resultSet;
}
/**
 * Query a named field of the index using an XML query description.
 * Matching nodes (or their ancestors from contextSet, depending on axis)
 * are returned with LuceneMatch objects attached.
 */
public NodeSet queryField(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
    String field, Element queryRoot, int axis, Properties options)
    throws IOException, XPathException {
    NodeSet resultSet = new NewArrayNodeSet();
    boolean returnAncestor = axis == NodeSet.ANCESTOR;
    IndexSearcher searcher = null;
    try {
        searcher = index.getSearcher();
        // NOTE(review): assigns the instance field 'analyzer' — see the
        // matching note in query(...); confirm before localizing.
        analyzer = getAnalyzer(field, null, context.getBroker(), docs);
        Query query = queryTranslator.parse(field, queryRoot, analyzer, options);
        if (query != null) {
            searchAndProcess(contextId, null, docs, contextSet, resultSet,
                returnAncestor, searcher, query, context.getWatchDog());
        }
    } finally {
        index.releaseSearcher(searcher);
    }
    return resultSet;
}
/**
 * Execute the query and let a {@link LuceneHitCollector} stream the hits
 * directly into {@code resultSet}.
 */
private void searchAndProcess(int contextId, QName qname, DocumentSet docs,
    NodeSet contextSet, NodeSet resultSet, boolean returnAncestor,
    IndexSearcher searcher, Query query, XQueryWatchDog watchDog) throws IOException, TerminatedException {
    searcher.search(query, new LuceneHitCollector(qname, query, docs, contextSet,
        resultSet, returnAncestor, contextId, watchDog));
}
/**
 * Query a named field of the index using a Lucene query string.
 * Matching nodes (or their ancestors from contextSet, depending on axis)
 * are returned with LuceneMatch objects attached.
 *
 * @param context current XQuery context
 * @param contextId current context id for nested predicate tracking
 * @param docs query restricted to documents in this set
 * @param contextSet if given, results must be descendants of these nodes
 * @param field the Lucene field name to query
 * @param queryString the Lucene query string
 * @param axis NodeSet.ANCESTOR to return context ancestors, else the match node
 * @param options query options, see {@link #setOptions}
 * @return node set containing all matching nodes
 */
public NodeSet queryField(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
    String field, String queryString, int axis, Properties options)
    throws IOException, ParseException, XPathException {
    NodeSet resultSet = new NewArrayNodeSet();
    boolean returnAncestor = axis == NodeSet.ANCESTOR;
    IndexSearcher searcher = null;
    try {
        searcher = index.getSearcher();
        Analyzer analyzer = getAnalyzer(field, null, context.getBroker(), docs);
        // guard the debug message: avoids building the string on every call
        // when debug logging is disabled (consistent with removeCollection)
        if (LOG.isDebugEnabled())
            LOG.debug("Using analyzer " + analyzer + " for " + queryString);
        QueryParserWrapper parser = getQueryParser(field, analyzer, docs);
        setOptions(options, parser.getConfiguration());
        Query query = parser.parse(queryString);
        searchAndProcess(contextId, null, docs, contextSet, resultSet,
            returnAncestor, searcher, query, context.getWatchDog());
    } finally {
        index.releaseSearcher(searcher);
    }
    return resultSet;
}
/**
* Add SOLR formatted data to lucene index.
*
* <pre>
* {@code
* <doc>
* <field name="name1" boost="value1">data1</field>
* <field name="name2">data2</field>
* </doc>
* }
* </pre>
*
* @param descriptor SOLR styled data
*/
public void indexNonXML(NodeValue descriptor) {
    // Verify input: the descriptor root must be a <doc> element
    if (!descriptor.getNode().getLocalName().contentEquals("doc")) {
        // reject silently (logged) rather than throwing
        LOG.error("Expected <doc> got <" + descriptor.getNode().getLocalName() + ">");
        return;
    }
    // Setup parser for SOLR syntax and parse
    PlainTextIndexConfig solrconfParser = new PlainTextIndexConfig();
    solrconfParser.parse(descriptor);
    // Get <doc> information
    PlainTextDoc solrDoc = solrconfParser.getDoc();
    if (pendingDoc == null) {
        // create the Lucene document lazily; repeated calls for the same
        // resource keep appending fields to it until writeNonXML() runs
        pendingDoc = new Document();
        // Set DocId: docvalues for retrieval plus an indexed int field
        NumericDocValuesField fDocId = new NumericDocValuesField(FIELD_DOC_ID, currentDoc.getDocId());
        pendingDoc.add(fDocId);
        IntField fDocIdIdx = new IntField(FIELD_DOC_ID, currentDoc.getDocId(), Field.Store.NO);
        pendingDoc.add(fDocIdIdx);
        // For binary documents the doc path needs to be stored
        String uri = currentDoc.getURI().toString();
        Field fDocUri = new Field(FIELD_DOC_URI, uri, Field.Store.YES, Field.Index.NOT_ANALYZED);
        pendingDoc.add(fDocUri);
    }
    // Iterate over all found fields and write the data.
    for (PlainTextField field : solrconfParser.getFields()) {
        // Get field type configuration; collection config wins over the
        // store flag supplied in the SOLR descriptor
        FieldType fieldType = config == null ? null : config.getFieldType(field.getName());
        Field.Store store = null;
        if (fieldType != null)
            store = fieldType.getStore();
        if (store == null)
            store = field.getStore();
        // Get name from SOLR field
        String contentFieldName = field.getName();
        // NOTE(review): fieldAnalyzer is computed but never used here —
        // confirm whether a per-field analyzer was intended.
        Analyzer fieldAnalyzer = (fieldType == null) ? null : fieldType.getAnalyzer();
        // Actual field content ; Store flag can be set in solrField
        Field contentField = new Field(contentFieldName, field.getData().toString(), store, Field.Index.ANALYZED, Field.TermVector.YES);
        // Extract (document) Boost factor
        if (field.getBoost() > 0) {
            contentField.setBoost(field.getBoost());
        }
        pendingDoc.add(contentField);
    }
}
/**
 * Write the pending non-XML (binary resource) Lucene document built by
 * {@link #indexNonXML} to the index, then clear the pending state.
 */
public void writeNonXML() {
    // Guard: without this, a call before indexNonXML() would pass null to
    // IndexWriter.addDocument and throw a NullPointerException.
    if (pendingDoc == null) {
        return;
    }
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        writer.addDocument(pendingDoc);
    } catch (IOException e) {
        LOG.warn("An exception was caught while indexing document: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        pendingDoc = null;
        cachedNodesSize = 0;
    }
}
/**
 * Run a plain Lucene query over non-XML (SOLR-style) indexed resources and
 * build an in-memory XML report of the form
 * {@code <results><search uri="..." score="..."><field>...</field></search>...</results>}.
 * Documents the calling user may not read are silently skipped.
 *
 * @param context current XQuery context, used for permission checks
 * @param toBeMatchedURIs document URIs or collection URI prefixes a hit must match
 * @param queryText the Lucene query string
 * @return the root {@code <results>} node of the report
 * @throws XPathException if the query cannot be parsed or executed
 */
public NodeImpl search(final XQueryContext context, final List<String> toBeMatchedURIs, String queryText) throws XPathException {
    NodeImpl report = null;
    IndexSearcher searcher = null;
    try {
        // Get index searcher
        searcher = index.getSearcher();
        // Get analyzer : to be retrieved from configuration
        final Analyzer searchAnalyzer = new StandardAnalyzer(Version.LUCENE_43);
        // Setup query Version, default field, analyzer
        final QueryParserWrapper parser = getQueryParser("", searchAnalyzer, null);
        final Query query = parser.parse(queryText);
        // extract all used fields from query
        final String[] fields = LuceneUtil.extractFields(query, searcher.getIndexReader());
        final PlainTextHighlighter highlighter = new PlainTextHighlighter(query, searcher.getIndexReader());
        final MemTreeBuilder builder = new MemTreeBuilder();
        builder.startDocument();
        // start root element
        final int nodeNr = builder.startElement("", "results", "results", null);
        // Perform actual search
        searcher.search(query, new Collector() {
            private Scorer scorer;
            private AtomicReader reader;
            @Override
            public void setScorer(Scorer scorer) throws IOException {
                this.scorer = scorer;
            }
            @Override
            public void collect(int docNum) throws IOException {
                Document doc = reader.document(docNum);
                // Get URI field of document
                String fDocUri = doc.get(FIELD_DOC_URI);
                // Get score
                float score = scorer.score();
                // Check if document URI has a full match or if a
                // document is in a collection
                if (isDocumentMatch(fDocUri, toBeMatchedURIs)) {
                    DocumentImpl storedDoc = null;
                    try {
                        // try to read document to check if user is allowed to access it
                        storedDoc = context.getBroker().getXMLResource(XmldbURI.createInternal(fDocUri), Lock.READ_LOCK);
                        if (storedDoc == null) {
                            return;
                        }
                        // setup attributes
                        AttributesImpl attribs = new AttributesImpl();
                        attribs.addAttribute("", "uri", "uri", "CDATA", fDocUri);
                        attribs.addAttribute("", "score", "score", "CDATA", "" + score);
                        // write element and attributes
                        builder.startElement("", "search", "search", attribs);
                        for (String field : fields) {
                            String[] fieldContent = doc.getValues(field);
                            attribs.clear();
                            attribs.addAttribute("", "name", "name", "CDATA", field);
                            for (String content : fieldContent) {
                                List<Offset> offsets = highlighter.getOffsets(content, searchAnalyzer);
                                if (offsets != null) {
                                    builder.startElement("", "field", "field", attribs);
                                    highlighter.highlight(content, offsets, builder);
                                    builder.endElement();
                                }
                            }
                        }
                        builder.endElement();
                        // clean attributes
                        attribs.clear();
                    } catch (PermissionDeniedException e) {
                        // not allowed to read the document: ignore the match.
                    } finally {
                        if (storedDoc != null) {
                            storedDoc.getUpdateLock().release(Lock.READ_LOCK);
                        }
                    }
                }
            }
            @Override
            public void setNextReader(AtomicReaderContext atomicReaderContext) throws IOException {
                this.reader = atomicReaderContext.reader();
            }
            @Override
            public boolean acceptsDocsOutOfOrder() {
                return true;
            }
        });
        // finish root element
        builder.endElement();
        report = ((org.exist.memtree.DocumentImpl) builder.getDocument()).getNode(nodeNr);
    } catch (Exception ex) {
        // Log with message and full stack trace, then propagate.
        // (Previously this also dumped the trace to stderr via
        // printStackTrace() and logged only the exception object.)
        LOG.error("Lucene search failed: " + ex.getMessage(), ex);
        throw new XPathException(ex);
    } finally {
        index.releaseSearcher(searcher);
    }
    return report;
}
/**
 * Return the stored content of the given field for the document with the
 * given id, or null if no such document/field exists in the index.
 */
public String getFieldContent(int docId, String field) throws IOException {
    final BytesRef idBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
    NumericUtils.intToPrefixCoded(docId, 0, idBytes);
    final Term idTerm = new Term(FIELD_DOC_ID, idBytes);
    IndexReader reader = null;
    try {
        reader = index.getReader();
        // scan each index segment until the doc-id term is found
        for (AtomicReaderContext context : reader.leaves()) {
            final DocsEnum docs = context.reader().termDocsEnum(idTerm);
            if (docs == null || docs.nextDoc() == DocsEnum.NO_MORE_DOCS) {
                continue;
            }
            return reader.document(docs.docID()).get(field);
        }
        return null;
    } finally {
        index.releaseReader(reader);
    }
}
/**
 * Check whether any index entry exists for the document with the given id.
 */
public boolean hasIndex(int docId) throws IOException {
    final BytesRef idBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
    NumericUtils.intToPrefixCoded(docId, 0, idBytes);
    final Term idTerm = new Term(FIELD_DOC_ID, idBytes);
    IndexReader reader = null;
    try {
        reader = index.getReader();
        // a single posting in any segment is enough
        for (AtomicReaderContext context : reader.leaves()) {
            final DocsEnum docs = context.reader().termDocsEnum(idTerm);
            if (docs != null && docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                return true;
            }
        }
        return false;
    } finally {
        index.releaseReader(reader);
    }
}
/**
 * Check whether a document URI found by Lucene matches one of the requested
 * documents or collections. Collection URIs should end with "/" so the
 * prefix test cannot match partial names.
 *
 * @param docUri the URI of the document found by Lucene
 * @param toBeMatchedUris list of document and collection URIs
 * @return true if the document URI equals, or lies under, one of the entries
 */
private boolean isDocumentMatch(String docUri, List<String> toBeMatchedUris){
    if (docUri == null) {
        LOG.error("docUri is null.");
        return false;
    }
    if (toBeMatchedUris == null) {
        LOG.error("match is null.");
        return false;
    }
    for (String candidate : toBeMatchedUris) {
        if (docUri.startsWith(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Lucene Collector that decodes each hit's eXist document id and node id
 * from docvalues, attaches a {@link LuceneMatch} (with score) to the node,
 * and adds either the node itself or its ancestor from the context set to
 * the result set.
 */
private class LuceneHitCollector extends Collector {
    private Scorer scorer;
    private AtomicReader reader;
    private NumericDocValues docIdValues;
    private BinaryDocValues nodeIdValues;
    // scratch buffer backing the BytesRef used to decode node ids
    private final byte[] buf = new byte[1024];
    private final QName qname;
    private final DocumentSet docs;
    private final NodeSet contextSet;
    private final NodeSet resultSet;
    private final boolean returnAncestor;
    private final int contextId;
    private final Query query;
    private final XQueryWatchDog watchdog;
    private LuceneHitCollector(QName qname, Query query, DocumentSet docs, NodeSet contextSet, NodeSet resultSet, boolean returnAncestor,
        int contextId, XQueryWatchDog watchDog) {
        this.qname = qname;
        this.docs = docs;
        this.contextSet = contextSet;
        this.resultSet = resultSet;
        this.returnAncestor = returnAncestor;
        this.contextId = contextId;
        this.query = query;
        this.watchdog = watchDog;
    }
    @Override
    public void setScorer(Scorer scorer) throws IOException {
        this.scorer = scorer;
    }
    @Override
    public void setNextReader(AtomicReaderContext atomicReaderContext) throws IOException {
        // refresh per-segment docvalues readers
        this.reader = atomicReaderContext.reader();
        this.docIdValues = this.reader.getNumericDocValues(FIELD_DOC_ID);
        this.nodeIdValues = this.reader.getBinaryDocValues(LuceneUtil.FIELD_NODE_ID);
    }
    @Override
    public boolean acceptsDocsOutOfOrder() {
        return false;
    }
    @Override
    public void collect(int doc) {
        try {
            float score = scorer.score();
            int docId = (int) this.docIdValues.get(doc);
            // skip hits for documents outside the queried document set
            DocumentImpl storedDocument = docs.getDoc(docId);
            if (storedDocument == null)
                return;
            // decode node id: 2-byte unit count followed by the serialized id
            BytesRef ref = new BytesRef(buf);
            this.nodeIdValues.get(doc, ref);
            int units = ByteConversion.byteToShort(ref.bytes, ref.offset);
            NodeId nodeId = index.getBrokerPool().getNodeFactory().createFromData(units, ref.bytes, ref.offset + 2);
            NodeProxy storedNode = new NodeProxy(storedDocument, nodeId);
            if (qname != null)
                storedNode.setNodeType(qname.getNameType() == ElementValue.ATTRIBUTE ? Node.ATTRIBUTE_NODE : Node.ELEMENT_NODE);
            // if a context set is specified, we can directly check if the
            // matching node is a descendant of one of the nodes
            // in the context set.
            if (contextSet != null) {
                int sizeHint = contextSet.getSizeHint(storedDocument);
                if (returnAncestor) {
                    // return the ancestor from the context set, carrying the match
                    NodeProxy parentNode = contextSet.get(storedNode);
                    if (parentNode != null) {
                        LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
                        match.setScore(score);
                        parentNode.addMatch(match);
                        resultSet.add(parentNode, sizeHint);
                        if (Expression.NO_CONTEXT_ID != contextId) {
                            parentNode.deepCopyContext(storedNode, contextId);
                        } else
                            parentNode.copyContext(storedNode);
                    }
                } else {
                    LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
                    match.setScore(score);
                    storedNode.addMatch(match);
                    resultSet.add(storedNode, sizeHint);
                }
            } else {
                // no context: return the matching node itself
                LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
                match.setScore(score);
                storedNode.addMatch(match);
                resultSet.add(storedNode);
            }
        } catch (IOException e) {
            // NOTE(review): swallowed after printing — consider logging instead
            e.printStackTrace();
        }
    }
}
/**
 * Check index configurations for all collections in the given DocumentSet
 * and return the list of QNames which have indexes defined on them.
 * A qname with a null local name or namespace acts as a wildcard and is
 * expanded against the index; null/empty input returns all indexed qnames.
 *
 * @return list of QName objects on which indexes are defined
 */
public List<QName> getDefinedIndexes(List<QName> qnames) {
    final List<QName> indexes = new ArrayList<>(20);
    if (qnames == null || qnames.isEmpty()) {
        return getDefinedIndexesFor(null, indexes);
    }
    for (QName qname : qnames) {
        final boolean isWildcard = qname.getLocalName() == null || qname.getNamespaceURI() == null;
        if (isWildcard) {
            getDefinedIndexesFor(qname, indexes);
        } else {
            indexes.add(qname);
        }
    }
    return indexes;
}
/**
 * Collect all indexed qnames from the index's field infos into
 * {@code indexes}, optionally filtered by a (possibly partial) qname.
 *
 * @param qname filter qname or null for all
 * @param indexes target list, also returned
 */
private List<QName> getDefinedIndexesFor(QName qname, List<QName> indexes) {
    IndexReader reader = null;
    try {
        reader = index.getReader();
        for (FieldInfo info : MultiFields.getMergedFieldInfos(reader)) {
            // skip the internal doc-id field; everything else encodes a qname
            if (!FIELD_DOC_ID.equals(info.name)) {
                QName name = LuceneUtil.decodeQName(info.name, index.getBrokerPool().getSymbols());
                if (name != null && (qname == null || matchQName(qname, name)))
                    indexes.add(name);
            }
        }
    } catch (IOException e) {
        // log through the db log instead of dumping to stderr
        // (previously: e.printStackTrace())
        LOG.warn("Error while reading defined indexes: " + e.getMessage(), e);
    } finally {
        index.releaseReader(reader);
    }
    return indexes;
}
/**
 * Test whether {@code candidate} matches the (possibly partial) filter
 * {@code qname}: a null local name or empty namespace acts as a wildcard.
 */
private static boolean matchQName(QName qname, QName candidate) {
    final String localName = qname.getLocalName();
    if (localName != null && !localName.equals(candidate.getLocalName())) {
        return false;
    }
    final String namespace = qname.getNamespaceURI();
    if (namespace != null && namespace.length() > 0
            && !namespace.equals(candidate.getNamespaceURI())) {
        return false;
    }
    return true;
}
/**
 * Return the analyzer to be used for the given field or qname. Either field
 * or qname should be specified. The first matching analyzer configured on
 * any collection of the document set wins; otherwise the index default.
 */
protected Analyzer getAnalyzer(String field, QName qname, DBBroker broker, DocumentSet docs) {
    final Iterator<Collection> collections = docs.getCollectionIterator();
    while (collections.hasNext()) {
        final IndexSpec idxConf = collections.next().getIndexConfiguration(broker);
        if (idxConf == null) {
            continue;
        }
        final LuceneConfig collectionConfig = (LuceneConfig) idxConf.getCustomIndexSpec(LuceneIndex.ID);
        if (collectionConfig == null) {
            continue;
        }
        final Analyzer configured = (field == null)
            ? collectionConfig.getAnalyzer(qname)
            : collectionConfig.getAnalyzer(field);
        if (configured != null) {
            return configured;
        }
    }
    return index.getDefaultAnalyzer();
}
/**
 * Return the query parser configured for the given field on any collection
 * of the document set, falling back to the classic Lucene query parser.
 */
protected QueryParserWrapper getQueryParser(String field, Analyzer analyzer, DocumentSet docs) {
    if (docs != null) {
        final Iterator<Collection> collections = docs.getCollectionIterator();
        while (collections.hasNext()) {
            final IndexSpec idxConf = collections.next().getIndexConfiguration(broker);
            if (idxConf == null) {
                continue;
            }
            final LuceneConfig collectionConfig = (LuceneConfig) idxConf.getCustomIndexSpec(LuceneIndex.ID);
            if (collectionConfig == null) {
                continue;
            }
            final QueryParserWrapper parser = collectionConfig.getQueryParser(field, analyzer);
            if (parser != null) {
                return parser;
            }
        }
    }
    // not found. return default query parser:
    return new ClassicQueryParserWrapper(field, analyzer);
}
/** Consistency checking is not implemented for the Lucene index; always false. */
public boolean checkIndex(DBBroker broker) {
    return false;
}
/**
 * Scan the index and return term occurrence statistics, restricted by the
 * optional hints: QNAMES_KEY (list of qnames), START_VALUE / END_VALUE
 * (term range as strings) and VALUE_COUNT (maximum number of terms).
 */
public Occurrences[] scanIndex(XQueryContext context, DocumentSet docs, NodeSet nodes, Map<?,?> hints) {
    List<QName> qnames = hints == null ? null : (List<QName>) hints.get(QNAMES_KEY);
    qnames = getDefinedIndexes(qnames);
    // hints are expected to hold StringValue / IntegerValue instances
    String start = null;
    String end = null;
    long max = Long.MAX_VALUE;
    if (hints != null) {
        final Object vstart = hints.get(START_VALUE);
        if (vstart != null) {
            start = vstart.toString();
        }
        final Object vend = hints.get(END_VALUE);
        if (vend != null) {
            end = vend.toString();
        }
        final IntegerValue vmax = (IntegerValue) hints.get(VALUE_COUNT);
        if (vmax != null) {
            max = vmax.getValue();
        }
    }
    return scanIndexByQName(qnames, docs, nodes, start, end, max);
}
/**
 * Scans the index fields encoding the given qnames and builds a sorted table
 * of per-term occurrence counts, restricted to the given document set and
 * (optionally) node set.
 *
 * @param qnames qnames whose encoded index fields are scanned
 * @param docs   only terms found in documents of this set are counted
 * @param nodes  if non-null, a term only counts when its node is in this set
 * @param start  prefix filter for terms (only applied when {@code end} is null)
 * @param end    upper bound (inclusive) for terms
 * @param max    maximum number of distinct terms to collect
 * @return the collected occurrences, ordered by term text
 */
private Occurrences[] scanIndexByQName(List<QName> qnames, DocumentSet docs, NodeSet nodes, String start, String end, long max) {
    // term text -> accumulated statistics, kept sorted by term
    TreeMap<String, Occurrences> map = new TreeMap<>();
    IndexReader reader = null;
    try {
        reader = index.getReader();
        for (QName qname : qnames) {
            // the Lucene field name encodes the qname via the global symbol table
            String field = LuceneUtil.encodeQName(qname, index.getBrokerPool().getSymbols());
            List<AtomicReaderContext> leaves = reader.leaves();
            // iterate per index segment
            for (AtomicReaderContext context : leaves) {
                // doc values map each Lucene doc to its eXist document id and node id
                NumericDocValues docIdValues = context.reader().getNumericDocValues(FIELD_DOC_ID);
                BinaryDocValues nodeIdValues = context.reader().getBinaryDocValues(LuceneUtil.FIELD_NODE_ID);
                Bits liveDocs = context.reader().getLiveDocs();
                Terms terms = context.reader().terms(field);
                if (terms == null)
                    continue;
                TermsEnum termsIter = terms.iterator(null);
                // position on the first term; skip the segment if the field is empty
                if (termsIter.next() == null) {
                    continue;
                }
                do {
                    // stop collecting once the requested number of terms is reached
                    if (map.size() >= max) {
                        break;
                    }
                    BytesRef ref = termsIter.term();
                    String term = ref.utf8ToString();
                    boolean include = true;
                    // NOTE(review): when an end bound is given, the start prefix is
                    // not checked at all — confirm this asymmetry is intended
                    if (end != null) {
                        if (term.compareTo(end) > 0)
                            include = false;
                    } else if (start != null && !term.startsWith(start))
                        include = false;
                    if (include) {
                        DocsEnum docsEnum = termsIter.docs(null, null);
                        while (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                            // skip documents deleted from this segment
                            if (liveDocs != null && !liveDocs.get(docsEnum.docID())) {
                                continue;
                            }
                            int docId = (int) docIdValues.get(docsEnum.docID());
                            DocumentImpl storedDocument = docs.getDoc(docId);
                            // document not in the requested set: ignore
                            if (storedDocument == null)
                                continue;
                            NodeId nodeId = null;
                            if (nodes != null) {
                                // decode the stored node id: 2 bytes of unit count
                                // followed by the serialized id (buf is a shared
                                // scratch buffer declared elsewhere in this class)
                                BytesRef nodeIdRef = new BytesRef(buf);
                                nodeIdValues.get(docsEnum.docID(), nodeIdRef);
                                int units = ByteConversion.byteToShort(nodeIdRef.bytes, nodeIdRef.offset);
                                nodeId = index.getBrokerPool().getNodeFactory().createFromData(units, nodeIdRef.bytes, nodeIdRef.offset + 2);
                            }
                            // nodeId stays null exactly when nodes == null, so the
                            // nodes.get(...) branch can never dereference a null set
                            if (nodeId == null || nodes.get(storedDocument, nodeId) != null) {
                                Occurrences oc = map.get(term);
                                if (oc == null) {
                                    oc = new Occurrences(term);
                                    map.put(term, oc);
                                }
                                oc.addDocument(storedDocument);
                                oc.addOccurrences(docsEnum.freq());
                            }
                        }
                    }
                } while(termsIter.next() != null);
            }
        }
    } catch (IOException e) {
        LOG.warn("Error while scanning lucene index entries: " + e.getMessage(), e);
    } finally {
        index.releaseReader(reader);
    }
    Occurrences[] occur = new Occurrences[map.size()];
    return map.values().toArray(occur);
}
/**
 * Queues the passed character sequence for indexing. One Lucene document is
 * created per XML node, using 2 fields to identify the node:
 *
 * <ul>
 * <li>docId: eXist-internal document id of the node, stored as string.</li>
 * <li>nodeId: the id of the node, stored in binary compressed form.</li>
 * </ul>
 *
 * The text is indexed into a field whose name encodes the qualified name of
 * the node. The qualified name is stored as a hex sequence pointing into the
 * global symbol table.
 *
 * @param nodeId  id of the node to index
 * @param qname   qualified name of the element or attribute
 * @param path    node path of the node
 * @param config  index configuration that matched this node
 * @param content the text content to index
 */
protected void indexText(NodeId nodeId, QName qname, NodePath path, LuceneIndexConfig config, CharSequence content) {
    nodesToWrite.add(new PendingDoc(nodeId, qname, path, content, config));
    cachedNodesSize += content.length();
    // flush to the index once the in-memory cache exceeds the threshold
    if (cachedNodesSize > maxCachedNodesSize) {
        write();
    }
}
private class PendingDoc {
NodeId nodeId;
CharSequence text;
QName qname;
LuceneIndexConfig idxConf;
private PendingDoc(NodeId nodeId, QName qname, NodePath path, CharSequence text, LuceneIndexConfig idxConf) {
this.nodeId = nodeId;
this.qname = qname;
this.text = text;
this.idxConf = idxConf;
}
}
/**
 * Flushes all pending node documents to the Lucene index and resets the
 * in-memory cache. For each pending node a Lucene document is built carrying
 * the document id and node id (both as doc values, the doc id additionally
 * indexed), the analyzed text content, any streamed metadata fields and the
 * corresponding facet category paths.
 */
private void write() {
    if (nodesToWrite == null || nodesToWrite.isEmpty())
        return;
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        // docId and nodeId are stored as doc value
        // (the field instances are reused across all pending documents)
        NumericDocValuesField fDocId = new NumericDocValuesField(FIELD_DOC_ID, 0);
        BinaryDocValuesField fNodeId = new BinaryDocValuesField(LuceneUtil.FIELD_NODE_ID, new BytesRef(8));
        // docId also needs to be indexed
        IntField fDocIdIdx = new IntField(FIELD_DOC_ID, 0, IntField.TYPE_NOT_STORED);
        // collect document metadata once up front: every String-valued entry
        // becomes an extra analyzed field plus a facet category path that is
        // attached to each pending document below
        final List<Field> metas = new ArrayList<>();
        final List<CategoryPath> paths = new ArrayList<>();
        broker.getIndexController().streamMetas(new MetaStreamListener() {
            @Override
            public void metadata(QName key, Object value) {
                if (value instanceof String) {
                    String name = key.getLocalName();//LuceneUtil.encodeQName(key, index.getBrokerPool().getSymbols());
                    Field fld = new Field(name, value.toString(), Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
                    metas.add(fld);
                    //System.out.println(" "+name+" = "+value.toString());
                    paths.add(new CategoryPath(name, value.toString()));
                }
            }
        });
        TaxonomyWriter taxoWriter = index.getTaxonomyWriter();
        FacetFields facetFields = new FacetFields(taxoWriter);
        for (PendingDoc pending : nodesToWrite) {
            final Document doc = new Document();
            fDocId.setLongValue(currentDoc.getDocId());
            doc.add(fDocId);
            // store the node id
            // layout: 2 bytes of unit count followed by the serialized node id
            int nodeIdLen = pending.nodeId.size();
            byte[] data = new byte[nodeIdLen + 2];
            ByteConversion.shortToByte((short) pending.nodeId.units(), data, 0);
            pending.nodeId.serialize(data, 2);
            fNodeId.setBytesValue(data);
            doc.add(fNodeId);
            // add separate index for node id
            BinaryTokenStream bts = new BinaryTokenStream(new BytesRef(data));
            Field fNodeIdIdx = new Field(LuceneUtil.FIELD_NODE_ID, bts, TYPE_NODE_ID);
            doc.add(fNodeIdIdx);
            String contentField;
            // the text content is indexed in a field using either
            // the qname of the element or attribute or the field
            // name defined in the configuration
            if (pending.idxConf.isNamed())
                contentField = pending.idxConf.getName();
            else
                contentField = LuceneUtil.encodeQName(pending.qname, index.getBrokerPool().getSymbols());
            Field fld = new Field(contentField, pending.text.toString(), Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
            // a per-node boost takes precedence over the global config boost
            if (pending.idxConf.getBoost() > 0)
                fld.setBoost(pending.idxConf.getBoost());
            else if (config.getBoost() > 0)
                fld.setBoost(config.getBoost());
            doc.add(fld);
            fDocIdIdx.setIntValue(currentDoc.getDocId());
            doc.add(fDocIdIdx);
            for (Field meta : metas) {
                doc.add(meta);
            }
            if (!paths.isEmpty()) {
                facetFields.addFields(doc, paths);
            }
            // index with the analyzer configured for this node, if any
            if (pending.idxConf.getAnalyzer() == null)
                writer.addDocument(doc);
            else {
                writer.addDocument(doc, pending.idxConf.getAnalyzer());
            }
        }
    } catch (IOException e) {
        LOG.warn("An exception was caught while indexing document: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        // reset the pending-node cache whether or not the flush succeeded
        nodesToWrite = new ArrayList<>();
        cachedNodesSize = 0;
    }
}
/**
 * Merges all Lucene index segments into a single one ("optimize"). This can
 * take considerable time, and write operations are blocked while the merge
 * runs.
 */
public void optimize() {
    IndexWriter mergeWriter = null;
    try {
        mergeWriter = index.getWriter(true);
        // blocking forceMerge: wait until the merge has fully completed
        mergeWriter.forceMerge(1, true);
        mergeWriter.commit();
    } catch (IOException e) {
        LOG.warn("An exception was caught while optimizing the lucene index: " + e.getMessage(), e);
    } finally {
        // always hand the writer back, even after a failure
        index.releaseWriter(mergeWriter);
    }
}
/**
 * Stream listener driving the Lucene index while eXist streams a document's
 * nodes. On STORE, matching configuration entries push a text extractor onto
 * a stack in {@code startElement}, which is popped and flushed to the index
 * in {@code endElement}. On REMOVE_SOME_NODES, node ids are merely recorded
 * for later removal.
 */
private class LuceneStreamListener extends AbstractStreamListener {
    @Override
    public void startElement(Txn transaction, ElementImpl element, NodePath path) {
        if (mode == STORE && config != null) {
            // notify extractors already on the stack of the nested element
            if (contentStack != null && !contentStack.isEmpty()) {
                for (TextExtractor extractor : contentStack) {
                    extractor.startElement(element.getQName());
                }
            }
            Iterator<LuceneIndexConfig> configIter = config.getConfig(path);
            if (configIter != null) {
                if (contentStack == null) contentStack = new Stack<>();
                while (configIter.hasNext()) {
                    LuceneIndexConfig configuration = configIter.next();
                    // one extractor per configuration entry matching this path
                    if (configuration.match(path)) {
                        TextExtractor extractor = new DefaultTextExtractor();
                        extractor.configure(config, configuration);
                        contentStack.push(extractor);
                    }
                }
            }
        }
        super.startElement(transaction, element, path);
    }
    @Override
    public void endElement(Txn transaction, ElementImpl element, NodePath path) {
        if (config != null) {
            if (mode == STORE && contentStack != null && !contentStack.isEmpty()) {
                for (TextExtractor extractor : contentStack) {
                    extractor.endElement(element.getQName());
                }
            }
            Iterator<LuceneIndexConfig> configIter = config.getConfig(path);
            if (mode != REMOVE_ALL_NODES && configIter != null) {
                if (mode == REMOVE_SOME_NODES) {
                    nodesToRemove.add(element.getNodeId());
                } else {
                    while (configIter.hasNext()) {
                        LuceneIndexConfig configuration = configIter.next();
                        if (configuration.match(path)) {
                            // pop the extractor pushed in startElement for this
                            // configuration and index its accumulated text.
                            // NOTE(review): assumes startElement pushed one
                            // extractor per matching configuration — confirm
                            // the stack cannot be null/empty here
                            TextExtractor extractor = contentStack.pop();
                            indexText(element.getNodeId(), element.getQName(),
                                path, extractor.getIndexConfig(), extractor.getText());
                        }
                    }
                }
            }
        }
        super.endElement(transaction, element, path);
    }
    @Override
    public void attribute(Txn transaction, AttrImpl attrib, NodePath path) {
        // temporarily extend the path with the attribute's qname for matching
        path.addComponent(attrib.getQName());
        Iterator<LuceneIndexConfig> configIter = null;
        if (config != null)
            configIter = config.getConfig(path);
        if (mode != REMOVE_ALL_NODES && configIter != null) {
            if (mode == REMOVE_SOME_NODES) {
                nodesToRemove.add(attrib.getNodeId());
            } else {
                while (configIter.hasNext()) {
                    LuceneIndexConfig configuration = configIter.next();
                    if (configuration.match(path)) {
                        // attributes are indexed directly: their value is
                        // already complete at this point
                        indexText(attrib.getNodeId(), attrib.getQName(), path,
                            configuration, attrib.getValue());
                    }
                }
            }
        }
        path.removeLastComponent();
        super.attribute(transaction, attrib, path);
    }
    @Override
    public void characters(Txn transaction, CharacterDataImpl text, NodePath path) {
        // feed the text to every extractor currently collecting content
        if (contentStack != null && !contentStack.isEmpty()) {
            for (TextExtractor extractor : contentStack) {
                extractor.beforeCharacters();
                extractor.characters(text.getXMLString());
            }
        }
        super.characters(transaction, text, path);
    }
    @Override
    public IndexWorker getWorker() {
        return LuceneIndexWorker.this;
    }
}
/**
 * Match class containing the score of a match and a reference to
 * the query that generated it.
 */
public class LuceneMatch extends Match {

    /** Lucene relevance score assigned to this match. */
    private float score = 0.0f;
    /** The query that produced this match; compared by identity in equals(). */
    private final Query query;

    public LuceneMatch(int contextId, NodeId nodeId, Query query) {
        super(contextId, nodeId, null);
        this.query = query;
    }

    public LuceneMatch(LuceneMatch copy) {
        super(copy);
        this.score = copy.score;
        this.query = copy.query;
    }

    /**
     * Term-based factory is not supported for Lucene matches; use
     * {@link #createInstance(int, NodeId, Query)} instead.
     *
     * @return always null
     */
    @Override
    public Match createInstance(int contextId, NodeId nodeId, String matchTerm) {
        return null;
    }

    public Match createInstance(int contextId, NodeId nodeId, Query query) {
        return new LuceneMatch(contextId, nodeId, query);
    }

    @Override
    public Match newCopy() {
        return new LuceneMatch(this);
    }

    @Override
    public String getIndexId() {
        return LuceneIndex.ID;
    }

    public Query getQuery() {
        return query;
    }

    public float getScore() {
        return score;
    }

    protected void setScore(float score) {
        this.score = score;
    }

    @Override
    public boolean equals(Object other) {
        if (!(other instanceof LuceneMatch)) {
            return false;
        }
        final LuceneMatch o = (LuceneMatch) other;
        // guard against NPE when this match carries no node id;
        // the query is deliberately compared by reference
        return (nodeId == o.nodeId || (nodeId != null && nodeId.equals(o.nodeId)))
                && query == o.query;
    }

    /**
     * Consistent with {@link #equals(Object)}: combines the node id's hash
     * with the identity hash of the query, since equals() compares the query
     * by reference. Previously missing, which broke the equals/hashCode
     * contract for hash-based collections.
     */
    @Override
    public int hashCode() {
        final int idHash = nodeId == null ? 0 : nodeId.hashCode();
        return 31 * idHash + System.identityHashCode(query);
    }

    @Override
    public boolean matchEquals(Match other) {
        return equals(other);
    }
}
}