/*
* eXist Open Source Native XML Database
* Copyright (C) 2011 The eXist Project
* http://exist-db.org
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* $Id$
*/
package org.exist.indexing.lucene;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.apache.lucene.queryparser.flexible.standard.CommonQueryParserConfiguration;
import org.apache.lucene.search.*;
import org.apache.lucene.util.*;
import org.exist.collections.Collection;
import org.exist.dom.*;
import org.exist.indexing.*;
import org.exist.indexing.lucene.PlainTextHighlighter.Offset;
import org.exist.indexing.lucene.PlainTextIndexConfig.PlainTextDoc;
import org.exist.indexing.lucene.PlainTextIndexConfig.PlainTextField;
import org.exist.memtree.MemTreeBuilder;
import org.exist.memtree.NodeImpl;
import org.exist.numbering.NodeId;
import org.exist.security.PermissionDeniedException;
import org.exist.storage.*;
import org.exist.storage.btree.DBException;
import org.exist.storage.lock.Lock;
import org.exist.storage.txn.Txn;
import org.exist.util.ByteConversion;
import org.exist.util.DatabaseConfigurationException;
import org.exist.util.Occurrences;
import org.exist.xmldb.XmldbURI;
import org.exist.xquery.*;
import org.exist.xquery.value.IntegerValue;
import org.exist.xquery.value.NodeValue;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.helpers.AttributesImpl;
import java.io.IOException;
import java.util.*;
/**
* Class for handling all Lucene operations.
*
* @author Wolfgang Meier (wolfgang@exist-db.org)
* @author Dannes Wessels (dannes@exist-db.org)
*/
public class LuceneIndexWorker implements OrderedValuesIndex, QNamedKeysIndex {
// Query option names understood by setOptions().
public static final String OPTION_DEFAULT_OPERATOR = "default-operator";
public static final String OPTION_PHRASE_SLOP = "phrase-slop";
public static final String OPTION_LEADING_WILDCARD = "leading-wildcard";
public static final String OPTION_FILTER_REWRITE = "filter-rewrite";
// Value of OPTION_DEFAULT_OPERATOR selecting Lucene's OR operator.
public static final String DEFAULT_OPERATOR_OR = "or";
// Field type used for the node-id field: indexed and tokenized, but not
// stored, with norms and term vectors disabled.
public static final org.apache.lucene.document.FieldType TYPE_NODE_ID = new org.apache.lucene.document.FieldType();
static {
    TYPE_NODE_ID.setIndexed(true);
    TYPE_NODE_ID.setStored(false);
    TYPE_NODE_ID.setOmitNorms(true);
    TYPE_NODE_ID.setStoreTermVectors(false);
    TYPE_NODE_ID.setTokenized(true);
}
static final Logger LOG = Logger.getLogger(LuceneIndexWorker.class);
// The index instance this worker reads from and writes to.
protected LuceneIndex index;
// Cached match listener, lazily created and reused (see getMatchListener()).
private LuceneMatchListener matchListener = null;
// Translates XML query descriptions into Lucene queries.
private XMLToQuery queryTranslator;
private DBBroker broker;
// Document currently being indexed or removed.
private DocumentImpl currentDoc = null;
// Current StreamListener mode (STORE, REMOVE_ALL_NODES, ...).
private int mode = 0;
// Per-document copy of the collection's Lucene configuration.
private LuceneConfig config;
private Stack<TextExtractor> contentStack = null;
// Node ids queued for removal in REMOVE_SOME_NODES mode.
private Set<NodeId> nodesToRemove = null;
// Nodes queued for writing in STORE mode.
private List<PendingDoc> nodesToWrite = null;
// Lucene document being assembled for a non-XML (binary) resource.
private Document pendingDoc = null;
private int cachedNodesSize = 0;
// Flush threshold for cached node content, in bytes.
private int maxCachedNodesSize = 4096 * 1024;
// Analyzer assigned by the element-based query paths; see NOTE in query(Element ...).
private Analyzer analyzer;
// Lucene field names holding the eXist document id and document URI.
public static final String FIELD_DOC_ID = "docId";
public static final String FIELD_DOC_URI = "docUri";
// Shared scratch buffer for decoding node ids.
// NOTE(review): shared mutable state — presumably each worker is used by a
// single thread; confirm before concurrent use.
private final byte[] buf = new byte[1024];
/**
 * Create a worker bound to the given index and broker.
 *
 * @param parent the owning Lucene index
 * @param broker the database broker this worker operates with
 */
public LuceneIndexWorker(LuceneIndex parent, DBBroker broker) {
    this.broker = broker;
    this.index = parent;
    // query translator needs the index to resolve configuration/analyzers
    this.queryTranslator = new XMLToQuery(index);
}
/** @return the unique id of the Lucene index plugin. */
public String getIndexId() {
    return LuceneIndex.ID;
}
/** @return the human-readable name of the underlying index. */
public String getIndexName() {
    return index.getIndexName();
}
/** This worker performs no query rewriting; always returns null. */
public QueryRewriter getQueryRewriter(XQueryContext context) {
    return null;
}
/**
 * Parse the collection.xconf nodes into a {@link LuceneConfig} and keep it
 * as this worker's configuration.
 *
 * @return the parsed configuration object
 * @throws DatabaseConfigurationException if the configuration is invalid
 */
public Object configure(IndexController controller, NodeList configNodes, Map<String, String> namespaces) throws DatabaseConfigurationException {
    LOG.debug("Configuring lucene index...");
    final LuceneConfig parsedConfig = new LuceneConfig(configNodes, namespaces);
    this.config = parsedConfig;
    return parsedConfig;
}
/**
 * Flush pending work according to the current mode: write cached nodes,
 * remove the whole document, remove selected nodes, or remove the
 * plain-text (binary) index entries.
 */
public void flush() {
    final int currentMode = mode;
    if (currentMode == StreamListener.STORE) {
        write();
    } else if (currentMode == StreamListener.REMOVE_ALL_NODES) {
        removeDocument(currentDoc.getDocId());
    } else if (currentMode == StreamListener.REMOVE_SOME_NODES) {
        removeNodes();
    } else if (currentMode == StreamListener.REMOVE_BINARY) {
        removePlainTextIndexes();
    }
}
/** Set the current document without changing the mode (mode = UNKNOWN). */
public void setDocument(DocumentImpl document) {
    setDocument(document, StreamListener.UNKNOWN);
}
/**
 * Set the current document and mode, and load the Lucene configuration of
 * the document's collection (a private copy, so it can be modified safely).
 */
public void setDocument(DocumentImpl document, int newMode) {
    currentDoc = document;
    contentStack = null;
    final IndexSpec indexConf = document.getCollection().getIndexConfiguration(broker);
    if (indexConf != null) {
        final LuceneConfig shared = (LuceneConfig) indexConf.getCustomIndexSpec(LuceneIndex.ID);
        // There is only one LuceneConfig per db instance: copy it so we can
        // work with it safely. A null spec clears any previous config.
        config = shared == null ? null : new LuceneConfig(shared);
    }
    mode = newMode;
}
/**
 * Switch the worker's mode and (re)initialize the corresponding pending
 * node collections.
 */
public void setMode(int mode) {
    this.mode = mode;
    if (mode == StreamListener.STORE) {
        // reuse the list across documents to avoid reallocation
        if (nodesToWrite == null) {
            nodesToWrite = new ArrayList<>();
        } else {
            nodesToWrite.clear();
        }
        cachedNodesSize = 0;
    } else if (mode == StreamListener.REMOVE_SOME_NODES) {
        nodesToRemove = new TreeSet<>();
    }
}
/** @return the document currently being processed, or null. */
public DocumentImpl getDocument() {
    return currentDoc;
}
/** @return the current StreamListener mode. */
public int getMode() {
    return this.mode;
}
/**
 * Determine the top-most ancestor of {@code node} that needs to be reindexed
 * after an update, based on the configured index paths.
 *
 * Returns null if nothing needs reindexing (attribute node, no config, or
 * no configured path matches any ancestor path).
 */
public StoredNode getReindexRoot(StoredNode node, NodePath path, boolean insert, boolean includeSelf) {
    if (node.getNodeType() == Node.ATTRIBUTE_NODE)
        return null;
    if (config == null)
        return null;
    // First pass: check whether any ancestor path (including or excluding the
    // node itself, depending on includeSelf) matches a configured index path.
    NodePath p = new NodePath(path);
    boolean reindexRequired = false;
    if (node.getNodeType() == Node.ELEMENT_NODE && !includeSelf)
        p.removeLastComponent();
    for (int i = 0; i < p.length(); i++) {
        if (config.matches(p)) {
            reindexRequired = true;
            break;
        }
        p.removeLastComponent();
    }
    if (reindexRequired) {
        // Second pass: walk up from the node and remember the highest
        // ancestor whose path still matches the configuration.
        p = new NodePath(path);
        StoredNode topMost = null;
        StoredNode currentNode = node;
        if (currentNode.getNodeType() != Node.ELEMENT_NODE)
            currentNode = currentNode.getParentStoredNode();
        while (currentNode != null) {
            if (config.matches(p))
                topMost = currentNode;
            currentNode = currentNode.getParentStoredNode();
            p.removeLastComponent();
        }
        return topMost;
    }
    return null;
}
// Single stream listener feeding node events into this worker.
private StreamListener listener = new LuceneStreamListener();
/** @return the stream listener used to feed indexing events to this worker. */
public StreamListener getListener() {
    return listener;
}
/**
 * Return a match listener for highlighting if the node proxy carries at
 * least one match produced by this index; null otherwise. The listener
 * instance is cached per worker and reset for reuse.
 */
public MatchListener getMatchListener(DBBroker broker, NodeProxy proxy) {
    boolean needToFilter = false;
    Match nextMatch = proxy.getMatches();
    while (nextMatch != null) {
        // NOTE(review): identity comparison on the index-id string; this
        // appears to rely on both sides referencing the LuceneIndex.ID
        // constant — confirm before changing to equals().
        if (nextMatch.getIndexId() == LuceneIndex.ID) {
            needToFilter = true;
            break;
        }
        nextMatch = nextMatch.getNextMatch();
    }
    if (!needToFilter)
        return null;
    // lazily create one listener per worker and reuse it across calls
    if (matchListener == null)
        matchListener = new LuceneMatchListener(index, broker, proxy);
    else
        matchListener.reset(broker, proxy);
    return matchListener;
}
/**
 * Delete all index entries belonging to the given document id.
 * Always resets the mode to STORE afterwards.
 */
protected void removeDocument(int docId) {
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        // the doc id is stored prefix-coded, so encode it the same way
        final BytesRef idBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
        NumericUtils.intToPrefixCoded(docId, 0, idBytes);
        writer.deleteDocuments(new Term(FIELD_DOC_ID, idBytes));
    } catch (IOException e) {
        LOG.warn("Error while removing lucene index: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        mode = StreamListener.STORE;
    }
}
/**
 * Delete all plain-text (binary resource) index entries for the current
 * document, identified by its URI. Always resets the mode to STORE.
 */
protected void removePlainTextIndexes() {
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        // binary documents are keyed by URI, not by doc id
        final Term uriTerm = new Term(FIELD_DOC_URI, currentDoc.getURI().toString());
        writer.deleteDocuments(uriTerm);
    } catch (IOException e) {
        LOG.warn("Error while removing lucene index: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        mode = StreamListener.STORE;
    }
}
/**
 * Delete the index entries of every document in the given collection.
 * If {@code reindex} is set, the index is synced afterwards.
 */
public void removeCollection(Collection collection, DBBroker broker, boolean reindex) {
    if (LOG.isDebugEnabled()) {
        LOG.debug("Removing collection " + collection.getURI());
    }
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        final Iterator<DocumentImpl> documents = collection.iterator(broker);
        while (documents.hasNext()) {
            final DocumentImpl doc = documents.next();
            // doc ids are stored prefix-coded; encode the same way for deletion
            final BytesRef idBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
            NumericUtils.intToPrefixCoded(doc.getDocId(), 0, idBytes);
            writer.deleteDocuments(new Term(FIELD_DOC_ID, idBytes));
        }
    } catch (IOException | PermissionDeniedException e) {
        LOG.error("Error while removing lucene index: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        if (reindex) {
            try {
                index.sync();
            } catch (DBException e) {
                LOG.warn("Exception during reindex: " + e.getMessage(), e);
            }
        }
        mode = StreamListener.STORE;
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Collection removed.");
    }
}
/**
 * Remove specific nodes from the index. This method is used for node updates
 * and called from flush() if the worker is in {@link StreamListener#REMOVE_SOME_NODES}
 * mode. Clears {@code nodesToRemove} when done.
 */
protected void removeNodes() {
    if (nodesToRemove == null)
        return;
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        // prefix-coded doc-id term, matching the encoding used at index time
        BytesRef bytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
        NumericUtils.intToPrefixCoded(currentDoc.getDocId(), 0, bytes);
        Term dt = new Term(FIELD_DOC_ID, bytes);
        TermQuery tq = new TermQuery(dt);
        for (NodeId nodeId : nodesToRemove) {
            // store the node id: a 2-byte unit count followed by the
            // serialized id — the same layout decoded in readNodeId()
            int nodeIdLen = nodeId.size();
            byte[] data = new byte[nodeIdLen + 2];
            ByteConversion.shortToByte((short) nodeId.units(), data, 0);
            nodeId.serialize(data, 2);
            Term it = new Term(LuceneUtil.FIELD_NODE_ID, new BytesRef(data));
            TermQuery iq = new TermQuery(it);
            // delete only entries matching BOTH the document id and the node id
            BooleanQuery q = new BooleanQuery();
            q.add(tq, BooleanClause.Occur.MUST);
            q.add(iq, BooleanClause.Occur.MUST);
            writer.deleteDocuments(q);
        }
    } catch (IOException e) {
        LOG.warn("Error while deleting lucene index entries: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        nodesToRemove = null;
    }
}
/**
 * Decode the node id stored for a Lucene document: a 2-byte unit count
 * followed by the serialized id (the layout written in removeNodes()).
 * NOTE(review): reads through the shared {@code buf} scratch buffer, so this
 * is presumably not safe for concurrent use — confirm callers are
 * single-threaded.
 */
private NodeId readNodeId(int doc, BinaryDocValues nodeIdValues, BrokerPool pool) {
    BytesRef ref = new BytesRef(buf);
    nodeIdValues.get(doc, ref);
    int units = ByteConversion.byteToShort(ref.bytes, ref.offset);
    return pool.getNodeFactory().createFromData(units, ref.bytes, ref.offset + 2);
}
/**
* Query the index. Returns a node set containing all matching nodes. Each node
* in the node set has a {@link org.exist.indexing.lucene.LuceneIndexWorker.LuceneMatch}
* element attached, which stores the score and a link to the query which generated it.
*
* @param context current XQuery context
* @param contextId current context id, identify to track the position inside nested XPath predicates
* @param docs query will be restricted to documents in this set
* @param contextSet if specified, returned nodes will be descendants of the nodes in this set
* @param qnames query will be restricted to nodes with the qualified names given here
* @param queryStr a lucene query string
* @param axis which node is returned: the node in which a match was found or the corresponding ancestor
* from the contextSet
* @return node set containing all matching nodes
*
* @throws IOException
* @throws ParseException
*/
public NodeSet query(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
    List<QName> qnames, String queryStr, int axis, Properties options)
    throws IOException, ParseException, XPathException {
    // expand wildcard qnames to all qnames that have an index defined
    qnames = getDefinedIndexes(qnames);
    final NodeSet resultSet = new NewArrayNodeSet();
    final boolean returnAncestor = axis == NodeSet.ANCESTOR;
    IndexSearcher searcher = null;
    try {
        searcher = index.getSearcher();
        for (QName qname : qnames) {
            final String field = LuceneUtil.encodeQName(qname, index.getBrokerPool().getSymbols());
            final Analyzer fieldAnalyzer = getAnalyzer(null, qname, context.getBroker(), docs);
            final QueryParserWrapper parser = getQueryParser(field, fieldAnalyzer, docs);
            setOptions(options, parser.getConfiguration());
            final Query query = parser.parse(queryStr);
            // collect matches for this qname directly into resultSet
            searchAndProcess(contextId, qname, docs, contextSet, resultSet,
                returnAncestor, searcher, query, context.getWatchDog());
        }
    } finally {
        index.releaseSearcher(searcher);
    }
    return resultSet;
}
/**
 * Apply user-supplied query options (default operator, leading wildcard,
 * phrase slop, filter rewrite) to the given query parser configuration.
 *
 * @throws ParseException if the phrase-slop option is not a number
 */
protected void setOptions(Properties options, CommonQueryParserConfiguration parser) throws ParseException {
    if (options == null) {
        return;
    }
    final String operator = options.getProperty(OPTION_DEFAULT_OPERATOR);
    if (operator != null && parser instanceof QueryParserBase) {
        final QueryParserBase base = (QueryParserBase) parser;
        base.setDefaultOperator(DEFAULT_OPERATOR_OR.equals(operator)
            ? QueryParser.OR_OPERATOR
            : QueryParser.AND_OPERATOR);
    }
    final String leadingWildcard = options.getProperty(OPTION_LEADING_WILDCARD);
    if (leadingWildcard != null) {
        parser.setAllowLeadingWildcard("yes".equalsIgnoreCase(leadingWildcard));
    }
    final String slopOption = options.getProperty(OPTION_PHRASE_SLOP);
    if (slopOption != null) {
        try {
            parser.setPhraseSlop(Integer.parseInt(slopOption));
        } catch (NumberFormatException e) {
            throw new ParseException("value for option " + OPTION_PHRASE_SLOP + " needs to be a number");
        }
    }
    final String filterRewrite = options.getProperty(OPTION_FILTER_REWRITE);
    if (filterRewrite != null) {
        parser.setMultiTermRewriteMethod("yes".equalsIgnoreCase(filterRewrite)
            ? MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE
            : MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
    }
}
/**
* Query the index. Returns a node set containing all matching nodes. Each node
* in the node set has a {@link org.exist.indexing.lucene.LuceneIndexWorker.LuceneMatch}
* element attached, which stores the score and a link to the query which generated it.
*
* @param context current XQuery context
* @param contextId current context id, identify to track the position inside nested XPath predicates
* @param docs query will be restricted to documents in this set
* @param contextSet if specified, returned nodes will be descendants of the nodes in this set
* @param qnames query will be restricted to nodes with the qualified names given here
* @param queryRoot an XML representation of the query, see {@link XMLToQuery}.
* @param axis which node is returned: the node in which a match was found or the corresponding ancestor
* from the contextSet
* @return node set containing all matching nodes
*
* @throws IOException
* @throws ParseException
*/
public NodeSet query(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
    List<QName> qnames, Element queryRoot, int axis, Properties options)
    throws IOException, ParseException, XPathException {
    // expand wildcard qnames to all qnames that have an index defined
    qnames = getDefinedIndexes(qnames);
    NodeSet resultSet = new NewArrayNodeSet();
    boolean returnAncestor = axis == NodeSet.ANCESTOR;
    IndexSearcher searcher = null;
    try {
        searcher = index.getSearcher();
        for (QName qname : qnames) {
            String field = LuceneUtil.encodeQName(qname, index.getBrokerPool().getSymbols());
            // NOTE(review): assigns the instance field 'analyzer' (the
            // string-query variant uses a local instead) — confirm whether
            // other code depends on this side effect before localizing it.
            analyzer = getAnalyzer(null, qname, context.getBroker(), docs);
            Query query = queryTranslator.parse(field, queryRoot, analyzer, options);
            if (query != null) {
                searchAndProcess(contextId, qname, docs, contextSet, resultSet,
                    returnAncestor, searcher, query, context.getWatchDog());
            }
        }
    } finally {
        index.releaseSearcher(searcher);
    }
    return resultSet;
}
/**
 * Query a named field of the index using an XML query description.
 * Matching nodes (or their ancestors from contextSet, depending on axis)
 * are returned with LuceneMatch objects attached.
 */
public NodeSet queryField(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
    String field, Element queryRoot, int axis, Properties options)
    throws IOException, XPathException {
    NodeSet resultSet = new NewArrayNodeSet();
    boolean returnAncestor = axis == NodeSet.ANCESTOR;
    IndexSearcher searcher = null;
    try {
        searcher = index.getSearcher();
        // NOTE(review): assigns the instance field 'analyzer' — see the
        // matching note in query(...); confirm before localizing.
        analyzer = getAnalyzer(field, null, context.getBroker(), docs);
        Query query = queryTranslator.parse(field, queryRoot, analyzer, options);
        if (query != null) {
            searchAndProcess(contextId, null, docs, contextSet, resultSet,
                returnAncestor, searcher, query, context.getWatchDog());
        }
    } finally {
        index.releaseSearcher(searcher);
    }
    return resultSet;
}
/**
 * Execute the query and let a {@link LuceneHitCollector} stream the hits
 * directly into {@code resultSet}.
 */
private void searchAndProcess(int contextId, QName qname, DocumentSet docs,
    NodeSet contextSet, NodeSet resultSet, boolean returnAncestor,
    IndexSearcher searcher, Query query, XQueryWatchDog watchDog) throws IOException, TerminatedException {
    searcher.search(query, new LuceneHitCollector(qname, query, docs, contextSet,
        resultSet, returnAncestor, contextId, watchDog));
}
/**
 * Query a named field of the index using a Lucene query string.
 * Matching nodes (or their ancestors from contextSet, depending on axis)
 * are returned with LuceneMatch objects attached.
 *
 * @param context current XQuery context
 * @param contextId current context id for nested predicate tracking
 * @param docs query restricted to documents in this set
 * @param contextSet if given, results must be descendants of these nodes
 * @param field the Lucene field name to query
 * @param queryString the Lucene query string
 * @param axis NodeSet.ANCESTOR to return context ancestors, else the match node
 * @param options query options, see {@link #setOptions}
 * @return node set containing all matching nodes
 */
public NodeSet queryField(XQueryContext context, int contextId, DocumentSet docs, NodeSet contextSet,
    String field, String queryString, int axis, Properties options)
    throws IOException, ParseException, XPathException {
    NodeSet resultSet = new NewArrayNodeSet();
    boolean returnAncestor = axis == NodeSet.ANCESTOR;
    IndexSearcher searcher = null;
    try {
        searcher = index.getSearcher();
        Analyzer analyzer = getAnalyzer(field, null, context.getBroker(), docs);
        // guard the debug message: avoids building the string on every call
        // when debug logging is disabled (consistent with removeCollection)
        if (LOG.isDebugEnabled())
            LOG.debug("Using analyzer " + analyzer + " for " + queryString);
        QueryParserWrapper parser = getQueryParser(field, analyzer, docs);
        setOptions(options, parser.getConfiguration());
        Query query = parser.parse(queryString);
        searchAndProcess(contextId, null, docs, contextSet, resultSet,
            returnAncestor, searcher, query, context.getWatchDog());
    } finally {
        index.releaseSearcher(searcher);
    }
    return resultSet;
}
/**
* Add SOLR formatted data to lucene index.
*
* <pre>
* {@code
* <doc>
* <field name="name1" boost="value1">data1</field>
* <field name="name2">data2</field>
* </doc>
* }
* </pre>
*
* @param descriptor SOLR styled data
*/
public void indexNonXML(NodeValue descriptor) {
    // Verify input: the descriptor root must be a <doc> element
    if (!descriptor.getNode().getLocalName().contentEquals("doc")) {
        // reject silently (logged) rather than throwing
        LOG.error("Expected <doc> got <" + descriptor.getNode().getLocalName() + ">");
        return;
    }
    // Setup parser for SOLR syntax and parse
    PlainTextIndexConfig solrconfParser = new PlainTextIndexConfig();
    solrconfParser.parse(descriptor);
    // Get <doc> information
    PlainTextDoc solrDoc = solrconfParser.getDoc();
    if (pendingDoc == null) {
        // create the Lucene document lazily; repeated calls for the same
        // resource keep appending fields to it until writeNonXML() runs
        pendingDoc = new Document();
        // Set DocId: docvalues for retrieval plus an indexed int field
        NumericDocValuesField fDocId = new NumericDocValuesField(FIELD_DOC_ID, currentDoc.getDocId());
        pendingDoc.add(fDocId);
        IntField fDocIdIdx = new IntField(FIELD_DOC_ID, currentDoc.getDocId(), Field.Store.NO);
        pendingDoc.add(fDocIdIdx);
        // For binary documents the doc path needs to be stored
        String uri = currentDoc.getURI().toString();
        Field fDocUri = new Field(FIELD_DOC_URI, uri, Field.Store.YES, Field.Index.NOT_ANALYZED);
        pendingDoc.add(fDocUri);
    }
    // Iterate over all found fields and write the data.
    for (PlainTextField field : solrconfParser.getFields()) {
        // Get field type configuration; collection config wins over the
        // store flag supplied in the SOLR descriptor
        FieldType fieldType = config == null ? null : config.getFieldType(field.getName());
        Field.Store store = null;
        if (fieldType != null)
            store = fieldType.getStore();
        if (store == null)
            store = field.getStore();
        // Get name from SOLR field
        String contentFieldName = field.getName();
        // NOTE(review): fieldAnalyzer is computed but never used here —
        // confirm whether a per-field analyzer was intended.
        Analyzer fieldAnalyzer = (fieldType == null) ? null : fieldType.getAnalyzer();
        // Actual field content ; Store flag can be set in solrField
        Field contentField = new Field(contentFieldName, field.getData().toString(), store, Field.Index.ANALYZED, Field.TermVector.YES);
        // Extract (document) Boost factor
        if (field.getBoost() > 0) {
            contentField.setBoost(field.getBoost());
        }
        pendingDoc.add(contentField);
    }
}
/**
 * Write the pending non-XML (binary resource) Lucene document built by
 * {@link #indexNonXML} to the index, then clear the pending state.
 */
public void writeNonXML() {
    // Guard: without this, a call before indexNonXML() would pass null to
    // IndexWriter.addDocument and throw a NullPointerException.
    if (pendingDoc == null) {
        return;
    }
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        writer.addDocument(pendingDoc);
    } catch (IOException e) {
        LOG.warn("An exception was caught while indexing document: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        pendingDoc = null;
        cachedNodesSize = 0;
    }
}
/**
 * Run a plain Lucene query over non-XML (SOLR-style) indexed resources and
 * build an in-memory XML report of the form
 * {@code <results><search uri="..." score="..."><field>...</field></search>...</results>}.
 * Documents the calling user may not read are silently skipped.
 *
 * @param context current XQuery context, used for permission checks
 * @param toBeMatchedURIs document URIs or collection URI prefixes a hit must match
 * @param queryText the Lucene query string
 * @return the root {@code <results>} node of the report
 * @throws XPathException if the query cannot be parsed or executed
 */
public NodeImpl search(final XQueryContext context, final List<String> toBeMatchedURIs, String queryText) throws XPathException {
    NodeImpl report = null;
    IndexSearcher searcher = null;
    try {
        // Get index searcher
        searcher = index.getSearcher();
        // Get analyzer : to be retrieved from configuration
        final Analyzer searchAnalyzer = new StandardAnalyzer(Version.LUCENE_43);
        // Setup query Version, default field, analyzer
        final QueryParserWrapper parser = getQueryParser("", searchAnalyzer, null);
        final Query query = parser.parse(queryText);
        // extract all used fields from query
        final String[] fields = LuceneUtil.extractFields(query, searcher.getIndexReader());
        final PlainTextHighlighter highlighter = new PlainTextHighlighter(query, searcher.getIndexReader());
        final MemTreeBuilder builder = new MemTreeBuilder();
        builder.startDocument();
        // start root element
        final int nodeNr = builder.startElement("", "results", "results", null);
        // Perform actual search
        searcher.search(query, new Collector() {
            private Scorer scorer;
            private AtomicReader reader;
            @Override
            public void setScorer(Scorer scorer) throws IOException {
                this.scorer = scorer;
            }
            @Override
            public void collect(int docNum) throws IOException {
                Document doc = reader.document(docNum);
                // Get URI field of document
                String fDocUri = doc.get(FIELD_DOC_URI);
                // Get score
                float score = scorer.score();
                // Check if document URI has a full match or if a
                // document is in a collection
                if (isDocumentMatch(fDocUri, toBeMatchedURIs)) {
                    DocumentImpl storedDoc = null;
                    try {
                        // try to read document to check if user is allowed to access it
                        storedDoc = context.getBroker().getXMLResource(XmldbURI.createInternal(fDocUri), Lock.READ_LOCK);
                        if (storedDoc == null) {
                            return;
                        }
                        // setup attributes
                        AttributesImpl attribs = new AttributesImpl();
                        attribs.addAttribute("", "uri", "uri", "CDATA", fDocUri);
                        attribs.addAttribute("", "score", "score", "CDATA", "" + score);
                        // write element and attributes
                        builder.startElement("", "search", "search", attribs);
                        for (String field : fields) {
                            String[] fieldContent = doc.getValues(field);
                            attribs.clear();
                            attribs.addAttribute("", "name", "name", "CDATA", field);
                            for (String content : fieldContent) {
                                List<Offset> offsets = highlighter.getOffsets(content, searchAnalyzer);
                                if (offsets != null) {
                                    builder.startElement("", "field", "field", attribs);
                                    highlighter.highlight(content, offsets, builder);
                                    builder.endElement();
                                }
                            }
                        }
                        builder.endElement();
                        // clean attributes
                        attribs.clear();
                    } catch (PermissionDeniedException e) {
                        // not allowed to read the document: ignore the match.
                    } finally {
                        if (storedDoc != null) {
                            storedDoc.getUpdateLock().release(Lock.READ_LOCK);
                        }
                    }
                }
            }
            @Override
            public void setNextReader(AtomicReaderContext atomicReaderContext) throws IOException {
                this.reader = atomicReaderContext.reader();
            }
            @Override
            public boolean acceptsDocsOutOfOrder() {
                return true;
            }
        });
        // finish root element
        builder.endElement();
        report = ((org.exist.memtree.DocumentImpl) builder.getDocument()).getNode(nodeNr);
    } catch (Exception ex) {
        // Log with message and full stack trace, then propagate.
        // (Previously this also dumped the trace to stderr via
        // printStackTrace() and logged only the exception object.)
        LOG.error("Lucene search failed: " + ex.getMessage(), ex);
        throw new XPathException(ex);
    } finally {
        index.releaseSearcher(searcher);
    }
    return report;
}
/**
 * Return the stored content of the given field for the document with the
 * given id, or null if no such document/field exists in the index.
 */
public String getFieldContent(int docId, String field) throws IOException {
    final BytesRef idBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
    NumericUtils.intToPrefixCoded(docId, 0, idBytes);
    final Term idTerm = new Term(FIELD_DOC_ID, idBytes);
    IndexReader reader = null;
    try {
        reader = index.getReader();
        // scan each index segment until the doc-id term is found
        for (AtomicReaderContext context : reader.leaves()) {
            final DocsEnum docs = context.reader().termDocsEnum(idTerm);
            if (docs == null || docs.nextDoc() == DocsEnum.NO_MORE_DOCS) {
                continue;
            }
            return reader.document(docs.docID()).get(field);
        }
        return null;
    } finally {
        index.releaseReader(reader);
    }
}
/**
 * Check whether any index entry exists for the document with the given id.
 */
public boolean hasIndex(int docId) throws IOException {
    final BytesRef idBytes = new BytesRef(NumericUtils.BUF_SIZE_INT);
    NumericUtils.intToPrefixCoded(docId, 0, idBytes);
    final Term idTerm = new Term(FIELD_DOC_ID, idBytes);
    IndexReader reader = null;
    try {
        reader = index.getReader();
        // a single posting in any segment is enough
        for (AtomicReaderContext context : reader.leaves()) {
            final DocsEnum docs = context.reader().termDocsEnum(idTerm);
            if (docs != null && docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                return true;
            }
        }
        return false;
    } finally {
        index.releaseReader(reader);
    }
}
/**
 * Check whether a document URI found by Lucene matches one of the requested
 * documents or collections. Collection URIs should end with "/" so the
 * prefix test cannot match partial names.
 *
 * @param docUri the URI of the document found by Lucene
 * @param toBeMatchedUris list of document and collection URIs
 * @return true if the document URI equals, or lies under, one of the entries
 */
private boolean isDocumentMatch(String docUri, List<String> toBeMatchedUris){
    if (docUri == null) {
        LOG.error("docUri is null.");
        return false;
    }
    if (toBeMatchedUris == null) {
        LOG.error("match is null.");
        return false;
    }
    for (String candidate : toBeMatchedUris) {
        if (docUri.startsWith(candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Lucene Collector that decodes each hit's eXist document id and node id
 * from docvalues, attaches a {@link LuceneMatch} (with score) to the node,
 * and adds either the node itself or its ancestor from the context set to
 * the result set.
 */
private class LuceneHitCollector extends Collector {
    private Scorer scorer;
    private AtomicReader reader;
    private NumericDocValues docIdValues;
    private BinaryDocValues nodeIdValues;
    // scratch buffer backing the BytesRef used to decode node ids
    private final byte[] buf = new byte[1024];
    private final QName qname;
    private final DocumentSet docs;
    private final NodeSet contextSet;
    private final NodeSet resultSet;
    private final boolean returnAncestor;
    private final int contextId;
    private final Query query;
    private final XQueryWatchDog watchdog;
    private LuceneHitCollector(QName qname, Query query, DocumentSet docs, NodeSet contextSet, NodeSet resultSet, boolean returnAncestor,
        int contextId, XQueryWatchDog watchDog) {
        this.qname = qname;
        this.docs = docs;
        this.contextSet = contextSet;
        this.resultSet = resultSet;
        this.returnAncestor = returnAncestor;
        this.contextId = contextId;
        this.query = query;
        this.watchdog = watchDog;
    }
    @Override
    public void setScorer(Scorer scorer) throws IOException {
        this.scorer = scorer;
    }
    @Override
    public void setNextReader(AtomicReaderContext atomicReaderContext) throws IOException {
        // refresh per-segment docvalues readers
        this.reader = atomicReaderContext.reader();
        this.docIdValues = this.reader.getNumericDocValues(FIELD_DOC_ID);
        this.nodeIdValues = this.reader.getBinaryDocValues(LuceneUtil.FIELD_NODE_ID);
    }
    @Override
    public boolean acceptsDocsOutOfOrder() {
        return false;
    }
    @Override
    public void collect(int doc) {
        try {
            float score = scorer.score();
            int docId = (int) this.docIdValues.get(doc);
            // skip hits for documents outside the queried document set
            DocumentImpl storedDocument = docs.getDoc(docId);
            if (storedDocument == null)
                return;
            // decode node id: 2-byte unit count followed by the serialized id
            BytesRef ref = new BytesRef(buf);
            this.nodeIdValues.get(doc, ref);
            int units = ByteConversion.byteToShort(ref.bytes, ref.offset);
            NodeId nodeId = index.getBrokerPool().getNodeFactory().createFromData(units, ref.bytes, ref.offset + 2);
            NodeProxy storedNode = new NodeProxy(storedDocument, nodeId);
            if (qname != null)
                storedNode.setNodeType(qname.getNameType() == ElementValue.ATTRIBUTE ? Node.ATTRIBUTE_NODE : Node.ELEMENT_NODE);
            // if a context set is specified, we can directly check if the
            // matching node is a descendant of one of the nodes
            // in the context set.
            if (contextSet != null) {
                int sizeHint = contextSet.getSizeHint(storedDocument);
                if (returnAncestor) {
                    // return the ancestor from the context set, carrying the match
                    NodeProxy parentNode = contextSet.get(storedNode);
                    if (parentNode != null) {
                        LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
                        match.setScore(score);
                        parentNode.addMatch(match);
                        resultSet.add(parentNode, sizeHint);
                        if (Expression.NO_CONTEXT_ID != contextId) {
                            parentNode.deepCopyContext(storedNode, contextId);
                        } else
                            parentNode.copyContext(storedNode);
                    }
                } else {
                    LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
                    match.setScore(score);
                    storedNode.addMatch(match);
                    resultSet.add(storedNode, sizeHint);
                }
            } else {
                // no context: return the matching node itself
                LuceneMatch match = new LuceneMatch(contextId, nodeId, query);
                match.setScore(score);
                storedNode.addMatch(match);
                resultSet.add(storedNode);
            }
        } catch (IOException e) {
            // NOTE(review): swallowed after printing — consider logging instead
            e.printStackTrace();
        }
    }
}
/**
 * Check index configurations for all collections in the given DocumentSet
 * and return the list of QNames which have indexes defined on them.
 * A qname with a null local name or namespace acts as a wildcard and is
 * expanded against the index; null/empty input returns all indexed qnames.
 *
 * @return list of QName objects on which indexes are defined
 */
public List<QName> getDefinedIndexes(List<QName> qnames) {
    final List<QName> indexes = new ArrayList<>(20);
    if (qnames == null || qnames.isEmpty()) {
        return getDefinedIndexesFor(null, indexes);
    }
    for (QName qname : qnames) {
        final boolean isWildcard = qname.getLocalName() == null || qname.getNamespaceURI() == null;
        if (isWildcard) {
            getDefinedIndexesFor(qname, indexes);
        } else {
            indexes.add(qname);
        }
    }
    return indexes;
}
/**
 * Collect all indexed qnames from the index's field infos into
 * {@code indexes}, optionally filtered by a (possibly partial) qname.
 *
 * @param qname filter qname or null for all
 * @param indexes target list, also returned
 */
private List<QName> getDefinedIndexesFor(QName qname, List<QName> indexes) {
    IndexReader reader = null;
    try {
        reader = index.getReader();
        for (FieldInfo info : MultiFields.getMergedFieldInfos(reader)) {
            // skip the internal doc-id field; everything else encodes a qname
            if (!FIELD_DOC_ID.equals(info.name)) {
                QName name = LuceneUtil.decodeQName(info.name, index.getBrokerPool().getSymbols());
                if (name != null && (qname == null || matchQName(qname, name)))
                    indexes.add(name);
            }
        }
    } catch (IOException e) {
        // log through the db log instead of dumping to stderr
        // (previously: e.printStackTrace())
        LOG.warn("Error while reading defined indexes: " + e.getMessage(), e);
    } finally {
        index.releaseReader(reader);
    }
    return indexes;
}
/**
 * Test whether {@code candidate} matches the (possibly partial) filter
 * {@code qname}: a null local name or empty namespace acts as a wildcard.
 */
private static boolean matchQName(QName qname, QName candidate) {
    final String localName = qname.getLocalName();
    if (localName != null && !localName.equals(candidate.getLocalName())) {
        return false;
    }
    final String namespace = qname.getNamespaceURI();
    if (namespace != null && namespace.length() > 0
            && !namespace.equals(candidate.getNamespaceURI())) {
        return false;
    }
    return true;
}
/**
 * Return the analyzer to be used for the given field or qname. Either field
 * or qname should be specified. The first matching analyzer configured on
 * any collection of the document set wins; otherwise the index default.
 */
protected Analyzer getAnalyzer(String field, QName qname, DBBroker broker, DocumentSet docs) {
    final Iterator<Collection> collections = docs.getCollectionIterator();
    while (collections.hasNext()) {
        final IndexSpec idxConf = collections.next().getIndexConfiguration(broker);
        if (idxConf == null) {
            continue;
        }
        final LuceneConfig collectionConfig = (LuceneConfig) idxConf.getCustomIndexSpec(LuceneIndex.ID);
        if (collectionConfig == null) {
            continue;
        }
        final Analyzer configured = (field == null)
            ? collectionConfig.getAnalyzer(qname)
            : collectionConfig.getAnalyzer(field);
        if (configured != null) {
            return configured;
        }
    }
    return index.getDefaultAnalyzer();
}
/**
 * Return the query parser configured for the given field on any collection
 * of the document set, falling back to the classic Lucene query parser.
 */
protected QueryParserWrapper getQueryParser(String field, Analyzer analyzer, DocumentSet docs) {
    if (docs != null) {
        final Iterator<Collection> collections = docs.getCollectionIterator();
        while (collections.hasNext()) {
            final IndexSpec idxConf = collections.next().getIndexConfiguration(broker);
            if (idxConf == null) {
                continue;
            }
            final LuceneConfig collectionConfig = (LuceneConfig) idxConf.getCustomIndexSpec(LuceneIndex.ID);
            if (collectionConfig == null) {
                continue;
            }
            final QueryParserWrapper parser = collectionConfig.getQueryParser(field, analyzer);
            if (parser != null) {
                return parser;
            }
        }
    }
    // not found. return default query parser:
    return new ClassicQueryParserWrapper(field, analyzer);
}
/** Consistency checking is not implemented for the Lucene index; always false. */
public boolean checkIndex(DBBroker broker) {
    return false;
}
/**
 * Scan the index and return term occurrence statistics, restricted by the
 * optional hints: QNAMES_KEY (list of qnames), START_VALUE / END_VALUE
 * (term range as strings) and VALUE_COUNT (maximum number of terms).
 */
public Occurrences[] scanIndex(XQueryContext context, DocumentSet docs, NodeSet nodes, Map<?,?> hints) {
    List<QName> qnames = hints == null ? null : (List<QName>) hints.get(QNAMES_KEY);
    qnames = getDefinedIndexes(qnames);
    // hints are expected to hold StringValue / IntegerValue instances
    String start = null;
    String end = null;
    long max = Long.MAX_VALUE;
    if (hints != null) {
        final Object vstart = hints.get(START_VALUE);
        if (vstart != null) {
            start = vstart.toString();
        }
        final Object vend = hints.get(END_VALUE);
        if (vend != null) {
            end = vend.toString();
        }
        final IntegerValue vmax = (IntegerValue) hints.get(VALUE_COUNT);
        if (vmax != null) {
            max = vmax.getValue();
        }
    }
    return scanIndexByQName(qnames, docs, nodes, start, end, max);
}
/**
 * Scans the index fields encoding the given qnames and builds a sorted table
 * of per-term occurrence counts, restricted to the given document set and
 * (optionally) node set.
 *
 * @param qnames qnames whose encoded index fields are scanned
 * @param docs   only terms found in documents of this set are counted
 * @param nodes  if non-null, a term only counts when its node is in this set
 * @param start  prefix filter for terms (only applied when {@code end} is null)
 * @param end    upper bound (inclusive) for terms
 * @param max    maximum number of distinct terms to collect
 * @return the collected occurrences, ordered by term text
 */
private Occurrences[] scanIndexByQName(List<QName> qnames, DocumentSet docs, NodeSet nodes, String start, String end, long max) {
    // term text -> accumulated statistics, kept sorted by term
    TreeMap<String, Occurrences> map = new TreeMap<>();
    IndexReader reader = null;
    try {
        reader = index.getReader();
        for (QName qname : qnames) {
            // the Lucene field name encodes the qname via the global symbol table
            String field = LuceneUtil.encodeQName(qname, index.getBrokerPool().getSymbols());
            List<AtomicReaderContext> leaves = reader.leaves();
            // iterate per index segment
            for (AtomicReaderContext context : leaves) {
                // doc values map each Lucene doc to its eXist document id and node id
                NumericDocValues docIdValues = context.reader().getNumericDocValues(FIELD_DOC_ID);
                BinaryDocValues nodeIdValues = context.reader().getBinaryDocValues(LuceneUtil.FIELD_NODE_ID);
                Bits liveDocs = context.reader().getLiveDocs();
                Terms terms = context.reader().terms(field);
                if (terms == null)
                    continue;
                TermsEnum termsIter = terms.iterator(null);
                // position on the first term; skip the segment if the field is empty
                if (termsIter.next() == null) {
                    continue;
                }
                do {
                    // stop collecting once the requested number of terms is reached
                    if (map.size() >= max) {
                        break;
                    }
                    BytesRef ref = termsIter.term();
                    String term = ref.utf8ToString();
                    boolean include = true;
                    // NOTE(review): when an end bound is given, the start prefix is
                    // not checked at all — confirm this asymmetry is intended
                    if (end != null) {
                        if (term.compareTo(end) > 0)
                            include = false;
                    } else if (start != null && !term.startsWith(start))
                        include = false;
                    if (include) {
                        DocsEnum docsEnum = termsIter.docs(null, null);
                        while (docsEnum.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                            // skip documents deleted from this segment
                            if (liveDocs != null && !liveDocs.get(docsEnum.docID())) {
                                continue;
                            }
                            int docId = (int) docIdValues.get(docsEnum.docID());
                            DocumentImpl storedDocument = docs.getDoc(docId);
                            // document not in the requested set: ignore
                            if (storedDocument == null)
                                continue;
                            NodeId nodeId = null;
                            if (nodes != null) {
                                // decode the stored node id: 2 bytes of unit count
                                // followed by the serialized id (buf is a shared
                                // scratch buffer declared elsewhere in this class)
                                BytesRef nodeIdRef = new BytesRef(buf);
                                nodeIdValues.get(docsEnum.docID(), nodeIdRef);
                                int units = ByteConversion.byteToShort(nodeIdRef.bytes, nodeIdRef.offset);
                                nodeId = index.getBrokerPool().getNodeFactory().createFromData(units, nodeIdRef.bytes, nodeIdRef.offset + 2);
                            }
                            // nodeId stays null exactly when nodes == null, so the
                            // nodes.get(...) branch can never dereference a null set
                            if (nodeId == null || nodes.get(storedDocument, nodeId) != null) {
                                Occurrences oc = map.get(term);
                                if (oc == null) {
                                    oc = new Occurrences(term);
                                    map.put(term, oc);
                                }
                                oc.addDocument(storedDocument);
                                oc.addOccurrences(docsEnum.freq());
                            }
                        }
                    }
                } while(termsIter.next() != null);
            }
        }
    } catch (IOException e) {
        LOG.warn("Error while scanning lucene index entries: " + e.getMessage(), e);
    } finally {
        index.releaseReader(reader);
    }
    Occurrences[] occur = new Occurrences[map.size()];
    return map.values().toArray(occur);
}
/**
 * Queues the passed character sequence for indexing. One Lucene document is
 * created per XML node, using 2 fields to identify the node:
 *
 * <ul>
 * <li>docId: eXist-internal document id of the node, stored as string.</li>
 * <li>nodeId: the id of the node, stored in binary compressed form.</li>
 * </ul>
 *
 * The text is indexed into a field whose name encodes the qualified name of
 * the node. The qualified name is stored as a hex sequence pointing into the
 * global symbol table.
 *
 * @param nodeId  id of the node to index
 * @param qname   qualified name of the element or attribute
 * @param path    node path of the node
 * @param config  index configuration that matched this node
 * @param content the text content to index
 */
protected void indexText(NodeId nodeId, QName qname, NodePath path, LuceneIndexConfig config, CharSequence content) {
    nodesToWrite.add(new PendingDoc(nodeId, qname, path, content, config));
    cachedNodesSize += content.length();
    // flush to the index once the in-memory cache exceeds the threshold
    if (cachedNodesSize > maxCachedNodesSize) {
        write();
    }
}
private class PendingDoc {
NodeId nodeId;
CharSequence text;
QName qname;
LuceneIndexConfig idxConf;
private PendingDoc(NodeId nodeId, QName qname, NodePath path, CharSequence text, LuceneIndexConfig idxConf) {
this.nodeId = nodeId;
this.qname = qname;
this.text = text;
this.idxConf = idxConf;
}
}
/**
 * Flushes all pending node documents to the Lucene index and resets the
 * in-memory cache. For each pending node a Lucene document is built carrying
 * the document id and node id (both as doc values, the doc id additionally
 * indexed), the analyzed text content, any streamed metadata fields and the
 * corresponding facet category paths.
 */
private void write() {
    if (nodesToWrite == null || nodesToWrite.isEmpty())
        return;
    IndexWriter writer = null;
    try {
        writer = index.getWriter();
        // docId and nodeId are stored as doc value
        // (the field instances are reused across all pending documents)
        NumericDocValuesField fDocId = new NumericDocValuesField(FIELD_DOC_ID, 0);
        BinaryDocValuesField fNodeId = new BinaryDocValuesField(LuceneUtil.FIELD_NODE_ID, new BytesRef(8));
        // docId also needs to be indexed
        IntField fDocIdIdx = new IntField(FIELD_DOC_ID, 0, IntField.TYPE_NOT_STORED);
        // collect document metadata once up front: every String-valued entry
        // becomes an extra analyzed field plus a facet category path that is
        // attached to each pending document below
        final List<Field> metas = new ArrayList<>();
        final List<CategoryPath> paths = new ArrayList<>();
        broker.getIndexController().streamMetas(new MetaStreamListener() {
            @Override
            public void metadata(QName key, Object value) {
                if (value instanceof String) {
                    String name = key.getLocalName();//LuceneUtil.encodeQName(key, index.getBrokerPool().getSymbols());
                    Field fld = new Field(name, value.toString(), Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
                    metas.add(fld);
                    //System.out.println(" "+name+" = "+value.toString());
                    paths.add(new CategoryPath(name, value.toString()));
                }
            }
        });
        TaxonomyWriter taxoWriter = index.getTaxonomyWriter();
        FacetFields facetFields = new FacetFields(taxoWriter);
        for (PendingDoc pending : nodesToWrite) {
            final Document doc = new Document();
            fDocId.setLongValue(currentDoc.getDocId());
            doc.add(fDocId);
            // store the node id
            // layout: 2 bytes of unit count followed by the serialized node id
            int nodeIdLen = pending.nodeId.size();
            byte[] data = new byte[nodeIdLen + 2];
            ByteConversion.shortToByte((short) pending.nodeId.units(), data, 0);
            pending.nodeId.serialize(data, 2);
            fNodeId.setBytesValue(data);
            doc.add(fNodeId);
            // add separate index for node id
            BinaryTokenStream bts = new BinaryTokenStream(new BytesRef(data));
            Field fNodeIdIdx = new Field(LuceneUtil.FIELD_NODE_ID, bts, TYPE_NODE_ID);
            doc.add(fNodeIdIdx);
            String contentField;
            // the text content is indexed in a field using either
            // the qname of the element or attribute or the field
            // name defined in the configuration
            if (pending.idxConf.isNamed())
                contentField = pending.idxConf.getName();
            else
                contentField = LuceneUtil.encodeQName(pending.qname, index.getBrokerPool().getSymbols());
            Field fld = new Field(contentField, pending.text.toString(), Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES);
            // a per-node boost takes precedence over the global config boost
            if (pending.idxConf.getBoost() > 0)
                fld.setBoost(pending.idxConf.getBoost());
            else if (config.getBoost() > 0)
                fld.setBoost(config.getBoost());
            doc.add(fld);
            fDocIdIdx.setIntValue(currentDoc.getDocId());
            doc.add(fDocIdIdx);
            for (Field meta : metas) {
                doc.add(meta);
            }
            if (!paths.isEmpty()) {
                facetFields.addFields(doc, paths);
            }
            // index with the analyzer configured for this node, if any
            if (pending.idxConf.getAnalyzer() == null)
                writer.addDocument(doc);
            else {
                writer.addDocument(doc, pending.idxConf.getAnalyzer());
            }
        }
    } catch (IOException e) {
        LOG.warn("An exception was caught while indexing document: " + e.getMessage(), e);
    } finally {
        index.releaseWriter(writer);
        // reset the pending-node cache whether or not the flush succeeded
        nodesToWrite = new ArrayList<>();
        cachedNodesSize = 0;
    }
}
/**
 * Merges all Lucene index segments into a single one ("optimize"). This can
 * take considerable time, and write operations are blocked while the merge
 * runs.
 */
public void optimize() {
    IndexWriter mergeWriter = null;
    try {
        mergeWriter = index.getWriter(true);
        // blocking forceMerge: wait until the merge has fully completed
        mergeWriter.forceMerge(1, true);
        mergeWriter.commit();
    } catch (IOException e) {
        LOG.warn("An exception was caught while optimizing the lucene index: " + e.getMessage(), e);
    } finally {
        // always hand the writer back, even after a failure
        index.releaseWriter(mergeWriter);
    }
}
/**
 * Stream listener driving the Lucene index while eXist streams a document's
 * nodes. On STORE, matching configuration entries push a text extractor onto
 * a stack in {@code startElement}, which is popped and flushed to the index
 * in {@code endElement}. On REMOVE_SOME_NODES, node ids are merely recorded
 * for later removal.
 */
private class LuceneStreamListener extends AbstractStreamListener {
    @Override
    public void startElement(Txn transaction, ElementImpl element, NodePath path) {
        if (mode == STORE && config != null) {
            // notify extractors already on the stack of the nested element
            if (contentStack != null && !contentStack.isEmpty()) {
                for (TextExtractor extractor : contentStack) {
                    extractor.startElement(element.getQName());
                }
            }
            Iterator<LuceneIndexConfig> configIter = config.getConfig(path);
            if (configIter != null) {
                if (contentStack == null) contentStack = new Stack<>();
                while (configIter.hasNext()) {
                    LuceneIndexConfig configuration = configIter.next();
                    // one extractor per configuration entry matching this path
                    if (configuration.match(path)) {
                        TextExtractor extractor = new DefaultTextExtractor();
                        extractor.configure(config, configuration);
                        contentStack.push(extractor);
                    }
                }
            }
        }
        super.startElement(transaction, element, path);
    }
    @Override
    public void endElement(Txn transaction, ElementImpl element, NodePath path) {
        if (config != null) {
            if (mode == STORE && contentStack != null && !contentStack.isEmpty()) {
                for (TextExtractor extractor : contentStack) {
                    extractor.endElement(element.getQName());
                }
            }
            Iterator<LuceneIndexConfig> configIter = config.getConfig(path);
            if (mode != REMOVE_ALL_NODES && configIter != null) {
                if (mode == REMOVE_SOME_NODES) {
                    nodesToRemove.add(element.getNodeId());
                } else {
                    while (configIter.hasNext()) {
                        LuceneIndexConfig configuration = configIter.next();
                        if (configuration.match(path)) {
                            // pop the extractor pushed in startElement for this
                            // configuration and index its accumulated text.
                            // NOTE(review): assumes startElement pushed one
                            // extractor per matching configuration — confirm
                            // the stack cannot be null/empty here
                            TextExtractor extractor = contentStack.pop();
                            indexText(element.getNodeId(), element.getQName(),
                                path, extractor.getIndexConfig(), extractor.getText());
                        }
                    }
                }
            }
        }
        super.endElement(transaction, element, path);
    }
    @Override
    public void attribute(Txn transaction, AttrImpl attrib, NodePath path) {
        // temporarily extend the path with the attribute's qname for matching
        path.addComponent(attrib.getQName());
        Iterator<LuceneIndexConfig> configIter = null;
        if (config != null)
            configIter = config.getConfig(path);
        if (mode != REMOVE_ALL_NODES && configIter != null) {
            if (mode == REMOVE_SOME_NODES) {
                nodesToRemove.add(attrib.getNodeId());
            } else {
                while (configIter.hasNext()) {
                    LuceneIndexConfig configuration = configIter.next();
                    if (configuration.match(path)) {
                        // attributes are indexed directly: their value is
                        // already complete at this point
                        indexText(attrib.getNodeId(), attrib.getQName(), path,
                            configuration, attrib.getValue());
                    }
                }
            }
        }
        path.removeLastComponent();
        super.attribute(transaction, attrib, path);
    }
    @Override
    public void characters(Txn transaction, CharacterDataImpl text, NodePath path) {
        // feed the text to every extractor currently collecting content
        if (contentStack != null && !contentStack.isEmpty()) {
            for (TextExtractor extractor : contentStack) {
                extractor.beforeCharacters();
                extractor.characters(text.getXMLString());
            }
        }
        super.characters(transaction, text, path);
    }
    @Override
    public IndexWorker getWorker() {
        return LuceneIndexWorker.this;
    }
}
/**
 * Match class containing the score of a match and a reference to
 * the query that generated it.
 */
public class LuceneMatch extends Match {

    /** Lucene relevance score assigned to this match. */
    private float score = 0.0f;
    /** The query that produced this match; compared by identity in equals(). */
    private final Query query;

    public LuceneMatch(int contextId, NodeId nodeId, Query query) {
        super(contextId, nodeId, null);
        this.query = query;
    }

    public LuceneMatch(LuceneMatch copy) {
        super(copy);
        this.score = copy.score;
        this.query = copy.query;
    }

    /**
     * Term-based factory is not supported for Lucene matches; use
     * {@link #createInstance(int, NodeId, Query)} instead.
     *
     * @return always null
     */
    @Override
    public Match createInstance(int contextId, NodeId nodeId, String matchTerm) {
        return null;
    }

    public Match createInstance(int contextId, NodeId nodeId, Query query) {
        return new LuceneMatch(contextId, nodeId, query);
    }

    @Override
    public Match newCopy() {
        return new LuceneMatch(this);
    }

    @Override
    public String getIndexId() {
        return LuceneIndex.ID;
    }

    public Query getQuery() {
        return query;
    }

    public float getScore() {
        return score;
    }

    protected void setScore(float score) {
        this.score = score;
    }

    @Override
    public boolean equals(Object other) {
        if (!(other instanceof LuceneMatch)) {
            return false;
        }
        final LuceneMatch o = (LuceneMatch) other;
        // guard against NPE when this match carries no node id;
        // the query is deliberately compared by reference
        return (nodeId == o.nodeId || (nodeId != null && nodeId.equals(o.nodeId)))
                && query == o.query;
    }

    /**
     * Consistent with {@link #equals(Object)}: combines the node id's hash
     * with the identity hash of the query, since equals() compares the query
     * by reference. Previously missing, which broke the equals/hashCode
     * contract for hash-based collections.
     */
    @Override
    public int hashCode() {
        final int idHash = nodeId == null ? 0 : nodeId.hashCode();
        return 31 * idHash + System.identityHashCode(query);
    }

    @Override
    public boolean matchEquals(Match other) {
        return equals(other);
    }
}
}