/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* $Id: LuceneIndexer.java 586647 2007-10-20 00:32:43Z natalia $
*/
package org.apache.xindice.core.indexer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xindice.core.Collection;
import org.apache.xindice.core.DBObject;
import org.apache.xindice.core.DBException;
import org.apache.xindice.core.data.Key;
import org.apache.xindice.core.FaultCodes;
import org.apache.xindice.core.query.CompilationException;
import org.apache.xindice.core.query.ProcessingException;
import org.apache.xindice.util.Configuration;
import org.apache.xindice.util.XindiceException;
import org.apache.xindice.util.StringUtilities;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Query;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.ArrayList;
/**
* LuceneIndexer is used for maintaining full text indexes. It operates on
* documents instead of elements and allows searching for documents using a
* native Lucene query. There can be only one LuceneIndexer per collection,
* however, it may have more than one IndexPattern.<p>
*
* Every IndexPattern corresponds to a Lucene document field. For every Xindice
* document, the values of all matching elements will be indexed in a single
* Lucene document, allowing searches across the patterns.</p><p>
*
* Sample LuceneIndexer configuration:
* <pre>
* <index name='fulltext' class='org.apache.xindice.core.indexer.LuceneIndexer'
* analyzer='org.apache.lucene.analysis.SimpleAnalyzer'>
* <pattern pattern='meta@title' alias='title'/>
* <pattern pattern='description' alias='text'/>
* </index></pre></p><p>
*
* To search over this sample index, one could issue a query <code>"title:tutorial
* AND text:xml"</code>.</p><p>
*
* For more details about LuceneIndexer configuration please see documentation for
* {@link #setConfig(org.apache.xindice.util.Configuration)}
* </p>
*
* @author Andy Armstrong
* @version $Revision: 586647 $, $Date: 2007-10-19 20:32:43 -0400 (Fri, 19 Oct 2007) $
*/
public final class LuceneIndexer implements Indexer, DBObject {

    private static final Log log = LogFactory.getLog(LuceneIndexer.class);

    // Configuration attribute and element names.
    private static final String NAME = "name";
    private static final String PATTERN = "pattern";
    private static final String DEFAULT = "default";
    private static final String ANALYZER = "analyzer";
    private static final String PATTERN_STRING = "pattern";
    private static final String PATTERN_ALIAS = "alias";

    /** Name of the Lucene field that stores the Xindice document key. */
    public static final String KEYNAME = "key";

    // Default analyzer to use when the configuration does not name one.
    public static final String DEFANALYZER = "org.apache.lucene.analysis.SimpleAnalyzer";

    private static final IndexMatch[] EMPTY_MATCHES = new IndexMatch[0];

    // Directory holding the Lucene index files; set once in setConfig().
    private File idxFile;

    // Shared index writer; non-null exactly while the index is open.
    private IndexWriter iw;

    // Analyzer used both for indexing and for parsing text queries.
    private Analyzer an;

    /**
     * Most recently opened searcher. The same Searcher instance is going to
     * be used for all the searches unless index has changed and new Searcher
     * is required to access the changes.
     *
     * Searcher cannot be closed if it is being used (if there is a query in
     * progress or hits are iterated).
     */
    private Searcher searcher;

    private Configuration config;
    private Collection collection;
    private String name;

    // Maps IndexPattern -> field alias (String). Populated in setConfig()
    // and read-only afterwards, so no synchronization is needed for reads.
    private HashMap patterns = new HashMap();

    // Keep a count of changes to the index; both counters are guarded by lock.
    private int docsAdded;
    private int docsDeleted;
    private final Object lock = new Object();

    // Field searched when a query term carries no explicit field prefix.
    // Empty string means "no default field".
    private String defaultField = "";

    private void setFile(File f) {
        idxFile = f;
    }

    /**
     * Returns the index directory.
     *
     * @throws IllegalStateException if setConfig() has not bound this
     *         indexer to a location yet
     */
    private File getFile() {
        if (null == idxFile) {
            throw new IllegalStateException("Not bound to a file");
        }
        return idxFile;
    }

    public String getIndexStyle() {
        return STYLE_FULLTEXT;
    }

    /**
     * Returns this Indexer's patterns. LuceneIndexer may have more than one
     * pattern.
     * @return Indexer's patterns
     */
    public IndexPattern[] getPatterns() {
        return (IndexPattern[]) patterns.keySet().toArray(new IndexPattern[0]);
    }

    /**
     * Return alias for the given pattern. If this exact pattern is not indexed,
     * method will look for matching indexed pattern.
     * @param pattern IndexPattern
     * @return Alias for the closest matching pattern or null, if there is none
     */
    public String getPatternAlias(IndexPattern pattern) {
        if (patterns.containsKey(pattern)) {
            return (String) patterns.get(pattern);
        }

        // No exact match: pick the indexed pattern with the highest match
        // level. matchPattern stays null (and the lookup returns null) when
        // nothing matches at all.
        int match = 0;
        IndexPattern matchPattern = null;
        for (Iterator i = patterns.keySet().iterator(); i.hasNext(); ) {
            IndexPattern p = (IndexPattern) i.next();
            int cMatch = pattern.getMatchLevel(p);
            if (cMatch > match) {
                match = cMatch;
                matchPattern = p;
            }
        }
        return (String) patterns.get(matchPattern);
    }

    /**
     * Configures LuceneIndexer instance.
     * <dl>
     * <dt>index
     * <dd>Top Indexer configuration element. Can have one or more pattern
     * child elements. Its attributes:
     *
     * <ul><li>name - Indexer name. Required.
     * <li>class - Indexer class. Required.
     * org.apache.xindice.core.indexer.LuceneIndexer for full text index.
     * <li>analyzer - Analyzer to use for indexing. Optional,
     * org.apache.lucene.analysis.SimpleAnalyzer by default.</ul>
     *
     * <dl><dt>pattern
     * <dd>Child element. Indexer must have at least one pattern. Its
     * attributes:
     * <ul><li>pattern - IndexPattern. For acceptable formats, see
     * {@link org.apache.xindice.core.indexer.Indexer#getPatterns()}
     * <li>alias - Name of the field to store/search values for that pattern.
     * </dl>
     * <dl><dt>default
     * <dd>Child element. Optional. Its attributes:
     * <li>alias - Indicates the pattern alias that will be used as
     * the default field for search. If omitted, search query has to include
     * field name for all terms, there will be no default.
     * </ul></dl>
     * </dl>
     *
     * @param config Configuration to apply
     * @throws XindiceException Configuration does not have required information,
     *         Analyzer could not have been instantiated.
     */
    public void setConfig(Configuration config) throws XindiceException {
        this.config = config;
        try {
            name = config.getAttribute(NAME);

            // Instantiate the configured analyzer (or the default one) by
            // reflection; it must have a public no-arg constructor.
            String analyzer = config.getAttribute(ANALYZER);
            String anc = StringUtilities.isBlank(analyzer) ? DEFANALYZER : analyzer;
            Class c = Class.forName(anc);
            an = (Analyzer) c.newInstance();

            Configuration[] patterns = config.getChildren(PATTERN);
            if (patterns.length == 0) {
                throw new CannotCreateException("Configuration must have at least one pattern");
            }
            for (int i = 0; i < patterns.length; i++) {
                String name = patterns[i].getAttribute(PATTERN_STRING);
                String alias = patterns[i].getAttribute(PATTERN_ALIAS);
                this.patterns.put(new IndexPattern(collection.getSymbols(), name, null), alias);
            }

            // The optional <default> element selects which alias is searched
            // for query terms without an explicit field name.
            Configuration[] defaults = config.getChildren(DEFAULT);
            if (defaults.length > 1) {
                throw new CannotCreateException("There may be only one default field");
            } else if (defaults.length == 1) {
                String alias = defaults[0].getAttribute(PATTERN_ALIAS);
                if (this.patterns.values().contains(alias)) {
                    defaultField = alias;
                } else {
                    throw new CannotCreateException("Alias '" + alias + "' is undefined in configuration");
                }
            }

            setFile(new File(collection.getCollectionRoot(), name));
        } catch (Exception e) {
            throw new XindiceException(e);
        }
    }

    public Configuration getConfig() {
        return config;
    }

    public boolean exists() {
        // NOTE(review): uses idxFile directly, so this returns false-ish
        // behavior (indexExists on null) before setConfig() — confirm callers
        // always configure first.
        return IndexReader.indexExists(idxFile);
    }

    /**
     * Creates necessary resources.
     *
     * @return true, if successful
     * @throws DBException There was a low-level IOException that prevented
     *         index from creating resources.
     * @throws DuplicateIndexException Parent collection already has full text index
     */
    public synchronized boolean create() throws DBException {
        if (luceneIndexerFound()) {
            throw new DuplicateIndexException("Collection can only have one full text index.");
        }
        openWrite(true);
        return true;
    }

    /**
     * Checks whether the parent collection already has a LuceneIndexer
     * registered (only one full text index per collection is allowed).
     */
    private boolean luceneIndexerFound() throws DBException {
        String indexers[] = collection.getIndexManager().list();
        for (int i = 0; i < indexers.length; i++) {
            Indexer indexer = collection.getIndexer(indexers[i]);
            if (indexer instanceof LuceneIndexer) {
                return true;
            }
        }
        return false;
    }

    public boolean open() throws DBException {
        openWrite(false);
        return true;
    }

    public boolean isOpened() {
        return null != iw;
    }

    public synchronized boolean close() throws DBException {
        closeWrite();
        if (searcher != null) {
            // Force-close the cached searcher even if searches are in
            // progress, and drop the reference so a later open() +
            // getSearcher() creates a fresh Searcher instead of touching
            // a closed IndexReader.
            searcher.close(true);
            searcher = null;
        }
        return true;
    }

    public boolean drop() throws DBException {
        try {
            if (IndexReader.indexExists(idxFile)) {
                close();
                return deepDelete(getFile());
            } else {
                return false;
            }
        } catch (IOException e) {
            throw new DBException(FaultCodes.IDX_CORRUPTED,
                    "Failed to delete index " + name + ", collection " + collection.getCanonicalName(), e);
        }
    }

    public String getName() {
        return name;
    }

    public void setCollection(Collection collection) {
        this.collection = collection;
    }

    public Analyzer getAnalyzer() {
        return an;
    }

    /**
     * Opens the shared IndexWriter if it is not open yet.
     *
     * @param create true to create a new (empty) index, false to open an
     *        existing one
     * @throws DBException wrapping the underlying IOException
     */
    private void openWrite(boolean create) throws DBException {
        if (log.isTraceEnabled()) {
            log.trace("Calling openWrite(" + create + ")");
        }
        try {
            if (iw == null) {
                iw = new IndexWriter(getFile(), getAnalyzer(), create);
            }
        } catch (IOException e) {
            if (create) {
                throw new DBException(FaultCodes.IDX_CANNOT_CREATE,
                        "Failed to create index " + name + ", collection " + collection.getCanonicalName(), e);
            } else {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to open index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }

    /**
     * @throws IllegalStateException if the index writer is not open
     */
    private void assertOpen() {
        if (!isOpened()) {
            throw new IllegalStateException("Index has not been opened");
        }
    }

    private void closeWrite() throws DBException {
        if (null != iw) {
            try {
                iw.close();
                iw = null;
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to close writer for index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }

    /**
     * Recursively deletes a file or directory tree.
     *
     * @return true if everything was deleted, false on the first failure
     */
    private boolean deepDelete(File f) throws IOException {
        if (f.isDirectory()) {
            File fl[] = f.listFiles();
            for (int i = 0; i < fl.length; i++) {
                if (!deepDelete(fl[i])) {
                    return false;
                }
            }
        }
        return f.delete();
    }

    /**
     * Flushes buffered changes to disk and occasionally optimizes the index.
     *
     * @throws DBException if the flush or optimize failed with an IOException
     * @throws IllegalStateException if the index is not open
     */
    public void flush() throws DBException {
        try {
            assertOpen();
            if (iw != null) {
                iw.flush();
                int nDocs = iw.docCount();
                /* Fairly arbitrary rules for triggering index optimisation. Need to
                 * play with these.
                 */
                synchronized (lock) {
                    if (docsAdded > nDocs / 10 || docsAdded > 50 || docsDeleted > 10) {
                        if (log.isDebugEnabled()) {
                            log.debug("Optimizing text index for " + collection.getCanonicalName() + "...");
                        }
                        iw.optimize();
                        docsAdded = 0;
                        docsDeleted = 0;
                    }
                }
            }
        } catch (IOException e) {
            throw new DBException(FaultCodes.IDX_CORRUPTED,
                    "Could not force unwritten data to disk for index " + name + ", collection " + collection.getCanonicalName(), e);
        }
    }

    /**
     * Creates new instance of a handler to listen to indexer events. For
     * every document that being added there will be a separate handler
     * that will assemble all relevant values in a single Lucene document.
     *
     * @return new instance of IndexerEventHandler
     */
    public IndexerEventHandler getIndexerEventHandler() {
        return new BasicIndexerEventHandler() {
            // Accumulates field values for one Xindice document; lazily
            // created on the first matching value, written out in
            // onDocumentAdded().
            Document doc;

            public void onDocumentAdded(Key key) throws DBException {
                // doc is null when no pattern matched anything in the
                // document — nothing to index in that case.
                if (doc != null) {
                    assertOpen();
                    try {
                        iw.addDocument(doc);
                        synchronized (lock) {
                            docsAdded++;
                        }
                    } catch (IOException e) {
                        throw new DBException(FaultCodes.IDX_CORRUPTED,
                                "Failed to add document to the index " + name + ", collection " + collection.getCanonicalName(), e);
                    }
                }
            }

            public void onDocumentDeleted(Key key) throws DBException {
                assertOpen();
                try {
                    // The key field is stored untokenized, so an exact Term
                    // match identifies the Lucene document for this key.
                    iw.deleteDocuments(new Term(KEYNAME, key.toString()));
                    synchronized (lock) {
                        docsDeleted++;
                    }
                } catch (IOException e) {
                    throw new DBException(FaultCodes.IDX_CORRUPTED,
                            "Failed to delete document from the index " + name + ", collection " + collection.getCanonicalName(), e);
                }
            }

            public void onValueAdded(IndexPattern pattern, String value, Key key, int pos, int len, short elemID, short attrID) {
                if (doc == null) {
                    doc = new Document();
                    // Store the key so search hits can be mapped back to
                    // Xindice documents; untokenized for exact deletes.
                    doc.add(new Field(KEYNAME, key.toString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
                }
                String field = (String) patterns.get(pattern);
                doc.add(new Field(field, value, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
            }
        };
    }

    public IndexMatch[] queryMatches(final IndexQuery query) throws DBException {
        // this indexer only supports text queries
        if (query.getOperator() != IndexQuery.TQ) {
            return null;
        }

        String textQuery = query.getValue(0).toString();
        try {
            // Parse with the same analyzer that was used for indexing.
            return queryMatches(new QueryParser(defaultField, getAnalyzer()).parse(textQuery));
        } catch (ParseException e) {
            throw new CompilationException("Failed to parse query '" + textQuery + "'", e);
        }
    }

    /**
     * Same as {@link Indexer#queryMatches(IndexQuery)}, but accepts compiled Lucene query as
     * parameter.
     *
     * @param query Compiled Lucene query.
     * @return The resulting matches
     * @throws DBException if IOException prevented indexer from executing the query.
     */
    public IndexMatch[] queryMatches(Query query) throws DBException {
        ArrayList matches = new ArrayList();
        Searcher searcher = getSearcher();
        try {
            Hits hits = searcher.search(query);
            for (Iterator i = hits.iterator(); i.hasNext(); ) {
                Hit hit = (Hit) i.next();
                Key key = new Key(hit.getDocument().getField(KEYNAME).stringValue());
                // Position/length are not tracked by the full text index.
                matches.add(new IndexMatch(key, -1, -1));
            }
        } catch (IOException e) {
            throw new ProcessingException("Failed to process a query", e);
        } finally {
            // Always release the reference taken by getSearcher().
            searcher.free();
        }
        return (IndexMatch[]) matches.toArray(EMPTY_MATCHES);
    }

    /**
     * getSearcher returns Searcher that uses current version of the index.
     * If index has been modified since last time searcher was requested
     * this method will create new Searcher instance, otherwise it will
     * return Searcher instance it created previously.
     *
     * @return current Searcher
     * @throws DBException
     */
    private synchronized Searcher getSearcher() throws DBException {
        if (searcher != null && !searcher.isCurrent()) {
            // Stale searcher: close it unless searches still reference it
            // (close(false) checks the ref count), then forget it.
            searcher.close(false);
            searcher = null;
        }
        if (searcher == null) {
            // New Searcher starts with ref == 1 for this caller.
            searcher = new Searcher();
        } else {
            searcher.incRef();
        }
        return searcher;
    }

    /**
     * Reference-counted wrapper around an IndexReader/IndexSearcher pair.
     * Instances are handed out by {@link #getSearcher()} and released via
     * {@link #free()}.
     */
    private class Searcher {

        private IndexReader ir;
        private IndexSearcher is;

        // number of searches in progress using that searcher;
        // starts at 1 for the caller that constructed it
        private int ref = 1;

        public Searcher() throws DBException {
            try {
                ir = IndexReader.open(getFile());
                is = new IndexSearcher(ir);
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to open index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }

        /**
         * @return true if the underlying reader still reflects the latest
         *         version of the index on disk
         */
        public boolean isCurrent() throws DBException {
            try {
                return ir.isCurrent();
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to access index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }

        public void incRef() {
            ref++;
        }

        /**
         * This method must be called after executing text query to cleanup
         * resources that are not in use anymore. It decrements number of
         * searches referencing this searcher and then attempts to close it
         * unless it is the most recently opened searcher. If there were no
         * searchers opened after this one, the searcher will be kept open
         * for future use, even if it is not used at the moment.
         *
         * @throws DBException if there was IOException
         */
        public void free() throws DBException {
            // Synchronized on the outer indexer so the ref count and the
            // identity check against the cached searcher are consistent
            // with getSearcher()/close().
            synchronized (LuceneIndexer.this) {
                ref--;
                if (searcher != this) {
                    close(false);
                }
            }
        }

        /**
         * Closes the searcher if it is not used in any search.
         *
         * @param force true if searcher has to be closed even if it is used
         * @throws DBException if there was IOException
         */
        public void close(boolean force) throws DBException {
            try {
                if (ref == 0 || force) {
                    is.close();
                    ir.close();
                }
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to access index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }

        public Hits search(Query query) throws DBException {
            try {
                return is.search(query);
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to access index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }
}