/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* $Id: LuceneIndexer.java 586647 2007-10-20 00:32:43Z natalia $
*/
package org.apache.xindice.core.indexer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xindice.core.Collection;
import org.apache.xindice.core.DBObject;
import org.apache.xindice.core.DBException;
import org.apache.xindice.core.data.Key;
import org.apache.xindice.core.FaultCodes;
import org.apache.xindice.core.query.CompilationException;
import org.apache.xindice.core.query.ProcessingException;
import org.apache.xindice.util.Configuration;
import org.apache.xindice.util.XindiceException;
import org.apache.xindice.util.StringUtilities;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Query;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.ArrayList;
/**
* LuceneIndexer is used for maintaining full text indexes. It operates on
* documents instead of elements and allows searching for documents using a
* native Lucene query. There can be only one LuceneIndexer per collection,
* however, it may have more than one IndexPattern.<p>
*
* Every IndexPattern corresponds to a Lucene document field. For every Xindice
* document, the values of all matching elements will be indexed in a single
* Lucene document, allowing searches across the patterns.</p><p>
*
* Sample LuceneIndexer configuration:
* <pre>
* <index name='fulltext' class='org.apache.xindice.core.indexer.LuceneIndexer'
* analyzer='org.apache.lucene.analysis.SimpleAnalyzer'>
* <pattern pattern='meta@title' alias='title'/>
* <pattern pattern='description' alias='text'/>
* </index></pre></p><p>
*
* To search over this sample index, one could issue a query <code>"title:tutorial
* AND text:xml"</code>.</p><p>
*
* For more details about LuceneIndexer configuration please see documentation for
* {@link #setConfig(org.apache.xindice.util.Configuration)}
* </p>
*
* @author Andy Armstrong
* @version $Revision: 586647 $, $Date: 2007-10-19 20:32:43 -0400 (Fri, 19 Oct 2007) $
*/
public final class LuceneIndexer implements Indexer, DBObject {

    private static final Log log = LogFactory.getLog(LuceneIndexer.class);

    // Configuration attribute and element names.
    private static final String NAME = "name";
    private static final String PATTERN = "pattern";
    private static final String DEFAULT = "default";
    private static final String ANALYZER = "analyzer";
    private static final String PATTERN_STRING = "pattern";
    private static final String PATTERN_ALIAS = "alias";

    /** Name of the Lucene field that stores the Xindice document key. */
    public static final String KEYNAME = "key";

    // Default analyzer to use when the configuration does not name one.
    public static final String DEFANALYZER = "org.apache.lucene.analysis.SimpleAnalyzer";

    private static final IndexMatch[] EMPTY_MATCHES = new IndexMatch[0];

    // Directory holding the Lucene index files; set once in setConfig().
    private File idxFile;

    // Shared index writer; non-null exactly while the index is open.
    private IndexWriter iw;

    // Analyzer used both for indexing and for parsing text queries.
    private Analyzer an;

    /**
     * Most recently opened searcher. The same Searcher instance is going to
     * be used for all the searches unless index has changed and new Searcher
     * is required to access the changes.
     *
     * Searcher cannot be closed if it is being used (if there is a query in
     * progress or hits are iterated).
     */
    private Searcher searcher;

    private Configuration config;
    private Collection collection;
    private String name;

    // Maps IndexPattern -> field alias (String). Populated in setConfig()
    // and read-only afterwards, so no synchronization is needed for reads.
    private HashMap patterns = new HashMap();

    // Keep a count of changes to the index; both counters are guarded by lock.
    private int docsAdded;
    private int docsDeleted;
    private final Object lock = new Object();

    // Field searched when a query term carries no explicit field prefix.
    // Empty string means "no default field".
    private String defaultField = "";

    private void setFile(File f) {
        idxFile = f;
    }

    /**
     * Returns the index directory.
     *
     * @throws IllegalStateException if setConfig() has not bound this
     *         indexer to a location yet
     */
    private File getFile() {
        if (null == idxFile) {
            throw new IllegalStateException("Not bound to a file");
        }
        return idxFile;
    }

    public String getIndexStyle() {
        return STYLE_FULLTEXT;
    }

    /**
     * Returns this Indexer's patterns. LuceneIndexer may have more than one
     * pattern.
     * @return Indexer's patterns
     */
    public IndexPattern[] getPatterns() {
        return (IndexPattern[]) patterns.keySet().toArray(new IndexPattern[0]);
    }

    /**
     * Return alias for the given pattern. If this exact pattern is not indexed,
     * method will look for matching indexed pattern.
     * @param pattern IndexPattern
     * @return Alias for the closest matching pattern or null, if there is none
     */
    public String getPatternAlias(IndexPattern pattern) {
        if (patterns.containsKey(pattern)) {
            return (String) patterns.get(pattern);
        }

        // No exact match: pick the indexed pattern with the highest match
        // level. matchPattern stays null (and the lookup returns null) when
        // nothing matches at all.
        int match = 0;
        IndexPattern matchPattern = null;
        for (Iterator i = patterns.keySet().iterator(); i.hasNext(); ) {
            IndexPattern p = (IndexPattern) i.next();
            int cMatch = pattern.getMatchLevel(p);
            if (cMatch > match) {
                match = cMatch;
                matchPattern = p;
            }
        }
        return (String) patterns.get(matchPattern);
    }

    /**
     * Configures LuceneIndexer instance.
     * <dl>
     * <dt>index
     * <dd>Top Indexer configuration element. Can have one or more pattern
     * child elements. Its attributes:
     *
     * <ul><li>name - Indexer name. Required.
     * <li>class - Indexer class. Required.
     * org.apache.xindice.core.indexer.LuceneIndexer for full text index.
     * <li>analyzer - Analyzer to use for indexing. Optional,
     * org.apache.lucene.analysis.SimpleAnalyzer by default.</ul>
     *
     * <dl><dt>pattern
     * <dd>Child element. Indexer must have at least one pattern. Its
     * attributes:
     * <ul><li>pattern - IndexPattern. For acceptable formats, see
     * {@link org.apache.xindice.core.indexer.Indexer#getPatterns()}
     * <li>alias - Name of the field to store/search values for that pattern.
     * </dl>
     * <dl><dt>default
     * <dd>Child element. Optional. Its attributes:
     * <li>alias - Indicates the pattern alias that will be used as
     * the default field for search. If omitted, search query has to include
     * field name for all terms, there will be no default.
     * </ul></dl>
     * </dl>
     *
     * @param config Configuration to apply
     * @throws XindiceException Configuration does not have required information,
     *         Analyzer could not have been instantiated.
     */
    public void setConfig(Configuration config) throws XindiceException {
        this.config = config;
        try {
            name = config.getAttribute(NAME);

            // Instantiate the configured analyzer (or the default one) by
            // reflection; it must have a public no-arg constructor.
            String analyzer = config.getAttribute(ANALYZER);
            String anc = StringUtilities.isBlank(analyzer) ? DEFANALYZER : analyzer;
            Class c = Class.forName(anc);
            an = (Analyzer) c.newInstance();

            Configuration[] patterns = config.getChildren(PATTERN);
            if (patterns.length == 0) {
                throw new CannotCreateException("Configuration must have at least one pattern");
            }
            for (int i = 0; i < patterns.length; i++) {
                String name = patterns[i].getAttribute(PATTERN_STRING);
                String alias = patterns[i].getAttribute(PATTERN_ALIAS);
                this.patterns.put(new IndexPattern(collection.getSymbols(), name, null), alias);
            }

            // The optional <default> element selects which alias is searched
            // for query terms without an explicit field name.
            Configuration[] defaults = config.getChildren(DEFAULT);
            if (defaults.length > 1) {
                throw new CannotCreateException("There may be only one default field");
            } else if (defaults.length == 1) {
                String alias = defaults[0].getAttribute(PATTERN_ALIAS);
                if (this.patterns.values().contains(alias)) {
                    defaultField = alias;
                } else {
                    throw new CannotCreateException("Alias '" + alias + "' is undefined in configuration");
                }
            }

            setFile(new File(collection.getCollectionRoot(), name));
        } catch (Exception e) {
            throw new XindiceException(e);
        }
    }

    public Configuration getConfig() {
        return config;
    }

    public boolean exists() {
        // NOTE(review): uses idxFile directly, so this returns false-ish
        // behavior (indexExists on null) before setConfig() — confirm callers
        // always configure first.
        return IndexReader.indexExists(idxFile);
    }

    /**
     * Creates necessary resources.
     *
     * @return true, if successful
     * @throws DBException There was a low-level IOException that prevented
     *         index from creating resources.
     * @throws DuplicateIndexException Parent collection already has full text index
     */
    public synchronized boolean create() throws DBException {
        if (luceneIndexerFound()) {
            throw new DuplicateIndexException("Collection can only have one full text index.");
        }
        openWrite(true);
        return true;
    }

    /**
     * Checks whether the parent collection already has a LuceneIndexer
     * registered (only one full text index per collection is allowed).
     */
    private boolean luceneIndexerFound() throws DBException {
        String indexers[] = collection.getIndexManager().list();
        for (int i = 0; i < indexers.length; i++) {
            Indexer indexer = collection.getIndexer(indexers[i]);
            if (indexer instanceof LuceneIndexer) {
                return true;
            }
        }
        return false;
    }

    public boolean open() throws DBException {
        openWrite(false);
        return true;
    }

    public boolean isOpened() {
        return null != iw;
    }

    public synchronized boolean close() throws DBException {
        closeWrite();
        if (searcher != null) {
            // Force-close the cached searcher even if searches are in
            // progress, and drop the reference so a later open() +
            // getSearcher() creates a fresh Searcher instead of touching
            // a closed IndexReader.
            searcher.close(true);
            searcher = null;
        }
        return true;
    }

    public boolean drop() throws DBException {
        try {
            if (IndexReader.indexExists(idxFile)) {
                close();
                return deepDelete(getFile());
            } else {
                return false;
            }
        } catch (IOException e) {
            throw new DBException(FaultCodes.IDX_CORRUPTED,
                    "Failed to delete index " + name + ", collection " + collection.getCanonicalName(), e);
        }
    }

    public String getName() {
        return name;
    }

    public void setCollection(Collection collection) {
        this.collection = collection;
    }

    public Analyzer getAnalyzer() {
        return an;
    }

    /**
     * Opens the shared IndexWriter if it is not open yet.
     *
     * @param create true to create a new (empty) index, false to open an
     *        existing one
     * @throws DBException wrapping the underlying IOException
     */
    private void openWrite(boolean create) throws DBException {
        if (log.isTraceEnabled()) {
            log.trace("Calling openWrite(" + create + ")");
        }
        try {
            if (iw == null) {
                iw = new IndexWriter(getFile(), getAnalyzer(), create);
            }
        } catch (IOException e) {
            if (create) {
                throw new DBException(FaultCodes.IDX_CANNOT_CREATE,
                        "Failed to create index " + name + ", collection " + collection.getCanonicalName(), e);
            } else {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to open index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }

    /**
     * @throws IllegalStateException if the index writer is not open
     */
    private void assertOpen() {
        if (!isOpened()) {
            throw new IllegalStateException("Index has not been opened");
        }
    }

    private void closeWrite() throws DBException {
        if (null != iw) {
            try {
                iw.close();
                iw = null;
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to close writer for index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }

    /**
     * Recursively deletes a file or directory tree.
     *
     * @return true if everything was deleted, false on the first failure
     */
    private boolean deepDelete(File f) throws IOException {
        if (f.isDirectory()) {
            File fl[] = f.listFiles();
            for (int i = 0; i < fl.length; i++) {
                if (!deepDelete(fl[i])) {
                    return false;
                }
            }
        }
        return f.delete();
    }

    /**
     * Flushes buffered changes to disk and occasionally optimizes the index.
     *
     * @throws DBException if the flush or optimize failed with an IOException
     * @throws IllegalStateException if the index is not open
     */
    public void flush() throws DBException {
        try {
            assertOpen();
            if (iw != null) {
                iw.flush();
                int nDocs = iw.docCount();
                /* Fairly arbitrary rules for triggering index optimisation. Need to
                 * play with these.
                 */
                synchronized (lock) {
                    if (docsAdded > nDocs / 10 || docsAdded > 50 || docsDeleted > 10) {
                        if (log.isDebugEnabled()) {
                            log.debug("Optimizing text index for " + collection.getCanonicalName() + "...");
                        }
                        iw.optimize();
                        docsAdded = 0;
                        docsDeleted = 0;
                    }
                }
            }
        } catch (IOException e) {
            throw new DBException(FaultCodes.IDX_CORRUPTED,
                    "Could not force unwritten data to disk for index " + name + ", collection " + collection.getCanonicalName(), e);
        }
    }

    /**
     * Creates new instance of a handler to listen to indexer events. For
     * every document that being added there will be a separate handler
     * that will assemble all relevant values in a single Lucene document.
     *
     * @return new instance of IndexerEventHandler
     */
    public IndexerEventHandler getIndexerEventHandler() {
        return new BasicIndexerEventHandler() {
            // Accumulates field values for one Xindice document; lazily
            // created on the first matching value, written out in
            // onDocumentAdded().
            Document doc;

            public void onDocumentAdded(Key key) throws DBException {
                // doc is null when no pattern matched anything in the
                // document — nothing to index in that case.
                if (doc != null) {
                    assertOpen();
                    try {
                        iw.addDocument(doc);
                        synchronized (lock) {
                            docsAdded++;
                        }
                    } catch (IOException e) {
                        throw new DBException(FaultCodes.IDX_CORRUPTED,
                                "Failed to add document to the index " + name + ", collection " + collection.getCanonicalName(), e);
                    }
                }
            }

            public void onDocumentDeleted(Key key) throws DBException {
                assertOpen();
                try {
                    // The key field is stored untokenized, so an exact Term
                    // match identifies the Lucene document for this key.
                    iw.deleteDocuments(new Term(KEYNAME, key.toString()));
                    synchronized (lock) {
                        docsDeleted++;
                    }
                } catch (IOException e) {
                    throw new DBException(FaultCodes.IDX_CORRUPTED,
                            "Failed to delete document from the index " + name + ", collection " + collection.getCanonicalName(), e);
                }
            }

            public void onValueAdded(IndexPattern pattern, String value, Key key, int pos, int len, short elemID, short attrID) {
                if (doc == null) {
                    doc = new Document();
                    // Store the key so search hits can be mapped back to
                    // Xindice documents; untokenized for exact deletes.
                    doc.add(new Field(KEYNAME, key.toString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
                }
                String field = (String) patterns.get(pattern);
                doc.add(new Field(field, value, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
            }
        };
    }

    public IndexMatch[] queryMatches(final IndexQuery query) throws DBException {
        // this indexer only supports text queries
        if (query.getOperator() != IndexQuery.TQ) {
            return null;
        }

        String textQuery = query.getValue(0).toString();
        try {
            // Parse with the same analyzer that was used for indexing.
            return queryMatches(new QueryParser(defaultField, getAnalyzer()).parse(textQuery));
        } catch (ParseException e) {
            throw new CompilationException("Failed to parse query '" + textQuery + "'", e);
        }
    }

    /**
     * Same as {@link Indexer#queryMatches(IndexQuery)}, but accepts compiled Lucene query as
     * parameter.
     *
     * @param query Compiled Lucene query.
     * @return The resulting matches
     * @throws DBException if IOException prevented indexer from executing the query.
     */
    public IndexMatch[] queryMatches(Query query) throws DBException {
        ArrayList matches = new ArrayList();
        Searcher searcher = getSearcher();
        try {
            Hits hits = searcher.search(query);
            for (Iterator i = hits.iterator(); i.hasNext(); ) {
                Hit hit = (Hit) i.next();
                Key key = new Key(hit.getDocument().getField(KEYNAME).stringValue());
                // Position/length are not tracked by the full text index.
                matches.add(new IndexMatch(key, -1, -1));
            }
        } catch (IOException e) {
            throw new ProcessingException("Failed to process a query", e);
        } finally {
            // Always release the reference taken by getSearcher().
            searcher.free();
        }
        return (IndexMatch[]) matches.toArray(EMPTY_MATCHES);
    }

    /**
     * getSearcher returns Searcher that uses current version of the index.
     * If index has been modified since last time searcher was requested
     * this method will create new Searcher instance, otherwise it will
     * return Searcher instance it created previously.
     *
     * @return current Searcher
     * @throws DBException
     */
    private synchronized Searcher getSearcher() throws DBException {
        if (searcher != null && !searcher.isCurrent()) {
            // Stale searcher: close it unless searches still reference it
            // (close(false) checks the ref count), then forget it.
            searcher.close(false);
            searcher = null;
        }
        if (searcher == null) {
            // New Searcher starts with ref == 1 for this caller.
            searcher = new Searcher();
        } else {
            searcher.incRef();
        }
        return searcher;
    }

    /**
     * Reference-counted wrapper around an IndexReader/IndexSearcher pair.
     * Instances are handed out by {@link #getSearcher()} and released via
     * {@link #free()}.
     */
    private class Searcher {

        private IndexReader ir;
        private IndexSearcher is;

        // number of searches in progress using that searcher;
        // starts at 1 for the caller that constructed it
        private int ref = 1;

        public Searcher() throws DBException {
            try {
                ir = IndexReader.open(getFile());
                is = new IndexSearcher(ir);
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to open index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }

        /**
         * @return true if the underlying reader still reflects the latest
         *         version of the index on disk
         */
        public boolean isCurrent() throws DBException {
            try {
                return ir.isCurrent();
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to access index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }

        public void incRef() {
            ref++;
        }

        /**
         * This method must be called after executing text query to cleanup
         * resources that are not in use anymore. It decrements number of
         * searches referencing this searcher and then attempts to close it
         * unless it is the most recently opened searcher. If there were no
         * searchers opened after this one, the searcher will be kept open
         * for future use, even if it is not used at the moment.
         *
         * @throws DBException if there was IOException
         */
        public void free() throws DBException {
            // Synchronized on the outer indexer so the ref count and the
            // identity check against the cached searcher are consistent
            // with getSearcher()/close().
            synchronized (LuceneIndexer.this) {
                ref--;
                if (searcher != this) {
                    close(false);
                }
            }
        }

        /**
         * Closes the searcher if it is not used in any search.
         *
         * @param force true if searcher has to be closed even if it is used
         * @throws DBException if there was IOException
         */
        public void close(boolean force) throws DBException {
            try {
                if (ref == 0 || force) {
                    is.close();
                    ir.close();
                }
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to access index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }

        public Hits search(Query query) throws DBException {
            try {
                return is.search(query);
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                        "Failed to access index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }
}