Source Code of org.apache.lucene.index.IndexWriter

package org.apache.lucene.index;

/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation.  All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
*    notice, this list of conditions and the following disclaimer in
*    the documentation and/or other materials provided with the
*    distribution.
*
* 3. The end-user documentation included with the redistribution,
*    if any, must include the following acknowledgment:
*       "This product includes software developed by the
*        Apache Software Foundation (http://www.apache.org/)."
*    Alternately, this acknowledgment may appear in the software itself,
*    if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
*    "Apache Lucene" must not be used to endorse or promote products
*    derived from this software without prior written permission. For
*    written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
*    "Apache Lucene", nor may "Apache" appear in their name, without
*    prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation.  For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/

import java.io.IOException;
import java.io.File;
import java.io.PrintStream;
import java.util.Vector;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.analysis.Analyzer;

/**
  An IndexWriter creates and maintains an index.

  The third argument to the <a href="#IndexWriter"><b>constructor</b></a>
  determines whether a new index is created, or whether an existing index is
  opened for the addition of new documents.

  In either case, documents are added with the <a
  href="#addDocument"><b>addDocument</b></a> method.  When finished adding
  documents, <a href="#close"><b>close</b></a> should be called.

  If an index will not have more documents added for a while and optimal search
  performance is desired, then the <a href="#optimize"><b>optimize</b></a>
  method should be called before the index is closed.
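
  A minimal usage sketch follows (the index path and field name are
  hypothetical, for illustration only; <code>SimpleAnalyzer</code> and
  <code>Field</code> come from the <code>org.apache.lucene.analysis</code>
  and <code>org.apache.lucene.document</code> packages):

  <pre>
    IndexWriter writer = new IndexWriter("/tmp/testindex",
                                         new SimpleAnalyzer(), true);
    Document doc = new Document();
    doc.add(Field.Text("contents", "an example document body"));
    writer.addDocument(doc);
    writer.optimize();                  // merge segments for faster search
    writer.close();                     // flush changes and release the lock
  </pre>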
  */

public final class IndexWriter {
  private Directory directory;        // where this index resides
  private Analyzer analyzer;        // how to analyze text

  private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
  private final Directory ramDirectory = new RAMDirectory(); // for temp segs

  private Lock writeLock;

  /** Constructs an IndexWriter for the index in <code>path</code>.  Text will
    be analyzed with <code>a</code>.  If <code>create</code> is true, then a
    new, empty index will be created in <code>path</code>, replacing the index
    already there, if any. */
  public IndexWriter(String path, Analyzer a, boolean create)
       throws IOException {
    this(FSDirectory.getDirectory(path, create), a, create);
  }

  /** Constructs an IndexWriter for the index in <code>path</code>.  Text will
    be analyzed with <code>a</code>.  If <code>create</code> is true, then a
    new, empty index will be created in <code>path</code>, replacing the index
    already there, if any. */
  public IndexWriter(File path, Analyzer a, boolean create)
       throws IOException {
    this(FSDirectory.getDirectory(path, create), a, create);
  }

  /** Constructs an IndexWriter for the index in <code>d</code>.  Text will be
    analyzed with <code>a</code>.  If <code>create</code> is true, then a new,
    empty index will be created in <code>d</code>, replacing the index already
    there, if any. */
  public IndexWriter(Directory d, Analyzer a, final boolean create)
       throws IOException {
    directory = d;
    analyzer = a;

    Lock writeLock = directory.makeLock("write.lock");
    if (!writeLock.obtain())                      // obtain write lock
      throw new IOException("Index locked for write: " + writeLock);
    this.writeLock = writeLock;                   // save it

    synchronized (directory) {                    // in- & inter-process sync
      new Lock.With(directory.makeLock("commit.lock")) {
          public Object doBody() throws IOException {
            if (create)
              segmentInfos.write(directory);
            else
              segmentInfos.read(directory);
            return null;
          }
        }.run();
    }
  }

  /** Flushes all changes to an index, closes all associated files, and closes
    the directory that the index is stored in. */
  public final synchronized void close() throws IOException {
    flushRamSegments();
    ramDirectory.close();
    writeLock.release();                          // release write lock
    writeLock = null;
    directory.close();
  }

  /** Release the write lock, if needed. */
  protected final void finalize() throws IOException {
    if (writeLock != null) {
      writeLock.release();                        // release write lock
      writeLock = null;
    }
  }

  /** Returns the number of documents currently in this index. */
  public final synchronized int docCount() {
    int count = 0;
    for (int i = 0; i < segmentInfos.size(); i++) {
      SegmentInfo si = segmentInfos.info(i);
      count += si.docCount;
    }
    return count;
  }

  /** The maximum number of terms that will be indexed for a single field in a
    document.  This limits the amount of memory required for indexing, so that
    collections with very large files will not crash the indexing process by
    running out of memory.
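
    <p>A minimal sketch of raising this limit on a writer (the value here is
    arbitrary, for illustration only):
    <pre>
      writer.maxFieldLength = 100000;   // index up to 100,000 terms per field
    </pre>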

    <p>By default, no more than 10,000 terms will be indexed for a field. */
  public int maxFieldLength = 10000;

  /** Adds a document to this index.*/
  public final void addDocument(Document doc) throws IOException {
    DocumentWriter dw =
      new DocumentWriter(ramDirectory, analyzer, maxFieldLength);
    String segmentName = newSegmentName();
    dw.addDocument(segmentName, doc);
    synchronized (this) {
      segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory));
      maybeMergeSegments();
    }
  }

  private final synchronized String newSegmentName() {
    return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
  }

  /** Determines how often segment indexes are merged by addDocument().  With
   * smaller values, less RAM is used while indexing, and searches on
   * unoptimized indexes are faster, but indexing speed is slower.  With larger
   * values more RAM is used while indexing and searches on unoptimized indexes
   * are slower, but indexing is faster.  Thus larger values (> 10) are best
   * for batched index creation, and smaller values (< 10) for indexes that are
   * interactively maintained.
   *
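   * <p>For example, with the default <code>mergeFactor</code> of 10, ten
   * single-document segments are merged into one 10-document segment, ten of
   * those into one 100-document segment, and so on, so an index of N
   * documents passes through roughly log<sub>10</sub>(N) size levels.
   *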
   * <p>This must never be less than 2.  The default value is 10.*/
  public int mergeFactor = 10;

  /** Determines the largest number of documents ever merged by addDocument().
   * Small values (e.g., less than 10,000) are best for interactive indexing,
   * as this limits the length of pauses while indexing to a few seconds.
   * Larger values are best for batched indexing and speedier searches.
   *
   * <p>The default value is {@link Integer#MAX_VALUE}. */
  public int maxMergeDocs = Integer.MAX_VALUE;

  /** If non-null, information about merges will be printed to this. */
  public PrintStream infoStream = null;

  /** Merges all segments together into a single segment, optimizing an index
      for search. */
  public final synchronized void optimize() throws IOException {
    flushRamSegments();
    while (segmentInfos.size() > 1 ||
           (segmentInfos.size() == 1 &&
            (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
             segmentInfos.info(0).dir != directory))) {
      int minSegment = segmentInfos.size() - mergeFactor;
      mergeSegments(minSegment < 0 ? 0 : minSegment);
    }
  }

  /** Merges all segments from an array of indexes into this index.
   *
   * <p>This may be used to parallelize batch indexing.  A large document
   * collection can be broken into sub-collections.  Each sub-collection can be
   * indexed in parallel, on a different thread, process or machine.  The
   * complete index can then be created by merging sub-collection indexes
   * with this method.
   *
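   * <p>A hypothetical sketch, assuming two sub-indexes were built separately
   * in directories <code>dir1</code> and <code>dir2</code>:
   *
   * <pre>
   *   writer.addIndexes(new Directory[] { dir1, dir2 });
   * </pre>
   *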
   * <p>After this completes, the index is optimized. */
  public final synchronized void addIndexes(Directory[] dirs)
      throws IOException {
    optimize();            // start with zero or 1 seg
    for (int i = 0; i < dirs.length; i++) {
      SegmentInfos sis = new SegmentInfos();    // read infos from dir
      sis.read(dirs[i]);
      for (int j = 0; j < sis.size(); j++) {
        segmentInfos.addElement(sis.info(j));     // add each info
      }
    }
    optimize();            // final cleanup
  }

  /** Merges all RAM-resident segments into a single disk-based segment,
    possibly also folding in the most recent disk segment when the combined
    document count stays within mergeFactor. */
  private final void flushRamSegments() throws IOException {
    int minSegment = segmentInfos.size()-1;
    int docCount = 0;
    while (minSegment >= 0 &&
           (segmentInfos.info(minSegment)).dir == ramDirectory) {
      docCount += segmentInfos.info(minSegment).docCount;
      minSegment--;
    }
    if (minSegment < 0 ||                         // add one FS segment?
        (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor ||
        !(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory))
      minSegment++;
    if (minSegment >= segmentInfos.size())
      return;                                     // none to merge
    mergeSegments(minSegment);
  }

  /** Incremental segment merger.  Merges proceed in geometric levels: once
    the tail of the segment list holds a full target size's worth of
    documents, those segments are merged into one, and the target size is
    multiplied by mergeFactor. */
  private final void maybeMergeSegments() throws IOException {
    long targetMergeDocs = mergeFactor;
    while (targetMergeDocs <= maxMergeDocs) {
      // find segments smaller than current target size
      int minSegment = segmentInfos.size();
      int mergeDocs = 0;
      while (--minSegment >= 0) {
        SegmentInfo si = segmentInfos.info(minSegment);
        if (si.docCount >= targetMergeDocs)
          break;
        mergeDocs += si.docCount;
      }

      if (mergeDocs >= targetMergeDocs)           // found a merge to do
        mergeSegments(minSegment+1);
      else
        break;

      targetMergeDocs *= mergeFactor;             // increase target size
    }
  }

  /** Pops segments off of segmentInfos stack down to minSegment, merges them,
    and pushes the merged index onto the top of the segmentInfos stack. */
  private final void mergeSegments(int minSegment)
      throws IOException {
    String mergedName = newSegmentName();
    int mergedDocCount = 0;
    if (infoStream != null) infoStream.print("merging segments");
    SegmentMerger merger = new SegmentMerger(directory, mergedName);
    final Vector segmentsToDelete = new Vector();
    for (int i = minSegment; i < segmentInfos.size(); i++) {
      SegmentInfo si = segmentInfos.info(i);
      if (infoStream != null)
        infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
      SegmentReader reader = new SegmentReader(si);
      merger.add(reader);
      if ((reader.directory == this.directory) || // if we own the directory
          (reader.directory == this.ramDirectory))
        segmentsToDelete.addElement(reader);      // queue segment for deletion
      mergedDocCount += si.docCount;
    }
    if (infoStream != null) {
      infoStream.println();
      infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
    }
    merger.merge();

    segmentInfos.setSize(minSegment);             // pop old infos & add new
    segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount,
                                            directory));

    synchronized (directory) {                    // in- & inter-process sync
      new Lock.With(directory.makeLock("commit.lock")) {
          public Object doBody() throws IOException {
            segmentInfos.write(directory);        // commit before deleting
            deleteSegments(segmentsToDelete);     // delete now-unused segments
            return null;
          }
        }.run();
    }
  }

  /* Some operating systems (e.g. Windows) don't permit a file to be deleted
     while it is open for reading (e.g. by another process or thread).  So we
     assume that when a delete fails it is because the file is open in another
     process, and queue the file for subsequent deletion. */

  private final void deleteSegments(Vector segments) throws IOException {
    Vector deletable = new Vector();

    deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable

    for (int i = 0; i < segments.size(); i++) {
      SegmentReader reader = (SegmentReader)segments.elementAt(i);
      if (reader.directory == this.directory)
        deleteFiles(reader.files(), deletable);   // try to delete our files
      else
        deleteFiles(reader.files(), reader.directory); // delete, e.g., RAM files
    }

    writeDeleteableFiles(deletable);      // note files we can't delete
  }

  private final void deleteFiles(Vector files, Directory directory)
       throws IOException {
    for (int i = 0; i < files.size(); i++)
      directory.deleteFile((String)files.elementAt(i));
  }

  private final void deleteFiles(Vector files, Vector deletable)
       throws IOException {
    for (int i = 0; i < files.size(); i++) {
      String file = (String)files.elementAt(i);
      try {
        directory.deleteFile(file);               // try to delete each file
      } catch (IOException e) {                   // if delete fails
        if (directory.fileExists(file)) {
          if (infoStream != null)
            infoStream.println(e.getMessage() + "; Will re-try later.");
          deletable.addElement(file);             // add to deletable
        }
      }
    }
  }

  private final Vector readDeleteableFiles() throws IOException {
    Vector result = new Vector();
    if (!directory.fileExists("deletable"))
      return result;

    InputStream input = directory.openFile("deletable");
    try {
      for (int i = input.readInt(); i > 0; i--)   // read file names
        result.addElement(input.readString());
    } finally {
      input.close();
    }
    return result;
  }

  private final void writeDeleteableFiles(Vector files) throws IOException {
    OutputStream output = directory.createFile("deleteable.new");
    try {
      output.writeInt(files.size());
      for (int i = 0; i < files.size(); i++)
        output.writeString((String)files.elementAt(i));
    } finally {
      output.close();
    }
    directory.renameFile("deleteable.new", "deletable");
  }
}