// Package net.sf.regain.crawler
//
// Source code of net.sf.regain.crawler.IndexWriterManager

/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004  Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
*  $RCSfile$
*   $Source$
*     $Date: 2009-11-15 23:12:24 +0100 (So, 15 Nov 2009) $
*   $Author: thtesche $
* $Revision: 424 $
*/
package net.sf.regain.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;

import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.config.CrawlerConfig;
import net.sf.regain.crawler.config.UrlMatcher;
import net.sf.regain.crawler.document.DocumentFactory;
import net.sf.regain.crawler.document.RawDocument;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
* Kontrolliert und kapselt die Erstellung des Suchindex.
* <p>
* <b>Anwendung:</b><br>
* Rufen Sie für jedes Dokument {@link #addToIndex(RawDocument, ErrorLogger)}
* auf. Rufen Sie am Ende {@link #close(boolean)} auf, um den Index zu
* schlie�en. Danach sind keine weiteren Aufrufe von
* {@link #addToIndex(RawDocument, ErrorLogger)} erlaubt.
*
* @author Til Schneider, www.murfman.de
*/
public class IndexWriterManager {

  /** The logger for this class */
  private static Logger mLog = Logger.getLogger(IndexWriterManager.class);
  /**
   * Der Name des Index-Unterverzeichnisses, in das der neue Index gestellt
   * werden soll, sobald er fertig ist ohne dass fatale Fehler aufgetreten sind.
   * <p>
   * Die Suchmaske wird, sobald es diese Verzeichnis gibt seine Suche darauf
   * umstellen. Dabei wird es in "index" umbenannt.
   */
  private static final String NEW_INDEX_SUBDIR = "new";
  /**
   * Der Name des Index-Unterverzeichnisses, in das der neue Index gestellt
   * werden soll, sobald er fertig ist, wobei fatale Fehler sufgetreten sind.
   */
  private static final String QUARANTINE_INDEX_SUBDIR = "quarantine";
  /** Der Name des Index-Unterverzeichnisses, in dem der genutzte Index steht. */
  private static final String WORKING_INDEX_SUBDIR = "index";
  /**
   * Der Name des Index-Unterverzeichnisses, in dem der neue Index aufgebaut
   * werden soll.
   */
  private static final String TEMP_INDEX_SUBDIR = "temp";
  /**
   * The name of the index sub directory that contains a breakpoint.
   * <p>
   * NOTE: The crawler creates periodically so called breakpoints. If the
   * crawler should be stopped before it is finished it can use a breakpoint
   * to go on the next time. Besides the search mask can use a breakpoint if no
   * other index exists. So the user can already search before the first index
   * was fully created.
   */
  private static final String BREAKPOINT_INDEX_SUBDIR = "breakpoint";
  /**
   * Gibt an, ob die Terme sortiert in die Terme-Datei geschrieben werden soll.
   *
   * @see #writeTermFile(File, File)
   */
  private static final boolean WRITE_TERMS_SORTED = true;
  /**
   * Workaround: Unter Windows klappt das Umbenennen unmittelbar nach Schlie�en
   * des Index nicht. Wahrscheinlich sind die Filepointer auf die gerade
   * geschlossenen Dateien noch nicht richtig aufger�umt, so dass ein Umbenennen
   * des Indexverzeichnisses fehl schl�gt. Das Umbenennen wird daher regelm��ig
   * probiert, bis es entweder funktioniert oder bis der Timeout abgelaufen ist.
   */
  private static final long RENAME_TIMEOUT = 60000; // 1 min
  /**
   * The writing mode.
   * @see #setIndexMode(int)
   */
  private static final int WRITING_MODE = 1;
  /**
   * The reading mode.
   * @see #setIndexMode(int)
   */
  private static final int READING_MODE = 2;
  /**
   * The searching mode.
   * @see #setIndexMode(int)
   */
  private static final int SEARCHING_MODE = 3;
  /**
   * The all closed mode.
   * @see #setIndexMode(int)
   */
  private static final int ALL_CLOSED_MODE = 4;
  /** The crawler configuration. */
  private CrawlerConfig mConfig;
  /** Der Analyzer, der vom IndexWriter genutzt werden soll. */
  private Analyzer mAnalyzer;
  /** Der gekapselte IndexWriter, der den eigentlichen Index erstellt. */
  private IndexWriter mIndexWriter;
  /**
   * Der gekapselte IndexReader. Wird zum L�schen von Dokumenten aus dem Index
   * ben�tigt.
   * <p>
   * Ist <code>null</code>, wenn der Index nicht aktualisiert werden soll.
   */
  private IndexReader mIndexReader;
  /**
   * Der gekapselte IndexSearcher. Wird zum Finden von Dokumenten ben�tigt.
   * <p>
   * Ist <code>null</code>, wenn der Index nicht aktualisiert werden soll.
   */
  private IndexSearcher mIndexSearcher;
  /**
   * Gibt an, ob ein bestehender Index aktualisiert wird.
   * <p>
   * Anderenfalls wird ein komplett neuer Index angelegt.
   */
  private boolean mUpdateIndex;
  /**
   * Specifies whether a document that couldn't be prepared the last time should be retried.
   */
  private boolean mRetryFailedDocs;
  /** Die DocumentFactory, die die Inhalte für die Indizierung aufbereitet. */
  private DocumentFactory mDocumentFactory;
  /**
   * Das Verzeichnis, in dem der Suchindex am Ende stehen soll, wenn es keine
   * fatalen Fehler gab.
   */
  private File mNewIndexDir;
  /**
   * Das Verzeichnis, in dem der Suchindex am Ende stehen soll, wenn es
   * fatale Fehler gab.
   */
  private File mQuarantineIndexDir;
  /** Das Verzeichnis, in dem der neue Suchindex aufgebaut werden soll. */
  private File mTempIndexDir;
  /** The lucene representation of mTempIndexDir. */
  private Directory mLuceneTempIndexDir;
  /** The directory to create breakpoint indices. */
  private File mBreakpointIndexDir;
  /** Das Verzeichnis, in dem die Analyse-Dateien erstellt werden soll. */
  private File mAnalysisDir;
  /** The file where the error log should be stored. */
  private File mErrorLogFile;
  /**
   * The stream used for writing errors to the error log of the index.
   * May be <code>null</code>.
   */
  private FileOutputStream mErrorLogStream;
  /**
   * The print writer used for writing errors to the error log of the index.
   * May be <code>null</code>.
   */
  private PrintWriter mErrorLogWriter;
  /**
   * The number of documents that were in the (old) index when the
   * IndexWriterManager was created.
   */
  private int mInitialDocCount;
  /** Der Profiler der das Hinzufügen zum Index mißt. */
  private Profiler mAddToIndexProfiler = new Profiler("Indexed documents", "docs");
  /** The profiler for the breakpoint creation. */
  private Profiler mBreakpointProfiler = new Profiler("Created breakpoints", "breakpoints");
  /**
   * enthält die URL und den LastUpdated-String aller Dokumente, deren Eintr�ge
   * beim Abschlie�en des Index entfernt werden m�ssen.
   * <p>
   * Die URL bildet den key, der LastUpdated-String die value.
   */
  private HashMap mUrlsToDeleteHash;

  /**
   * Erzeugt eine neue IndexWriterManager-Instanz.
   *
   * @param config Die zu verwendende Konfiguration.
   * @param updateIndex Gibt an, ob ein bereits bestehender Index aktualisiert
   *        werden soll.
   * @param retryFailedDocs Specifies whether a document that couldn't be
   *        prepared the last time should be retried.
   *
   * @throws RegainException Wenn der neue Index nicht vorbereitet werden konnte.
   */
  public IndexWriterManager(CrawlerConfig config, boolean updateIndex,
          boolean retryFailedDocs)
          throws RegainException {
    mConfig = config;
    mUpdateIndex = updateIndex;
    mRetryFailedDocs = retryFailedDocs;

    mInitialDocCount = 0;

    File indexDir = new File(config.getIndexDir());

    if (!indexDir.exists()) {
      // The index directory does not exist -> Create it
      mLog.info("Creating index directory " + indexDir.getAbsolutePath());
      indexDir.mkdirs();
    }

    mNewIndexDir = new File(indexDir, NEW_INDEX_SUBDIR);
    mQuarantineIndexDir = new File(indexDir, QUARANTINE_INDEX_SUBDIR);
    mTempIndexDir = new File(indexDir, TEMP_INDEX_SUBDIR);
    try {
      mLuceneTempIndexDir = FSDirectory.open(mTempIndexDir);
    } catch (IOException ioEx) {
      throw new RegainException("Couldn't open tmpIndexDir", ioEx);
    }
    mBreakpointIndexDir = new File(indexDir, BREAKPOINT_INDEX_SUBDIR);

    mErrorLogFile = new File(mTempIndexDir, "log/error.log");

    // Delete the old temp index directory if it should still exist
    if (mTempIndexDir.exists()) {
      RegainToolkit.deleteDirectory(mTempIndexDir);
    }
    // and create a new, empty one
    if (!mTempIndexDir.mkdir()) {
      throw new RegainException("Creating working directory failed: " + mTempIndexDir.getAbsolutePath());
    }

    // Get the untokenized field names
    String[] untokenizedFieldNames = config.getUntokenizedFieldNames();

    // Create the Analyzer
    // NOTE: Make shure you use the same Analyzer in the SearchContext too!
    String analyzerType = config.getAnalyzerType();
    String[] stopWordList = config.getStopWordList();
    String[] exclusionList = config.getExclusionList();
    mAnalyzer = RegainToolkit.createAnalyzer(analyzerType, stopWordList,
            exclusionList, untokenizedFieldNames);

    // Alten Index kopieren, wenn Index aktualisiert werden soll
    if (updateIndex) {
      if (!copyExistingIndex(indexDir, analyzerType)) {
        mUpdateIndex = updateIndex = false;
      }
    }

    // Check whether we have to create a new index
    boolean createNewIndex = !updateIndex;
    if (createNewIndex) {
      // Create a new index
      try {
        mIndexWriter = createIndexWriter(true);
      } catch (IOException exc) {
        throw new RegainException("Creating new index failed", exc);
      }
    }

    if (updateIndex) {
      // Force an unlock of the index (we just created a copy so this is save)
      setIndexMode(READING_MODE);
      try {

        IndexWriter.unlock(mIndexReader.directory());
        mInitialDocCount = mIndexReader.numDocs();
      } catch (IOException exc) {
        throw new RegainException("Forcing unlock failed", exc);
      }
    }

    // Write the stopWordList and the exclusionList in a file so it can be found
    // by the search mask
    RegainToolkit.writeToFile(analyzerType, new File(mTempIndexDir, "analyzerType.txt"));
    RegainToolkit.writeListToFile(stopWordList, new File(mTempIndexDir, "stopWordList.txt"));
    RegainToolkit.writeListToFile(exclusionList, new File(mTempIndexDir, "exclusionList.txt"));
    if (untokenizedFieldNames.length != 0) {
      RegainToolkit.writeListToFile(untokenizedFieldNames, new File(mTempIndexDir, "untokenizedFieldNames.txt"));
    }

    // Prepare the analysis directory if wanted
    if (config.getWriteAnalysisFiles()) {
      mAnalysisDir = new File(mTempIndexDir.getAbsolutePath() + File.separator + "analysis");

      if (!mAnalysisDir.mkdir()) {
        throw new RegainException("Creating analysis directory failed: " + mAnalysisDir.getAbsolutePath());
      }
    }

    mDocumentFactory = new DocumentFactory(config, mAnalysisDir);
  }

  /**
   * Gibt zurück, ob ein bestehender Index aktualisiert wird.
   * <p>
   * Anderenfalls wird ein komplett neuer Index angelegt.
   *
   * @return Ob ein bestehender Index aktualisiert wird.
   */
  public boolean getUpdateIndex() {
    return mUpdateIndex;
  }

  /**
   * Gets the number of documents that were in the (old) index when the
   * IndexWriterManager was created.
   *
   * @return The initial number of documents in the index.
   */
  public int getInitialDocCount() {
    return mInitialDocCount;
  }

  /**
   * Gets the number of documents that were added to the index.
   *
   * @return The number of documents added to the index.
   */
  public int getAddedDocCount() {
    return mAddToIndexProfiler.getMeasureCount();
  }

  /**
   * Gets the number of documents that will be removed from the index.
   *
   * @return The number of documents removed from the index.
   */
  public int getRemovedDocCount() {
    // NOTE: We get a local pointer to the mUrlsToDeleteHash, if the hash should
    //       be set to null in the same time.
    HashMap hash = mUrlsToDeleteHash;
    return (hash == null) ? 0 : hash.size();
  }

  /**
   * Logs an error at the error log of the index.
   *
   * @param msg The error message.
   * @param thr The error to log. May be <code>null</code>.
   * @throws RegainException If writing to the error log failed.
   */
  public void logError(String msg, Throwable thr) throws RegainException {
    if (mErrorLogStream == null) {
      try {
        new File(mTempIndexDir, "log").mkdir();
        mErrorLogStream = new FileOutputStream(mErrorLogFile, true);
        mErrorLogWriter = new PrintWriter(mErrorLogStream);
      } catch (IOException exc) {
        throw new RegainException("Opening error log file of the index failed", exc);
      }
    }

    if (thr == null) {
      mErrorLogWriter.println(msg);
    } else {
      mErrorLogWriter.println(msg + ":");
      thr.printStackTrace(mErrorLogWriter);
      mErrorLogWriter.println();
    }
    mErrorLogWriter.flush();
  }

  /**
   * Sets the current mode
   * <p>
   * The are the following modes:
   * <ul>
   *   <li>Writing mode: The mIndexWriter is opened, the mIndexSearcher may be
   *     opened, the mIndexReader is closed. In this mode documents may be added
   *     to the index.</li>
   *   <li>Reading mode: The mIndexReader is opened, the mIndexSearcher may be
   *     opened, the mIndexWriter is closed. In this mode documents may be
   *     read or removed from the index.</li>
   *   <li>Searching mode: The mIndexSearcher is opened, the mIndexWriter or
   *     mIndexReader may be opened. In this mode documents may be searched.
   *   <li>All closed mode: All access to the index ist closed:
   *     mIndexWriter, mIndexReader and mIndexSearcher. In this mode the index
   *     can't be accessed at all.
   * </ul>
   * <p>
   * If the index already is in the wanted mode nothing happens. This method is
   * very fast in this case.
   *
   * @param mode The mode the index should have. Must be one of
   *        {@link #WRITING_MODE}, {@link #READING_MODE}, {@link #SEARCHING_MODE}
   *        or {@link #ALL_CLOSED_MODE}.
   * @throws RegainException If closing or opening failed.
   */
  private void setIndexMode(int mode) throws RegainException {
    // Close the mIndexReader in WRITING_MODE and ALL_CLOSED_MODE
    if ((mode == WRITING_MODE) || (mode == ALL_CLOSED_MODE)) {
      if (mIndexReader != null) {
        try {
          mIndexReader.close();
          mIndexReader = null;
        } catch (IOException exc) {
          throw new RegainException("Closing IndexReader failed", exc);
        }
      }
    }

    // Close the mIndexWriter in READING_MODE and ALL_CLOSED_MODE
    if ((mode == READING_MODE) || (mode == ALL_CLOSED_MODE)) {
      if (mIndexWriter != null) {
        try {
          mIndexWriter.close();
          mIndexWriter = null;
        } catch (IOException exc) {
          throw new RegainException("Closing IndexWriter failed", exc);
        }
      }
    }

    // Close the mIndexSearcher in ALL_CLOSED_MODE
    if ((mode == ALL_CLOSED_MODE) && (mIndexSearcher != null)) {
      try {
        mIndexSearcher.close();
        mIndexSearcher = null;
      } catch (IOException exc) {
        throw new RegainException("Closing IndexSearcher failed", exc);
      }
    }

    // Open the mIndexWriter in WRITING_MODE
    if ((mode == WRITING_MODE) && (mIndexWriter == null)) {
      mLog.info("Switching to index mode: adding mode");
      try {
        mIndexWriter = createIndexWriter(false);
      } catch (IOException exc) {
        throw new RegainException("Creating IndexWriter failed", exc);
      }
    }

    // Open the mIndexReader in READING_MODE
    if ((mode == READING_MODE) && (mIndexReader == null)) {
      mLog.info("Switching to index mode: deleting mode");
      try {
        mIndexReader = IndexReader.open(mLuceneTempIndexDir, false);
      } catch (IOException exc) {
        throw new RegainException("Creating IndexReader failed", exc);
      }
    }

    // Open the mIndexSearcher in SEARCHING_MODE
    if ((mode == SEARCHING_MODE) && (mIndexSearcher == null)) {
      mLog.info("Switching to index mode: searching mode");
      try {
        mIndexSearcher = new IndexSearcher(mLuceneTempIndexDir, false);
      } catch (IOException exc) {
        throw new RegainException("Creating IndexSearcher failed", exc);
      }
    }

    // Tell the user, when switching to ALL_CLOSED_MODE
    if (mode == ALL_CLOSED_MODE) {
      mLog.info("Switching to index mode: all closed mode");
    }
  }

  private IndexWriter createIndexWriter(boolean createNewIndex)
          throws IOException {
    IndexWriter indexWriter = new IndexWriter(mLuceneTempIndexDir, mAnalyzer,
            createNewIndex, IndexWriter.MaxFieldLength.UNLIMITED);

    int maxFieldLength = mConfig.getMaxFieldLength();
    if (maxFieldLength > 0) {
      indexWriter.setMaxFieldLength(maxFieldLength);
    }

    return indexWriter;
  }

  /**
   * Kopiert den zuletzt erstellten Index in das Arbeitsverzeichnis.
   *
   * @param indexDir Das Verzeichnis, in dem der Index liegt.
   * @param analyzerType Der Analyzer-Typ, den der alte Index haben muss, um
   *        übernommen zu werden.
   * @return Ob ein alter Index gefunden wurde.
   * @throws RegainException Wenn das Kopieren fehl schlug.
   */
  private boolean copyExistingIndex(File indexDir, String analyzerType)
          throws RegainException {
    // Find the newest index
    File oldIndexDir;
    if (mBreakpointIndexDir.exists()) {
      oldIndexDir = mBreakpointIndexDir;
    } else if (mNewIndexDir.exists()) {
      oldIndexDir = mNewIndexDir;
    } else {
      // Es gibt keinen neuen Index -> Wir m�ssen den Index nehmen, der gerade
      // verwendet wird
      oldIndexDir = new File(indexDir, WORKING_INDEX_SUBDIR);
    }
    if (!oldIndexDir.exists()) {
      mLog.warn("Can't update index, because there was no old index. " +
              "A complete new index will be created...");
      return false;
    }

    // Analyzer-Typ des alten Index pr�en
    File analyzerTypeFile = new File(oldIndexDir, "analyzerType.txt");
    String analyzerTypeOfIndex = RegainToolkit.readStringFromFile(analyzerTypeFile);
    if ((analyzerTypeOfIndex == null) || (!analyzerType.equals(analyzerTypeOfIndex.trim()))) {
      mLog.warn("Can't update index, because the index was created using " +
              "another analyzer type (index type: '" + analyzerTypeOfIndex.trim() +
              "', configured type '" + analyzerType + "'). " +
              "A complete new index will be created...");
      return false;
    }

    // Index in Arbeitsverzeichnis kopieren
    mLog.info("Updating index from " + oldIndexDir.getAbsolutePath());
    RegainToolkit.copyDirectory(oldIndexDir, mTempIndexDir, false, ".txt");

    return true;
  }

  /**
   * Lookup for a document matching to a given url.
   *
   * @param url to check for
   * @return true if there exist the document for the url in the index.
   *
   * @throws RegainException if checking for url failed
   */
  public boolean isAlreadyIndexed(String url) throws RegainException {
    boolean result = false;

    if (mUpdateIndex) {
      // Search the entry for this URL
      Term urlTerm = new Term("url", url);
      Query query = new TermQuery(urlTerm);

      try {
        setIndexMode(SEARCHING_MODE);
        TopScoreDocCollector collector = TopScoreDocCollector.create(2, false);
        mIndexSearcher.search(query, collector);

        if (collector.getTotalHits() == 1) {
          // we found one hit for our URL
          result = true;

        }
      } catch (IOException exc) {
        throw new RegainException("Searching old index entry failed for " + url, exc);
      }
    }
    return result;
  }

  /**
   * Adds a document to an index.<p>
   *
   * @param rawDocument the document to add to a index
   * @param errorLogger The error logger to use for logging errors.
   *
   * @throws RegainException if adding of the document failed
   */
  public void addToIndex(RawDocument rawDocument, ErrorLogger errorLogger)
          throws RegainException {
    // Check whether there already is an up-to-date entry in the index
    if (mUpdateIndex) {
      boolean removeOldEntry = false;

      // Search the entry for this URL
      Term urlTerm = new Term("url", rawDocument.getUrl());
      Query query = new TermQuery(urlTerm);
      Document doc;
      try {
        setIndexMode(SEARCHING_MODE);
        TopScoreDocCollector collector = TopScoreDocCollector.create(20, false);
        mIndexSearcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        if (hits.length > 0) {
          if (hits.length > 1) {
            for (int i = 1; i < hits.length; i++) {
              markForDeletion(mIndexSearcher.doc(hits[i].doc));
            }
            mLog.warn("There are duplicate entries (" + hits.length + " in " +
                    "total) for " + rawDocument.getUrl() + ". They will be removed.");
          }

          doc = mIndexSearcher.doc(hits[0].doc);
        } else {
          doc = null;
        }
      } catch (IOException exc) {
        throw new RegainException("Searching old index entry failed for " + rawDocument.getUrl(), exc);
      }

      // If we found an entry, check whether it is up-to-date
      if (doc != null) {
        // Get the last modification date from the document
        Date docLastModified = rawDocument.getLastModified();

        if (docLastModified == null) {
          // We are not able to get the last modification date from the
          // document (this happens with all http-URLs)
          // -> Delete the old entry and create a new one
          mLog.info("Don't know when the document was last modified. " +
                  "Creating a new index entry...");
          removeOldEntry = true;

        } else {
          // Compare the modification date with the one from the index entry
          String asString = doc.get("last-modified");
          if (asString != null) {
            long diff = 86400001L;
            Date indexLastModified = null;
            try {
              indexLastModified = DateTools.stringToDate(asString);
              diff = docLastModified.getTime() - indexLastModified.getTime();
            } catch (ParseException parseException) {
              mLog.warn("Couldn't parse last-modified date from index. Document: " +
                      rawDocument.getUrl(), parseException);
            }
            if (diff > 86400000L) {
              // -> The index entry is not up-to-date -> Delete the old entry
              mLog.info("Index entry is outdated. Creating a new one (source=" +
                      docLastModified + "), (index=" + indexLastModified + "): " +
                      rawDocument.getUrl());
              removeOldEntry = true;

            } else if ((new Date().getTime()) - indexLastModified.getTime() < 86400000L) {
              // Spidering at the same day
              // Due to the fuzziness of the docLastModified.getTime() (day accuracy)
              // we can't be sure whether the document is up-to-date or not
              mLog.info("Index entry is from the same day. Therefore we have to recrawl but do not index the document." +
                      "Creating a new one (source=" + docLastModified + "), (index=" + indexLastModified + "): " +
                      rawDocument.getUrl());

              parseDocument(rawDocument, errorLogger);

              return;

            } else {
              // The index entry is up-to-date

              // Check whether the preparation failed the last time
              boolean failedLastTime = doc.get("preparation-error") != null;
              if (failedLastTime) {
                if (mRetryFailedDocs) {
                  // The entry failed the last time, the user want's a retry
                  // -> We do a retry
                  mLog.info("Retrying preparation of: " + rawDocument.getUrl());
                  removeOldEntry = true;
                } else {
                  // The entry failed the last time, the user want's no retry
                  // -> We are done
                  mLog.info("Ignoring " + rawDocument.getUrl() + ", because " +
                          "preparation already failed the last time and no retry is wanted.");
                  return;
                }
              } else {
                // The entry is up-to-date and contains text -> We are done
                mLog.info("Index entry is already up to date (index=" + indexLastModified + "), " +
                        "(source=" + docLastModified + "): " + rawDocument.getUrl());
                return;
              }
            }
          } else {
            // We don't know the last modification date from the index entry
            // -> Delete the entry
            mLog.info("Index entry has no last-modified field. " +
                    "Creating a new one: " + rawDocument.getUrl());
            removeOldEntry = true;
          }
        }
      }

      // Check whether we have to delete the old entry
      if (removeOldEntry) {
        // We don't delete the entry immediately, but we remember it.
        // See javadoc of markForDeletion(Document)
        markForDeletion(doc);
      }
    }

    // Create a new entry
    createNewIndexEntry(rawDocument, errorLogger);
  }

  /**
   * Creates a indexable document and add this to the index
   *
   * @param rawDocument which will be parsed
   * @param errorLogger The error logger to use for logging errors.
   *
   * @throws RegainException if indexing of the document failed
   */
  public void createNewIndexEntry(RawDocument rawDocument, ErrorLogger errorLogger)
          throws RegainException {
    // Dokument erzeugen
    if (mLog.isDebugEnabled()) {
      mLog.debug("Creating document: " + rawDocument.getUrl());
    }
    Document doc = mDocumentFactory.createDocument(rawDocument, errorLogger);

    // Dokument in den Index aufnehmen
    if (doc != null) {
      mAddToIndexProfiler.startMeasuring();
      try {
        setIndexMode(WRITING_MODE);
        mIndexWriter.addDocument(doc);
        mAddToIndexProfiler.stopMeasuring(rawDocument.getLength());
      } catch (IOException exc) {
        mAddToIndexProfiler.abortMeasuring();
        throw new RegainException("Adding document to index failed", exc);
      }
    }
  }

  /**
   * Creates a  document but don't add  this to the index
   *
   * @param rawDocument which will be parsed
   * @param errorLogger The error logger to use for logging errors.
   *
   * @throws RegainException if parsing of the document failed
   */
  public void parseDocument(RawDocument rawDocument, ErrorLogger errorLogger)
          throws RegainException {
    // Dokument erzeugen
    if (mLog.isDebugEnabled()) {
      mLog.debug("Creating document: " + rawDocument.getUrl() + " only for parsing.");
    }

    mDocumentFactory.createDocument(rawDocument, errorLogger);


  }

  /**
   * Getter for the current and initialised DocumentFactory.
   *
   * @return the current and initialised DocumentFactory
   */
  public DocumentFactory getDocumentFactory() {
    return mDocumentFactory;
  }

  /**
   * Goes through the index and deletes all obsolete entries.
   * <p>
   * Entries are obsolete if they are marked for deletion by the
   * IndexWriterManager (see {@link #mUrlsToDeleteHash}) or if the don't neither
   * match an entry of the urlToKeepSet nor of the prefixesToKeepArr.
   *
   * @param urlChecker The UrlChecker to use for deciding whether an index entry
   *        should be kept in the index or not. If null only the documents in
   *        the {@link #mUrlsToDeleteHash} will be deleted.
   * @throws RegainException If an index entry could either not be read or
   *         deleted.
   */
  public void removeObsoleteEntries(UrlChecker urlChecker)
          throws RegainException {
    if (!mUpdateIndex) {
      // Wir haben einen komplett neuen Index erstellt
      // -> Es kann keine Einträge zu nicht vorhandenen Dokumenten geben
      // -> Wir sind fertig
      return;
    }

    if ((mUrlsToDeleteHash == null) && (urlChecker == null)) {
      // There is nothing to delete -> Fast return
      return;
    }

    // Get the UrlMatchers that identify URLs that should not be deleted
    UrlMatcher[] preserveUrlMatcherArr = null;
    if (urlChecker != null) {
      preserveUrlMatcherArr = urlChecker.createPreserveUrlMatcherArr();
    }

    // Go through the index
    setIndexMode(READING_MODE);
    int docCount = mIndexReader.numDocs();
    for (int docIdx = 0; docIdx < docCount; docIdx++) {
      if (!mIndexReader.isDeleted(docIdx)) {
        // Document lesen
        Document doc;
        try {
          doc = mIndexReader.document(docIdx);
        } catch (Throwable thr) {
          throw new RegainException("Getting document #" + docIdx + " from index failed.", thr);
        }

        // URL und last-modified holen
        String url = doc.get("url");
        String lastModified = doc.get("last-modified");

        // Prüfen, ob die URL gelöscht werden soll
        boolean shouldBeDeleted;
        if (url != null) {
          // Prüfen, ob dieser Eintrag zum Löschen vorgesehen ist
          if (isMarkedForDeletion(doc)) {
            shouldBeDeleted = true;
          } // Check whether all other documents should NOT be deleted
          else if ((urlChecker == null)) {
            shouldBeDeleted = false;
          } // Check whether this document should be kept in the index
          else if (urlChecker.shouldBeKeptInIndex(url)) {
            shouldBeDeleted = false;
          } // Prüfen, ob die URL zu einem zu-verschonen-Präfix passt
          else {
            shouldBeDeleted = true;
            for (int i = 0; i < preserveUrlMatcherArr.length; i++) {
              if (preserveUrlMatcherArr[i].matches(url)) {
                shouldBeDeleted = false;
                break;
              }
            }
          }

          if (shouldBeDeleted) {
            try {
              mLog.info("Deleting from index: " + url + " from " + lastModified);
              mIndexReader.deleteDocument(docIdx);
            } catch (IOException exc) {
              throw new RegainException("Deleting document #" + docIdx + " from index failed: " + url + " from " + lastModified, exc);
            }
          }
        }
      }
    }

    // Merkliste der zu l�schenden Eintr�ge l�schen
    mUrlsToDeleteHash = null;
  }

  /**
   * Goes through the index and deletes all obsolete entries.
   * <p>
   * Entries are obsolete if they are marked for deletion by the
   * IndexWriterManager (see {@link #mUrlsToDeleteHash}).
   * <p>
   * Convenience overload: delegates to the full version with a
   * {@code null} UrlChecker, meaning no URLs are re-checked for keeping —
   * only explicitly marked entries are removed.
   *
   * @throws RegainException If an index entry could either not be read or
   *         deleted.
   */
  private void removeObsoleteEntries() throws RegainException {
    removeObsoleteEntries(null);
  }

  /**
   * Merkt ein Dokument für die sp�tere L�schung vor.
   * <p>
   * Diese Methode ist Teil eines Workaround: Ein alter Eintrag, der durch einen
   * neuen ersetzt wird, wird nicht sofort gel�scht, sondern nur zur L�schung
   * vorgemerkt. Auf diese Weise wird ein seltener Fehler umgangen, der das
   * Schlie�en des IndexWriter verhindert, wenn h�ufig zwischen InderWriter und
   * IndexReader gewechselt wird.
   *
   * @param doc Das vorzumerkende Dokument.
   */
  private void markForDeletion(Document doc) {
    if (mUrlsToDeleteHash == null) {
      mUrlsToDeleteHash = new HashMap();
    }

    String url = doc.get("url");
    String lastModified = doc.get("last-modified");
    if ((url != null) || (lastModified != null)) {
      mLog.info("Marking old entry for a later deletion: " + url + " from " + lastModified);
      mUrlsToDeleteHash.put(url, lastModified);
    }
  }

  /**
   * Gibt zurück, ob ein Dokument für die Löschung vorgemerkt wurde.
   *
   * @param doc Das zu prüfende Dokument.
   * @return Ob das Dokument für die Löschung vorgemerkt wurde.
   */
  private boolean isMarkedForDeletion(Document doc) {
    String url = doc.get("url");
    String lastModified = doc.get("last-modified");

    if ((url == null) || (lastModified == null)) {
      // url und last-modified sind Mussfelder
      // Da eines fehlt -> Dokument l�schen
      return true;
    }

    if (mUrlsToDeleteHash == null) {
      // Es sind gar keine Dokumente zum Löschen vorgemerkt
      return false;
    }

    // Prüfen, ob es einen Eintrag für diese URL gibt und ob er dem
    // last-modified des Dokuments entspricht
    String lastModifiedToDelete = (String) mUrlsToDeleteHash.get(url);
    return lastModified.equals(lastModifiedToDelete);
  }

  /**
   * Gibt die Anzahl der Eintr�ge im Index zurück.
   *
   * @return Die Anzahl der Eintr�ge im Index.
   * @throws RegainException Wenn die Anzahl nicht ermittelt werden konnte.
   */
  public int getIndexEntryCount() throws RegainException {
    if (mIndexReader != null) {
      return mIndexReader.numDocs();
    } else {
      setIndexMode(WRITING_MODE);
      return mIndexWriter.maxDoc();
    }
  }

  /**
   * Prepares a breakpoint.
   *
   * @throws RegainException If preparing the breakpoint failed.
   */
  private void prepareBreakpoint() throws RegainException {
    // Testen, ob noch Eintr�ge für die L�schung vorgesehen sind
    if (mUrlsToDeleteHash != null) {
      throw new RegainException("There are still documents marked for deletion." + " The method removeObsoleteEntires(...) has to be called first.");
    }

    // Switch to ALL_CLOSED_MODE
    setIndexMode(ALL_CLOSED_MODE);

    // Close the error log of the index
    if (mErrorLogStream != null) {
      mErrorLogWriter.close();
      try {
        mErrorLogStream.close();
      } catch (IOException exc) {
        throw new RegainException("Closing error log file failed", exc);
      }

      mErrorLogWriter = null;
      mErrorLogStream = null;
    }
  }

  /**
   * Creates a breakpoint.
   *
   * @throws RegainException If creating the breakpoint failed.
   */
  public void createBreakpoint() throws RegainException {
    mLog.info("Creating a breakpoint...");
    try {
      mBreakpointProfiler.startMeasuring();

      // Remove the entries that were marked for deletion
      removeObsoleteEntries();

      // Prepare the breakpoint
      prepareBreakpoint();

      // Create a temp directory
      // NOTE: We copy to a temp directory and rename it when we are finished.
      File tempDir = new File(mBreakpointIndexDir.getAbsolutePath() + "_tmp");
      RegainToolkit.deleteDirectory(tempDir);
      tempDir.mkdir();

      // Copy the current working index to the breakpoint directory
      RegainToolkit.copyDirectory(mTempIndexDir, tempDir, false);

      // Delete the old breakpoint if it exists
      deleteOldIndex(mBreakpointIndexDir);

      // Rename the temp directory and let it become the new breakpoint
      if (!tempDir.renameTo(mBreakpointIndexDir)) {
        throw new RegainException("Renaming temporary copy directory failed: " +
                tempDir.getAbsolutePath());
      }

      // Stop measuring
      long breakpointSize = RegainToolkit.getDirectorySize(mBreakpointIndexDir);
      mBreakpointProfiler.stopMeasuring(breakpointSize);
    } catch (RegainException exc) {
      mBreakpointProfiler.abortMeasuring();
      throw exc;
    }
  }

  /**
   * Optimiert und schlie�t den Index
   *
   * @param putIntoQuarantine Gibt an, ob der Index in Quarant�ne soll.
   * @throws RegainException Wenn der Index nicht geschlossen werden konnte.
   */
  public void close(boolean putIntoQuarantine) throws RegainException {
    // Index optimieren
    try {
      setIndexMode(WRITING_MODE);
      mIndexWriter.optimize();
    } catch (IOException exc) {
      throw new RegainException("Finishing IndexWriter failed", exc);
    }

    // Prefetch destinct field values
    String[] prefetchFields = mConfig.getValuePrefetchFields();
    if (prefetchFields != null && prefetchFields.length != 0) {
      String msg = "Prefetching destinct field values for: ";
      for (int i = 0; i < prefetchFields.length; i++) {
        msg += (i != 0 ? ", " : "") + prefetchFields[i];
      }
      mLog.info(msg);

      setIndexMode(READING_MODE);
      RegainToolkit.readFieldValues(mIndexReader, prefetchFields, mTempIndexDir);
    }

    // Prepare the final 'breakpoint'
    // NOTE: This will set the ALL_CLOSED_MODE
    prepareBreakpoint();

    // Ressourcen der DocumentFactory freigeben
    mDocumentFactory.close();

    // Write all terms in the index into a file
    if (mAnalysisDir != null) {
      File termFile = new File(mAnalysisDir.getAbsolutePath() + File.separator + "AllTerms.txt");
      writeTermFile(mTempIndexDir, termFile);
    }

    // Verzeichnis bestimmen, in das der Index kommen soll
    File targetDir;
    if (putIntoQuarantine) {
      targetDir = mQuarantineIndexDir;
    } else {
      targetDir = mNewIndexDir;
    }

    // If there is already the target directory -> delete it
    deleteOldIndex(targetDir);

    // Let the new index become the working index
    // Workaround: Siehe Javadoc von RENAME_TIMEOUT
    long deadline = System.currentTimeMillis() + RENAME_TIMEOUT;
    boolean renameSucceed = false;
    while ((!renameSucceed) && (System.currentTimeMillis() < deadline)) {
      renameSucceed = mTempIndexDir.renameTo(targetDir);
      try {
        Thread.sleep(100);
      } catch (Exception exc) {
      }
    }

    if (renameSucceed) {
      // Delete the last breakpoint if there should be one
      deleteOldIndex(mBreakpointIndexDir);
    } else {
      throw new RegainException("Renaming " + mTempIndexDir + " to " + targetDir + " failed after " + (RENAME_TIMEOUT / 1000) + " seconds!");
    }
  }

  /**
   * Delets an old index directory.
   *
   * @param oldIndexDir The old index directory.
   * @throws RegainException If deleting failed.
   */
  private void deleteOldIndex(File oldIndexDir) throws RegainException {
    if (oldIndexDir.exists()) {
      // We rename it before deletion so there will be no problems when the
      // search mask tries not to switch to the new index during deletion. This
      // case is very unlikely but it may happen once in 100.000 years...
      File secureDir = new File(oldIndexDir.getAbsolutePath() + "_del");
      if (oldIndexDir.renameTo(secureDir)) {
        RegainToolkit.deleteDirectory(secureDir);
      } else {
        throw new RegainException("Deleting old index failed: " +
                oldIndexDir.getAbsolutePath());
      }
    }
  }

  /**
   * Erzeugt eine Datei, die alle Terme (also alle erlaubten Suchtexte) enthält.
   *
   * @param indexDir Das Verzeichnis, in dem der Index steht.
   * @param termFile Der Ort, wo die Datei erstellt werden soll.
   *
   * @throws RegainException Wenn die Erstellung fehlgeschlagen ist.
   */
  private void writeTermFile(File indexDir, File termFile) throws RegainException {
    IndexReader reader = null;
    FileOutputStream stream = null;
    PrintWriter writer = null;
    try {
      reader = IndexReader.open(FSDirectory.open(indexDir), true);

      stream = new FileOutputStream(termFile);
      writer = new PrintWriter(stream);
      writer.println("This file was generated by the crawler and contains all " + "terms in the index.");
      writer.println("It's no error when endings like 'e', 'en', and so on " + "are missing.");
      writer.println("They have been cuttet by the GermanAnalyzer and will be " + "cuttet from a search query too.");
      writer.println();

      // Write the terms
      TermEnum termEnum = reader.terms();
      int termCount;
      if (WRITE_TERMS_SORTED) {
        termCount = writeTermsSorted(termEnum, writer);
      } else {
        termCount = writeTermsSimply(termEnum, writer);
      }

      mLog.info("Wrote " + termCount + " terms into " + termFile.getAbsolutePath());
    } catch (IOException exc) {
      throw new RegainException("Writing term file failed", exc);
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException exc) {
        }
      }
      if (writer != null) {
        writer.close();
      }
      if (stream != null) {
        try {
          stream.close();
        } catch (IOException exc) {
        }
      }
    }
  }

  /**
   * Schreibt die Terme so wie sie vom IndexReader kommen in den Writer.
   * <p>
   * Diese Methode braucht minimale Ressourcen.
   *
   * @param termEnum Die Aufz�hlung mit allen Termen.
   * @param writer Der Writer auf den geschrieben werden soll.
   *
   * @return Die Anzahl der Terme.
   * @throws IOException Wenn das Schreiben fehl schlug.
   */
  private int writeTermsSimply(TermEnum termEnum, PrintWriter writer)
          throws IOException {
    int termCount = 0;
    while (termEnum.next()) {
      Term term = termEnum.term();
      writer.println(term.text());
      termCount++;
    }

    return termCount;
  }

  /**
   * Schreibt die Terme vom IndexReader sortiert in den Writer.
   * <p>
   * Um die Terme sortieren zu können, m�ssen sie zwischengespeichert werden. Falls
   * es zu viele sind, k�nnte das schief gehen. In diesem Fall sollte man auf simples
   * Schreiben umstellen.
   *
   * @param termEnum Die Aufz�hlung mit allen Termen.
   * @param writer Der Writer auf den geschrieben werden soll.
   *
   * @return Die Anzahl der Terme.
   * @throws IOException Wenn das Schreiben fehl schlug.
   */
  private int writeTermsSorted(TermEnum termEnum, PrintWriter writer)
          throws IOException {
    // Put all terms in a list for a later sorting
    ArrayList list = new ArrayList();
    while (termEnum.next()) {
      Term term = termEnum.term();
      list.add(term.text());
    }

    String[] asArr = new String[list.size()];
    list.toArray(asArr);

    // Sort the terms
    Arrays.sort(asArr);

    // Write them to the writer
    for (int i = 0; i < asArr.length; i++) {
      writer.println(asArr[i]);
    }

    return asArr.length;
  }
}
TOP

Related Classes of net.sf.regain.crawler.IndexWriterManager

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.