Source Code of net.sf.regain.crawler.document.DocumentFactory

/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004  Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
*  $RCSfile$
*   $Source$
*     $Date: 2010-11-07 17:03:46 +0100 (So, 07 Nov 2010) $
*   $Author: thtesche $
* $Revision: 467 $
*/
package net.sf.regain.crawler.document;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;

import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.ErrorLogger;
import net.sf.regain.crawler.Profiler;
import net.sf.regain.crawler.access.CrawlerAccessController;
import net.sf.regain.crawler.config.AuxiliaryField;
import net.sf.regain.crawler.config.CrawlerConfig;
import net.sf.regain.crawler.config.PreparatorSettings;

import org.apache.log4j.Logger;
import java.io.FileInputStream;
import java.io.StringReader;
import java.util.ArrayList;
import net.sf.regain.util.io.PathFilenamePair;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.CompressionTools;
import org.apache.lucene.document.DateTools;
import org.semanticdesktop.aperture.mime.identifier.MimeTypeIdentifier;
import org.semanticdesktop.aperture.mime.identifier.magic.MagicMimeTypeIdentifierFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;
import org.ontoware.rdf2go.model.node.impl.URIImpl;

/**
 * Factory that creates a Lucene {@link Document} from the URL and the raw data
 * of a document. The resulting document contains only the document's text,
 * cleaned of all formatting, together with its URL and its title.
 *
 * @see Document
 * @author Til Schneider, www.murfman.de
 */
public class DocumentFactory {

  /** The logger for this class */
  private static Logger mLog = Logger.getLogger(DocumentFactory.class);

  /** The crawler config. */
  private CrawlerConfig mConfig;

  /** The maximum number of characters copied from the content to the summary. */
  private int mMaxSummaryLength;
 
  /** Whether the whole content should be stored in the index for a preview on the result page. */
  private boolean storeContentForPreview;
 
  /**
   * The directory in which analysis files should be created. Is
   * <CODE>null</CODE> if no analysis files should be created.
   */
  private File mAnalysisDir = null;

  /** The preparators. */
  private Preparator[] mPreparatorArr;

  /** The profilers that measure the processing performed by the preparators. */
  private Profiler[] mPreparatorProfilerArr;
 
  /**
   * The {@link CrawlerAccessController} to use for identifying the groups that
   * are allowed to read a document. May be <code>null</code>.
   */
  private CrawlerAccessController mCrawlerAccessController;

  /**
   * The regular expressions a document's URL must match so that the text of
   * the link pointing to the document is used as the document title instead
   * of the real document title.
   */
  private RE[] mUseLinkTextAsTitleReArr;

  /** The profiler that measures the writing of analysis files. */
  private Profiler mWriteAnalysisProfiler
    = new Profiler("Writing Analysis files", "files");

  /** The MIME type identifier. */
  MimeTypeIdentifier mimeTypeIdentifier;

  /**
   * Creates a new instance of DocumentFactory.
   *
   * @param config The crawler configuration.
   * @param analysisDir The directory where to store the analysis files. Is
   *        <code>null</code> if no analysis files should be created.
   *
   * @throws RegainException If a preparator could not be created or if a regex
   *         has a syntax error.
   */
  public DocumentFactory(CrawlerConfig config, File analysisDir)
    throws RegainException
  {
    mConfig = config;
    mAnalysisDir = analysisDir;

    // Create the preparators
    try {
      PreparatorSettings[] prepConf = config.getPreparatorSettingsList();
      mPreparatorArr = PreparatorFactory.getInstance().createPreparatorArr(prepConf);
    }
    catch (RegainException exc) {
      throw new RegainException("Creating the document preparators failed", exc);
    }

    // Create a profiler for each preparator
    mPreparatorProfilerArr = new Profiler[mPreparatorArr.length];
    for (int i = 0; i < mPreparatorProfilerArr.length; i++) {
      String name = mPreparatorArr[i].getClass().getName();
      mPreparatorProfilerArr[i] = new Profiler("Preparator " + name, "docs");
    }

    // Create the CrawlerAccessController
    String accessClass = config.getCrawlerAccessControllerClass();
    if (accessClass != null) {
      String accessJar = config.getCrawlerAccessControllerJar();
      mCrawlerAccessController = (CrawlerAccessController)
        RegainToolkit.createClassInstance(accessClass, CrawlerAccessController.class,
                                          accessJar);
     
      Properties accessControllerConfig = config.getCrawlerAccessControllerConfig();
      if (accessControllerConfig == null) {
        accessControllerConfig = new Properties();
      }
      mCrawlerAccessController.init(accessControllerConfig);
     
      mLog.info("Using crawler access controller: " + accessClass);
    }

    // Create the mUseLinkTextAsTitleReArr
    String[] useLinkTextAsTitleRegexArr = config.getUseLinkTextAsTitleRegexList();
    if (useLinkTextAsTitleRegexArr == null) {
      mUseLinkTextAsTitleReArr = new RE[0];
    } else {
      mUseLinkTextAsTitleReArr = new RE[useLinkTextAsTitleRegexArr.length];
      for (int i = 0; i < useLinkTextAsTitleRegexArr.length; i++) {
        try {
          mUseLinkTextAsTitleReArr[i] = new RE(useLinkTextAsTitleRegexArr[i]);
        }
        catch (RESyntaxException exc) {
          throw new RegainException("Regular expression of "
            + "use-link-text-as-title-pattern #" + i + " has wrong syntax '"
            + useLinkTextAsTitleRegexArr[i] + "'", exc);
        }
      }
    }
    // Read some more configuration entries from the config
    this.mMaxSummaryLength = this.mConfig.getMaxSummaryLength();
    this.storeContentForPreview = this.mConfig.getStoreContentForPreview();

    // Set up the MimeTypeIdentifierFactory
    MagicMimeTypeIdentifierFactory factory = new MagicMimeTypeIdentifierFactory();
    mimeTypeIdentifier = factory.get();

  }


  /**
   * Creates a lucene {@link Document} from a {@link RawDocument}.
   *
   * @param rawDocument The raw document.
   * @param errorLogger The error logger to use for logging errors.
   *
   * @return The lucene document with the prepared data or <code>null</code> if
   *         the document couldn't be created.
   */
  public Document createDocument(RawDocument rawDocument, ErrorLogger errorLogger) {

    // Determine the mime-type
    String mimeType;
    try {
      File file = rawDocument.getContentAsFile();
      if (file.canRead() == false) {
        mLog.warn("canRead() on file return: false. Maybe no access rights for sourceURL: " +
          RegainToolkit.fileToUrl(file));
        return null;
      }

      FileInputStream fis = new FileInputStream(file);
      byte[] bytes = new byte[mimeTypeIdentifier.getMinArrayLength()];
      // Read the magic bytes for MIME type detection and close the stream again
      try { fis.read(bytes); } finally { fis.close(); }
      mimeType = mimeTypeIdentifier.identify(bytes, file.getPath(),
              new URIImpl(rawDocument.getUrl(),false)) ;
      if (mimeType == null || mimeType.length() == 0) {
        mimeType = "application/x-unknown-mime-type";
      }

      mLog.debug("Detected mimetype cylcle 1: " + mimeType + ". " + rawDocument.getUrl());
      if (mimeType.equalsIgnoreCase("application/zip")) {
        // some new files like MS Office documents are zip files
        // so rewrite the URL for the correct mimetype detection
        mimeType = mimeTypeIdentifier.identify(bytes, null,
                new URIImpl("zip:mime:file:" + rawDocument.getUrl()));
        mLog.debug("Detected mimetype cylcle 2: " + mimeType + ". " + "zip:mime:file:" + rawDocument.getUrl());
      }
    } catch (Exception exc) {
      errorLogger.logError("Determine mime-type of " + rawDocument.getUrl() +
                              " failed", exc, false);
      mimeType = "application/x-unknown-mime-type";
    }

    rawDocument.setMimeType( mimeType );
   
    // Find the preparator that will prepare this URL
    Document doc = null;
    boolean preparatorFound = false;
    ArrayList<Integer> matchingPreparators = new ArrayList<Integer>();
    for (int i = 0; i < mPreparatorArr.length; i++) {
      if (mPreparatorArr[i].accepts(rawDocument)) {
        // This preparator can prepare this URL
        preparatorFound = true;
        matchingPreparators.add(new Integer(i));
        if (mLog.isDebugEnabled()) {
          mLog.debug("Found: " + mPreparatorArr[i].getClass().getSimpleName() +
                   ", Prio: " + mPreparatorArr[i].getPriority() );
        }
      }
    }
   
    if (preparatorFound) {
      // Find the preparator with the highest priority
      Iterator prepIdxIter = matchingPreparators.iterator();
      int highestPriorityIdx = ((Integer)prepIdxIter.next()).intValue();
      // In case more than one preparator matches, find the one with the highest priority
      while( prepIdxIter.hasNext() ) {
        int currI = ((Integer)prepIdxIter.next()).intValue();
        if( mPreparatorArr[currI].getPriority() > mPreparatorArr[highestPriorityIdx].getPriority() )
          highestPriorityIdx = currI;
      }
     
      try {
        doc = createDocument(mPreparatorArr[highestPriorityIdx], mPreparatorProfilerArr[highestPriorityIdx], rawDocument);
        mLog.info("Preparation with " + mPreparatorArr[highestPriorityIdx].getClass().getSimpleName() +
                " done: " + rawDocument.getUrl());
      }
      catch (RegainException exc) {
        errorLogger.logError("Preparing " + rawDocument.getUrl() +
            " with preparator " + mPreparatorArr[highestPriorityIdx].getClass().getName() +
            " failed", exc, false);
      }
    } else {
      mLog.info("No preparator feels responsible for " + rawDocument.getUrl());
    }
   
    if (preparatorFound && (doc == null)) {
      // There were preparators that felt responsible for the document, but they
      // weren't able to process it
      // -> Create a substitute document so the same document is not retried
      //    on the next crawler run
      try {
        doc = createSubstituteDocument(rawDocument);
        mLog.info("Created substitute document: " + rawDocument.getUrl());
      }
      catch (RegainException exc) {
        errorLogger.logError("Creating substitute document for "
            + rawDocument.getUrl() + " failed", exc, false);
      }
    }

    // return the document
    return doc;
  }

 
  /**
   * Creates a lucene {@link Document} from a {@link RawDocument} using a
   * certain Preparator.
   *
   * @param preparator The preparator to use.
   * @param preparatorProfiler The profile of the preparator.
   * @param rawDocument The raw document.
   * @return The lucene document with the prepared data.
   * @throws RegainException If creating the document failed.
   */
  private Document createDocument(Preparator preparator, Profiler preparatorProfiler,
    RawDocument rawDocument)
    throws RegainException
  {
    String url = rawDocument.getUrl();
   
    // Extract the file type specific information
    String cleanedContent;
    String title;
    String summary;
    String metadata;
    String headlines;
    PathElement[] path;
    Map additionalFieldMap;
    if (mLog.isDebugEnabled()) {
      mLog.debug("Using preparator " + preparator.getClass().getName()
        + " for " + rawDocument + ", " + rawDocument.getMimeType());
    }
    preparatorProfiler.startMeasuring();
    try {
      preparator.prepare(rawDocument);

      cleanedContent     = preparator.getCleanedContent();
      title              = preparator.getTitle();
      summary            = preparator.getSummary();
      metadata           = preparator.getCleanedMetaData();
      headlines          = preparator.getHeadlines();
      path               = preparator.getPath();
      additionalFieldMap = preparator.getAdditionalFields();

      preparator.cleanUp();

      preparatorProfiler.stopMeasuring(rawDocument.getLength());
    }
    catch (Throwable thr) {
      preparatorProfiler.abortMeasuring();
      throw new RegainException("Preparing " + url
        + " with preparator " + preparator.getClass().getName() + " failed", thr);
    }

    // Check the mandatory information
    if (cleanedContent == null) {
      throw new RegainException("Preparator " + preparator.getClass().getName()
        + " did not extract the content of " + url);
    }

    // Preparation succeeded -> Create the document
    Document doc = createDocument(rawDocument, cleanedContent, title,
                                  summary, metadata, headlines, path, additionalFieldMap);

    // return the document
    return doc;
  }


  /**
   * Creates a substitute lucene {@link Document} for a {@link RawDocument}.
   * <p>
   * Substitute documents have no "content" field, but a "preparation-error"
   * field. They are added to the index if preparation failed. This way at least
   * the URL can be searched, and subsequent crawler runs are much faster because
   * a previously failed document is not retried.
   *
   * @param rawDocument The document to create the substitute document for.
   * @return The substitute document.
   * @throws RegainException If the user groups that are allowed to read this
   *         document couldn't be determined.
   */
  private Document createSubstituteDocument(RawDocument rawDocument)
    throws RegainException
  {
    return createDocument(rawDocument, null, null, null, null, null, null, null);
  }

 
  /**
   * Create a lucene {@link Document}.
   *
   * @param rawDocument The raw document to create the lucene {@link Document}
   *        for.
   * @param cleanedContent The content of the document. (May be null, if the
   *        content couldn't be extracted. In this case a substitute document is
   *        created)
   * @param title The title. May be null.
   * @param summary The summary. May be null.
   * @param metadata The cleaned meta data. May be null.
   * @param headlines The headlines. May be null.
   * @param path The path to the document. May be null.
   * @param additionalFieldMap The additional fields provided by the preparator.
   * @return The lucene {@link Document}.
   *
   * @throws RegainException If the user groups that are allowed to read this
   *         document couldn't be determined.
   */
  private Document createDocument(RawDocument rawDocument, String cleanedContent,
    String title, String summary, String metadata, String headlines, PathElement[] path,
    Map additionalFieldMap)
    throws RegainException
  {
    String url = rawDocument.getUrl();

    // Create a new, empty document
    Document doc = new Document();
   
    // Create the auxiliary fields
    // NOTE: We do this first, because if someone defined an auxiliary field
    //       having the same name as a normal field, then that field will be
    //       overridden by the normal field. This way we can be sure that the
    //       normal fields have the values we expect.
    AuxiliaryField[] auxiliaryFieldArr = mConfig.getAuxiliaryFieldList();
    if (auxiliaryFieldArr != null) {
      for (int i = 0; i < auxiliaryFieldArr.length; i++) {
        RE regex = auxiliaryFieldArr[i].getUrlRegex();
        if (regex.match(url)) {
          String fieldName = auxiliaryFieldArr[i].getFieldName();

          String value = auxiliaryFieldArr[i].getValue();
          if (value == null) {
            // We have no value set -> Extract the value from the regex
            value = regex.getParen(auxiliaryFieldArr[i].getUrlRegexGroup());
          }

          if (value != null) {
            if (auxiliaryFieldArr[i].getToLowerCase()) {
              value = value.toLowerCase();
            }

            if (mLog.isDebugEnabled()) {
              mLog.debug("Adding auxiliary field: " + fieldName + "=" + value);
            }
            boolean store = auxiliaryFieldArr[i].isStored();
            boolean index = auxiliaryFieldArr[i].isIndexed();
            boolean token = auxiliaryFieldArr[i].isTokenized();

            doc.add(new Field(fieldName, value,
                store ? Field.Store.YES : Field.Store.NO,
                index ? (token ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO));
          }
        }
      }
    }
   
    // Add the groups of the document
    if (mCrawlerAccessController != null) {
      String[] groupArr = mCrawlerAccessController.getDocumentGroups(rawDocument);
     
      // Check the Group array
      RegainToolkit.checkGroupArray(mCrawlerAccessController, groupArr);

      // Add the field
      // NOTE: The field "groups" is tokenized, but not stemmed.
      //       See: RegainToolkit.WrapperAnalyzer
      Iterator groupIter = Arrays.asList(groupArr).iterator();
      StringBuilder tokenBuilder = new StringBuilder();
      while (groupIter.hasNext()) {
        tokenBuilder.append((String) groupIter.next());
        tokenBuilder.append(" ");
      }
   
      //doc.add(new Field("groups", new IteratorTokenStream(groupIter)));
      doc.add(new Field("groups", new WhitespaceTokenizer(new StringReader(tokenBuilder.toString()))));
    }

    // Add the URL of the document
    doc.add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
   
    // Add the file name (without protocol, drive-letter and path)
    String filenameWithVariants = RegainToolkit.urlToWhitespacedFileName(url);
    doc.add(new Field("filename", new WhitespaceTokenizer(new StringReader(filenameWithVariants))));
    PathFilenamePair pfPair = RegainToolkit.fragmentUrl(url);

    // Add the filename field for sorting
    doc.add(new Field("filename_sort", pfPair.getFilename(), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the document's size
    int size = rawDocument.getLength();
    doc.add(new Field("size", Integer.toString(size), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the mime-type
    String mimeType = rawDocument.getMimeType();
    doc.add(new Field("mimetype", mimeType, Field.Store.YES, Field.Index.NOT_ANALYZED));
   
    // Add last modified
    Date lastModified = rawDocument.getLastModified();
    if (lastModified == null) {
      // We don't know when the document was last modified
      // -> Take the current time
      lastModified = new Date();
    }
    doc.add(new Field("last-modified",
      DateTools.dateToString(lastModified, DateTools.Resolution.DAY), Field.Store.YES,
        Field.Index.NOT_ANALYZED));

    // Write the raw content to an analysis file
    writeContentAnalysisFile(rawDocument);
   
    // Add the additional fields
    if (additionalFieldMap != null) {
      Iterator iter = additionalFieldMap.keySet().iterator();
      while (iter.hasNext()) {
        String fieldName = (String) iter.next();
        String fieldValue = (String) additionalFieldMap.get(fieldName);
        //doc.add(new Field(fieldName, fieldValue, Field.Store.COMPRESS, Field.Index.ANALYZED));
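        // NOTE: The field is added twice: once analyzed but not stored (so it can
        //       be searched) and once stored in compressed form (so the original
        //       value can be retrieved from the index).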
        doc.add(new Field(fieldName, fieldValue, Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field(fieldName, CompressionTools.compressString(fieldValue), Field.Store.YES));
      }
    }

    if (hasContent(cleanedContent)) {
      // Write the clean content to an analysis file
      writeAnalysisFile(url, "clean", cleanedContent);

      // Add the cleaned content of the document
      doc.add(new Field("content", cleanedContent,
        this.storeContentForPreview ? Field.Store.YES : Field.Store.NO, Field.Index.ANALYZED));
    } else {
      // We have no content! This is a substitute document
      // -> Add a "preparation-error"-field
      doc.add(new Field("preparation-error", "true", Field.Store.YES,
          Field.Index.NO));
    }

    // Check whether to use the link text as title
    for (int i = 0; i < mUseLinkTextAsTitleReArr.length; i++) {
      if (mUseLinkTextAsTitleReArr[i].match(url)) {
        String linkText = rawDocument.getSourceLinkText();
        if (linkText != null) {
          title = linkText;
        }
        break;
      }
    }

    // Add the document's title
    if (hasContent(title)) {
      doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
      doc.add(new Field("title_sort", title.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    } else {
      doc.add(new Field("title_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    }

    // Add the document's summary
    if (! hasContent(summary) && hasContent(cleanedContent)) {
      summary = createSummaryFromContent(cleanedContent);
    }
    if (hasContent(summary)) {
      doc.add(new Field("summary", summary, Field.Store.NO, Field.Index.ANALYZED));
      doc.add(new Field("summary", CompressionTools.compressString(summary), Field.Store.YES));
    }

    // Add the document's metadata
    if (hasContent(metadata)) {
      doc.add(new Field("metadata", metadata, Field.Store.YES, Field.Index.ANALYZED));
    }

    // Add the document's headlines
    if (hasContent(headlines)) {
      doc.add(new Field("headlines", headlines, Field.Store.NO,
          Field.Index.ANALYZED));
    }

    // Add the document's path
    if (pfPair.getPath() != null) {
      //String asString = pathToString(path);
      doc.add(new Field("path", pfPair.getPath(), Field.Store.YES, Field.Index.NO));
      doc.add(new Field("path_sort", pfPair.getPath().toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED));

      // Write the path to an analysis file
      writeAnalysisFile(url, "path", pfPair.getPath());
    } else {
      doc.add(new Field("path_sort", "", Field.Store.YES, Field.Index.NOT_ANALYZED));
    }

    return doc;
  }


  /**
   * Returns whether the string has content, i.e. whether it is neither
   * <code>null</code> nor an empty string.
   *
   * @param str The string to check.
   * @return Whether the string has content.
   */
  private boolean hasContent(String str) {
    return (str != null) && (str.length() != 0);
  }



  /**
   * Creates a summary from the content of a document.
   * <p>
   * If no summary can be created, <code>null</code> is returned.
   *
   * @param content The content to create the summary for.
   * @return A summary of the document, or <code>null</code> if none could be
   *         created.
   */
  private String createSummaryFromContent(String content) {
   
    if( content.length() > mMaxSummaryLength ) {
      // cut the content only if it exceeds the max size for the summary
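      // Example: with mMaxSummaryLength == 250, a 1000 character content is cut
      // at the last space at or before index 250 and "..." is appended.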
      int lastSpacePos = content.lastIndexOf(' ', mMaxSummaryLength);

      if (lastSpacePos == -1) {
        return null;
      } else {
        return content.substring(0, lastSpacePos) + "...";
      }
    } else {
      return content;
    }
  }



  /**
   * Converts a path into a String.
   *
   * @param path The path.
   * @return The path as a String.
   */
  private String pathToString(PathElement[] path) {
    StringBuilder builder = new StringBuilder();

    for (int i = 0; i < path.length; i++) {
      builder.append(path[i].getUrl());
      builder.append(' ');
      builder.append(path[i].getTitle());
      builder.append('\n');
    }

    return builder.toString();
  }


  /**
   * Writes an analysis file with the content of the raw document.
   *
   * @param rawDocument The raw document to write.
   */
  private void writeContentAnalysisFile(RawDocument rawDocument) {
    if (mAnalysisDir == null) {
      // Analysis is disabled -> nothing to do
      return;
    }

    File file = getAnalysisFile(rawDocument.getUrl(), null);
    mWriteAnalysisProfiler.startMeasuring();
    try {
      rawDocument.writeToFile(file);
      mWriteAnalysisProfiler.stopMeasuring(rawDocument.getLength());
    }
    catch (RegainException exc) {
      mWriteAnalysisProfiler.abortMeasuring();
      mLog.error("Writing analysis file failed", exc);
    }
  }


  /**
   * Writes an analysis file.
   * <p>
   * An analysis file contains the document's data at each intermediate step of
   * the preparation. It helps to check the quality of the index creation and is
   * created in a subdirectory of the index directory.
   *
   * @param url The URL of the document.
   * @param extension The extension the analysis file should get.
   * @param content The content to write into the file.
   */
  public void writeAnalysisFile(String url, String extension, String content) {
    if (mAnalysisDir == null) {
      // Analysis is disabled -> nothing to do
      return;
    }

    File file = getAnalysisFile(url, extension);

    if (content == null) {
      throw new NullPointerException("Content for analysis file is null: "
        + file.getAbsolutePath());
    }

    mWriteAnalysisProfiler.startMeasuring();

    FileOutputStream stream = null;
    OutputStreamWriter writer = null;
    try {
      stream = new FileOutputStream(file);
      writer = new OutputStreamWriter(stream);

      writer.write(content);

      mWriteAnalysisProfiler.stopMeasuring(content.length());
    }
    catch (IOException exc) {
      mWriteAnalysisProfiler.abortMeasuring();
      mLog.error("Writing analysis file failed", exc);
    }
    finally {
      if (writer != null) {
        try { writer.close(); } catch (IOException exc) {}
      }
      if (stream != null) {
        try { stream.close(); } catch (IOException exc) {}
      }
    }
  }



  /**
   * Builds the file name of an analysis file.
   *
   * @param url The URL of the document.
   * @param extension The extension the analysis file should get.
   *
   * @return The analysis file.
   */
  private File getAnalysisFile(String url, String extension) {
    // Cut the protocol
    if (url.startsWith("http://") || url.startsWith("file://")) {
      url = url.substring(7);
    }

    url = RegainToolkit.replace(url, ":", "_");
    url = RegainToolkit.replace(url, "/", "_");

    if (extension == null) {
      return new File(mAnalysisDir, url);
    } else {
      return new File(mAnalysisDir, url + "." + extension);
    }
  }


  /**
   * Releases all resources that were used by the preparators.
   * <p>
   * Called at the very end of the crawler process, after all documents have
   * been processed.
   */
  public void close() {
    for (int i = 0; i < mPreparatorArr.length; i++) {
      mLog.info("Closing preparator " + mPreparatorArr[i].getClass().getName());
      try {
        mPreparatorArr[i].close();
      }
      catch (Throwable thr) {
        mLog.error("Closing preparator failed: "
          + mPreparatorArr[i].getClass().getName(), thr);
      }
    }

    // Ensure that no call of createDocument(RawDocument) is possible any more
    mPreparatorArr = null;
  }

}
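
Usage note: the factory is normally driven by the regain crawler itself. The following minimal sketch shows roughly how a caller could use it, assuming a CrawlerConfig, a RawDocument, an ErrorLogger and a Lucene IndexWriter instance are already available (the wiring shown here is hypothetical, not the crawler's actual code):

// Minimal usage sketch (hypothetical wiring; RegainException handling omitted).
// Passing null as analysisDir disables the creation of analysis files.
DocumentFactory factory = new DocumentFactory(config, null);

Document doc = factory.createDocument(rawDocument, errorLogger);
if (doc != null) {
  // createDocument() returns null if no document could be created
  indexWriter.addDocument(doc);
}

// Release all preparator resources once crawling is finished
factory.close();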