Package net.sf.regain.crawler.preparator

Source Code of net.sf.regain.crawler.preparator.SwingRtfPreparator

/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004  Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
*  $RCSfile$
*   $Source$
*     $Date: 2009-05-17 21:20:00 +0200 (So, 17 Mai 2009) $
*   $Author: thtesche $
* $Revision: 391 $
*/
package net.sf.regain.crawler.preparator;

import java.io.InputStream;

import javax.swing.text.Document;
import javax.swing.text.rtf.RTFEditorKit;

import net.sf.regain.RegainException;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.RawDocument;

/**
* Präpariert ein RTF-Dokument für die Indizierung. Dazu wird der RTF-Parser
* von Swing genutzt.
* <p>
* Dabei werden die Rohdaten des Dokuments von Formatierungsinformation befreit.
*
* @author Til Schneider, www.murfman.de
*/
public class SwingRtfPreparator extends AbstractPreparator {

  /** Das RTFEditorKit, das zum laden von RTF-Dokumenten verwendet wird. */
  private RTFEditorKit mRTFEditorKit;


  /**
   * Creates a new instance of SwingRtfPreparator.
   *
   * @throws RegainException If creating the preparator failed.
   */
  public SwingRtfPreparator() throws RegainException {
    super("text/rtf");
  }


  /**
   * Präpariert ein Dokument für die Indizierung.
   *
   * @param rawDocument Das zu pr�pariernde Dokument.
   *
   * @throws RegainException Wenn die Pr�paration fehl schlug.
   */
  public void prepare(RawDocument rawDocument) throws RegainException {
    if (mRTFEditorKit == null) {
      mRTFEditorKit = new RTFEditorKit();
    }

    InputStream stream = null;
    try {
      stream = rawDocument.getContentAsStream();
      Document doc = mRTFEditorKit.createDefaultDocument();
      mRTFEditorKit.read(stream, doc, 0);

      String cleanedContent = doc.getText(0, doc.getLength());
      setCleanedContent(cleanedContent);
    }
    catch (Exception exc) {
      throw new RegainException("Reading RTF dokument failed: "
        + rawDocument.getUrl(), exc);
    }
    finally {
      if (stream != null) {
        try { stream.close(); } catch (Exception exc) {}
      }
    }
  }


  /**
   * Frees all resources reserved by the preparator.
   * <p>
   * Is called at the end of the crawler process after all documents were
   * processed.
   *
   * @throws RegainException If freeing the resources failed.
   */
  @Override
  public void close() throws RegainException {
    mRTFEditorKit = null;
  }

}
TOP

Related Classes of net.sf.regain.crawler.preparator.SwingRtfPreparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.