Package net.sf.regain.crawler.preparator

Source Code of net.sf.regain.crawler.preparator.OpenOfficePreparator

/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004  Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
*  $RCSfile$
*   $Source$
*     $Date: 2008-08-06 16:04:27 +0200 (Mi, 06 Aug 2008) $
*   $Author: thtesche $
* $Revision: 325 $
*/
package net.sf.regain.crawler.preparator;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.CrawlerToolkit;
import net.sf.regain.crawler.document.AbstractPreparator;
import net.sf.regain.crawler.document.RawDocument;

/**
* Preparates OpenOffice, StarOffice and OpenDocument documents.
*
* @author Til Schneider, www.murfman.de
*/
public class OpenOfficePreparator extends AbstractPreparator {

  /**
   * Creates a new instance of OpenOfficePreparator.
   *
   * @throws RegainException If creating the preparator failed.
   */
  public OpenOfficePreparator() throws RegainException {
    super( new String[] {
      "application/vnd.sun.xml.writer",
      "application/vnd.sun.xml.writer.template",
      "application/vnd.sun.xml.writer.global",
      "application/vnd.sun.xml.calc",
      "application/vnd.sun.xml.calc.template",
      "application/vnd.stardivision.calc",
      "application/vnd.sun.xml.impress",
      "application/vnd.sun.xml.impress.template ",
      "application/vnd.stardivision.impress sdd",
      "application/vnd.sun.xml.draw",
      "application/vnd.sun.xml.draw.template",
      "application/vnd.stardivision.draw",
      "application/vnd.sun.xml.math",
      "application/vnd.stardivision.math",
      "application/vnd.oasis.opendocument.text",
      "application/vnd.oasis.opendocument.text-template",
      "application/vnd.oasis.opendocument.text-web",
      "application/vnd.oasis.opendocument.text-master",
      "application/vnd.oasis.opendocument.graphics",
      "application/vnd.oasis.opendocument.graphics-template",
      "application/vnd.oasis.opendocument.presentation",
      "application/vnd.oasis.opendocument.presentation-template",
      "application/vnd.oasis.opendocument.spreadsheet",
      "application/vnd.oasis.opendocument.spreadsheet-template",
      "application/vnd.oasis.opendocument.chart",
      "application/vnd.oasis.opendocument.formula",
      "application/vnd.oasis.opendocument.database",
      "application/vnd.oasis.opendocument.image"});
   
    /*super(new String[] {
        // Writer
        "odt", "oth", "ott", "sdw", "stw", "sxw",

        // Calc
        "ods", "ots", "sdc", "stc", "sxc",

        // Draw
        "otg", "sda", "std",

        // Impress
        "odg", "odp", "otp", "sdd", "sti", "sxd", "sxi",

        // Base
        "odb",

        // Math
        "odf", "sxm",

        // Chart
        "odc", "sds",

        // Image
        "odi",

        // Master
        "odm", "sgl", "sxg",
      });*/
  }


  /**
   * Prepares a document for indexing.
   *
   * @param rawDocument The document to prepare.
   *
   * @throws RegainException If preparing the document failed.
   */
  public void prepare(RawDocument rawDocument) throws RegainException {
    File file = rawDocument.getContentAsFile();
   
    ZipFile zipFile;
    try {
      zipFile = new ZipFile(file);
    }
    catch (IOException exc) {
      throw new RegainException("Opening OpenOffice file failed: " +
          file.getAbsolutePath(), exc);
    }

    // Read the content.xml
    ZipEntry entry = zipFile.getEntry("content.xml");
    InputStream xmlStream = null;
    String content;
    try {
      xmlStream = zipFile.getInputStream(entry);
      content = RegainToolkit.readStringFromStream(xmlStream, "UTF-8");
    }
    catch (IOException exc) {
      throw new RegainException("Reading text from OpenOffice file failed: " +
          file.getAbsolutePath(), exc);
    }
    finally {
      if (xmlStream != null) {
        try { xmlStream.close(); } catch (IOException exc) {}
      }
      try { zipFile.close(); } catch (IOException exc) {}
    }

    // Clean the content from tags
    String cleanedContent = CrawlerToolkit.cleanFromHtmlTags(content);
    setCleanedContent(cleanedContent);
  }

}
TOP

Related Classes of net.sf.regain.crawler.preparator.OpenOfficePreparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.