Package net.sf.regain.crawler.preparator

Source Code of net.sf.regain.crawler.preparator.AbstractJacobMsOfficePreparator

/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004  Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
*  $RCSfile$
*   $Source$
*     $Date: 2008-10-25 18:35:21 +0200 (Sa, 25 Okt 2008) $
*   $Author: thtesche $
* $Revision: 349 $
*/
package net.sf.regain.crawler.preparator;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import net.sf.regain.RegainException;
import net.sf.regain.RegainToolkit;
import net.sf.regain.crawler.config.PreparatorConfig;
import net.sf.regain.crawler.document.AbstractPreparator;

import com.jacob.com.Dispatch;
import com.jacob.com.Variant;

/**
*
* @author Tilman Schneider, STZ-IDA an der FH Karlsruhe
*/
public abstract class AbstractJacobMsOfficePreparator extends AbstractPreparator {
 
  /** The properties that should be extracted. */
  private String[] mWantedPropertiesArr;
 
  /**
   * Holds the document properties that may be extracted from a word document.
   * (key: The property name (String), value: The property constant (Variant))
   */
  private HashMap mPropertyMap;


  /**
   * Creates a new instance of JacobMsWordPreparator.
   *
   * @param extentionArr The file extensions a URL must have one to be accepted
   *        by this preparator.
   * @throws RegainException If creating the preparator failed.
   */
  public AbstractJacobMsOfficePreparator(String[] extentionArr)
    throws RegainException
  {
    super(extentionArr);
   
    // NOTE: See: http://mypage.bluewin.ch/reprobst/WordFAQ/DokEigen.htm#DokEigen04
    mPropertyMap = new HashMap();
    mPropertyMap.put("propTitle",       new Variant(1))// german: Titel
    mPropertyMap.put("subject",         new Variant(2))// german: Thema
    mPropertyMap.put("author",          new Variant(3))// german: Autor
    mPropertyMap.put("keywords",        new Variant(4))// german: Stichwoerter
    mPropertyMap.put("comments",        new Variant(5))// german: Kommentar
    mPropertyMap.put("template",        new Variant(6))// german: Vorlage
    mPropertyMap.put("lastAuthor",      new Variant(7))// german: Zuletzt gespeichert von
    mPropertyMap.put("revision",        new Variant(8))// german: Version
    // mPropertyMap.put("appName",      new Variant(9));  // N/A
    mPropertyMap.put("timeLastPrinted", new Variant(10)); // german: Gedruckt am
    mPropertyMap.put("timeCreated",     new Variant(11)); // german: Erstellt am
    mPropertyMap.put("timeLastSaved",   new Variant(12)); // german: Geaendert am
    mPropertyMap.put("totalEditTime",   new Variant(13)); // german: Gesamtbearbeitungszeit
    mPropertyMap.put("pages",           new Variant(14)); // german: Seiten
    mPropertyMap.put("words",           new Variant(15)); // german: Woerter
    mPropertyMap.put("characters",      new Variant(16)); // german: Zeichen (ohne Leerzeichen)
    mPropertyMap.put("security",        new Variant(17)); // german: Dokumentenschutz
    mPropertyMap.put("category",        new Variant(18)); // german: Kategorie
    // mPropertyMap.put("format",       new Variant(19)); // N/A
    mPropertyMap.put("manager",         new Variant(20)); // german: Manager
    mPropertyMap.put("company",         new Variant(21)); // german: Firma
    mPropertyMap.put("bytes",           new Variant(22)); // german: Bytes
    mPropertyMap.put("lines",           new Variant(23)); // german: Zeilen
    mPropertyMap.put("paras",           new Variant(24)); // german: Absätze
    mPropertyMap.put("slides",          new Variant(25)); // N/A (MS PowerPoint)
    mPropertyMap.put("notes",           new Variant(26)); // N/A (MS PowerPoint)
    mPropertyMap.put("hiddenSlides",    new Variant(27)); // N/A (MS PowerPoint)
    mPropertyMap.put("mmClips",         new Variant(28)); // N/A (MS PowerPoint)
    mPropertyMap.put("hyperlinkBase",   new Variant(29)); // german: Hyperlinkbasis
    mPropertyMap.put("charsWSpaces",    new Variant(30)); // german: Buchstaben (mit Leerzeichen)
  }


  /**
   * Initializes the preparator.
   *
   * @param config The configuration.
   * @throws RegainException If the configuration has an error.
   */
  public void init(PreparatorConfig config) throws RegainException {
    Map main = config.getSectionWithName("main");
    if (main != null) {
      String properties = (String) main.get("properties");
      if (properties != null) {
        mWantedPropertiesArr = RegainToolkit.splitString(properties, ";", true);
       
        // Check the properties
        for (int i = 0; i < mWantedPropertiesArr.length; i++) {
          if (mPropertyMap.get(mWantedPropertiesArr[i]) == null) {
            // This propery does not exist -> Show an error that lists the
            // properties
            StringBuffer possProp = new StringBuffer();
            Iterator iter = mPropertyMap.keySet().iterator();
            while (iter.hasNext()) {
              String property = (String) iter.next();
              if (possProp.length() > 0) {
                possProp.append(", ");
              }
              possProp.append(property);
            }
           
            throw new RegainException("MS Word property '"
                + mWantedPropertiesArr[i] + "' does not exist. Possible "
                + "properties are: " + possProp.toString());
          }
        }
      }
    }
  }

 
  /**
   * Reads the configured document properties from a MS Office document.
   *
   * @param document The document to read the properties from.
   */
  protected void readProperties(Dispatch document) {
    // Read the document properties
    // NOTE: VB-Code: Autor = ActiveDocument.BuiltInDocumentProperties(wdPropertyAuthor).Value
    if (mWantedPropertiesArr != null) {
      for (int i = 0; i < mWantedPropertiesArr.length; i++) {
        // NOTE: We should always get a propertyConstant here since we checked
        //       it in the readConfig method.
        String propertyName = mWantedPropertiesArr[i];
        Variant propertyConstant = (Variant) mPropertyMap.get(propertyName);
        Object property = Dispatch.call(document, "BuiltInDocumentProperties", propertyConstant).getDispatch();
        String value = Dispatch.get(property, "Value").toString();
       
        addAdditionalField(propertyName, value);
      }
    }
  }
 
}
TOP

Related Classes of net.sf.regain.crawler.preparator.AbstractJacobMsOfficePreparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact coftware#gmail.com.