Package net.sf.regain.crawler.preparator

Source Code of net.sf.regain.crawler.preparator.JacobMsExcelPreparator

/*
* regain - A file search engine providing plenty of formats
* Copyright (C) 2004  Til Schneider
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
* Contact: Til Schneider, info@murfman.de
*
* CVS information:
*  $RCSfile$
*   $Source$
*     $Date: 2008-10-25 18:35:21 +0200 (Sa, 25 Okt 2008) $
*   $Author: thtesche $
* $Revision: 349 $
*/
package net.sf.regain.crawler.preparator;

import net.sf.regain.RegainException;
import net.sf.regain.crawler.config.PreparatorConfig;
import net.sf.regain.crawler.document.RawDocument;

import com.jacob.com.ComFailException;
import com.jacob.com.ComThread;
import com.jacob.com.Dispatch;
import com.jacob.com.SafeArray;
import com.jacob.com.Variant;

import de.filiadata.lucene.spider.generated.msoffice2000.excel.*;

/**
* Präpariert ein Microsoft-Excel-Dokument für die Indizierung mit Hilfe der
* <a href="http://danadler.com/jacob/">Jacob-API</a>, wobei
* <a href="http://www.bigatti.it/projects/jacobgen/">Jacobgen</a>
* genutzt wurde, um den Zugriff zu erleichtern.
* <p>
* Dabei werden die Rohdaten des Dokuments von Formatierungsinformation befreit,
* es wird der Titel extrahiert.
*
* @author Til Schneider, www.murfman.de
* @author Reinhard Balling
*/
public class JacobMsExcelPreparator extends AbstractJacobMsOfficePreparator {

  /**
   * Die Excel-Applikation. Ist <code>null</code>, solange noch kein Dokument
   * bearbeitet wurde.
   */
  private Application mExcelApplication;
 

  /**
   * Creates a new instance of JacobMsExcelPreparator.
   *
   * @throws RegainException If creating the preparator failed.
   */
  public JacobMsExcelPreparator() throws RegainException {
    super(new String[] { "xls", "xlt" });
  }

 
  /**
   * Initializes the preparator.
   *
   * @param config The configuration
   * @throws RegainException If the configuration has an error.
   */
  public void init(PreparatorConfig config) throws RegainException {
    // NOTE: This method is not nessesary since it only calls the super method,
    //       but I defined it to ensure that the super call is not forgotten
    //       when there should be a config some day.
    super.init(config);
  }


  /**
   * Wrapper for calling the ActiveX-Method with input-parameter(s).
   *
   * @param row an input-parameter of type int
   * @param col an input-parameter of type int
   * @return the result is of type Range
   */
  public Range getCells(Worksheet sheet, int row, int col) {
    return new Range(Dispatch.call(sheet, "Cells", new Variant(row),
        new Variant(col)).toDispatch());
  }


  /**
   * Präpariert ein Dokument für die Indizierung.
   *
   * @param rawDocument Das zu pr�pariernde Dokument.
   *
   * @throws RegainException Wenn die Pr�paration fehl schlug.
   */
  public void prepare(RawDocument rawDocument) throws RegainException {
    if (mExcelApplication == null) {
      // COM-Thread initialisieren
      ComThread.InitSTA();

      // Neue Excel-Applikation erstellen
      mExcelApplication = new Application();
    }
    //Dispatch.put(mExcelApplication, "Visible", new Variant(true));

    try {
      // Some constants needed later on
      Variant what = new Variant("*");
      Variant lookIn = new Variant(-4163);
      Variant lookAt = new Variant(XlLookAt.xlWhole);
      Variant searchOrderByRows = new Variant(XlSearchOrder.xlByRows);
      Variant searchOrderByCols = new Variant(XlSearchOrder.xlByColumns);
      int searchDirectionUp = XlSearchDirection.xlNext;
      int searchDirectionDown = XlSearchDirection.xlPrevious;
      Variant xlFalse = new Variant(false);   

      // Get all workbooks
      Workbooks wbs = mExcelApplication.getWorkbooks();

      // Open the file
      java.io.File file = rawDocument.getContentAsFile();
      // RB 20051106: Need to add second argument to open call, otherwise Excel
      // can hang here waiting for an answer to the dialogue box asking to
      // update external links
      Workbook wb = wbs.open(file.getAbsolutePath(), new Variant(0));

      // Collect the content of all sheets
      Sheets sheets = wb.getWorksheets();
      int sheetCount = sheets.getCount();

      //System.out.println("Sheetcount = "+sheetCount);
      StringBuffer contentBuf = new StringBuffer(DEFAULT_BUFFER_SIZE);
      for (int sheetIdx = 1; sheetIdx <= sheetCount; sheetIdx++) {
        Variant sheetVariant = (Variant) sheets.getItem(new Variant(sheetIdx));
        Worksheet sheet = new Worksheet(sheetVariant.toDispatch());
        Variant after = new Variant(sheet.getRange(new Variant("IV65536")));
        Range extractRange;
        // Find the top left and bottom right corner enclosing the data in the sheet
        try {
          // The following calls may fail because the find operation returns
          // a null pointer. This exception is caught by reverting to the
          // UsedRange call which also returns the used area in a spreadsheet,
          // even though it is often bigger than necessary.
          int firstRow = sheet.getCells().find(what, after, lookIn, lookAt, searchOrderByRows,
                  searchDirectionUp, xlFalse, xlFalse).getRow();
          int firstCol = sheet.getCells().find(what, after, lookIn, lookAt, searchOrderByCols,
              searchDirectionUp, xlFalse, xlFalse).getColumn();
          int lastRow = sheet.getCells().find(what, after, lookIn, lookAt, searchOrderByRows,
                  searchDirectionDown, xlFalse, xlFalse).getRow();
          int lastCol = sheet.getCells().find(what, after, lookIn, lookAt, searchOrderByCols,
                  searchDirectionDown, xlFalse, xlFalse).getColumn();
          extractRange = sheet.getRange(new Variant(getCells(sheet, firstRow, firstCol)),
                                        new Variant(getCells(sheet, lastRow, lastCol)));
          //System.out.println("Sheet "+sheetIdx+"   ProperUsedRange="+extractRange.getAddress());       
        } catch(NullPointerException e) {
          extractRange = sheet.getUsedRange();
          //System.out.println("Sheet "+sheetIdx+"   UsedRange="+extractRange.getAddress());       
        }
        //RB 20051106: If the sheet is empty, getUsedRange returns $A$1 and
        // toSafeArray fails. Therefore we have to check that there is some text
        // to extract by comparing the address of the range to the string "$A$1".
        if (!extractRange.getAddress().equals("$A$1")) {
          SafeArray cellArray = extractRange.getValue().toSafeArray();

          int startRow = cellArray.getLBound(1);
          int startCol = cellArray.getLBound(2);
          int endRow   = cellArray.getUBound(1);
          int endCol   = cellArray.getUBound(2);
 
          for (int row = startRow; row <= endRow; row++) {
            for (int col = startCol; col <= endCol; col++) {
              String cellValue = cellArray.getString(row, col);
              if ((cellValue != null) && (cellValue.length() != 0)) {
                contentBuf.append(cellValue);
                contentBuf.append(" ");
              }
            }
            contentBuf.append("\n");
          }
        }
      }
     
      // Read the document properties
      readProperties(wb);
     
      // Set the content
      setCleanedContent(contentBuf.toString());

      // Close the workbook without saving
      wb.close(new Variant(false));
    }
    catch (ComFailException exc) {
      throw new RegainException("Using COM failed.", exc);
    }
  }


  /**
   * Frees all resources reserved by the preparator.
   * <p>
   * Is called at the end of the crawler process after all documents were
   * processed.
   *
   * @throws RegainException If freeing the resources failed.
   */
  public void close() throws RegainException {
    if (mExcelApplication != null) {
      // Excel schlie�en
      try {
        mExcelApplication.quit();
      }
      catch (Throwable thr) {
        throw new RegainException("Using COM failed.", thr);
      }
      finally {
        // Alle Ressourcen des COM-Threads freigeben
        ComThread.Release();
      }
    }
  }

}
TOP

Related Classes of net.sf.regain.crawler.preparator.JacobMsExcelPreparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.