Package com.google.code.ftspc.lector.parsers.POI

Source Code of com.google.code.ftspc.lector.parsers.POI.DocParser

package com.google.code.ftspc.lector.parsers.POI;

import com.google.code.ftspc.lector.indexers.AddDataToIndex;
import com.google.code.ftspc.lector.ini_and_vars.Vars;
import com.google.code.ftspc.lector.parsers.Parser;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
* Class for the DOC parser
* @author Arthur Khusnutdinov
*/
public class DocParser extends Thread implements Parser {

    private String pathToFile;
    private String fileName;

    @Override
    public void run() {
        InputStream isr = null;
        try {
            isr = new FileInputStream(pathToFile);
            WordExtractor word = new WordExtractor(isr);
            String fileContent = "";
            String[] paragraphes = word.getParagraphText();
            for (String paragraph : paragraphes) {
                fileContent += " " + paragraph;
            }
            AddDataToIndex AddDataToIndex = new AddDataToIndex(null);
            AddDataToIndex.doAddData(fileContent, pathToFile, fileName);
            Vars.current_run_indexes--;
           
        } catch (OldWordFileFormatException ex) {
            parseWord6(pathToFile);
        } catch (Exception ex) {
            Vars.current_run_indexes--;
            Vars.logger.fatal(ex);
        } finally {
            try {
                isr.close();
            } catch (IOException ex) {
                Vars.logger.fatal(ex);
            }
        }
    }

    /**
     * Check this!!!
     * @param pathToFile
     * @deprecated
     */
    @Deprecated
    private void parseWord6(String pathToFile) {
        FileInputStream fis = null;
        try {
            File docFile = new File(pathToFile);
            fis = new FileInputStream(docFile.getAbsolutePath());
            POIFSFileSystem pfs = new POIFSFileSystem(fis);
            HWPFOldDocument doc = new HWPFOldDocument(pfs);
            Word6Extractor docExtractor = new Word6Extractor(doc);
           
            String fileContent = "";
            String[] paragraphes = docExtractor.getParagraphText();
            for (String paragraph : paragraphes) {
                fileContent += " " + paragraph;
            }
            AddDataToIndex AddDataToIndex = new AddDataToIndex(null);
            AddDataToIndex.doAddData(fileContent, pathToFile, fileName);
            Vars.current_run_indexes--;
           
        } catch (Exception ex) {
            Vars.current_run_indexes--;
            Vars.logger.fatal("Error: ", ex);
        } finally {
            try {
                fis.close();
            } catch (IOException ex) {
                Vars.logger.fatal("Error", ex);
            }
        }
    }

    @Override
    public void start_th(String pathToFile, String fileName) {
        this.pathToFile = pathToFile;
        this.fileName = fileName;
        this.start();
    }
}
TOP

Related Classes of com.google.code.ftspc.lector.parsers.POI.DocParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.