Package com.google.code.ftspc.lector.parsers.RTF

Source Code of com.google.code.ftspc.lector.parsers.RTF.RTFParser

package com.google.code.ftspc.lector.parsers.RTF;

import com.google.code.ftspc.lector.indexers.AddDataToIndex;
import com.google.code.ftspc.lector.ini_and_vars.Vars;
import com.google.code.ftspc.lector.parsers.Parser;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.swing.text.Document;
import javax.swing.text.rtf.RTFEditorKit;

/**
* Class for the DOC parser
* @author Arthur Khusnutdinov
*/
public class RTFParser extends Thread implements Parser {

    private String pathToFile;
    private String fileName;

    @Override
    public void run() {
        InputStream isr = null;
        try {
            isr = new FileInputStream(pathToFile);
            String fileContent = "";
            String fileContentForLanguageDetermination = "";
            RTFEditorKit RTFEditorKit = new RTFEditorKit();
            Document RTFdoc = RTFEditorKit.createDefaultDocument();
            String lang = "";

            RTFEditorKit.read(isr, RTFdoc, 0);
            fileContent = RTFdoc.getText(0, RTFdoc.getLength()).trim();

            fileContentForLanguageDetermination =
                    new String(fileContent.getBytes("ISO-8859-1"), "cp1251");

            if (fileContentForLanguageDetermination.length() < 1000) {
                lang = Vars.TextCategorizerLocal.categorize(
                        fileContentForLanguageDetermination.substring(0,
                        fileContentForLanguageDetermination.length()));
            } else {
                lang = Vars.TextCategorizerLocal.categorize(
                        fileContentForLanguageDetermination.substring(0, 1000));
            }

            if (lang.equals("russian")) {
                fileContentForLanguageDetermination =
                        new String(fileContentForLanguageDetermination.getBytes("UTF-8"));
                fileContent = fileContentForLanguageDetermination;
            }
            fileContentForLanguageDetermination = null;

            AddDataToIndex AddDataToIndex = new AddDataToIndex(lang);
            AddDataToIndex.doAddData(fileContent, pathToFile, fileName);

            AddDataToIndex = null;
            fileContent = null;
            RTFEditorKit = null;
            RTFdoc = null;
            lang = null;
            Vars.current_run_indexes--;

        } catch (Exception ex) {
            Vars.current_run_indexes--;
            Vars.logger.fatal(ex.getMessage(), ex);
        } finally {
            try {
                isr.close();
            } catch (IOException ex) {
                Vars.logger.fatal(ex.getMessage(), ex);
            }
        }
    }
   
    @Override
    public void start_th(String pathToFile, String fileName) {
        this.pathToFile = pathToFile;
        this.fileName = fileName;
        this.start();
    }
}
TOP

Related Classes of com.google.code.ftspc.lector.parsers.RTF.RTFParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.