Package com.google.code.ftspc.lector.parsers.CHM

Source Code of com.google.code.ftspc.lector.parsers.CHM.CHMParserFunctions

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.google.code.ftspc.lector.parsers.CHM;

import com.google.code.ftspc.lector.indexers.CommonFunctions;
import com.google.code.ftspc.lector.ini_and_vars.Vars;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import net.htmlparser.jericho.Source;

/**
* This class now for testing only.
* @author Arthur Khusnutdinov
*/
class CHMParserFunctions extends CommonFunctions {

    public boolean deleteDirectory(File path) {
        if (path.exists()) {
            File[] files = path.listFiles();
            for (int i = 0; i < files.length; i++) {
                if (files[i].isDirectory()) {
                    deleteDirectory(files[i]);
                } else {
                    files[i].delete();
                }
            }
        }
        return (path.delete());
    }

    public String processUnpackedObjects(File path) {

        if (path.exists()) {
            String localgeneralContent = "";

            FileFilter filefilter = new FileFilter() {

                @Override
                public boolean accept(File file) {
                    if (file.getName().endsWith(".htm")
                            || file.getName().endsWith(".html")) {
                        return true;
                    }
                    return false;
                }
            };
            /*
             * Let's process html files.
             */
            for (File fileInDir : path.listFiles(filefilter)) {
                String textFromHTML = getTextFromHTML(fileInDir);
                localgeneralContent += " " + textFromHTML;
            }

            /*
             * Let's process dirs.
             */

            for (File fileInDir : path.listFiles()) {
                if (fileInDir.isDirectory()) {
                    localgeneralContent += " " + processUnpackedObjects(fileInDir);
                }
            }
            return localgeneralContent;
        } else {
            return " ";
        }
    }

    String getTextFromHTML(File fileInDir) {
        try {
            String fileContent = "";
            String pathToFile = fileInDir.getAbsolutePath();
            File fileForParsing;
            int length;

            String fileEnc = this.detectEncoding(fileInDir.getAbsolutePath());

            fileForParsing = new File(pathToFile);
            length = (int) fileForParsing.length();

            if (length != 0) {
                Source source;
                char[] cbuf = new char[length];
                InputStreamReader isr = new InputStreamReader(
                        new FileInputStream(fileForParsing), fileEnc);
                final int read = isr.read(cbuf);

                fileContent = new String(cbuf, 0, read);
                isr.close();

                source = new Source(fileContent);
                source.setLogger(null);
                fileContent = source.getTextExtractor().toString();

                pathToFile = null;
                source = null;
                isr = null;
                fileForParsing = null;

                if (!fileEnc.equals("UTF-8")) {
                    return new String(fileContent.getBytes("UTF-8"), "UTF-8");
                } else {
                    return fileContent;
                }
            } else {
                pathToFile = null;
                fileContent = null;
                fileForParsing = null;
                return null;
            }


        } catch (Exception ex) {
            Vars.logger.fatal("Error: ", ex);
            return null;
        }

    }
}
TOP

Related Classes of com.google.code.ftspc.lector.parsers.CHM.CHMParserFunctions

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.