Package com.nanolaba.surtur.core.filereaders

Source Code of com.nanolaba.surtur.core.filereaders.HtmlFileReader

package com.nanolaba.surtur.core.filereaders;

import com.nanolaba.surtur.core.Document;
import com.nanolaba.surtur.core.FileDescription;
import com.nanolaba.surtur.core.Paragraph;
import com.nanolaba.surtur.core.Word;
import com.nanolaba.surtur.util.Html2Text;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;

/**
* $Revision: 1 $
* $Author: andriishin $
* $Date: 2011-01-09 11:49:06 -0600 (Sun, 09 Jan 2011) $
*/
public class HtmlFileReader extends AbstractFileReader {

    @SuppressWarnings({"UnusedDeclaration"})
    private static final Log LOGGER = LogFactory.getLog(HtmlFileReader.class);

    @Override
    public Collection<FileDescription> getSupportedFileExtentions() {
        return Arrays.asList(
                new FileDescription("HTML pages (*.htm, *.html)", "htm", "html")
        );
    }

    @Override
    public Document readDocument(String lang1, String lang2, File file) throws IOException {
        Document document = new Document();

        FileReader in = new FileReader(file);
        try {
            Html2Text parser = new Html2Text();
            parser.parse(in);

            String text = parser.getText();

            if (text != null) {
                int size = text.length();

                String[] lines = text.split("\n");

                int currentSymbol = 0;

                for (String line : lines) {
                    Paragraph paragraph = new Paragraph();
                    for (String word : line.split(" ")) {
                        if (isUserCancelled()) {
                            throw new IllegalStateException("Loading cancelled");
                        }
                        if (word != null) {
                            Word translate = getTranslator().translate(lang1, lang2, word);
                            paragraph.getWords().add(translate);
                            paragraph.getWords().add(Word.SPACE);
                        }
                    }
                    fireChangeCurrent(file, currentSymbol = currentSymbol + line.length() + 1, size, "");

                    document.getParagraphs().add(paragraph);
                }
            }
            return document;
        } finally {
            in.close();
        }
    }
}
TOP

Related Classes of com.nanolaba.surtur.core.filereaders.HtmlFileReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.