Package net.sf.jpluck.plucker.parsing.html

Source Code of net.sf.jpluck.plucker.parsing.html.TidyParser

package net.sf.jpluck.plucker.parsing.html;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintWriter;

public class TidyParser {

/**
* Parses the specified InputStream to a DOM Document.
*/
    public static Document parse(InputStream in) {
        Tidy tidy = new Tidy();
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        tidy.setDocType("omit");
        tidy.setErrout(new PrintWriter(new ByteArrayOutputStream()));
        tidy.setInputEncoding("UTF-8");
        tidy.setDropEmptyParas(false);
        tidy.setMakeClean(false);
        tidy.setDropFontTags(false);
        tidy.setUpperCaseAttrs(false);
        tidy.setUpperCaseTags(false);
        tidy.setXHTML(true);
        Document document = tidy.parseDOM(in, null);
        removeProcessingInstructions(document);
        // TODO: This is a workaround for empty <li> list items. Try to fix this in the parser itself.
        removeEmptyListItems(document);
        return document;
    }

    private static void removeProcessingInstructions(Node parent) {
        NodeList nodeList = parent.getChildNodes();
        for (int i = 0; i < nodeList.getLength(); i++) {
            Node node = nodeList.item(i);
            if (node.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
                parent.removeChild(node);
                i--;
            } else {
                removeProcessingInstructions(node);
            }
        }
    }

    private static void removeEmptyListItems(final Node node) {
        NodeList nodeList = node.getChildNodes();
        for (int i = 0; i < nodeList.getLength(); i++) {
            final Node child = nodeList.item(i);
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                Element elem = (Element) child;
                if (elem.getNodeName().equals("li") && elem.getAttribute("style").equals("list-style: none") &&
                        elem.getChildNodes().getLength() == 1 && elem.getChildNodes().item(0).getNodeName().equals("br")) {
                    node.removeChild(child);
                    i--;
                } else {
                    removeEmptyListItems(child);
                }
            }
        }
    }

}
TOP

Related Classes of net.sf.jpluck.plucker.parsing.html.TidyParser

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.