// Source Code of net.sf.jpluck.plucker.parsing.html.TidyParser

package net.sf.jpluck.plucker.parsing.html;


import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintWriter;


public class TidyParser {


/**
 * Parses the specified InputStream to a DOM Document. 
 */
    public static Document parse(InputStream in) {
        Tidy tidy = new Tidy();
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        tidy.setDocType("omit");
        tidy.setErrout(new PrintWriter(new ByteArrayOutputStream()));
        tidy.setInputEncoding("UTF-8");
        tidy.setDropEmptyParas(false);
        tidy.setMakeClean(false);
        tidy.setDropFontTags(false);
        tidy.setUpperCaseAttrs(false);
        tidy.setUpperCaseTags(false);
        tidy.setXHTML(true);
        Document document = tidy.parseDOM(in, null);
        removeProcessingInstructions(document);
        // TODO: This is a workaround for empty <li> list items. Try to fix this in the parser itself.
        removeEmptyListItems(document);
        return document;
    }


    private static void removeProcessingInstructions(Node parent) {
        NodeList nodeList = parent.getChildNodes();
        for (int i = 0; i < nodeList.getLength(); i++) {
            Node node = nodeList.item(i);
            if (node.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
                parent.removeChild(node);
                i--;
            } else {
                removeProcessingInstructions(node);
            }
        }
    }


    private static void removeEmptyListItems(final Node node) {
        NodeList nodeList = node.getChildNodes();
        for (int i = 0; i < nodeList.getLength(); i++) {
            final Node child = nodeList.item(i);
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                Element elem = (Element) child;
                if (elem.getNodeName().equals("li") && elem.getAttribute("style").equals("list-style: none") &&
                        elem.getChildNodes().getLength() == 1 && elem.getChildNodes().item(0).getNodeName().equals("br")) {
                    node.removeChild(child);
                    i--;
                } else {
                    removeEmptyListItems(child);
                }
            }
        }
    }


}
// Source Code of net.sf.jpluck.plucker.parsing.html.TidyParser
//
// Related Classes of net.sf.jpluck.plucker.parsing.html.TidyParser