package net.sf.jpluck.plucker.parsing.html;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintWriter;
/**
 * Parses HTML into a W3C DOM {@link Document} using JTidy and then prunes
 * nodes that the rest of the pipeline does not want to see.
 */
public class TidyParser {
    /** Exact <code>style</code> attribute value an <code>li</code> must carry to be pruned. */
    private static final String EMPTY_ITEM_STYLE = "list-style: none";

    /**
     * Parses the specified InputStream to a DOM Document.
     * <p>
     * The stream is handed to a freshly configured {@link Tidy} instance
     * (input encoding UTF-8, XHTML mode, doctype omitted, warnings and
     * error output suppressed). After parsing, all processing instructions
     * and all placeholder list items are removed from the resulting tree.
     *
     * @param in the stream containing the HTML to parse
     * @return the cleaned-up DOM document produced by JTidy
     */
    public static Document parse(InputStream in) {
        Tidy tidy = new Tidy();
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        tidy.setDocType("omit");
        // Diagnostics go into an in-memory buffer that is never read, i.e. they are discarded.
        tidy.setErrout(new PrintWriter(new ByteArrayOutputStream()));
        tidy.setInputEncoding("UTF-8");
        tidy.setDropEmptyParas(false);
        tidy.setMakeClean(false);
        tidy.setDropFontTags(false);
        tidy.setUpperCaseAttrs(false);
        tidy.setUpperCaseTags(false);
        tidy.setXHTML(true);

        Document document = tidy.parseDOM(in, null);
        removeProcessingInstructions(document);
        // TODO: This is a workaround for empty <li> list items. Try to fix this in the parser itself.
        removeEmptyListItems(document);
        return document;
    }

    /**
     * Recursively removes every processing-instruction node below the given node.
     *
     * @param parent the node whose subtree is cleaned; the node itself is never removed
     */
    private static void removeProcessingInstructions(Node parent) {
        Node child = parent.getFirstChild();
        while (child != null) {
            // Fetch the successor first: once a child is removed it no longer links to its siblings.
            Node next = child.getNextSibling();
            if (child.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
                parent.removeChild(child);
            } else {
                removeProcessingInstructions(child);
            }
            child = next;
        }
    }

    /**
     * Recursively removes every placeholder list item (see
     * {@link #isEmptyListItem(Element)}) below the given node. Only element
     * children are inspected or descended into.
     *
     * @param node the node whose subtree is cleaned; the node itself is never removed
     */
    private static void removeEmptyListItems(final Node node) {
        Node child = node.getFirstChild();
        while (child != null) {
            // Fetch the successor first: once a child is removed it no longer links to its siblings.
            Node next = child.getNextSibling();
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                if (isEmptyListItem((Element) child)) {
                    node.removeChild(child);
                } else {
                    removeEmptyListItems(child);
                }
            }
            child = next;
        }
    }

    /**
     * Tells whether the element is an <code>li</code> whose <code>style</code>
     * attribute is exactly <code>list-style: none</code> and whose only child
     * is a <code>br</code> node.
     * <p>
     * All string comparisons are written constant-first so that a
     * <code>null</code> attribute value or node name returned by the DOM
     * implementation results in "no match" instead of a NullPointerException.
     *
     * @param elem the element to test
     * @return <code>true</code> if the element matches the pattern described above
     */
    private static boolean isEmptyListItem(Element elem) {
        if (!"li".equals(elem.getNodeName())) {
            return false;
        }
        if (!EMPTY_ITEM_STYLE.equals(elem.getAttribute("style"))) {
            return false;
        }
        Node onlyChild = elem.getFirstChild();
        return onlyChild != null
            && onlyChild.getNextSibling() == null
            && "br".equals(onlyChild.getNodeName());
    }
}