/******************************************************************************
* __ *
* <-----/@@\-----> *
* <-< < \\// > >-> *
* <-<-\ __ /->-> *
* Data / \ Crow *
* ^ ^ *
* info@datacrow.net *
* *
* This file is part of Data Crow. *
* Data Crow is free software; you can redistribute it and/or *
* modify it under the terms of the GNU General Public *
* License as published by the Free Software Foundation; either *
* version 3 of the License, or any later version. *
* *
* Data Crow is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* See the GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public *
* License along with this program. If not, see http://www.gnu.org/licenses *
* *
******************************************************************************/
package net.datacrow.util;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import net.datacrow.core.http.HttpConnection;
import net.datacrow.core.http.HttpConnectionUtil;
import org.apache.log4j.Logger;
import org.lobobrowser.html.parser.HtmlParser;
import org.lobobrowser.html.test.SimpleUserAgentContext;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
public class HtmlUtils {
private static Logger logger = Logger.getLogger(HtmlUtils.class.getName());
private static final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
private static DocumentBuilder builder;
static {
try {
builder = factory.newDocumentBuilder();
} catch (Exception e) {
logger.fatal("Cannot get a document builder!", e);
}
}
public static Document getDocument(URL url, boolean cleanup) throws Exception {
return getDocument(url, "ISO-8859-1");
}
public static Document getDocument(URL url, String charset) throws Exception {
return getDocument(getHtmlCleaned(url, charset, false));
}
public static Document getDocument(URL url, String charset, boolean cleanup) throws Exception {
return getDocument(getHtmlCleaned(url, charset, cleanup));
}
public static Document getDocument(String html) throws Exception {
ByteArrayInputStream in = new ByteArrayInputStream(html.getBytes());
// if (logger.isDebugEnabled()) {
// Utilities.writeToFile(html.getBytes(), "online_service_document.xml");
// }
Reader reader = new InputStreamReader(in);
Document document = builder.newDocument();
try {
HtmlParser parser = new HtmlParser(new SimpleUserAgentContext(), document);
parser.parse(reader);
} catch (Exception e) {
logger.error(e, e);
}
in.close();
return document;
}
public static String getHtmlCleaned(URL url, String charset, boolean cleanup) throws Exception {
HttpConnection connection = HttpConnectionUtil.getConnection(url);
String html = connection.getString(charset);
connection.close();
if (html.contains("<html") || html.contains("<HTML")) {
String title = StringUtils.getValueBetween("<title>", "</title>", html);
html = StringUtils.getValueBetween("<body", "</body>", html);
html = html.substring(html.indexOf(">") + 1);
// start the document
StringBuffer sb = new StringBuffer();
sb.append("<html>\n");
// create the title part
if (!Utilities.isEmpty(title)) {
sb.append("<head>\n");
sb.append("<title>");
sb.append(title);
sb.append("</title>\n");
sb.append("</head>\n");
}
// create the body
sb.append("<body>\n");
sb.append(html);
sb.append("</body>\n");
sb.append("</html>\n");
String[][] removeSections = {{"<script", "</script>"},
{"<style", "</style>"},
{"onclick=\"", "\""},
{"rel=\"", "\""},
{"<!--", "-->"}};
if (cleanup) {
int idx;
String part1;
String part2;
for (String[] sections : removeSections) {
while((idx = sb.indexOf(sections[0])) > 0) {
part1 = sb.substring(0, idx);
part2 = sb.substring(sb.indexOf(sections[1], idx + sections[0].length()) + sections[1].length());
sb.setLength(0);
sb.append(part1);
sb.append(part2);
}
}
String[] removeWords = {" ", " href=\"#\""};
for (String word : removeWords) {
while((idx = sb.indexOf(word)) > 0) {
part1 = sb.substring(0, idx);
part2 = sb.substring(idx + word.length());
sb.setLength(0);
sb.append(part1);
sb.append(part2);
}
}
}
html = sb.toString();
//perform specific fixes
while (html.indexOf("width\"") != -1) {
html = html.replace("width\"", "width=\"");
}
if (cleanup) {
// only needed for Bol.com
while (html.indexOf("=\"\"/") != -1) {
html = html.replace("=\"\"/", "=\"/");
}
}
}
return html;
}
public static String toPlainText(String html) {
return toPlainText(html, "ISO-8859-1");
}
/**
* Clean the string of any unwanted characters
* @param s string to clean
*/
public static String toPlainText(String html, String charset) {
try {
String s = html;
if (!s.toUpperCase().startsWith("<HTML")) {
StringBuffer sb = new StringBuffer(s);
sb.insert(0, "<html><body>");
sb.append("</body></html>");
s = sb.toString();
}
Document document = getDocument(s);
XPath xpath = XPathFactory.newInstance().newXPath();
Node node = (Node) xpath.evaluate("/html/body", document, XPathConstants.NODE);
String text = node.getTextContent();
while (text.length() > 1 && (text.startsWith("\r") || text.startsWith("\n")))
text = text.substring(1);
while (text.length() > 1 && (text.endsWith("\r") || text.endsWith("\n")))
text = text.substring(0, text.length() - 1);
return text;
} catch (Exception e) {
logger.debug("Failed to parse: " + html);
}
return html;
}
}