Source Code of net.datacrow.util.HtmlUtils

/******************************************************************************
 *                                     __                                     *
 *                              <-----/@@\----->                              *
 *                             <-< <  \\//  > >->                             *
 *                               <-<-\ __ /->->                               *
 *                               Data /  \ Crow                               *
 *                                   ^    ^                                   *
 *                              info@datacrow.net                             *
 *                                                                            *
 *                       This file is part of Data Crow.                      *
 *       Data Crow is free software; you can redistribute it and/or           *
 *        modify it under the terms of the GNU General Public                 *
 *       License as published by the Free Software Foundation; either         *
 *              version 3 of the License, or any later version.               *
 *                                                                            *
 *        Data Crow is distributed in the hope that it will be useful,        *
 *      but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *           MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.             *
 *           See the GNU General Public License for more details.             *
 *                                                                            *
 *        You should have received a copy of the GNU General Public           *
 *  License along with this program. If not, see http://www.gnu.org/licenses  *
 *                                                                            *
 ******************************************************************************/


package net.datacrow.util;


import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;


import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;


import net.datacrow.core.http.HttpConnection;
import net.datacrow.core.http.HttpConnectionUtil;


import org.apache.log4j.Logger;
import org.lobobrowser.html.parser.HtmlParser;
import org.lobobrowser.html.test.SimpleUserAgentContext;
import org.w3c.dom.Document;
import org.w3c.dom.Node;


public class HtmlUtils {
    
    private static Logger logger = Logger.getLogger(HtmlUtils.class.getName());
    private static final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    private static DocumentBuilder builder;
    
    static {
        try {
            builder = factory.newDocumentBuilder();
        } catch (Exception e) {
            logger.fatal("Cannot get a document builder!", e);
        }
    }


    public static Document getDocument(URL url, boolean cleanup) throws Exception {
        return getDocument(url, "ISO-8859-1");
    }
    
    public static Document getDocument(URL url, String charset) throws Exception {
        return getDocument(getHtmlCleaned(url, charset, false));
    }
    
    public static Document getDocument(URL url, String charset, boolean cleanup) throws Exception {
        return getDocument(getHtmlCleaned(url, charset, cleanup));
    }
    
    public static Document getDocument(String html) throws Exception { 
        
        ByteArrayInputStream in = new ByteArrayInputStream(html.getBytes());
        
//        if (logger.isDebugEnabled()) {
//            Utilities.writeToFile(html.getBytes(), "online_service_document.xml");
//        }
        
        Reader reader = new InputStreamReader(in);
        Document document = builder.newDocument();
        
        try {
            HtmlParser parser = new HtmlParser(new SimpleUserAgentContext(), document);
            parser.parse(reader);
        } catch (Exception e) {
            logger.error(e, e);
        }


        in.close();
        
        return document;
    }    
    
    public static String getHtmlCleaned(URL url, String charset, boolean cleanup) throws Exception { 
        HttpConnection connection = HttpConnectionUtil.getConnection(url);
        String html = connection.getString(charset);
        connection.close();        
        
        if (html.contains("<html") || html.contains("<HTML")) {
            String title = StringUtils.getValueBetween("<title>", "</title>", html);
            html = StringUtils.getValueBetween("<body", "</body>", html);
            html = html.substring(html.indexOf(">") + 1);
    
            // start the document
            StringBuffer sb = new StringBuffer();
            sb.append("<html>\n");
            
            // create the title part
            if (!Utilities.isEmpty(title)) {
                sb.append("<head>\n");
                sb.append("<title>");
                sb.append(title);
                sb.append("</title>\n");
                sb.append("</head>\n");
            }
            
            // create the body
            sb.append("<body>\n");
            sb.append(html);
            sb.append("</body>\n");
            sb.append("</html>\n");
            
            String[][] removeSections = {{"<script", "</script>"},
                                         {"<style", "</style>"},   
                                         {"onclick=\"", "\""},
                                         {"rel=\"", "\""},
                                         {"<!--", "-->"}};
            
            if (cleanup) {
                int idx;
                String part1;
                String part2;
                for (String[] sections : removeSections) {
                    while((idx = sb.indexOf(sections[0])) > 0) {
                        part1 = sb.substring(0, idx);
                        part2 = sb.substring(sb.indexOf(sections[1], idx + sections[0].length()) + sections[1].length());
                        
                        sb.setLength(0);
                        sb.append(part1);
                        sb.append(part2);
                    }
                }
            
                String[] removeWords = {"&nbsp;", " href=\"#\""};
                for (String word : removeWords) {
                    while((idx = sb.indexOf(word)) > 0) {
                        part1 = sb.substring(0, idx);
                        part2 = sb.substring(idx + word.length());
                        
                        sb.setLength(0);
                        sb.append(part1);
                        sb.append(part2);
                    }
                }
            }
                
            html = sb.toString();
            
            //perform specific fixes
            while (html.indexOf("width\"") != -1) {
                html = html.replace("width\"", "width=\"");
            }
            
            if (cleanup) {
                // only needed for Bol.com
                while (html.indexOf("=\"\"/") != -1) {
                    html = html.replace("=\"\"/", "=\"/");
                }
            }
        }


        return html;
    }


    
    public static String toPlainText(String html) {
        return toPlainText(html, "ISO-8859-1");
    }
    
    /**
     * Clean the string of any unwanted characters
     * @param s string to clean
     */
    public static String toPlainText(String html, String charset) {
        try {
            String s = html;
            if (!s.toUpperCase().startsWith("<HTML")) {
                StringBuffer sb = new StringBuffer(s);
                sb.insert(0, "<html><body>");
                sb.append("</body></html>");
                s = sb.toString();
            }
            
            Document document = getDocument(s);
                
            XPath xpath = XPathFactory.newInstance().newXPath();
            Node node = (Node) xpath.evaluate("/html/body", document, XPathConstants.NODE);


            String text = node.getTextContent();
            
            while (text.length() > 1 && (text.startsWith("\r") || text.startsWith("\n")))
                text = text.substring(1);


            while (text.length() > 1 && (text.endsWith("\r") || text.endsWith("\n")))
                text = text.substring(0, text.length() - 1);
            
            return text;
            
        } catch (Exception e) {
            logger.debug("Failed to parse: " + html);
        }


        return html;
    }    
}
Source Code of net.datacrow.util.HtmlUtils

Related Classes of net.datacrow.util.HtmlUtils