Package net.bpiwowar.mg4j.extensions.warc

Source Code of net.bpiwowar.mg4j.extensions.warc.WarcHTMLResponseRecord$MyCallback

/**
* Container for a Warc Record of type "response"
*
* (C) 2009 - Carnegie Mellon University
*
* 1. Redistributions of this source code must retain the above copyright
*    notice, this list of conditions and the following disclaimer.
* 2. The names "Lemur", "Indri", "University of Massachusetts", 
*    "Carnegie Mellon", and "lemurproject" must not be used to
*    endorse or promote products derived from this software without
*    prior written permission. To obtain permission, contact
*    license@lemurproject.org.
*
* 4. Products derived from this software may not be called "Lemur" or "Indri"
*    nor may "Lemur" or "Indri" appear in their names without prior written
*    permission of The Lemur Project. To obtain permission,
*    contact license@lemurproject.org.
*
* THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
* PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
* NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* @author mhoy@cs.cmu.edu (Mark J. Hoy)
*
* Some extensions were made to keep track of positions in stream. Stop and
* start markers to mark the position in a stream where a WARC record starts and
* ends were added.
* @author ingo@dcs.gla.ac.uk (Ingo Frommholz)
*/

package net.bpiwowar.mg4j.extensions.warc;

import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.parser.Attribute;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.Element;
import it.unimi.dsi.parser.HTMLFactory;
import it.unimi.dsi.parser.callback.Callback;
import net.bpiwowar.mg4j.extensions.utils.ByteMatch;
import org.apache.log4j.Logger;

import java.io.*;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class WarcHTMLResponseRecord {

    private final WarcRecord warcRecord = new WarcRecord();

    private static String SINGLE_SPACE = " ";
    private String contentType = null;

    // Start of the content
    private int start;

    /**
     * Get a pattern that matches any case for a sequence of characters
     */
    private static String getLUPattern(String name) {
        StringBuilder sb = new StringBuilder();
        String lc = name.toLowerCase();
        String uc = name.toUpperCase();

        for (int i = 0; i < name.length(); i++) {
            sb.append('[');
            sb.append(lc.charAt(i));
            sb.append(uc.charAt(i));
            sb.append(']');
        }
        ;
        return sb.toString();
    }

    // Outlinks
    private static Pattern ALL_HTML_TAGS = Pattern.compile("<(.*?)>");
    private static Pattern A_HREF_PATTERN = Pattern.compile("[aA].+?[hH][rR][eE][fF]=['\"](.+?)['\"].*?");
    private static Pattern AREA_HREF_PATTERN = Pattern.compile("[aA][rR][eE][aA].+?[hH][rR][eE][fF]=['\"](.*?)['\"].*?");
    private static Pattern FRAME_SRC_PATTERN = Pattern.compile("[fF][rR][aA][mM][eE].+?[sS][rR][cC]=['\"](.*?)['\"].*?");
    private static Pattern IFRAME_SRC_PATTERN = Pattern.compile("[iI][fF][rR][aA][mM][eE].+?[sS][rR][cC]=['\"](.*?)['\"].*?");
    private static Pattern HTTP_START_PATTERN = Pattern.compile("^[hH][tT][tT][pP][sS]?://.*");


    // create our pattern set
    private final Vector<Pattern> patternSet = new Vector<Pattern>();

    private boolean isHTMLResponse = false;

    final static private Logger LOGGER = Logger.getLogger(WarcHTMLResponseRecord.class);

    /**
     * Default constructor
     */
    public WarcHTMLResponseRecord() {
        createPatternSet();
    }

    /**
     * Copy constructor
     *
     * @param o
     */
    public WarcHTMLResponseRecord(WarcHTMLResponseRecord o) {
        this.warcRecord.set(o.warcRecord);
        createPatternSet();
        processHTTPHeader();
    }

    /**
     * Constructor creation from a generic WARC record
     *
     * @param o
     */
    public WarcHTMLResponseRecord(WarcRecord o) {
        if (o.getHeaderRecordType().compareToIgnoreCase("response") == 0) {
            this.warcRecord.set(o);
            this.isHTMLResponse = true;
        }
        createPatternSet();
        processHTTPHeader();
    }

    private void createPatternSet() {
        patternSet.add(A_HREF_PATTERN);
        patternSet.add(AREA_HREF_PATTERN);
        patternSet.add(FRAME_SRC_PATTERN);
        patternSet.add(IFRAME_SRC_PATTERN);
    }

    public void setRecord(WarcRecord o) {
        if (o.getHeaderRecordType().compareToIgnoreCase("response") == 0) {
            this.warcRecord.set(o);
            this.isHTMLResponse = true;
        }
    }

    /**
     * Test if the underlying record is really a HTML response.
     *
     * @return <code>true</code> if record is an HYTML response,
     *         <code>false</code> otherwise
     */
    public boolean isHTMLResponse() {
        return this.isHTMLResponse;
    }

    public WarcRecord getRawRecord() {
        return warcRecord;
    }

    public String getTargetURI() {
        return warcRecord.getHeaderMetadataItem("WARC-Target-URI");
    }

    public String getTargetTrecID() {
        return warcRecord.getHeaderMetadataItem("WARC-TREC-ID");
    }

    private String getNormalizedContentURL(String pageURL, String contentURL) {
        String fixedContentURL = contentURL;
        try {
            // resolve any potentially relative paths to the full URL based on the page
            java.net.URI baseURI = new java.net.URI(pageURL);
            // ensure that the content doesn't have query parameters - if so, strip them
            int contentParamIndex = contentURL.indexOf("?");
            if (contentParamIndex > 0) {
                fixedContentURL = contentURL.substring(0, contentParamIndex);
            }
            java.net.URI resolvedURI = baseURI.resolve(fixedContentURL);
            return resolvedURI.toString();
        } catch (URISyntaxException ex) {
        } catch (IllegalArgumentException iaEx) {
            return fixedContentURL;
        } catch (Exception gEx) {
        }
        return "";
    }

    private HashSet<String> getMatchesOutputSet(Vector<String> tagSet, String baseURL) {
        HashSet<String> retSet = new HashSet<String>();

        Iterator<String> vIter = tagSet.iterator();
        while (vIter.hasNext()) {
            String thisCheckPiece = vIter.next();
            Iterator<Pattern> pIter = patternSet.iterator();
            boolean hasAdded = false;
            while (!hasAdded && pIter.hasNext()) {
                Pattern thisPattern = pIter.next();
                Matcher matcher = thisPattern.matcher(thisCheckPiece);
                if (matcher.find() && (matcher.groupCount() > 0)) {
                    String thisMatch = getNormalizedContentURL(baseURL, matcher.group(1));
                    if (HTTP_START_PATTERN.matcher(thisMatch).matches()) {
                        if (!retSet.contains(thisMatch) && !baseURL.equals(thisMatch)) {
                            retSet.add(thisMatch);
                            hasAdded = true;
                        } // end if (!retSet.contains(thisMatch))
                    } // end if (HTTP_START_PATTERN.matcher(thisMatch).matches())
                } // end if (matcher.find() && (matcher.groupCount() > 0))
                matcher.reset();
            } // end while (!hasAdded && pIter.hasNext())
        } // end while (vIter.hasNext())

        return retSet;
    }

    /**
     * Gets a vector of normalized URLs (normalized to this target URI)
     * of the outlinks of the page
     *
     * @return
     */
    public Vector<String> getURLOutlinks() {
        Vector<String> retVec = new Vector<String>();

        String baseURL = getTargetURI();
        if ((baseURL == null) || (baseURL.length() == 0)) {
            return retVec;
        }

        byte[] contentBytes = warcRecord.getContent();

        ByteArrayInputStream contentStream = new ByteArrayInputStream(contentBytes);
        BufferedReader inReader = new BufferedReader(new InputStreamReader(contentStream));

        // forward to the first \n\n
        try {
            boolean inHeader = true;
            String line = null;
            while (inHeader && ((line = inReader.readLine()) != null)) {
                if (line.trim().length() == 0) {
                    inHeader = false;
                }
            }

            // now we have the rest of the lines
            // read them all into a string buffer
            // to remove all new lines
            Vector<String> htmlTags = new Vector<String>();
            while ((line = inReader.readLine()) != null) {
                // get all HTML tags from the line...
                Matcher HTMLMatcher = ALL_HTML_TAGS.matcher(line);
                while (HTMLMatcher.find()) {
                    htmlTags.add(HTMLMatcher.group(1));
                }
            }

            HashSet<String> retSet = getMatchesOutputSet(htmlTags, baseURL);

            Iterator<String> oIter = retSet.iterator();
            while (oIter.hasNext()) {
                String thisValue = oIter.next();
                if (!thisValue.equals(baseURL)) {
                    retVec.add(thisValue);
                }
            }

        } catch (IOException ioEx) {
            retVec.clear();
        }

        return retVec;
    }

    /*
    * Stuff for start and end markers within the stream from which the object
    * was created from
    */

    /**
     * Gets the difference between the start and stop marker
     *
     * @return the difference between start and stop marker or -1 if this value
     *         is undefined
     */
    public int getStopMarkerDiff() {
        if (warcRecord != null) return warcRecord.getStopMarkerDiff();
        else return -1;
    }

    /**
     * Gets the stop marker, the position in the stream where the WARC record
     * end.
     *
     * @return the stop marker or -1 if this value is undefined
     */
    public long getStopMarker() {
        if (warcRecord != null)
            return warcRecord.getStopMarker();
        else return -1;
    }

    /**
     * Gets the start marker, the position in the stream where the WARC record
     * starts.
     *
     * @return the start marker or -1 if this value is undefined
     */
    public long getStartMarker() {
        if (warcRecord != null)
            return warcRecord.getStartMarker();
        else return -1;
    }


    static final private Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /**
     * Get HTML Content
     *
     * Prefer {@linkplain #getContentReader()} for effiency
     *
     * @return
     */
    public String getHTMLContent() {
        if (start < 0)
            return "";

        final byte[] content = warcRecord.getContent();
        return new String(content, start, content.length-start+1, retrieveEncoding());
    }

    public Reader getContentReader() {
        Charset encoding = retrieveEncoding();
        final byte[] content = start >= 0 ? warcRecord.getContent() : null;
        return new InputStreamReader(new ByteArrayInputStream(content, start, content.length-start), encoding);

    }



    /**
     * Process the HTTP header
     * and sets {linkplain #start} to the start of content
     */
    public void processHTTPHeader() {
        start = -1;

        // Get the content
        if (warcRecord == null)
            return;

        byte[] content = warcRecord.getContent();
        if (content == null)
            return;


        // Start to read
        final ByteMatch match = ByteMatch.create("Content-Type:", true);

        int newlines = 0;
        start = 0;
        for (; start < content.length; ++start) {
            byte b = content[start];
            if (match.match(b)) {
                // Read the rest of the line
                int start = ++this.start;
                for (++this.start; this.start < content.length; ++this.start) {
                    b = content[this.start];
                    if (b == '\n') {
                        contentType = new String(content, start, this.start -start).trim();
//                        System.err.println("Found MIME type: " + contentType);
                        break;
                    }
                }

            }

            if (b == '\n') {
                if (++newlines == 2) {
                    ++start;
                    break;
                }
            }
            else newlines = 0;
        }
    }



    private static Pattern CHARSET_PATTERN = Pattern.compile("^" + getLUPattern("charset") + "\\s*=\\s*(\\S+)\\s*$");

    final char [] buffer = new char[8192];

    private Charset retrieveEncoding() {
        // Parse the file

        final Charset[] charset = {retrieveCharsetFromContentType(contentType)}; // use an array so it can be accessed by inner classes
        if (charset[0] != null)
            return charset[0];

        final BulletParser parser = new BulletParser(HTMLFactory.INSTANCE);

        parser.setCallback(new MyCallback(charset));

        final byte[] content = start >= 0 ? warcRecord.getContent() : null;
        final Reader reader = new InputStreamReader(new ByteArrayInputStream(content, start, content.length-start), Charset.defaultCharset());


        try {
            int read = reader.read(buffer, 0, buffer.length);
            parser.parse(buffer, 0, read);
        } catch (IOException e) {
            LOGGER.error("Error while reading stored HTML file");
        }

        if (charset[0] != null) {
            if (LOGGER.isDebugEnabled())
                LOGGER.debug("Retrieved charset from HTML: " + charset[0]);
            return charset[0];
        }


            // By default
        return DEFAULT_CHARSET;
    }

    /** Retrieve the charset from a content type */
    private Charset retrieveCharsetFromContentType(String contentType) {
        if (contentType == null) return null;

        String[] fields = contentType.split("\\s*;\\s*");
        for (String field : fields) {
            final Matcher matcher = CHARSET_PATTERN.matcher(field);
            if (matcher.matches()) {
                // Cleans up the encoding: uppercase, remove quotes
                String encoding = matcher.group(1).toUpperCase();
                if (encoding.startsWith("\"")) encoding = encoding.substring(1);
                if (encoding.endsWith("\"")) encoding = encoding.substring(0, encoding.length()-1);

                // Handles some aliases
                if (encoding.equals("ISO-LATIN-1"))
                    encoding = "ISO-8859-1";

                return getCharset(encoding);

            }
        }
        return null;
    }

    static private Charset getCharset(String encoding) {
        try {
            return Charset.forName(encoding);
        } catch(IllegalCharsetNameException e) {
            LOGGER.error("Cannot handle charset [" + encoding + "]: " + e);
        } catch(UnsupportedCharsetException e) {
            LOGGER.error("Cannot handle charset [" + encoding + "]: " + e);
        }
        return null;
    }


    /** Callback used to parse the content encoding */
    private class MyCallback implements Callback {
        private final Charset[] charset;

        public MyCallback(Charset[] charset) {
            this.charset = charset;
        }

        @Override
        public void configure(BulletParser parser) {
            parser.parseTags(true);
            parser.parseAttributes(true);
            parser.parseAttribute(Attribute.CHARSET);
            parser.parseAttribute(Attribute.HTTP_EQUIV);
            parser.parseAttribute(Attribute.CONTENT);
        }

        @Override
        public void startDocument() {
        }

        @Override
        public boolean startElement(Element element, Map<Attribute, MutableString> attributes) {
            if (charset[0] != null)
                return true;

            if (element == Element.META) {
                final MutableString httpEquiv = attributes.get(Attribute.HTTP_EQUIV);
                final MutableString content = attributes.get(Attribute.CONTENT);
                if (content != null && httpEquiv != null && httpEquiv.toLowerCase().equals("content-type")) {
                    charset[0] = retrieveCharsetFromContentType(content.toString());
//                    return charset[0] == null;
                    return true;
                }

                final MutableString metaCharset = attributes.get(Attribute.CHARSET);
                if  (metaCharset != null) {
                    charset[0] = getCharset(metaCharset.toUpperCase().toString());
                    return true;
//                    return charset[0] == null;
                }
            }


            return true; // element != Element.BODY;
        }

        @Override
        public boolean endElement(Element element) {
//                FIXME: bug in bullet parser ? return element != Element.HEAD;
            return true;
        }

        @Override
        public boolean characters(char[] chars, int i, int i1, boolean b) {
            return true;
        }

        @Override
        public boolean cdata(Element element, char[] chars, int i, int i1) {
            return true;
        }

        @Override
        public void endDocument() {
        }
    }
}
TOP

Related Classes of net.bpiwowar.mg4j.extensions.warc.WarcHTMLResponseRecord$MyCallback

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.