// Source code of net.bpiwowar.mg4j.extensions.warc.WarcRecord$Marker

/**
 * Container for a generic Warc Record
 *
 * (C) 2009 - Carnegie Mellon University
 *
 * 1. Redistributions of this source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. The names "Lemur", "Indri", "University of Massachusetts",
 *    "Carnegie Mellon", and "lemurproject" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. To obtain permission, contact
 *    license@lemurproject.org.
 *
 * 4. Products derived from this software may not be called "Lemur" or "Indri"
 *    nor may "Lemur" or "Indri" appear in their names without prior written
 *    permission of The Lemur Project. To obtain permission,
 *    contact license@lemurproject.org.
 *
 * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
 * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
 * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * @author mhoy@cs.cmu.edu (Mark J. Hoy)
 *
 * Some extensions were made to keep track of positions in stream. Stop and
 * start markers to mark the position in a stream where a WARC record starts and
 * ends were added.
 * @author ingo@dcs.gla.ac.uk (Ingo Frommholz)
 */


package net.bpiwowar.mg4j.extensions.warc;


import it.unimi.dsi.io.SegmentedInputStream;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;
import net.bpiwowar.mg4j.extensions.trec.TRECParsingFactory;
import net.bpiwowar.mg4j.extensions.utils.StructuredTextExtractor;
import net.sf.samtools.util.BlockCompressedInputStream;


import java.io.*;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Vector;


public class WarcRecord {


    public static String WARC_VERSION = "WARC/0.18";
    public static String WARC_VERSION_LINE = "WARC/0.18\n";
    private static String NEWLINE = "\n";


    private static byte MASK_THREE_BYTE_CHAR = (byte) (0xE0);
    private static byte MASK_TWO_BYTE_CHAR = (byte) (0xC0);
    private static byte MASK_TOPMOST_BIT = (byte) (0x80);
    private static byte MASK_BOTTOM_SIX_BITS = (byte) (0x1F);
    private static byte MASK_BOTTOM_FIVE_BITS = (byte) (0x3F);
    private static byte MASK_BOTTOM_FOUR_BITS = (byte) (0x0F);


    /**
     * Flag that controls if we read or drop the content
     */
    private static boolean readContent = true;


    /**
     * Set the flag that controls if we read or drop the content we read from
     * the stream. If this is set to <code>false</code>, no content will be read
     * (just some important headers), for instance if we are just interested in
     * the recod boundaries in a stream (e.g. when working with MG4J). This has
     * consequences on what is returned with, e.g.,
     * {@link #readNextWarcRecord(java.io.DataInputStream)} Content is read by default.
     *
     * @param readC <code>true</code> if content should be read,
     *              <code>false</code> otherwise.
     * @return the previous state of the readContent flag
     */
    public static boolean readContent(boolean readC) {
        boolean oldFlag = readContent;
        readContent = readC;
        return oldFlag;
    }


    /*
    * Some new stuff to support start and end markers in stream
    */
    /**
     * A pointer indicating the current position (in bytes) in the stream
     * (starting with 0)
     */
    private static long currentPosition = 0;




    /**
     * Sets the current position pointer to 0. Invoke this before parsing
     * a new WARC file.
     */
    public static void newFile() {
        currentPosition = 0;
    }




    /**
     * The start position in the stream
     */
    private long startMarker = -1;


    /**
     * Sets the start marker, the position in the stream where the WARC record
     * starts.
     *
     * @param startMarker the start marker or -1 if this value is undefined
     */
    public void setStartMarker(long startMarker) {
        this.startMarker = startMarker;
    }


    /**
     * Sets the stop marker difference, the difference between the position in
     * the stream where the WARC record starts and where it ends.
     *
     * @param stopMarkerDiff
     */
    public void setStopMarkerDiff(int stopMarkerDiff) {
        this.stopMarkerDiff = stopMarkerDiff;
    }


    /**
     * startMarker - stopMarker, where the WARC record ends in the stream
     */
    private int stopMarkerDiff = -1;


    /**
     * Gets the difference between the start and stop marker
     *
     * @return the difference between start and stop marker or -1 if this value
     *         is undefined
     */
    public int getStopMarkerDiff() {
        return stopMarkerDiff;
    }


    /**
     * Gets the stop marker, the position in the stream where the WARC record
     * end.
     *
     * @return the stop marker or -1 if this value is undefined
     */
    public long getStopMarker() {
        if (startMarker > -1 && stopMarkerDiff > -1)
            return startMarker + stopMarkerDiff;
        else return -1;
    }


    /**
     * Gets the start marker, the position in the stream where the WARC record
     * starts.
     *
     * @return the start marker
     */
    public long getStartMarker() {
        return startMarker;
    }




    /**
     * Helper class to store start and end markers
     *
     * @author <a href="mailto:ingo@dcs.gla.ac.uk">Ingo Frommholz</a>
     */
    private static class Marker {
        private long startMarker = -1;
        private int stopMarkerDifference = -1;


        /**
         * Checks if the start and stop markers are valid ones.
         *
         * @return <code>true</code> if the markers are valid
         */
        private boolean isValid() {
            return startMarker > -1 && stopMarkerDifference > -1;
        }
    }


    /*
    * End of start and end marker stuff
    */




    /**
     * Our read line implementation. We cannot allow buffering here (for gzip streams)
     * so, we need to use DataInputStream and read byte by byte. Two- and
     * three-byte UTF-8 sequences are decoded by hand, and the static
     * <code>currentPosition</code> counter is incremented once for every byte
     * taken from the stream.
     * <p>
     * A line ends at the first decoded line feed, which is consumed but not
     * returned. A carriage return in front of it is not stripped.
     *
     * @param in the input data stream
     * @return the line without its terminating line feed (possibly empty), or
     *         <code>null</code> if the end of the stream is reached before a
     *         line feed is seen; characters collected up to that point are
     *         discarded in that case
     * @throws java.io.IOException if reading from the stream fails for a
     *                             reason other than end of stream
     */
    private static String readLineFromInputStream(DataInputStream in) throws IOException {
        StringBuilder retString = new StringBuilder();


        boolean keepReading = true;
        try {
            do {
                char thisChar = 0;
                // readByte() throws EOFException at end of stream; see the catch below
                byte readByte = in.readByte();
                currentPosition++;


                // check to see if it's a multibyte character
                // NOTE(review): this test is true for every byte whose top three
                // bits are set, so lead bytes of four-byte sequences (0xF0 and up)
                // are decoded as three-byte sequences as well.
                if ((readByte & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) {
                    // need to read the next 2 bytes
                    if (in.available() < 2) {
                        // treat these all as individual characters
                        // (each byte is cast straight to char; they are not
                        // checked for a line feed)
                        retString.append((char) readByte);
                        int numAvailable = in.available();
                        for (int i = 0; i < numAvailable; i++) {
                            retString.append((char) (in.readByte()));
                            currentPosition++;
                        }
                        // in a do/while, continue jumps to the keepReading test
                        continue;
                    }
                    byte secondByte = in.readByte();
                    currentPosition++;
                    byte thirdByte = in.readByte();
                    currentPosition++;
                    // ensure the topmost bit is set
                    if (((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) || ((thirdByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT)) {
                        //treat these as individual characters
                        // NOTE(review): both bytes have already been consumed and
                        // are appended without a line-feed check, so a '\n' in
                        // either position does not end the line.
                        retString.append((char) readByte);
                        retString.append((char) secondByte);
                        retString.append((char) thirdByte);
                        continue;
                    }
                    // 4 payload bits from the lead byte, 6 from each continuation
                    // byte (MASK_BOTTOM_FIVE_BITS is 0x3F despite its name)
                    int finalVal = (thirdByte & MASK_BOTTOM_FIVE_BITS) + 64 * (secondByte & MASK_BOTTOM_FIVE_BITS) + 4096 * (readByte & MASK_BOTTOM_FOUR_BITS);
                    thisChar = (char) finalVal;
                } else if ((readByte & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) {
                    // need to read next byte
                    if (in.available() < 1) {
                        // treat this as individual characters
                        retString.append((char) readByte);
                        continue;
                    }
                    byte secondByte = in.readByte();
                    currentPosition++;
                    if ((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) {
                        // not a continuation byte: keep both bytes as separate
                        // characters (again without a line-feed check)
                        retString.append((char) readByte);
                        retString.append((char) secondByte);
                        continue;
                    }
                    // 5 payload bits from the lead byte (MASK_BOTTOM_SIX_BITS is
                    // 0x1F despite its name), 6 from the continuation byte
                    int finalVal = (secondByte & MASK_BOTTOM_FIVE_BITS) + 64 * (readByte & MASK_BOTTOM_SIX_BITS);
                    thisChar = (char) finalVal;
                } else {
                    // interpret it as a single byte
                    thisChar = (char) readByte;
                }


                // only a decoded line feed terminates the line; it is not stored
                if (thisChar == '\n') {
                    keepReading = false;
                } else {
                    retString.append(thisChar);
                }
            } while (keepReading);
        } catch (EOFException eofEx) {
            // end of stream before a line feed: report EOF and drop the partial line
            return null;
        }


        // an empty line yields "" (toString() below would return the same)
        if (retString.length() == 0) {
            return "";
        }


        return retString.toString();
    }


    /**
     * The actual heavy lifting of reading in the next WARC record: skips lines
     * until one starts with the WARC version token, collects header lines up
     * to and including the Content-Length line, then reads that many bytes of
     * content. The static <code>currentPosition</code> counter is advanced for
     * everything consumed, and the record's position is stored in
     * <code>marker</code>. The content bytes are always read here; the
     * <code>readContent</code> flag is not consulted by this method.
     *
     * @param in           the data input stream
     * @param headerBuffer a blank string buffer to contain the WARC header; each
     *                     header line is appended followed by a line feed (the
     *                     version line itself is not appended)
     * @param marker       to contain the start marker and the difference to the stop
     *                     marker of the WARC record in the stream; must not be
     *                     <code>null</code>
     * @return the content bytes (w/ the headerBuffer populated), or
     *         <code>null</code> if <code>in</code> or <code>headerBuffer</code>
     *         is <code>null</code>, no version line is found, no valid
     *         non-negative Content-Length is found, or the stream ends before
     *         the announced number of bytes has been read
     * @throws java.io.IOException if reading from the stream fails
     */
    private static byte[] readNextRecord(DataInputStream in,
                                         StringBuffer headerBuffer,
                                         Marker marker) throws IOException {
        if (in == null) {
            return null;
        }
        if (headerBuffer == null) {
            return null;
        }


        String line = null;
        boolean foundMark = false;
        boolean inHeader = true;
        byte[] retContent = null;


        // cannot be using a buffered reader here!!!!
        // just read the header
        // first - find our WARC header
        // record the position in the stream before the WARC record is read
        marker.startMarker = currentPosition;
        while ((!foundMark) && ((line = readLineFromInputStream(in)) != null)) {
            if (line.startsWith(WARC_VERSION)) {
                foundMark = true;
            // a skipped line moves the start marker to the beginning of the next line
            } else marker.startMarker = currentPosition;
        }


        // no WARC mark?
        if (!foundMark) {
            return null;
        }


        // then read to the first newline
        // make sure we get the content length here
        int contentLength = -1;
        boolean foundContentLength = false;
        while (!foundContentLength && inHeader && ((line = readLineFromInputStream(in)) != null)) {
            // NOTE(review): the loop condition requires foundContentLength to be
            // false on entry to every iteration, so this blank-line test can
            // never succeed. The loop really ends right after the Content-Length
            // line has been appended; the content read below therefore starts
            // immediately after that line, and header lines that follow
            // Content-Length are not collected.
            if ((line.trim().length() == 0) && foundContentLength) {
                inHeader = false;
            } else {
                headerBuffer.append(line);
                headerBuffer.append(NEWLINE);
                // split into key and value at the first colon only
                String[] thisHeaderPieceParts = line.split(":", 2);
                if (thisHeaderPieceParts.length == 2) {
                    // the key is matched case-insensitively and by prefix
                    if (thisHeaderPieceParts[0].toLowerCase().startsWith("content-length")) {
                        foundContentLength = true;
                        try {
                            contentLength = Integer.parseInt(thisHeaderPieceParts[1].trim());
                        } catch (NumberFormatException nfEx) {
                            // unparsable length: treated like a missing one below
                            contentLength = -1;
                        }
                    }
                }
            }
        }


        // missing, unparsable or negative Content-Length
        if (contentLength < 0) {
            return null;
        }




        // now read the bytes of the content
        retContent = new byte[contentLength];
        int totalWant = contentLength;
        int totalRead = 0;
        // read() may deliver fewer bytes than requested, hence the loop
        while (totalRead < contentLength) {
            try {
                int numRead = in.read(retContent, totalRead, totalWant);
                if (numRead < 0) {
                    // end of stream before the announced length: record is dropped
                    return null;
                } else {
                    currentPosition += numRead;
                    totalRead += numRead;
                    totalWant = contentLength - totalRead;
                } // end if (numRead < 0) / else
            } catch (EOFException eofEx) {
                // NOTE(review): read() normally reports end of stream by returning
                // -1 (handled above), so this branch presumably only runs if the
                // underlying stream throws EOFException itself. The stop marker is
                // not set on this path.
                // resize to what we have
                if (totalRead > 0) {
                    byte[] newReturn = new byte[totalRead];
                    System.arraycopy(retContent, 0, newReturn, 0, totalRead);
                    return newReturn;
                } else {
                    return null;
                }
            } // end try/catch (EOFException)
        } // end while (totalRead < contentLength)


        // record the end marker difference
        // (bytes consumed since the start marker, narrowed to int)
        marker.stopMarkerDifference = (int) (currentPosition - marker.startMarker);


        return retContent;
    }


    /**
     * Reads in a WARC record from a data input stream
     *
     * @param in the input stream
     * @return a WARC record (or null if eof)
     * @throws java.io.IOException
     */


    public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
        StringBuffer recordHeader = new StringBuffer();


        // start marker and stop marker difference for positions in stream
        Marker marker = new Marker();


        byte[] recordContent = readNextRecord(in, recordHeader, marker);


        if (recordContent == null) {
            return null;
        }


        // extract out our header information
        String thisHeaderString = recordHeader.toString();
        String[] headerLines = thisHeaderString.split(NEWLINE);


        WarcRecord retRecord = new WarcRecord();
        for (int i = 0; i < headerLines.length; i++) {
            String[] pieces = headerLines[i].split(":", 2);
            if (pieces.length != 2) {
                retRecord.addHeaderMetadata(pieces[0], "");
                continue;
            }
            String thisKey = pieces[0].trim();
            String thisValue = pieces[1].trim();


            // check for known keys
            if (thisKey.equals("WARC-Type")) {
                retRecord.setWarcRecordType(thisValue);
            } else if (thisKey.equals("WARC-Date")) {
                retRecord.setWarcDate(thisValue);
            } else if (thisKey.equals("WARC-Record-ID")) {
                retRecord.setWarcUUID(thisValue);
            } else if (thisKey.equals("Content-Type")) {
                retRecord.setWarcContentType(thisValue);
            } else {
                retRecord.addHeaderMetadata(thisKey, thisValue);
            }
        }


        // set the content
        if (readContent) retRecord.setContent(recordContent);


        // store start and stop markers
        if (marker.isValid()) {
            retRecord.setStartMarker(marker.startMarker);
            retRecord.setStopMarkerDiff(marker.stopMarkerDifference);
        }


        return retRecord;
    }


    /**
     * Warc header class
     */
    public class WarcHeader {
        public String contentType = "";
        public String UUID = "";
        public String dateString = "";
        public String recordType = "";
        public HashMap<String, String> metadata = new HashMap<String, String>();
        public int contentLength = 0;


        /**
         * Default constructor
         */
        public WarcHeader() {
        }


        /**
         * Copy Constructor
         *
         * @param o other WARC header
         */
        public WarcHeader(WarcHeader o) {
            this.contentType = o.contentType;
            this.UUID = o.UUID;
            this.dateString = o.dateString;
            this.recordType = o.recordType;
            this.metadata.putAll(o.metadata);
            this.contentLength = o.contentLength;
        }


        /**
         * Serialization output
         *
         * @param out the data output stream
         * @throws java.io.IOException
         */
        public void write(DataOutput out) throws IOException {
            out.writeUTF(contentType);
            out.writeUTF(UUID);
            out.writeUTF(dateString);
            out.writeUTF(recordType);
            out.writeInt(metadata.size());
            Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
            while (metadataIterator.hasNext()) {
                Entry<String, String> thisEntry = metadataIterator.next();
                out.writeUTF(thisEntry.getKey());
                out.writeUTF(thisEntry.getValue());
            }
            out.writeInt(contentLength);
        }


        /**
         * Serialization input
         *
         * @param in the data input stream
         * @throws java.io.IOException
         */
        public void readFields(DataInput in) throws IOException {
            contentType = in.readUTF();
            UUID = in.readUTF();
            dateString = in.readUTF();
            recordType = in.readUTF();
            metadata.clear();
            int numMetaItems = in.readInt();
            for (int i = 0; i < numMetaItems; i++) {
                String thisKey = in.readUTF();
                String thisValue = in.readUTF();
                metadata.put(thisKey, thisValue);
            }
            contentLength = in.readInt();
        }


        @Override
        public String toString() {
            StringBuffer retBuffer = new StringBuffer();


            retBuffer.append(WARC_VERSION);
            retBuffer.append(NEWLINE);


            retBuffer.append("WARC-Type: " + recordType + NEWLINE);
            retBuffer.append("WARC-Date: " + dateString + NEWLINE);


            retBuffer.append("WARC-Record-ID: " + UUID + NEWLINE);
            Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
            while (metadataIterator.hasNext()) {
                Entry<String, String> thisEntry = metadataIterator.next();
                retBuffer.append(thisEntry.getKey());
                retBuffer.append(": ");
                retBuffer.append(thisEntry.getValue());
                retBuffer.append(NEWLINE);
            }


            retBuffer.append("Content-Type: " + contentType + NEWLINE);
            retBuffer.append("Content-Length: " + contentLength + NEWLINE);


            return retBuffer.toString();
        }
    }


    private WarcHeader warcHeader = new WarcHeader();
    private byte[] warcContent = null;
    private String warcFilePath = "";


    /**
     * Default Constructor
     */
    public WarcRecord() {
    }


    /**
     * Copy Constructor
     *
     * @param o
     */
    public WarcRecord(WarcRecord o) {
        this.warcHeader = new WarcHeader(o.warcHeader);
        this.warcContent = o.warcContent;
    }


    /**
     * Retrieves the total record length (header and content)
     *
     * @return total record length
     */
    public int getTotalRecordLength() {
        int headerLength = warcHeader.toString().length();
        return (headerLength + warcContent.length);
    }


    /**
     * Sets the record content (copy)
     *
     * @param o record to copy from
     */
    public void set(WarcRecord o) {
        this.warcHeader = new WarcHeader(o.warcHeader);
        this.warcContent = o.warcContent;
        this.startMarker = o.getStartMarker();
        this.stopMarkerDiff = o.getStopMarkerDiff();
    }


    /**
     * Gets the file path from this WARC file (if set)
     *
     * @return
     */
    public String getWarcFilePath() {
        return warcFilePath;
    }


    /**
     * Sets the warc file path (optional - for use with getWarcFilePath)
     *
     * @param path
     */
    public void setWarcFilePath(String path) {
        warcFilePath = path;
    }


    /**
     * Sets the record type string
     *
     * @param recordType
     */
    public void setWarcRecordType(String recordType) {
        warcHeader.recordType = recordType;
    }


    /**
     * Sets the content type string
     *
     * @param contentType
     */
    public void setWarcContentType(String contentType) {
        warcHeader.contentType = contentType;
    }


    /**
     * Sets the WARC header date string
     *
     * @param dateString
     */
    public void setWarcDate(String dateString) {
        warcHeader.dateString = dateString;
    }


    /**
     * Sets the WARC uuid string
     *
     * @param UUID
     */
    public void setWarcUUID(String UUID) {
        warcHeader.UUID = UUID;
    }


    /**
     * Adds a key/value pair to a WARC header. This is needed to filter
     * out known keys
     *
     * @param key
     * @param value
     */
    public void addHeaderMetadata(String key, String value) {
        // don't allow addition of known keys
        if (key.equals("WARC-Type")) {
            return;
        }
        if (key.equals("WARC-Date")) {
            return;
        }
        if (key.equals("WARC-Record-ID")) {
            return;
        }
        if (key.equals("Content-Type")) {
            return;
        }
        if (key.equals("Content-Length")) {
            return;
        }


        warcHeader.metadata.put(key, value);
    }


    /**
     * Clears all metadata items from a header
     */
    public void clearHeaderMetadata() {
        warcHeader.metadata.clear();
    }


    /**
     * Gets the set of metadata items from the header
     *
     * @return
     */
    public Set<Entry<String, String>> getHeaderMetadata() {
        return warcHeader.metadata.entrySet();
    }


    /**
     * Gets a value for a specific header metadata key
     *
     * @param key
     * @return
     */
    public String getHeaderMetadataItem(String key) {
        return warcHeader.metadata.get(key);
    }


    /**
     * Sets the byte content for this record
     *
     * @param content
     */
    public void setContent(byte[] content) {
        warcContent = content;
        warcHeader.contentLength = content.length;
    }


    /**
     * Sets the byte content for this record
     *
     * @param content
     */
    public void setContent(String content) {
        setContent(content.getBytes());
    }


    /**
     * Restrieves the byte content for this record
     *
     * @return
     */
    public byte[] getContent() {
        return warcContent;
    }


    /**
     * Retrieves the bytes content as a UTF-8 string
     *
     * @return
     */
    public String getContentUTF8() {
        String retString = null;
        try {
            retString = new String(warcContent, "UTF-8");
        } catch (UnsupportedEncodingException ex) {
            retString = new String(warcContent);
        }
        return retString;
    }


    /**
     * Gets the header record type string
     *
     * @return
     */
    public String getHeaderRecordType() {
        return warcHeader.recordType;
    }


    @Override
    public String toString() {
        StringBuffer retBuffer = new StringBuffer();
        retBuffer.append(warcHeader.toString());
        retBuffer.append(NEWLINE);
        retBuffer.append(warcContent);


        if (startMarker > -1 && stopMarkerDiff > -1) {
            long endMarker = startMarker + stopMarkerDiff;
            retBuffer.append(NEWLINE);
            retBuffer.append("Start pos: " + startMarker
                    + "  End pos: " + endMarker
                    + "  Length: " + stopMarkerDiff);
        }
        retBuffer.append(NEWLINE);
        retBuffer.append(NEWLINE);


        return retBuffer.toString();
    }


    /**
     * Gets the WARC header as a string
     *
     * @return
     */
    public String getHeaderString() {
        return warcHeader.toString();
    }


    /**
     * Serialization output
     *
     * @param out
     * @throws java.io.IOException
     */
    public void write(DataOutput out) throws IOException {
        warcHeader.write(out);
        out.write(warcContent);
    }


    /**
     * Serialization input
     *
     * @param in
     * @throws java.io.IOException
     */
    public void readFields(DataInput in) throws IOException {
        warcHeader.readFields(in);
        int contentLengthBytes = warcHeader.contentLength;
        warcContent = new byte[contentLengthBytes];
        in.readFully(warcContent);
    }


    /**
     * Use for testing purposes
     *
     * @param args Command line arguments
     */
    public static void main(String[] args) throws IOException {
        String filename = args[0];
        BlockCompressedInputStream stream =
                new BlockCompressedInputStream(new File(filename));
        DataInputStream in = new DataInputStream(stream);
        WarcRecord warcRecord = null;
        Vector<long[]> markers = new Vector<long[]>();
        while ((warcRecord = readNextWarcRecord(in)) != null) {
            //System.out.println(warcRecord.toString());
            markers.add(new long[]{warcRecord.getStartMarker(), warcRecord.getStopMarker()});
        }
        stream.close();


        // check if we can read the substreams from the markers
        for (Iterator<long[]> iterator = markers.iterator(); iterator.hasNext(); ) {
            long[] ls = iterator.next();
            stream =
                    new BlockCompressedInputStream(new File(filename));
            SegmentedInputStream sis = new SegmentedInputStream(stream, ls[0], ls[1]);
            in = new DataInputStream(sis);
            while ((warcRecord = readNextWarcRecord(in)) != null) {
                System.out.println(warcRecord.toString());
                WarcHTMLResponseRecord w = new WarcHTMLResponseRecord(warcRecord);
                if (w.isHTMLResponse()) {
                    //System.out.println(w.getHTMLContent());


                    // See how the parsed content looks like
                    BulletParser parser = new BulletParser(TRECParsingFactory.INSTANCE);
                    ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
                    StructuredTextExtractor textExtractor = new StructuredTextExtractor();
                    composedBuilder.add(textExtractor);
                    parser.setCallback(composedBuilder.compose());
                    parser.parse(w.getHTMLContent().toCharArray());
                    System.out.println(textExtractor.getText());
                }
            }
            in.close();
            stream.close();
        }


    }


}
// Source code of net.bpiwowar.mg4j.extensions.warc.WarcRecord$Marker
//
// Related classes of net.bpiwowar.mg4j.extensions.warc.WarcRecord$Marker