/**
* Container for a generic Warc Record
*
* (C) 2009 - Carnegie Mellon University
*
* 1. Redistributions of this source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The names "Lemur", "Indri", "University of Massachusetts",
* "Carnegie Mellon", and "lemurproject" must not be used to
* endorse or promote products derived from this software without
* prior written permission. To obtain permission, contact
* license@lemurproject.org.
*
* 4. Products derived from this software may not be called "Lemur" or "Indri"
* nor may "Lemur" or "Indri" appear in their names without prior written
* permission of The Lemur Project. To obtain permission,
* contact license@lemurproject.org.
*
* THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
* PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
* NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* @author mhoy@cs.cmu.edu (Mark J. Hoy)
*
* Some extensions were made to keep track of positions in stream. Stop and
* start markers to mark the position in a stream where a WARC record starts and
* ends were added.
* @author ingo@dcs.gla.ac.uk (Ingo Frommholz)
*/
package net.bpiwowar.mg4j.extensions.warc;
import it.unimi.dsi.io.SegmentedInputStream;
import it.unimi.dsi.parser.BulletParser;
import it.unimi.dsi.parser.callback.ComposedCallbackBuilder;
import net.bpiwowar.mg4j.extensions.trec.TRECParsingFactory;
import net.bpiwowar.mg4j.extensions.utils.StructuredTextExtractor;
import net.sf.samtools.util.BlockCompressedInputStream;
import java.io.*;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Vector;
public class WarcRecord {
public static String WARC_VERSION = "WARC/0.18";
public static String WARC_VERSION_LINE = "WARC/0.18\n";
private static String NEWLINE = "\n";
private static byte MASK_THREE_BYTE_CHAR = (byte) (0xE0);
private static byte MASK_TWO_BYTE_CHAR = (byte) (0xC0);
private static byte MASK_TOPMOST_BIT = (byte) (0x80);
private static byte MASK_BOTTOM_SIX_BITS = (byte) (0x1F);
private static byte MASK_BOTTOM_FIVE_BITS = (byte) (0x3F);
private static byte MASK_BOTTOM_FOUR_BITS = (byte) (0x0F);
/**
* Flag that controls if we read or drop the content
*/
private static boolean readContent = true;
/**
* Set the flag that controls if we read or drop the content we read from
* the stream. If this is set to <code>false</code>, no content will be read
* (just some important headers), for instance if we are just interested in
* the recod boundaries in a stream (e.g. when working with MG4J). This has
* consequences on what is returned with, e.g.,
* {@link #readNextWarcRecord(java.io.DataInputStream)} Content is read by default.
*
* @param readC <code>true</code> if content should be read,
* <code>false</code> otherwise.
* @return the previous state of the readContent flag
*/
public static boolean readContent(boolean readC) {
boolean oldFlag = readContent;
readContent = readC;
return oldFlag;
}
/*
* Some new stuff to support start and end markers in stream
*/
/**
* A pointer indicating the current position (in bytes) in the stream
* (starting with 0)
*/
private static long currentPosition = 0;
/**
* Sets the current position pointer to 0. Invoke this before parsing
* a new WARC file.
*/
public static void newFile() {
currentPosition = 0;
}
/**
* The start position in the stream
*/
private long startMarker = -1;
/**
* Sets the start marker, the position in the stream where the WARC record
* starts.
*
* @param startMarker the start marker or -1 if this value is undefined
*/
public void setStartMarker(long startMarker) {
this.startMarker = startMarker;
}
/**
* Sets the stop marker difference, the difference between the position in
* the stream where the WARC record starts and where it ends.
*
* @param stopMarkerDiff
*/
public void setStopMarkerDiff(int stopMarkerDiff) {
this.stopMarkerDiff = stopMarkerDiff;
}
/**
* startMarker - stopMarker, where the WARC record ends in the stream
*/
private int stopMarkerDiff = -1;
/**
* Gets the difference between the start and stop marker
*
* @return the difference between start and stop marker or -1 if this value
* is undefined
*/
public int getStopMarkerDiff() {
return stopMarkerDiff;
}
/**
* Gets the stop marker, the position in the stream where the WARC record
* end.
*
* @return the stop marker or -1 if this value is undefined
*/
public long getStopMarker() {
if (startMarker > -1 && stopMarkerDiff > -1)
return startMarker + stopMarkerDiff;
else return -1;
}
/**
* Gets the start marker, the position in the stream where the WARC record
* starts.
*
* @return the start marker
*/
public long getStartMarker() {
return startMarker;
}
/**
* Helper class to store start and end markers
*
* @author <a href="mailto:ingo@dcs.gla.ac.uk">Ingo Frommholz</a>
*/
private static class Marker {
private long startMarker = -1;
private int stopMarkerDifference = -1;
/**
* Checks if the start and stop markers are valid ones.
*
* @return <code>true</code> if the markers are valid
*/
private boolean isValid() {
return startMarker > -1 && stopMarkerDifference > -1;
}
}
/*
* End of start and end marker stuff
*/
/**
* Our read line implementation. We cannot allow buffering here (for gzip streams)
* so, we need to use DataInputStream. Also - we need to account for java's
* UTF8 implementation
*
* @param in the input data stream
* @return the read line (or null if eof)
* @throws java.io.IOException
*/
private static String readLineFromInputStream(DataInputStream in) throws IOException {
StringBuilder retString = new StringBuilder();
boolean keepReading = true;
try {
do {
char thisChar = 0;
byte readByte = in.readByte();
currentPosition++;
// check to see if it's a multibyte character
if ((readByte & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) {
// need to read the next 2 bytes
if (in.available() < 2) {
// treat these all as individual characters
retString.append((char) readByte);
int numAvailable = in.available();
for (int i = 0; i < numAvailable; i++) {
retString.append((char) (in.readByte()));
currentPosition++;
}
continue;
}
byte secondByte = in.readByte();
currentPosition++;
byte thirdByte = in.readByte();
currentPosition++;
// ensure the topmost bit is set
if (((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) || ((thirdByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT)) {
//treat these as individual characters
retString.append((char) readByte);
retString.append((char) secondByte);
retString.append((char) thirdByte);
continue;
}
int finalVal = (thirdByte & MASK_BOTTOM_FIVE_BITS) + 64 * (secondByte & MASK_BOTTOM_FIVE_BITS) + 4096 * (readByte & MASK_BOTTOM_FOUR_BITS);
thisChar = (char) finalVal;
} else if ((readByte & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) {
// need to read next byte
if (in.available() < 1) {
// treat this as individual characters
retString.append((char) readByte);
continue;
}
byte secondByte = in.readByte();
currentPosition++;
if ((secondByte & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) {
retString.append((char) readByte);
retString.append((char) secondByte);
continue;
}
int finalVal = (secondByte & MASK_BOTTOM_FIVE_BITS) + 64 * (readByte & MASK_BOTTOM_SIX_BITS);
thisChar = (char) finalVal;
} else {
// interpret it as a single byte
thisChar = (char) readByte;
}
if (thisChar == '\n') {
keepReading = false;
} else {
retString.append(thisChar);
}
} while (keepReading);
} catch (EOFException eofEx) {
return null;
}
if (retString.length() == 0) {
return "";
}
return retString.toString();
}
/**
* The actual heavy lifting of reading in the next WARC record
*
* @param in the data input stream
* @param headerBuffer a blank string buffer to contain the WARC header
* @param marker to contain the start marker and the difference to the stop
* marker of the WARC record in the stream
* @return the content bytes (w/ the headerBuffer populated)
* @throws java.io.IOException
*/
private static byte[] readNextRecord(DataInputStream in,
StringBuffer headerBuffer,
Marker marker) throws IOException {
if (in == null) {
return null;
}
if (headerBuffer == null) {
return null;
}
String line = null;
boolean foundMark = false;
boolean inHeader = true;
byte[] retContent = null;
// cannot be using a buffered reader here!!!!
// just read the header
// first - find our WARC header
// record the decision in stream before the WARC record is read
marker.startMarker = currentPosition;
while ((!foundMark) && ((line = readLineFromInputStream(in)) != null)) {
if (line.startsWith(WARC_VERSION)) {
foundMark = true;
} else marker.startMarker = currentPosition;
}
// no WARC mark?
if (!foundMark) {
return null;
}
// then read to the first newline
// make sure we get the content length here
int contentLength = -1;
boolean foundContentLength = false;
while (!foundContentLength && inHeader && ((line = readLineFromInputStream(in)) != null)) {
if ((line.trim().length() == 0) && foundContentLength) {
inHeader = false;
} else {
headerBuffer.append(line);
headerBuffer.append(NEWLINE);
String[] thisHeaderPieceParts = line.split(":", 2);
if (thisHeaderPieceParts.length == 2) {
if (thisHeaderPieceParts[0].toLowerCase().startsWith("content-length")) {
foundContentLength = true;
try {
contentLength = Integer.parseInt(thisHeaderPieceParts[1].trim());
} catch (NumberFormatException nfEx) {
contentLength = -1;
}
}
}
}
}
if (contentLength < 0) {
return null;
}
// now read the bytes of the content
retContent = new byte[contentLength];
int totalWant = contentLength;
int totalRead = 0;
while (totalRead < contentLength) {
try {
int numRead = in.read(retContent, totalRead, totalWant);
if (numRead < 0) {
return null;
} else {
currentPosition += numRead;
totalRead += numRead;
totalWant = contentLength - totalRead;
} // end if (numRead < 0) / else
} catch (EOFException eofEx) {
// resize to what we have
if (totalRead > 0) {
byte[] newReturn = new byte[totalRead];
System.arraycopy(retContent, 0, newReturn, 0, totalRead);
return newReturn;
} else {
return null;
}
} // end try/catch (EOFException)
} // end while (totalRead < contentLength)
// record the end marker difference
marker.stopMarkerDifference = (int) (currentPosition - marker.startMarker);
return retContent;
}
/**
* Reads in a WARC record from a data input stream
*
* @param in the input stream
* @return a WARC record (or null if eof)
* @throws java.io.IOException
*/
public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
StringBuffer recordHeader = new StringBuffer();
// start marker and stop marker difference for positions in stream
Marker marker = new Marker();
byte[] recordContent = readNextRecord(in, recordHeader, marker);
if (recordContent == null) {
return null;
}
// extract out our header information
String thisHeaderString = recordHeader.toString();
String[] headerLines = thisHeaderString.split(NEWLINE);
WarcRecord retRecord = new WarcRecord();
for (int i = 0; i < headerLines.length; i++) {
String[] pieces = headerLines[i].split(":", 2);
if (pieces.length != 2) {
retRecord.addHeaderMetadata(pieces[0], "");
continue;
}
String thisKey = pieces[0].trim();
String thisValue = pieces[1].trim();
// check for known keys
if (thisKey.equals("WARC-Type")) {
retRecord.setWarcRecordType(thisValue);
} else if (thisKey.equals("WARC-Date")) {
retRecord.setWarcDate(thisValue);
} else if (thisKey.equals("WARC-Record-ID")) {
retRecord.setWarcUUID(thisValue);
} else if (thisKey.equals("Content-Type")) {
retRecord.setWarcContentType(thisValue);
} else {
retRecord.addHeaderMetadata(thisKey, thisValue);
}
}
// set the content
if (readContent) retRecord.setContent(recordContent);
// store start and stop markers
if (marker.isValid()) {
retRecord.setStartMarker(marker.startMarker);
retRecord.setStopMarkerDiff(marker.stopMarkerDifference);
}
return retRecord;
}
/**
* Warc header class
*/
public class WarcHeader {
public String contentType = "";
public String UUID = "";
public String dateString = "";
public String recordType = "";
public HashMap<String, String> metadata = new HashMap<String, String>();
public int contentLength = 0;
/**
* Default constructor
*/
public WarcHeader() {
}
/**
* Copy Constructor
*
* @param o other WARC header
*/
public WarcHeader(WarcHeader o) {
this.contentType = o.contentType;
this.UUID = o.UUID;
this.dateString = o.dateString;
this.recordType = o.recordType;
this.metadata.putAll(o.metadata);
this.contentLength = o.contentLength;
}
/**
* Serialization output
*
* @param out the data output stream
* @throws java.io.IOException
*/
public void write(DataOutput out) throws IOException {
out.writeUTF(contentType);
out.writeUTF(UUID);
out.writeUTF(dateString);
out.writeUTF(recordType);
out.writeInt(metadata.size());
Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
while (metadataIterator.hasNext()) {
Entry<String, String> thisEntry = metadataIterator.next();
out.writeUTF(thisEntry.getKey());
out.writeUTF(thisEntry.getValue());
}
out.writeInt(contentLength);
}
/**
* Serialization input
*
* @param in the data input stream
* @throws java.io.IOException
*/
public void readFields(DataInput in) throws IOException {
contentType = in.readUTF();
UUID = in.readUTF();
dateString = in.readUTF();
recordType = in.readUTF();
metadata.clear();
int numMetaItems = in.readInt();
for (int i = 0; i < numMetaItems; i++) {
String thisKey = in.readUTF();
String thisValue = in.readUTF();
metadata.put(thisKey, thisValue);
}
contentLength = in.readInt();
}
@Override
public String toString() {
StringBuffer retBuffer = new StringBuffer();
retBuffer.append(WARC_VERSION);
retBuffer.append(NEWLINE);
retBuffer.append("WARC-Type: " + recordType + NEWLINE);
retBuffer.append("WARC-Date: " + dateString + NEWLINE);
retBuffer.append("WARC-Record-ID: " + UUID + NEWLINE);
Iterator<Entry<String, String>> metadataIterator = metadata.entrySet().iterator();
while (metadataIterator.hasNext()) {
Entry<String, String> thisEntry = metadataIterator.next();
retBuffer.append(thisEntry.getKey());
retBuffer.append(": ");
retBuffer.append(thisEntry.getValue());
retBuffer.append(NEWLINE);
}
retBuffer.append("Content-Type: " + contentType + NEWLINE);
retBuffer.append("Content-Length: " + contentLength + NEWLINE);
return retBuffer.toString();
}
}
private WarcHeader warcHeader = new WarcHeader();
private byte[] warcContent = null;
private String warcFilePath = "";
/**
* Default Constructor
*/
public WarcRecord() {
}
/**
* Copy Constructor
*
* @param o
*/
public WarcRecord(WarcRecord o) {
this.warcHeader = new WarcHeader(o.warcHeader);
this.warcContent = o.warcContent;
}
/**
* Retrieves the total record length (header and content)
*
* @return total record length
*/
public int getTotalRecordLength() {
int headerLength = warcHeader.toString().length();
return (headerLength + warcContent.length);
}
/**
* Sets the record content (copy)
*
* @param o record to copy from
*/
public void set(WarcRecord o) {
this.warcHeader = new WarcHeader(o.warcHeader);
this.warcContent = o.warcContent;
this.startMarker = o.getStartMarker();
this.stopMarkerDiff = o.getStopMarkerDiff();
}
/**
* Gets the file path from this WARC file (if set)
*
* @return
*/
public String getWarcFilePath() {
return warcFilePath;
}
/**
* Sets the warc file path (optional - for use with getWarcFilePath)
*
* @param path
*/
public void setWarcFilePath(String path) {
warcFilePath = path;
}
/**
* Sets the record type string
*
* @param recordType
*/
public void setWarcRecordType(String recordType) {
warcHeader.recordType = recordType;
}
/**
* Sets the content type string
*
* @param contentType
*/
public void setWarcContentType(String contentType) {
warcHeader.contentType = contentType;
}
/**
* Sets the WARC header date string
*
* @param dateString
*/
public void setWarcDate(String dateString) {
warcHeader.dateString = dateString;
}
/**
* Sets the WARC uuid string
*
* @param UUID
*/
public void setWarcUUID(String UUID) {
warcHeader.UUID = UUID;
}
/**
* Adds a key/value pair to a WARC header. This is needed to filter
* out known keys
*
* @param key
* @param value
*/
public void addHeaderMetadata(String key, String value) {
// don't allow addition of known keys
if (key.equals("WARC-Type")) {
return;
}
if (key.equals("WARC-Date")) {
return;
}
if (key.equals("WARC-Record-ID")) {
return;
}
if (key.equals("Content-Type")) {
return;
}
if (key.equals("Content-Length")) {
return;
}
warcHeader.metadata.put(key, value);
}
/**
* Clears all metadata items from a header
*/
public void clearHeaderMetadata() {
warcHeader.metadata.clear();
}
/**
* Gets the set of metadata items from the header
*
* @return
*/
public Set<Entry<String, String>> getHeaderMetadata() {
return warcHeader.metadata.entrySet();
}
/**
* Gets a value for a specific header metadata key
*
* @param key
* @return
*/
public String getHeaderMetadataItem(String key) {
return warcHeader.metadata.get(key);
}
/**
* Sets the byte content for this record
*
* @param content
*/
public void setContent(byte[] content) {
warcContent = content;
warcHeader.contentLength = content.length;
}
/**
* Sets the byte content for this record
*
* @param content
*/
public void setContent(String content) {
setContent(content.getBytes());
}
/**
* Restrieves the byte content for this record
*
* @return
*/
public byte[] getContent() {
return warcContent;
}
/**
* Retrieves the bytes content as a UTF-8 string
*
* @return
*/
public String getContentUTF8() {
String retString = null;
try {
retString = new String(warcContent, "UTF-8");
} catch (UnsupportedEncodingException ex) {
retString = new String(warcContent);
}
return retString;
}
/**
* Gets the header record type string
*
* @return
*/
public String getHeaderRecordType() {
return warcHeader.recordType;
}
@Override
public String toString() {
StringBuffer retBuffer = new StringBuffer();
retBuffer.append(warcHeader.toString());
retBuffer.append(NEWLINE);
retBuffer.append(warcContent);
if (startMarker > -1 && stopMarkerDiff > -1) {
long endMarker = startMarker + stopMarkerDiff;
retBuffer.append(NEWLINE);
retBuffer.append("Start pos: " + startMarker
+ " End pos: " + endMarker
+ " Length: " + stopMarkerDiff);
}
retBuffer.append(NEWLINE);
retBuffer.append(NEWLINE);
return retBuffer.toString();
}
/**
* Gets the WARC header as a string
*
* @return
*/
public String getHeaderString() {
return warcHeader.toString();
}
/**
* Serialization output
*
* @param out
* @throws java.io.IOException
*/
public void write(DataOutput out) throws IOException {
warcHeader.write(out);
out.write(warcContent);
}
/**
* Serialization input
*
* @param in
* @throws java.io.IOException
*/
public void readFields(DataInput in) throws IOException {
warcHeader.readFields(in);
int contentLengthBytes = warcHeader.contentLength;
warcContent = new byte[contentLengthBytes];
in.readFully(warcContent);
}
/**
* Use for testing purposes
*
* @param args Command line arguments
*/
public static void main(String[] args) throws IOException {
String filename = args[0];
BlockCompressedInputStream stream =
new BlockCompressedInputStream(new File(filename));
DataInputStream in = new DataInputStream(stream);
WarcRecord warcRecord = null;
Vector<long[]> markers = new Vector<long[]>();
while ((warcRecord = readNextWarcRecord(in)) != null) {
//System.out.println(warcRecord.toString());
markers.add(new long[]{warcRecord.getStartMarker(), warcRecord.getStopMarker()});
}
stream.close();
// check if we can read the substreams from the markers
for (Iterator<long[]> iterator = markers.iterator(); iterator.hasNext(); ) {
long[] ls = iterator.next();
stream =
new BlockCompressedInputStream(new File(filename));
SegmentedInputStream sis = new SegmentedInputStream(stream, ls[0], ls[1]);
in = new DataInputStream(sis);
while ((warcRecord = readNextWarcRecord(in)) != null) {
System.out.println(warcRecord.toString());
WarcHTMLResponseRecord w = new WarcHTMLResponseRecord(warcRecord);
if (w.isHTMLResponse()) {
//System.out.println(w.getHTMLContent());
// See how the parsed content looks like
BulletParser parser = new BulletParser(TRECParsingFactory.INSTANCE);
ComposedCallbackBuilder composedBuilder = new ComposedCallbackBuilder();
StructuredTextExtractor textExtractor = new StructuredTextExtractor();
composedBuilder.add(textExtractor);
parser.setCallback(composedBuilder.compose());
parser.parse(w.getHTMLContent().toCharArray());
System.out.println(textExtractor.getText());
}
}
in.close();
stream.close();
}
}
}