Package org.commoncrawl.hadoop.mapred

Source Code of org.commoncrawl.hadoop.mapred.ArcRecord$ByteArraySessionInputBuffer

package org.commoncrawl.hadoop.mapred;

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.EOFException;
import java.io.InputStream;
import java.io.IOException;
import java.lang.IllegalArgumentException;
import java.lang.Integer;
import java.lang.Math;
import java.lang.NumberFormatException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;

// Hadoop classes
import org.apache.hadoop.io.Writable;

// Apache log4j classes
import org.apache.log4j.Logger;

// Apache HTTP Components classes
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.entity.InputStreamEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.DefaultHttpResponseFactory;
import org.apache.http.impl.io.AbstractSessionInputBuffer;
import org.apache.http.impl.io.DefaultHttpResponseParser;
import org.apache.http.io.SessionInputBuffer;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicHeaderValueParser;
import org.apache.http.message.BasicLineParser;
import org.apache.http.params.BasicHttpParams;

// Jsoup classes
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
* An entry in an ARC (Internet Archive) data file.
*
* @author Chris Stephens
*/
public class ArcRecord
    implements Writable {

  private static final Logger LOG = Logger.getLogger(ArcRecord.class);

  // ARC v1 metadata
  private String _url;
  private String _ipAddress;
  private Date   _archiveDate;
  private String _contentType;
  private int    _contentLength;

  // ARC v2 metadata
//private int    _resultCode;
//private String _checksum;
//private String _location;
//private long   _offset;
//private String _filename;

  private byte[] _payload;

  private HttpResponse _httpResponse;

  private int _httpContentStart;

  /**
   * <p>Creates an empty ARC record.</p>
   */
  public ArcRecord() { }

  private void _clear() {
    this._url = null;
    this._ipAddress = null;
    this._archiveDate = null;
    this._contentType = null;
    this._contentLength = 0;
    this._payload = null;
    this._httpResponse = null;
  }

  private String _readLine(InputStream in)
      throws IOException, EOFException {

    StringBuffer line = new StringBuffer(128);

    // read a line of content
    int b = in.read();
    int n = 1;

    // if -1 is returned, we are at EOF
    if (b == -1)
      throw new EOFException();

    // read until an NL
    do {

      if (((char) b) == '\n')
        break;

      line.append((char) b);

      b = in.read();
      n++;
    }
    while (b != -1);

    return line.toString();
  }

  /**
   * <p>Parses the ARC record header and payload (content) from a stream.</p>
   *
   * @return TRUE if the ARC record was parsed and loaded successfully, FALSE if not.
   */
  public boolean readFrom(InputStream in)
      throws IOException, EOFException {

    if (in == null) {
      LOG.error("ArcRecord cannot be created from NULL/missing input stream.");
      return false;
    }

    // Clear any current values assigned to the object.
    this._clear();

    // Read the ARC header from the stream.
    String arcRecordHeader = this._readLine(in);

    try {
      this.setArcRecordHeader(arcRecordHeader);
      this.setPayload(in);
    }
    catch (IOException ex) {
      throw ex;
    }
    catch (Exception ex) {
      LOG.error("Exception thrown while parsing ARC record", ex);
      return false;
    }
    
    return true;
  }

  /**
   * <p>Parses and sets the ARC record header fields.</p>
   * <p>Currently, this method expects the ARC record header string to contain
   * the following fields, in order, separated by space:
   * <ul>
   * <li>URL</li>
   * <li>IP Address</li>
   * <li>Archive Date</li>
   * <li>Content Type</li>
   * <li>Content Length</li>
   * </ul>
   * </p>
   * <p>For more information on the arc file format, see
   * {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.</p>
   *
   * @param arcRecordHeader The first line of an ARC file entry - the header
   *                        line for an ARC file item.
   */
  public void setArcRecordHeader(String arcRecordHeader)
      throws IllegalArgumentException, ParseException {

    if (arcRecordHeader == null || arcRecordHeader.equals(""))
      throw new IllegalArgumentException("ARC v1 record header string is empty.");

    String[] metadata = arcRecordHeader.split(" ");

    if (metadata.length != 5) {
      LOG.info(" [ "+arcRecordHeader+" ] ");
      throw new IllegalArgumentException("ARC v1 record header must be 5 fields.");
    }

    SimpleDateFormat format = new SimpleDateFormat("yyyyMMddHHmmss");

    this._url            =  metadata[0];
    this._ipAddress      =  metadata[1];
    this._archiveDate    =  format.parse(metadata[2]);
    this._contentType    =  metadata[3];
    this._contentLength  = (new Integer(metadata[4])).intValue();
  }

  /**
   * <p>Reads and sets the ARC record payload from an input stream.</p>
   *
   * @param in An input stream positioned at the start of the ARC record payload.
   */
  public void setPayload(InputStream in)
      throws IllegalArgumentException, ParseException, IOException {

    if (in == null)
      throw new IllegalArgumentException("ArcRecord cannot be created from NULL/missing input stream.");

    int bufferSize = this._contentLength;

    this._payload = new byte[bufferSize];

    int n = in.read(this._payload, 0, this._payload.length);

    if (n < this._payload.length) {
      LOG.warn("Expecting "+bufferSize+" bytes in ARC record payload, found "+n+" bytes.  Performing array copy.");
      this._payload = Arrays.copyOf(this._payload, n);
    }

    // After this, we should be at the end of this GZIP member.  Let the
    // calling function verify the position of the stream.
  }

  public void addToPayload(byte[] data) {
    this.addToPayload(data, data.length);
  }

  public void addToPayload(byte[] data, int length) {

    LOG.warn("Content Length must have been incorrect - someone needed to add more data to the payload.");

    if (this._payload == null) {
      this._payload = Arrays.copyOf(data, length);
    }
    else {
      int i = this._payload.length;
      int n = this._payload.length + length;

      // resize the payload buffer
      this._payload = Arrays.copyOf(this._payload, n);

      // copy in the additional data
      System.arraycopy(data, 0, this._payload, i, length);
    }
  }

  /**
   * {@inheritDoc}
   */
  public String toString() {
    return this._url + " - " + this._archiveDate.toString() + " - " + this._contentType;
  }

  /**
   * {@inheritDoc}
   */
  public void write(DataOutput out)
      throws IOException {

    // write out ARC header info
    out.writeUTF(this._url);
    out.writeUTF(this._ipAddress);
    out.writeUTF(this._contentType);
    out.writeLong(this._archiveDate.getTime());
    out.writeInt(this._contentLength);

    // write out the payload
    out.writeInt(this._payload.length);
    out.write(this._payload, 0, this._payload.length);
  }

  /**
   * {@inheritDoc}
   */
  public void readFields(DataInput in)
      throws IOException {

    // read in ARC header info
    this._url           = in.readUTF();
    this._ipAddress     = in.readUTF();
    this._contentType   = in.readUTF();
    this._archiveDate   = new Date(in.readLong());
    this._contentLength = in.readInt();

    // read in the payload
    int payloadLength = in.readInt();

    // resize the payload buffer if necessary
    if (this._payload == null || this._payload.length != payloadLength)
      this._payload = new byte[payloadLength];

    try {
      in.readFully(this._payload, 0, payloadLength);
    }
    catch (EOFException ex) {
      throw new IOException("End of input reached before payload was fully deserialized.");
    }

    // assume that if a new payload was loaded, HTTP response will need to be reparsed.
    this._httpResponse = null;
  }

  /**
   * <p>Returns the full ARC record payload.  This is usually a complete HTTP
   * response.</p>
   *
   * @return The raw ARC record content.
   */
  public byte[] getPayload() {
    return this._payload;
  }

  /**
   * <p>Returns the URL from the ARC record header.</p>
   *
   * @return The URL for this entry.
   */
  public String getURL() {
    return this._url;
  }

  /**
   * <p>Returns the IP address from the ARC record header.</p>
   *
   * @return The IP address for this entry.
   */
  public String getIpAddress() {
    return this._ipAddress;
  }

  /**
   * <p>Returns the archive date from the ARC record header.</p>
   *
   * @return The archive date for this entry.
   */
  public Date getArchiveDate() {
    return this._archiveDate;
  }

  /**
   * <p>Returns the MIME content type from the ARC record header.</p>
   * <p>Note: The MIME content type in the ARC record header is not necessarily the
   * same as the <code>Content-Type</code> HTTP header inside the content body
   * (if one is present).</p>
   *
   * @return The MIME content type for this entry.
   */
  public String getContentType() {
    return this._contentType;
  }

  /**
   * <p>Returns the content length from the ARC record header.</p>
   * <p>Note: The content length in the ARC record header is not necessarily the
   * same as the <code>Content-Length</code> HTTP header inside the content body
   * (if one is present).</p>
   *
   * @return The content length for this entry.
   */
  public int getContentLength() {
    return this._contentLength;
  }

  /**
   * <p>Returns the HTTP status code.</p>
   * <p>If the payload could not be parsed as an HTTP response, returns -1.</p>
   * <p>Warning: if the payload has not yet been parsed as an HTTP response,
   * calling this function parses the full response.  Parsing is only performed
   * once - parsed data is retained for subsequent calls.</p>
   *
   * @return The HTTP status code.
   */
  public int getHttpStatusCode()
      throws IOException, HttpException {

    HttpResponse httpResponse = this.getHttpResponse();

    if (httpResponse == null)
      return -1;

    return httpResponse.getStatusLine().getStatusCode();
  }

  /**
   * <p>Returns an array of HTTP headers.</p>
   * <p>If the payload could not be parsed as an HTTP response, returns <code>null</code>.</p>
   * <p>Warning: if the payload has not yet been parsed as an HTTP response,
   * calling this function parses the full response.  Parsing is only performed
   * once - parsed data is retained for subsequent calls.</p>
   *
   * @return An array of HTTP headers.
   */
  public Header[] getHttpHeaders()
      throws IOException, HttpException {

    HttpResponse httpResponse = this.getHttpResponse();

    if (httpResponse == null)
      return null;

    return httpResponse.getAllHeaders();
  }

  /**
   *
   */
  public static class ByteArraySessionInputBuffer
      extends AbstractSessionInputBuffer {

    public ByteArraySessionInputBuffer(byte[] buf) {
      BasicHttpParams params = new BasicHttpParams();
      this.init(new ByteArrayInputStream(buf), 4096, params);
    }

    public ByteArraySessionInputBuffer(byte[] buf, int offset, int length) {
      BasicHttpParams params = new BasicHttpParams();
      this.init(new ByteArrayInputStream(buf, offset, length), 4096, params);
    }

    public boolean isDataAvailable(int timeout) {
      return true;
    }
  }

  /**
   * <p>Helper function to search a byte array for CR-LF-CR-LF (the end of
   * HTTP headers in the payload buffer).</p>
   *
   * @return The offset of the end of HTTP headers, after the last CRLF.
   */
  private int _searchForCRLFCRLF(byte[] data) {

    final byte CR = (byte)'\r';
    final byte LF = (byte)'\n';

    int i;
    int s = 0;

    for (i = 0; i < data.length; i++) {

      if      (data[i] == CR) {
        if      (s == 0) s = 1;
        else if (s == 1) s = 0;
        else if (s == 2) s = 3;
        else if (s == 3) s = 0;
      }
      else if (data[i] == LF) {
        if      (s == 0) s = 0;
        else if (s == 1) s = 2;
        else if (s == 2) s = 0;
        else if (s == 3) s = 4;
      }
      else {
        s = 0;
      }

      if (s == 4)
        return i + 1;
    }

    return -1;
  }

  /**
   * <p>Returns an HTTP response object parsed from the ARC record payload.<p>
   * <p>Note: The payload is parsed on-demand, but is only parsed once.  The
   * parsed data is saved for subsequent calls.</p>
   *
   * @return The ARC record payload as an HTTP response object.  See the Apache
   * HttpComponents project.
   */
  public HttpResponse getHttpResponse()
      throws IOException, HttpException {

    if (this._httpResponse != null)
      return this._httpResponse;

    if (this._payload == null) {
      LOG.error("Unable to parse HTTP response: Payload has not been set"); return null;
    }

    if (this._url != null && !this._url.startsWith("http://") && !this._url.startsWith("https://")) {
      LOG.error("Unable to parse HTTP response: URL protocol is not HTTP"); return null;
    }

    this._httpResponse = null;

    // Find where the HTTP headers stop
    int end = this._searchForCRLFCRLF(this._payload);

    if (end == -1) {
      LOG.error("Unable to parse HTTP response: End of HTTP headers not found"); return null;
    }

    // Parse the HTTP status line and headers
    DefaultHttpResponseParser parser =
      new DefaultHttpResponseParser(
        new ByteArraySessionInputBuffer(this._payload, 0, end),
        new BasicLineParser(),
        new DefaultHttpResponseFactory(),
        new BasicHttpParams()
      );

    this._httpResponse = parser.parse();

    if (this._httpResponse == null) {
      LOG.error("Unable to parse HTTP response"); return null;
    }     

    // Set the reset of the payload as the HTTP entity.  Use an InputStreamEntity
    // to avoid a memory copy.
    InputStreamEntity entity = new InputStreamEntity(new ByteArrayInputStream(this._payload, end, this._payload.length - end), this._payload.length - end);
    entity.setContentType(this._httpResponse.getFirstHeader("Content-Type"));
    entity.setContentEncoding(this._httpResponse.getFirstHeader("Content-Encoding"));
    this._httpResponse.setEntity(entity);

    return this._httpResponse;
  }

  /**
   * <p>Returns a Jsoup HTML document, parsed using the Charset in the
   * "Content-Type" header.  If the document charset cannot be found, parse is
   * attempted using </p>
   *
   * @return A Jsoup parsed HTML document from the HTTP response content.
   */
  public Document getParsedHTML()
      throws IOException {

    if (this._url == null) {
      LOG.error("Unable to parse HTML: URL from ARC header has not been set");
      return null;
    }

    // if response has not been parsed yet, this parses it
    try {
      this.getHttpResponse();
    }
    catch (HttpException ex) {
      LOG.error("Unable to parse HTML: Exception during HTTP response parsing"); return null;
    }

    if (this._httpResponse == null) {
      LOG.error("Unable to parse HTML: Exception during HTTP response parsing"); return null;
    }

    if (this._httpResponse.getEntity() == null) {
      LOG.error("Unable to parse HTML: No HTTP response entity found"); return null;
    }

    if (!this._contentType.toLowerCase().contains("html")) {
      LOG.warn("Unable to parse HTML: Content is not HTML"); return null;
    }

    String charset = null;

    try {
      // Default value returned is "text/plain" with charset of ISO-8859-1.
      charset = ContentType.getOrDefault(this._httpResponse.getEntity()).getCharset().name();
    }
    catch (Throwable ex) {

    }

    // if anything goes wrong, try ISO-8859-1
    if (charset == null)
      charset = "ISO-8859-1";

    // parse the content using the derived charset and the URL from the ARC header
    return Jsoup.parse(this._httpResponse.getEntity().getContent(), charset, this._url);
  }
}
TOP

Related Classes of org.commoncrawl.hadoop.mapred.ArcRecord$ByteArraySessionInputBuffer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.