Package org.archive.io

Source Code of org.archive.io.ArchiveTest

/*
*  This file is part of the Heritrix web crawler (crawler.archive.org).
*
*  Licensed to the Internet Archive (IA) by one or more individual
*  contributors.
*
*  The IA licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.archive.io;

import java.io.IOException;
import java.text.NumberFormat;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.io.warc.WARCReader;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.io.warc.WARCRecord;

import java.util.logging.Logger;

/**
* Command-line utility useful for determining why ArchiveReader fails on
* problematic W/ARC files.
* @author siznax
*
*/
public class ArchiveTest
{
  /** input W/ARC filename */
    String arcFilename;

    void setArcFile(String arcFile) {
      this.arcFilename = arcFile;     
    }
   
    /** one of available modes */
    String mode;
  public void setMode(String mode) {
    this.mode = mode;
  }

    /** mimetype to select from input */
    String filter;
    public void setFilter(String filter) {
      this.filter = filter;
    }

    /** byte offset into input file */
    long offset;
  public void setOffset(long offset) {
    this.offset = offset;
  }

    /** W/ARC record index to begin output */
  protected int recordStartIndex;

  /** W/ARC record index to end output */
  protected int recordEndIndex;
    void setRecordRange(int start, int end) {
    this.recordStartIndex = start;
    this.recordEndIndex = end;
  }

    /** count of W/ARC records found in input */
  protected int recordCount;

  /** count of selected mimetype found in input */
  protected int filterCount;

  /** logger for errors, warnings */
    private static Logger logger = Logger.getLogger(ArchiveTest.class.getName());

    /** main method modes to scan for errors, filter,
     * and emulate wayback use cases
     */
  public static String[] modes = {"index","replay","dump","cdx","filter"};

  /** arbitrary buffer size for replay mode */
  static int BUFFER_SIZE = 1024*16;
   
  /** some typical mimetypes found in W/ARCs */
  static String[] mimeTypes =  {
    "image/gif",
    "image/png",
    "text/css",
    "text/dns",
    "text/html",
    "text/plain"
  };

  public ArchiveTest() throws IOException {
  }
       
  /**
   * @return true if archive filename ends in "arc" or "arc.gz"
   */
  boolean isARCFormat() {
    return this.arcFilename.endsWith(".arc")
      || this.arcFilename.endsWith(".arc.gz");
  }

  /**
   * @return ARCReader if {@link #isARCFormat()}=true, else WARCReader
   * @throws IOException
   */
  ArchiveReader getReader() throws IOException {
    if (this.isARCFormat()) {
      return ARCReaderFactory.get(this.arcFilename);
    } else {
      return WARCReaderFactory.get(this.arcFilename);
   
  }

  /**
   * @param  index current record index into arc file
   * @return true if current index is in range
   */
  boolean inRecordRange(long index) {
      if (index >= this.recordStartIndex && index <= this.recordEndIndex)
        return true;
      else
        return false;
    }
   
  /**
   * @param r ArchiveRecord
   * @param filter mimetype string, see mimeTypes
   * @return true if current record mimetype equals mimetype filter field
   */
  boolean filterMimeType(ArchiveRecord r, String filter) {
    if (r.getHeader().getMimetype().equals(this.filter))
      return true;
    else
      return false;
  }

  void logRecordErrors(ArchiveRecord record) {
        Logger logger = Logger.getLogger(this.getClass().getName());
        if (this.isARCFormat()) {
          ARCRecord arcRecord = (ARCRecord) record;
          if (arcRecord.hasErrors()) {
            ArchiveRecordHeader header = record.getHeader();
            logger.warning("record at offset: " + header.getOffset()
                + " has errors: " + arcRecord.getErrors());
          }
        } else {
          WARCRecord warcRecord = (WARCRecord) record;
          warcRecord.getHeader();
        }
  }
 
  /** emulate ArchiveRecord.outputCDX for comparison */
  static void outputCdx(ArchiveRecordHeader h) {
    Long rl = h.getLength();
    Long ro = h.getOffset();
    String[] hdr = {
        h.getDate(),
        "-", // Ip
        h.getUrl(),
        h.getMimetype(),
        "-", // status code
        "-", // digest
        ro.toString(),
        rl.toString(),
    };
    for (String fld : hdr)
      System.out.print(fld + " ");
    System.out.println();
  }

  void printMetadata(ARCRecord record, ArchiveRecordHeader header) {
    System.out.print( "  Date  : " + header.getDate() + "\n"
        + "  IP    : " + ((ARCRecordMetaData)header).getIp()  + "\n"
        + "  URL   : " + header.getUrl() + "\n"
        + "  MIME  : " + header.getMimetype()   + "\n"
        + "  Status: " + ((ARCRecordMetaData)header).getStatusCode() + "\n"
        + "  Digest: " + record.getDigestStr() + "\n"
        + "  Offset: " + header.getOffset() + "\n"
        + "  Length: " + header.getLength() + "\n");
  }
 
  void printMetadata(WARCRecord record, ArchiveRecordHeader header) {
    System.out.print( "  Date  : " + header.getDate() + "\n"
        + "  IP    : " + header.getHeaderValue("WARC-IP-Address") + "\n"
        + "  URL   : " + header.getUrl() + "\n"
        + "  MIME  : " + header.getMimetype() + "\n"
        + "  Status: " + "-" + "\n"
        + "  Digest: " + header.getHeaderValue("WARC-Payload-Digest") + "\n"
        + "  Offset: " + header.getOffset() + "\n"
        + "  Length: " + header.getLength() + "\n");
  }
 
  void printInfo() {
    System.out.println(this.getClass().getName());
    System.out.println("  file:    " + this.arcFilename);
    System.out.println("  format:  " + this.getFormat());
    System.out.println("  mode:    " + this.mode);
    if (this.mode.equals("filter"))
      System.out.println("  filter:  " + this.filter);
    if (this.mode.equals("fetch"))
    System.out.println("  offset:  " + this.offset);
    if (this.mode.equals("filter")
        || this.mode.equals("cdx")
        || this.mode.equals("dump"))
      System.out.println("  range:   " + "[" + this.recordStartIndex
          + "," + this.recordEndIndex + "]");
  }

  /**
   * return W/ARC extension and compression extension
   */
  String getFormat() {
    if(this.arcFilename.endsWith(".gz")) {
      return this.arcFilename.substring(this.arcFilename
          .lastIndexOf(".",this.arcFilename.length()-4));
    }
    return this.arcFilename.substring(this.arcFilename.lastIndexOf("."));
  }

  /**
   * process output by selected mode
   * @throws IOException
   */
  void readArchive() throws IOException {

      ArchiveReader reader = this.getReader();

        if (this.mode.equals("index")) {
          // parse HTTP header only
      System.out.println("INDEX " + this.getArcType()
          + " record at offset: " + offset);
      if (this.isARCFormat()) {
        indexRecord((ARCReader)reader);
      } else {
        indexRecord((WARCReader)reader);
      }
        } else if (this.mode.equals("replay")) {
          // skip header and read 
          System.out.println("REPLAY " + this.getArcType()
              + " record at offset: " + offset + "");
          if (this.isARCFormat()) {
            this.replayRecord((ARCReader)reader);
          } else {
            this.replayRecord((WARCReader)reader);
          }
        } else if (this.mode.equals("dump")) {
          this.dumpArchive(reader);
        } else if (this.mode.equals("cdx")) {
          this.outputArchiveCDX(reader);
        } else if (this.mode.equals("filter")) { // filter MIME type
          this.filterArchive(reader);
        } else { // scan; do nothing, but count iterations
          this.scanArchive(reader);
        }
        if (this.offset == -1) {
          System.out.println("\n========== found: "
              + this.recordCount + " records. ");
        }
    System.out.println("\n========== Done.");
  }

  /**
   * get archive type by file extension
   * @return arc file extension, e.g. 'warc.gz'
   */
  private String getArcType() {
    return getFormat().split("\\.")[1];
  }

  /**
   * scan (read) archive printing "." for each record or errors if they occur
   * and total number of records found
   * @param reader and ArchiveReader instance
   */
  private void scanArchive(ArchiveReader reader) {
      System.out.println();
      for (ArchiveRecord record : reader) {
        this.recordCount++;
        logRecordErrors(record);
        System.out.print(".");
        if ((this.recordCount % 100) == 0)
          System.out.print("[" + this.recordCount+ "]\n");
      }
  }

  /**
   * filter archive on a mimetype for records in range
   * @param reader an ArchiveReader instance
   */
  private void filterArchive(ArchiveReader reader) {
      for (ArchiveRecord record : reader) {
        recordCount++;
        if (inRecordRange(recordCount)) {
          if (filterMimeType(record,this.filter)==true) {
            System.out.print(mode + " [" + recordCount + "] ");
            outputCdx(record.getHeader());
            filterCount++;
          }
        }
        if (recordCount > this.recordEndIndex)
          break;
      }
      double filterPercent = (double)filterCount/recordCount;
      NumberFormat filterPercentFmt = NumberFormat.getPercentInstance();
      filterPercentFmt.setMinimumFractionDigits(2);
      System.out.println("\n========== found: "
          + filterCount + "/" + recordCount + " = "
          + filterPercentFmt.format(filterPercent)
          + " mimetype=" + filter
          + " records. ");
  }

  /**
   * output CDX-like output for records in range
   * @param reader an ArchiveReader instance
   */
  private void outputArchiveCDX(ArchiveReader reader) {
    for (ArchiveRecord record : reader) {
      recordCount++;
      if (inRecordRange(recordCount)) {
        System.out.print(mode + " [" + recordCount + "] ");
        logRecordErrors(record);
        outputCdx(record.getHeader());
      }
        if (recordCount > this.recordEndIndex) {
          break;
        }
    }
  }

  /**
   * write records in range on STDOUT
   * @param reader an ArchiveReader instance
   * @throws IOException
   */
  private void dumpArchive(ArchiveReader reader) throws IOException {
      for (ArchiveRecord record : reader) {
      recordCount++;
      if (inRecordRange(recordCount)) {
        System.out.println("\n********** "
            + mode + " ["+recordCount+"] "
            + "**********\n")
        record.dump();
      }
        if (recordCount > this.recordEndIndex) {
          break;
        }
    }
  }

  /**
   * wayback-like replay of ARC record at offset
   * @param arcReader an ARCReader intance
   * @throws IOException
   */
  private void replayRecord(ARCReader arcReader) throws IOException {
      arcReader.setStrict(true);
      ARCRecord arcRecord = (ARCRecord) arcReader.get(this.offset);
      arcRecord.skipHttpHeader();
      if (arcRecord.hasErrors()) {
        logger.warning("record has errors: " + arcRecord.getErrors());
      }
      byte[] buffer = new byte[BUFFER_SIZE];
      if (arcRecord.available() > 0) {
        // for (int r = -1; (r = arcRecord.read(buffer, 0, BUFFER_SIZE)) != -1;) {
        int r = -1;
        while((r = arcRecord.read(buffer, 0, BUFFER_SIZE)) != -1) {
          // os.write(buffer, 0, r);
          System.out.write(buffer, 0, r);
        }
      } else {
        System.out.println("record bytes available: "
            + arcRecord.available());
      }
  }
 
  /**
   * wayback-like replay of WARC record at offset
   * @param warcReader a WARCReader instance
   * @throws IOException
   */
  private void replayRecord(WARCReader warcReader) throws IOException {
    warcReader.setStrict(true);
    WARCRecord warcRecord = (WARCRecord) warcReader.get(this.offset);
      byte[] buffer = new byte[BUFFER_SIZE];
      if (warcRecord.available() > 0) {
        int r = -1;
        while((r = warcRecord.read(buffer, 0, BUFFER_SIZE)) != -1) {
          System.out.write(buffer, 0, r);
        }
      }
    System.out.println("record bytes available: "
        + warcRecord.available());
  }

  /**
   * wayback-like index an ARC record at offset
   * @param arcReader an ARCReader instance
   * @throws IOException
   */
  private void indexRecord(ARCReader arcReader) throws IOException {
    arcReader.setStrict(true);
    arcReader.setParseHttpHeaders(true);
    ARCRecord arcRecord = (ARCRecord) arcReader.get(this.offset);
    ArchiveRecordHeader header = arcRecord.getHeader();
    if (arcRecord.hasErrors())
      logger.warning("record has errors: " + arcRecord.getErrors());
    System.out.println("========== dumping HTTP header:");
    arcRecord.dumpHttpHeader();
    System.out.println("========== selected metadata:");
    arcRecord.close(); // must close record to get digest
    printMetadata(arcRecord,header);
    System.out.println("========== getting metadata:");
    System.out.println(arcRecord.getMetaData());
    System.out.println("\n"
        + "record length declared: "
        + header.getLength() + "\n"
        + "header bytes read     : "
        + arcRecord.httpHeaderBytesRead);
  }

  /**
   * wayback-like index a WARC record at offset
   * @param warcReader a WARCReader instance
   * @throws IOException
   */
  private void indexRecord(WARCReader warcReader) throws IOException {
    warcReader.setStrict(true);
    // warcReader.setParseHttpHeaders(true);
    WARCRecord warcRecord = (WARCRecord)warcReader.get(this.offset);
    ArchiveRecordHeader header = warcRecord.getHeader();
    System.out.println("========== selected metadata:");
    warcRecord.close(); // must close record to get digest
    printMetadata(warcRecord,header);
    System.out.println("========== header: \n" + header);
  }

 
  /**
   * test (scan|cdx|index|replay|dump) an archive. 
   * some of these modes are use-cases for wayback indexing mentioned in:
   * http://webarchive.jira.com/browse/HER-1568
   * @param arcfile a ARC or WARC archive (possibly .gz)
   * @param offset byte offset into archive
   * @param mode (default=scan)|cdx|index|replay|dump
   * @param record_range_start record index start (default=0)
   * @param record_range_end record index end (default=100)
   * @param filter mimetype, e.g. "text/html"
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    new ArchiveTest().instanceMain(args);
  }

  public void instanceMain(String[] args) throws IOException {
    if (args.length > 1) {
      int offset    = Integer.valueOf(args[1]);
      String mode   = (args.length>2) ? args[2] : "scan";
      int start     = (args.length>3) ? Integer.valueOf(args[3]) : 0;
      int end       = (args.length>4) ? Integer.valueOf(args[4]) : 100;
      String filter = (args.length>5) ? args[5] : null;
      setArcFile(args[0]);
      setOffset(Integer.valueOf(args[1]));
      setOffset(offset);
      setMode(mode);
      setRecordRange(start,end);
      setFilter(filter);
      printInfo();
      readArchive();
    } else {
      String usage = "ArcWarcTests.java arcfile offset "
        + "[ [scan|cdx|index|replay|dump] "
        + "record_range_start record_range_end filter]";
      System.out.println(usage);
    }
  }
}
TOP

Related Classes of org.archive.io.ArchiveTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.