Package org.archive.io.warc

Examples of org.archive.io.warc.WARCRecord


    /* (non-Javadoc)
     * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
     */
    public WARCRecord adapt(ArchiveRecord o) {
      WARCRecord rec = null;
      if (o instanceof WARCRecord) {
        rec = (WARCRecord) o;
      }
      return rec;
    }
View Full Code Here


            ArchiveRecordHeader header = record.getHeader();
            logger.warning("record at offset: " + header.getOffset()
                + " has errors: " + arcRecord.getErrors());
          }
        } else {
          WARCRecord warcRecord = (WARCRecord) record;
          warcRecord.getHeader();
        }
  }
View Full Code Here

   * @param warcReader a WARCReader instance
   * @throws IOException
   */
  private void replayRecord(WARCReader warcReader) throws IOException {
    warcReader.setStrict(true);
    WARCRecord warcRecord = (WARCRecord) warcReader.get(this.offset);
      byte[] buffer = new byte[BUFFER_SIZE];
      if (warcRecord.available() > 0) {
        int r = -1;
        while((r = warcRecord.read(buffer, 0, BUFFER_SIZE)) != -1) {
          System.out.write(buffer, 0, r);
        }
      }
    System.out.println("record bytes available: "
        + warcRecord.available());
  }
View Full Code Here

   * @throws IOException
   */
  private void indexRecord(WARCReader warcReader) throws IOException {
    warcReader.setStrict(true);
    // warcReader.setParseHttpHeaders(true);
    WARCRecord warcRecord = (WARCRecord)warcReader.get(this.offset);
    ArchiveRecordHeader header = warcRecord.getHeader();
    System.out.println("========== selected metadata:");
    warcRecord.close(); // must close record to get digest
    printMetadata(warcRecord,header);
    System.out.println("========== header: \n" + header);
  }
View Full Code Here

       Logger l = Logger.getLogger(writer.getClass().getName());
       Level oldLevel = l.getLevel();
     try {
           l.setLevel(Level.WARNING);
       for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
               WARCRecord r = (WARCRecord)i.next();
               if (!isARCType(r.getHeader().getMimetype())) {
                   continue;
               }
               if (r.getHeader().getContentBegin() <= 0) {
                   // Otherwise, because length include Header-Line and
                   // Named Fields, these will end up in the ARC unless there
                   // is a non-zero content begin.
                   continue;
               }
               String ip = (String)r.getHeader().
                   getHeaderValue((WARCConstants.HEADER_KEY_IP));
               long length = r.getHeader().getLength();
               int offset = r.getHeader().getContentBegin();
               // This mimetype is not exactly what you'd expect to find in
               // an ARC though technically its 'correct'.  To get right one,
               // need to parse the HTTP Headers.  Thats messy.  Not doing for
               // now.
               String mimetype = r.getHeader().getMimetype();
               // Clean out ISO time string '-', 'T', ':', and 'Z' characters.
               String t = r.getHeader().getDate().replaceAll("[-T:Z]", "");
               long time = ArchiveUtils.getSecondsSinceEpoch(t).getTime();
               writer.write(r.getHeader().getUrl(), mimetype, ip, time,
                   (int)(length - offset), r);
       }
     } finally {
       if (reader != null) {
         reader.close();
View Full Code Here

           + "Content-Length: " + (HTTPHEADER.length() + BODY.length()) + "\r\n"
           + "\r\n";

        final String hdr = warcHeader + HTTPHEADER + BODY;

        WARCRecord r = new WARCRecord(new ByteArrayInputStream(hdr.getBytes()),
                "READER_IDENTIFIER", 0, false, true);
        HeaderedArchiveRecord har = new HeaderedArchiveRecord(r, true);

        har.skipHttpHeader();
View Full Code Here

    }
   
    public static Resource createTestHtmlResource(byte[] payloadBytes) throws IOException {
        WARCRecordInfo recinfo = TestWARCRecordInfo.createCompressedHttpResponse("text/html", payloadBytes);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource resource = new WarcResource(rec, ar);
        resource.parseHeaders();
        return resource;
    }
View Full Code Here

  }
 
  public static Resource createTestJSResource(byte[] payloadBytes) throws IOException {
    WARCRecordInfo recinfo = TestWARCRecordInfo.createHttpResponse("text/javascript", payloadBytes);
    TestWARCReader ar = new TestWARCReader(recinfo);
    WARCRecord rec = ar.get(0);
    WarcResource resource = new WarcResource(rec, ar);
    resource.parseHeaders();
    return resource;
  }
View Full Code Here

    }
   
    public static Resource createTestHtmlResource(byte[] payloadBytes) throws IOException {
        WARCRecordInfo recinfo = TestWARCRecordInfo.createCompressedHttpResponse("text/html", payloadBytes);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource resource = new WarcResource(rec, ar);
        resource.parseHeaders();
        return resource;
    }
View Full Code Here

    }
    public static Resource createTestRevisitResource(byte[] payloadBytes, boolean withHeader, boolean gzipContent) throws IOException {
        WARCRecordInfo recinfo = TestWARCRecordInfo.createRevisitHttpResponse(
                "text/html", payloadBytes.length, withHeader, gzipContent);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource resource = new WarcResource(rec, ar);
        resource.parseHeaders();
        return resource;
    }
View Full Code Here

TOP

Related Classes of org.archive.io.warc.WARCRecord

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.