Examples of ArchiveRecordHeader


Examples of org.archive.io.ArchiveRecordHeader

        int max = 300;
        int count = 0;
        int validRecords = 0;
        while (count++ < max && iterator.hasNext()) {
            ArchiveRecord archiveRecord = iterator.next();
            ArchiveRecordHeader header = archiveRecord.getHeader();
            String url = header.getUrl();

            String protocol = "";
            try {
                protocol = new URL(url).getProtocol();
            } catch (MalformedURLException e) {
                // Ignore and skip
            }

            if (protocol.equals("http")) {
                validRecords += 1;
                int contentOffset = header.getContentBegin();
                long totalLength = header.getLength();
                int contentLength = (int) totalLength - contentOffset;

                archiveRecord.skip(contentOffset);
                byte[] content = new byte[contentLength];
                archiveRecord.read(content);

                String mimetype = header.getMimetype();
                // The Arc headers != HTTP headers, but it's at least some data we can jam
                // into the FetchedDatum as a test. Note that the Arc headers will have value
                // types other than a long, so we have to do the conversion.
                HttpHeaders headers = new HttpHeaders();
                Set<String> keys = header.getHeaderFieldKeys();
                for (String key : keys) {
                    String value = header.getHeaderValue(key).toString();
                    headers.add(key, value);
                }
               
                FetchedDatum contentTuple = new FetchedDatum(url, url, System.currentTimeMillis(), headers, new ContentBytes(content), mimetype, 0);
                write.add(contentTuple.getTuple());
View Full Code Here

Examples of org.archive.io.ArchiveRecordHeader

  }
 
  private SearchResult adaptInner(WARCRecord rec) throws IOException {
   
    SearchResult result = null;
    ArchiveRecordHeader header = rec.getHeader();
    String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
    if(type.equals(WARCConstants.RESPONSE)) {
      String mime = header.getMimetype();
      if(mime.equals("text/dns")) {
        result = adaptDNS(header,rec);
      } else {
        result = adaptResponse(header,rec);
      }
View Full Code Here

Examples of org.archive.io.ArchiveRecordHeader

        // the end to start.  Reopen the arc so no context between this test
        // and the previous.
       
        for (int i = headers.size() - 1; i >= 0; i--) {
            reader = WARCReaderFactory.get(f);
            ArchiveRecordHeader h = (ArchiveRecordHeader)headers.get(i);
            ArchiveRecord r = reader.get(h.getOffset());
            String mimeType = r.getHeader().getMimetype();
            assertTrue("Record is bogus",
                mimeType != null && mimeType.length() > 0);
            reader.close();
        }
       
        assertTrue("Metadatas not equal", headers.size() == recordCount);
        for (Iterator<ArchiveRecordHeader> i = headers.iterator(); i.hasNext();) {
            ArchiveRecordHeader r = (ArchiveRecordHeader)i.next();
            assertTrue("Record is empty", r.getLength() > 0);
        }
    }
View Full Code Here

Examples of org.archive.io.ArchiveRecordHeader

    }
  }

  private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
   
    ArchiveRecordHeader header = rec.getHeader();

    String typeStr = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
    WARCRecordType type;
    try {
      type = WARCRecordType.valueOf(typeStr);
    } catch (IllegalArgumentException e) {
      LOGGER.warning("Skipping unrecognized record type : " + typeStr);
      return null;
    }

    CaptureSearchResult result = genericResult(rec);

    switch (type) {
    case response:
      String mime = annotater.transformHTTPMime(header.getMimetype());
      if(mime != null && mime.equals("text/dns")) {
        // close to complete reading, then the digest is legit
        // TODO: DO we want to use the WARC header digest for this?
        rec.close();
        result.setDigest(transformWARCDigest(rec.getDigestStr()));
View Full Code Here

Examples of org.archive.io.ArchiveRecordHeader

    result.setMimeType(DEFAULT_VALUE);
    result.setHttpCode(DEFAULT_VALUE);
    result.setRedirectUrl(DEFAULT_VALUE);

    ArchiveRecordHeader header = rec.getHeader();

    String file = transformWARCFilename(header.getReaderIdentifier());
    long offset = header.getOffset();
   
    result.setCaptureTimestamp(transformWARCDate(header.getDate()));
    result.setFile(file);
    result.setOffset(offset);
    result.setDigest(transformWARCDigest(header.getHeaderValue(
        WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
   
    String origUrl = header.getUrl();
    if(origUrl == null) {
      String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
      if(type.equals(WARCConstants.WARCRecordType.warcinfo)) {
        String filename = header.getHeaderValue(
            WARCConstants.HEADER_KEY_FILENAME).toString();
        result.setOriginalUrl("filedesc:"+filename);
        result.setUrlKey("filedesc:"+filename);       
      } else {
        result.setOriginalUrl(DEFAULT_VALUE);
View Full Code Here

Examples of org.archive.io.ArchiveRecordHeader

     * for Robot Meta tags.
     */
  private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result,
      WARCRecord rec) throws IOException {

    ArchiveRecordHeader header = rec.getHeader();
    // need to parse the documents HTTP message and headers here: WARCReader
    // does not implement this... yet..
   
        byte [] statusBytes = LaxHttpParser.readRawLine(rec);
        int eolCharCount = getEolCharsCount(statusBytes);
        if (eolCharCount <= 0) {
            throw new RecoverableIOException("Failed to read http status where one " +
                    " was expected: " +
                    ((statusBytes == null) ? "(null)" : new String(statusBytes)));
        }
        String statusLine = EncodingUtil.getString(statusBytes, 0,
            statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
        if ((statusLine == null) ||
                !StatusLine.startsWithHTTP(statusLine)) {
           throw new RecoverableIOException("Failed parse of http status line.");
        }
        StatusLine status = new StatusLine(statusLine);
    result.setHttpCode(String.valueOf(status.getStatusCode()));
       
    Header[] headers = LaxHttpParser.parseHeaders(rec,
                ARCConstants.DEFAULT_ENCODING);

   
    annotater.annotateHTTPContent(result,rec,headers,header.getMimetype());

    return result;
  }
View Full Code Here

Examples of org.archive.io.ArchiveRecordHeader

            CaptureSearchResult result = new CaptureSearchResult();
            // TODO: Resource should have methods for accessing URI and date
            if (res instanceof WarcResource) {
                // TODO: want to use WARCRecordToSearchResultAdapter? WarcResource
                // has no method to retrieve underlying WARCRecord.
                ArchiveRecordHeader h = ((WarcResource)res).getWarcHeaders();
                String originalUrl = h.getUrl();
                String ts = (String)h.getHeaderValue("WARC-Date");
                // WARC-Date is in ISOZ format.
                ts = transformWARCDate(ts);
                result.setOriginalUrl(originalUrl);
                result.setCaptureTimestamp(ts);
                result.setOffset(0);
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.