Package org.archive.wayback.resourcestore.resourcefile

Source Code of org.archive.wayback.resourcestore.resourcefile.WarcResourceTest

/**
*
*/
package org.archive.wayback.resourcestore.resourcefile;

import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.Map;

import junit.framework.TestCase;

import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.ArchiveReader;
import org.archive.io.warc.TestWARCReader;
import org.archive.io.warc.TestWARCRecordInfo;
import org.archive.io.warc.WARCRecord;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.wayback.core.Resource;
import org.archive.wayback.replay.GzipDecodingResource;
import org.archive.wayback.replay.TextReplayRenderer;
import org.archive.wayback.replay.charset.CharsetDetector;
import org.archive.wayback.replay.charset.StandardCharsetDetector;


/**
* TODO: add more tests. it has only tests relevant to recent
* changes.
*
* @contributor kenji
*
*/
public class WarcResourceTest extends TestCase {

    /* (non-Javadoc)
     * @see junit.framework.TestCase#setUp()
     */
    protected void setUp() throws Exception {
        super.setUp();
    }

    /**
     * plain HTTP response (without any transfer/content-encoding)
     * @throws Exception
     */
    public void testPlainHttpRecord() throws Exception {
        String payload = "hogehogehogehogehoge";
        WARCRecordInfo recinfo = TestWARCRecordInfo.createHttpResponse(payload);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        assertEquals("statusCode", 200, res.getStatusCode());
        assertEquals("content-type", "text/plain", res.getHeader("Content-Type"));
        byte[] buf = new byte[payload.getBytes().length + 1];
        int n = res.read(buf);
        assertEquals("content length", buf.length - 1, n);
       
        res.close();
    }
    /**
     * uncompressed, but chunked-encoded HTTP response
     * @throws Exception
     */
    public void testPlainChunkedHttpRecord() throws Exception {
        String payload = "hogehogehogehogehoge";
        WARCRecordInfo recinfo = new TestWARCRecordInfo(
                TestWARCRecordInfo.buildHttpResponseBlock("200 OK",
                        "text/plain", payload.getBytes("UTF-8"), true));
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        assertEquals("statusCode", 200, res.getStatusCode());
        assertEquals("content-type", "text/plain", res.getHeader("Content-Type"));
        byte[] buf = new byte[payload.getBytes().length + 1];
        int n = res.read(buf);
        assertEquals("content length", buf.length - 1, n);
       
        res.close();
    }

    /**
     * gzip-compressed HTTP response.
     * @throws Exception
     */
    public void testCompressedHttpRecord() throws Exception {
        String payload = "hogehogehogehogehoge";
        String ctype = "text/plain";
        WARCRecordInfo recinfo = new TestWARCRecordInfo(
                TestWARCRecordInfo.buildCompressedHttpResponseBlock(ctype,
                        payload.getBytes()));
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        assertEquals("statusCode", 200, res.getStatusCode());
        assertEquals("content-type", ctype, res.getHeader("Content-Type"));
       
        Resource zres = TextReplayRenderer.decodeResource(res);
        assertTrue("wrapped with GzipDecodingResource", (zres instanceof GzipDecodingResource));
       
        byte[] buf = new byte[payload.getBytes().length + 1];
        int n = zres.read(buf);
        assertEquals("content length", buf.length - 1, n);
       
        res.close();
    }
   
    /**
     * gzip-compressed, chunked-encoded HTTP response.
     * @throws Exception
     */
    public void testCompressedChunkedHttpRecord() throws Exception {
        String payload = "hogehogehogehogehoge";
        String ctype = "text/plain";
        WARCRecordInfo recinfo = new TestWARCRecordInfo(
                TestWARCRecordInfo.buildCompressedHttpResponseBlock(ctype,
                        payload.getBytes(), true));
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        assertEquals("statusCode", 200, res.getStatusCode());
        assertEquals("content-type", ctype, res.getHeader("Content-Type"));
       
        Resource zres = TextReplayRenderer.decodeResource(res);
        assertTrue("wrapped with GzipDecodingResource", (zres instanceof GzipDecodingResource));
       
        byte[] buf = new byte[payload.getBytes().length + 1];
        int n = zres.read(buf);
        assertEquals("content length", buf.length - 1, n);
       
        res.close();
    }

    // TODO: add more tests on various Transfer-Encoding and Content-Encoding.
    // TODO: add more tests on corner cases.
   
    /**
     * metadata record with render-able content like site screenshot image.
     *
     * HTTP status is assumed to be 200, and Content-Type of WARC header
     * becomes Content-Type of replay response.
     * @throws Exception
     */
    public void testMetadataRecord() throws Exception {
        // 1-dot transparent GIF found everywhere if you wonder :-)
        final byte[] block = new byte[] {
                71, 73, 70, 56, 57, 97, 1, 0, 1, 0, -128, 0, 0, -64, -64, -64,
                0, 0, 0, 33, -7, 4, 1, 0, 0, 0, 0, 44, 0, 0, 0, 0,
                1, 0, 1, 0, 0, 2, 2, 68, 1, 0, 59, 13, 10, 13, 10
        };
        final String ct = "image/gif";
        WARCRecordInfo recinfo = new TestWARCRecordInfo(block);
        recinfo.setType(WARCRecordType.metadata);
        recinfo.setMimetype(ct);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        // must not fail
        res.parseHeaders();
       
        // should return assumed 200
        assertEquals("statusCode", 200, res.getStatusCode());
        // content-type is what's specified in WARC header.
        assertEquals("content-type", ct, res.getHeader("Content-Type"));
        // must have Date header, in HTTP Date format.
        String date = res.getHeader("Date");
        assertNotNull("has date header", date);
        new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH).parse(date);
       
        // block as content
        byte[] buf = new byte[block.length + 1];
        int n = res.read(buf);
        assertEquals("content length", block.length, n);
        for (int i = 0; i < block.length; i++) {
            assertEquals("byte " + i, block[i], buf[i]);
        }
       
        res.close();
    }
   
    final String REVISIT_WARC_PROFILE =
            "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";
    /**
     * new, current revisit record, which has just HTTP response line and
     * headers part of the capture.
     * <p>Expectations:
     * TextReplayRender receives revisit WarcResource as {@code httpHeaderResource},
     * and calls following methods on it:</p>
     * <ul>
     * <li>{@link WarcResource#getStatusCode()}</li>
     * <li>{@link WarcResource#getHttpHeaders()} (ok to return null)</li>
     * </ul>
     * @throws Exception
     */
    public void testRevisitRecord() throws Exception {
        final String ct = "text/html";
        WARCRecordInfo recinfo = TestWARCRecordInfo.createRevisitHttpResponse(ct, 1345);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        // these are from this record.
        assertEquals("statusCode", 200, res.getStatusCode());
        assertEquals("content-type", ct, res.getHeader("Content-Type"));
       
        StandardCharsetDetector csd = new StandardCharsetDetector();
        // assuming WaybackRequest (3rd parameter) is not used in getCharset()
        csd.getCharset(res, res, null);

        res.close();
    }
   
    /**
     * old revisit record, which has zero-length block (no HTTP response
     * line, no HTTP headers).
     *
     * in this case, {@link WarcResource#getStatusCode()} should not fail, but
     * either return special value or throw an appropriate exception signifying
     * there's no HTTP status line recorded in this resource, and thus ReplayRenderer
     * should fallback on using payloadResource for the info instead.
     * {@link WarcResource#getHttpHeaders()} must not return null, but should
     * return empty Map object, so that {@link CharsetDetector} can return null
     * without failing.
     *
     * for the better, this fallback may be encapsulated in
     * virtual Resource combining httpHeaderResource and payloadResource.
     *
     * related issue: https://webarchive.jira.com/browse/ACC-126
     * @throws Exception
     * @see TextReplayRenderer
     * @see StandardCharsetDetector#getCharset(org.archive.wayback.core.Resource, org.archive.wayback.core.Resource, org.archive.wayback.core.WaybackRequest)
     */
    public void testOldRevisitRecord() throws Exception {
        final String ct = "text/html";
        WARCRecordInfo recinfo = TestWARCRecordInfo.createRevisitHttpResponse(ct, 1345, false);
        TestWARCReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
       
        // should either return special value or throw appropriate exception (TBD)
        int scode = res.getStatusCode();
        assertEquals("status code", 0, scode);
       
        Map<String, String> headers = res.getHttpHeaders();
        //assertNotNull("headers", headers);
        assertNull("headers", headers);
       
        res.close();
    }
    /**
     * resource record, typically used for archiving ftp fetches.
     * @throws Exception
     */
    public void testResourceRecord() throws Exception {
        final String ct = "text/plain";
        final byte[] block = "blahblahblah\n".getBytes();
        WARCRecordInfo recinfo = new TestWARCRecordInfo(block);
        recinfo.setType(WARCRecordType.resource);
        recinfo.setUrl("ftp://ftp.example.com/afile.txt");
        recinfo.setMimetype(ct);
        ArchiveReader ar = new TestWARCReader(recinfo);
        WARCRecord rec = (WARCRecord)ar.get(0);
        WarcResource res = new WarcResource(rec, ar);
        res.parseHeaders();
               
        int scode = res.getStatusCode();
        assertEquals("statusCode", 200, scode);

        Map<String, String> headers = res.getHttpHeaders();
        assertNotNull("headers", headers);
       
        assertEquals("content-type", ct, res.getHeader("Content-Type"));

        // must have Date header, in HTTP Date format.
        String date = res.getHeader("Date");
        assertNotNull("has date header", date);
        new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH).parse(date);
       
        res.close();
    }
   
    // TODO: there can be revisit for ftp fetches, right?
}
TOP

Related Classes of org.archive.wayback.resourcestore.resourcefile.WarcResourceTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.