Package org.archive.modules

Examples of org.archive.modules.CrawlURI
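CrawlURI is the per-URI crawl state object in Heritrix: it wraps a UURI (a normalized URI) together with discovery information such as the via URI and link context, and accumulates fetch results (status code, non-fatal failures, timestamps) as processors run. As a minimal sketch of constructing one directly, assuming Heritrix 3's org.archive.modules and org.archive.net packages are on the classpath (the class name CrawlURIExample is purely illustrative):

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;

public class CrawlURIExample {
    public static void main(String[] args) throws URIException {
        // Normalize the raw string into a UURI, then wrap it in a CrawlURI.
        // The constructor arguments are (uuri, pathFromSeed, via, viaContext),
        // the same form used by the createTestUri() helpers below.
        UURI uuri = UURIFactory.getInstance("http://example.com/index.html");
        CrawlURI curi = new CrawlURI(uuri, null, null, LinkContext.NAVLINK_MISC);
        System.out.println(curi);
    }
}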


    private CrawlURI createTestUri(String urlStr) throws URIException {
        UURI testUuri = UURIFactory.getInstance(urlStr);
        CrawlURI testUri = new CrawlURI(testUuri, null, null, LinkContext.NAVLINK_MISC);

        return testUri;
    }


    private CrawlURI createTestUri(String urlStr, String via) throws URIException {
       
        UURI testViaUuri = UURIFactory.getInstance(via);
        CrawlURI testUri = createTestUri(urlStr);
        testUri.setVia(testViaUuri);

        return testUri;
    }
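A hypothetical test using the two helpers above might look like the following (the method name and assertions are illustrative, not taken from the original test class):

    public void testCreateTestUriWithVia() throws URIException {
        // Build a CrawlURI whose via points at the page it was discovered on.
        CrawlURI curi = createTestUri("http://example.com/page.html",
                "http://example.com/index.html");
        assertEquals("http://example.com/page.html", curi.getUURI().toString());
        assertEquals("http://example.com/index.html", curi.getVia().toString());
    }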

    /**
     * See http://stackoverflow.com/questions/100841/artificially-create-a-connection-timeout-error
     * This test seems to fail if not connected to the internet, because the
     * connection fails immediately instead of timing out.
     */
    public void testConnectionTimeout() throws Exception {
        CrawlURI curi = makeCrawlURI("http://10.255.255.1/");
        fetcher().setSoTimeoutMs(300);
       
        long start = System.currentTimeMillis();
        fetcher().process(curi);
        long elapsed = System.currentTimeMillis() - start;
       
        assertTrue(elapsed >= 300 && elapsed < 400);
       
        // Httpcomponents throws org.apache.http.conn.ConnectTimeoutException,
        // commons-httpclient throws java.net.SocketTimeoutException. Both are
        // instances of InterruptedIOException
        assertEquals(1, curi.getNonFatalFailures().size());
        assertTrue(curi.getNonFatalFailures().toArray()[0] instanceof InterruptedIOException);
        assertTrue(curi.getNonFatalFailures().toArray()[0].toString().matches("(?i).*connect.*timed out.*"));

        assertEquals(FetchStatusCodes.S_CONNECT_FAILED, curi.getFetchStatus());
       
        assertEquals(0, curi.getFetchCompletedTime());
    }

   
    // XXX testSocketTimeout() (the other kind) - how to simulate?
   
    public void testSslTrustLevel() throws Exception {
        // default "open" trust level
        CrawlURI curi = makeCrawlURI("https://localhost:7443/");
        fetcher().process(curi);
        runDefaultChecks(curi, "hostHeader");
       
        // "normal" trust level
        curi = makeCrawlURI("https://localhost:7443/");
        fetcher().setSslTrustLevel(TrustLevel.NORMAL);
        fetcher().process(curi);
        assertEquals(1, curi.getNonFatalFailures().size());
        assertTrue(curi.getNonFatalFailures().toArray()[0] instanceof SSLException);
        assertEquals(FetchStatusCodes.S_CONNECT_FAILED, curi.getFetchStatus());
        assertEquals(0, curi.getFetchCompletedTime());
    }

   
    public void testHttp11() throws Exception {
        CrawlURI curi = makeCrawlURI("http://localhost:7777/");
        fetcher().setUseHTTP11(true);
        fetcher().process(curi);
        assertTrue(httpRequestString(curi).startsWith("GET / HTTP/1.1\r\n"));
        // what else?
        runDefaultChecks(curi, "requestLine");
    }

    public void testChunked() throws Exception {
        CrawlURI curi = makeCrawlURI("http://localhost:7777/chunked.txt");
        fetcher().setUseHTTP11(true);
        fetcher().setSendConnectionClose(false);
       
        /* XXX Server expects us to close the connection apparently. But we
         * don't detect end of chunked transfer. With these small timeouts we
         * can finish quickly. A couple of SocketTimeoutExceptions will happen
         * within RecordingInputStream.readFullyOrUntil().
         */
        fetcher().setSoTimeoutMs(500);
        fetcher().setTimeoutSeconds(1);
       
        fetcher().process(curi);
       
        //        logger.info('\n' + httpRequestString(curi) + "\n\n" + rawResponseString(curi));
        //        logger.info("\n----- rawResponseString -----\n" + rawResponseString(curi));
        //        logger.info("\n----- contentString -----\n" + contentString(curi));
        //        logger.info("\n----- entityString -----\n" + entityString(curi));
        //        logger.info("\n----- messageBodyString -----\n" + messageBodyString(curi));
       
        assertEquals("chunked", curi.getHttpResponseHeader("transfer-encoding"));
        assertEquals("25\r\n" + DEFAULT_PAYLOAD_STRING + "\r\n0\r\n\r\n", messageBodyString(curi));
        assertEquals(DEFAULT_PAYLOAD_STRING, entityString(curi));
        assertEquals(DEFAULT_PAYLOAD_STRING, contentString(curi));
    }
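The "25" in the expected message body is the chunk-size line of HTTP/1.1 chunked transfer coding: the payload length in hexadecimal (0x25, i.e. 37 bytes of DEFAULT_PAYLOAD_STRING), followed by the chunk data and the zero-length terminating chunk. As a small sketch, the expected single-chunk body could be assembled generically like this (assuming an ASCII payload):

    // "<hex length>\r\n<data>\r\n0\r\n\r\n" -- one data chunk plus the terminating chunk
    static String singleChunkBody(String payload) {
        int len = payload.getBytes(java.nio.charset.StandardCharsets.US_ASCII).length;
        return Integer.toHexString(len) + "\r\n" + payload + "\r\n0\r\n\r\n";
    }

For a 37-byte ASCII DEFAULT_PAYLOAD_STRING this produces exactly the string the assertion above compares against messageBodyString(curi).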

    public void testNoResponse() throws Exception {
        NoResponseServer noResponseServer = new NoResponseServer("localhost", 7780);
        noResponseServer.start();
       
        // CrawlURI curi = makeCrawlURI("http://stats.bbc.co.uk/robots.txt");
        CrawlURI curi = makeCrawlURI("http://localhost:7780");
        fetcher().process(curi);
        assertEquals(1, curi.getNonFatalFailures().size());
        assertTrue(curi.getNonFatalFailures().toArray()[0] instanceof NoHttpResponseException);
        assertEquals(FetchStatusCodes.S_CONNECT_FAILED, curi.getFetchStatus());
        assertEquals(0, curi.getFetchCompletedTime());
       
        noResponseServer.beDone();
        noResponseServer.join();
    }
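NoResponseServer itself is not shown in this listing; conceptually, any server that accepts the TCP connection, reads the request, and closes the socket without writing a single response byte will cause httpcomponents to throw NoHttpResponseException on the client side. A rough stand-in sketch (not the actual NoResponseServer implementation):

    // Rough stand-in: accept one connection, consume (part of) the request,
    // then close without writing any response bytes.
    private static void acceptAndDropOneConnection(int port) throws java.io.IOException {
        try (java.net.ServerSocket serverSocket = new java.net.ServerSocket(port);
                java.net.Socket socket = serverSocket.accept()) {
            socket.getInputStream().read(new byte[8192]); // swallow the request
        } // sockets close here; the client sees the connection drop with no response
    }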

    /**
     * Tests fetching a url with lax percent-encoding (a bare "%" that is not,
     * strictly speaking, legal in a url). See class comment on {@link UURI}.
     *
     * @throws Exception
     */
    public void testLaxUrlEncoding() throws Exception {
        CrawlURI curi = makeCrawlURI("http://localhost:7777/99%");
        fetcher().process(curi);
        // logger.info('\n' + httpRequestString(curi) + "\n\n" + rawResponseString(curi));
        assertTrue(httpRequestString(curi).startsWith("GET /99% HTTP/1.0\r\n"));
        runDefaultChecks(curi, "requestLine");
    }
   
    public void testTwoQuestionMarks() throws Exception {
        CrawlURI curi = makeCrawlURI("http://localhost:7777/??blahblah");
        fetcher().process(curi);
        // logger.info('\n' + httpRequestString(curi) + "\n\n" + rawResponseString(curi));
        assertTrue(httpRequestString(curi).startsWith("GET /??blahblah HTTP/1.0\r\n"));
        runDefaultChecks(curi, "requestLine");
    }
   
    public void testUrlWithSpaces() throws Exception {
        CrawlURI curi = makeCrawlURI("http://localhost:7777/url with spaces?query%20with%20spaces");
        fetcher().process(curi);
        assertTrue(httpRequestString(curi).startsWith("GET /url%20with%20spaces?query%20with%20spaces HTTP/1.0\r\n"));
        runDefaultChecks(curi, "requestLine");

        curi = makeCrawlURI("http://localhost:7777/url%20with%20spaces?query with spaces");
        fetcher().process(curi);
        assertTrue(httpRequestString(curi).startsWith("GET /url%20with%20spaces?query%20with%20spaces HTTP/1.0\r\n"));
        runDefaultChecks(curi, "requestLine");
    }
