Package org.archive.modules

Examples of org.archive.modules.CrawlURI


        assertTrue(httpRequestString(curi).startsWith("GET /url%20with%20spaces?query%20with%20spaces HTTP/1.0\r\n"));
        runDefaultChecks(curi, "requestLine");
    }
   
    public void testCharsets() throws Exception {
        CrawlURI curi = makeCrawlURI("http://localhost:7777/cp1251");
        fetcher().process(curi);
        assertEquals("text/plain;charset=cp1251", curi.getHttpResponseHeader("content-type"));
        assertEquals(Charset.forName("cp1251"), curi.getRecorder().getCharset());
        assertTrue(Arrays.equals(FetchHTTPTest.CP1251_PAYLOAD, IOUtils.toByteArray(curi.getRecorder().getContentReplayInputStream())));
        assertEquals("\u041A\u043E\u0447\u0430\u043D\u0438 \u041E\u0440\u043A"
                + "\u0435\u0441\u0442\u0430\u0440 \u0435 \u0435\u0434\u0435"
                + "\u043D \u043E\u0434 \u043D\u0430\u0458\u043F\u043E\u0437"
                + "\u043D\u0430\u0442\u0438\u0442\u0435 \u0438 \u043D\u0430"
                + "\u0458\u043F\u043E\u043F\u0443\u043B\u0430\u0440\u043D"
                + "\u0438\u0442\u0435 \u0431\u043B\u0435\u0445-\u043E\u0440"
                + "\u043A\u0435\u0441\u0442\u0440\u0438 \u0432\u043E \u0441"
                + "\u0432\u0435\u0442\u043E\u0442, \u043A\u043E\u0458 \u0433"
                + "\u043E \u0441\u043E\u0447\u0438\u043D\u0443\u0432\u0430"
                + "\u0430\u0442 \u0434\u0435\u0441\u0435\u0442\u043C\u0438"
                + "\u043D\u0430 \u0420\u043E\u043C\u0438-\u041C\u0430\u043A"
                + "\u0435\u0434\u043E\u043D\u0446\u0438 \u043F\u043E \u043F"
                + "\u043E\u0442\u0435\u043A\u043B\u043E \u043E\u0434 \u041A"
                + "\u043E\u0447\u0430\u043D\u0438, \u043F\u0440\u0435\u0434"
                + "\u0432\u043E\u0434\u0435\u043D\u0438 \u043E\u0434 \u0442"
                + "\u0440\u0443\u0431\u0430\u0447\u043E\u0442 \u041D\u0430"
                + "\u0430\u0442 (\u041D\u0435\u0430\u0442) \u0412\u0435\u043B"
                + "\u0438\u043E\u0432.\n",
                curi.getRecorder().getContentReplayCharSequence().toString());

        curi = makeCrawlURI("http://localhost:7777/unsupported-charset");
        fetcher().process(curi);
        assertEquals("text/plain;charset=UNSUPPORTED-CHARSET", curi.getHttpResponseHeader("content-type"));
        assertTrue(curi.getAnnotations().contains("unsatisfiableCharsetInHeader:UNSUPPORTED-CHARSET"));
        assertEquals(Charset.forName("latin1"), curi.getRecorder().getCharset()); // default fallback
        runDefaultChecks(curi, "requestLine", "contentType");
       
        curi = makeCrawlURI("http://localhost:7777/invalid-charset");
        fetcher().process(curi);
        assertEquals("text/plain;charset=%%INVALID-CHARSET%%", curi.getHttpResponseHeader("content-type"));
        assertTrue(curi.getAnnotations().contains("unsatisfiableCharsetInHeader:%%INVALID-CHARSET%%"));
        assertEquals(Charset.forName("latin1"), curi.getRecorder().getCharset()); // default fallback
        runDefaultChecks(curi, "requestLine", "contentType");
    }
View Full Code Here


        runDefaultChecks(curi, "requestLine", "contentType");
    }

    // see https://webarchive.jira.com/browse/HER-2063
    public void testHostHeaderDefaultPort() throws Exception {
        CrawlURI curi = makeCrawlURI("http://example.com/");
        fetcher().process(curi);
        assertTrue(httpRequestString(curi).contains("Host: example.com\r\n"));

        curi = makeCrawlURI("https://example.com/");
        fetcher().process(curi);
View Full Code Here

        fetcher().process(curi);
        assertTrue(httpRequestString(curi).contains("Host: example.com\r\n"));
    }
   
    public void testHttpPost() throws Exception {
        CrawlURI curi = makeCrawlURI("http://localhost:7777/");
        curi.setFetchType(FetchType.HTTP_POST);

        List<NameValuePair> params = new LinkedList<NameValuePair>();
        params.add(new BasicNameValuePair("name1", "value1"));
        params.add(new BasicNameValuePair("name1", "value2"));
        params.add(new BasicNameValuePair("funky name 2", "whoa crazy\t && 🍺 🍻 \n crazier \rooo"));
        String submitData = URLEncodedUtils.format(params, "UTF-8");
        assertEquals("name1=value1&name1=value2&funky+name+2=whoa+crazy%09+%26%26+%F0%9F%8D%BA+%F0%9F%8D%BB+%0A+crazier+%0Dooo", submitData);

        curi.getData().put(CoreAttributeConstants.A_SUBMIT_DATA, submitData);
        fetcher().process(curi);
       
        assertTrue(httpRequestString(curi).startsWith("POST / HTTP/1.0\r\n"));
        assertTrue(httpRequestString(curi).endsWith("\r\n\r\nname1=value1&name1=value2&funky+name+2=whoa+crazy%09+%26%26+%F0%9F%8D%BA+%F0%9F%8D%BB+%0A+crazier+%0Dooo"));
        assertEquals(FetchType.HTTP_POST, curi.getFetchType());
        runDefaultChecks(curi, "requestLine", "trailingCRLFCRLF", "fetchTypeGET");
    }
View Full Code Here

    }

   
    @Override
    protected String getString(CrawlURI uri) {
        CrawlURI curi = (CrawlURI)uri;
        return controller.getFrontier().getClassKey(curi);
    }
View Full Code Here

        return puri instanceof CrawlURI;
    }
   
   
    protected void innerProcess(final CrawlURI puri) {
        CrawlURI curi = (CrawlURI)puri;
       
        // If prerequisites or no links, nothing to be done in here.
        if (curi.hasPrerequisiteUri() || curi.getOutLinks().isEmpty()) {
            return;
        }
       
//        Collection<CrawlURI> inScopeLinks = new HashSet<CrawlURI>();
        Iterator<CrawlURI> iter = curi.getOutLinks().iterator();
        while (iter.hasNext()) {
            CrawlURI cauri = iter.next();
            if (!isInScope(cauri)) {
                iter.remove();
            }
        }
//        for (CrawlURI cauri: curi.getOutCandidates()) {
View Full Code Here

//        curi.replaceOutlinks(inScopeLinks);
    }
   
    protected boolean isInScope(CrawlURI caUri) {
        // TODO: Fix filters so work on CrawlURI.
        CrawlURI curi = (caUri instanceof CrawlURI)?
            (CrawlURI)caUri:
            new CrawlURI(caUri.getUURI());
        boolean result = false;
        DecideRule seq = getSupplementaryRule();
        if (seq.decisionFor(curi) == DecideResult.ACCEPT) {
            result = true;
            if (LOGGER.isLoggable(Level.FINER)) {
View Full Code Here

     */
    protected void addOutlink(CrawlURI curi, String uri, LinkContext context,
            Hop hop) {
        try {
            UURI dest = UURIFactory.getInstance(curi.getUURI(), uri);
            CrawlURI link = curi.createCrawlURI(dest, context, hop);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logUriError(e, curi.getUURI(), uri);
        }
    }
View Full Code Here

    }
   
    protected void addOutlink(CrawlURI curi, UURI uuri, LinkContext context,
            Hop hop) {
        try {
            CrawlURI link = curi.createCrawlURI(uuri, context, hop);
            curi.getOutLinks().add(link);
        } catch (URIException e) {
            logUriError(e, curi.getUURI(), uuri.toString());
        }
    }
View Full Code Here


    private static CrawlURI add2(CrawlURI curi, int max, UURI dest,
            LinkContext context, Hop hop) throws URIException {
        if (curi.getOutLinks().size() < max) {
            CrawlURI link = curi.createCrawlURI(dest, context, hop);
            curi.getOutLinks().add(link);
            return link;
        } else {
            curi.incrementDiscardedOutLinks();
            return null;
View Full Code Here

    protected void innerProcess(CrawlURI puri) {
        throw new AssertionError();
    }
   
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        CrawlURI curi = (CrawlURI)puri;
        final CrawlServer server = serverCache.getServerFor(curi.getUURI());
        final CrawlHost host = serverCache.getHostFor(curi.getUURI());
        FetchStats.HasFetchStats[] haveStats =
            new FetchStats.HasFetchStats[] {
                server,
                host,
                frontier.getGroup(curi)
View Full Code Here

TOP

Related Classes of org.archive.modules.CrawlURI

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.