Package org.archive.modules

Examples of org.archive.modules.CrawlURI


            oos.writeObject(this.seed);
            oos.close();
            // Read in the object.
            FileInputStream fis = new FileInputStream(serialize);
            ObjectInputStream ois = new ObjectInputStream(fis);
            CrawlURI deserializedCuri = (CrawlURI)ois.readObject();
            deserializedCuri = (CrawlURI)ois.readObject();
            deserializedCuri = (CrawlURI)ois.readObject();
            assertEquals("Deserialized not equal to original",
                    this.seed.toString(), deserializedCuri.toString());
            String host = this.seed.getUURI().getHost();
            assertTrue("Deserialized host not null",
                    host != null && host.length() >= 0);
        } finally {
            serialize.delete();
View Full Code Here


    }

    public void testCandidateURIWithLoadedAList()
            throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.archive.org");
        CrawlURI curi = new CrawlURI(uuri);
        curi.setSeed(true);
        curi.getData().put("key", "value");
        assertTrue("Didn't find AList item",
                curi.getData().get("key").equals("value"));
    }
View Full Code Here

    }


    public void testNullPathFromSeed() throws URIException {
        // check comparing with null
        CrawlURI a = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a == b
                null, // a < b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("1")); // a == b
        assertEquals("", a.getPathFromSeed());

        CrawlURI b = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a == b
                "", // a < b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("1")); // a == b
        assertEquals("", b.getPathFromSeed());

        assertEquals(0, a.compareTo(b));
        assertEquals(0, b.compareTo(a));

    }
View Full Code Here

    }

    public void testOrdering() throws URIException {
        // check that via is highest precedence
        CrawlURI a = new CrawlURI(
                UURIFactory.getInstance("http://example.com/2"), // a > b
                "2", // a > b
                UURIFactory.getInstance("http://example.com/via/1"), // a < b
                new SimpleLinkContext("2")); // a > b
        CrawlURI b = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a > b
                "1", // a > b
                UURIFactory.getInstance("http://example.com/via/2"), // a < b
                new SimpleLinkContext("1")); // a > b
        assertEquals(-1, a.compareTo(b));
        assertEquals(1, b.compareTo(a));

        // check that uri is next highest
        a = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a < b
                "2", // a > b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("2")); // a > b
        b = new CrawlURI(
                UURIFactory.getInstance("http://example.com/2"), // a < b
                "1", // a > b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("1")); // a > b
        assertEquals(-1, a.compareTo(b));
        assertEquals(1, b.compareTo(a));

        // check that via context is next
        a = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a == b
                "2", // a > b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("1")); // a < b
        b = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a == b
                "1", // a > b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("2")); // a < b
        assertEquals(-1, a.compareTo(b));
        assertEquals(1, b.compareTo(a));

        // check that pathFromSeed is next
        a = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a == b
                "1", // a < b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("1")); // a == b
        b = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a == b
                "2", // a < b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("1")); // a == b
        assertEquals(-1, a.compareTo(b));
        assertEquals(1, b.compareTo(a));

        // check equality
        a = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a == b
                "1", // a == b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("1")); // a == b
        b = new CrawlURI(
                UURIFactory.getInstance("http://example.com/1"), // a == b
                "1", // a == b
                UURIFactory.getInstance("http://example.com/via/1"), // a == b
                new SimpleLinkContext("1")); // a == b
        assertEquals(0, a.compareTo(b));
        assertEquals(0, b.compareTo(a));
    }
View Full Code Here

    /** Default constructor; performs no initialization of its own. */
    public FetchHistoryProcessor() {
    }

    @Override
    protected void innerProcess(CrawlURI puri) throws InterruptedException {
        CrawlURI curi = (CrawlURI) puri;
        curi.addPersistentDataMapKey(A_FETCH_HISTORY);
        HashMap<String, Object> latestFetch = new HashMap<String, Object>();

        // save status
        latestFetch.put(A_STATUS, curi.getFetchStatus());
        // save fetch start time
        latestFetch.put(A_FETCH_BEGAN_TIME, curi.getFetchBeginTime());
        // save digest
        String digest = curi.getContentDigestSchemeString();
        if (digest != null) {
            latestFetch.put(A_CONTENT_DIGEST, digest);
        }
        // save relevant HTTP headers, if available
        if (curi.isHttpTransaction()) {
            saveHeader(curi, latestFetch, A_ETAG_HEADER);
            saveHeader(curi, latestFetch, A_LAST_MODIFIED_HEADER);

            // save reference length (real or virtual)
            long referenceLength;
            if (curi.containsDataKey(A_REFERENCE_LENGTH)) {
                // reuse previous length if available (see FetchHTTP#setSizes).
                referenceLength = (Long) curi.getData().get(A_REFERENCE_LENGTH);
            } else {
                // normally, use content-length
                referenceLength = curi.getContentLength();
            }
            latestFetch.put(A_REFERENCE_LENGTH, referenceLength);
        }

        HashMap<String, Object>[] history = historyRealloc(curi);

        // rotate all history entries up one slot, insert new at [0]
        for (int i = history.length - 1; i > 0; i--) {
            history[i] = history[i - 1];
        }
        history[0] = latestFetch;

        curi.getData().put(A_FETCH_HISTORY, history);

        if (curi.getFetchStatus() == 304) {
            ServerNotModifiedRevisit revisit = new ServerNotModifiedRevisit();
            revisit.setETag((String) latestFetch.get(A_ETAG_HEADER));
            revisit.setLastModified((String) latestFetch.get(A_LAST_MODIFIED_HEADER));
            curi.setRevisitProfile(revisit);
        } else if (hasIdenticalDigest(curi)) {
            curi.getAnnotations().add("duplicate:digest");
            IdenticalPayloadDigestRevisit revisit =
                new IdenticalPayloadDigestRevisit((String)history[1].get(A_CONTENT_DIGEST));
            revisit.setRefersToTargetURI(curi.getURI()); // Matches are always on the same URI
            revisit.setRefersToDate((Long)history[1].get(A_FETCH_BEGAN_TIME));
            curi.setRevisitProfile(revisit);
        }
    }
View Full Code Here

   
    /* (non-Javadoc)
     * @see org.archive.crawler.framework.Frontier#next()
     */
    public CrawlURI next() throws InterruptedException {
        CrawlURI crawlable = null;
        while(crawlable==null) {
            outboundLock.readLock().lockInterruptibly();
            // try filling outbound until we get something to work on
            crawlable = findEligibleURI();
            outboundLock.readLock().unlock();
View Full Code Here

                String lineType = read.substring(0, 3);
                m.reset(lineType);
                if(m.matches()) {
                    try {
                        String uriHopsViaString = read.substring(3).trim();
                        CrawlURI curi = CrawlURI.fromHopsViaString(uriHopsViaString);
                        if(scope!=null) {
                            sheetOverlaysManager.applyOverlaysTo(curi);
                            try {
                                KeyedProperties.loadOverridesFrom(curi);
                                if(!scope.accepts(curi)) {
                                    // skip out-of-scope URIs if so configured
                                    continue;
                                }
                            } finally {
                                KeyedProperties.clearOverridesFrom(curi);
                            }
                        }
                        if(includeOnly) {
                            considerIncluded(curi);
                            newJournal.included(curi);
                        } else {
                            curi.setForceFetch(forceFetch);
                            schedule(curi);
                        }
                    } catch (URIException e) {
                        logger.log(Level.WARNING,"Problem line: "+read, e);
                    }
View Full Code Here

    @Override
    protected Collection<TestData> makeData(String content, String destURI)
    throws Exception {
        List<TestData> result = new ArrayList<TestData>();
        UURI src = UURIFactory.getInstance("http://www.archive.org/start/");
        CrawlURI euri = new CrawlURI(src, null, null,
                LinkContext.NAVLINK_MISC);
        Recorder recorder = createRecorder(content, "UTF-8");
        euri.setContentType("text/html");
        euri.setRecorder(recorder);
        euri.setContentSize(content.length());
               
        UURI dest = UURIFactory.getInstance(destURI);
        LinkContext context = determineContext(content);
        Hop hop = determineHop(content);
        CrawlURI link = euri.createCrawlURI(dest, context, hop);
        result.add(new TestData(euri, link));
       
        euri = new CrawlURI(src, null, null, LinkContext.NAVLINK_MISC);
        recorder = createRecorder(content, "UTF-8");
        euri.setContentType("application/xhtml");
        euri.setRecorder(recorder);
        euri.setContentSize(content.length());
        result.add(new TestData(euri, link));
View Full Code Here

     * @param expected String target URI that should be extracted
     * @param source CharSequence source material to extract
     * @throws URIException
     */
    protected void expectSingleLink(String expected, CharSequence source) throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        getExtractor().extract(puri, source);
        CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]);
        assertTrue("did not find single link",links.length==1);
        assertTrue("expected link not found",
                links[0].getURI().equals(expected));
    }
View Full Code Here

     * [HER-1280] do not by default GET form action URLs declared as POST,
     * because it can cause problems/complaints
     * http://webteam.archive.org/jira/browse/HER-1280
     */
    public void testOnlyExtractFormGets() throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        CharSequence cs =
            "<form method=\"get\" action=\"http://www.example.com/ok1\"> "+
            "<form action=\"http://www.example.com/ok2\" method=\"get\"> "+
            "<form method=\"post\" action=\"http://www.example.com/notok\"> "+
            "<form action=\"http://www.example.com/ok3\"> ";
        getExtractor().extract(puri, cs);
        // find exactly 3 (not the POST) action URIs
        assertTrue("incorrect number of links found", puri.getOutLinks().size()==3);
    }
View Full Code Here

TOP

Related Classes of org.archive.modules.CrawlURI

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., now owned by Oracle Inc. Contact coftware#gmail.com.