Package org.archive.modules

Examples of org.archive.modules.CrawlURI


        }
    }

    public void xestAri3712() throws MalformedURLException, IOException {
        String url = "https://wayback.archive-it.org/3771/20131119163257/http://nyumedecs.kk5.org/_app/28727/en/resources/container.swf";
        CrawlURI curi = setupURI(url);
        curi.setVia(UURIFactory.getInstance("http://nyumedecs.kk5.org/"));
        long startTime = System.currentTimeMillis();
        this.extractor.extract(curi);
        long elapsed = System.currentTimeMillis() - startTime;
        logger.info(this.extractor.getClass().getSimpleName() + " took "
                + elapsed + "ms to process " + url);

        HashMap<CharSequence, String> expected = new HashMap<CharSequence, String>();
        expected.put("http://nyumedecs.kk5.org/sm4/portal", "extractorSWFRelToVia");
        expected.put("https://wayback.archive-it.org/3771/20131119163257/http://nyumedecs.kk5.org/_app/28727/en/resources/sm4/portal", "extractorSWFRelToBase");
        expected.put("http://nyumedecs.kk5.org/", "extractorSWFRelToVia");
        expected.put("https://wayback.archive-it.org/", "extractorSWFRelToBase");
        expected.put("http://nyumedecs.kk5.org/loadingBarEdit.swf", "extractorSWFRelToVia");
        expected.put("https://wayback.archive-it.org/3771/20131119163257/http://nyumedecs.kk5.org/_app/28727/en/resources/loadingBarEdit.swf", "extractorSWFRelToBase");
        expected.put("http://nyumedecs.kk5.org/containermain.swf", "extractorSWFRelToVia");
        expected.put("https://wayback.archive-it.org/3771/20131119163257/http://nyumedecs.kk5.org/_app/28727/en/resources/containermain.swf", "extractorSWFRelToBase");

        for (CrawlURI link: curi.getOutLinks()) {
            System.out.println(link + " " + link.getData());
            assertEquals(1, link.getAnnotations().size());

            String dest = link.toString();
            assertTrue(expected.containsKey(dest));
View Full Code Here


        this.filter.setDestination(this);
    }

    public void testAdding() throws URIException {
        this.filter.add(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        this.filter.addNow(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        this.filter.addForce(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        // Should only have add 'this' once.
        assertTrue("Count is off", this.filter.count() == 1);
    }
View Full Code Here

            assertEquals("count off",count,filter.count());
            UURI u = UURIFactory.getInstance("http://www" +
                    count + ".archive.org/" + count + "/index.html");
            assertFalse("already contained "+u.toString(),filter.bloom.contains(u.toString()));
            logger.fine("adding "+u.toString());
            filter.add(u.toString(), new CrawlURI(u));
            assertTrue("not in bloom",filter.bloom.contains(u.toString()));
            if (count > 0 && ((count % 100) == 0)) {
                list.add(u);
            }
        }
        logger.fine("Added " + count + " in " +
                (System.currentTimeMillis() - start));

        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        logger.fine("Readded subset " + list.size() + " in " +
                (System.currentTimeMillis() - start));

        assertTrue("Count is off: " + filter.count(),
View Full Code Here

        // }
  }
   
    public void testAdding() throws URIException {
      this.filter.add(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        this.filter.addNow(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        this.filter.addForce(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(this.getUri())));
        // Should only have add 'this' once.
        assertTrue("Count is off", this.filter.count() == 1);
    }
View Full Code Here

        ArrayList<UURI> list = new ArrayList<UURI>(1000);
        int count = 0;
        for (; count < max; count++) {
            UURI u = UURIFactory.getInstance("http://www" +
                count + ".archive.org/" + count + "/index.html");
            this.filter.add(u.toString(), new CrawlURI(u));
            if (count > 0 && ((count % 100) == 0)) {
                list.add(u);
            }
            if (count > 0 && ((count % 100000) == 0)) {
                this.logger.info("Added " + count + " in " +
                    (System.currentTimeMillis() - start) +
                    " misses " +
                    ((BdbUriUniqFilter)this.filter).getCacheMisses() +
                    " diff of misses " +
                    ((BdbUriUniqFilter)this.filter).getLastCacheMissDiff());
            }
        }
        this.logger.info("Added " + count + " in " +
            (System.currentTimeMillis() - start));
       
        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            this.filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        this.logger.info("Added random " + list.size() + " in " +
                (System.currentTimeMillis() - start));
       
        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            this.filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        this.logger.info("Deleted random " + list.size() + " in " +
            (System.currentTimeMillis() - start));
        // Looks like delete doesn't work.
        assertTrue("Count is off: " + this.filter.count(),
View Full Code Here

        assertFalse("Receiver was called", this.received);
    }
   
    public void testForgetOnEmpty() throws URIException {
        this.filter.forget(this.getUri(),
            new CrawlURI(UURIFactory.getInstance(getUri())));
        assertEquals("Didn't forget", 0, this.filter.count());
    }
View Full Code Here

            "sch=%2E%2F%3Faction%3Dsearch");
    }
   
    private void doOneSerialization(final String urlStr)
    throws IOException, ClassNotFoundException {
        CrawlURI cauri =
            new CrawlURI(UURIFactory.getInstance(urlStr));
        cauri = serialize(cauri);
        assertEquals(urlStr + " doesn't serialize", urlStr,
            cauri.getUURI().toString())
    }
View Full Code Here

                "hTtP://fOrGeTmE.cOm/bar",
                "http://forgetme.com:80/toot/spuh",
                "http://forgetme.com:90/toot/spuh",
                "https://forgetme.com/baz",
        }) {
            CrawlURI curi = new CrawlURI(UURIFactory.getInstance(uri));
            this.filter.add(curi.getUURI().toCustomString(), curi);
        }

        assertEquals(countBefore + 6, this.filter.count());

        BdbUriUniqFilter bdbFilter = (BdbUriUniqFilter) filter;
View Full Code Here

            // Does not begin with scheme, so try http://
            uri = "http://" + uri;
        }
        try {
            UURI uuri = UURIFactory.getInstance(uri);
            CrawlURI curi = new CrawlURI(uuri);
            curi.setSeed(true);
            curi.setSchedulingDirective(SchedulingConstants.MEDIUM);
            if (getSourceTagSeeds()) {
                curi.setSourceTag(curi.toString());
            }
            publishAddedSeed(curi);
        } catch (URIException e) {
            // try as nonseed line as fallback
            nonseedLine(uri);
View Full Code Here

    CrawlURI seed = null;

    protected void setUp() throws Exception {
        super.setUp();
        final String url = "http://www.dh.gov.uk/Home/fs/en";
        this.seed = new CrawlURI(UURIFactory.getInstance(url));
        this.seed.setSchedulingDirective(SchedulingConstants.MEDIUM);
        this.seed.setSeed(true);
        // Force caching of string.
        this.seed.toString();
        // TODO: should this via really be itself?
View Full Code Here

TOP

Related Classes of org.archive.modules.CrawlURI

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.