Package org.archive.net

Examples of org.archive.net.UURI


        long start = System.currentTimeMillis();
        ArrayList<UURI> list = new ArrayList<UURI>(1000);
        int count = 0;
        final int MAX_COUNT = 1000;
        for (; count < MAX_COUNT; count++) {
          UURI u = UURIFactory.getInstance("http://www" +
              count + ".archive.org/" + count + "/index.html");
          this.filter.add(u.toString(), new CrawlURI(u));
          if (count > 0 && ((count % 100) == 0)) {
            list.add(u);
          }
        }
        this.logger.info("Added " + count + " in " +
            (System.currentTimeMillis() - start));
       
        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            this.filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        this.logger.info("Added random " + list.size() + " in " +
            (System.currentTimeMillis() - start));
       
        start = System.currentTimeMillis();
        for (Iterator<UURI> i = list.iterator(); i.hasNext();) {
            UURI uuri = i.next();
            this.filter.add(uuri.toString(), new CrawlURI(uuri));
        }
        this.logger.info("Deleted random " + list.size() + " in " +
            (System.currentTimeMillis() - start));
        // Looks like delete doesn't work.
        assertTrue("Count is off: " + this.filter.count(),
View Full Code Here


        this.uuri = uuri;
        this.pathFromSeed = "";
    }

    public static CrawlURI fromHopsViaString(String uriHopsViaContext) throws URIException {
        UURI u;
        String args[] = uriHopsViaContext.split("\\s+");
        u = UURIFactory.getInstance(args[0]);
        String pathFromSeed = (args.length > 1)?
            args[1].toString() : "";
        UURI via = (args.length > 2 && args[2].length()>1) ?
            UURIFactory.getInstance(args[2].toString()):
            null;
        LinkContext viaContext = (args.length > 3 && args[2].length()>1) ?
                HTMLLinkContext.get(args[3].toString()): null;
        CrawlURI caUri = new CrawlURI(u, pathFromSeed, via, viaContext);
View Full Code Here

        return createCrawlURI(destination.toString(), context, hop);
    }

    public CrawlURI createCrawlURI(String destination, LinkContext context, Hop hop)
        throws URIException {
        UURI u = UURIFactory.getInstance(this.getBaseURI(), destination);
        CrawlURI newCaURI = new CrawlURI(
            u,
                extendHopsPath(getPathFromSeed(),
                    hop.getHopChar()),
                this.getUURI(),
View Full Code Here

     * lands in the same queue, with the same overlay values, as the
     * triggering URI.
     * @return UURI to use for policy decisions
     */
    public UURI getPolicyBasisUURI() {
        UURI effectiveuuri = null;
        // always use 'via' of prerequisite URIs, if available, so
        // prerequisites go to same queue as trigger URI
        if (getPathFromSeed().endsWith(Hop.PREREQ.getHopString())) {
            effectiveuuri = getVia();
        }
View Full Code Here

        return testUri;
    }
    private CrawlURI createTestUri(String urlStr, String via) throws URIException {
       
        UURI testViaUuri = UURIFactory.getInstance(via);
        CrawlURI testUri = createTestUri(urlStr);
        testUri.setVia(testViaUuri);

        return testUri;
    }
View Full Code Here

        assertTrue(dr.evaluate(testUri));
    }
   
    private CrawlURI createTestUri(String urlStr) throws URIException{
        UURI testUuri = UURIFactory.getInstance(urlStr);
        CrawlURI testUri = new CrawlURI(testUuri, null, null, LinkContext.NAVLINK_MISC);

        return testUri;
    }
View Full Code Here

        assertFalse(dr.evaluate(testUri));
    }
   
    private CrawlURI createTestUri(String urlStr) throws URIException{
        UURI testUuri = UURIFactory.getInstance(urlStr);
        CrawlURI testUri = new CrawlURI(testUuri, null, null, LinkContext.NAVLINK_MISC);

        return testUri;
    }
View Full Code Here

        assertEquals("http://example.org/original", curi2.getContentDigestHistory().get(A_ORIGINAL_URL));
    }

    protected CrawlURI makeCrawlURI(String uri) throws URIException,
            IOException {
        UURI uuri = UURIFactory.getInstance(uri);
        CrawlURI curi = new CrawlURI(uuri);
        curi.setSeed(true);
        curi.setRecorder(getRecorder());
        return curi;
    }
View Full Code Here

       
        private static String getKey(String url, boolean prefix)
        throws URIException {

                String key = ArchiveUtils.addImpliedHttpIfNecessary(url);
                UURI uuri = UURIFactory.getInstance(key);
                key = uuri.getScheme() + "://" + uuri.getAuthority() +
                        uuri.getEscapedPathQuery();

                key = SURT.fromURI(key);
               
                int hashPos = key.indexOf('#');
                if(hashPos != -1) {
View Full Code Here

     *
     * @param curi CrawlURI to add discoveries to
     * @param wref Link to examine for internal URIs
     */
    protected void extractLink(CrawlURI curi, CrawlURI wref) {
        UURI source = null;
        try {
            source = UURIFactory.getInstance(wref.getURI());
        } catch (URIException e) {
            LOGGER.log(Level.FINE,"bad URI",e);
        }
        if(source == null) {
            // shouldn't happen
            return;
        }
        List<String> found = extractQueryStringLinks(source);
        for (String uri : found) {
            try {
                UURI dest = UURIFactory.getInstance(uri);
                LinkContext lc = LinkContext.SPECULATIVE_MISC;
                Hop hop = Hop.SPECULATIVE;
                addOutlink(curi, dest, lc, hop);
                numberOfLinksExtracted.incrementAndGet();
            } catch (URIException e) {
View Full Code Here

TOP

Related Classes of org.archive.net.UURI

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.