Examples of UURI


Examples of org.archive.net.UURI

        }
    }

    public void testCandidateURIWithLoadedAList()
            throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.archive.org");
        CrawlURI curi = new CrawlURI(uuri);
        curi.setSeed(true);
        curi.getData().put("key", "value");
        assertTrue("Didn't find AList item",
                curi.getData().get("key").equals("value"));
View Full Code Here

Examples of org.archive.net.UURI

                            newURL = newURL.substring(0,newURL.length()-1);
                        }

                        // And add the URL to speculative embeds.
                        numberOfLinksExtracted.incrementAndGet();
                        UURI dest = UURIFactory.getInstance(newURL);
                        LinkContext lc = LinkContext.SPECULATIVE_MISC;
                        Hop hop = Hop.SPECULATIVE;
                        addOutlink(curi,  dest, lc, hop);
                    }
                    // Reset lookat for next string.
View Full Code Here

Examples of org.archive.net.UURI

   
    @Override
    protected Collection<TestData> makeData(String content, String destURI)
    throws Exception {
        List<TestData> result = new ArrayList<TestData>();
        UURI src = UURIFactory.getInstance("http://www.archive.org/start/");
        CrawlURI euri = new CrawlURI(src, null, null,
                LinkContext.NAVLINK_MISC);
        Recorder recorder = createRecorder(content, "UTF-8");
        euri.setContentType("text/html");
        euri.setRecorder(recorder);
        euri.setContentSize(content.length());
               
        UURI dest = UURIFactory.getInstance(destURI);
        LinkContext context = determineContext(content);
        Hop hop = determineHop(content);
        CrawlURI link = euri.createCrawlURI(dest, context, hop);
        result.add(new TestData(euri, link));
       
View Full Code Here

Examples of org.archive.net.UURI

   
    @Override
    protected Collection<TestData> makeData(String content, String destURI)
    throws Exception {
        List<TestData> result = new ArrayList<TestData>();
        UURI src = UURIFactory.getInstance("http://www.archive.org/foo/dummy.js");
        CrawlURI euri = new CrawlURI(src, null, src, LinkContext.NAVLINK_MISC);
        Recorder recorder = createRecorder(content, "UTF-8");
        euri.setContentType("text/javascript");
        euri.setRecorder(recorder);
        euri.setContentSize(content.length());

        if (destURI != null) {
            UURI dest = UURIFactory.getInstance(destURI);
            CrawlURI link = euri.createCrawlURI(dest, LinkContext.JS_MISC, Hop.SPECULATIVE);
            result.add(new TestData(euri, link));
        } else {
            result.add(new TestData(euri, null));
        }
View Full Code Here

Examples of org.archive.net.UURI

     *
     * @see org.archive.crawler.frontier.CostAssignmentPolicy#costOf(org.archive.modules.CrawlURI)
     */
    public int costOf(CrawlURI curi) {
        int cost = 1;
        UURI uuri = curi.getUURI();
        if (uuri.hasQuery()) {
            // has query string
            cost++;
            int qIndex = uuri.toString().indexOf('?');
            if (curi.flattenVia().startsWith(uuri.toString().substring(0,qIndex))) {
                // non-query-string portion of URI is same as previous
                cost++;
            }
            // TODO: other potential query-related cost penalties:
            //  - more than X query-string attributes
View Full Code Here

Examples of org.archive.net.UURI

   
    /**
     * No-argument constructor. The body is empty, so no state is set up here.
     */
    public HopCrossesAssignmentLevelDomainDecideRule() {
    }

    protected boolean evaluate(CrawlURI uri) {
        UURI via = uri.getVia();
        if (via == null) {
            return false;
        }
        try {
            // determine if this hop crosses assignment-level-domain borders
View Full Code Here

Examples of org.archive.net.UURI


    @Override
    protected Collection<TestData> makeData(String content, String uri)
    throws Exception {
        UURI src = UURIFactory.getInstance("http://www.archive.org/start/");
        CrawlURI euri = new CrawlURI(src, null, null, NAVLINK_MISC);
        Recorder recorder = createRecorder(content);
        euri.setContentType("text/css");
        euri.setRecorder(recorder);
        euri.setContentSize(content.length());
       
        // TODO: This test was naively modified to account for the abscense of LINK, but no effort was made to confirm that it is actually testing anything useful
        UURI dest = UURIFactory.getInstance(uri);
        CrawlURI link = euri.createCrawlURI(dest, EMBED_MISC, Hop.EMBED);
        TestData td = new TestData(euri, link);
        return Collections.singleton(td);
    }
View Full Code Here

Examples of org.archive.net.UURI

    /**
     * No-argument constructor. The body is empty, so no state is set up here.
     */
    public AddRedirectFromRootServerToScope() {
    }

    @Override
    protected boolean evaluate(CrawlURI uri) {
        UURI via = uri.getVia();
        if (via == null) {
            return false;
        }
        try {
            String chost = uri.getUURI().getHostBasename();
            if (chost == null) {
                return false;
            }
           
            String viaHost = via.getHostBasename();
            if (viaHost == null) {
                return false;
            }
           
            if (chost.equals(viaHost) && uri.isLocation()
                    && via.getPath().equals(SLASH)) {
                uri.setSeed(true);
                LOGGER.info("Adding " + uri + " to seeds via " + via);
                return true;
            }
        } catch (URIException e) {
View Full Code Here

Examples of org.archive.net.UURI

     * Test a GET FORM ACTION extraction
     *
     * @throws URIException
     */
    public void testFormsLinkGet() throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.example.org");
        CrawlURI curi = new CrawlURI(uuri);
        CharSequence cs =
          "<form name=\"testform\" method=\"GET\" action=\"redirect_me?form=true\"> " +
          "  <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
          "  <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
View Full Code Here

Examples of org.archive.net.UURI

     * Test a POST FORM ACTION being properly ignored
     *
     * @throws URIException
     */
    public void testFormsLinkIgnorePost() throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.example.org");
        CrawlURI curi = new CrawlURI(uuri);
        CharSequence cs =
            "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " +
            "  <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
            "  <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.