Package org.archive.modules

Examples of org.archive.modules.CrawlURI


    @Override
    protected Collection<TestData> makeData(String content, String uri)
    throws Exception {
        UURI src = UURIFactory.getInstance("http://www.archive.org/start/");
        CrawlURI euri = new CrawlURI(src, null, null, NAVLINK_MISC);
        Recorder recorder = createRecorder(content);
        euri.setContentType("text/css");
        euri.setRecorder(recorder);
        euri.setContentSize(content.length());
       
        // TODO: This test was naively modified to account for the abscense of LINK, but no effort was made to confirm that it is actually testing anything useful
        UURI dest = UURIFactory.getInstance(uri);
        CrawlURI link = euri.createCrawlURI(dest, EMBED_MISC, Hop.EMBED);
        TestData td = new TestData(euri, link);
        return Collections.singleton(td);
    }
View Full Code Here


    }
   

    @Override
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        CrawlURI curi = (CrawlURI)puri;
       
        // Check if uris should be blocked
        if (getBlockAll()) {
            curi.setFetchStatus(S_BLOCKED_BY_USER);
            return ProcessResult.FINISH;
        }

        // Check if allowed by regular expression
        String regex = getAllowByRegex();
        if (regex != null && !regex.equals("")) {
            if (!TextUtils.matches(regex, curi.toString())) {
                curi.setFetchStatus(S_BLOCKED_BY_USER);
                return ProcessResult.FINISH;
            }
        }

        // Check if blocked by regular expression
        regex = getBlockByRegex();
        if (regex != null && !regex.equals("")) {
            if (TextUtils.matches(regex, curi.toString())) {
                curi.setFetchStatus(S_BLOCKED_BY_USER);
                return ProcessResult.FINISH;
            }
        }

        // Possibly recheck scope
        if (getRecheckScope()) {
            if (!isInScope(curi)) {
                // Scope rejected
                curi.setFetchStatus(S_OUT_OF_SCOPE);
                return ProcessResult.FINISH;
            }
        }
       
        return ProcessResult.PROCEED;
View Full Code Here

                if (getExtractorJS() != null) {
                    linkCount += getExtractorJS().considerStrings(ext, curi, url);
                }
            } else {
                int max = ext.getExtractorParameters().getMaxOutlinks();
                CrawlURI relToVia = addRelativeToVia(curi, max, url,
                        LinkContext.EMBED_MISC, Hop.EMBED);
                CrawlURI relToBase = addRelativeToBase(curi, max, url,
                        LinkContext.EMBED_MISC, Hop.EMBED);
                addAnnotations(relToVia, relToBase);
                linkCount++;
            }
        }
View Full Code Here

        }

        public void considerStringAsUri(String str) throws IOException {
            if (UriUtils.isVeryLikelyUri(str)) {
                int max = ext.getExtractorParameters().getMaxOutlinks();
                CrawlURI relToVia = addRelativeToVia(curi, max, str,
                        LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
                CrawlURI relToBase = addRelativeToBase(curi, max, str,
                        LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
                addAnnotations(relToVia, relToBase);
                linkCount++;
            }
        }
View Full Code Here

     *
     * @throws URIException
     */
    public void testFormsLinkGet() throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.example.org");
        CrawlURI curi = new CrawlURI(uuri);
        CharSequence cs =
          "<form name=\"testform\" method=\"GET\" action=\"redirect_me?form=true\"> " +
          "  <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
          "  <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
          "  <select name=\"selectBox\">" +
          "    <option value=\"selectedOption\" selected>option1</option>" +
          "    <option value=\"nonselectedOption\">option2</option>" +
          "  </select>" +
          "  <input type=\"submit\" name=\"test\" value=\"Go\">" +
          "</form>";  
        getExtractor().extract(curi, cs);
        curi.getOutLinks();
        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((CrawlURI) object).getURI().indexOf(
                        "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0;
            }
        }));
View Full Code Here

     *
     * @throws URIException
     */
    public void testFormsLinkIgnorePost() throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.example.org");
        CrawlURI curi = new CrawlURI(uuri);
        CharSequence cs =
            "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " +
            "  <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
            "  <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
            "  <select name=\"selectBox\">" +
            "    <option value=\"selectedOption\" selected>option1</option>" +
            "    <option value=\"nonselectedOption\">option2</option>" +
            "  </select>" +
            "  <input type=\"submit\" name=\"test\" value=\"Go\">" +
            "</form>";  
        getExtractor().extract(curi, cs);
        curi.getOutLinks();
        assertTrue(!CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((CrawlURI) object).getURI().indexOf(
                        "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0;
            }
        }));
View Full Code Here

     *
     * @throws URIException
     */
    public void testFormsLinkFindPost() throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.example.org");
        CrawlURI curi = new CrawlURI(uuri);
        CharSequence cs =
            "<form name=\"testform\" method=\"POST\" action=\"redirect_me?form=true\"> " +
            "  <INPUT TYPE=CHECKBOX NAME=\"checked[]\" VALUE=\"1\" CHECKED> "+
            "  <INPUT TYPE=CHECKBOX NAME=\"unchecked[]\" VALUE=\"1\"> " +
            "  <select name=\"selectBox\">" +
            "    <option value=\"selectedOption\" selected>option1</option>" +
            "    <option value=\"nonselectedOption\">option2</option>" +
            "  </select>" +
            "  <input type=\"submit\" name=\"test\" value=\"Go\">" +
            "</form>";
        getExtractor().setExtractOnlyFormGets(false);
        getExtractor().extract(curi, cs);
        curi.getOutLinks();
        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((CrawlURI) object).getURI().indexOf(
                        "/redirect_me?form=true&checked[]=1&unchecked[]=&selectBox=selectedOption&test=Go")>=0;
            }
        }));
View Full Code Here

        }));
    }
   
    public void testMultipleAttributesPerElement() throws URIException {
        UURI uuri = UURIFactory.getInstance("http://www.example.org");
        CrawlURI curi = new CrawlURI(uuri);
        CharSequence cs = "<a src=\"http://www.example.com/\" href=\"http://www.archive.org/\"> ";
        getExtractor().extract(curi, cs);
        assertTrue("not all links found", curi.getOutLinks().size() == 2);
    }
View Full Code Here

    @Override
    protected Collection<TestData> makeData(String content, String destURI)
    throws Exception {
        List<TestData> result = new ArrayList<TestData>();
        UURI src = UURIFactory.getInstance("http://www.archive.org/start/");
        CrawlURI euri = new CrawlURI(src, null, null,
            LinkContext.SPECULATIVE_MISC);
        Recorder recorder = createRecorder(content, "UTF-8");
        euri.setContentType("text/xml");
        euri.setRecorder(recorder);
        euri.setContentSize(content.length());
               
        UURI dest = UURIFactory.getInstance(destURI);
        CrawlURI link = euri.createCrawlURI(dest, LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
        result.add(new TestData(euri, link));
       
        return result;
    }
View Full Code Here

            while ( true ) {
                ArchiveUtils.continueCheck();
               
                setStep(Step.ABOUT_TO_GET_URI, null);

                CrawlURI curi = controller.getFrontier().next();
               
               
                synchronized(this) {
                    ArchiveUtils.continueCheck();
                    setCurrentCuri(curi);
View Full Code Here

TOP

Related Classes of org.archive.modules.CrawlURI

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.