Examples of org.archive.modules.CrawlURI

org.archive.modules.CrawlURI
Represents a candidate URI and the associated state it collects as it is crawled.
Core state is held in instance variables, but a flexible attribute list is also available. Use this 'bucket' to carry custom extracted data and state across the stages of CrawlURI processing. See {@link #putString(String,String)}, {@link #getString(String)}, etc.
Note: getHttpMethod() has been removed starting with Heritrix 3.3.0. HTTP response headers are available using {@link #getHttpResponseHeader(String)}. (HTTP fetchers are responsible for setting the values using {@link #putHttpResponseHeader(String,String)}). @author Gordon Mohr


    /*
     * positive and negative tests for uris in meta tag's content attribute
     */
    public void testMetaContentURI() throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        CharSequence cs = 
                "<meta property=\"og:video\" content=\"http://www.example.com/absolute.mp4\" /> "+
                "<meta property=\"og:video\" content=\"/relative.mp4\" /> "+
                "<meta property=\"og:video:height\" content=\"333\" />"+
                "<meta property=\"og:video:type\" content=\"video/mp4\" />"+
                "<meta property=\"strangeproperty\" content=\"notaurl\" meaninglessurl=\"http://www.example.com/shouldnotbeextracted.html\" />";
        
        getExtractor().extract(puri, cs);
        
        CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]);
        Arrays.sort(links);         
        String dest1 = "http://www.example.com/absolute.mp4";
        String dest2 = "http://www.example.com/relative.mp4";
        
        assertTrue("incorrect number of links found", puri.getOutLinks().size()==2);
        assertEquals("expected uri in 'content' attribute of meta tag not found",dest1,
                links[0].getURI());        
        assertEquals("expected uri in 'content' attribute of meta tag not found",dest2,
                links[1].getURI());
    }

View Full Code Here

    
    /**
     * Test detection, respect of meta robots nofollow directive
     */
    public void testMetaRobots() throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        CharSequence cs = 
            "Blah Blah "+
            "<meta name='robots' content='index,nofollow'>"+
            "<a href='blahblah'>blah</a> "+
            "blahblah";
        getExtractor().extract(puri, cs);
        assertEquals("meta robots content not extracted","index,nofollow",
                puri.getData().get(ExtractorHTML.A_META_ROBOTS));
        CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]);
        assertTrue("link extracted despite meta robots",links.length==0);
    }

View Full Code Here

     * See http://webteam.archive.org/jira/browse/HER-1268
     * 
     * @throws URIException
     */
    public void testBadRelativeLinks() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com"));
        CharSequence cs = "<a href=\"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value\"/>"
                + "<a href=\"example.html?parameter=this:value\"/>";
        getExtractor().extract(curi, cs);


        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((CrawlURI) object)
                        .getURI()
                        .indexOf(
                                "/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value") >= 0;
            }
        }));


        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((CrawlURI) object).getURI().indexOf(
                        "/example.html?parameter=this:value") >= 0;
            }
        }));

View Full Code Here

     * same host
     * 
     * [HER-1524] speculativeFixup in ExtractorJS should maintain URL scheme
     */
    public void testSpeculativeLinkExtraction() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("https://www.example.com"));
        CharSequence cs = 
            "<script type=\"text/javascript\">_parameter=\"www.anotherexample.com\";"
                + "_anotherparameter=\"www.example.com/index.html\""
                + ";</script>";
        getExtractor().extract(curi, cs);


        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                System.err.println("comparing: "
                        + ((CrawlURI) object).getURI()
                        + " and https://www.anotherexample.com/");
                return ((CrawlURI) object).getURI().equals(
                        "http://www.anotherexample.com/");
            }
        }));
        assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() {
            public boolean evaluate(Object object) {
                return ((CrawlURI) object).getURI().equals(
                        "https://www.example.com/index.html");
            }
        }));

View Full Code Here

     *   (eg. 'text/javascript')
     *   
     * @throws URIException
     */
    public void testScriptTagWritingScriptType() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com/en/fiche/dossier/322/"));
        CharSequence cs = 
            "<script type=\"text/javascript\">"
            + "var gaJsHost = ((\"https:\" == document.location.protocol) "
            + "? \"https://ssl.\" : \"http://www.\");"
            + " document.write(unescape(\"%3Cscript src='\" + gaJsHost + "
            + "\"google-analytics.com/ga.js' "
            + "type='text/javascript'%3E%3C/script%3E\"));"
            + "</script>";
        getExtractor().extract(curi, cs);
        assertEquals(Collections.EMPTY_SET, curi.getOutLinks());
    }

View Full Code Here

        getExtractor().extract(curi, cs);
        assertEquals(Collections.EMPTY_SET, curi.getOutLinks());
    }


    public void testOutLinksWithBaseHref() throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory
                .getInstance("http://www.example.com/abc/index.html"));
        puri.setBaseURI(puri.getUURI());
        CharSequence cs = 
            "<base href=\"http://www.example.com/\">" + 
            "<a href=\"def/another1.html\">" + 
            "<a href=\"ghi/another2.html\">";
        getExtractor().extract(puri, cs);
        CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]);
        Arrays.sort(links); 
        String dest1 = "http://www.example.com/def/another1.html";
        String dest2 = "http://www.example.com/ghi/another2.html";
        // ensure outlink from base href
        assertEquals("outlink1 from base href",dest1,

View Full Code Here

    /**
     * HER-1728 
     * @throws URIException 
     */
    public void testFlashvarsParamValue() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));
        CharSequence cs = 
            "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + 
            "    <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n" + 
            "    <param name=\"menu\" value=\"false\">\n" + 
            "    <param name=\"bgcolor\" value=\"#000000\">\n" + 
            "    <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n" + 
            "    <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + 
            "</object> ";
        getExtractor().extract(curi, cs);
        String expected = "http://www.example.com/ParamZoomifySlideshowViewer.xml";
        assertTrue("outlinks should contain: "+expected,
                CollectionUtils.exists(curi.getOutLinks(),destinationsIsPredicate(expected)));
    }

View Full Code Here

    /**
     * HER-1728 
     * @throws URIException 
     */
    public void testFlashvarsEmbedAttribute() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));
        CharSequence cs = 
            "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + 
            "    <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n" + 
            "    <param name=\"menu\" value=\"false\">\n" + 
            "    <param name=\"bgcolor\" value=\"#000000\">\n" + 
            "    <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n" + 
            "    <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + 
            "</object> ";
        getExtractor().extract(curi, cs);
        String expected = "http://www.example.com/EmbedZoomifySlideshowViewer.xml";
        assertTrue("outlinks should contain: "+expected,
                CollectionUtils.exists(curi.getOutLinks(),destinationsIsPredicate(expected)));
    }

View Full Code Here

    /**
     * HER-1998 
     * @throws URIException 
     */
    public  void testConditionalComment1() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));
    
        CharSequence cs = 
            "<!--[if IE 6]><img src=\"foo.gif\"><![endif]-->" +
            "<!--[if IE 6]><script src=\"foo.js\"><![endif]-->";
 
        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();  
        getExtractor().setLoggerModule(ulm);
        CrawlMetadata metadata = new CrawlMetadata();
        metadata.afterPropertiesSet();
        getExtractor().setMetadata(metadata);
        getExtractor().afterPropertiesSet();
        
        getExtractor().extract(curi, cs);
        
        CrawlURI[] links = curi.getOutLinks().toArray(new CrawlURI[0]);
        Arrays.sort(links); 
        
        String dest1 = "http://www.example.com/foo.gif";
        String dest2 = "http://www.example.com/foo.js";

View Full Code Here

    @Override
    protected Collection<TestData> makeData(String content, String destURI)
    throws Exception {
        List<TestData> result = new ArrayList<TestData>();
        UURI src = UURIFactory.getInstance("http://www.archive.org/foo/dummy.js");
        CrawlURI euri = new CrawlURI(src, null, src, LinkContext.NAVLINK_MISC);
        Recorder recorder = createRecorder(content, "UTF-8");
        euri.setContentType("text/javascript");
        euri.setRecorder(recorder);
        euri.setContentSize(content.length());


        if (destURI != null) {
            UURI dest = UURIFactory.getInstance(destURI);
            CrawlURI link = euri.createCrawlURI(dest, LinkContext.JS_MISC, Hop.SPECULATIVE);
            result.add(new TestData(euri, link));
        } else {
            result.add(new TestData(euri, null));
        }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.archive.modules.CrawlURI

org.archive.crawler.datamodel.CandidateURITest

org.archive.crawler.datamodel.CrawlURITest

org.archive.crawler.deciderules.ClassKeyMatchesRegexDecideRule

org.archive.crawler.framework.ToeThread

org.archive.crawler.frontier.AbstractFrontier

org.archive.crawler.frontier.AMQPUrlReceiver$UrlConsumer

org.archive.crawler.frontier.BdbMultipleWorkQueues

org.archive.crawler.frontier.BdbMultipleWorkQueuesTest

org.archive.crawler.frontier.BdbWorkQueue

org.archive.crawler.frontier.FrontierJournal

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.