Package bixo.fetcher

Examples of bixo.fetcher.BaseFetcher


    public static void main(String[] args) {
        System.setProperty("bixo.root.level", "TRACE");
        // Uncomment this to see the wire log for HttpClient
        // System.setProperty("bixo.http.level", "DEBUG");
       
        BaseFetcher fetcher = RobotUtils.createFetcher(ConfigUtils.BIXO_TOOL_AGENT, 1);
       
        boolean interactive = args.length == 0;
        int index = 0;
       
        while (interactive || (index < args.length)) {
View Full Code Here


     *
     * @throws Exception
     */
    @Test
    public void testCircularRedirect() throws Exception {
        BaseFetcher fetcher = RobotUtils.createFetcher(ConfigUtils.BIXO_TEST_AGENT, 1);
        BaseRobotsParser parser = new SimpleRobotRulesParser();
       
        SimulationWebServerForTests webServer = new SimulationWebServerForTests();
        Server server = webServer.startServer(new CircularRedirectResponseHandler(), 8089);
       
View Full Code Here

        }
    }

    @Test
    public void testRedirectToHtml() throws Exception {
        BaseFetcher fetcher = RobotUtils.createFetcher(ConfigUtils.BIXO_TEST_AGENT, 1);
        BaseRobotsParser parser = new SimpleRobotRulesParser();
       
        SimulationWebServerForTests webServer = new SimulationWebServerForTests();
        Server server = webServer.startServer(new RedirectToTopResponseHandler(), 8089);
       
View Full Code Here

        // The "crawler@domain.com" email address shouldn't trigger a match against the
        // "crawler" user agent name in the robots.txt file.
        final String simpleRobotsTxt = "User-agent: crawler" + "\r\n"
        + "Disallow: /";

        BaseFetcher fetcher = Mockito.mock(BaseFetcher.class);
        FetchedDatum datum = Mockito.mock(FetchedDatum.class);
        Mockito.when(datum.getContentBytes()).thenReturn(simpleRobotsTxt.getBytes());
        Mockito.when(fetcher.get(Mockito.any(ScoredUrlDatum.class))).thenReturn(datum);
        UserAgent userAgent = new UserAgent("testAgent", "crawler@domain.com", "http://www.domain.com");
        Mockito.when(fetcher.getUserAgent()).thenReturn(userAgent);
       
        URL robotsUrl = new URL("http://www.domain.com/robots.txt");
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        BaseRobotRules rules = RobotUtils.getRobotRules(fetcher, parser, robotsUrl);
       
View Full Code Here

public class SimpleHttpFetcherIntegrationTest {
   
    @Test
    public final void testNoDomain() {
        BaseFetcher fetcher = new SimpleHttpFetcher(1, ConfigUtils.BIXO_IT_AGENT);
        String url = "http://www.bogusbixodomainxxxxx.com";
       
        try {
            fetcher.get(new ScoredUrlDatum(url));
            Assert.fail("Exception not thrown");
        } catch (Exception e) {
            Assert.assertTrue(e instanceof IOFetchException);
        }
    }
View Full Code Here

            UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
            Pipe importPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());
           
            BaseScoreGenerator scorer = new FixedScoreGenerator();
           
            BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);
            FetchPipe fetchPagePipe = new FetchPipe(importPipe, scorer, fetcher, NUM_REDUCERS);
           
            // Here's the pipe that will output UrlDatum tuples, by extracting URLs from the mod_mbox-generated page.
        Pipe mboxPagePipe = new Each(fetchPagePipe.getContentTailPipe(), new ParseModMboxPageFunction(), Fields.RESULTS);
View Full Code Here

            System.setProperty("bixo.root.level", "TRACE");
            // Uncomment this to see the wire log for HttpClient
            // System.setProperty("bixo.http.level", "DEBUG");

            BaseFetcher fetcher = UrlLengthener.makeFetcher(10, ConfigUtils.BIXO_TOOL_AGENT);

            Pipe pipe = new Pipe("urls");
            pipe = new Each(pipe, new UrlLengthener(fetcher));
            pipe = new Each(pipe, new Debug());
View Full Code Here

    protected void testHeadersInStatus(BasePlatform platform) throws Exception {
        Tap in = makeInputData(platform, "testHeadersInStatus", 1, 1);

        Pipe pipe = new Pipe("urlSource");
        BaseFetcher fetcher = new FakeHttpFetcher(false, 1);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseRobotsParser parser = new SimpleRobotRulesParser();
        BaseFetchJobPolicy fetchJobPolicy = new DefaultFetchJobPolicy();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, fetcher, parser, fetchJobPolicy, 1);
       
View Full Code Here

       
        Tap in = makeInputData(platform, "testFetchPipe", "localhost:" + port, numPages, new Payload());

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        BaseFetcher fetcher = new SimpleHttpFetcher(ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        String output = "build/test/FetchPipeTest/testFetchPipe";
        BasePath outputPath = platform.makePath(output);
        BasePath statusPath = platform.makePath(outputPath, "status");
View Full Code Here

        Pipe pipe = new Pipe("urlSource");
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetcherPolicy policy = new FetcherPolicy();
        policy.setRedirectMode(RedirectMode.FOLLOW_TEMP);
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);
       
        String output = "build/test/FetchPipeTest/testRedirectException";
        BasePath outputPath = platform.makePath(output);
        BasePath statusPath = platform.makePath(outputPath, "status");
View Full Code Here

TOP

Related Classes of bixo.fetcher.BaseFetcher

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.