Package bixo.config

Examples of bixo.config.FetcherPolicy


        assertTrue( englishContent.equals(contentStr));
    }

    @Test
    public final void testMimeTypeFiltering() throws Exception {
        FetcherPolicy policy = new FetcherPolicy();
        Set<String> validMimeTypes = new HashSet<String>();
        validMimeTypes.add("text/html");
        policy.setValidMimeTypes(validMimeTypes);

        Server server = startServer(new MimeTypeResponseHandler("text/xml"), 8089);
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        String url = "http://localhost:8089/";
       
View Full Code Here


        }
    }

    @Test
    public final void testMimeTypeFilteringNoContentType() throws Exception {
        FetcherPolicy policy = new FetcherPolicy();
        Set<String> validMimeTypes = new HashSet<String>();
        validMimeTypes.add("text/html");
        validMimeTypes.add(""); // We want unknown (not reported) mime-types too.
        policy.setValidMimeTypes(validMimeTypes);

        Server server = startServer(new MimeTypeResponseHandler(null), 8089);
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        String url = "http://localhost:8089/";
       
View Full Code Here

        }
    }

    @Test
    public final void testMimeTypeFilteringWithCharset() throws Exception {
        FetcherPolicy policy = new FetcherPolicy();
        Set<String> validMimeTypes = new HashSet<String>();
        validMimeTypes.add("text/html");
        policy.setValidMimeTypes(validMimeTypes);

        Server server = startServer(new MimeTypeResponseHandler("text/html; charset=UTF-8"), 8089);
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        String url = "http://localhost:8089/";
       
View Full Code Here

        }
    }

    @Test
    public final void testHostAddress() throws Exception {
        FetcherPolicy policy = new FetcherPolicy();
        Server server = startServer(new ResourcesResponseHandler(), 8089);
        BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
        String url = "http://localhost:8089/simple-page.html";
        FetchedDatum result = fetcher.get(new ScoredUrlDatum(url));
        server.stop();
View Full Code Here

        assertEquals("127.0.0.1", hostAddress);
    }
   
    @Test
    public final void testAcceptEncoding() throws Exception {
        FetcherPolicy policy = new FetcherPolicy();
        SimpleHttpFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT);
       
        System.out.println(fetcher.getAcceptEncoding());
       
        final String acceptEncoding = "bogus";
View Full Code Here

            // Set up the UserAgent for the fetcher.
            UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

            // You also get to customize the FetcherPolicy
            FetcherPolicy defaultPolicy = new FetcherPolicy();
            defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
            defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
            defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);
           
            // It is a good idea to set up a crawl duration when running long crawls as you may
            // end up in situations where the fetch slows down due to a 'long tail' and by
            // specifying a crawl duration you know exactly when the crawl will end.
            int crawlDurationInMinutes = options.getCrawlDuration();
            boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
            long targetEndTime = hasEndTime ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE) :
                FetcherPolicy.NO_CRAWL_END_TIME;

            // By setting up a url filter we only deal with urls that we want to
            // instead of all the urls that we extract.
            BaseUrlFilter urlFilter = null;
            List<String> patterns = null;
            String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
            if (regexUrlFiltersFile != null) {
                patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
            } else {
                patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
                if (domain != null) {
                    String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                    patterns.add(domainPatterStr);
                } else {
                    String protocolPatterStr = "+(?i)^(http|https)://*";
                    patterns.add(protocolPatterStr);
                    LOGGER.warn("Defaulting to basic url regex filtering (just suffix and protocol");
                }
            }
            urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

            // OK, now we're ready to start looping, since we've got our current
            // settings
            for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

                // Adjust target end time, if appropriate.
                if (hasEndTime) {
                    int remainingLoops = (endLoop - curLoop) + 1;
                    long now = System.currentTimeMillis();
                    long perLoopTime = (targetEndTime - now) / remainingLoops;
                    defaultPolicy.setCrawlEndTime(now + perLoopTime);
                }

                BasePath curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, outputPath, curLoop);
                String curLoopDirName = curLoopDirPath.getName();
                setLoopLoggerFile(logsDir+curLoopDirName, curLoop);
View Full Code Here

       
        Pipe pipe = new Pipe("urlSource");

        UserAgent userAgent = new FirefoxUserAgent();
        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setMaxRequestsPerConnection(1);
        fetcherPolicy.setCrawlDelay(5 * 1000L);
        BaseFetcher fetcher = new SimpleHttpFetcher(2, fetcherPolicy, userAgent);
        BaseScoreGenerator scorer = new FixedScoreGenerator();
        FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

        FlowConnector flowConnector = platform.makeFlowConnector();
View Full Code Here

    private boolean _randomFetching;
    private Random _rand;

    public FakeHttpFetcher() {
        this(true, 1, new FetcherPolicy());
    }
View Full Code Here

    public FakeHttpFetcher() {
        this(true, 1, new FetcherPolicy());
    }

    public FakeHttpFetcher(boolean randomFetching, int maxThreads) {
        this(randomFetching, maxThreads, new FetcherPolicy());
    }
View Full Code Here

@SuppressWarnings("serial")
public class NullHttpFetcher extends BaseFetcher {

    public NullHttpFetcher() {
        super(1, new FetcherPolicy(), ConfigUtils.BIXO_TEST_AGENT);
    }
View Full Code Here

TOP

Related Classes of bixo.config.FetcherPolicy

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.