Package bixo.urls

Examples of bixo.urls.BaseUrlFilter


public class SimpleUrlFilterTest {

    @Test
    public void testValidUrls() {
        BaseUrlFilter urlFilter = new SimpleUrlFilter();
        Assert.assertFalse(urlFilter.isRemove(new UrlDatum("http://domain.com")));
    }
View Full Code Here


        Assert.assertFalse(urlFilter.isRemove(new UrlDatum("http://domain.com")));
    }
   
    @Test
    public void testInvalidUrls() {
        BaseUrlFilter urlFilter = new SimpleUrlFilter();
        Assert.assertTrue("No protocol", urlFilter.isRemove(new UrlDatum("www.domain.com")));
        Assert.assertTrue("Unknown protocol", urlFilter.isRemove(new UrlDatum("mdata://www.domain.com")));
        Assert.assertTrue("Invalid port", urlFilter.isRemove(new UrlDatum("http://www.domain.com:a")));
    }
View Full Code Here

            long targetEndTime = hasEndTime ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE) :
                FetcherPolicy.NO_CRAWL_END_TIME;

            // Setting up a URL filter lets us process only the URLs we want,
            // instead of every URL that we extract.
            BaseUrlFilter urlFilter = null;
            List<String> patterns = null;
            String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
            if (regexUrlFiltersFile != null) {
                patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
            } else {
View Full Code Here

        curLoopDirPath = CrawlDirUtils.makeLoopDir(platform, baseDirPath, 1);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(1);
        defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
        BaseUrlFilter urlFilter = new BaseUrlFilter() {

            /**
             * Pass-through filter: never reports a datum as one to remove.
             *
             * @param datum the URL datum being considered; it is not inspected
             * @return always {@code false}
             */
            @Override
            public boolean isRemove(UrlDatum datum) {
                return false;
            }
View Full Code Here

TOP

Related Classes of bixo.urls.BaseUrlFilter

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.