Examples of CrawlerTask


Examples of com.googlecode.flaxcrawler.model.CrawlerTask

                if (!initialized) {
                    init();
                    initialized = true;

                    for (URL url : seeds) {
                        CrawlerTask task = new CrawlerTask(url.toString(), 0);
                        taskQueue.enqueue(task);
                    }

                    // Removing seeds
                    seeds = null;
View Full Code Here

Examples of com.googlecode.flaxcrawler.model.CrawlerTask

     * @param links
     * @return
     */
    private void scheduleTasks(Crawler crawler, CrawlerTask parentTask, List<URL> links) {
        for (URL url : links) {
            CrawlerTask task = new CrawlerTask(url.toString(), parentTask.getLevel() + 1);
            scheduleTask(crawler, task, parentTask);
        }
    }
View Full Code Here

Examples of com.googlecode.flaxcrawler.model.CrawlerTask

            if (!(task instanceof CrawlerTask)) {
                log.warn("Task is of wrong class, omitting it");
                return;
            }

            CrawlerTask crawlerTask = (CrawlerTask) task;
            log.debug("Processing task " + crawlerTask.getUrl());

            // Getting domain statistics
            DomainStatistics statistics = statisticsService.getDomainStatistics(crawlerTask.getDomain());

            if (!checkMaxHttpErrors(statistics)) {
                log.warn(crawlerTask.getDomain() + " has exceeded http errors limit");
                return;
            }

            if (!checkPolitenessPeriod(statistics)) {
                log.debug("Waiting for politeness period for domain " + statistics.getDomainName());
                deferCrawlerTask(crawlerTask);
                return;
            }

            try {
                Page page = crawler.crawl(crawlerTask);
                processPage(page, crawlerTask);
            } finally {
                log.debug("Stopping processing task " + crawlerTask.getUrl());
            }
        }
View Full Code Here

Examples of com.googlecode.flaxcrawler.model.CrawlerTask

                    scheduleTasks(crawler, crawlerTask, page.getLinks());
                }
                log.debug(page.getLinks() == null ? 0 : page.getLinks().size() + " tasks were passed to the scheduler");
            } else if (page.getResponseCode() >= 300 && page.getResponseCode() < 400) {
                log.debug("Processing redirect from " + crawlerTask.getUrl() + " to " + page.getRedirectUrl());
                CrawlerTask task = new CrawlerTask(page.getRedirectUrl().toString(), crawlerTask.getLevel());
                // Passing custom data further
                task.setCustomData(crawlerTask.getCustomData());
                scheduleTask(crawler, task, crawlerTask);
            } else {
                log.debug(crawlerTask.getUrl() + " was processed with errors, response code: " + page.getResponseCode());
            }
        }
View Full Code Here

Examples of com.googlecode.flaxcrawler.model.CrawlerTask

    /**
     * Reads urls from queue and adds them to the TaskQueue (if url was not crawled yet)
     */
    private void doWorkLoop() {
        while (true) {
            CrawlerTask task = null;

            try {
                synchronized (syncRoot) {
                    task = schedulerQueue.poll();
                }
                if (task != null) {
                    if (!statisticsService.isCrawled(task.getUrl())) {
                        taskQueue.enqueue(task);
                        statisticsService.afterScheduling(task);
                        log.debug("Scheduled crawling of the " + task.getUrl());
                    } else {
                        log.debug("Url " + task.getUrl() + " was already crawled");
                    }
                }

                // Yielding context to another thread
                Thread.sleep(1);
            } catch (Exception ex) {
                log.error("Error processing task " + task == null ? "NOTASK" : task.getUrl() + " from the scheduler queue", ex);
            }
        }
    }
View Full Code Here

Examples of com.googlecode.flaxcrawler.model.CrawlerTask

    public void testDefaultCrawler() throws Exception {
        DefaultCrawler crawler = new DefaultCrawler();
        crawler.setDownloaderController(new DefaultDownloaderController());
        crawler.setParserController(new DefaultParserController());

        CrawlerTask crawlerTask = new CrawlerTask("http://www.wikipedia.org/", 0);
        Page page = crawler.crawl(crawlerTask);

        assertNotNull(page);
        assertTrue(page.getLinks().size() > 0);
    }
View Full Code Here

Examples of com.googlecode.flaxcrawler.model.CrawlerTask

        try {
            statisticsService = new DefaultStatisticsService("stats");


            CrawlerTask task = new CrawlerTask("http://google.com/", 1);
            statisticsService.afterScheduling(task);

            assertNotNull(statisticsService.getDomainStatistics("google.com"));
            assertTrue(statisticsService.isCrawled("http://google.com/"));
            assertTrue(statisticsService.getDomainStatistics("google.com").getScheduled() > 0);
View Full Code Here

Examples of explore.java.concurrency.producer_consumer.CrawlerTask

    }

    private List<CrawlerTask> callables() {
        ArrayList<CrawlerTask> callables = Lists.newArrayList();
        for (int index = 0; index < NUMBER_OF_CALLS; index++) {
            callables.add(new CrawlerTask(String.valueOf(index)));
        }
        callables.add(new StoppingTask(null));
        return callables;
    }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.