Package net.vidageek.crawler.component

Source Code of net.vidageek.crawler.component.PageCrawlerExecutor

package net.vidageek.crawler.component;

import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadPoolExecutor;

import net.vidageek.crawler.Page;
import net.vidageek.crawler.PageVisitor;
import net.vidageek.crawler.Status;
import net.vidageek.crawler.Url;

import org.apache.log4j.Logger;

/**
* @author jonasabreu
*
*/
final public class PageCrawlerExecutor implements Runnable {

    private final Downloader downloader;
    private final LinkNormalizer normalizer;
    private final PageVisitor visitor;
    private final ExecutorCounter counter;

    private final Logger log = Logger.getLogger(PageCrawlerExecutor.class);
    private final Url urlToCrawl;
    private final ThreadPoolExecutor executor;

    public PageCrawlerExecutor(final Url urlToCrawl, final ThreadPoolExecutor executor, final ExecutorCounter counter,
            final Downloader downloader, final LinkNormalizer normalizer, final PageVisitor visitor) {
        this.urlToCrawl = urlToCrawl;
        this.executor = executor;
        this.counter = counter;
        this.downloader = downloader;
        this.normalizer = normalizer;
        this.visitor = visitor;

        counter.increase();
    }

    public void run() {
        try {

            log.info("crawling url: " + urlToCrawl.link());

            Page page = downloader.get(urlToCrawl.link());
            if (page.getStatusCode() != Status.OK) {
                visitor.onError(urlToCrawl, page.getStatusCode());
            } else {
                visitor.visit(page);
            }

            for (String l : page.getLinks()) {
                String link = normalizer.normalize(l);
                final Url url = new Url(link, urlToCrawl.depth() + 1);
                if (visitor.followUrl(url)) {
                    executor.execute(new PageCrawlerExecutor(url, executor, counter, downloader, normalizer, visitor));
                }
            }

        } finally {
            counter.decrease();
        }
    }

}
TOP

Related Classes of net.vidageek.crawler.component.PageCrawlerExecutor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.