Source Code of cn.edu.hfut.dmic.webcollector.crawler.CommonCrawler

/*
 * Copyright (C) 2014 hu
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */


package cn.edu.hfut.dmic.webcollector.crawler;




import cn.edu.hfut.dmic.webcollector.fetcher.Fetcher;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.Request;
import cn.edu.hfut.dmic.webcollector.parser.HtmlParser;
import cn.edu.hfut.dmic.webcollector.parser.Parser;
import cn.edu.hfut.dmic.webcollector.util.CommonConnectionConfig;
import cn.edu.hfut.dmic.webcollector.util.Config;
import cn.edu.hfut.dmic.webcollector.util.ConnectionConfig;
import java.net.Proxy;
import java.net.URL;


/**
 * 一种常用的广度遍历爬虫
 * @author hu
 */
public abstract class CommonCrawler extends Crawler{
    private String cookie = null;
    private String useragent = "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:26.0) Gecko/20100101 Firefox/26.0";


    private boolean isContentStored = true;
    private Proxy proxy = null;
    private ConnectionConfig conconfig = null;
    
    /**
     * 根据url生成Request(http请求)的方法，可以通过Override这个方法来自定义Request
     * @param url
     * @return 实现Request接口的对象
     * @throws Exception
     */
    @Override
    public Request createRequest(String url) throws Exception {
        HttpRequest request = new HttpRequest();
        URL _URL = new URL(url);
        request.setURL(_URL);
        request.setProxy(proxy);
        request.setConnectionConfig(conconfig);
        return request;
    }


    /**
     * 根据网页的url和contentType，来创建Parser(解析器)，可以通过Override这个方法来自定义Parser
     * @param url
     * @param contentType
     * @return 实现Parser接口的对象
     * @throws Exception
     */
    @Override
    public Parser createParser(String url, String contentType) throws Exception {
        if (contentType == null) {
            return null;
        }
        if (contentType.contains("text/html")) {
            return new HtmlParser(Config.topN);
        }
        return null;
    }
    
    
    @Override
    public Fetcher createFetcher() {
        Fetcher fetcher = new Fetcher();
        fetcher.setNeedUpdateDb(true);
        fetcher.setIsContentStored(isContentStored);
        conconfig = new CommonConnectionConfig(useragent, cookie);
        fetcher.setThreads(getThreads());
        return fetcher;
    }
    
    /**
     * 返回User-Agent
     * @return User-Agent
     */
    public String getUseragent() {
        return useragent;
    }


    /**
     * 设置User-Agent
     * @param useragent
     */
    public void setUseragent(String useragent) {
        this.useragent = useragent;
    }
    
    /**
     * 返回http连接配置对象
     *
     * @return http连接配置对象
     */
    public ConnectionConfig getConconfig() {
        return conconfig;
    }


    /**
     * 设置http连接配置对象
     *
     * @param conconfig http连接配置对象
     */
    public void setConconfig(ConnectionConfig conconfig) {
        this.conconfig = conconfig;
    }
    
    /**
     * 返回是否存储网页/文件的内容
     * @return 是否存储网页/文件的内容
     */
    public boolean getIsContentStored() {
        return isContentStored;
    }


    /**
     * 设置是否存储网页／文件的内容
     * @param isContentStored 是否存储网页/文件的内容
     */
    public void setIsContentStored(boolean isContentStored) {
        this.isContentStored = isContentStored;
    }
    
     /**
     * 返回代理
     * @return 代理
     */
    public Proxy getProxy() {
        return proxy;
    }


    /**
     * 设置代理
     * @param proxy 代理
     */
    public void setProxy(Proxy proxy) {
        this.proxy = proxy;
    }
    
    /**
     * 返回Cookie
     * @return Cookie
     */
    public String getCookie() {
        return cookie;
    }


    /**
     * 设置http请求的cookie
     * @param cookie Cookie
     */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }
}
Source Code of cn.edu.hfut.dmic.webcollector.crawler.CommonCrawler

Related Classes of cn.edu.hfut.dmic.webcollector.crawler.CommonCrawler