Package us.codecraft.webmagic.processor

Examples of us.codecraft.webmagic.processor.SimplePageProcessor


public class SpiderTest {

    /**
     * Configures a {@code Spider} over a {@code SimplePageProcessor}, attaches an anonymous
     * {@code Pipeline} that prints {@code 1} whenever it is invoked, and calls {@code thread(1)}.
     *
     * <p>NOTE(review): this listing is cut off right after the spider is configured, so the
     * start/stop steps suggested by the method name are not visible here.
     */
    @Ignore("long time")
    @Test
    public void testStartAndStop() throws InterruptedException {
        // The two SimplePageProcessor arguments are the start URL and the URL pattern.
        Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/", "http://www.oschina.net/*")).addPipeline(new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
                // Prints a fixed marker; resultItems and task are not inspected.
                System.out.println(1);
            }
        }).thread(1);
View Full Code Here


    public FileCache(String startUrl, String urlPattern) {
        this(startUrl, urlPattern, "/data/webmagic/temp/");
    }

    public FileCache(String startUrl, String urlPattern, String path) {
        this.pageProcessor = new SimplePageProcessor(startUrl, urlPattern);
        setPath(path);
        downloaderWhenFileMiss = new HttpClientDownloader();
    }
View Full Code Here

    @Test
    public void testGlobalSpider(){
//        PageProcessor pageProcessor = new MeicanProcessor();
//        Spider.me().pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler(pageProcessor.getSite(),"/data/temp/webmagic/cache/")).
//                processor(pageProcessor).run();
        SimplePageProcessor pageProcessor2 = new SimplePageProcessor("http://www.diaoyuweng.com/home.php?mod=space&uid=88304&do=thread&view=me&type=thread&from=space", "http://www.diaoyuweng.com/thread-*-1-1.html");
        System.out.println(pageProcessor2.getSite().getCharset());
        pageProcessor2.getSite().setSleepTime(500);
        Spider.create(pageProcessor2).pipeline(new FilePipeline()).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
                run();


    }
View Full Code Here

TOP

Related Classes of us.codecraft.webmagic.processor.SimplePageProcessor

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.