Package org.apache.nutch.fetcher

Source Code of org.apache.nutch.fetcher.TestFetcher

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.fetcher;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDBTestUtil;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.mortbay.jetty.Server;

/**
* Basic fetcher test
* 1. generate seedlist
* 2. inject
* 3. generate
* 3. fetch
* 4. Verify contents
*
*/
public class TestFetcher {

  final static Path testdir=new Path("build/test/fetch-test");
  Configuration conf;
  FileSystem fs;
  Path crawldbPath;
  Path segmentsPath;
  Path urlPath;
  Server server;

  @Before
  public void setUp() throws Exception{
    conf=CrawlDBTestUtil.createConfiguration();
    fs=FileSystem.get(conf);
    fs.delete(testdir, true);
    urlPath=new Path(testdir,"urls");
    crawldbPath=new Path(testdir,"crawldb");
    segmentsPath=new Path(testdir,"segments");
    server=CrawlDBTestUtil.getServer(conf.getInt("content.server.port",50000), "build/test/data/fetch-test-site");
    server.start();
  }

  @After
  public void tearDown() throws Exception{
    server.stop();
    fs.delete(testdir, true);
  }
 
  @Test
  public void testFetch() throws IOException {
   
    //generate seedlist
    ArrayList<String> urls=new ArrayList<String>();
   
    addUrl(urls,"index.html");
    addUrl(urls,"pagea.html");
    addUrl(urls,"pageb.html");
    addUrl(urls,"dup_of_pagea.html");
    addUrl(urls,"nested_spider_trap.html");
    addUrl(urls,"exception.html");
   
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
   
    //inject
    Injector injector=new Injector(conf);
    injector.inject(crawldbPath, urlPath);

    //generate
    Generator g=new Generator(conf);
    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
        Long.MAX_VALUE, Long.MAX_VALUE, false, false);

    long time=System.currentTimeMillis();
    //fetch
    Fetcher fetcher=new Fetcher(conf);

    // Set fetcher.parse to true
    conf.setBoolean("fetcher.parse", true);

    fetcher.fetch(generatedSegment[0], 1);

    time=System.currentTimeMillis()-time;
   
    //verify politeness, time taken should be more than (num_of_pages +1)*delay
    int minimumTime=(int) ((urls.size()+1)*1000*conf.getFloat("fetcher.server.delay",5));
    Assert.assertTrue(time > minimumTime);
   
    //verify content
    Path content=new Path(new Path(generatedSegment[0], Content.DIR_NAME),"part-00000/data");
    @SuppressWarnings("resource")
    SequenceFile.Reader reader=new SequenceFile.Reader(fs, content, conf);
   
    ArrayList<String> handledurls=new ArrayList<String>();
   
    READ_CONTENT:
      do {
      Text key=new Text();
      Content value=new Content();
      if(!reader.next(key, value)) break READ_CONTENT;
      String contentString=new String(value.getContent());
      if(contentString.indexOf("Nutch fetcher test page")!=-1) {
        handledurls.add(key.toString());
      }
    } while(true);

    reader.close();

    Collections.sort(urls);
    Collections.sort(handledurls);

    //verify that enough pages were handled
    Assert.assertEquals(urls.size(), handledurls.size());

    //verify that correct pages were handled
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
   
    handledurls.clear();

    //verify parse data
    Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME),"part-00000/data");
    reader = new SequenceFile.Reader(fs, parseData, conf);
   
    READ_PARSE_DATA:
      do {
      Text key = new Text();
      ParseData value = new ParseData();
      if(!reader.next(key, value)) break READ_PARSE_DATA;
      // make sure they all contain "nutch.segment.name" and "nutch.content.digest"
      // keys in parse metadata
      Metadata contentMeta = value.getContentMeta();
      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
            && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
        handledurls.add(key.toString());
      }
    } while(true);
   
    Collections.sort(handledurls);

    Assert.assertEquals(urls.size(), handledurls.size());

    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
  }

  private void addUrl(ArrayList<String> urls, String page) {
    urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/" + page);
  }
 
  @Test
  public void testAgentNameCheck() {

    boolean failedNoAgentName = false;
    conf.set("http.agent.name", "");

    try {
      conf.setBoolean("fetcher.parse", false);
      Fetcher fetcher = new Fetcher(conf);
      fetcher.fetch(null, 1);
    } catch (IllegalArgumentException iae) {
      String message = iae.getMessage();
      failedNoAgentName = message.equals("Fetcher: No agents listed in "
          + "'http.agent.name' property.");
    } catch (Exception e) {
    }

    Assert.assertTrue(failedNoAgentName);
  }

}
TOP

Related Classes of org.apache.nutch.fetcher.TestFetcher

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.