// Source Code of org.apache.nutch.fetcher.TestFetcher

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.fetcher;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDBTestUtil;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.mortbay.jetty.Server;


/**
 * Basic fetcher test
 * 1. generate seedlist
 * 2. inject
 * 3. generate
 * 4. fetch
 * 5. verify contents
 *
 */
public class TestFetcher {


  final static Path testdir=new Path("build/test/fetch-test");
  Configuration conf;
  FileSystem fs;
  Path crawldbPath;
  Path segmentsPath;
  Path urlPath;
  Server server;


  @Before
  public void setUp() throws Exception{
    conf=CrawlDBTestUtil.createConfiguration();
    fs=FileSystem.get(conf);
    fs.delete(testdir, true);
    urlPath=new Path(testdir,"urls");
    crawldbPath=new Path(testdir,"crawldb");
    segmentsPath=new Path(testdir,"segments");
    server=CrawlDBTestUtil.getServer(conf.getInt("content.server.port",50000), "build/test/data/fetch-test-site");
    server.start();
  }


  @After
  public void tearDown() throws Exception{
    server.stop();
    fs.delete(testdir, true);
  }
  
  @Test
  public void testFetch() throws IOException {
    
    //generate seedlist
    ArrayList<String> urls=new ArrayList<String>();
    
    addUrl(urls,"index.html");
    addUrl(urls,"pagea.html");
    addUrl(urls,"pageb.html");
    addUrl(urls,"dup_of_pagea.html");
    addUrl(urls,"nested_spider_trap.html");
    addUrl(urls,"exception.html");
    
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
    
    //inject
    Injector injector=new Injector(conf);
    injector.inject(crawldbPath, urlPath);


    //generate
    Generator g=new Generator(conf);
    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1,
        Long.MAX_VALUE, Long.MAX_VALUE, false, false);


    long time=System.currentTimeMillis();
    //fetch
    Fetcher fetcher=new Fetcher(conf);


    // Set fetcher.parse to true
    conf.setBoolean("fetcher.parse", true);


    fetcher.fetch(generatedSegment[0], 1);


    time=System.currentTimeMillis()-time;
    
    //verify politeness, time taken should be more than (num_of_pages +1)*delay
    int minimumTime=(int) ((urls.size()+1)*1000*conf.getFloat("fetcher.server.delay",5));
    Assert.assertTrue(time > minimumTime);
    
    //verify content
    Path content=new Path(new Path(generatedSegment[0], Content.DIR_NAME),"part-00000/data");
    @SuppressWarnings("resource")
    SequenceFile.Reader reader=new SequenceFile.Reader(fs, content, conf);
    
    ArrayList<String> handledurls=new ArrayList<String>();
    
    READ_CONTENT:
      do {
      Text key=new Text();
      Content value=new Content();
      if(!reader.next(key, value)) break READ_CONTENT;
      String contentString=new String(value.getContent());
      if(contentString.indexOf("Nutch fetcher test page")!=-1) { 
        handledurls.add(key.toString());
      }
    } while(true);


    reader.close();


    Collections.sort(urls);
    Collections.sort(handledurls);


    //verify that enough pages were handled
    Assert.assertEquals(urls.size(), handledurls.size());


    //verify that correct pages were handled
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
    
    handledurls.clear();


    //verify parse data
    Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME),"part-00000/data");
    reader = new SequenceFile.Reader(fs, parseData, conf);
    
    READ_PARSE_DATA:
      do {
      Text key = new Text();
      ParseData value = new ParseData();
      if(!reader.next(key, value)) break READ_PARSE_DATA;
      // make sure they all contain "nutch.segment.name" and "nutch.content.digest" 
      // keys in parse metadata
      Metadata contentMeta = value.getContentMeta();
      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null 
            && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
        handledurls.add(key.toString());
      }
    } while(true);
    
    Collections.sort(handledurls);


    Assert.assertEquals(urls.size(), handledurls.size());


    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
  }


  private void addUrl(ArrayList<String> urls, String page) {
    urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/" + page);
  }
  
  @Test
  public void testAgentNameCheck() {


    boolean failedNoAgentName = false;
    conf.set("http.agent.name", "");


    try {
      conf.setBoolean("fetcher.parse", false);
      Fetcher fetcher = new Fetcher(conf);
      fetcher.fetch(null, 1);
    } catch (IllegalArgumentException iae) {
      String message = iae.getMessage();
      failedNoAgentName = message.equals("Fetcher: No agents listed in "
          + "'http.agent.name' property.");
    } catch (Exception e) {
    }


    Assert.assertTrue(failedNoAgentName);
  }


}
// Source Code of org.apache.nutch.fetcher.TestFetcher

// Related Classes of org.apache.nutch.fetcher.TestFetcher