Source Code of org.apache.nutch.crawl.TestCrawlDbFilter

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 
package org.apache.nutch.crawl;


import java.io.IOException;
import java.util.ArrayList;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;


/**
 * CrawlDbFiltering test which tests for correct, error free url 
 * normalization when the CrawlDB includes urls with <code>DB GONE</code> status 
 * and <code>CRAWLDB_PURGE_404</code> is set to true.
 * 
 * @author lufeng
 */
public class TestCrawlDbFilter {
  Configuration conf;
  Path dbDir;
  Path newCrawlDb;
  final static Path testdir = new Path("build/test/crawldbfilter-test");
  FileSystem fs;


  @Before
  public void setUp() throws Exception {
    conf = CrawlDBTestUtil.createConfiguration();
    fs = FileSystem.get(conf);
    fs.delete(testdir, true);
  }


  @After
  public void tearDown() {
    delete(testdir);
  }


  private void delete(Path p) {
    try {
      fs.delete(p, true);
    } catch (IOException e) {
    }
  }


  /**
   * Test url404Purging
   *
   * @throws Exception
   */
  @Test
  public void testUrl404Purging() throws Exception {
    // create a CrawlDatum with DB GONE status
    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
    list.add(new URLCrawlDatum(new Text("http://www.example.com"), new CrawlDatum(
      CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
    list.add(new URLCrawlDatum(new Text("http://www.example1.com"), new CrawlDatum(
      CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
    list.add(new URLCrawlDatum(new Text("http://www.example2.com"), new CrawlDatum(
      CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
    dbDir = new Path(testdir, "crawldb");
    newCrawlDb = new Path(testdir,"newcrawldb");
    // create crawldb
    CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
    // set CRAWLDB_PURGE_404 to true
    conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404,true);
    conf.setBoolean(CrawlDbFilter.URL_NORMALIZING,true);
    conf.setBoolean(CrawlDbFilter.URL_FILTERING,false);
    conf.setInt("urlnormalizer.loop.count", 2);
    JobConf job = new NutchJob(conf);
    job.setJobName("Test CrawlDbFilter");
    Path current = new Path(dbDir, "current");
    if (FileSystem.get(job).exists(current)) {
      FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);
    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    JobClient.runJob(job);


    Path fetchlist = new Path(new Path(newCrawlDb,
      "part-00000"), "data");


    ArrayList<URLCrawlDatum> l = readContents(fetchlist);


    // verify we got right amount of records
    Assert.assertEquals(2, l.size());
  }


  /**
   * Read contents of fetchlist.
   * @param fetchlist  path to Generated fetchlist
   * @return Generated {@link URLCrawlDatum} objects
   * @throws IOException
   */
  private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException {
    // verify results
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);


    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();


    READ: do {
      Text key = new Text();
      CrawlDatum value = new CrawlDatum();
      if (!reader.next(key, value)) {
        break READ;
      }
      l.add(new URLCrawlDatum(key, value));
    } while (true);


    reader.close();
    return l;
  }
}
Source Code of org.apache.nutch.crawl.TestCrawlDbFilter

Related Classes of org.apache.nutch.crawl.TestCrawlDbFilter