Package org.apache.nutch.crawl

Source Code of org.apache.nutch.crawl.TestCrawlDbFilter

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
import org.apache.nutch.util.NutchJob;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/**
* CrawlDbFiltering test which tests for correct, error free url
* normalization when the CrawlDB includes urls with <code>DB GONE</code> status
* and <code>CRAWLDB_PURGE_404</code> is set to true.
*
* @author lufeng
*/
public class TestCrawlDbFilter {
  Configuration conf;
  Path dbDir;
  Path newCrawlDb;
  final static Path testdir = new Path("build/test/crawldbfilter-test");
  FileSystem fs;

  @Before
  public void setUp() throws Exception {
    conf = CrawlDBTestUtil.createConfiguration();
    fs = FileSystem.get(conf);
    fs.delete(testdir, true);
  }

  @After
  public void tearDown() {
    delete(testdir);
  }

  private void delete(Path p) {
    try {
      fs.delete(p, true);
    } catch (IOException e) {
    }
  }

  /**
   * Test url404Purging
   *
   * @throws Exception
   */
  @Test
  public void testUrl404Purging() throws Exception {
    // create a CrawlDatum with DB GONE status
    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
    list.add(new URLCrawlDatum(new Text("http://www.example.com"), new CrawlDatum(
      CrawlDatum.STATUS_DB_GONE, 0, 0.0f)));
    list.add(new URLCrawlDatum(new Text("http://www.example1.com"), new CrawlDatum(
      CrawlDatum.STATUS_DB_FETCHED, 0, 0.0f)));
    list.add(new URLCrawlDatum(new Text("http://www.example2.com"), new CrawlDatum(
      CrawlDatum.STATUS_DB_UNFETCHED, 0, 0.0f)));
    dbDir = new Path(testdir, "crawldb");
    newCrawlDb = new Path(testdir,"newcrawldb");
    // create crawldb
    CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
    // set CRAWLDB_PURGE_404 to true
    conf.setBoolean(CrawlDb.CRAWLDB_PURGE_404,true);
    conf.setBoolean(CrawlDbFilter.URL_NORMALIZING,true);
    conf.setBoolean(CrawlDbFilter.URL_FILTERING,false);
    conf.setInt("urlnormalizer.loop.count", 2);
    JobConf job = new NutchJob(conf);
    job.setJobName("Test CrawlDbFilter");
    Path current = new Path(dbDir, "current");
    if (FileSystem.get(job).exists(current)) {
      FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);
    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    JobClient.runJob(job);

    Path fetchlist = new Path(new Path(newCrawlDb,
      "part-00000"), "data");

    ArrayList<URLCrawlDatum> l = readContents(fetchlist);

    // verify we got right amount of records
    Assert.assertEquals(2, l.size());
  }

  /**
   * Read contents of fetchlist.
   * @param fetchlist  path to Generated fetchlist
   * @return Generated {@link URLCrawlDatum} objects
   * @throws IOException
   */
  private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException {
    // verify results
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);

    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();

    READ: do {
      Text key = new Text();
      CrawlDatum value = new CrawlDatum();
      if (!reader.next(key, value)) {
        break READ;
      }
      l.add(new URLCrawlDatum(key, value));
    } while (true);

    reader.close();
    return l;
  }
}
TOP

Related Classes of org.apache.nutch.crawl.TestCrawlDbFilter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.