Package org.apache.nutch.crawl

Source Code of org.apache.nutch.crawl.TestUrlWithScore

/*******************************************************************************
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.apache.nutch.crawl;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import junit.framework.TestCase;

import org.apache.hadoop.io.RawComparator;
import org.apache.nutch.crawl.UrlWithScore;
import org.apache.nutch.crawl.UrlWithScore.UrlOnlyPartitioner;
import org.apache.nutch.crawl.UrlWithScore.UrlScoreComparator;
import org.apache.nutch.crawl.UrlWithScore.UrlScoreComparator.UrlOnlyComparator;

/**
* Tests {@link UrlWithScore} with serialization, partitioning and sorting.
*/
public class TestUrlWithScore extends TestCase {

  public void testSerialization() throws IOException {
    // create a key and test basic functionality
    UrlWithScore keyOut = new UrlWithScore("http://example.org/", 1f);
    assertEquals("http://example.org/", keyOut.getUrl().toString());
    assertEquals(1f, keyOut.getScore().get(), 0.001);
   
    // write to out
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bos);
    keyOut.write(out);

    // read from in
    UrlWithScore keyIn = new UrlWithScore();
    ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray());
    DataInputStream in = new DataInputStream(bis);
    keyIn.readFields(in);
    assertEquals(keyOut.getUrl().toString(), keyIn.getUrl().toString());
    assertEquals(keyOut.getScore().get(), keyIn.getScore().get(), 0.001);

    in.close();
    out.close();
  }
 
  public void testPartitioner() throws IOException {
    UrlOnlyPartitioner part = new UrlOnlyPartitioner();
   
    UrlWithScore k1 = new UrlWithScore("http://example.org/1", 1f);
    UrlWithScore k2 = new UrlWithScore("http://example.org/1", 2f);
    UrlWithScore k3 = new UrlWithScore("http://example.org/2", 1f);
    UrlWithScore k4 = new UrlWithScore("http://example.org/2", 2f);
    UrlWithScore k5 = new UrlWithScore("http://example.org/2", 3f);
   
    int numReduces = 7;
   
    // keys 1 and 2 should be partitioned together
    int partForKey1 = part.getPartition(k1, null, numReduces);
    assertEquals(partForKey1, part.getPartition(k2, null, numReduces));
    assertEquals(partForKey1, part.getPartition(k2, null, numReduces));
   
    // keys 3, 4 and 5 should be partitioned together
    int partForKey3 = part.getPartition(k3, null, numReduces);
    assertEquals(partForKey3, part.getPartition(k4, null, numReduces));
    assertEquals(partForKey3, part.getPartition(k5, null, numReduces));
  }
 
  public void testUrlOnlySorting() throws IOException {
    UrlOnlyComparator comp = new UrlOnlyComparator();
   
    UrlWithScore k1 = new UrlWithScore("http://example.org/1", 1f);
    UrlWithScore k2 = new UrlWithScore("http://example.org/1", 2f);
    UrlWithScore k3 = new UrlWithScore("http://example.org/2", 1f);
    UrlWithScore k4 = new UrlWithScore("http://example.org/2", 2f);
    UrlWithScore k5 = new UrlWithScore("http://example.org/2", 3f);
   
    // k1 should be equal to k2
    assertEquals(0, compareBothRegularAndRaw(comp, k1, k2));
    // test symmetry
    assertEquals(0, compareBothRegularAndRaw(comp, k2, k1));
   
    // k1 is before k3, k4 and k5
    assertEquals(-1, compareBothRegularAndRaw(comp, k1, k3));
    assertEquals(-1, compareBothRegularAndRaw(comp, k1, k4));
    assertEquals(-1, compareBothRegularAndRaw(comp, k1, k5));
    // test symmetry
    assertEquals(1, compareBothRegularAndRaw(comp, k3, k1));
    assertEquals(1, compareBothRegularAndRaw(comp, k4, k1));
    assertEquals(1, compareBothRegularAndRaw(comp, k5, k1));
  }
 
  public void testUrlScoreSorting() throws IOException {
    UrlScoreComparator comp = new UrlScoreComparator();
   
    UrlWithScore k1 = new UrlWithScore("http://example.org/1", 1f);
    UrlWithScore k2 = new UrlWithScore("http://example.org/1", 2f);
    UrlWithScore k3 = new UrlWithScore("http://example.org/2", 1f);
    UrlWithScore k4 = new UrlWithScore("http://example.org/2", 2f);
    UrlWithScore k5 = new UrlWithScore("http://example.org/2", 3f);
   
    // k1 is after k2, because score is lower
    assertEquals(1, comp.compare(k1, k2));
    // test symmetry
    assertEquals(-1, comp.compare(k2, k1));
   
    // k1 is before k3, k4 and k5, because url is lower
    assertEquals(-1, compareBothRegularAndRaw(comp, k1, k3));
    assertEquals(-1, compareBothRegularAndRaw(comp, k1, k4));
    assertEquals(-1, compareBothRegularAndRaw(comp, k1, k5));
    // test symmetry
    assertEquals(1, compareBothRegularAndRaw(comp, k3, k1));
    assertEquals(1, compareBothRegularAndRaw(comp, k4, k1));
    assertEquals(1, compareBothRegularAndRaw(comp, k5, k1));
   
    // k3 after k4 and k4 after k5 and therefore k3 after k5 (transitivity)
    assertEquals(1, compareBothRegularAndRaw(comp, k3, k4));
    assertEquals(1, compareBothRegularAndRaw(comp, k4, k5));
    assertEquals(1, compareBothRegularAndRaw(comp, k3, k5));
    // test symmetry
    assertEquals(-1, compareBothRegularAndRaw(comp, k4, k3));
    assertEquals(-1, compareBothRegularAndRaw(comp, k5, k4));
    assertEquals(-1, compareBothRegularAndRaw(comp, k5, k3));
  }

  /**
   * Compares two keys using both regular and raw comparing. Checks if the two
   * values are equal. Returns the compare value, but only when equal.
   *
   * @param comp
   * @param k1
   * @param k2
   * @return The compare result. (When k1 != k2, assert failure kicks in)
   * @throws IOException
   */
  private Object compareBothRegularAndRaw(RawComparator<UrlWithScore> comp,
      UrlWithScore k1, UrlWithScore k2) throws IOException {
    int regular = comp.compare(k1, k2);
   
    byte[] bytes1 = extractBytes(k1);
    byte[] bytes2 = extractBytes(k2);
   
    int raw = comp.compare(bytes1, 0, bytes1.length, bytes2, 0, bytes2.length);
   
    assertEquals("Regular compare should equal raw compare", regular, raw);
   
    return regular;
  }

  /**
   * Return the bytes for a key.
   *
   * @param k
   * @return
   * @throws IOException
   */
  private byte[] extractBytes(UrlWithScore k) throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bos);
    k.write(out);
    byte[] bytes = bos.toByteArray();
    out.close();
    return bytes;
  }
 
}
TOP

Related Classes of org.apache.nutch.crawl.TestUrlWithScore

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.