Package com.atlantbh.nutch.filter.xpath

Source Code of com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

package com.atlantbh.nutch.filter.xpath;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.mockito.Matchers.anyString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.BlockJUnit4ClassRunner;

import com.atlantbh.nutch.filter.xpath.XPathIndexingFilter;
import com.ibm.icu.util.GregorianCalendar;

@RunWith(BlockJUnit4ClassRunner.class)
public class XPathIndexingFilterTest {
 
  private XPathIndexingFilter xmlHtmlIndexingFilter;
  private Metadata metadata;
 
  private static final String[] testStringArray = {"test1", "test2", "test3"};
  private static final Float[] testFloatArray = {1.2f, 2.3f};
  private static final Date[] testDateArray = {new GregorianCalendar(2001, 3, 15).getTime(), new GregorianCalendar(2003, 8, 21).getTime()};
  private static final SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd.MM.yyyy");
 
  @Before
  public void init() {
    metadata = new Metadata();
 
    metadata.add("testString", testStringArray[0]);
    metadata.add("testString", testStringArray[1]);
    metadata.add("testString", testStringArray[2]);
    metadata.add("testFloat", String.valueOf(testFloatArray[0]));
    metadata.add("testFloat", String.valueOf(testFloatArray[1]));
    metadata.add("testDate", simpleDateFormat.format(testDateArray[0]));
    metadata.add("testDate", simpleDateFormat.format(testDateArray[1]));
   
    xmlHtmlIndexingFilter = new XPathIndexingFilter();
  }
 
  @Test
  public void testFilter() throws IndexingException {
   
    // Prepare data
    NutchDocument nutchDocumentIn = new NutchDocument();
    Parse parse = mock(Parse.class);
    ParseData parseData = new ParseData();
    parseData.setParseMeta(metadata);
    Configuration configuration = mock(Configuration.class);
   
    // Mock data
    when(parse.getData()).thenReturn(parseData);
    when(configuration.get(anyString())).thenReturn("");
    when(configuration.getConfResourceAsReader(anyString())).thenReturn(new InputStreamReader(XPathIndexingFilterTest.class.getResourceAsStream("example-xpathfilter-conf.xml")));
   
    xmlHtmlIndexingFilter.setConf(configuration);
    NutchDocument nutchDocumentOut = xmlHtmlIndexingFilter.filter(nutchDocumentIn, parse, new Text("www.test.com"), null, null);
   
    int stringValueIndexCount = 0;
    int floatValueIndexCount = 0;
    int dateValueIndexCount = 0;
   
    for(String fieldName : nutchDocumentOut.getFieldNames()) {
     
      for(Object value : nutchDocumentOut.getField(fieldName).getValues()) {
       
        if(fieldName.equals("testString")) {
          int index = Arrays.binarySearch(testStringArray, value);
          stringValueIndexCount += index;
         
          assertTrue(index >= 0);
          assertTrue(value instanceof String);
        } else if(fieldName.equals("testFloat"))  {
          int index = Arrays.binarySearch(testFloatArray, value);
          floatValueIndexCount += index;
         
          assertTrue(index >= 0);
          assertTrue(value instanceof Float);
        } else if(fieldName.equals("testDate"))  { 
          int index = Arrays.binarySearch(testDateArray, value)
          dateValueIndexCount += index;
         
          assertTrue(index >= 0);
          assertTrue(value instanceof Date);
        }
      }   
    }
   
    assertEquals("Not all String values parsed!", 3, stringValueIndexCount);
    assertEquals("Not all Flaot values parsed!", 1, floatValueIndexCount);
    assertEquals("Not all Flaot values parsed!", 1, dateValueIndexCount);
  }
}
TOP

Related Classes of com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.