Examples of NutchDocument

org.apache.nutch.indexer.NutchDocument
A {@link NutchDocument} is the unit of indexing.

Examples of org.apache.nutch.indexer.NutchDocument

  public void testEmptyIndexStatic() throws Exception {


    assertNotNull(filter);
    filter.setConf(conf);


    NutchDocument doc = new NutchDocument();


    try {
      filter.filter(doc, parse, url, crawlDatum, inlinks);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }


    assertNotNull(doc);
    assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    conf.set("index.static",
        "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ");
    assertNotNull(filter);
    filter.setConf(conf);


    NutchDocument doc = new NutchDocument();


    try {
      filter.filter(doc, parse, url, crawlDatum, inlinks);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }


    assertNotNull(doc);
    assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
    assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
    assertEquals("test if doc has field1", "val1",
        ((String[]) doc.getField("field1").getValues().get(0))[0]);
    assertEquals("test if doc has field2", "val2",
        ((String[]) doc.getField("field2").getValues().get(0))[0]);
    assertEquals("test if doc has field4", "val4",
        ((String[]) doc.getField("field4").getValues().get(0))[0]);
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    
    try{
        filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
    }
    catch(Exception e){
        e.printStackTrace();
        fail(e.getMessage());
    }
    assertNotNull(doc);
    assertTrue(doc.getFieldNames().contains("type"));
    assertEquals(1, doc.getField("type").getValues().size());
    assertEquals("text/html", doc.getFieldValue("type"));    
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  
  @Test
  public void testFilter() throws IndexingException {
    
    // Prepare data
    NutchDocument nutchDocumentIn = new NutchDocument();
    Parse parse = mock(Parse.class);
    ParseData parseData = new ParseData();
    parseData.setParseMeta(metadata);
    Configuration configuration = mock(Configuration.class);
    
    // Mock data
    when(parse.getData()).thenReturn(parseData);
    when(configuration.get(anyString())).thenReturn("");
    when(configuration.getConfResourceAsReader(anyString())).thenReturn(new InputStreamReader(XPathIndexingFilterTest.class.getResourceAsStream("example-xpathfilter-conf.xml")));
    
    xmlHtmlIndexingFilter.setConf(configuration);
    NutchDocument nutchDocumentOut = xmlHtmlIndexingFilter.filter(nutchDocumentIn, parse, new Text("www.test.com"), null, null);
    
    int stringValueIndexCount = 0;
    int floatValueIndexCount = 0;
    int dateValueIndexCount = 0;
    
    for(String fieldName : nutchDocumentOut.getFieldNames()) {
      
      for(Object value : nutchDocumentOut.getField(fieldName).getValues()) {
        
        if(fieldName.equals("testString")) {
          int index = Arrays.binarySearch(testStringArray, value);
          stringValueIndexCount += index;

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    when(parse.getData()).thenReturn(parseData);
    when(configuration.get(anyString())).thenReturn("");
    when(configuration.getConfResourceAsReader(anyString())).thenReturn(new InputStreamReader(OmitIndexingFilterTest.class.getResourceAsStream("example-omit-indexfilter-conf.xml")));
    
    omitIndexingFilter.setConf(configuration);
    NutchDocument nutchDocumentOut = omitIndexingFilter.filter(nutchDocumentIn, parse, new Text("http://www.test.ba/"), null, null);
    
    assertNull("Document unsuccessfuly filtered!", nutchDocumentOut);
    
    nutchDocumentOut = omitIndexingFilter.filter(nutchDocumentIn, parse, new Text("http://www.test.com/"), null, null);
    assertSame("Document unsuccessfuly filtered!", nutchDocumentIn, nutchDocumentOut);

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  @Test
  @Ignore("It works but it can be tested only manually :(")
  public void testProcessData() throws IOException, IndexingException {


    // Prepare data
    NutchDocument nutchDocumentIn = new NutchDocument();
    ParseData parseData = new ParseData();
    CrawlDatum crawDatum = new CrawlDatum();
    Parse parse = mock(Parse.class);
    Configuration configuration = mock(Configuration.class);

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument


  @Test
  public void testProcessData() throws IOException {


    // Prepare data
    NutchDocument nutchDocumentIn = new NutchDocument();
    CrawlDatum crawDatum = new CrawlDatum();
    Parse parse = mock(Parse.class);
    Configuration configuration = mock(Configuration.class);
    
    ParseData parseData = new ParseData();

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

      LOG.info("contentType: " + contentType);
    }
    
    ParseResult parseResult = new ParseUtil(conf).parse(content);
    
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }


    for (String fname : doc.getFieldNames()) {
      List<Object> values = doc.getField(fname).getValues();
      if (values != null) {
        for (Object value : values) {
          String str = value.toString();
          int minText = Math.min(100, str.length());
          System.out.println(fname + " :\t" + str.substring(0, minText));

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

      LOG.info("contentType: " + contentType);
    }


    ParseResult parseResult = new ParseUtil(conf).parse(content);


    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }


    if (doc == null) {
      System.out.println("Document discarded by indexing filter");
      return 0;
    }
    
    for (String fname : doc.getFieldNames()) {
      List<Object> values = doc.getField(fname).getValues();
      if (values != null) {
        for (Object value : values) {
          String str = value.toString();
          int minText = Math.min(100, str.length());
          System.out.println(fname + " :\t" + str.substring(0, minText));

View Full Code Here

0 1 2 3 4

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.