Examples of NutchDocument


Examples of org.apache.nutch.indexer.NutchDocument

  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  WebPage page = new WebPage();
  page.putToInlinks(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site"));
  page.setTitle(new Utf8("This title exceeds maximum characters"));
  try {
    filter.filter(doc, "http://www.apache.org/", page);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }
  assertNotNull(doc);
  assertEquals("assert title field only has 10 characters", 10, doc.getFieldValue("title").length());
  }
View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

     Configuration conf = NutchConfiguration.create();
     conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
     MoreIndexingFilter filter = new MoreIndexingFilter();
     filter.setConf(conf);
     assertNotNull(filter);
     NutchDocument doc = new NutchDocument();
     try{
       filter.filter(doc, "http://nutch.apache.org/index.html", new WebPage());
     }
     catch(Exception e){
       e.printStackTrace();
       fail(e.getMessage());
     }
     assertNotNull(doc);
     assertTrue(doc.getFieldNames().contains("type"));
     assertEquals(1, doc.getFieldValues("type").size());
     assertEquals("text/html", doc.getFieldValue("type"));    
  }
View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    WebPage page = new WebPage();
    String url = "http://www.example.com/";
    page.setContent(ByteBuffer.wrap("text".getBytes()));
    page.setTitle(new Utf8("title"));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8(source));
    NutchDocument doc = filter.filter(new NutchDocument(), url, page);
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.