Examples of NutchDocument

org.apache.nutch.indexer.NutchDocument
A {@link NutchDocument} is the unit of indexing.

Examples of org.apache.nutch.indexer.NutchDocument

    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://test1.com/", "text1"));
    inlinks.add(new Inlink("http://test2.com/", "text2"));
    inlinks.add(new Inlink("http://test3.com/", "text2"));
    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
    } catch(Exception e){
      e.printStackTrace();
      fail(e.getMessage());
    }
    assertNotNull(doc);
    assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
    assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    
    try{
        filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
    }
    catch(Exception e){
        e.printStackTrace();
        fail(e.getMessage());
    }
    assertNotNull(doc);
    assertTrue(doc.getFieldNames().contains("type"));
    assertEquals(1, doc.getField("type").getValues().size());
    assertEquals("text/html", doc.getFieldValue("type"));    
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);


    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());


    assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    this.action = action;
  }


  public void readFields(DataInput in) throws IOException {
    action = in.readByte();
    NutchDocument doc = new NutchDocument();
    doc.readFields(in);
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

     Configuration conf = NutchConfiguration.create();
     conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
     MoreIndexingFilter filter = new MoreIndexingFilter();
     filter.setConf(conf);
     assertNotNull(filter);
     NutchDocument doc = new NutchDocument();
     try{
       filter.filter(doc, "http://nutch.apache.org/index.html", new WebPage());
     }
     catch(Exception e){
       e.printStackTrace();
       fail(e.getMessage());
     }
     assertNotNull(doc);
     assertTrue(doc.getFieldNames().contains("type"));
     assertEquals(1, doc.getFieldValues("type").size());
     assertEquals("text/html", doc.getFieldValue("type"));     
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    WebPage page = new WebPage();
    String url = "http://www.example.com/";
    page.setContent(ByteBuffer.wrap("text".getBytes()));
    page.setTitle(new Utf8("title"));
    page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8(source));
    NutchDocument doc = filter.filter(new NutchDocument(), url, page);
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  public void testDeduplicateAnchor() throws Exception {
    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    WebPage page = new WebPage();
    page.putToInlinks(new Utf8("http://example1.com/"), new Utf8("cool site"));
    page.putToInlinks(new Utf8("http://example2.com/"), new Utf8("cool site"));
    page.putToInlinks(new Utf8("http://example3.com/"), new Utf8("fun site"));
    filter.filter(doc, "http://myurldoesnotmatter.com/", page);
    
    assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
    
    assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor").size());
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  public void testRelTagFields() throws Exception {
    Configuration conf = NutchConfiguration.create();
    RelTagIndexingFilter filter = new RelTagIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    WebPage page = new WebPage();
    byte[] bytes = new byte[10];
    ByteBuffer bbuf = ByteBuffer.wrap(bytes);
    page.putToMetadata(new Utf8(RelTagParser.REL_TAG), bbuf);
    try {
      filter.filter(doc, "http://nutch.apache.org/", page);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
    assertNotNull(doc);
    assertTrue("check for 'tag' field", doc.getFieldNames().contains("tag"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  public void testBasicFields() throws Exception {
  Configuration conf = NutchConfiguration.create();
  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  WebPage page = new WebPage();
  page.putToInlinks(new Utf8("http://nutch.apache.org/"), new Utf8("Welcome to Nutch"));
  page.setTitle(new Utf8("Welcome to Nutch"));
    page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org"));
    byte[] bytes = new byte[10];
    ByteBuffer bbuf = ByteBuffer.wrap(bytes);
    page.putToMetadata(Nutch.CACHING_FORBIDDEN_KEY_UTF8, bbuf);
    page.setFetchTime(System.currentTimeMillis());
  try {
    filter.filter(doc, "http://www.apache.org/", page);
  } catch(Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }
  assertNotNull(doc);
  assertTrue("check for host field ", doc.getFieldNames().contains("host"));
  assertTrue("check for url field", doc.getFieldNames().contains("url"));
  assertTrue("check for orig field", doc.getFieldNames().contains("orig"));
  assertTrue("check for content field", doc.getFieldNames().contains("content"));
  assertTrue("check for title field", doc.getFieldNames().contains("title"));
  assertTrue("check for cache field", doc.getFieldNames().contains("cache"));
  assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp"));
  }

View Full Code Here

0 1 2 3 4

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.