Examples of NutchDocument

org.apache.nutch.indexer.NutchDocument
A {@link NutchDocument} is the unit of indexing.

Examples of org.apache.nutch.indexer.NutchDocument

  public void testEmptyIndexStatic() throws Exception {


    assertNotNull(filter);
    filter.setConf(conf);


    NutchDocument doc = new NutchDocument();


    try {
      filter.filter(doc, parse, url, crawlDatum, inlinks);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }


    assertNotNull(doc);
    assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

    conf.set("index.static",
        "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ");
    assertNotNull(filter);
    filter.setConf(conf);


    NutchDocument doc = new NutchDocument();


    try {
      filter.filter(doc, parse, url, crawlDatum, inlinks);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }


    assertNotNull(doc);
    assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
    assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
    assertTrue("test if doc has field1", doc.getField("field1").getValues()
        .contains("val1"));
    assertTrue("test if doc has field2", doc.getField("field2").getValues()
        .contains("val2"));
    assertTrue("test if doc has field4", doc.getField("field4").getValues()
        .contains("val4"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

      LOG.info("contentType: " + contentType);
    }


    ParseResult parseResult = new ParseUtil(conf).parse(content);
    
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }


    for (String fname : doc.getFieldNames()) {
      List<Object> values = doc.getField(fname).getValues();
      if (values != null) {
        for (Object value : values) {
          String str = value.toString();
          int minText = Math.min(100, str.length());
          System.out.println(fname + " :\t" + str.substring(0, minText));

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  public void testRelTagFields() throws Exception {
    Configuration conf = NutchConfiguration.create();
    RelTagIndexingFilter filter = new RelTagIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    WebPage page = new WebPage();
    byte[] bytes = new byte[10];
    ByteBuffer bbuf = ByteBuffer.wrap(bytes);
    page.putToMetadata(new Utf8(RelTagParser.REL_TAG), bbuf);
    try {
      filter.filter(doc, "http://nutch.apache.org/", page);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getMessage());
    }
    assertNotNull(doc);
    assertTrue("check for 'tag' field", doc.getFieldNames().contains("tag"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  public void testBasicFields() throws Exception {
  Configuration conf = NutchConfiguration.create();
  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  WebPage page = new WebPage();
  page.putToInlinks(new Utf8("http://nutch.apache.org/"), new Utf8("Welcome to Nutch"));
  page.setTitle(new Utf8("Welcome to Nutch"));
    page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org"));
    byte[] bytes = new byte[10];
    ByteBuffer bbuf = ByteBuffer.wrap(bytes);
    page.putToMetadata(Nutch.CACHING_FORBIDDEN_KEY_UTF8, bbuf);
    page.setFetchTime(System.currentTimeMillis());
  try {
    filter.filter(doc, "http://www.apache.org/", page);
  } catch(Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }
  assertNotNull(doc);
  assertTrue("check for host field ", doc.getFieldNames().contains("host"));
  assertTrue("check for site field", doc.getFieldNames().contains("site"));
  assertTrue("check for url field", doc.getFieldNames().contains("url"));
  assertTrue("check for orig field", doc.getFieldNames().contains("orig"));
  assertTrue("check for content field", doc.getFieldNames().contains("content"));
  assertTrue("check for title field", doc.getFieldNames().contains("title"));
  assertTrue("check for cache field", doc.getFieldNames().contains("cache"));
  assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp"));
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  WebPage page = new WebPage();
  page.putToInlinks(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site"));
  page.setTitle(new Utf8("This title exceeds maximum characters"));
  try {
    filter.filter(doc, "http://www.apache.org/", page);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }
  assertNotNull(doc);
  assertEquals("assert title field only has 10 characters", 10, doc.getFieldValue("title").length());
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

  public void testDeduplicateAnchor() throws Exception {
    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    WebPage page = new WebPage();
    page.putToInlinks(new Utf8("http://example1.com/"), new Utf8("cool site"));
    page.putToInlinks(new Utf8("http://example2.com/"), new Utf8("cool site"));
    page.putToInlinks(new Utf8("http://example3.com/"), new Utf8("fun site"));
    filter.filter(doc, "http://myurldoesnotmatter.com/", page);
    
    assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
    
    assertEquals("test dedup, we expect 2", 2, doc.getFieldValues("anchor").size());
  }

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument

      LOG.info("contentType: " + contentType);
    }


    ParseResult parseResult = new ParseUtil(conf).parse(content);


    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);


    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }


    if (doc == null) {
      System.out.println("Document discarded by indexing filter");
      return 0;
    }
    
    for (String fname : doc.getFieldNames()) {
      List<Object> values = doc.getField(fname).getValues();
      if (values != null) {
        for (Object value : values) {
          String str = value.toString();
          int minText = Math.min(100, str.length());
          System.out.println(fname + " :\t" + str.substring(0, minText));

View Full Code Here

Examples of org.apache.nutch.indexer.NutchDocument


    BasicIndexingFilter filter = new BasicIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);


    NutchDocument doc = new NutchDocument();


    String title = "The Foo Page";
    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);


    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);


    Inlinks inlinks = new Inlinks();


    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
    } catch(Exception e){
      e.printStackTrace();
      fail(e.getMessage());
    }
    assertNotNull(doc);
    assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
    assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
    assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
    assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html", 
      doc.getField("url").getValues().get(0));
    assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
    assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
  }

View Full Code Here

0 1 2 3 4

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.