Examples of Inlinks


Examples of org.apache.nutch.crawl.Inlinks

    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);

    IndexingFilters filters1 = new IndexingFilters(conf);
    NutchDocument fdoc1 = filters1.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],new Metadata())),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());

    // add another index filter
    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // set content metadata
    Metadata md = new Metadata();
    md.add("example","data");
    // set content metadata property defined in MetadataIndexer
    conf.set("index.content.md","example");
    // add MetadataIndxer filter
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters2 = new IndexingFilters(conf);
    NutchDocument fdoc2 = filters2.filter(new NutchDocument(),new ParseImpl("text",new ParseData(
      new ParseStatus(),"title",new Outlink[0],md)),new Text("http://www.example.com/"),
      new CrawlDatum(),new Inlinks());
    assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size());
  }
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

  public void close() {}

  public void reduce(WritableComparable key, Iterator values,
                     OutputCollector output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;
    while (values.hasNext()) {
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);

    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

  }

  public void reduce(Text key, Iterator<NutchWritable> values,
                     OutputCollector<Text, NutchDocument> output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;
    while (values.hasNext()) {
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

    assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
   
    try{
        filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
    }
    catch(Exception e){
        e.printStackTrace();
        fail(e.getMessage());
    }
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

    ParseResult parseResult = new ParseUtil(conf).parse(content);
   
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

  }

  public void reduce(Text key, Iterator<NutchWritable> values,
                     OutputCollector<Text, NutchDocument> output, Reporter reporter)
    throws IOException {
    Inlinks inlinks = null;
    CrawlDatum dbDatum = null;
    CrawlDatum fetchDatum = null;
    ParseData parseData = null;
    ParseText parseText = null;
    while (values.hasNext()) {
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

    ParseResult parseResult = new ParseUtil(conf).parse(content);

    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
View Full Code Here

Examples of org.apache.nutch.crawl.Inlinks

    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://test1.com/", "text1"));
    inlinks.add(new Inlink("http://test2.com/", "text2"));
    inlinks.add(new Inlink("http://test3.com/", "text2"));
    try {
      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
    } catch(Exception e){
      e.printStackTrace();
      fail(e.getMessage());
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.