Examples of org.apache.nutch.metadata.Metadata

org.apache.nutch.metadata.Metadata
A multi-valued metadata container.

      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);


      ParseResult parseResult = null;
      if (content != null) {
        Metadata metadata = content.getMetadata();
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
          scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {

View Full Code Here

        .get(content.getUrl());
    String text = parse.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim());


    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();


    Assert.assertEquals("test rft document", title);
    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));


  }

View Full Code Here

    Parse parse = parseResult.get(content.getUrl());
    // Trying to find the document's rel-tags
    Parser parser = new Parser(doc);
    Set<?> tags = parser.getRelTags();
    Iterator<?> iter = tags.iterator();
    Metadata metadata = parse.getData().getParseMeta();
    while (iter.hasNext())
      metadata.add(REL_TAG, (String) iter.next());


    return parseResult;
  }

View Full Code Here


  @Test
  public void testContentDispositionTitle() throws IndexingException {
    Configuration conf = NutchConfiguration.create();


    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);


    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], metadata)), new Text(

View Full Code Here

      Assert.assertEquals(expected[i], parts[i]);
    }
  }
  
  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());

View Full Code Here

   * or when it could not be determined, <code>false</code>.
   */
  public static boolean isTruncated(Content content) {
    byte[] contentBytes = content.getContent();
    if (contentBytes == null) return false;
    Metadata metadata = content.getMetadata();
    if (metadata == null) return false;
    
    String lengthStr = metadata.get(Response.CONTENT_LENGTH);
    if (lengthStr != null) lengthStr=lengthStr.trim();
    if (StringUtil.isEmpty(lengthStr)) {
      return false;
    }
    int inHeaderSize;

View Full Code Here

    in.read(buf);
    in.close();
    SWFParser parser = new SWFParser();
    ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
                                          buf, "application/x-shockwave-flash",
                                          new Metadata(),
                                          NutchConfiguration.create()));
    Parse p = parseResult.get("file:" + args[0]);
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");

View Full Code Here


  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {


    Parse parse = parseResult.get(content.getUrl());
    Metadata metadata = parse.getData().getParseMeta();


    // check in the metadata first : the tika-parser
    // might have stored the values there already
    for (String mdName : metadata.names()) {
      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
    }


    Metadata generalMetaTags = metaTags.getGeneralTags();
    for (String tagName : generalMetaTags.names()) {
      addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName));
    }


    Properties httpequiv = metaTags.getHttpEquivTags();
    for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
        .hasMoreElements();) {

View Full Code Here

  // constructor is called -> conf is null. Whoever uses this object must not forget to set the conf.
  /**
   * No-argument constructor with an empty body: it assigns nothing itself,
   * so any state must be supplied afterwards by the caller.
   */
  public ParseData() {}


  /**
   * Convenience constructor that delegates to the five-argument constructor,
   * passing a newly created {@link Metadata} as the fifth argument.
   *
   * @param status      forwarded unchanged to the five-argument constructor
   * @param title       forwarded unchanged to the five-argument constructor
   * @param outlinks    forwarded unchanged to the five-argument constructor
   * @param contentMeta forwarded unchanged to the five-argument constructor
   */
  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
                   Metadata contentMeta) {
    // NOTE(review): the fifth argument is presumably the parse metadata — confirm against the five-argument constructor.
    this(status, title, outlinks, contentMeta, new Metadata());
  }

View Full Code Here

      Outlink.skip(in);
    }
    
    if (version < 3) {
      int propertyCount = in.readInt();             // read metadata
      contentMeta = new Metadata();
      for (int i = 0; i < propertyCount; i++) {
        contentMeta.add(Text.readString(in), Text.readString(in));
      }
    } else {
      contentMeta = new Metadata();
      contentMeta.readFields(in);
    }
    if (version > 3) {
      parseMeta = new Metadata();
      parseMeta.readFields(in);
    }
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.metadata.Metadata

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilter

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilter

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilter

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlow

com.flaptor.hounder.crawler.Nutch9Fetcher$NutchSegment$SegmentIterator

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.fetcher.Fetcher$FetcherThread

org.apache.nutch.fetcher.Fetcher2$FetcherThread

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.