Package org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata


      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

      ParseResult parseResult = null;
      if (content != null) {
        Metadata metadata = content.getMetadata();
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
          scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
View Full Code Here


        .get(content.getUrl());
    String text = parse.getText();
    Assert.assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();

    Assert.assertEquals("test rft document", title);
    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));

  }
View Full Code Here

    Parse parse = parseResult.get(content.getUrl());
    // Trying to find the document's rel-tags
    Parser parser = new Parser(doc);
    Set<?> tags = parser.getRelTags();
    Iterator<?> iter = tags.iterator();
    Metadata metadata = parse.getData().getParseMeta();
    while (iter.hasNext())
      metadata.add(REL_TAG, (String) iter.next());

    return parseResult;
  }
View Full Code Here

  @Test
  public void testContentDispositionTitle() throws IndexingException {
    Configuration conf = NutchConfiguration.create();

    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);

    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
View Full Code Here

      Assert.assertEquals(expected[i], parts[i]);
    }
  }
 
  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
View Full Code Here

   * or when it could be determined, <code>false</code>.
   */
  public static boolean isTruncated(Content content) {
    byte[] contentBytes = content.getContent();
    if (contentBytes == null) return false;
    Metadata metadata = content.getMetadata();
    if (metadata == null) return false;
   
    String lengthStr = metadata.get(Response.CONTENT_LENGTH);
    if (lengthStr != null) lengthStr=lengthStr.trim();
    if (StringUtil.isEmpty(lengthStr)) {
      return false;
    }
    int inHeaderSize;
View Full Code Here

    in.read(buf);
    in.close();
    SWFParser parser = new SWFParser();
    ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
                                          buf, "application/x-shockwave-flash",
                                          new Metadata(),
                                          NutchConfiguration.create()));
    Parse p = parseResult.get("file:" + args[0]);
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
View Full Code Here

  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Parse parse = parseResult.get(content.getUrl());
    Metadata metadata = parse.getData().getParseMeta();

    // check in the metadata first : the tika-parser
    // might have stored the values there already
    for (String mdName : metadata.names()) {
      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
    }

    Metadata generalMetaTags = metaTags.getGeneralTags();
    for (String tagName : generalMetaTags.names()) {
      addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName));
    }

    Properties httpequiv = metaTags.getHttpEquivTags();
    for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
        .hasMoreElements();) {
View Full Code Here

  // constructor is called -> conf is null. The programmer which use this object may not forget to set the conf.
  public ParseData() {}

  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
                   Metadata contentMeta) {
    this(status, title, outlinks, contentMeta, new Metadata());
  }
View Full Code Here

      Outlink.skip(in);
    }
   
    if (version < 3) {
      int propertyCount = in.readInt();             // read metadata
      contentMeta = new Metadata();
      for (int i = 0; i < propertyCount; i++) {
        contentMeta.add(Text.readString(in), Text.readString(in));
      }
    } else {
      contentMeta = new Metadata();
      contentMeta.readFields(in);
    }
    if (version > 3) {
      parseMeta = new Metadata();
      parseMeta.readFields(in);
    }
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.metadata.Metadata

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.