Package org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata


    in.readFully(bytes);
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    Parse parse = parser.getParse(
            new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
    System.out.println("data: "+parse.getData());

    System.out.println("text: "+parse.getText());
   
  }
View Full Code Here


    }

    NutchDocument doc = new NutchDocument();
    doc.add("id", key.toString());

    final Metadata metadata = parseData.getContentMeta();

    // add segment, used to map from merged index back to segment files
    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
View Full Code Here

  private float weight;

  public NutchDocument() {
    fields = new HashMap<String, NutchField>();
    documentMeta = new Metadata();
    weight = 1.0f;
  }
View Full Code Here

          res.addHeader("X-Handled-By", getClass().getSimpleName());
          return;
        }
        byte[] data = c.getContent();
        LOG.debug("-data len=" + data.length);
        Metadata meta = c.getMetadata();
        String[] names = meta.names();
        LOG.debug("- " + names.length + " meta");
        for (int i = 0; i < names.length; i++) {
          boolean my = true;
          char ch = names[i].charAt(0);
          if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
            // pretty good chance it's a standard header
            my = false;
          }
          String[] values = meta.getValues(names[i]);
          for (int k = 0; k < values.length; k++) {
            if (my) {
              addMyHeader(res, names[i], values[k]);
            } else {
              res.addHeader(names[i], values[k]);
            }
          }
        }
        req.setHandled(true);
        res.addHeader("X-Handled-By", getClass().getSimpleName());
        res.setContentType(meta.get(Metadata.CONTENT_TYPE));
        res.setContentLength(data.length);
        OutputStream os = res.getOutputStream();
        os.write(data, 0, data.length);
        res.flushBuffer();
      } else {
View Full Code Here

  private String sampleFileMultival = "testMultivalueMetatags.html";
  private String description = "This is a test of description";
  private String keywords = "This is a test of keywords";

  public Metadata parseMeta(String fileName, Configuration conf) {
    Metadata metadata = null;
    try {
      String urlString = "file:" + sampleDir + fileSeparator + fileName;
      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
View Full Code Here

  /** test defaults: keywords and description */
  public void testIt() {
    Configuration conf = NutchConfiguration.create();

    // check that we get the same values
    Metadata parseMeta = parseMeta(sampleFile, conf);

    Assert.assertEquals(description, parseMeta.get("metatag.description"));
    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
  }
View Full Code Here

  public void testMultiValueMetatags() {
    Configuration conf = NutchConfiguration.create();
    conf.set("metatags.names", "keywords,DC.creator");
    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");

    Metadata parseMeta = parseMeta(sampleFileMultival, conf);

    String failMessage = "One value of metatag with multiple values is missing: ";

    Set<String> valueSet = new TreeSet<String>();
    for (String val : parseMeta.getValues("metatag.dc.creator")) {
      valueSet.add(val);
    }
    String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" };
    for (String val : expectedValues1) {
      Assert.assertTrue(failMessage + val, valueSet.contains(val));
    }

    valueSet.clear();
    for (String val : parseMeta.getValues("metatag.keywords")) {
      valueSet.add(val);
    }
    String[] expectedValues2 = { "robot d'indexation", "web crawler",
        "Webcrawler" };
    for (String val : expectedValues2) {
View Full Code Here

    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
      System.out.println("key: " + entry.getKey());
      Parse parse = entry.getValue();
      System.out.println("data: " + parse.getData());
      System.out.println("text: " + parse.getText() + "\n");
View Full Code Here

  }

  private void addToMap(ParseResult parseResult, SyndFeed feed,
      String feedLink, SyndEntry entry, Content content) {
    String link = entry.getLink(), text = null, title = null;
    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
    Parse parse = null;
    SyndContent description = entry.getDescription();

    try {
      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);

      if (link != null)
        link = filters.filter(link);
    } catch (Exception e) {
      e.printStackTrace();
      return;
    }

    if (link == null)
      return;

    title = stripTags(entry.getTitleEx());

    if (feedLink != null)
      parseMeta.set("feed", feedLink);

    addFields(parseMeta, contentMeta, feed, entry);

    // some item descriptions contain markup text in them,
    // so we temporarily set their content-type to parse them
View Full Code Here

      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
     
      ParseResult parseResult = null;
      if (content != null) {
        Metadata metadata = content.getMetadata();
       
        // store the guessed content type in the crawldatum
        if (content.getContentType() != null) datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
       
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
          scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
View Full Code Here

TOP

Related Classes of org.apache.nutch.metadata.Metadata

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.