Package org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata


  }

  /** Unit tests for getContentType(String, String, byte[]) method. */
  public void testGetContentType() throws Exception {
    Content c = null;
    Metadata p = new Metadata();

    c = new Content("http://www.foo.com/",
                    "http://www.foo.com/",
                    "".getBytes("UTF8"),
                    "text/html; charset=UTF-8", p, conf);
View Full Code Here


    }
  }
 
 
  private Content getContent(String text) {
    Metadata meta = new Metadata();
    meta.add("Content-Type", "text/html");
    return new Content(URL, BASE, text.getBytes(), "text/html", meta, NutchConfiguration.create());
  }
View Full Code Here

      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

      if (content == null) {
        String url = key.toString();
        content = new Content(url, url, new byte[0], "", new Metadata(), this.conf);
      }
      Metadata metadata = content.getMetadata();
      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          e.printStackTrace(LogUtil.getWarnStream(LOG));
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }

      Parse parse = null;
      if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
        ParseStatus parseStatus;
        try {
          parse = this.parseUtil.parse(content);
          parseStatus = parse.getData().getStatus();
        } catch (Exception e) {
          parseStatus = new ParseStatus(e);
        }
        if (!parseStatus.isSuccess()) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Error parsing: " + key + ": " + parseStatus);
          }
          parse = parseStatus.getEmptyParse(getConf());
        }
        // Calculate page signature. For non-parsing fetchers this will
        // be done in ParseSegment
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
        metadata.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        datum.setSignature(signature);
        // Ensure segment name and score are in parseData metadata
        parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
        parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        try {
View Full Code Here

        return new ParseStatus(ParseStatus.FAILED,
                               ParseStatus.FAILED_EXCEPTION,
                               e.toString()).getEmptyParse(conf);
    }

    Metadata metadata = new Metadata();
    metadata.setAll(delegate.getMetaData());
    String title = metadata.get(DublinCore.TITLE);

    if (title != null) {
      metadata.remove(DublinCore.TITLE);
    } else {
      title = "";
    }

    String text = delegate.getText();
View Full Code Here

  }
 
  public Parse getParse(Content content) {
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();
    ArrayList outlinks = new ArrayList();

    try {
      byte[] raw = content.getContent();
      String contentLength = content.getMetadata().get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete files.").getEmptyParse(conf);
      }
      ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw));
      ZipEntry ze = null;
      while ((ze = zis.getNextEntry()) != null) {
        if (ze.getName().equals("content.xml")) {
          text = parseContent(ze, zis, outlinks);
        } else if (ze.getName().equals("meta.xml")) {
          parseMeta(ze, zis, metadata);
        }
      }
      zis.close();
    } catch (Exception e) { // run time exception
      e.printStackTrace(LogUtil.getWarnStream(LOG));
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as OO document. " + e).getEmptyParse(conf);
    }

    title = metadata.get(Metadata.TITLE);
    if (text == null)
      text = "";

    if (title == null)
      title = "";
View Full Code Here

    oo.setConf(conf);
    FileInputStream fis = new FileInputStream(args[0]);
    byte[] bytes = new byte[fis.available()];
    fis.read(bytes);
    fis.close();
    Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf);
    Parse p = oo.getParse(c);
    System.out.println(p.getData());
    System.out.println("Text: '" + p.getText() + "'");
    /*
    // create the test output file
View Full Code Here

    in.close();
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();

    Content content =
      new Content(url, url, bytes, contentType, new Metadata(), conf);
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content);

    Metadata metadata = parse.getData().getParseMeta();
    assertEquals(license, metadata.get("License-Url"));
    assertEquals(location, metadata.get("License-Location"));
    assertEquals(type, metadata.get("Work-Type"));
  }
View Full Code Here

    urlString = "file:" + sampleDir + fileSeparator + id3v2;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
    assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
    assertEquals("02", metadata.get("TRCK-Text"));
    assertEquals("http://localhost/", metadata.get("WCOP-URL Link"));
    assertEquals("postgresql artist id3v2", metadata.get("TPE1-Text"));
    assertEquals("(28)", metadata.get("TCON-Text"));
    assertEquals("2004", metadata.get("TYER-Text"));
    assertEquals("postgresql title id3v2", metadata.get("TIT2-Text"));
    assertEquals("postgresql album id3v2", metadata.get("TALB-Text"));
    assertEquals("postgresql encoded by id3v2", metadata.get("TENC-Text"));

    assertEquals("postgresql title id3v2 - "
        + "postgresql album id3v2 - "
        + "postgresql artist id3v2", parse.getData().getTitle());
    assertEquals("http://localhost/", parse.getData().getOutlinks()[0].getToUrl());
View Full Code Here

    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);

    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));
    assertEquals("postgresql artist id3v1", metadata.get("TPE1-Text"));
    assertEquals("(28)", metadata.get("TCON-Text"));
    assertEquals("2004", metadata.get("TYER-Text"));
    assertEquals("postgresql title id3v1", metadata.get("TIT2-Text"));
    assertEquals("postgresql album id3v1", metadata.get("TALB-Text"));

    assertEquals("postgresql title id3v1 - "
        + "postgresql album id3v1 - "
        + "postgresql artist id3v1", parse.getData().getTitle());
View Full Code Here

    byte[] buf = new byte[in.available()];
    in.read(buf);
    SWFParser parser = new SWFParser();
    Parse p = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
                                          buf, "application/x-shockwave-flash",
                                          new Metadata(),
                                          NutchConfiguration.create()));
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
    System.out.println(p.getData());
View Full Code Here

TOP

Related Classes of org.apache.nutch.metadata.Metadata

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.