Package org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata


      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

      if (content == null) {
        String url = key.toString();
        content = new Content(url, url, new byte[0], "", new Metadata(), this.conf);
      }
      Metadata metadata = content.getMetadata();
      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          e.printStackTrace(LogUtil.getWarnStream(LOG));
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }

      Parse parse = null;
      if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
        ParseStatus parseStatus;
        try {
          parse = this.parseUtil.parse(content);
          parseStatus = parse.getData().getStatus();
        } catch (Exception e) {
          parseStatus = new ParseStatus(e);
        }
        if (!parseStatus.isSuccess()) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Error parsing: " + key + ": " + parseStatus);
          }
          parse = parseStatus.getEmptyParse(getConf());
        }
        // Calculate page signature. For non-parsing fetchers this will
        // be done in ParseSegment
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
        metadata.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        datum.setSignature(signature);
        // Ensure segment name and score are in parseData metadata
        parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
        parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        try {
View Full Code Here


  private Configuration conf;

  public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
   
    Metadata metadata = parse.getData().getParseMeta();
    // index the license
    String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
    if (licenseUrl != null) {
      if (LOG.isInfoEnabled()) {
        LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
      }

      // add the entire license as cc:license=xxx
      addFeature(doc, "license=" + licenseUrl);

      // index license attributes extracted of the license url
      addUrlFeatures(doc, licenseUrl);
    }

    // index the license location as cc:meta=xxx
    String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
    if (licenseLocation != null) {
      addFeature(doc, "meta=" + licenseLocation);
    }

    // index the work type cc:type=xxx
    String workType = metadata.get(CreativeCommons.WORK_TYPE);
    if (workType != null) {
      addFeature(doc, workType);
    }

    return doc;
View Full Code Here

        || parseText == null || parseData == null) {
      return;                                     // only have inlinks
    }

    Document doc = new Document();
    Metadata metadata = parseData.getContentMeta();

    // add segment, used to map from merged index back to segment files
    doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
            Field.Store.YES, Field.Index.NO));

    // add digest, used by dedup
    doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
            Field.Store.YES, Field.Index.NO));

//     if (LOG.isInfoEnabled()) {
//       LOG.info("Url: "+key.toString());
//       LOG.info("Title: "+parseData.getTitle());
View Full Code Here

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
View Full Code Here

    }

    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    Metadata metadata = new Metadata();

    // parse the content
    DocumentFragment root;
    try {
      byte[] contentInOctets = content.getContent();
      InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
      String contentType = content.getMetadata().get(Response.CONTENT_TYPE);
      String encoding = StringUtil.parseCharacterEncoding(contentType);
      if ((encoding != null) && !("".equals(encoding))) {
        metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
        if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
          metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
          if (LOG.isTraceEnabled()) {
            LOG.trace(base + ": setting encoding to " + encoding);
          }
        }
      }

      // sniff out 'charset' value from the beginning of a document
      if ((encoding == null) || ("".equals(encoding))) {
        encoding = sniffCharacterEncoding(contentInOctets);
        if (encoding!=null) {
          metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
          if ((encoding = StringUtil.resolveEncodingAlias(encoding)) != null) {
            metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
            if (LOG.isTraceEnabled()) {
              LOG.trace(base + ": setting encoding to " + encoding);
            }
          }
        }
      }

      if (encoding == null) {
        // fallback encoding.
        // FIXME : In addition to the global fallback value,
        // we should make it possible to specify fallback encodings for each ccTLD.
        // (e.g. se: windows-1252, kr: x-windows-949, cn: gb18030, tw: big5
        // doesn't work for jp because euc-jp and shift_jis have about the
        // same share)
        encoding = defaultCharEncoding;
        metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, defaultCharEncoding);
        if (LOG.isTraceEnabled()) {
          LOG.trace(base + ": falling back to " + defaultCharEncoding);
        }
      }
      input.setEncoding(encoding);
View Full Code Here

    in.readFully(bytes);
    Configuration conf = NutchConfiguration.create();
    HtmlParser parser = new HtmlParser();
    parser.setConf(conf);
    Parse parse = parser.getParse(
            new Content(url, url, bytes, "text/html", new Metadata(), conf));
    System.out.println("data: "+parse.getData());

    System.out.println("text: "+parse.getText());
   
  }
View Full Code Here

    inflated = true;
  }

  protected final void readFieldsCompressed(DataInput in) throws IOException {
    version = in.readByte();
    metadata = new Metadata();
    switch (version) {
    case 0:
    case 1:
      url = UTF8.readString(in); // read url
      base = UTF8.readString(in); // read base
View Full Code Here

                                logger.debug("       CONTENT OF "+key.toString()+":");
                            }
                            currRec.origurl = value.getUrl();
                            currRec.newurl = value.getBaseUrl();
                            currRec.content = value.getContent();
                            Metadata metadata = value.getMetadata();
                            currRec.header = new HashMap<String,String>();
                            for (String name : metadata.names()) {
                                String data = metadata.get(name);
                                currRec.header.put(name.toLowerCase(),data);
                            }
                            currRec.header.remove("nutch.content.digest");
                            currRec.header.remove("nutch.crawl.score");
                            currRec.header.remove("nutch.segment.name");
View Full Code Here

    in.close();
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();

    Content content =
      new Content(url, url, bytes, contentType, new Metadata(), conf);
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html", content).get(content.getUrl());

    Metadata metadata = parse.getData().getParseMeta();
    assertEquals(license, metadata.get("License-Url"));
    assertEquals(location, metadata.get("License-Location"));
    assertEquals(type, metadata.get("Work-Type"));
  }
View Full Code Here

  private Configuration conf;

  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
   
    Metadata metadata = parse.getData().getParseMeta();
    // index the license
    String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
    if (licenseUrl != null) {
      if (LOG.isInfoEnabled()) {
        LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
      }

      // add the entire license as cc:license=xxx
      addFeature(doc, "license=" + licenseUrl);

      // index license attributes extracted of the license url
      addUrlFeatures(doc, licenseUrl);
    }

    // index the license location as cc:meta=xxx
    String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
    if (licenseLocation != null) {
      addFeature(doc, "meta=" + licenseLocation);
    }

    // index the work type cc:type=xxx
    String workType = metadata.get(CreativeCommons.WORK_TYPE);
    if (workType != null) {
      addFeature(doc, workType);
    }

    return doc;
View Full Code Here

TOP

Related Classes of org.apache.nutch.metadata.Metadata

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.