Package: org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata


    } else {
      // FIXME: implement charset detector. This code causes problem when
      // character set isn't specified in HTTP header.
      text = new String(content.getContent()); // use default encoding
    }
    Metadata meta=content.getMetadata();
    String title=getTitle(text);
    meta.set(Metadata.TITLE, title);
    content.setMetadata(meta);
       
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
        OutlinkExtractor.getOutlinks(text, getConf()), meta);
    parseData.setConf(this.conf);
View Full Code Here


        try {
            Configuration conf = NutchConfiguration.create();

            byte[] raw = getRawBytes(new File(file));
            Metadata meta = new Metadata();
            Content content = new Content(file, file, raw, "trec/plain", meta, conf);

            //Protocol protocol = new ProtocolFactory(conf).getProtocol(file);                                                                                                                                                             
            //Content content = protocol.getProtocolOutput(new Text(file), new CrawlDatum()).getContent();                                                                                                                                 
            //Parse parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);                                                                                                                                                  
View Full Code Here

                             "Can't be handled as Microsoft document. " + e)
                             .getEmptyParse(this.conf);
    }
   
    // collect meta data
    Metadata metadata = new Metadata();
    if (properties != null) {
      title = properties.getProperty(DublinCore.TITLE);
      properties.remove(DublinCore.TITLE);
      metadata.setAll(properties);
    }

    if (text == null) { text = ""; }
    if (title == null) { title = ""; }
View Full Code Here

     
    String file = args[0];
    byte[] raw = getRawBytes(new File(file));
       
    Metadata meta = new Metadata();
    meta.set(Response.CONTENT_LENGTH, "" + raw.length);
    Content content = new Content(file, file, raw, mime, meta,
                                  NutchConfiguration.create());

    System.out.println(parser.getParse(content).getText());
  }
View Full Code Here

    // raw bytes
    byte[] bytes = bean.getContent(details);

    // pass all original headers? only these for now.
    Metadata metadata = bean.getParseData(details).getContentMeta();
    String contentType = metadata.get(Response.CONTENT_TYPE);
    //String lastModified = metadata.get(Metadata.LAST_MODIFIED);
    //String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
    // better use this, since it may have been truncated during fetch
    // or give warning if they don't match?
    int contentLength = bytes.length;
View Full Code Here

    {
      return;
    }

    // Copy http headers to nutch metadata.
    final Metadata metaData = new Metadata();
    final Header[] headers = rec.getHttpHeaders();
    for (int j = 0; j < headers.length; j++)
    {
      final Header header = headers[j];
     
      if (mimetype == null)
      {
        // Special handling. If mimetype is still null, try getting it
        // from the http header. I've seen arc record lines with empty
        // content-type and a MIME unparseable file ending; i.e. .MID.
        if ((header.getName() != null) &&
          header.getName().toLowerCase().equals(ImportArcs.CONTENT_TYPE_KEY))
        {
          mimetype = getMimetype(header.getValue(), null, null);
         
          if (skip(mimetype))
          {
            return;
          }
        }
      }
     
      metaData.set(header.getName(), header.getValue());
    }

    // This call to reporter setStatus pings the tasktracker telling it our
    // status and telling the task tracker we're still alive (so it doesn't
    // time us out).
    final String noSpacesMimetype =
      TextUtils.replaceAll(ImportArcs.WHITESPACE,
      ((mimetype == null || mimetype.length() <= 0)?
      "TODO": mimetype),
      "-");
    final String recordLengthAsStr = Long.toString(recordLength);
   
    reporter.setStatus(getStatus(url, oldUrl, recordLengthAsStr, noSpacesMimetype));

    // This is a nutch 'more' field.
    metaData.set("contentLength", recordLengthAsStr);

    rec.skipHttpHeader();
    reporter.setStatusIfElapse("read headers on " + url);

    // TODO: Skip if unindexable type.
    int total = 0;
   
    // Read in first block. If mimetype still null, look for MAGIC.
    int len = rec.read(this.buffer, 0, this.buffer.length);
   
    if (mimetype == null)
    {
      MimeType mt = this.mimeTypes.getMimeType(this.buffer);
     
      if (mt == null || mt.getName() == null)
      {
        LOG.warn("Failed to get mimetype for: " + url);
       
        return;
      }
     
      mimetype = mt.getName();
    }
   
    metaData.set(ImportArcs.CONTENT_TYPE_KEY, mimetype);

    // How much do we read total? If pdf, we will read more. If equal to -1,
    // read all.
    int readLimit = (ImportArcs.PDF_TYPE.equals(mimetype))?
      this.pdfContentLimit : this.contentLimit;
   
    // Reset our contentBuffer so can reuse.  Over the life of an ARC
    // processing will grow to maximum record size.
    this.contentBuffer.reset();
    while ((len != -1) && ((readLimit == -1) || (total < readLimit)))
    {
      total += len;
      this.contentBuffer.write(this.buffer, 0, len);
      len = rec.read(this.buffer, 0, this.buffer.length);
      reporter.setStatusIfElapse("reading " + url);
    }

    // Close the Record.  We're done with it.  Side-effect is calculation
    // of digest -- if we're digesting.
    rec.close();
    reporter.setStatusIfElapse("closed " + url);

    final byte[] contentBytes = this.contentBuffer.toByteArray();
    final CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);

    // Calculate digest or use precalculated sha1.
    String digest = (this.sha1)? rec.getDigestStr():
    MD5Hash.digest(contentBytes).toString();
    metaData.set(Nutch.SIGNATURE_KEY, digest);
   
    // Set digest back into the arcData so available later when we write
    // CDX line.
    arcData.setDigest(digest);

    metaData.set(Nutch.SEGMENT_NAME_KEY, this.segmentName);
   
    // Score at this stage is 1.0f.
    metaData.set(Nutch.SCORE_KEY, Float.toString(datum.getScore()));

    final long startTime = System.currentTimeMillis();
    final Content content = new Content(url, url, contentBytes, mimetype,
      metaData, getConf());
    datum.setFetchTime(Nutchwax.getDate(arcData.getDate()));
View Full Code Here

    Outlink[] outlinks = new Outlink[] {
      new Outlink("http://foo.com/", "Foo"),
      new Outlink("http://bar.com/", "Bar")
    };

    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    metaData.add("Charset", "UTF-8");

    ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
                       
    WritableTestUtils.testWritable(r, null);
  }
View Full Code Here

      outlinks[i] = new Outlink("http://outlink.com/" + i, "Outlink" + i);
    }
    ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS,
                                       "Max Outlinks Title",
                                       outlinks,
                                       new Metadata());
    ParseData data = (ParseData) WritableTestUtils.writeRead(original, null);
    Assert.assertEquals(outlinks.length, data.getOutlinks().length);
  }
View Full Code Here

  @Test
  public void testGuessing() {
    // first disable auto detection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

    Metadata metadata = new Metadata();
    EncodingDetector detector;
    Content content;
    String encoding;

    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    // no information is available, so it should return default encoding
    Assert.assertEquals("windows-1252", encoding.toLowerCase());

    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    Assert.assertEquals("utf-16", encoding.toLowerCase());

    metadata.clear();
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(content, "windows-1252");
    Assert.assertEquals("windows-1254", encoding.toLowerCase());

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("utf-32", "sniffed");
View Full Code Here

    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);

    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
      new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
      "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.metadata.Metadata

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., which is owned by Oracle, Inc. Contact coftware#gmail.com.