Package: org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata


  
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
                         Inlinks inlinks) throws IndexingException {
    ParseData parseData = parse.getData();
    Metadata parseMeta = parseData.getParseMeta();
   
    String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR);
    String[] tags = parseMeta.getValues(Feed.FEED_TAGS);
    String published = parseMeta.get(Feed.FEED_PUBLISHED);
    String updated = parseMeta.get(Feed.FEED_UPDATED);
    String feed = parseMeta.get(Feed.FEED);
   
    if (authors != null) {
      for (String author : authors) {
        doc.add(Feed.FEED_AUTHOR, author);
      }
View Full Code Here


          mostRecent = fetchTime;
        }
      }

      // get parse metadata
      Metadata metadata = parseData.getContentMeta();
      Parse parse = new ParseImpl(parseText, parseData);

      // handle redirect urls
      Text reprUrlText = (Text)fetchDatum.getMetaData().get(
        Nutch.WRITABLE_REPR_URL_KEY);
      String reprUrl = reprUrlText != null ? reprUrlText.toString() : null;
      String url = key.toString();
      String fieldUrl = (reprUrl != null) ? reprUrl : url;
      String host = URLUtil.getHost(fieldUrl);

      // add segment, used to map from merged index back to segment files
      FieldWritable segField = new FieldWritable(Fields.SEGMENT,
        metadata.get(Nutch.SEGMENT_NAME_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(segField);

      // add digest, used by dedup
      FieldWritable digestField = new FieldWritable(Fields.DIGEST,
        metadata.get(Nutch.SIGNATURE_KEY), FieldType.CONTENT, false, true,
        false);
      fieldsList.add(digestField);

      // url is both stored and indexed, so it's both searchable and returned
      fieldsList.add(new FieldWritable(Fields.URL, fieldUrl, FieldType.CONTENT,
View Full Code Here

  private Document createLuceneDoc(NutchDocument doc) {
    final Document out = new Document();

    out.setBoost(doc.getWeight());

    final Metadata documentMeta = doc.getDocumentMeta();
    for (final Entry<String, NutchField> entry : doc) {
      final String fieldName = entry.getKey();

      Field.Store store = fieldStore.get(fieldName);
      Field.Index index = fieldIndex.get(fieldName);
      Field.TermVector vector = fieldVector.get(fieldName);

      // default values
      if (store == null) {
        store = Field.Store.NO;
      }

      if (index == null) {
        index = Field.Index.NO;
      }

      if (vector == null) {
        vector = Field.TermVector.NO;
      }

      // read document-level field information
      final String[] fieldMetas =
        documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName);
      if (fieldMetas.length != 0) {
        for (final String val : fieldMetas) {
          if (LuceneConstants.STORE_YES.equals(val)) {
            store = Field.Store.YES;
          } else if (LuceneConstants.STORE_NO.equals(val)) {
View Full Code Here

   * */
  @Deprecated
  public static void add(NutchDocument doc, Field f) {
    final String fieldName = f.name();
    final String key = LuceneConstants.FIELD_PREFIX + fieldName;
    final Metadata documentMeta = doc.getDocumentMeta();
    if (f.isStored()) {
      documentMeta.add(key, LuceneConstants.STORE_YES);
    } else {
      documentMeta.add(key, LuceneConstants.STORE_NO);
    }

    if (f.isIndexed()) {
      if (f.isTokenized()) {
        documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED);
      } else if (f.getOmitNorms()) {
        documentMeta.add(key, LuceneConstants.INDEX_NO_NORMS);
      } else {
        documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED);
      }
    } else {
      documentMeta.add(key, LuceneConstants.INDEX_NO);
    }

    if (f.isStoreOffsetWithTermVector() && f.isStorePositionWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_POS_OFFSET);
    } else if (f.isStoreOffsetWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_OFFSET);
    } else if (f.isStorePositionWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_POS);
    } else if (f.isTermVectorStored()) {
      documentMeta.add(key, LuceneConstants.VECTOR_YES);
    } else {
      documentMeta.add(key, LuceneConstants.VECTOR_NO);
    }
  }
View Full Code Here

    // raw bytes
    byte[] bytes = bean.getContent(details);

    // pass all original headers? only these for now.
    Metadata metadata = bean.getParseData(details).getContentMeta();
    String contentType = metadata.get(Response.CONTENT_TYPE);
    //String lastModified = metadata.get(Metadata.LAST_MODIFIED);
    //String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
    // better use this, since it may have been truncated during fetch
    // or give warning if they don't match?
    int contentLength = bytes.length;
View Full Code Here

      datum.setFetchTime(System.currentTimeMillis());
      if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

      ParseResult parseResult = null;
      if (content != null) {
        Metadata metadata = content.getMetadata();
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
          scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
View Full Code Here

  }
 
  public ParseResult getParse(Content content) {
    String text = null;
    String title = null;
    Metadata metadata = new Metadata();
    ArrayList outlinks = new ArrayList();

    try {
      byte[] raw = content.getContent();
      String contentLength = content.getMetadata().get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), conf);
      }
      ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw));
      ZipEntry ze = null;
      while ((ze = zis.getNextEntry()) != null) {
        if (ze.getName().equals("content.xml")) {
          text = parseContent(ze, zis, outlinks);
        } else if (ze.getName().equals("meta.xml")) {
          parseMeta(ze, zis, metadata);
        }
      }
      zis.close();
    } catch (Exception e) { // run time exception
      e.printStackTrace(LogUtil.getWarnStream(LOG));
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as OO document. " + e).getEmptyParseResult(content.getUrl(), conf);
    }

    title = metadata.get(Metadata.TITLE);
    if (text == null)
      text = "";

    if (title == null)
      title = "";
View Full Code Here

    oo.setConf(conf);
    FileInputStream fis = new FileInputStream(args[0]);
    byte[] bytes = new byte[fis.available()];
    fis.read(bytes);
    fis.close();
    Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf);
    Parse p = oo.getParse(c).get(c.getUrl());
    System.out.println(p.getData());
    System.out.println("Text: '" + p.getText() + "'");
    /*
    // create the test output file
View Full Code Here

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;
    Metadata metadata = new Metadata();

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParseResult(content.getUrl(), getConf());
      }

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        //Just try using the default password and move on
        pdf.openProtection(new StandardDecryptionMaterial(""));
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
View Full Code Here

    }
  }
 
 
  private Content getContent(String text) {
    Metadata meta = new Metadata();
    meta.add("Content-Type", "text/html");
    return new Content(URL, BASE, text.getBytes(), "text/html", meta, NutchConfiguration.create());
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.metadata.Metadata

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.