Package org.apache.nutch.metadata

Examples of org.apache.nutch.metadata.Metadata


      assertEquals(expected[i], parts[i]);
    }
  }
 
  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
View Full Code Here


        fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
      return;
    }

    NutchDocument doc = new NutchDocument();
    final Metadata metadata = parseData.getContentMeta();

    // add segment, used to map from merged index back to segment files
    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));

    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));

    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it
View Full Code Here

  private Metadata metadata;

  private MimeUtil mimeTypes;

  public Content() {
    metadata = new Metadata();
  }
View Full Code Here

    urlString = "file:" + sampleDir + fileSeparator + id3v2;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content).get(urlString);
    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
    assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
    assertEquals("02", metadata.get("TRCK-Text"));
    assertEquals("http://localhost/", metadata.get("WCOP-URL Link"));
    assertEquals("postgresql artist id3v2", metadata.get("TPE1-Text"));
    assertEquals("(28)", metadata.get("TCON-Text"));
    assertEquals("2004", metadata.get("TYER-Text"));
    assertEquals("postgresql title id3v2", metadata.get("TIT2-Text"));
    assertEquals("postgresql album id3v2", metadata.get("TALB-Text"));
    assertEquals("postgresql encoded by id3v2", metadata.get("TENC-Text"));

    assertEquals("postgresql title id3v2 - "
        + "postgresql album id3v2 - "
        + "postgresql artist id3v2", parse.getData().getTitle());
    assertEquals("http://localhost/", parse.getData().getOutlinks()[0].getToUrl());
View Full Code Here

    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content).get(urlString);

    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));
    assertEquals("postgresql artist id3v1", metadata.get("TPE1-Text"));
    assertEquals("(28)", metadata.get("TCON-Text"));
    assertEquals("2004", metadata.get("TYER-Text"));
    assertEquals("postgresql title id3v1", metadata.get("TIT2-Text"));
    assertEquals("postgresql album id3v1", metadata.get("TALB-Text"));

    assertEquals("postgresql title id3v1 - "
        + "postgresql album id3v1 - "
        + "postgresql artist id3v1", parse.getData().getTitle());
View Full Code Here

  private Document createLuceneDoc(NutchDocument doc) {
    final Document out = new Document();

    out.setBoost(doc.getScore());

    final Metadata documentMeta = doc.getDocumentMeta();
    for (final Entry<String, List<String>> entry : doc) {
      final String fieldName = entry.getKey();

      Field.Store store = fieldStore.get(fieldName);
      Field.Index index = fieldIndex.get(fieldName);
      Field.TermVector vector = fieldVector.get(fieldName);

      // default values
      if (store == null) {
        store = Field.Store.NO;
      }

      if (index == null) {
        index = Field.Index.NO;
      }

      if (vector == null) {
        vector = Field.TermVector.NO;
      }

      // read document-level field information
      final String[] fieldMetas =
        documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName);
      if (fieldMetas.length != 0) {
        for (final String val : fieldMetas) {
          if (LuceneConstants.STORE_YES.equals(val)) {
            store = Field.Store.YES;
          } else if (LuceneConstants.STORE_NO.equals(val)) {
View Full Code Here

   * */
  @Deprecated
  public static void add(NutchDocument doc, Field f) {
    final String fieldName = f.name();
    final String key = LuceneConstants.FIELD_PREFIX + fieldName;
    final Metadata documentMeta = doc.getDocumentMeta();
    if (f.isStored()) {
      documentMeta.add(key, LuceneConstants.STORE_YES);
    } else if (f.isCompressed()) {
      documentMeta.add(key, LuceneConstants.STORE_COMPRESS);
    } else {
      documentMeta.add(key, LuceneConstants.STORE_NO);
    }

    if (f.isIndexed()) {
      if (f.isTokenized()) {
        documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED);
      } else if (f.getOmitNorms()) {
        documentMeta.add(key, LuceneConstants.INDEX_NO_NORMS);
      } else {
        documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED);
      }
    } else {
      documentMeta.add(key, LuceneConstants.INDEX_NO);
    }

    if (f.isStoreOffsetWithTermVector() && f.isStorePositionWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_POS_OFFSET);
    } else if (f.isStoreOffsetWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_OFFSET);
    } else if (f.isStorePositionWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_POS);
    } else if (f.isTermVectorStored()) {
      documentMeta.add(key, LuceneConstants.VECTOR_YES);
    } else {
      documentMeta.add(key, LuceneConstants.VECTOR_NO);
    }
  }
View Full Code Here

  }

  /** Unit tests for getContentType(String, String, byte[]) method. */
  public void testGetContentType() throws Exception {
    Content c = null;
    Metadata p = new Metadata();

    c = new Content("http://www.foo.com/",
                    "http://www.foo.com/",
                    "".getBytes("UTF8"),
                    "text/html; charset=UTF-8", p, conf);
View Full Code Here

      Text key = new Text();
      ParseData value = new ParseData();
      if(!reader.next(key, value)) break READ_PARSE_DATA;
      // make sure they all contain "nutch.segment.name" and "nutch.content.digest"
      // keys in parse metadata
      Metadata contentMeta = value.getContentMeta();
      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
            && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
        handledurls.add(key.toString());
      }
    } while(true);
   
    Collections.sort(handledurls);
View Full Code Here

  public void testGuessing() {
    // first disable auto detection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

    Metadata metadata = new Metadata();
    EncodingDetector detector;
    Content content;
    String encoding;

    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    // no information is available, so it should return default encoding
    assertEquals("windows-1252", encoding.toLowerCase());

    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    assertEquals("utf-16", encoding.toLowerCase());

    metadata.clear();
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(content, "windows-1252");
    assertEquals("windows-1254", encoding.toLowerCase());

    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("utf-32", "sniffed");
View Full Code Here

TOP

Related Classes of org.apache.nutch.metadata.Metadata

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.