Examples of org.apache.nutch.metadata.Metadata

org.apache.nutch.metadata.Metadata
A multi-valued metadata container.

      assertEquals(expected[i], parts[i]);
    }
  }
  
  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());

View Full Code Here

        fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
      return;
    }


    NutchDocument doc = new NutchDocument();
    final Metadata metadata = parseData.getContentMeta();


    // add segment, used to map from merged index back to segment files
    doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));


    // add digest, used by dedup
    doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));


    final Parse parse = new ParseImpl(parseText, parseData);
    try {
      // extract information from dbDatum and pass it to
      // fetchDatum so that indexing filters can use it

View Full Code Here

  private Metadata metadata;


  private MimeUtil mimeTypes;


  public Content() {
    metadata = new Metadata();
  }

View Full Code Here

    urlString = "file:" + sampleDir + fileSeparator + id3v2;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content).get(urlString);
    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
    assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
    assertEquals("02", metadata.get("TRCK-Text"));
    assertEquals("http://localhost/", metadata.get("WCOP-URL Link"));
    assertEquals("postgresql artist id3v2", metadata.get("TPE1-Text"));
    assertEquals("(28)", metadata.get("TCON-Text"));
    assertEquals("2004", metadata.get("TYER-Text"));
    assertEquals("postgresql title id3v2", metadata.get("TIT2-Text"));
    assertEquals("postgresql album id3v2", metadata.get("TALB-Text"));
    assertEquals("postgresql encoded by id3v2", metadata.get("TENC-Text"));


    assertEquals("postgresql title id3v2 - "
        + "postgresql album id3v2 - "
        + "postgresql artist id3v2", parse.getData().getTitle());
    assertEquals("http://localhost/", parse.getData().getOutlinks()[0].getToUrl());

View Full Code Here

    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content).get(urlString);


    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));
    assertEquals("postgresql artist id3v1", metadata.get("TPE1-Text"));
    assertEquals("(28)", metadata.get("TCON-Text"));
    assertEquals("2004", metadata.get("TYER-Text"));
    assertEquals("postgresql title id3v1", metadata.get("TIT2-Text"));
    assertEquals("postgresql album id3v1", metadata.get("TALB-Text"));


    assertEquals("postgresql title id3v1 - "
        + "postgresql album id3v1 - "
        + "postgresql artist id3v1", parse.getData().getTitle());

View Full Code Here

  private Document createLuceneDoc(NutchDocument doc) {
    final Document out = new Document();


    out.setBoost(doc.getScore());


    final Metadata documentMeta = doc.getDocumentMeta();
    for (final Entry<String, List<String>> entry : doc) {
      final String fieldName = entry.getKey();


      Field.Store store = fieldStore.get(fieldName);
      Field.Index index = fieldIndex.get(fieldName);
      Field.TermVector vector = fieldVector.get(fieldName);


      // default values
      if (store == null) {
        store = Field.Store.NO;
      }


      if (index == null) {
        index = Field.Index.NO;
      }


      if (vector == null) {
        vector = Field.TermVector.NO;
      }


      // read document-level field information
      final String[] fieldMetas =
        documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName);
      if (fieldMetas.length != 0) {
        for (final String val : fieldMetas) {
          if (LuceneConstants.STORE_YES.equals(val)) {
            store = Field.Store.YES;
          } else if (LuceneConstants.STORE_NO.equals(val)) {

View Full Code Here

   * */
  @Deprecated
  public static void add(NutchDocument doc, Field f) {
    final String fieldName = f.name();
    final String key = LuceneConstants.FIELD_PREFIX + fieldName;
    final Metadata documentMeta = doc.getDocumentMeta();
    if (f.isStored()) {
      documentMeta.add(key, LuceneConstants.STORE_YES);
    } else if (f.isCompressed()) {
      documentMeta.add(key, LuceneConstants.STORE_COMPRESS);
    } else {
      documentMeta.add(key, LuceneConstants.STORE_NO);
    }


    if (f.isIndexed()) {
      if (f.isTokenized()) {
        documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED);
      } else if (f.getOmitNorms()) {
        documentMeta.add(key, LuceneConstants.INDEX_NO_NORMS);
      } else {
        documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED);
      }
    } else {
      documentMeta.add(key, LuceneConstants.INDEX_NO);
    }


    if (f.isStoreOffsetWithTermVector() && f.isStorePositionWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_POS_OFFSET);
    } else if (f.isStoreOffsetWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_OFFSET);
    } else if (f.isStorePositionWithTermVector()) {
      documentMeta.add(key, LuceneConstants.VECTOR_POS);
    } else if (f.isTermVectorStored()) {
      documentMeta.add(key, LuceneConstants.VECTOR_YES);
    } else {
      documentMeta.add(key, LuceneConstants.VECTOR_NO);
    }
  }

View Full Code Here

  }


  /** Unit tests for getContentType(String, String, byte[]) method. */
  public void testGetContentType() throws Exception {
    Content c = null;
    Metadata p = new Metadata();


    c = new Content("http://www.foo.com/",
                    "http://www.foo.com/",
                    "".getBytes("UTF8"),
                    "text/html; charset=UTF-8", p, conf);

View Full Code Here

      Text key = new Text();
      ParseData value = new ParseData();
      if(!reader.next(key, value)) break READ_PARSE_DATA;
      // make sure they all contain "nutch.segment.name" and "nutch.content.digest" 
      // keys in parse metadata
      Metadata contentMeta = value.getContentMeta();
      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null 
            && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
        handledurls.add(key.toString());
      }
    } while(true);
    
    Collections.sort(handledurls);

View Full Code Here


  public void testGuessing() {
    // first disable auto detection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);


    Metadata metadata = new Metadata();
    EncodingDetector detector;
    Content content;
    String encoding;


    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    // no information is available, so it should return default encoding
    assertEquals("windows-1252", encoding.toLowerCase());


    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    encoding = detector.guessEncoding(content, "windows-1252");
    assertEquals("utf-16", encoding.toLowerCase());


    metadata.clear();
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("windows-1254", "sniffed");
    encoding = detector.guessEncoding(content, "windows-1252");
    assertEquals("windows-1254", encoding.toLowerCase());


    // enable autodetection
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    metadata.clear();
    metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    content = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", metadata, conf);
    detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    detector.addClue("utf-32", "sniffed");

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.metadata.Metadata

com.atlantbh.nutch.filter.index.omit.OmitIndexingFilter

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilter

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.atlantbh.nutch.filter.xpath.XPathIndexingFilter

com.atlantbh.nutch.filter.xpath.XPathIndexingFilterTest

com.atlantbh.nutch.index.alternativedataflow.flow.CsvDataFlow

com.flaptor.hounder.crawler.Nutch9Fetcher$NutchSegment$SegmentIterator

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

org.apache.nutch.fetcher.Fetcher$FetcherThread

org.apache.nutch.fetcher.Fetcher2$FetcherThread

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.