Examples of org.apache.nutch.protocol.Content

org.apache.nutch.protocol.Content


                    logger.debug("   Content...");
                    while (advanceReader(ContentData)) {
                        logger.debug("     got reader");
                        Text key = new Text();
                        Content value = new Content();
                        if (readers[ContentData].next(key, value)) { 
                            if (logger.isDebugEnabled()) {
                                logger.debug("       CONTENT OF "+key.toString()+":");
                            }
                            currRec.origurl = value.getUrl();
                            currRec.newurl = value.getBaseUrl();
                            currRec.content = value.getContent();
                            Metadata metadata = value.getMetadata();
                            currRec.header = new HashMap<String,String>();
                            for (String name : metadata.names()) {
                                String data = metadata.get(name);
                                currRec.header.put(name.toLowerCase(),data);
                            }

View Full Code Here

    }
    in.close();
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();


    Content content =
      new Content(url, url, bytes, contentType, new Metadata(), conf);
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html", content).get(content.getUrl());


    Metadata metadata = parse.getData().getParseMeta();
    assertEquals(license, metadata.get("License-Url"));
    assertEquals(location, metadata.get("License-Location"));
    assertEquals(type, metadata.get("Work-Type"));

View Full Code Here

    String url = args[0];
    Configuration conf = NutchConfiguration.create();
    RSSParser parser = new RSSParser();
    parser.setConf(conf);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
    Parse parse = parser.getParse(content).get(content.getUrl());
    System.out.println("data: "+ parse.getData());
    System.out.println("text: "+parse.getText());
  }

View Full Code Here

      } else {
          contentType = "";
      }
    }
    if (content == null) content = EMPTY_CONTENT;
    return new Content(orig, base, content, contentType, headers);
  }

View Full Code Here

          contentType = type.getName();
      } else {
          contentType = "";
      }
    }
    return new Content(orig, base, content, contentType, headers);
  }

View Full Code Here

  protected void tearDown() {}


  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parser parser;
    Parse parse;


    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];


      protocol = ProtocolFactory.getProtocol(urlString);
      content = protocol.getProtocolOutput(urlString).getContent();


      parser = ParserFactory.getParser(content.getContentType(), urlString);
      parse = parser.getParse(content);


      assertTrue(parse.getText().startsWith(expectedText));
    }
  }

View Full Code Here

    try {


      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {


        Content content = getContent(docs[t]);
        Parser parser = ParserFactory.getParser("text/html", URL);
        Parse parse = parser.getParse(content);


        assertEquals(metalanguages[t], (String) parse.getData().get(
            HTMLLanguageParser.META_LANG_NAME));

View Full Code Here

  
  private Content getContent(String text) {
    Properties p = new Properties();
    p.put("Content-Type", "text/html");


    Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
    return content;
  }

View Full Code Here

    }
    in.close();
    byte[] bytes = out.toByteArray();


    Parser parser = ParserFactory.getParser(contentType, url);
    Content content =
      new Content(url, url, bytes, contentType, new Properties());
    Parse parse = parser.getParse(content);


    Properties metadata = parse.getData().getMetadata();
    assertEquals(license, metadata.get("License-Url"));
    assertEquals(location, metadata.get("License-Location"));

View Full Code Here

      file.setMaxContentLength(maxContentLength);


    // set log level
    LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));


    Content content = file.getProtocolOutput(urlString).getContent();


    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " + content.get("Content-Length"));
    System.err.println("Last-Modified: " + content.get("Last-Modified"));
    if (dumpContent) {
      System.out.print(new String(content.getContent()));
    }


    file = null;
  }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.apache.nutch.protocol.Content

com.atlantbh.nutch.filter.xpath.XPathHtmlParserFilterTest

com.flaptor.hounder.crawler.Nutch9Fetcher$NutchSegment$SegmentIterator

nise.ExtractNutch

org.apache.commons.cli.Options

org.apache.hadoop.conf.Configuration

org.apache.hadoop.fs.FileSystem

org.apache.hadoop.fs.Path

org.apache.hadoop.util.GenericOptionsParser

org.apache.nutch.analysis.lang.LanguageIdentifier

org.apache.nutch.analysis.lang.TestHTMLLanguageParser

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.