Package org.apache.nutch.protocol

Examples of org.apache.nutch.protocol.Content


                    logger.debug("   Content...");
                    while (advanceReader(ContentData)) {
                        logger.debug("     got reader");
                        Text key = new Text();
                        Content value = new Content();
                        if (readers[ContentData].next(key, value)) {
                            if (logger.isDebugEnabled()) {
                                logger.debug("       CONTENT OF "+key.toString()+":");
                            }
                            currRec.origurl = value.getUrl();
                            currRec.newurl = value.getBaseUrl();
                            currRec.content = value.getContent();
                            Metadata metadata = value.getMetadata();
                            currRec.header = new HashMap<String,String>();
                            for (String name : metadata.names()) {
                                String data = metadata.get(name);
                                currRec.header.put(name.toLowerCase(),data);
                            }
View Full Code Here


    }
    in.close();
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();

    Content content =
      new Content(url, url, bytes, contentType, new Metadata(), conf);
    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html", content).get(content.getUrl());

    Metadata metadata = parse.getData().getParseMeta();
    assertEquals(license, metadata.get("License-Url"));
    assertEquals(location, metadata.get("License-Location"));
    assertEquals(type, metadata.get("Work-Type"));
View Full Code Here

    String url = args[0];
    Configuration conf = NutchConfiguration.create();
    RSSParser parser = new RSSParser();
    parser.setConf(conf);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
    Parse parse = parser.getParse(content).get(content.getUrl());
    System.out.println("data: "+ parse.getData());
    System.out.println("text: "+parse.getText());
  }
View Full Code Here

      } else {
          contentType = "";
      }
    }
    if (content == null) content = EMPTY_CONTENT;
    return new Content(orig, base, content, contentType, headers);
  }
View Full Code Here

          contentType = type.getName();
      } else {
          contentType = "";
      }
    }
    return new Content(orig, base, content, contentType, headers);
  }
View Full Code Here

  protected void tearDown() {}

  public void testIt() throws ProtocolException, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    Parser parser;
    Parse parse;

    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = ProtocolFactory.getProtocol(urlString);
      content = protocol.getProtocolOutput(urlString).getContent();

      parser = ParserFactory.getParser(content.getContentType(), urlString);
      parse = parser.getParse(content);

      assertTrue(parse.getText().startsWith(expectedText));
    }
  }
View Full Code Here

    try {

      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {

        Content content = getContent(docs[t]);
        Parser parser = ParserFactory.getParser("text/html", URL);
        Parse parse = parser.getParse(content);

        assertEquals(metalanguages[t], (String) parse.getData().get(
            HTMLLanguageParser.META_LANG_NAME));
View Full Code Here

 
  private Content getContent(String text) {
    Properties p = new Properties();
    p.put("Content-Type", "text/html");

    Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
    return content;
  }
View Full Code Here

    }
    in.close();
    byte[] bytes = out.toByteArray();

    Parser parser = ParserFactory.getParser(contentType, url);
    Content content =
      new Content(url, url, bytes, contentType, new Properties());
    Parse parse = parser.getParse(content);

    Properties metadata = parse.getData().getMetadata();
    assertEquals(license, metadata.get("License-Url"));
    assertEquals(location, metadata.get("License-Location"));
View Full Code Here

      file.setMaxContentLength(maxContentLength);

    // set log level
    LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));

    Content content = file.getProtocolOutput(urlString).getContent();

    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " + content.get("Content-Length"));
    System.err.println("Last-Modified: " + content.get("Last-Modified"));
    if (dumpContent) {
      System.out.print(new String(content.getContent()));
    }

    file = null;
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.protocol.Content

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.