Examples of ProtocolFactory


Examples of org.apache.nutch.protocol.ProtocolFactory

    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);

      int index = parse.getText().indexOf(expectedText);
      assertTrue(index > 0);
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

    //LOG.setLevel(Level.FINE);
    String url = args[0];
    Configuration conf = NutchConfiguration.create();
    RSSParser parser = new RSSParser();
    parser.setConf(conf);
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
    Parse parse = parser.getParse(content);
    System.out.println("data: "+ parse.getData());
    System.out.println("text: "+parse.getText());
  }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content);
      assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

    FileOutputStream fos = new FileOutputStream(tempFile);
    fos.write(expectedText.getBytes());
    fos.close();

    // get nutch content
    Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    protocol = null;
  }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    ParseUtil parser = new ParseUtil(conf);
    ProtocolFactory factory = new ProtocolFactory(conf);
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = factory.getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
                                           new CrawlDatum()).getContent();
      parse = parser.parseByExtensionId("parse-msexcel", content);

      assertTrue(parse.getText().equals(expectedText));
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

        Configuration conf = NutchConfiguration.create();
        for (int i = 0; i < sampleFiles.length; i++) {
            urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

            protocol = new ProtocolFactory(conf).getProtocol(urlString);
            content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
            parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content);

            //check that there are 3 outlinks:
            //http://test.channel.com
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

    Content content;
    Parse parse;

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
    String text = parse.getText();
    assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

    Configuration conf = NutchConfiguration.create();

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content);

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

    Configuration conf = NutchConfiguration.create();
    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content);

      assertTrue(parse.getText().startsWith(expectedText));
    }
View Full Code Here

Examples of org.apache.nutch.protocol.ProtocolFactory

    super.setUp();

    this.urlString = createUrl(this.testFile.getName());

    System.out.println("Testing file: " + this.urlString + "...");
    this.protocol =new ProtocolFactory(NutchConfiguration.create()).getProtocol(this.urlString);
    this.content = this.protocol.getProtocolOutput(new Text(this.urlString), new CrawlDatum()).getContent();
  }
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.