Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseUtil


      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content).get(content.getUrl());

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      assertTrue(sampleTexts[i].equals(text));
    }
  }
View Full Code Here


    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + rtfFile;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
    String text = parse.getText();
    assertEquals("The quick brown fox jumps over the lazy dog", text.trim());

    String title = parse.getData().getTitle();
    Metadata meta = parse.getData().getParseMeta();
View Full Code Here

    for (int i=0; i<sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content);

      assertTrue(parse.getText().startsWith(expectedText));
    }
  }
View Full Code Here

   * @see #SAMPLE_DIR
   * @throws Exception
   */
  public void testContent() throws Exception {

    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", this.content);

    ParseData data = parse.getData();
    String text = parse.getText();

View Full Code Here

   * @see #SAMPLE_DIR
   * @throws Exception
   */
  public void testMeta() throws Exception {

    Parse parse = new ParseUtil(NutchConfiguration.create())
                        .parseByExtensionId("parse-mspowerpoint", content);
   
    ParseData data = parse.getData();

    final FileExtensionFilter titleFilter = new FileExtensionFilter(
View Full Code Here

      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();

      parse = new ParseUtil(conf).parse(content);

      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
      assertTrue(sampleTexts[i].equals(text));
    }
  }
View Full Code Here

      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      Configuration conf = NutchConfiguration.create();
      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);

      int index = parse.getText().indexOf(expectedText);
      assertTrue(index > 0);
    }
  }
View Full Code Here

    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
      parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content);
      assertTrue(parse.getText().equals(expectedText));
    }
  }
View Full Code Here

  private static String getUrlContent(String url, Configuration conf) {
    Protocol protocol;
    try {
      protocol = new ProtocolFactory(conf).getProtocol(url);
      Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
      Parse parse = new ParseUtil(conf).parse(content);
      System.out.println("text:" + parse.getText());
      return parse.getText();

    } catch (ProtocolNotFound e) {
      e.printStackTrace();
View Full Code Here

    Configuration conf = NutchConfiguration.create();
    urlString = "file:" + sampleDir + fileSeparator + id3v2;
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
                      .getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
    Metadata metadata = parse.getData().getParseMeta();
    assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
    assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
    assertEquals("02", metadata.get("TRCK-Text"));
    assertEquals("http://localhost/", metadata.get("WCOP-URL Link"));
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseUtil

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.