Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseUtil


    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);
   
    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
View Full Code Here


  page.setBaseUrl(new Utf8(urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  MimeUtil mimeutil = new MimeUtil(conf);
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));
  parse = new ParseUtil(conf).parse(urlString, page);
   
  //begin assertion for tests
  ByteBuffer bbuf = page.getFromMetadata(new Utf8("Rel-Tag"));
  byte[] byteArray = new byte[bbuf.remaining()];
  bbuf.get(byteArray);
View Full Code Here

  page.setContent(ByteBuffer.wrap(bytes));
  MimeUtil mutil = new MimeUtil(conf);
  String mime = mutil.getMimeType(file);
  page.setContentType(new Utf8(mime));
 
  parse = new ParseUtil(conf).parse(urlString, page);
  return parse.getOutlinks();
  }
View Full Code Here

  page.setBaseUrl(new Utf8(urlString));
  page.setContent(ByteBuffer.wrap(bytes));
  String mtype = mimeutil.getMimeType(file);
  page.setContentType(new Utf8(mtype));

  parse = new ParseUtil(conf).parse(urlString, page);

  String title = parse.getTitle();
  String text = parse.getText();
  assertEquals("test rft document", title);
  //assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
View Full Code Here

    int threadCount = conf.getInt("fetcher.threads.fetch", 10);
    parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
    storingContent=conf.getBoolean("fetcher.store.content", true);
    if (parse) {
      skipTruncated=conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
      parseUtil = new ParseUtil(conf);
    }
    LOG.info("Fetcher: threads: " + threadCount);

    int maxFeedPerThread = conf.getInt("fetcher.queue.depth.multiplier", 50);
    feeder = new QueueFeeder(context, fetchQueues, threadCount * maxFeedPerThread);
 
View Full Code Here

   **/
  @Test
  public void testMetaHTMLParsing() {

    try {
      ParseUtil parser = new ParseUtil(NutchConfiguration.create());
      /* loop through the test documents and validate result */
      for (int t = 0; t < docs.length; t++) {
        WebPage page = getPage(docs[t]);
        parser.parse(URL.toString(), page);
        ByteBuffer blang = page.getFromMetadata(new Utf8(Metadata.LANGUAGE));
        String lang = Bytes.toString(blang);
        assertEquals(metalanguages[t], lang);
      }
    } catch (Exception e) {
View Full Code Here

    page.setContent(ByteBuffer.wrap(bytes));
    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));

    new ParseUtil(conf).parse(url, page);

    ByteBuffer bb = page.getFromMetadata(new Utf8("License-Url"));
    assertEquals(license, Bytes.toString(bb));
    bb = page.getFromMetadata(new Utf8("License-Location"));
    assertEquals(location, Bytes.toString(bb));
View Full Code Here

    MimeUtil mimeutil = new MimeUtil(conf);
    String mtype = mimeutil.getMimeType(file);
    page.setContentType(new Utf8(mtype));
    // Parse parse = parser.getParse(url, page);

    Parse parse = new ParseUtil(conf).parse(url, page);

    System.out.println("content type: " + mtype);
    System.out.println("title: " + parse.getTitle());
    System.out.println("text: " + parse.getText());
    System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks()));
View Full Code Here

    try {
      String urlString = "file:" + sampleDir + fileSeparator + fileName;    
      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
      Content content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();
      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
      metadata = parse.getData().getParseMeta();
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.toString());
    }
View Full Code Here

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);

    NutchDocument doc = new NutchDocument();
    Text urlText = new Text(url);

    Inlinks inlinks = null;
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseUtil

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.