Examples of ParseResult


Examples of org.apache.nutch.parse.ParseResult

      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content,
        parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(
            Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
View Full Code Here

Examples of org.apache.nutch.parse.ParseResult

    if (LOG.isInfoEnabled()) {
      LOG.info("parsing: " + url);
      LOG.info("contentType: " + contentType);
    }

    ParseResult parseResult = new ParseUtil(conf).parse(content);

    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);

    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);

    byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
        parse);
    parse.getData().getContentMeta()
        .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
View Full Code Here

Examples of org.apache.nutch.parse.ParseResult

  @Test
  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;

    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
      urlString = urlString.replace('\\', '/');

      protocol = new ProtocolFactory(conf).getProtocol(urlString);
      content = protocol.getProtocolOutput(new Text(urlString),
          new CrawlDatum()).getContent();

      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);

      Assert.assertEquals(3, parseResult.size());

      boolean hasLink1 = false, hasLink2 = false, hasLink3=false;

      for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
          .hasNext();) {
        Map.Entry<Text, Parse> entry = j.next();
        if (entry.getKey().toString().equals(
            "http://www-scf.usc.edu/~mattmann/")) {
          hasLink1 = true;
View Full Code Here

Examples of org.apache.nutch.parse.ParseResult

   *         were present in the feed file that this {@link Parser} dealt with.
   *
   */
  public ParseResult getParse(Content content) {
    SyndFeed feed = null;
    ParseResult parseResult = new ParseResult(content.getUrl());

    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, defaultEncoding);
    try {
      InputSource input = new InputSource(new ByteArrayInputStream(content
          .getContent()));
      input.setEncoding(encoding);
      SyndFeedInput feedInput = new SyndFeedInput();
      feed = feedInput.build(input);
    } catch (Exception e) {
      // return empty parse
      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
          + StringUtils.stringifyException(e));
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    String feedLink = feed.getLink();
    try {
      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
      if (feedLink != null)
        feedLink = filters.filter(feedLink);
    } catch (Exception e) {
      feedLink = null;
    }

    List<?> entries = feed.getEntries();
    for(Object entry: entries) {
      addToMap(parseResult, feed, feedLink, (SyndEntry)entry, content);
    }

    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());

    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));

    return parseResult;
  }
View Full Code Here

Examples of org.apache.nutch.parse.ParseResult

    parser.setConf(conf);
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    DataInputStream in = new DataInputStream(new FileInputStream(file));
    in.readFully(bytes);
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
        "application/rss+xml", new Metadata(), conf));
    for (Entry<Text, Parse> entry : parseResult) {
      System.out.println("key: " + entry.getKey());
      Parse parse = entry.getValue();
      System.out.println("data: " + parse.getData());
View Full Code Here

Examples of org.apache.nutch.parse.ParseResult

  private MetadataCollector metadataCollector;
  private Configuration conf;

  public ParseResult getParse(Content content) {

    ParseResult parse = null;
    byte[] raw = content.getContent();
    File tmp = null;

    try {
      tmp = File.createTempFile("nutch", ".mp3");
View Full Code Here

Examples of org.apache.nutch.parse.ParseResult

    metadataCollector.notifyProperty("TIT2-Text", tag.getTitle());
    metadataCollector.notifyProperty("TYER-Text", tag.getYear());
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
        metadataCollector.getTitle(), metadataCollector.getOutlinks(),
        contentMeta, metadataCollector.getData());
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(),
        new ParseImpl(metadataCollector.getText(), parseData));

    return parseResult;
  }
View Full Code Here

Examples of org.apache.nutch.parse.ParseResult

      }
    }
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
        metadataCollector.getTitle(), metadataCollector.getOutlinks(),
        contentMeta, metadataCollector.getData());
    ParseResult parseResult = ParseResult.createParseResult(content
        .getUrl(),
        new ParseImpl(metadataCollector.getText(), parseData));

    return parseResult;
  }
View Full Code Here

Examples of org.apache.nutch.parse.ParseResult

    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
      datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();
      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      }
      catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          e.printStackTrace(LogUtil.getWarnStream(LOG));
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }

      try {

        // parse the content
        parseResult = this.parseUtil.parse(content);
      }
      catch (Exception e) {
        LOG.warn("Error parsing: " + key + ": "
          + StringUtils.stringifyException(e));
      }

      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }

      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
              segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
              StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
              Long.toString(datum.getFetchTime()));
            if (url.equals(key))
              datum.setSignature(signature);
            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            }
            catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }
            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
        if (LOG.isFatalEnabled()) {
          LOG.fatal("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
        }
      }

      // return parse status if it exists
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      }
    }
View Full Code Here

Examples of org.eclipse.core.internal.databinding.conversion.StringToNumberParser.ParseResult

  /* (non-Javadoc)
   * @see org.eclipse.core.databinding.conversion.IConverter#convert(java.lang.Object)
   */
  public Object convert(Object fromObject) {
    ParseResult result = StringToNumberParser.parse(fromObject,
        numberFormat, primitive);

    if (result.getPosition() != null) {
      // this shouldn't happen in the pipeline as validation should catch
      // it but anyone can call convert so we should return a properly
      // formatted message in an exception
      throw new IllegalArgumentException(StringToNumberParser
          .createParseErrorMessage((String) fromObject, result
              .getPosition()));
    } else if (result.getNumber() == null) {
      // if an error didn't occur and the number is null then it's a boxed
      // type and null should be returned
      return null;
    }

    if (StringToNumberParser.inByteRange(result.getNumber())) {
      return new Byte(result.getNumber().byteValue());
    }
   
    synchronized (this) {
      if (outOfRangeMessage == null) {
        outOfRangeMessage = StringToNumberParser
View Full Code Here
TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.