Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseStatus


    if (outlinks.size() > 0) {
      Outlink[] old = parse.getData().getOutlinks();
      String title = parse.getData().getTitle();
      List list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());
View Full Code Here


  }
 
  public Parse getParse(Content c) {
    String type = c.getContentType();
    if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
              "Content not JavaScript: '" + type + "'").getEmptyParse(getConf());
    String script = new String(c.getContent());
    Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl());
    if (outlinks == null) outlinks = new Outlink[0];
    // Title? use the first line of the script...
View Full Code Here

      final ByteArrayInputStream bainput = new ByteArrayInputStream(
          contentInBytes);
      final InputStream input = bainput;

      if (contentLen != null && contentInBytes.length != len) {
        return new ParseStatus(ParseStatus.FAILED,
            ParseStatus.FAILED_TRUNCATED, "Content truncated at "
                + contentInBytes.length
                + " bytes. Parser can't handle incomplete pdf file.")
            .getEmptyParse(getConf());
      }

      ZipTextExtractor extractor = new ZipTextExtractor(getConf());

      // extract text
      resultText = extractor.extractText(new ByteArrayInputStream(
          contentInBytes), content.getUrl(), outLinksList);

    } catch (Exception e) {
      return new ParseStatus(ParseStatus.FAILED,
          "Can't be handled as Zip document. " + e).getEmptyParse(getConf());
    }

    if (resultText == null) {
      resultText = "";
View Full Code Here

    rtfParser.setDelegate(delegate);

    try {
      rtfParser.parse();
    } catch (ParseException e) {
        return new ParseStatus(ParseStatus.FAILED,
                               ParseStatus.FAILED_EXCEPTION,
                               e.toString()).getEmptyParse(conf);
    }

    Metadata metadata = new Metadata();
View Full Code Here

      if (mp3.hasID3v2Tag()) {
        parse = getID3v2Parse(mp3, content.getMetadata());
      } else if (mp3.hasID3v1Tag()) {
        parse = getID3v1Parse(mp3, content.getMetadata());
      } else {
        return new ParseStatus(ParseStatus.FAILED,
                               ParseStatus.FAILED_MISSING_CONTENT,
                               "No textual content available").getEmptyParse(conf);
      }
    } catch (IOException e) {
      return new ParseStatus(ParseStatus.FAILED,
                             ParseStatus.FAILED_EXCEPTION,
                             "Couldn't create temporary file:" + e).getEmptyParse(conf);
    } catch (TagException e) {
      return new ParseStatus(ParseStatus.FAILED,
                             ParseStatus.FAILED_EXCEPTION,
                             "ID3 Tags could not be parsed:" + e).getEmptyParse(conf);
    } finally{
      tmp.delete();
    }
View Full Code Here

    String contentType = content.getContentType();

    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null)
      return new ParseStatus(ParseStatus.FAILED,
                      "No external command defined for contentType: " + contentType).getEmptyParse(getConf());

    String command = params[0];
    int timeout = Integer.parseInt(params[1]);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Use "+command+ " with timeout="+timeout+"secs");
    }

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                "Content truncated at " + raw.length
            +" bytes. Parser can't handle incomplete "
            + contentType + " file.").getEmptyParse(getConf());
      }

      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);

      CommandRunner cr = new CommandRunner();

      cr.setCommand(command+ " " +contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);

      cr.setTimeout(timeout);

      cr.evaluate();

      if (cr.getExitValue() != 0)
        return new ParseStatus(ParseStatus.FAILED,
                        "External command " + command
                        + " failed with error: " + es.toString()).getEmptyParse(getConf());

      text = os.toString();

    } catch (Exception e) { // run time exception
      return new ParseStatus(e).getEmptyParse(getConf());
    }

    if (text == null)
      text = "";
View Full Code Here

    try {
      byte[] raw = content.getContent();
      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
      if ((contentLength != null) &&
          (raw.length != Integer.parseInt(contentLength))) {
        return new ParseStatus(ParseStatus.FAILED,
                               ParseStatus.FAILED_TRUNCATED,
                               "Content truncated at " + raw.length +" bytes. " +
                               "Parser can't handle incomplete file.")
                               .getEmptyParse(this.conf);
      }
      extractor.extract(new ByteArrayInputStream(raw));
      text = extractor.getText();
      properties = extractor.getProperties();
      outlinks = OutlinkExtractor.getOutlinks(text, content.getUrl(), getConf());
     
    } catch (Exception e) {
      return new ParseStatus(ParseStatus.FAILED,
                             "Can't be handled as micrsosoft document. " + e)
                             .getEmptyParse(this.conf);
    }
   
    // collect meta data
View Full Code Here

    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);

    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
  }
View Full Code Here

      final ByteArrayInputStream bainput = new ByteArrayInputStream(
          contentInBytes);
      final InputStream input = bainput;

      if (contentLen != null && contentInBytes.length != len) {
        return new ParseStatus(ParseStatus.FAILED,
            ParseStatus.FAILED_TRUNCATED, "Content truncated at "
                + contentInBytes.length
                + " bytes. Parser can't handle incomplete pdf file.")
            .getEmptyParseResult(content.getUrl(), getConf());
      }

      ZipTextExtractor extractor = new ZipTextExtractor(getConf());

      // extract text
      resultText = extractor.extractText(new ByteArrayInputStream(
          contentInBytes), content.getUrl(), outLinksList);

    } catch (Exception e) {
      return new ParseStatus(ParseStatus.FAILED,
          "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    }

    if (resultText == null) {
      resultText = "";
View Full Code Here

      feed = feedInput.build(input);
    } catch (Exception e) {
      // return empty parse
      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
          + StringUtils.stringifyException(e));
      return new ParseStatus(e)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    List entries = feed.getEntries();
    String feedLink = feed.getLink();
    try {
      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
      if (feedLink != null)
        feedLink = filters.filter(feedLink);
    } catch (Exception e) {
      feedLink = null;
    }

    for (Iterator i = entries.iterator(); i.hasNext();) {
      SyndEntry entry = (SyndEntry) i.next();
      addToMap(parseResult, feed, feedLink, entry, content);
    }

    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());

    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
        content.getMetadata()));

    return parseResult;
  }
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseStatus

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.