Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseStatus


      Outlink[] old = parse.getData().getOutlinks();
      Properties metadata = parse.getData().getMetadata();
      String title = parse.getData().getTitle();
      List list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      parse = new ParseImpl(text, new ParseData(status, title, newlinks, metadata));
    }
    return parse;
View Full Code Here


  }
 
  public Parse getParse(Content c) {
    String type = c.getContentType();
    if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
              "Content not JavaScript: '" + type + "'").getEmptyParse();
    String script = new String(c.getContent());
    Outlink[] outlinks = getJSLinks(script, c.getUrl(), c.getUrl());
    if (outlinks == null) outlinks = new Outlink[0];
    // Title? use the first line of the script...
View Full Code Here

    String contentType = content.getContentType();

    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null)
      return new ParseStatus(ParseStatus.FAILED,
                      "No external command defined for contentType: " + contentType).getEmptyParse();

    String command = params[0];
    int timeout = Integer.parseInt(params[1]);

    if (LOG.isLoggable(Level.FINE))
      LOG.fine("Use "+command+ " with timeout="+timeout+"secs");

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength =
        (String)content.getMetadata().get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                "Content truncated at " + raw.length
            +" bytes. Parser can't handle incomplete "
            + contentType + " file.").getEmptyParse();
      }

      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);

      CommandRunner cr = new CommandRunner();

      cr.setCommand(command+ " " +contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);

      cr.setTimeout(timeout);

      cr.evaluate();

      if (cr.getExitValue() != 0)
        return new ParseStatus(ParseStatus.FAILED,
                        "External command " + command
                        + " failed with error: " + es.toString()).getEmptyParse();

      text = os.toString();

    } catch (Exception e) { // run time exception
      return new ParseStatus(e).getEmptyParse();
    }

    if (text == null)
      text = "";
View Full Code Here

        // check that contentType is one we can handle
        String contentType = content.getContentType();
        if (contentType != null
                && (!contentType.startsWith("text/xml") && !contentType
                        .startsWith("application/rss+xml")))
            return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
                    "Content-Type not text/xml or application/rss+xml: "
                            + contentType).getEmptyParse();

        List theRSSChannels = null;

        try {
            byte[] raw = content.getContent();

            // create a new FeedParser...
            FeedParser parser = FeedParserFactory.newFeedParser();

            // create a listener for handling our callbacks
            FeedParserListener listener = new FeedParserListenerImpl();

            // start parsing our feed and have the onItem methods called
            parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
            null);

            theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();

        } catch (Exception e) { // run time exception
            e.printStackTrace();
            LOG.fine("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
            return new ParseStatus(ParseStatus.FAILED,
                    "Can't be handled as rss document. " + e).getEmptyParse();
        }

        StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
        List theOutlinks = new Vector();
View Full Code Here

  public Parse getParse(Content content) {

    // check that contentType is one we can handle
    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/msword"))
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
        "Content-Type not application/msword: " + contentType).getEmptyParse();

    String text = null;
    String title = null;
    Properties properties = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at " + raw.length
            +" bytes. Parser can't handle incomplete msword file.").getEmptyParse();
      }

      WordExtractor extractor = new WordExtractor();

      // collect text
      text = extractor.extractText(new ByteArrayInputStream(raw));

      // collect meta info
      properties = extractor.extractProperties(new ByteArrayInputStream(raw));

      extractor = null;

    } catch (ParseException e) {
      return new ParseStatus(e).getEmptyParse();
    } catch (FastSavedException e) {
      return new ParseStatus(e).getEmptyParse();
    } catch (PasswordProtectedException e) {
      return new ParseStatus(e).getEmptyParse();
    } catch (Exception e) { // run time exception
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as msword document. " + e).getEmptyParse();
    } finally {
      // nothing so far
    }
View Full Code Here

  public Parse getParse(Content content) {

    // check that contentType is one we can handle
    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/pdf"))
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
        "Content-Type not application/pdf: " + contentType).getEmptyParse();

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse();
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse();
    } catch (Exception e) { // run time exception
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse();
    } finally {
      try {
        if (pdf != null)
          pdf.close();
View Full Code Here

      if (mp3.hasID3v2Tag()) {
        parse = getID3v2Parse(mp3, content.getMetadata(), content);
      } else if (mp3.hasID3v1Tag()) {
        parse = getID3v1Parse(mp3, content.getMetadata(), content);
      } else {
        return new ParseStatus().getEmptyParseResult(content.getUrl(),
            getConf());
      }
    } catch (IOException e) {
      return new ParseStatus().getEmptyParseResult(content.getUrl(),
          getConf());
    } catch (TagException e) {
      return new ParseStatus().getEmptyParseResult(content.getUrl(),
          getConf());
    } finally {
      tmp.delete();
    }
View Full Code Here

    try {
      byte[] raw = content.getContent();
      String contentLength = content.getMetadata().get(Metadata.CONTENT_LENGTH);
      if ((contentLength != null) &&
          (raw.length != Integer.parseInt(contentLength))) {
        return new ParseStatus(ParseStatus.FAILED,
                               ParseStatus.FAILED_TRUNCATED,
                               "Content truncated at " + raw.length +" bytes. " +
                               "Parser can't handle incomplete file.")
                               .getEmptyParseResult(content.getUrl(), this.conf);
      }
      extractor.extract(new ByteArrayInputStream(raw));
      text = extractor.getText();
      properties = extractor.getProperties();
      outlinks = OutlinkExtractor.getOutlinks(text, content.getUrl(), getConf());
     
    } catch (Exception e) {
      return new ParseStatus(ParseStatus.FAILED,
                             "Can't be handled as Microsoft document. " + e)
                             .getEmptyParseResult(content.getUrl(), this.conf);
    }
   
    // collect meta data
View Full Code Here

    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here

      }

      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }

      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseStatus

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.