Package org.apache.nutch.parse

Examples of org.apache.nutch.parse.ParseStatus


    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(),
          getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(mimeType);
    byte[] raw = content.getContent();

    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type "
          + mimeType;
      LOG.error(message);
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    LOG.debug("Using Tika parser " + parser.getClass().getName()
        + " for mime-type " + mimeType);

    Metadata tikamd = new Metadata();

    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,context);
    } catch (Exception e) {
      LOG.error("Error parsing "+content.getUrl(),e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in "
            + content.getUrl());
      }
    }

    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
        continue;
      // TODO what if multivalued?
      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
View Full Code Here


    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_TYPE, source);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
    assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
  }
View Full Code Here

      }

      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }

      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry <Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
            byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
              content, parse);
View Full Code Here

        Content content = new Content(urlStr, urlStr, bytes.get(), contentType,
          new Metadata(), getConf());
       
        // set the url version into the metadata
        content.getMetadata().set(URL_VERSION, version);
        ParseStatus pstatus = null;
        pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
        reporter.progress();
      }
      catch (Throwable t) { // unexpected exception
View Full Code Here

    if (outlinks.size() > 0) {
      Outlink[] old = parse.getData().getOutlinks();
      String title = parse.getData().getTitle();
      List list = Arrays.asList(old);
      outlinks.addAll(list);
      ParseStatus status = parse.getData().getStatus();
      String text = parse.getText();
      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
      ParseData parseData = new ParseData(status, title, newlinks,
                                          parse.getData().getContentMeta(),
                                          parse.getData().getParseMeta());
View Full Code Here

  }
 
  public ParseResult getParse(Content c) {
    String type = c.getContentType();
    if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
              "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
    String script = new String(c.getContent());
    Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
    if (outlinks == null) outlinks = new Outlink[0];
    // Title? use the first line of the script...
View Full Code Here

    String contentType = content.getContentType();

    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null)
      return new ParseStatus(ParseStatus.FAILED,
                      "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());

    String command = params[0];
    int timeout = Integer.parseInt(params[1]);
    String encoding = params[2];

    if (LOG.isTraceEnabled()) {
      LOG.trace("Use "+command+ " with timeout="+timeout+"secs");
    }

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                "Content truncated at " + raw.length
            +" bytes. Parser can't handle incomplete "
            + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
      }

      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);

      CommandRunner cr = new CommandRunner();

      cr.setCommand(command+ " " +contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);

      cr.setTimeout(timeout);

      cr.evaluate();

      if (cr.getExitValue() != 0)
        return new ParseStatus(ParseStatus.FAILED,
                        "External command " + command
                        + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());

      text = os.toString(encoding);

    } catch (Exception e) { // run time exception
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    if (text == null)
      text = "";
View Full Code Here

    } catch (IOException e) {
      // This can never happen because it's an in memory stream
    } catch(PatternSyntaxException e) {
      System.err.println(e.getMessage());
      log.error("Error parsing urlRegex: " + e.getMessage());
      return new ParseStatus(ParseStatus.FAILED, "Error parsing urlRegex: " + e.getMessage()).getEmptyParseResult(content.getUrl(), configuration);
    } catch (ParserConfigurationException e) {
      System.err.println(e.getMessage());
      log.error("HTML Cleaning error: " + e.getMessage());
      return new ParseStatus(ParseStatus.FAILED, "HTML Cleaning error: " + e.getMessage()).getEmptyParseResult(content.getUrl(), configuration);
    } catch (SAXException e) {
      System.err.println(e.getMessage());
      log.error("XML parsing error: " + e.getMessage());
      return new ParseStatus(ParseStatus.FAILED, "XML parsing error: " + e.getMessage()).getEmptyParseResult(content.getUrl(), configuration);
    } catch (JaxenException e) {
      System.err.println(e.getMessage());
      log.error("XPath error: " + e.getMessage());
      return new ParseStatus(ParseStatus.FAILED, "XPath error: " + e.getMessage()).getEmptyParseResult(content.getUrl(), configuration);
    }
   
    return parseResult;
  }
View Full Code Here

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(),
          getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(mimeType);
    byte[] raw = content.getContent();

    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type "
          + mimeType;
      LOG.error(message);
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    LOG.debug("Using Tika parser " + parser.getClass().getName()
        + " for mime-type " + mimeType);

    Metadata tikamd = new Metadata();

    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd,context);
    } catch (Exception e) {
      LOG.error("Error parsing "+content.getUrl(),e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in "
            + content.getUrl());
      }
    }

    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
        continue;
      // TODO what if multivalued?
      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(new String[] { metaTags.getRefreshHref().toString(),
          Integer.toString(metaTags.getRefreshTime()) });
    }
    ParseData parseData = new ParseData(status, title, outlinks, content
        .getMetadata(), nutchMetadata);
    ParseResult parseResult = ParseResult.createParseResult(content
View Full Code Here

      byte[] raw = content.getContent();

      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse(getConf());
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));       
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
View Full Code Here

TOP

Related Classes of org.apache.nutch.parse.ParseStatus

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.