Package net.nutch.parse

Examples of net.nutch.parse.ParseException


    rtfParser.setDelegate(delegate);

    try {
      rtfParser.parse();
    } catch (com.etranslate.tm.processing.rtf.ParseException e) {
      throw new ParseException("Exception parsing RTF document", e);
    }

    Properties metadata = new Properties();
    metadata.putAll(content.getMetadata());
    metadata.putAll(delegate.getMetaData());
View Full Code Here


  public Parse getParse(Content content) throws ParseException {

    // check that contentType is one we can handle
    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/msword"))
      throw new ParseException(
        "Content-Type not application/msword: "+contentType);

    String text = null;
    String title = null;
    Properties properties = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete msword file.");
      }

      WordExtractor extractor = new WordExtractor();

      // collect text
      text = extractor.extractText(new ByteArrayInputStream(raw));

      // collect meta info
      properties = extractor.extractProperties(new ByteArrayInputStream(raw));

      extractor = null;

    } catch (ParseException e) {
      throw e;
    } catch (FastSavedException e) {
      throw new ParseException(e);
    } catch (PasswordProtectedException e) {
      throw new ParseException(e);
    } catch (Exception e) { // run time exception
      throw new ParseException("Can't be handled as msword document. "+e);
    } finally {
      // nothing so far
    }

    // collect meta data
View Full Code Here

  public Parse getParse(Content content) throws ParseException {

    // check that contentType is one we can handle
    String contentType = content.getContentType();
    if (contentType != null && !contentType.startsWith("application/pdf"))
      throw new ParseException(
        "Content-Type not application/pdf: "+contentType);

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.");
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())

    } catch (ParseException e) {
      throw e;
    } catch (CryptographyException e) {
      throw new ParseException("Error decrypting document. "+e);
    } catch (InvalidPasswordException e) {
      throw new ParseException("Can't decrypt document. "+e);
    } catch (Exception e) { // run time exception
      throw new ParseException("Can't be handled as pdf document. "+e);
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
View Full Code Here

    String contentType = content.getContentType();

    String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
    if (params == null)
      throw new ParseException(
        "No external command defined for contentType: " + contentType);

    String command = params[0];
    int timeout = Integer.parseInt(params[1]);

    if (LOG.isLoggable(Level.FINE))
      LOG.fine("Use "+command+ " with timeout="+timeout+"secs");

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength =
        (String)content.getMetadata().get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete "+contentType+" file.");
      }

      ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
      ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE/4);

      CommandRunner cr = new CommandRunner();

      cr.setCommand(command+ " " +contentType);
      cr.setInputStream(new ByteArrayInputStream(raw));
      cr.setStdOutputStream(os);
      cr.setStdErrorStream(es);

      cr.setTimeout(timeout);

      cr.evaluate();

      if (cr.getExitValue() != 0)
        throw new ParseException("External command "+command
          +" failed with error: "+es.toString());

      text = os.toString();

    } catch (ParseException e) {
      throw e;
    } catch (Exception e) { // run time exception
      throw new ParseException("ExtParser failed. "+e);
    }

    if (text == null)
      text = "";
View Full Code Here

TOP

Related Classes of net.nutch.parse.ParseException

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.