Package org.pdfbox.pdmodel

Examples of org.pdfbox.pdmodel.PDDocument


    public Reader convertToPlainText(InputStream source, WVTDocumentInfo d) {

        String plainText = null;
        try {
            PDDocument document = PDDocument.load(source);
            PDFTextStripper stripper = new PDFTextStripper();
            plainText = stripper.getText(document);
            document.close();
        } catch (IOException e) {
            WVToolLogger.getGlobalLogger().logException("Could not read or convert PDF Document", e);
            plainText = new String();
        }
View Full Code Here


    if (contentType != null && !contentType.startsWith("application/pdf"))
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
        "Content-Type not application/pdf: " + contentType).getEmptyParse();

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse();
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse();
    } catch (Exception e) { // run time exception
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse();
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }
View Full Code Here

    if (contentType != null && !contentType.startsWith("application/pdf"))
      throw new ParseException(
        "Content-Type not application/pdf: "+contentType);

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.");
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())

    } catch (ParseException e) {
      throw e;
    } catch (CryptographyException e) {
      throw new ParseException("Error decrypting document. "+e);
    } catch (InvalidPasswordException e) {
      throw new ParseException("Can't decrypt document. "+e);
    } catch (Exception e) { // run time exception
      throw new ParseException("Can't be handled as pdf document. "+e);
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }
View Full Code Here

        try
        {
            PDFParser parser = new PDFParser( content );
            parser.parse();

            PDDocument document = parser.getPDDocument();

            CharArrayWriter writer = new CharArrayWriter();

            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);

            document.close();
            writer.close();

            return new CharArrayReader(writer.toCharArray());
        }
        catch(Exception e )
View Full Code Here

        try {
            PDFTextStripper stripper = new PDFTextStripper();
            PDFParser parser = new PDFParser(this.content.getInputStream());
            parser.parse();
            PDDocument doc = parser.getPDDocument();
            String text = stripper.getText(doc);
            doc.close();
            char[] chars = text.toCharArray();
            this.contentHandler.characters(chars, 0, chars.length);
        } catch (Exception e) {
            throw new ProcessingException(e);
        }
View Full Code Here

                              String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();

                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);

                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();
                    if (doc != null) {
                        doc.close();
                    }
                } catch (IOException e) {
                    // ignore
                }
            }
View Full Code Here

                              String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();

                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);

                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();
                    if (doc != null) {
                        doc.close();
                    }
                } catch (IOException e) {
                    // ignore
                }
            }
View Full Code Here

                    try {
                        parser = new PDFParser(new BufferedInputStream(in));
                        parser.parse();

                        PDDocument document = parser.getPDDocument();
                        try {
                            CharArrayWriter writer = new CharArrayWriter();

                            PDFTextStripper stripper = new PDFTextStripper();
                            stripper.setLineSeparator("\n");
                            stripper.writeText(document, writer);

                            delegate = new CharArrayReader(writer.toCharArray());
                        } finally {
                            document.close();
                        }
                    } catch (Exception e) {
                        // it may happen that PDFParser throws a runtime
                        // exception when parsing certain pdf documents
View Full Code Here

    if (contentType != null && !contentType.startsWith("application/pdf"))
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
        "Content-Type not application/pdf: " + contentType).getEmptyParse();

    // in memory representation of pdf file
    PDDocument pdf = null;

    String text = null;
    String title = null;

    try {

      byte[] raw = content.getContent();

      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())

    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse();
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse();
    } catch (Exception e) { // run time exception
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse();
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }
View Full Code Here

}

public Reader getText(InputStream is,TempFiles tempFiles,Charset charset) throws ExtractionException  {
     logger.debug("extracting pdf file");
   File file = null;
     PDDocument document = null;
     try {
       PDFParser parser = new PDFParser(is);
       parser.parse();
       document = parser.getPDDocument();
       if (document.isEncrypted()) {
           DocumentEncryption decryptor = new DocumentEncryption(document);
           if (logger.isDebugEnabled()) {
               logger.debug("pdf document appears to be encrypted (will attempt decryption)");
          
           }
           decryptor.decryptDocument("");
       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
      /*logger.debug("PDF extraction completed");
       BufferedReader reader;
       try {
         reader = new BufferedReader(new FileReader(file));
        String line = null;
        while( (line=reader.readLine()) != null) {
          logger.debug("PDF>"+line);
        }
        reader.close();
       } catch(Exception e) {
         logger.error("failed to open txt file",e);
       }*/
     } catch (Exception e) {
         throw new ExtractionException("failed to extract pdf (probable password protected document)",e,logger,Level.DEBUG);
     } finally {
       try {
         if(document != null)
             document.close();
       } catch (IOException io) {}
     }
     try {
         logger.debug("returning extracted PDF data");
          return new FileReader(file);
View Full Code Here

TOP

Related Classes of org.pdfbox.pdmodel.PDDocument

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.