Examples of org.pdfbox.pdmodel.PDDocument

org.pdfbox.pdmodel.PDDocument
This is the in-memory representation of the PDF document. You must call close() on this object when you are done using it. Author: Ben Litchfield. Version: $Revision: 1.43 $


    public Reader convertToPlainText(InputStream source, WVTDocumentInfo d) {


        String plainText = null;
        try {
            PDDocument document = PDDocument.load(source);
            PDFTextStripper stripper = new PDFTextStripper();
            plainText = stripper.getText(document);
            document.close();
        } catch (IOException e) {
            WVToolLogger.getGlobalLogger().logException("Could not read or convert PDF Document", e);
            plainText = new String();
        }

View Full Code Here

    if (contentType != null && !contentType.startsWith("application/pdf"))
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
        "Content-Type not application/pdf: " + contentType).getEmptyParse();


    // in memory representation of pdf file
    PDDocument pdf = null;


    String text = null;
    String title = null;


    try {


      byte[] raw = content.getContent();


      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }


      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())


    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse();
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse();
    } catch (Exception e) { // run time exception
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse();
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }

View Full Code Here

    if (contentType != null && !contentType.startsWith("application/pdf"))
      throw new ParseException(
        "Content-Type not application/pdf: "+contentType);


    // in memory representation of pdf file
    PDDocument pdf = null;


    String text = null;
    String title = null;


    try {


      byte[] raw = content.getContent();


      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          throw new ParseException("Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.");
      }


      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())


    } catch (ParseException e) {
      throw e;
    } catch (CryptographyException e) {
      throw new ParseException("Error decrypting document. "+e);
    } catch (InvalidPasswordException e) {
      throw new ParseException("Can't decrypt document. "+e);
    } catch (Exception e) { // run time exception
      throw new ParseException("Can't be handled as pdf document. "+e);
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }

View Full Code Here

        try
        {
            PDFParser parser = new PDFParser( content );
            parser.parse();


            PDDocument document = parser.getPDDocument();


            CharArrayWriter writer = new CharArrayWriter();


            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);


            document.close();
            writer.close();


            return new CharArrayReader(writer.toCharArray());
        }
        catch(Exception e )

View Full Code Here


        try {
            PDFTextStripper stripper = new PDFTextStripper();
            PDFParser parser = new PDFParser(this.content.getInputStream());
            parser.parse();
            PDDocument doc = parser.getPDDocument();
            String text = stripper.getText(doc);
            doc.close();
            char[] chars = text.toCharArray();
            this.contentHandler.characters(chars, 0, chars.length);
        } catch (Exception e) {
            throw new ProcessingException(e);
        }

View Full Code Here

                              String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);


                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();
                    if (doc != null) {
                        doc.close();
                    }
                } catch (IOException e) {
                    // ignore
                }
            }

View Full Code Here

                              String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);


                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();
                    if (doc != null) {
                        doc.close();
                    }
                } catch (IOException e) {
                    // ignore
                }
            }

View Full Code Here


                    try {
                        parser = new PDFParser(new BufferedInputStream(in));
                        parser.parse();


                        PDDocument document = parser.getPDDocument();
                        try {
                            CharArrayWriter writer = new CharArrayWriter();


                            PDFTextStripper stripper = new PDFTextStripper();
                            stripper.setLineSeparator("\n");
                            stripper.writeText(document, writer);


                            delegate = new CharArrayReader(writer.toCharArray());
                        } finally {
                            document.close();
                        }
                    } catch (Exception e) {
                        // it may happen that PDFParser throws a runtime
                        // exception when parsing certain pdf documents

View Full Code Here

    if (contentType != null && !contentType.startsWith("application/pdf"))
      return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
        "Content-Type not application/pdf: " + contentType).getEmptyParse();


    // in memory representation of pdf file
    PDDocument pdf = null;


    String text = null;
    String title = null;


    try {


      byte[] raw = content.getContent();


      String contentLength = content.get("Content-Length");
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }


      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
      // info.getCreator()
      // info.getProducer()
      // info.getTrapped()
      // formatDate(info.getCreationDate())
      // formatDate(info.getModificationDate())


    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse();
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse();
    } catch (Exception e) { // run time exception
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse();
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }

View Full Code Here

 }


 public Reader getText(InputStream is,TempFiles tempFiles,Charset charset) throws ExtractionException  {
     logger.debug("extracting pdf file");
   File file = null;
     PDDocument document = null;
     try {
       PDFParser parser = new PDFParser(is);
       parser.parse();
       document = parser.getPDDocument();
       if (document.isEncrypted()) {
           DocumentEncryption decryptor = new DocumentEncryption(document);
           if (logger.isDebugEnabled()) {
               logger.debug("pdf document appears to be encrypted (will attempt decryption)");
           
           }
           decryptor.decryptDocument("");
       }
       file = File.createTempFile("extract", ".tmp");
       tempFiles.markForDeletion(file);
       Writer output = null;
       output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
       PDFTextStripper stripper = new PDFTextStripper();
       stripper.writeText(document, output);
       output.close();
      /*logger.debug("PDF extraction completed");
       BufferedReader reader;
       try {
         reader = new BufferedReader(new FileReader(file));
        String line = null;
        while( (line=reader.readLine()) != null) {
          logger.debug("PDF>"+line);
        }
        reader.close();
       } catch(Exception e) {
         logger.error("failed to open txt file",e);
       }*/
     } catch (Exception e) {
         throw new ExtractionException("failed to extract pdf (probable password protected document)",e,logger,Level.DEBUG);
     } finally {
       try {
         if(document != null)
             document.close();
       } catch (IOException io) {}
     }
     try {
         logger.debug("returning extracted PDF data");
          return new FileReader(file);

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.pdfbox.pdmodel.PDDocument

br.com.caelum.stella.boleto.transformer.BoletoTransformerIntegrationTest

com.stimulus.archiva.extraction.PDFExtractor

eu.lsem.bakalarka.filetypeprocess.document.PdfDocumentParser

net.sf.jabref.util.XMPUtil

org.apache.jackrabbit.core.query.PdfTextFilter

org.apache.jackrabbit.extractor.PdfTextExtractor

org.apache.nutch.parse.pdf.PdfParser

org.exoplatform.services.document.impl.PDFDocumentReader

org.infoglue.cms.controllers.kernel.impl.simple.LuceneController

org.pdfbox.cos.COSArray

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.