Examples of org.pdfbox.pdfparser.PDFParser

org.pdfbox.pdfparser.PDFParser
This class handles the parsing of a PDF document. Author: Ben Litchfield. Version: $Revision: 1.53 $


    public Reader extract(InputStream content)  throws ExtractorException
    {
        try
        {
            PDFParser parser = new PDFParser( content );
            parser.parse();


            PDDocument document = parser.getPDDocument();


            CharArrayWriter writer = new CharArrayWriter();


            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");

View Full Code Here

        this.contentHandler.startElement(NAMESPACE, "document", PREFIX + ":document",
                new AttributesImpl());


        try {
            PDFTextStripper stripper = new PDFTextStripper();
            PDFParser parser = new PDFParser(this.content.getInputStream());
            parser.parse();
            PDDocument doc = parser.getPDDocument();
            String text = stripper.getText(doc);
            doc.close();
            char[] chars = text.toCharArray();
            this.contentHandler.characters(chars, 0, chars.length);
        } catch (Exception e) {

View Full Code Here

     */
    public Reader extractText(InputStream stream,
                              String type,
                              String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);


                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();
                    if (doc != null) {
                        doc.close();
                    }
                } catch (IOException e) {
                    // ignore

View Full Code Here

     */
    public Reader extractText(InputStream stream,
                              String type,
                              String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);


                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();
                    if (doc != null) {
                        doc.close();
                    }
                } catch (IOException e) {
                    // ignore

View Full Code Here

        InternalValue[] values = data.getValues();
        if (values.length > 0) {
            final BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
            LazyReader reader = new LazyReader() {
                protected void initializeReader() throws IOException {
                    PDFParser parser = null;
                    InputStream in;
                    try {
                        in = blob.getStream();
                    } catch (RepositoryException e) {
                        throw new IOException(e.getMessage());
                    }


                    try {
                        parser = new PDFParser(new BufferedInputStream(in));
                        parser.parse();


                        PDDocument document = parser.getPDDocument();
                        try {
                            CharArrayWriter writer = new CharArrayWriter();


                            PDFTextStripper stripper = new PDFTextStripper();
                            stripper.setLineSeparator("\n");
                            stripper.writeText(document, writer);


                            delegate = new CharArrayReader(writer.toCharArray());
                        } finally {
                            document.close();
                        }
                    } catch (Exception e) {
                        // it may happen that PDFParser throws a runtime
                        // exception when parsing certain pdf documents


                        // JCR-764: Check if document is still open and
                        // close it appropriately. Otherwise some temporary
                        // files may get left behind and document finalization
                        // will log a warning.
                        if (parser != null) {
                            try {
                                parser.getDocument().close();
                            } catch (Exception ioe) {
                                // ignore, this means doc has not been generated
                            }
                        }

View Full Code Here

          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }


      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");

View Full Code Here

 public Reader getText(InputStream is,TempFiles tempFiles,Charset charset) throws ExtractionException  {
     logger.debug("extracting pdf file");
   File file = null;
     PDDocument document = null;
     try {
       PDFParser parser = new PDFParser(is);
       parser.parse();
       document = parser.getPDDocument();
       if (document.isEncrypted()) {
           DocumentEncryption decryptor = new DocumentEncryption(document);
           if (logger.isDebugEnabled()) {
               logger.debug("pdf document appears to be encrypted (will attempt decryption)");

View Full Code Here

    {
        COSDocument cos = null;


        try
        {
            PDFParser parser = new PDFParser(metadata);
            parser.parse();
            cos = parser.getDocument();


            // sanity check: PDFBox breaks on encrypted documents, so give up.
            if(cos.getEncryptionDictionary() != null)
                throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");

View Full Code Here

    public static void main(String[] args) {
        try {
            String infile = args[0];
            String outfile = args[1];
            FileWriter out = new FileWriter(outfile);
            PDFParser parser =
                    new PDFParser(new FileInputStream(infile));
            parser.parse();
            PDDocument document = parser.getPDDocument();
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            int i = 0;
            for (Object page: catalog.getAllPages()) {
                i++;
                out.write("/Page " + i + " " + "/Rotate "

View Full Code Here

          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }


      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");

View Full Code Here

0 1 2 3 4

TOP

Related Classes of org.pdfbox.pdfparser.PDFParser

com.stimulus.archiva.extraction.PDFExtractor

com.stimulus.archiva.persistence.textextraction.PDFExtractor

de.spotnik.mail.core.message.content.PDFHandler

ExtractRotate

net.nutch.parse.pdf.PdfParser

org.apache.jackrabbit.core.query.PdfTextFilter

org.apache.jackrabbit.extractor.PdfTextExtractor

org.apache.lenya.modules.resource.PdfToTextGenerator

org.apache.nutch.parse.pdf.PdfParser

org.apache.slide.extractor.PDFExtractor

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact: coftware#gmail.com.