Package org.pdfbox.pdfparser

Examples of org.pdfbox.pdfparser.PDFParser


    public Reader extract(InputStream content) throws ExtractorException
    {
        try
        {
            PDFParser parser = new PDFParser( content );
            parser.parse();

            PDDocument document = parser.getPDDocument();

            CharArrayWriter writer = new CharArrayWriter();

            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
View Full Code Here


        this.contentHandler.startElement(NAMESPACE, "document", PREFIX + ":document",
                new AttributesImpl());

        try {
            PDFTextStripper stripper = new PDFTextStripper();
            PDFParser parser = new PDFParser(this.content.getInputStream());
            parser.parse();
            PDDocument doc = parser.getPDDocument();
            String text = stripper.getText(doc);
            doc.close();
            char[] chars = text.toCharArray();
            this.contentHandler.characters(chars, 0, chars.length);
        } catch (Exception e) {
View Full Code Here

     */
    public Reader extractText(InputStream stream,
                              String type,
                              String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();

                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);

                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();
                    if (doc != null) {
                        doc.close();
                    }
                } catch (IOException e) {
                    // ignore
View Full Code Here

     */
    public Reader extractText(InputStream stream,
                              String type,
                              String encoding) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(stream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter writer = new CharArrayWriter();

                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);

                return new CharArrayReader(writer.toCharArray());
            } finally {
                try {
                    PDDocument doc = parser.getPDDocument();
                    if (doc != null) {
                        doc.close();
                    }
                } catch (IOException e) {
                    // ignore
View Full Code Here

        InternalValue[] values = data.getValues();
        if (values.length > 0) {
            final BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
            LazyReader reader = new LazyReader() {
                protected void initializeReader() throws IOException {
                    PDFParser parser = null;
                    InputStream in;
                    try {
                        in = blob.getStream();
                    } catch (RepositoryException e) {
                        throw new IOException(e.getMessage());
                    }

                    try {
                        parser = new PDFParser(new BufferedInputStream(in));
                        parser.parse();

                        PDDocument document = parser.getPDDocument();
                        try {
                            CharArrayWriter writer = new CharArrayWriter();

                            PDFTextStripper stripper = new PDFTextStripper();
                            stripper.setLineSeparator("\n");
                            stripper.writeText(document, writer);

                            delegate = new CharArrayReader(writer.toCharArray());
                        } finally {
                            document.close();
                        }
                    } catch (Exception e) {
                        // it may happen that PDFParser throws a runtime
                        // exception when parsing certain pdf documents

                        // JCR-764: Check if document is still open and
                        // close it appropriately. Otherwise some temporary
                        // files may get left behind and document finalization
                        // will log a warning.
                        if (parser != null) {
                            try {
                                parser.getDocument().close();
                            } catch (Exception ioe) {
                                // ignore, this means doc has not been generated
                            }
                        }
View Full Code Here

          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
      }

      PDFParser parser = new PDFParser(
        new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
View Full Code Here

public Reader getText(InputStream is,TempFiles tempFiles,Charset charset) throws ExtractionException  {
     logger.debug("extracting pdf file");
   File file = null;
     PDDocument document = null;
     try {
       PDFParser parser = new PDFParser(is);
       parser.parse();
       document = parser.getPDDocument();
       if (document.isEncrypted()) {
           DocumentEncryption decryptor = new DocumentEncryption(document);
           if (logger.isDebugEnabled()) {
               logger.debug("pdf document appears to be encrypted (will attempt decryption)");
          
View Full Code Here

    {
        COSDocument cos = null;

        try
        {
            PDFParser parser = new PDFParser(metadata);
            parser.parse();
            cos = parser.getDocument();

            // sanity check: PDFBox breaks on encrypted documents, so give up.
            if(cos.getEncryptionDictionary() != null)
                throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
View Full Code Here

    public static void main(String[] args) {
        try {
            String infile = args[0];
            String outfile = args[1];
            FileWriter out = new FileWriter(outfile);
            PDFParser parser =
                    new PDFParser(new FileInputStream(infile));
            parser.parse();
            PDDocument document = parser.getPDDocument();
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            int i = 0;
            for (Object page: catalog.getAllPages()) {
                i++;
                out.write("/Page " + i + " " + "/Rotate "
View Full Code Here

          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }

      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();

      pdf = parser.getPDDocument();

      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
View Full Code Here

TOP

Related Classes of org.pdfbox.pdfparser.PDFParser

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.