Package org.apache.pdfbox.pdmodel

Examples of org.apache.pdfbox.pdmodel.PDDocument


  @SuppressWarnings("unchecked")
  public void prepare(RawDocument rawDocument) throws RegainException {
    String url = rawDocument.getUrl();

    InputStream stream = null;
    PDDocument pdfDocument = null;

    try {
      // Create a InputStream that reads the content.
      stream = rawDocument.getContentAsStream();

      // Parse the content
      PDFParser parser = new PDFParser(stream);
      parser.parse();
      pdfDocument = parser.getPDDocument();

      // Decrypt the PDF-Dokument
      if (pdfDocument.isEncrypted()) {
        mLog.debug("Document is encrypted: " + url);
        StandardDecryptionMaterial sdm = new StandardDecryptionMaterial("");
        pdfDocument.openProtection(sdm);
        AccessPermission ap = pdfDocument.getCurrentAccessPermission();

        if (!ap.canExtractContent()) {
          throw new RegainException("Document is encrypted and can't be opened: " + url);
        }
      }

      // Extract the text with a utility class
      PDFTextStripper stripper = new PDFTextStripper();
      stripper.setSuppressDuplicateOverlappingText(false);
      stripper.setSortByPosition(true);
      stripper.setStartPage(1);
      stripper.setEndPage(Integer.MAX_VALUE);

      setCleanedContent(stripper.getText(pdfDocument).replaceAll("visiblespace", " "));

      // extract annotations
      StringBuilder annotsResult = new StringBuilder();
      List allPages = pdfDocument.getDocumentCatalog().getAllPages();
      for (int i = 0; i < allPages.size(); i++) {
        int pageNum = i + 1;
        PDPage page = (PDPage) allPages.get(i);
        List<PDAnnotation> annotations = page.getAnnotations();
        if (annotations.size() < 1) {
          continue;
        }
        mLog.debug("Total annotations = " + annotations.size());
        mLog.debug("\nProcess Page " + pageNum + "...");
        for (PDAnnotation annotation : annotations) {
          if (annotation.getContents() != null && annotation.getContents().length() > 0) {
            annotsResult.append(annotation.getContents());
            annotsResult.append(" ");
            mLog.debug("Text from annotation: " + annotation.getContents());
          }
        }
      }
      if (annotsResult.length() > 0) {
        setCleanedContent(getCleanedContent() + " Annotations " + annotsResult.toString());
      }

      // Get the meta data
      PDDocumentInformation info = pdfDocument.getDocumentInformation();
      StringBuilder metaData = new StringBuilder();
      metaData.append("p.");
      metaData.append(Integer.toString(pdfDocument.getNumberOfPages()));
      metaData.append(" ");

      // Check if fields are null
      if (info.getAuthor() != null) {
        metaData.append(info.getAuthor());
        metaData.append(" ");
      }
      if (info.getSubject() != null) {
        metaData.append(info.getSubject());
        metaData.append(" ");
      }
      if (info.getKeywords() != null) {
        metaData.append(info.getKeywords());
        metaData.append(" ");
      }

      if (info.getTitle() != null) {
        setTitle(info.getTitle());
      }

      setCleanedMetaData(metaData.toString());
      if (mLog.isDebugEnabled()) {
        mLog.debug("Extracted meta data ::" + getCleanedMetaData()
                + ":: from " + rawDocument.getUrl());
      }

    } catch (CryptographyException exc) {
      throw new RegainException("Error decrypting document: " + url, exc);

    } catch (BadSecurityHandlerException exc) {
      // They didn't supply a password and the default of "" was wrong.
      throw new RegainException("Document is encrypted: " + url, exc);

    } catch (IOException exc) {
      throw new RegainException("Error reading document: " + url, exc);

    } finally {
      if (stream != null) {
        try {
          stream.close();
        } catch (Exception exc) {
        }
      }
      if (pdfDocument != null) {
        try {
          pdfDocument.close();
        } catch (Exception exc) {
        }
      }
    }
  }
View Full Code Here


        // check memory for parser
        if (!MemoryControl.request(200 * 1024 * 1024, true))
            throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

        // create a pdf parser
        PDDocument pdfDoc = null;
        //final PDFParser pdfParser;
        try {
            Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
            pdfDoc = PDDocument.load(source);
            //pdfParser = new PDFParser(source);
            //pdfParser.parse();
            //pdfDoc = pdfParser.getPDDocument();
        } catch (final IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        } finally {
            Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
        }

        if (pdfDoc.isEncrypted()) {
            try {
                pdfDoc.openProtection(new StandardDecryptionMaterial(""));
            } catch (final BadSecurityHandlerException e) {
                try {pdfDoc.close();} catch (final IOException ee) {}
                throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
            } catch (final IOException e) {
                try {pdfDoc.close();} catch (final IOException ee) {}
                throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
            } catch (final CryptographyException e) {
                try {pdfDoc.close();} catch (final IOException ee) {}
                throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
            }
            final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
            if (perm == null || !perm.canExtractContent())
                throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }

        // extracting some metadata
        final PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
        if (info != null) {
            docTitle = info.getTitle();
            docSubject = info.getSubject();
            docAuthor = info.getAuthor();
            docPublisher = info.getProducer();
            if (docPublisher == null || docPublisher.length() == 0) docPublisher = info.getCreator();
            docKeywordStr = info.getKeywords();
            // unused:
            // info.getTrapped());
            // info.getCreationDate());
            // info.getModificationDate();
        }

        if (docTitle == null || docTitle.length() == 0) {
            docTitle = MultiProtocolURI.unescape(location.getFileName());
        }
        CharBuffer writer = null;
        try {
            // create a writer for output
            PDFTextStripper stripper = null;
            writer = new CharBuffer();
            stripper = new PDFTextStripper();
            stripper.writeText(pdfDoc, writer); // may throw a NPE
            pdfDoc.close();
            writer.close();
        } catch (final IOException e) {
            // close the writer
            if (writer != null) try { writer.close(); } catch (final Exception ex) {}
            try {pdfDoc.close();} catch (final IOException ee) {}
            //throw new Parser.Failure(e.getMessage(), location);
        } catch (final NullPointerException e) {
            // this exception appeared after the insertion of the jempbox-1.5.0.jar library
            Log.logException(e);
            // close the writer
            if (writer != null) try { writer.close(); } catch (final Exception ex) {}
            try {pdfDoc.close();} catch (final IOException ee) {}
            //throw new Parser.Failure(e.getMessage(), location);
        } finally {
            try {pdfDoc.close();} catch (final IOException e) {}
        }
        pdfDoc = null;

        String[] docKeywords = null;
        if (docKeywordStr != null) {
View Full Code Here

    @Override
    public void run() {
        try {
            String fileContent = "";
            File filePDF = new File(pathToFile);
            PDDocument pdDoc = PDDocument.load(new FileInputStream(filePDF));
            PDFTextStripper PDFTextStripper = null;

            Integer numberOfPages = pdDoc.getNumberOfPages();

            for (int page = 0; page < numberOfPages; page++) {
                PDFTextStripper = new PDFTextStripper("UTF-8");
                PDFTextStripper.setStartPage(page);
                PDFTextStripper.setEndPage(page);
                String text = PDFTextStripper.getText(pdDoc);
                fileContent += " " + text.replaceAll("\\s+", " ").trim();
                text = null;
            }

            pdDoc.close();
            pdDoc = null;
            filePDF = null;
            PDFTextStripper = null;

            if (fileContent.length() < 1) {
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        PDDocument pdfDocument = PDDocument.load(stream);
        try {
            if (pdfDocument.isEncrypted()
                    && !pdfDocument.getCurrentAccessPermission().canExtractContent()) {
                try {
                    String password = metadata.get(PASSWORD);
                    if (password == null) {
                        password = "";
                    }
                    pdfDocument.decrypt(password);
                } catch (Exception e) {
                    // Ignore
                }
            }
            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
            extractMetadata(pdfDocument, metadata);
            PDF2XHTML.process(pdfDocument, handler, metadata);
        } finally {
            pdfDocument.close();
        }
    }
View Full Code Here

            {
               if (is == null)
               {
                  throw new NullPointerException("InputStream is null.");
               }
               PDDocument pdDocument = null;
               StringWriter sw = new StringWriter();
               try
               {
                  if (is.available() == 0)
                     return "";

                  try
                  {
                     pdDocument = PDDocument.load(is);
                  }
                  catch (IOException e)
                  {
                     throw new DocumentReadException("Can not load PDF document.", e);
                  }

                  PDFTextStripper stripper = new PDFTextStripper();
                  stripper.setStartPage(1);
                  stripper.setEndPage(Integer.MAX_VALUE);
                  stripper.writeText(pdDocument, sw);
               }
               finally
               {
                  if (pdDocument != null)
                     try
                     {
                        pdDocument.close();
                     }
                     catch (IOException e)
                     {
                     }
                  if (is != null)
View Full Code Here

    if (obj instanceof byte[]) {
      is = new ByteArrayInputStream((byte[]) obj);
    } else {
      throw new IllegalArgumentException("Parameter must be instance of byte[]");
    }
    PDDocument pdfDocument = null;
    String contents = null;

    try {
      pdfDocument = PDDocument.load(is);

      if (pdfDocument.isEncrypted()) {
        //Just try using the default password and move on
        pdfDocument.decrypt("");
      }

      //create a writer where to append the text content.
      StringWriter writer = new StringWriter();
      if (stripper == null) {
        stripper = new PDFTextStripper();
      } else {
        stripper.resetEngine();
      }
      stripper.writeText(pdfDocument, writer);

      // Note: the buffer to string operation is costless;
      // the char array value of the writer buffer and the content string
      // is shared as long as the buffer content is not modified, which will
      // not occur here.
      contents = writer.getBuffer().toString();

    } catch (IOException e) {
      throw new CRException(e);
    } catch (CryptographyException e) {
      throw new CRException(e);
    } catch (InvalidPasswordException e) {
      throw new CRException(e);
    } catch (Exception e) {
      //Catch all Exceptions happening here to not disturb the indexer
      e.printStackTrace();
    } finally {
      if (pdfDocument != null) {
        try {
          pdfDocument.close();
        } catch (IOException e) {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
      
        PDDocument pdfDocument = null;
        TemporaryResources tmp = new TemporaryResources();
        //config from context, or default if not set via context
        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
        try {
            // PDFBox can process entirely in memory, or can use a temp file
            //  for unpacked / processed resources
            // Decide which to do based on if we're reading from a file or not already
            TikaInputStream tstream = TikaInputStream.cast(stream);
            if (tstream != null && tstream.hasFile()) {
                // File based, take that as a cue to use a temporary file
                RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw");
                if (localConfig.getUseNonSequentialParser() == true){
                    pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile);
                } else {
                    pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true);
                }
            } else {
                // Go for the normal, stream based in-memory parsing
                if (localConfig.getUseNonSequentialParser() == true){
                    pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer());
                } else {
                    pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true);
                }
            }
           
          
            if (pdfDocument.isEncrypted()) {
                String password = null;
               
                // Did they supply a new style Password Provider?
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                   password = passwordProvider.getPassword(metadata);
                }
               
                // Fall back on the old style metadata if set
                if (password == null && metadata.get(PASSWORD) != null) {
                   password = metadata.get(PASSWORD);
                }
               
                // If no password is given, use an empty string as the default
                if (password == null) {
                   password = "";
                }
              
                try {
                    pdfDocument.decrypt(password);
                } catch (Exception e) {
                    // Ignore
                }
            }
            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
            extractMetadata(pdfDocument, metadata);
            PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
           
        } finally {
            if (pdfDocument != null) {
               pdfDocument.close();
            }
            tmp.dispose();
        }
        handler.endDocument();
    }
View Full Code Here

    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        PDDocument pdfDocument = PDDocument.load(stream, true);
        try {
            if (pdfDocument.isEncrypted()
                    && !pdfDocument.getCurrentAccessPermission().canExtractContent()) {
                try {
                    String password = metadata.get(PASSWORD);
                    if (password == null) {
                        password = "";
                    }
                    pdfDocument.decrypt(password);
                } catch (Exception e) {
                    // Ignore
                }
            }
            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
            extractMetadata(pdfDocument, metadata);
            PDF2XHTML.process(pdfDocument, handler, metadata);
        } finally {
            pdfDocument.close();
        }
    }
View Full Code Here

     * @throws IOException If there is an error saving the document.
     * @throws COSVisitorException If an error occurs while saving the destination file.
     */
    public void mergeDocuments() throws IOException, COSVisitorException
    {
        PDDocument destination = null;
        File sourceFile;
        PDDocument source;
        if (sources != null && sources.size() > 0)
        {
            try
            {
                Iterator sit = sources.iterator();
                sourceFile = (File) sit.next();
                destination = PDDocument.load(sourceFile);
                while (sit.hasNext())
                {
                    sourceFile = (File) sit.next();
                    source = PDDocument.load(sourceFile);
                    try
                    {
                        appendDocument(destination, source);
                    }
                    finally
                    {
                        if (source != null)
                        {
                            source.close();
                        }
                    }
                }
                destination.save(destinationFileName);
            }
View Full Code Here

     * @throws Exception when there is an exception
     */
    public void doTestFile(File file, boolean bLogResult, String inDir, String outDir)
        throws Exception
    {
        PDDocument document = null;

        log.info("Preparing to convert " + file.getName());
        try
        {
            document =  PDDocument.load(file);
            writer.writeImage(document, "png", "", 1, Integer.MAX_VALUE, outDir + file.getName() + "-");
        }
        catch(Exception e)
        {
            this.bFail=true;
            log.error("Error converting file " + file.getName(), e);
        }
        finally
        {
            document.close();
        }

        //Now check the resulting files ... did we get identical PNG(s)?
        try
        {
View Full Code Here

TOP

Related Classes of org.apache.pdfbox.pdmodel.PDDocument

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.