Examples of org.pdfbox.pdmodel.PDDocument

org.pdfbox.pdmodel.PDDocument
This is the in-memory representation of the PDF document. You must call close() on this object when you are done using it. @author Ben Litchfield @version $Revision: 1.43 $

  private Configuration conf;


  public ParseResult getParse(Content content) {


    // in memory representation of pdf file
    PDDocument pdf = null;


    String text = null;
    String title = null;
    Metadata metadata = new Metadata();


    try {


      byte[] raw = content.getContent();


      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParseResult(content.getUrl(), getConf());
      }


      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        //Just try using the default password and move on
        pdf.openProtection(new StandardDecryptionMaterial(""));
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));


    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (BadSecurityHandlerException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));        
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParseResult(content.getUrl(), getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }

View Full Code Here

   * @return Reader a reader that is fed to an indexer.
   */
  protected Reader getReader(InputStream docStream)
  {
    
    PDFParser parser = null; PDDocument document = null; PDFTextStripper stripper = null;
    CharArrayWriter writer = null;
    try{
      parser = new PDFParser(docStream);
      parser.parse();
      document = parser.getPDDocument();
      writer = new CharArrayWriter();
      stripper = new PDFTextStripper();
      stripper.setLineSeparator("\n");
      stripper.writeText(document, writer);
      document.close();
      writer.close();
      parser.getDocument().close();
      return new CharArrayReader(writer.toCharArray());
    }catch (Exception e){
        //logger.warn("WARNING: Problem converting PDF: ",e);
      try{
        document.close();        
      }catch(Exception e1){
        //logger.warn("WARNING: Problem converting PDF: ",e1);
      }
      try{
        writer.close();

View Full Code Here

  private Configuration conf;


  public Parse getParse(Content content) {


    // in memory representation of pdf file
    PDDocument pdf = null;


    String text = null;
    String title = null;
    Metadata metadata = new Metadata();


    try {


      byte[] raw = content.getContent();


      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }


      // TODO MC - store pdf files to analyze
      // FileOutputStream fout = new FileOutputStream("/home/nutchwax/lixo/"+System.currentTimeMillis()+".pdf"); 
      // fout.write(raw);
      // fout.close();
      // TODO MC


      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));


    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse(getConf());
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));        
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }

View Full Code Here

   {
      if (is == null)
      {
         throw new NullPointerException("InputStream is null.");
      }
      PDDocument pdDocument = null;
      StringWriter sw = new StringWriter();
      try
      {
         try
         {
            pdDocument = PDDocument.load(is);
         }
         catch (IOException e)
         {
            return "";
         }


         PDFTextStripper stripper = new PDFTextStripper();
         stripper.setStartPage(1);
         stripper.setEndPage(Integer.MAX_VALUE);
         stripper.writeText(pdDocument, sw);
      }
      finally
      {
         if (pdDocument != null)
            try
            {
               pdDocument.close();
            }
            catch (IOException e)
            {
            }
         if (is != null)

View Full Code Here

                
            try {
                PDFParser parser = new PDFParser(blob.getStream());
                parser.parse();
    
                PDDocument document = parser.getPDDocument();
    
                CharArrayWriter writer = new CharArrayWriter();
    
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);
    
                document.close();
                writer.close();
                
                Map result = new HashMap();
                result.put(FieldNames.FULLTEXT, new CharArrayReader(writer.toCharArray()));
                return result;

View Full Code Here

  private Configuration conf;


  public Parse getParse(Content content) {


    // in memory representation of pdf file
    PDDocument pdf = null;


    String text = null;
    String title = null;
    Metadata metadata = new Metadata();


    try {


      byte[] raw = content.getContent();


      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
      if (contentLength != null
            && raw.length != Integer.parseInt(contentLength)) {
          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
                  "Content truncated at "+raw.length
            +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
      }


      PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
      parser.parse();


      pdf = parser.getPDDocument();


      if (pdf.isEncrypted()) {
        DocumentEncryption decryptor = new DocumentEncryption(pdf);
        //Just try using the default password and move on
        decryptor.decryptDocument("");
      }


      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
      //metadata.put(LAST_MODIFIED, dcDateFormatter.format(info.getModificationDate().getTime()));


    } catch (CryptographyException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Error decrypting document. " + e).getEmptyParse(getConf());
    } catch (InvalidPasswordException e) {
      return new ParseStatus(ParseStatus.FAILED,
              "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
    } catch (Exception e) { // run time exception
        if (LOG.isWarnEnabled()) {
          LOG.warn("General exception in PDF parser: "+e.getMessage());
          e.printStackTrace(LogUtil.getWarnStream(LOG));        
        }
      return new ParseStatus(ParseStatus.FAILED,
              "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
    } finally {
      try {
        if (pdf != null)
          pdf.close();
        } catch (IOException e) {
          // nothing to do
        }
    }

View Full Code Here

      PDFParser parser = new PDFParser(new ByteArrayInputStream(fileData.data));
      parser.parse();
      COSDocument cosDoc = parser.getDocument();


      PDFTextStripper stripper = new PDFTextStripper();
      String docText = stripper.getText(new PDDocument(cosDoc));
      cosDoc.close();


      return new IndexDocument(fileData.path, docText, null);
    } catch (IOException e) {
      String msg = "Failed to write to the index";

View Full Code Here

            PDFParser parser = new PDFParser(is);
            parser.parse();
            COSDocument cosDoc = parser.getDocument();


            PDFTextStripper stripper = new PDFTextStripper();
            String docText = stripper.getText(new PDDocument(cosDoc));
            cosDoc.close();
            Document document = new Document();
            document.add(new Field("id", id, Field.Store.YES, Field.Index.TOKENIZED));
            document.add(
                    new Field("content", docText, Field.Store.NO, Field.Index.TOKENIZED));

View Full Code Here

        if (info.getLevel() >= 0) {
            InputStream in = UtilExtract.getStream(info.getUri());


            ByteArrayOutputStream bout = null;
            Writer writer = null;
            PDDocument document = null;


            
            try {


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");




              
              //load the document
                document = PDDocument.load(in);


                String author = "";
                String title = "";
                String summary = "";


                //get the additional data
                try {
                    PDDocumentInformation pdfinfo = document.getDocumentInformation();


                    if (!Util.isEmpty(pdfinfo.getAuthor())) {
                        author = pdfinfo.getAuthor();
                    }


                    if (!Util.isEmpty(pdfinfo.getTitle())) {
                        title = pdfinfo.getTitle();
                    }


                    if (!Util.isEmpty(pdfinfo.getSubject())) {
                        summary = pdfinfo.getSubject();
                    }
                } catch (Exception eR) {
                    String message = MessageUtil.getMessage("extractor.pdf.metadatamissing",
                            new Object[] { info.getUri() });
                    logger.info(message);
                }


                //set the buffer
                bout = new ByteArrayOutputStream();
                writer = new OutputStreamWriter(bout);


                //strip the document to the buffer 
                stripper.writeText(document, writer);
                bout.flush();
                writer.flush();


                //construct the patterns (to not ignore and replace)
                Pattern notIgnorePattern = Pattern.compile(getNotIgnoreChars());
                Pattern replacePattern = Pattern.compile(getReplaceChars());


                NodeStruct node = new NodeStruct();
                ByteArrayInputStream bin = null;


                try {
                    bin = new ByteArrayInputStream(bout.toByteArray());


                    byte[] buffer = new byte[1024];
                    int n = bin.read(buffer);


                    while (n > 0) {
                        String chars = new String(buffer, 0, n);


                        //generate the list of the words for the buffer
                        LinkedList listWords = UtilExtract.getValueList(chars,
                                getMinLengthWord(), notIgnorePattern,
                                replacePattern);


                        for (int j = 0; j < listWords.size(); j++)
                            node.addTuple(TupleStruct.KEYWORD_GENERIC,
                                (String) listWords.get(j));


                        n = bin.read(buffer);
                    }


                    logger.debug("Title is " + title + "Path is :" +
                        info.getUri() + "author" + author + " Summary:" +
                        summary);


                    //set the summary field according to the default settings
                    if (summary.length() > getMaxLengthSummary()) {
                        summary = summary.substring(0, getMaxLengthSummary());
                    }


                    DocumStruct doc = new DocumStruct();
                    doc.setTitle(title);
                    doc.setPath(info.getUri());
                    doc.setDescription(summary);
                    doc.setContent(node);
                    doc.setCategoryName(info.getCategoryName());
                    doc.setCategoryLocation(info.getCategoryLocation());


                    //set the pdf -author
                    doc.setAuthor(author);


                    //store and reindex document
                    PluginManager.storeAndAddDocument(doc);
                } catch (IOException e) {
                    logger.debug("Exception in reading the document text" +
                        e.getMessage(), e);
                    throw new RpException("extractor.pdf.textdatamissing",
                        new Object[] { info.getUri() });
                } finally {
                    try {
                        if (bin != null) {
                            bin.close();
                        }
                    } catch (Exception e) {
                    }
                }
            } catch (IOException e) {
                logger.debug("Exception in reading the document text" +
                    e.getMessage(), e);
                throw new RpException("app.extract.error",
                    new Object[] { info.getUri() });
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }


                    if (bout != null) {
                        bout.close();
                    }


                    if (document != null) {
                        document.close();
                    }
                } catch (Exception e) {
                }
            }
        } else {

View Full Code Here

            log.error("parse() filename is null");
            throw new ConverterException("PDFConverter::parse() filename is null");
        }


        // PD Document
        PDDocument document = null;
        Writer output = null;
        try {
            document = getPDDocument();


            // check document is readable
            AccessPermission ap = document.getCurrentAccessPermission();
            if (! ap.canExtractContent()) {
                log.info("parse() Document (" + filename + ") isn't readable for DocSearcher.");
                throw new ConverterException("parse() can't read PDF file");
            }


            // extract the text into an in-memory StringWriter
            try {
                log.debug("parse() Attempting to extract text from (" + filename + ")");


                output = new StringWriter();


                PDFTextStripper stripper = new PDFTextStripper();
                stripper.writeText(document, output);


                log.debug("parse() Successfully stripped out text from (" + filename + ")");
            }
            catch (IOException ioe) {
                log.error("parse() failed", ioe);
                throw new ConverterException("PDFConverter::parse() failed", ioe);
            }


            // get the meta data
            PDDocumentInformation info = document.getDocumentInformation();
            documentTitle = info.getTitle();
            documentAuthor = info.getAuthor();
            documentKeywords = info.getKeywords();
            if (document != null) {
                documentText = output.toString();
            }
        }
        catch (IOException ioe) {
            log.error("parse() failed", ioe);
            throw new ConverterException("parse() failed", ioe);
        }
        finally {
            // close stream
            IOUtils.closeQuietly(output);


            // close document
            try {
                if (document != null) {
                    document.close();
                }
            }
            catch (IOException ioe) {
                log.fatal("parse() can't close PDDocument", ioe);
            }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of org.pdfbox.pdmodel.PDDocument

br.com.caelum.stella.boleto.transformer.BoletoTransformerIntegrationTest

com.stimulus.archiva.extraction.PDFExtractor

eu.lsem.bakalarka.filetypeprocess.document.PdfDocumentParser

net.sf.jabref.util.XMPUtil

org.apache.jackrabbit.core.query.PdfTextFilter

org.apache.jackrabbit.extractor.PdfTextExtractor

org.apache.nutch.parse.pdf.PdfParser

org.exoplatform.services.document.impl.PDFDocumentReader

org.infoglue.cms.controllers.kernel.impl.simple.LuceneController

org.pdfbox.cos.COSArray

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.