Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

org.apache.pdfbox.pdmodel.PDDocumentInformation
This is the document metadata. Each getXXX method returns the entry if it exists, or null if it does not. Passing null to a setXXX method clears the corresponding value. @author Ben Litchfield @author Gerardo Ortiz @version $Revision: 1.12 $

        }
        if( source.isEncrypted() )
        {
            throw new IOException( "Error: source PDF is encrypted, can't append encrypted PDF documents." );
        }
        PDDocumentInformation destInfo = destination.getDocumentInformation();
        PDDocumentInformation srcInfo = source.getDocumentInformation();
        destInfo.getDictionary().mergeInto( srcInfo.getDictionary() );


        PDDocumentCatalog destCatalog = destination.getDocumentCatalog();
        PDDocumentCatalog srcCatalog = source.getDocumentCatalog();


        if( destCatalog.getOpenAction() == null )

View Full Code Here


            // Add the tag-stripped contents as a Reader-valued Text field so it will
            // get tokenized and indexed.
            addTextField( document, "contents", reader );


            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            if( info != null )
            {
                addTextField( document, "Author", info.getAuthor() );
                try
                {
                    addTextField( document, "CreationDate", info.getCreationDate() );
                }
                catch( IOException io )
                {
                    //ignore, bad date but continue with indexing
                }
                addTextField( document, "Creator", info.getCreator() );
                addTextField( document, "Keywords", info.getKeywords() );
                try
                {
                    addTextField( document, "ModificationDate", info.getModificationDate() );
                }
                catch( IOException io )
                {
                    //ignore, bad date but continue with indexing
                }
                addTextField( document, "Producer", info.getProducer() );
                addTextField( document, "Subject", info.getSubject() );
                addTextField( document, "Title", info.getTitle() );
                addTextField( document, "Trapped", info.getTrapped() );
            }
            int summarySize = Math.min( contents.length(), 500 );
            String summary = contents.substring( 0, summarySize );
            // Add the summary as an UnIndexed field, so that it is stored and returned
            // with hit documents for display.

View Full Code Here


                  if (props.isEmpty())
                  {
                     // The pdf doesn't contain any metadata, try to use the document
                     // information instead
                     PDDocumentInformation docInfo = pdDocument.getDocumentInformation();


                     if (docInfo != null)
                     {
                        try
                        {
                           if (docInfo.getAuthor() != null)
                              props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                        }
                        catch (Exception e)
                        {
                           log.warn("getAuthor failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getCreationDate() != null)
                              props.put(DCMetaData.DATE, docInfo.getCreationDate());
                        }
                        catch (Exception e)
                        {
                           log.warn("getCreationDate failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getCreator() != null)
                              props.put(DCMetaData.CREATOR, docInfo.getCreator());
                        }
                        catch (Exception e)
                        {
                           log.warn("getCreator failed: " + e);
                        }
                        try
                        {


                           if (docInfo.getKeywords() != null)
                              props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                        }
                        catch (Exception e)
                        {
                           log.warn("getKeywords failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getModificationDate() != null)
                              props.put(DCMetaData.DATE, docInfo.getModificationDate());
                        }
                        catch (Exception e)
                        {
                           log.warn("getModificationDate failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getProducer() != null)
                              props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                        }
                        catch (Exception e)
                        {
                           log.warn("getProducer failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getSubject() != null)
                              props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                        }
                        catch (Exception e)
                        {
                           log.warn("getSubject failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getTitle() != null)
                              props.put(DCMetaData.TITLE, docInfo.getTitle());
                        }
                        catch (Exception e)
                        {
                           log.warn("getTitle failed: " + e);
                        }

View Full Code Here

                  }
                  else
                  {
                     // The pdf doesn't contain any metadata, try to use the document
                     // information instead
                     PDDocumentInformation docInfo = pdDocument.getDocumentInformation();


                     if (docInfo != null)
                     {
                        try
                        {
                           if (docInfo.getAuthor() != null)
                              props.put(DCMetaData.CONTRIBUTOR, docInfo.getAuthor());
                        }
                        catch (Exception e)
                        {
                           log.warn("getAuthor failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getCreationDate() != null)
                              props.put(DCMetaData.DATE, docInfo.getCreationDate());
                        }
                        catch (Exception e)
                        {
                           log.warn("getCreationDate failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getCreator() != null)
                              props.put(DCMetaData.CREATOR, docInfo.getCreator());
                        }
                        catch (Exception e)
                        {
                           log.warn("getCreator failed: " + e);
                        }
                        try
                        {


                           if (docInfo.getKeywords() != null)
                              props.put(DCMetaData.SUBJECT, docInfo.getKeywords());
                        }
                        catch (Exception e)
                        {
                           log.warn("getKeywords failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getModificationDate() != null)
                              props.put(DCMetaData.DATE, docInfo.getModificationDate());
                        }
                        catch (Exception e)
                        {
                           log.warn("getModificationDate failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getProducer() != null)
                              props.put(DCMetaData.PUBLISHER, docInfo.getProducer());
                        }
                        catch (Exception e)
                        {
                           log.warn("getProducer failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getSubject() != null)
                              props.put(DCMetaData.DESCRIPTION, docInfo.getSubject());
                        }
                        catch (Exception e)
                        {
                           log.warn("getSubject failed: " + e);
                        }
                        try
                        {
                           if (docInfo.getTitle() != null)
                              props.put(DCMetaData.TITLE, docInfo.getTitle());
                        }
                        catch (Exception e)
                        {
                           log.warn("getTitle failed: " + e);
                        }

View Full Code Here

        {
            throw new ValidationException("Document provided is null");
        }
        else
        {
            PDDocumentInformation dico = document.getDocumentInformation();
            if (metadata == null)
            {
                throw new ValidationException("Metadata provided are null");
            }
            else

View Full Code Here

             *   Subject -> description.abstract
             *   Keywords -> subject.other
             *    date is java.util.Calendar
             */
            PDDocument pd = new PDDocument(cos);
            PDDocumentInformation docinfo = pd.getDocumentInformation();
            String title = docinfo.getTitle();


            // sanity check: item must have a title.
            if (title == null)
            {
                throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
            }
            if (log.isDebugEnabled())
            {
                log.debug("PDF Info dict title=\"" + title + "\"");
            }
            item.addDC("title", null, "en", title);
            String value = docinfo.getAuthor();
            if (value != null)
            {
                item.addDC("contributor", "author", null, value);
                if (log.isDebugEnabled())
                {
                    log.debug("PDF Info dict author=\"" + value + "\"");
                }
            }


            value = docinfo.getCreator();
            if (value != null)
            {
                item.addDC("description", "provenance", "en",
                        "Application that created the original document: " + value);
            }


            value = docinfo.getProducer();
            if (value != null)
            {
                item.addDC("description", "provenance", "en",
                        "Original document converted to PDF by: " + value);
            }


            value = docinfo.getSubject();
            if (value != null)
            {
                item.addDC("description", "abstract", null, value);
            }


            value = docinfo.getKeywords();
            if (value != null)
            {
                item.addDC("subject", "other", null, value);
            }


            // Take either CreationDate or ModDate as "date.created",
            // Too bad there's no place to put "last modified" in the DC.
            Calendar calValue = docinfo.getCreationDate();
            if (calValue == null)
            {
                calValue = docinfo.getModificationDate();
            }


            if (calValue != null)
            {
                item.addDC("date", "created", null,

View Full Code Here

                dcSchema = xmp.getDublinCoreSchema();
            }
        } catch (IOException e) {
            //swallow
        }
        PDDocumentInformation info = document.getDocumentInformation();
        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
        extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
        extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
        extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
        addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
        addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
        addMetadata(metadata, "producer", info.getProducer());
        extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);


        // TODO: Move to description in Tika 2.0
        addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
        addMetadata(metadata, "trapped", info.getTrapped());
        try {
            // TODO Remove these in Tika 2.0
            addMetadata(metadata, "created", info.getCreationDate());
            addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        try {
            Calendar modified = info.getModificationDate();
            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
            addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        
        // All remaining metadata is custom
        // Copy this over as-is
        List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
                "Keywords", "Producer", "Subject", "Title", "Trapped");
        for(COSName key : info.getDictionary().keySet()) {
            String name = key.getName();
            if(! handledMetadata.contains(name)) {
          addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
            }
        }
        metadata.set("pdf:encrypted", Boolean.toString(document.isEncrypted()));


        //try to get the various versions

View Full Code Here

     * Extract Metadata in PDF Documents.
     * @param pdDoc
     * @return
     */
    public static AttachmentIndex extractMetadataPDFDocument(final PDDocument pdDoc){
       PDDocumentInformation docInfo = pdDoc.getDocumentInformation();
       author = docInfo.getAuthor();
       title = docInfo.getTitle();
       producer = docInfo.getProducer();
       subject = docInfo.getSubject();
       AttachmentIndex attachmentMetadata =  IndexerFile.addMetadatatoBean(author, title, producer, subject);
       return attachmentMetadata;
    }

View Full Code Here

            if (perm == null || !perm.canExtractContent())
                throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }


        // extracting some metadata
        final PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
        if (info != null) {
            docTitle = info.getTitle();
            docSubject = info.getSubject();
            docAuthor = info.getAuthor();
            docPublisher = info.getProducer();
            if (docPublisher == null || docPublisher.length() == 0) docPublisher = info.getCreator();
            docKeywordStr = info.getKeywords();
            // unused:
            // info.getTrapped());
            // info.getCreationDate());
            // info.getModificationDate();
        }

View Full Code Here

        }
    }


    private void extractMetadata(PDDocument document, Metadata metadata)
            throws TikaException {
        PDDocumentInformation info = document.getDocumentInformation();
        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
        addMetadata(metadata, Metadata.TITLE, info.getTitle());
        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
        addMetadata(metadata, "producer", info.getProducer());
        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
        addMetadata(metadata, "trapped", info.getTrapped());
        try {
            addMetadata(metadata, "created", info.getCreationDate());
            addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate());
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        try {
            Calendar modified = info.getModificationDate(); 
            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        
        // All remaining metadata is custom
        // Copy this over as-is
        List<String> handledMetadata = Arrays.asList(new String[] {
             "Author", "Creator", "CreationDate", "ModDate",
             "Keywords", "Producer", "Subject", "Title", "Trapped"
        });
        for(COSName key : info.getDictionary().keySet()) {
            String name = key.getName();
            if(! handledMetadata.contains(name)) {
          addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
            }
        }
    }

View Full Code Here

0 1 2 3 4 5

TOP

Related Classes of org.apache.pdfbox.pdmodel.PDDocumentInformation

de.pdf_scrutinizer.API.app.doc.Info

de.pdf_scrutinizer.emulation.InterpreterEmulation

geopms.GeoPMSImportPDF

net.sf.regain.crawler.preparator.PdfBoxPreparator

net.yacy.document.parser.pdfParser

org.apache.camel.component.fop.FopHelper

org.apache.jackrabbit.core.query.pdf.PDFParser

org.apache.padaf.preflight.xmp.SynchronizedMetaDataValidation

org.apache.pdfbox.cos.COSDictionary

org.apache.pdfbox.examples.lucene.LucenePDFDocument

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.