Examples of PDDocumentInformation

org.apache.pdfbox.pdmodel.PDDocumentInformation
This is the document metadata. Each getXXX method will return the entry if it exists or null if it does not exist. If you pass in null for the setXXX method then it will clear the value. @author Ben Litchfield @author Gerardo Ortiz @version $Revision: 1.12 $
org.pdfbox.pdmodel.PDDocumentInformation
This is the document metadata. Each getXXX method will return the entry if it exists or null if it does not exist. If you pass in null for the setXXX method then it will clear the value. @author Ben Litchfield @version $Revision: 1.12 $

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

      if (annotsResult.length() > 0) {
        setCleanedContent(getCleanedContent() + " Annotations " + annotsResult.toString());
      }


      // Get the meta data
      PDDocumentInformation info = pdfDocument.getDocumentInformation();
      StringBuilder metaData = new StringBuilder();
      metaData.append("p.");
      metaData.append(Integer.toString(pdfDocument.getNumberOfPages()));
      metaData.append(" ");


      // Check if fields are null
      if (info.getAuthor() != null) {
        metaData.append(info.getAuthor());
        metaData.append(" ");
      }
      if (info.getSubject() != null) {
        metaData.append(info.getSubject());
        metaData.append(" ");
      }
      if (info.getKeywords() != null) {
        metaData.append(info.getKeywords());
        metaData.append(" ");
      }


      if (info.getTitle() != null) {
        setTitle(info.getTitle());
      }


      setCleanedMetaData(metaData.toString());
      if (mLog.isDebugEnabled()) {
        mLog.debug("Extracted meta data ::" + getCleanedMetaData()

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

            if (perm == null || !perm.canExtractContent())
                throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }


        // extracting some metadata
        final PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
        if (info != null) {
            docTitle = info.getTitle();
            docSubject = info.getSubject();
            docAuthor = info.getAuthor();
            docPublisher = info.getProducer();
            if (docPublisher == null || docPublisher.length() == 0) docPublisher = info.getCreator();
            docKeywordStr = info.getKeywords();
            // unused:
            // info.getTrapped());
            // info.getCreationDate());
            // info.getModificationDate();
        }

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

        parse(stream, handler, metadata, new ParseContext());
    }


    private void extractMetadata(PDDocument document, Metadata metadata)
            throws TikaException {
        PDDocumentInformation info = document.getDocumentInformation();
        addMetadata(metadata, Metadata.TITLE, info.getTitle());
        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
        addMetadata(metadata, "producer", info.getProducer());
        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
        addMetadata(metadata, "trapped", info.getTrapped());
        try {
            addMetadata(metadata, "created", info.getCreationDate());
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        try {
            Calendar modified = info.getModificationDate(); 
            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
    }

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation


   


    private void extractMetadata(PDDocument document, Metadata metadata)
            throws TikaException {
        PDDocumentInformation info = document.getDocumentInformation();
        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
        addMetadata(metadata, TikaCoreProperties.TITLE, info.getTitle());
        addMetadata(metadata, TikaCoreProperties.CREATOR, info.getAuthor());
        addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
        addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
        addMetadata(metadata, "producer", info.getProducer());
        // TODO: Move to description in Tika 2.0
        addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
        addMetadata(metadata, "trapped", info.getTrapped());
        try {
            // TODO Remove these in Tika 2.0
            addMetadata(metadata, "created", info.getCreationDate());
            addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        try {
            Calendar modified = info.getModificationDate(); 
            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
            addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        
        // All remaining metadata is custom
        // Copy this over as-is
        List<String> handledMetadata = Arrays.asList(new String[] {
             "Author", "Creator", "CreationDate", "ModDate",
             "Keywords", "Producer", "Subject", "Title", "Trapped"
        });
        for(COSName key : info.getDictionary().keySet()) {
            String name = key.getName();
            if(! handledMetadata.contains(name)) {
          addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
            }
        }
    }

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

        parse(stream, handler, metadata, new ParseContext());
    }


    private void extractMetadata(PDDocument document, Metadata metadata)
            throws TikaException {
        PDDocumentInformation info = document.getDocumentInformation();
        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
        addMetadata(metadata, Metadata.TITLE, info.getTitle());
        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
        addMetadata(metadata, "producer", info.getProducer());
        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
        addMetadata(metadata, "trapped", info.getTrapped());
        try {
            addMetadata(metadata, "created", info.getCreationDate());
            addMetadata(metadata, Metadata.CREATION_DATE, info.getCreationDate());
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        try {
            Calendar modified = info.getModificationDate(); 
            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        
        // All remaining metadata is custom
        // Copy this over as-is
        List<String> handledMetadata = Arrays.asList(new String[] {
             "Author", "Creator", "CreationDate", "ModDate",
             "Keywords", "Producer", "Subject", "Title", "Trapped"
        });
        for(COSName key : info.getDictionary().keySet()) {
            String name = key.getName();
            if(! handledMetadata.contains(name)) {
          addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key));
            }
        }
    }

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

        }
        if( source.isEncrypted() )
        {
            throw new IOException( "Error: source PDF is encrypted, can't append encrypted PDF documents." );
        }
        PDDocumentInformation destInfo = destination.getDocumentInformation();
        PDDocumentInformation srcInfo = source.getDocumentInformation();
        destInfo.getDictionary().mergeInto( srcInfo.getDictionary() );


        PDDocumentCatalog destCatalog = destination.getDocumentCatalog();
        PDDocumentCatalog srcCatalog = source.getDocumentCatalog();


        if( destCatalog.getOpenAction() == null )

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

     *
     * @throws IOException If there is an error getting the page count.
     */
    public void printMetadata( PDDocument document ) throws IOException
    {
        PDDocumentInformation info = document.getDocumentInformation();
        PDDocumentCatalog cat = document.getDocumentCatalog();
        PDMetadata metadata = cat.getMetadata();
        System.out.println( "Page Count=" + document.getNumberOfPages() );
        System.out.println( "Title=" + info.getTitle() );
        System.out.println( "Author=" + info.getAuthor() );
        System.out.println( "Subject=" + info.getSubject() );
        System.out.println( "Keywords=" + info.getKeywords() );
        System.out.println( "Creator=" + info.getCreator() );
        System.out.println( "Producer=" + info.getProducer() );
        System.out.println( "Creation Date=" + formatDate( info.getCreationDate() ) );
        System.out.println( "Modification Date=" + formatDate( info.getModificationDate() ) );
        System.out.println( "Trapped=" + info.getTrapped() );
        if( metadata != null )
        {
            System.out.println( "Metadata=" + metadata.getInputStreamAsString() );
        }
    }

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

        }
        if (source.isEncrypted())
        {
            throw new IOException("Error: source PDF is encrypted, can't append encrypted PDF documents.");
        }
        PDDocumentInformation destInfo = destination.getDocumentInformation();
        PDDocumentInformation srcInfo = source.getDocumentInformation();
        destInfo.getDictionary().mergeInto(srcInfo.getDictionary());


        PDDocumentCatalog destCatalog = destination.getDocumentCatalog();
        PDDocumentCatalog srcCatalog = source.getDocumentCatalog();


        // use the highest version number for the resulting pdf

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation

                {
                    System.err.println( "Error: Cannot add metadata to encrypted document." );
                    System.exit( 1 );
                }
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentInformation info = document.getDocumentInformation();
                
                XMPMetadata metadata = XMPMetadata.createXMPMetadata();


                AdobePDFSchema pdfSchema = metadata.createAndAddAdobePDFSchema();
                pdfSchema.setKeywords( info.getKeywords() );
                pdfSchema.setProducer( info.getProducer() );


                XMPBasicSchema basicSchema = metadata.createAndAddXMPBasicSchema();
                basicSchema.setModifyDate( info.getModificationDate() );
                basicSchema.setCreateDate( info.getCreationDate() );
                basicSchema.setCreatorTool( info.getCreator() );
                basicSchema.setMetadataDate( new GregorianCalendar() );


                DublinCoreSchema dcSchema = metadata.createAndAddDublinCoreSchema();
                dcSchema.setTitle( info.getTitle() );
                dcSchema.addCreator( "PDFBox" );
                dcSchema.setDescription( info.getSubject() );


                PDMetadata metadataStream = new PDMetadata(document);
                catalog.setMetadata( metadataStream );
                
                XmpSerializer serializer = new XmpSerializer();

View Full Code Here

Examples of org.apache.pdfbox.pdmodel.PDDocumentInformation


            // Add the tag-stripped contents as a Reader-valued Text field so it will
            // get tokenized and indexed.
            addTextField(document, "contents", reader);


            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            if (info != null)
            {
                addTextField(document, "Author", info.getAuthor());
                try
                {
                    addTextField(document, "CreationDate", info.getCreationDate());
                }
                catch (IOException io)
                {
                    // ignore, bad date but continue with indexing
                }
                addTextField(document, "Creator", info.getCreator());
                addTextField(document, "Keywords", info.getKeywords());
                try
                {
                    addTextField(document, "ModificationDate", info.getModificationDate());
                }
                catch (IOException io)
                {
                    // ignore, bad date but continue with indexing
                }
                addTextField(document, "Producer", info.getProducer());
                addTextField(document, "Subject", info.getSubject());
                addTextField(document, "Title", info.getTitle());
                addTextField(document, "Trapped", info.getTrapped());
            }
            int summarySize = Math.min(contents.length(), 500);
            String summary = contents.substring(0, summarySize);
            // Add the summary as an UnIndexed field, so that it is stored and returned
            // with hit documents for display.

View Full Code Here

0 1 2 3 4 5

TOP

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.