Examples of org.pdfbox.pdmodel.PDDocumentInformation

org.pdfbox.pdmodel.PDDocumentInformation
This is the document metadata. Each getXXX method will return the entry if it exists or null if it does not exist. If you pass in null for the setXXX method then it will clear the value. @author Ben Litchfield @version $Revision: 1.12 $

  PDDocument pdDoc = null;
  try 
  {
   logger.info("Extracting metadata from PDF file " + ifile);
   pdDoc = new PDDocument(cosDoc);
   PDDocumentInformation docInfo = pdDoc.getDocumentInformation();
   String author   = StringTools.filterChars(docInfo.getAuthor());
   String title    = StringTools.filterChars(docInfo.getTitle());
   String keywords = StringTools.filterChars(docInfo.getKeywords());
   String summary  = StringTools.filterChars(docInfo.getSubject());
   if ((author != null) && (!author.equals("")))     { doc.setAuthor(author); }
   if ((title != null) && (!title.equals("")))       { doc.setTitle(title); }
   if ((keywords != null) && (!keywords.equals(""))) { doc.setMetadata(keywords); }
   if ((summary != null) && (!summary.equals("")))   { doc.setSummary(summary); }
  }

View Full Code Here

   *            database is null the strings will not be resolved.
   */
  public static void writeDocumentInformation(PDDocument document,
      BibtexEntry entry, BibtexDatabase database) {


    PDDocumentInformation di = document.getDocumentInformation();


    if (database != null)
      entry = database.resolveForStrings(entry, false);


    // Set all the values including key and entryType
    Set<String> fields = entry.getAllFields();


    for (String field : fields){
      if (field.equals("author")) {
        di.setAuthor(entry.getField("author").toString());
      } else if (field.equals("title")) {
        di.setTitle(entry.getField("title").toString());
      } else if (field.equals("keywords")) {
        di.setKeywords(entry.getField("keywords").toString());
      } else if (field.equals("abstract")) {
        di.setSubject(entry.getField("abstract").toString());
      } else {
        di.setCustomMetadataValue("bibtex/" + field.toString(),
            entry.getField(field.toString()).toString());
      }
    }
    di
        .setCustomMetadataValue("bibtex/entrytype", entry.getType()
            .getName());
  }

View Full Code Here


            // Add the tag-stripped contents as a Reader-valued Text field so it will
            // get tokenized and indexed.
            addTextField( document, "contents", reader );


            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            if( info != null ) 
            {
                addTextField( document, "Author", info.getAuthor() );
                addTextField( document, "CreationDate", info.getCreationDate() );
                addTextField( document, "Creator", info.getCreator() );
                addTextField( document, "Keywords", info.getKeywords() );
                addTextField( document, "ModificationDate", info.getModificationDate() );
                addTextField( document, "Producer", info.getProducer() );
                addTextField( document, "Subject", info.getSubject() );
                addTextField( document, "Title", info.getTitle() );
                addTextField( document, "Trapped", info.getTrapped() );
            }
            int summarySize = Math.min( contents.length(), 500 );
            String summary = contents.substring( 0, summarySize );
            // Add the summary as an UnIndexed field, so that it is stored and returned
            // with hit documents for display.

View Full Code Here

        }
        if( source.isEncrypted() )
        {
            throw new IOException( "Error: source PDF is encrypted, can't append encrypted PDF documents." );
        }
        PDDocumentInformation destInfo = destination.getDocumentInformation();
        PDDocumentInformation srcInfo = source.getDocumentInformation();
        destInfo.getDictionary().mergeInto( srcInfo.getDictionary() );
        
        PDDocumentCatalog destCatalog = destination.getDocumentCatalog();
        PDDocumentCatalog srcCatalog = source.getDocumentCatalog();


        if( destCatalog.getOpenAction() == null )

View Full Code Here

     * 
     * @return The thread information.
     */
    public PDDocumentInformation getThreadInfo()
    {
        PDDocumentInformation retval = null;
        COSDictionary info = (COSDictionary)thread.getDictionaryObject( "I" );
        if( info != null )
        {
            retval = new PDDocumentInformation( info );
        }
        
        return retval;
    }

View Full Code Here

                {
                    System.err.println( "Error: Cannot add metadata to encrypted document." );
                    System.exit( 1 );
                }
                PDDocumentCatalog catalog = document.getDocumentCatalog();
                PDDocumentInformation info = document.getDocumentInformation();
                
                //Right now, PDFBox does not have any XMP library, so we will
                //just construct the XML by hand.
                StringBuffer xmp= new StringBuffer();
                xmp.append(
                "<?xpacket begin='' id='W5M0MpCehiHzreSzNTczkc9d'?>\n" + 
                "<?adobe-xap-filters esc=\"CRLF\"?>\n" + 
                "<x:xmpmeta>\n" + 
                "    <rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'>\n" + 
                "        <rdf:Description rdf:about='' xmlns:pdf='http://ns.adobe.com/pdf/1.3/' " +
                                         "pdf:Keywords='" + fixNull( info.getKeywords() ) + "' " + 
                                         "pdf:Producer='" + fixNull( info.getProducer() ) + "'></rdf:Description>\n" + 
                "        <rdf:Description rdf:about='' xmlns:xap='http://ns.adobe.com/xap/1.0/' " + 
                                         "xap:ModifyDate='" + fixNull( info.getModificationDate() ) + "' " +
                                         "xap:CreateDate='" + fixNull( info.getCreationDate() ) + "' " + 
                                         "xap:CreatorTool='" + fixNull( info.getCreator() ) + "' " + 
                                         "xap:MetadataDate='" + fixNull( new GregorianCalendar() )+ "'>\n" + 
                "        </rdf:Description>\n" + 
                "        <rdf:Description rdf:about='' xmlns:dc='http://purl.org/dc/elements/1.1/' " + 
                                         "dc:format='application/pdf'>\n" + 
                "            <dc:title>\n" + 
                "                <rdf:Alt>\n" + 
                "                    <rdf:li xml:lang='x-default'>" + fixNull( info.getTitle() ) +"</rdf:li>\n" + 
                "                </rdf:Alt>\n" + 
                "            </dc:title>\n" + 
                "            <dc:creator>\n" + 
                "                <rdf:Seq>\n" + 
                "                    <rdf:li>PDFBox.org</rdf:li>\n" + 
                "                </rdf:Seq>\n" + 
                "            </dc:creator>\n" + 
                "            <dc:description>\n" + 
                "                <rdf:Alt>\n" + 
                "                    <rdf:li xml:lang='x-default'>" + fixNull( info.getSubject() ) +"</rdf:li>\n" + 
                "                </rdf:Alt>\n" + 
                "            </dc:description>\n" + 
                "        </rdf:Description>\n" + 
                "    </rdf:RDF>\n" + 
                "</x:xmpmeta>\n" );

View Full Code Here

     *
     * @throws IOException If there is an error getting the page count.
     */
    public void printMetadata( PDDocument document ) throws IOException
    {
        PDDocumentInformation info = document.getDocumentInformation();
        PDDocumentCatalog cat = document.getDocumentCatalog();
        PDMetadata metadata = cat.getMetadata();
        System.out.println( "Page Count=" + document.getNumberOfPages() );
        System.out.println( "Title=" + info.getTitle() );
        System.out.println( "Author=" + info.getAuthor() );
        System.out.println( "Subject=" + info.getSubject() );
        System.out.println( "Keywords=" + info.getKeywords() );
        System.out.println( "Creator=" + info.getCreator() );
        System.out.println( "Producer=" + info.getProducer() );
        System.out.println( "Creation Date=" + formatDate( info.getCreationDate() ) );
        System.out.println( "Modification Date=" + formatDate( info.getModificationDate() ) );
        System.out.println( "Trapped=" + info.getTrapped() );
        if( metadata != null )
        {
            System.out.println( "Metadata=" + metadata.getInputStreamAsString() );
        }
    }

View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getNumberOfPages()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));

View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));

View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));

View Full Code Here

0 1 2

TOP

Related Classes of org.pdfbox.pdmodel.PDDocumentInformation

net.fp.rp.search.back.extractor.PdfDataExtractor

net.nutch.parse.pdf.PdfParser

net.sf.jabref.util.XMPUtil

nz.govt.natlib.adapter.pdfbox.PDFBoxAdapter

org.apache.nutch.parse.pdf.PdfParser

org.apache.tika.parser.pdf.PDFParser

org.dspace.content.packager.PDFPackager

org.jab.docsearch.converters.PDFConverter

org.pdfbox.cos.COSDictionary

org.pdfbox.cos.COSName

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.