Examples of org.pdfbox.pdmodel.PDDocumentInformation

org.pdfbox.pdmodel.PDDocumentInformation
This is the document metadata. Each getXXX method returns the entry if it exists, or null if it does not. Passing null to a setXXX method clears the value. @author Ben Litchfield @version $Revision: 1.12 $

                String title = "";
                String summary = "";


                //get the additional data
                try {
                    PDDocumentInformation pdfinfo = document.getDocumentInformation();


                    if (!Util.isEmpty(pdfinfo.getAuthor())) {
                        author = pdfinfo.getAuthor();
                    }


                    if (!Util.isEmpty(pdfinfo.getTitle())) {
                        title = pdfinfo.getTitle();
                    }


                    if (!Util.isEmpty(pdfinfo.getSubject())) {
                        summary = pdfinfo.getSubject();
                    }
                } catch (Exception eR) {
                    String message = MessageUtil.getMessage("extractor.pdf.metadatamissing",
                            new Object[] { info.getUri() });
                    logger.info(message);

View Full Code Here

                log.error("parse() failed", ioe);
                throw new ConverterException("PDFConverter::parse() failed", ioe);
            }


            // get the meta data
            PDDocumentInformation info = document.getDocumentInformation();
            documentTitle = info.getTitle();
            documentAuthor = info.getAuthor();
            documentKeywords = info.getKeywords();
            if (document != null) {
                documentText = output.toString();
            }
        }
        catch (IOException ioe) {

View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()

View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()

View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()

View Full Code Here

             *   Subject -> description.abstract
             *   Keywords -> subject.other
             *    date is java.util.Calendar
             */
            PDDocument pd = new PDDocument(cos);
            PDDocumentInformation docinfo = pd.getDocumentInformation();
            String title = docinfo.getTitle();


            // sanity check: item must have a title.
            if (title == null)
                throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
            log.debug("PDF Info dict title=\""+title+"\"");
            item.addDC("title", null, "en", title);
            String value;
            Calendar date;
            if ((value = docinfo.getAuthor()) != null)
            {
                item.addDC("contributor", "author", null, value);
                log.debug("PDF Info dict author=\""+value+"\"");
            }
            if ((value = docinfo.getCreator()) != null)
                item.addDC("description", "provenance", "en",
                              "Application that created the original document: "+value);
            if ((value = docinfo.getProducer()) != null)
                item.addDC("description", "provenance", "en",
                              "Original document converted to PDF by: "+value);
            if ((value = docinfo.getSubject()) != null)
                item.addDC("description", "abstract", null, value);
            if ((value = docinfo.getKeywords()) != null)
                item.addDC("subject", "other", null, value);


            // Take either CreationDate or ModDate as "date.created",
            // Too bad there's no place to put "last modified" in the DC.
            Calendar calValue;
            if ((calValue = docinfo.getCreationDate()) == null)
                calValue = docinfo.getModificationDate();
            if (calValue != null)
                item.addDC("date", "created", null,
                             (new DCDate(calValue.getTime())).toString());
            item.update();
        }

View Full Code Here

        }
    }


    private void extractMetadata(PDDocument document, Metadata metadata)
            throws TikaException {
        PDDocumentInformation info = document.getDocumentInformation();
        addMetadata(metadata, Metadata.TITLE, info.getTitle());
        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
        addMetadata(metadata, "producer", info.getProducer());
        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
        addMetadata(metadata, "trapped", info.getTrapped());
        try {
            addMetadata(metadata, "created", info.getCreationDate());
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        try {
            Calendar modified = info.getModificationDate(); 
            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
    }

View Full Code Here

            FieldUtil.setRaw(document,contents);
          FieldUtil.setContent(document, contents);
            FieldUtil.setSummary(document, StringUtil.max(contents,SUMMERY_SIZE),false);
          
          
            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            if( info.getAuthor() != null)  {
                FieldUtil.setAuthor(document, info.getAuthor());
            }
            if( info.getCreationDate() != null )
            {
                Date date = info.getCreationDate().getTime();
                if( date.getTime() >= 0 )  {
                    document.add(FieldUtil.Text("CreationDate", DateField.dateToString( date ) ) );
                }
            }
            if( info.getCreator() != null ){
                document.add( FieldUtil.Text( "Creator", info.getCreator() ) );
            }
            if( info.getKeywords() != null ){
                FieldUtil.setKeywords(document, info.getKeywords());
            }
            if( info.getModificationDate() != null)  {
                Date date = info.getModificationDate().getTime();
                if( date.getTime() >= 0 ){
                    document.add(FieldUtil.Text("ModificationDate", DateField.dateToString( date ) ) );
                }
            }
            if( info.getProducer() != null ){
                document.add( FieldUtil.Text( "Producer", info.getProducer() ) );
            }
            if( info.getSubject() != null ){
              document.add( FieldUtil.Text( "Subject", info.getSubject() ) );
            }
            if( info.getTitle() != null ){
              FieldUtil.setTitle(document, info.getTitle());
            }
            if( info.getTrapped() != null ) {
                document.add( FieldUtil.Text( "Trapped", info.getTrapped() ) );
            }
        }
        catch(Throwable t) {}
        finally {
            if( pdfDocument != null ) {

View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);


      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
      
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
      
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));

View Full Code Here

          LogManager.getInstance().logMessage(ex);
          throw new IOException(ex.getMessage());
        }
      }
  
      PDDocumentInformation info = doc.getDocumentInformation();
      PDDocumentCatalog catlog = doc.getDocumentCatalog();
      COSDocument cosDoc = doc.getDocument();    
      
      ctx.fireStartParseEvent("pdf-meta");
  
      COSArray array = cosDoc.getDocumentID();
      if(array != null && array.size() == 2) {
        ctx.fireParseEvent("doc-id", ((COSString) array.get(0)).getHexString());
        ctx.fireParseEvent("iteration-id", ((COSString) array.get(1)).getHexString());
      }
      
      if(array == null || array.size() != 2) {
        ctx.fireParseEvent("original", "unknown");
      } 
      else {
        boolean orig = ((COSString) array.get(0)).getHexString().equals(((COSString) array.get(1)).getHexString());
        ctx.fireParseEvent("original", orig);
      }
      
      fireSpecialNull(ctx, "title", info.getTitle());
      fireSpecialNull(ctx, "language", catlog.getLanguage());
      fireSpecialNull(ctx, "author", info.getAuthor());
      fireSpecialNull(ctx, "creator", info.getCreator());
      fireSpecialNull(ctx, "subject", info.getSubject());
      fireSpecialNull(ctx, "producer", info.getProducer());
      fireSpecialNull(ctx, "keywords", info.getKeywords());    
      
      ctx.fireStartParseEvent("creation-date");
      fireDate(ctx, info.getCreationDate());
      ctx.fireEndParseEvent("creation-date");
      
      ctx.fireStartParseEvent("modified-date");
      fireDate(ctx, info.getModificationDate());
      ctx.fireEndParseEvent("modified-date");
      
      ctx.fireParseEvent("has-forms", catlog.getAcroForm() != null);
      ctx.fireParseEvent("has-metadata-stream", catlog.getMetadata() != null);
      ctx.fireParseEvent("has-outline", catlog.getDocumentOutline() != null);
      ctx.fireParseEvent("has-threads", catlog.getThreads().size() > 0);
      ctx.fireParseEvent("tagged", catlog.getMarkInfo() != null);
      fireSpecialNull(ctx, "page-layout", catlog.getPageLayout());
      fireSpecialNull(ctx, "page-mode", catlog.getPageMode());
      fireSpecialNull(ctx, "trapped", info.getTrapped());
      
      fireSpecialNull(ctx, "version", Float.toString(cosDoc.getVersion()));
      
      ctx.fireStartParseEvent("security");
      ctx.fireParseEvent("encrypted", encrypted);

View Full Code Here

0 1 2

TOP

Related Classes of org.pdfbox.pdmodel.PDDocumentInformation

net.fp.rp.search.back.extractor.PdfDataExtractor

net.nutch.parse.pdf.PdfParser

net.sf.jabref.util.XMPUtil

nz.govt.natlib.adapter.pdfbox.PDFBoxAdapter

org.apache.nutch.parse.pdf.PdfParser

org.apache.tika.parser.pdf.PDFParser

org.dspace.content.packager.PDFPackager

org.jab.docsearch.converters.PDFConverter

org.pdfbox.cos.COSDictionary

org.pdfbox.cos.COSName

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.