Package org.pdfbox.pdmodel

Examples of org.pdfbox.pdmodel.PDDocumentInformation


                String title = "";
                String summary = "";

                //get the additional data
                try {
                    PDDocumentInformation pdfinfo = document.getDocumentInformation();

                    if (!Util.isEmpty(pdfinfo.getAuthor())) {
                        author = pdfinfo.getAuthor();
                    }

                    if (!Util.isEmpty(pdfinfo.getTitle())) {
                        title = pdfinfo.getTitle();
                    }

                    if (!Util.isEmpty(pdfinfo.getSubject())) {
                        summary = pdfinfo.getSubject();
                    }
                } catch (Exception eR) {
                    String message = MessageUtil.getMessage("extractor.pdf.metadatamissing",
                            new Object[] { info.getUri() });
                    logger.info(message);
View Full Code Here


                log.error("parse() failed", ioe);
                throw new ConverterException("PDFConverter::parse() failed", ioe);
            }

            // get the meta data
            PDDocumentInformation info = document.getDocumentInformation();
            documentTitle = info.getTitle();
            documentAuthor = info.getAuthor();
            documentKeywords = info.getKeywords();
            if (document != null) {
                documentText = output.toString();
            }
        }
        catch (IOException ioe) {
View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      // pdf.getPageCount();
      // info.getAuthor()
      // info.getSubject()
      // info.getKeywords()
View Full Code Here

             *   Subject -> description.abstract
             *   Keywords -> subject.other
             *    date is java.util.Calendar
             */
            PDDocument pd = new PDDocument(cos);
            PDDocumentInformation docinfo = pd.getDocumentInformation();
            String title = docinfo.getTitle();

            // sanity check: item must have a title.
            if (title == null)
                throw new MetadataValidationException("This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
            log.debug("PDF Info dict title=\""+title+"\"");
            item.addDC("title", null, "en", title);
            String value;
            Calendar date;
            if ((value = docinfo.getAuthor()) != null)
            {
                item.addDC("contributor", "author", null, value);
                log.debug("PDF Info dict author=\""+value+"\"");
            }
            if ((value = docinfo.getCreator()) != null)
                item.addDC("description", "provenance", "en",
                              "Application that created the original document: "+value);
            if ((value = docinfo.getProducer()) != null)
                item.addDC("description", "provenance", "en",
                              "Original document converted to PDF by: "+value);
            if ((value = docinfo.getSubject()) != null)
                item.addDC("description", "abstract", null, value);
            if ((value = docinfo.getKeywords()) != null)
                item.addDC("subject", "other", null, value);

            // Take either CreationDate or ModDate as "date.created",
            // Too bad there's no place to put "last modified" in the DC.
            Calendar calValue;
            if ((calValue = docinfo.getCreationDate()) == null)
                calValue = docinfo.getModificationDate();
            if (calValue != null)
                item.addDC("date", "created", null,
                             (new DCDate(calValue.getTime())).toString());
            item.update();
        }
View Full Code Here

        }
    }

    private void extractMetadata(PDDocument document, Metadata metadata)
            throws TikaException {
        PDDocumentInformation info = document.getDocumentInformation();
        addMetadata(metadata, Metadata.TITLE, info.getTitle());
        addMetadata(metadata, Metadata.AUTHOR, info.getAuthor());
        addMetadata(metadata, Metadata.CREATOR, info.getCreator());
        addMetadata(metadata, Metadata.KEYWORDS, info.getKeywords());
        addMetadata(metadata, "producer", info.getProducer());
        addMetadata(metadata, Metadata.SUBJECT, info.getSubject());
        addMetadata(metadata, "trapped", info.getTrapped());
        try {
            addMetadata(metadata, "created", info.getCreationDate());
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
        try {
            Calendar modified = info.getModificationDate();
            addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
        } catch (IOException e) {
            // Invalid date format, just ignore
        }
    }
View Full Code Here

            FieldUtil.setRaw(document,contents);
          FieldUtil.setContent(document, contents);
            FieldUtil.setSummary(document, StringUtil.max(contents,SUMMERY_SIZE),false);
         
         
            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            if( info.getAuthor() != null)  {
                FieldUtil.setAuthor(document, info.getAuthor());
            }
            if( info.getCreationDate() != null )
            {
                Date date = info.getCreationDate().getTime();
                if( date.getTime() >= 0 )  {
                    document.add(FieldUtil.Text("CreationDate", DateField.dateToString( date ) ) );
                }
            }
            if( info.getCreator() != null ){
                document.add( FieldUtil.Text( "Creator", info.getCreator() ) );
            }
            if( info.getKeywords() != null ){
                FieldUtil.setKeywords(document, info.getKeywords());
            }
            if( info.getModificationDate() != null)  {
                Date date = info.getModificationDate().getTime();
                if( date.getTime() >= 0 ){
                    document.add(FieldUtil.Text("ModificationDate", DateField.dateToString( date ) ) );
                }
            }
            if( info.getProducer() != null ){
                document.add( FieldUtil.Text( "Producer", info.getProducer() ) );
            }
            if( info.getSubject() != null ){
              document.add( FieldUtil.Text( "Subject", info.getSubject() ) );
            }
            if( info.getTitle() != null ){
              FieldUtil.setTitle(document, info.getTitle());
            }
            if( info.getTrapped() != null ) {
                document.add( FieldUtil.Text( "Trapped", info.getTrapped() ) );
            }
        }
        catch(Throwable t) {}
        finally {
            if( pdfDocument != null ) {
View Full Code Here

      // collect text
      PDFTextStripper stripper = new PDFTextStripper();
      text = stripper.getText(pdf);

      // collect title
      PDDocumentInformation info = pdf.getDocumentInformation();
      title = info.getTitle();
      // more useful info, currently not used. please keep them for future use.
      metadata.add(Metadata.PAGE_COUNT, String.valueOf(pdf.getPageCount()));
      metadata.add(Metadata.AUTHOR, info.getAuthor());
      metadata.add(Metadata.SUBJECT, info.getSubject());
      metadata.add(Metadata.KEYWORDS, info.getKeywords());
      metadata.add(Metadata.CREATOR, info.getCreator());
      metadata.add(Metadata.PUBLISHER, info.getProducer());
     
      //TODO: Figure out why we get a java.io.IOException: Error converting date:1-Jan-3 18:15PM
      //error here
     
      //metadata.put(DATE, dcDateFormatter.format(info.getCreationDate().getTime()));
View Full Code Here

          LogManager.getInstance().logMessage(ex);
          throw new IOException(ex.getMessage());
        }
      }
 
      PDDocumentInformation info = doc.getDocumentInformation();
      PDDocumentCatalog catlog = doc.getDocumentCatalog();
      COSDocument cosDoc = doc.getDocument();   
     
      ctx.fireStartParseEvent("pdf-meta");
 
      COSArray array = cosDoc.getDocumentID();
      if(array != null && array.size() == 2) {
        ctx.fireParseEvent("doc-id", ((COSString) array.get(0)).getHexString());
        ctx.fireParseEvent("iteration-id", ((COSString) array.get(1)).getHexString());
      }
     
      if(array == null || array.size() != 2) {
        ctx.fireParseEvent("original", "unknown");
      }
      else {
        boolean orig = ((COSString) array.get(0)).getHexString().equals(((COSString) array.get(1)).getHexString());
        ctx.fireParseEvent("original", orig);
      }
     
      fireSpecialNull(ctx, "title", info.getTitle());
      fireSpecialNull(ctx, "language", catlog.getLanguage());
      fireSpecialNull(ctx, "author", info.getAuthor());
      fireSpecialNull(ctx, "creator", info.getCreator());
      fireSpecialNull(ctx, "subject", info.getSubject());
      fireSpecialNull(ctx, "producer", info.getProducer());
      fireSpecialNull(ctx, "keywords", info.getKeywords());   
     
      ctx.fireStartParseEvent("creation-date");
      fireDate(ctx, info.getCreationDate());
      ctx.fireEndParseEvent("creation-date");
     
      ctx.fireStartParseEvent("modified-date");
      fireDate(ctx, info.getModificationDate());
      ctx.fireEndParseEvent("modified-date");
     
      ctx.fireParseEvent("has-forms", catlog.getAcroForm() != null);
      ctx.fireParseEvent("has-metadata-stream", catlog.getMetadata() != null);
      ctx.fireParseEvent("has-outline", catlog.getDocumentOutline() != null);
      ctx.fireParseEvent("has-threads", catlog.getThreads().size() > 0);
      ctx.fireParseEvent("tagged", catlog.getMarkInfo() != null);
      fireSpecialNull(ctx, "page-layout", catlog.getPageLayout());
      fireSpecialNull(ctx, "page-mode", catlog.getPageMode());
      fireSpecialNull(ctx, "trapped", info.getTrapped());
     
      fireSpecialNull(ctx, "version", Float.toString(cosDoc.getVersion()));
     
      ctx.fireStartParseEvent("security");
      ctx.fireParseEvent("encrypted", encrypted);
View Full Code Here

TOP

Related Classes of org.pdfbox.pdmodel.PDDocumentInformation

Copyright © 2018 www.massapicom. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.