Package nz.govt.natlib.adapter.pdfbox

Source Code of nz.govt.natlib.adapter.pdfbox.PDFBoxAdapter

package nz.govt.natlib.adapter.pdfbox;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import nz.govt.natlib.adapter.DataAdapter;
import nz.govt.natlib.fx.FXUtil;
import nz.govt.natlib.fx.ParserContext;
import nz.govt.natlib.meta.log.LogManager;
import nz.govt.natlib.meta.log.LogMessage;

import org.pdfbox.cos.COSArray;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.cos.COSString;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentCatalog;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;

public class PDFBoxAdapter extends DataAdapter {
 
  public static final String[] schemeVersionNames = {"Unsupported", "Standard 40bits", "Standard 40+bits", "Unpublished", "Special"};

  public boolean acceptsFile(File file) {
    boolean valid = checkFileHeader(file, toHexFilter("%PDF")) ||
            checkFileHeader(file, toHexFilter("%pdf"));
    //LogManager.getInstance().logMessage(LogMessage.WORTHLESS_CHATTER, file.getName() + (valid ? " is " : " is not ") + "a PDF");
    return valid;
  }

  public void adapt(File file, ParserContext ctx) throws IOException {
    ctx.fireStartParseEvent("pdf");
    writeFileInfo(file, ctx);
   
    PDDocument doc = null;
   
    try {
      doc = PDDocument.load(file);
      boolean encrypted = doc.isEncrypted();
     
      // Attempt to decrypt the file if it is encrypted
      if(doc.isEncrypted()) {
        try {
          doc.decrypt(null);
        }
        catch(InvalidPasswordException ex) {
          LogManager.getInstance().logMessage(LogMessage.WORTHLESS_CHATTER, "Invalid Password - PDF may be encrypted with a non-empty password");
          LogManager.getInstance().logMessage(ex);
          throw new IOException(ex.getMessage());
        }
        catch(CryptographyException ex) {
          LogManager.getInstance().logMessage(LogMessage.WORTHLESS_CHATTER, "Cyptography Exception parsing file.");         
          LogManager.getInstance().logMessage(ex);
          throw new IOException(ex.getMessage());
        }
      }
 
      PDDocumentInformation info = doc.getDocumentInformation();
      PDDocumentCatalog catlog = doc.getDocumentCatalog();
      COSDocument cosDoc = doc.getDocument();   
     
      ctx.fireStartParseEvent("pdf-meta");
 
      COSArray array = cosDoc.getDocumentID();
      if(array != null && array.size() == 2) {
        ctx.fireParseEvent("doc-id", ((COSString) array.get(0)).getHexString());
        ctx.fireParseEvent("iteration-id", ((COSString) array.get(1)).getHexString());
      }
     
      if(array == null || array.size() != 2) {
        ctx.fireParseEvent("original", "unknown");
      }
      else {
        boolean orig = ((COSString) array.get(0)).getHexString().equals(((COSString) array.get(1)).getHexString());
        ctx.fireParseEvent("original", orig);
      }
     
      fireSpecialNull(ctx, "title", info.getTitle());
      fireSpecialNull(ctx, "language", catlog.getLanguage());
      fireSpecialNull(ctx, "author", info.getAuthor());
      fireSpecialNull(ctx, "creator", info.getCreator());
      fireSpecialNull(ctx, "subject", info.getSubject());
      fireSpecialNull(ctx, "producer", info.getProducer());
      fireSpecialNull(ctx, "keywords", info.getKeywords());   
     
      ctx.fireStartParseEvent("creation-date");
      fireDate(ctx, info.getCreationDate());
      ctx.fireEndParseEvent("creation-date");
     
      ctx.fireStartParseEvent("modified-date");
      fireDate(ctx, info.getModificationDate());
      ctx.fireEndParseEvent("modified-date");
     
      ctx.fireParseEvent("has-forms", catlog.getAcroForm() != null);
      ctx.fireParseEvent("has-metadata-stream", catlog.getMetadata() != null);
      ctx.fireParseEvent("has-outline", catlog.getDocumentOutline() != null);
      ctx.fireParseEvent("has-threads", catlog.getThreads().size() > 0);
      ctx.fireParseEvent("tagged", catlog.getMarkInfo() != null);
      fireSpecialNull(ctx, "page-layout", catlog.getPageLayout());
      fireSpecialNull(ctx, "page-mode", catlog.getPageMode());
      fireSpecialNull(ctx, "trapped", info.getTrapped());
     
      fireSpecialNull(ctx, "version", Float.toString(cosDoc.getVersion()));
     
      ctx.fireStartParseEvent("security");
      ctx.fireParseEvent("encrypted", encrypted);
      if (encrypted) {
        PDEncryptionDictionary encDict = doc.getEncryptionDictionary();
 
        PDFPermissions perms = new PDFPermissions(
            encDict.getCOSDictionary().getInt("P"),
            encDict.getCOSDictionary().getInt("V"));
       
        fireSpecialNull(ctx, "scheme", encDict.getFilter());
        fireSpecialNull(ctx, "scheme-type", schemeVersionNames[encDict.getVersion()]);
        ctx.fireParseEvent("key-length", encDict.getLength());
        ctx.fireParseEvent("readonly", !perms.allowModify());
        ctx.fireParseEvent("allow-print", perms.allowPrint());
        ctx.fireParseEvent("allow-copy", perms.allowCopy());
        ctx.fireParseEvent("allow-notes", perms.allowTextNotes());
        ctx.fireParseEvent("user-password", encDict.getUserKey() != null);
        ctx.fireParseEvent("owner-password", encDict.getOwnerKey() != null);
      }
      ctx.fireEndParseEvent("security");   
     
      ctx.fireEndParseEvent("pdf-meta");
    }
    catch(OutOfMemoryError er) {
      LogManager.getInstance().logMessage(LogMessage.CRITICAL, "Out of memory processing PDF.");
      throw new IOException("Out of memory");
      //throw er;
    }
    catch(RuntimeException ex) {
      LogManager.getInstance().logMessage(LogMessage.CRITICAL, ex.getMessage());
      throw ex;
    }
    catch(Error er) {
      LogManager.getInstance().logMessage(LogMessage.CRITICAL, "Out of memory processing PDF.");
      throw er;
    }
    finally {
      if(doc != null) {
        doc.close();
      }
      ctx.fireEndParseEvent("pdf");     
    }
  }
 
  private static void fireDate(ParserContext ctx, Calendar cal) {
    SimpleDateFormat dateFormatter = new SimpleDateFormat();
    if (cal != null) {
      dateFormatter.applyPattern(FXUtil.dateFormat);
      ctx.fireParseEvent("DATE", dateFormatter.format(cal.getTime()));
      ctx.fireParseEvent("DATEPATTERN", FXUtil.dateFormat);
      dateFormatter.applyPattern(FXUtil.timeFormat);
      ctx.fireParseEvent("TIME", dateFormatter.format(cal.getTime()));
      ctx.fireParseEvent("TIMEPATTERN", FXUtil.timeFormat);
    } else {
      ctx.fireParseEvent("unavailable");
    }   
  }
 
 
  private static void fireSpecialNull(ParserContext ctx, String name, String value) {
    if (value == null) {
      ctx.fireParseEvent(name, "");
    } else {
      ctx.fireParseEvent(name, value);
    }
  } 


  public String getVersion() {
    return "1.0";
  }
 
  public String getOutputType() {
    return "pdf.dtd";
  }

  public String getInputType() {
    return "application/pdf";
  }

  public String getName() {
    return "PDF Text Adapter";
  }

  public String getDescription() {
    return "Adapts all PDF Formats from 1.1 to 1.5.  Handles encrypted PDFs with no user password set";
 
}
TOP

Related Classes of nz.govt.natlib.adapter.pdfbox.PDFBoxAdapter

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.