package com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.functors.AndPredicate;
import org.apache.commons.collections.functors.TruePredicate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSBoolean;
import org.pdfbox.cos.COSDictionary;
import org.pdfbox.cos.COSFloat;
import org.pdfbox.cos.COSInteger;
import org.pdfbox.cos.COSName;
import org.pdfbox.cos.COSNull;
import org.pdfbox.cos.COSString;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.common.PDRectangle;
import org.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.pdfbox.pdmodel.encryption.SecurityHandlersManager;
import org.pdfbox.pdmodel.encryption.StandardSecurityHandler;
import org.pdfbox.pdmodel.font.PDFont;
import org.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.pdfbox.pdmodel.interactive.action.type.PDActionGoTo;
import org.pdfbox.pdmodel.interactive.action.type.PDActionURI;
import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.pdfbox.pdmodel.interactive.form.PDField;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.util.PDFTextStripperByArea;
import org.pdfbox.util.TextPosition;
import com.canoo.webtest.plugins.pdftest.htmlunit.PDFEncryptionPermission;
import com.canoo.webtest.plugins.pdftest.htmlunit.PDFField;
import com.canoo.webtest.plugins.pdftest.htmlunit.PDFInvalidPasswordException;
import com.canoo.webtest.plugins.pdftest.htmlunit.PDFPage;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.WebWindow;
/**
 * Implementation of {@link PDFPage} based on <a href="http://www.pdfbox.org/">PDFBox</a>.
 * <p>
 * The PDF is parsed when the page object is constructed. {@link #cleanUp()} closes the
 * parsed document; any later access through {@link #getPDFDocument()} parses the web
 * response again.
 * </p>
 * @author Etienne Studer
 * @author Paul King
 * @author Marc Guillemot
 */
public class PdfBoxPDFPage implements PDFPage {
    private static final COSName INFO_PROPERTY_TITLE = COSName
            .getPDFName("Title"); // title of document

    // Bookkeeping of created / not yet cleaned-up instances. Only written in this class.
    private static int counter = 0;
    private static int allocated = 0;

    private final WebWindow webWindow_;
    private final WebResponse webResponse_;
    private PDDocument pdfDocument_;
    private List bookmarks_;
    private boolean cleanUpCalled;

    /**
     * Closes the underlying PDF document (if one is loaded) and marks this page as cleaned up,
     * so that the next call to {@link #getPDFDocument()} loads the document again.
     * @throws IOException if closing the document fails
     */
    public void cleanUp() throws IOException {
        cleanUpCalled = true;
        allocated--;
        if (pdfDocument_ != null) {
            try {
                pdfDocument_.close();
            }
            finally {
                // never keep a reference to a closed document: avoids closing it twice
                pdfDocument_ = null;
            }
        }
    }

    /**
     * Creates the page and immediately tries to parse the response content as PDF.
     * A parse failure is logged by {@link #loadPDFDocument()} and surfaces later,
     * when {@link #getPDFDocument()} is called.
     * @param webResponse the response holding the PDF content
     * @param webWindow the window this page is loaded in
     */
    public PdfBoxPDFPage(final WebResponse webResponse, final WebWindow webWindow) {
        webWindow_ = webWindow;
        webResponse_ = webResponse;
        pdfDocument_ = loadPDFDocument();
        counter++;
        allocated++;
    }

    /**
     * Parses the content of the web response as a PDF document.
     * @return the parsed document, or <code>null</code> if parsing failed (the failure is logged)
     */
    protected PDDocument loadPDFDocument()
    {
        try {
            return PDDocument.load(getWebResponse().getContentAsStream());
        }
        catch (final IOException e) {
            getLog().warn("Failed parsing PDF document " + getWebResponse().getRequestUrl() + ": " + e.getMessage(), e);
        }
        return null;
    }

    /**
     * Return the log object for this class
     * @return The log object
     */
    protected final Log getLog() {
        return LogFactory.getLog(getClass());
    }

    /**
     * Gets the dictionary backing the document information.
     * @return the information dictionary, or an empty dictionary if there is none
     */
    private COSDictionary getInfoDictionary() {
        final COSDictionary infoProperties = getPDFDocument()
                .getDocumentInformation().getDictionary();
        return infoProperties != null ? infoProperties : new COSDictionary();
    }

    /**
     * Verifies that a key is present in a dictionary.
     * @param key the key to look for
     * @param properties the dictionary to inspect
     * @throws IllegalArgumentException if the key is not present
     */
    private static void assertKeyExists(COSName key, COSDictionary properties) {
        if (properties.keyList().contains(key)) {
            return;
        }
        throw new IllegalArgumentException("Specified property key '"
                + key.getName() + "' does not exist.");
    }

    /**
     * Gets the title stored in the document information.
     * @return the title
     * @throws IllegalArgumentException if the document information has no "Title" entry
     */
    public String getDocumentTitle() {
        final COSDictionary info = getInfoDictionary();
        assertKeyExists(INFO_PROPERTY_TITLE, info);
        // go through getDictionaryObject/stringValue (like getInfoProperty does) rather than
        // blindly casting the raw item to COSString
        return stringValue(info.getDictionaryObject(INFO_PROPERTY_TITLE));
    }

    public WebWindow getEnclosingWindow() {
        return webWindow_;
    }

    public WebResponse getWebResponse() {
        return webResponse_;
    }

    /**
     * Nothing to do: the document is already parsed by the constructor.
     */
    public void initialize() throws IOException {
        // TODO Auto-generated method stub
    }

    public int getNumberOfPages() {
        return getPDFDocument().getNumberOfPages();
    }

    /**
     * Gets the PDF document, loading it again if {@link #cleanUp()} has been called
     * since it was last loaded.
     * @return the document
     * @throws RuntimeException if the PDF document couldn't be parsed
     */
    protected PDDocument getPDFDocument() {
        if (cleanUpCalled) {
            pdfDocument_ = loadPDFDocument();
            cleanUpCalled = false;
        }
        if (pdfDocument_ == null) {
            throw new RuntimeException("Can't work on pdf document as it couldn't get parsed");
        }
        return pdfDocument_;
    }

    /**
     * Gets all the form fields of the document.
     * @return the wrapped fields
     */
    public List getFields() {
        return getFields(TruePredicate.INSTANCE);
    }

    /**
     * Decrypts the document with the given password.
     * @param password the password to use
     * @throws PDFInvalidPasswordException if PDFBox reports an invalid password or a cryptography problem
     * @throws RuntimeException for any other failure
     */
    public void decrypt(String password)
    {
        try {
            getPDFDocument().decrypt(password);
        }
        catch (final InvalidPasswordException e) {
            throw new PDFInvalidPasswordException(e);
        }
        catch (final CryptographyException e) {
            throw new PDFInvalidPasswordException(e);
        }
        catch (final Exception e) {
            throw new RuntimeException("Problem decrypting the document", e);
        }
    }

    public boolean isEncrypted() {
        return getPDFDocument().isEncrypted();
    }

    /**
     * Extracts the text of a range of pages.
     * @param startPage the first page to extract
     * @param endPage the last page to extract
     * @return the extracted text
     */
    public String getText(int startPage, int endPage) {
        return getTextInternal(startPage, endPage);
    }

    /**
     * Extracts the text of a range of pages using the default settings of the text stripper.
     * @param startPage the first page to extract
     * @param endPage the last page to extract
     * @return the extracted text
     * @throws RuntimeException if the extraction fails
     */
    protected String getTextInternal(int startPage, int endPage) {
        try {
            final PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setStartPage(startPage);
            textStripper.setEndPage(endPage);
            return textStripper.getText(getPDFDocument());
        }
        catch (final IOException e) {
            throw new RuntimeException("Problem extracting text", e);
        }
    }

    /**
     * Gets the form fields accepted by the filter. For a field that has kids, the kids are
     * tested (and returned) instead of the field itself; only one level of kids is visited.
     * @param filter the predicate applied to each candidate {@link PDField}
     * @return the wrapped fields accepted by the filter (empty if the document has no form)
     * @throws RuntimeException if reading the fields fails
     */
    protected List getFields(final Predicate filter)
    {
        final List response = new ArrayList();
        final PDAcroForm acroForm = getPDFDocument().getDocumentCatalog().getAcroForm();
        if (acroForm == null) {
            return response;
        }

        try {
            for (final Iterator iter = acroForm.getFields().iterator(); iter.hasNext();) {
                final PDField field = (PDField) iter.next();
                final List kids = field.getKids();
                if (kids != null && !kids.isEmpty()) {
                    for (final Iterator iterKids = kids.iterator(); iterKids.hasNext();) {
                        final PDField childField = (PDField) iterKids.next();
                        if (filter.evaluate(childField)) {
                            response.add(PdfBoxPDFField.wrap(childField));
                        }
                    }
                }
                else if (filter.evaluate(field)) {
                    response.add(PdfBoxPDFField.wrap(field));
                }
            }
        }
        catch (final IOException e) {
            throw new RuntimeException("Failed reading fields", e);
        }
        return response;
    }

    /**
     * Gets the fields with the given name.
     * NOTE(review): <code>type</code> is currently not used for filtering - confirm whether it should be.
     * @param name the field name
     * @param type the field type (ignored by this implementation)
     * @return the matching fields
     */
    public List getFields(final String name, final PDFField.Type type)
    {
        return getFields(PdfBoxPDFField.FieldPredicate.buildNamePredicate(name));
    }

    /**
     * Gets the fields with the given name on the given page.
     * NOTE(review): <code>type</code> is currently not used for filtering - confirm whether it should be.
     * @param name the field name
     * @param pageNumber the page number
     * @param type the field type (ignored by this implementation)
     * @return the matching fields
     */
    public List getFields(final String name, final int pageNumber, final PDFField.Type type)
    {
        return getFields(name, pageNumber);
    }

    /**
     * Tests a permission bit of the encryption dictionary.
     * NOTE(review): this dereferences the encryption dictionary without a null check; it
     * presumably must only be called on encrypted documents - confirm.
     * @param permission the permission to test
     * @return <code>true</code> if the corresponding bit is set
     * @throws IllegalArgumentException for an unknown permission
     * @throws RuntimeException if the encryption dictionary can't be read
     */
    public boolean hasPermission(final PDFEncryptionPermission permission) {
        // with release 0.7.3, PDDocument.getCurrentAccessPermission() doesn't work,
        // therefore the permission bits are tested directly
        final int bitValue = permissionBit(permission);

        final PDEncryptionDictionary info;
        try {
            info = getPDFDocument().getEncryptionDictionary();
        }
        catch (final IOException e) {
            throw new RuntimeException("Can't read permissions", e);
        }
        return (info.getPermissions() & (1 << (bitValue - 1))) != 0;
    }

    /**
     * Maps a permission to its (1-based) bit position in the permissions value.
     * @param permission the permission
     * @return the bit position
     * @throws IllegalArgumentException for an unknown permission
     */
    private static int permissionBit(final PDFEncryptionPermission permission) {
        // values taken from deprecated class PDStandardEncryption
        final int PRINT_BIT = 3;
        final int MODIFICATION_BIT = 4;
        final int EXTRACT_BIT = 5;
        final int MODIFY_ANNOTATIONS_BIT = 6;
        final int FILL_IN_FORM_BIT = 9;
        final int EXTRACT_FOR_ACCESSIBILITY_BIT = 10;
        final int ASSEMBLE_DOCUMENT_BIT = 11;
        final int DEGRADED_PRINT_BIT = 12;

        if (PDFEncryptionPermission.ASSEMBLY.equals(permission)) {
            return ASSEMBLE_DOCUMENT_BIT;
        }
        if (PDFEncryptionPermission.COPY.equals(permission)) {
            return EXTRACT_BIT;
        }
        if (PDFEncryptionPermission.DEGRADED_PRINTING.equals(permission)) {
            return DEGRADED_PRINT_BIT;
        }
        if (PDFEncryptionPermission.FILL_IN.equals(permission)) {
            return FILL_IN_FORM_BIT;
        }
        if (PDFEncryptionPermission.MODIFY_ANNOTATIONS.equals(permission)) {
            return MODIFY_ANNOTATIONS_BIT;
        }
        if (PDFEncryptionPermission.MODIFY_CONTENTS.equals(permission)) {
            return MODIFICATION_BIT;
        }
        if (PDFEncryptionPermission.PRINTING.equals(permission)) {
            return PRINT_BIT;
        }
        if (PDFEncryptionPermission.SCREEN_READERS.equals(permission)) {
            return EXTRACT_FOR_ACCESSIBILITY_BIT;
        }
        throw new IllegalArgumentException("Unknown pdf permission: " + permission);
    }

    /**
     * Gets an entry of the encryption dictionary as string.
     * @param key the name of the entry
     * @return the value, or <code>null</code> if the entry (or the whole encryption dictionary) is missing
     */
    public String getEncryptProperty(final String key)
    {
        final COSDictionary encryptProperties = getPDFDocument().getDocument().getEncryptionDictionary();
        if (encryptProperties == null) {
            // same contract as getInfoProperty: no dictionary, no value
            return null;
        }
        return stringValue(encryptProperties.getDictionaryObject(key));
    }

    /**
     * Converts a COS object to a string.
     * @param element the object to convert (may be <code>null</code>)
     * @return <code>null</code> for a <code>null</code> element, <code>"null"</code> for a {@link COSNull},
     * the natural string form for strings, names, booleans, integers and floats,
     * and <code>String.valueOf(element)</code> for anything else
     */
    static String stringValue(final COSBase element)
    {
        if (element == null) {
            return null;
        }
        if (element instanceof COSString) {
            return ((COSString) element).getString();
        }
        if (element instanceof COSName) {
            return ((COSName) element).getName();
        }
        if (element instanceof COSBoolean) {
            return String.valueOf(((COSBoolean) element).getValue());
        }
        if (element instanceof COSInteger) {
            return String.valueOf(((COSInteger) element).intValue());
        }
        if (element instanceof COSFloat) {
            return String.valueOf(((COSFloat) element).floatValue());
        }
        if (element instanceof COSNull) {
            return "null";
        }
        return String.valueOf(element);
    }

    /**
     * Gets the length value of the encryption dictionary.
     * NOTE(review): dereferences the encryption dictionary without a null check; presumably
     * only meaningful for encrypted documents - confirm.
     * @return the length as reported by the encryption dictionary
     * @throws RuntimeException if the encryption dictionary can't be read
     */
    public int getEncryptionStrength()
    {
        try {
            return getPDFDocument().getEncryptionDictionary().getLength();
        }
        catch (final IOException e) {
            throw new RuntimeException("Failed reading encryption strength", e);
        }
    }

    /**
     * Gets an entry of the document information as string.
     * @param key the name of the entry
     * @return the value, or <code>null</code> if the entry (or the whole information dictionary) is missing
     */
    public String getInfoProperty(final String key) {
        final COSDictionary properties = getPDFDocument().getDocumentInformation().getDictionary();
        if (properties == null) {
            return null;
        }
        final COSName pdfName = COSName.getPDFName(key);
        return stringValue(properties.getDictionaryObject(pdfName));
    }

    /**
     * Checks whether the given password is the user password of the document.
     * @param password the password to test
     * @return <code>true</code> if the security handler accepts it as user password
     * @throws RuntimeException if the verification fails for any reason
     */
    public boolean isUserPassword(final String password)
    {
        try {
            return isPassword(password, true);
        }
        catch (final Exception e) {
            throw new RuntimeException("Failed verifying user password", e);
        }
    }

    /**
     * Asks the security handler whether the password is the user or the owner password,
     * using the values found in the encryption dictionary and the first document id.
     * @param password the password to test (converted to bytes with the platform default charset)
     * @param userPassword <code>true</code> to test against the user password,
     * <code>false</code> to test against the owner password
     * @return the answer of the security handler
     */
    private boolean isPassword(String password, boolean userPassword) throws IOException, BadSecurityHandlerException, CryptographyException {
        final StandardSecurityHandler secHandler = getSecurityHandler();
        final PDEncryptionDictionary dictionary = getPDFDocument().getEncryptionDictionary();
        final int dicPermissions = dictionary.getPermissions();
        final int dicRevision = dictionary.getRevision();
        final int dicLength = dictionary.getLength() / 8; // length is passed on divided by 8
        final COSString id = (COSString) getPDFDocument().getDocument().getDocumentID().getObject(0);
        final byte[] u = dictionary.getUserKey();
        final byte[] o = dictionary.getOwnerKey();
        final byte[] passwordBytes = password.getBytes();

        if (userPassword) {
            return secHandler.isUserPassword(passwordBytes, u,
                    o, dicPermissions, id.getBytes(), dicRevision, dicLength);
        }
        return secHandler.isOwnerPassword(passwordBytes, u,
                o, dicPermissions, id.getBytes(), dicRevision, dicLength);
    }

    /**
     * Looks up the security handler registered for the filter of the encryption dictionary.
     * @return the handler, cast to {@link StandardSecurityHandler}
     */
    private StandardSecurityHandler getSecurityHandler() throws IOException,
            BadSecurityHandlerException {
        final PDEncryptionDictionary dict = getPDFDocument().getEncryptionDictionary();
        return (StandardSecurityHandler) SecurityHandlersManager.getInstance().getSecurityHandler(dict.getFilter());
    }

    /**
     * Checks whether the given password is the owner password of the document.
     * @param password the password to test
     * @return <code>true</code> if the security handler accepts it as owner password
     * @throws RuntimeException if the verification fails for any reason
     */
    public boolean isOwnerPassword(final String password) {
        try {
            return isPassword(password, false);
        }
        catch (final Exception e) {
            throw new RuntimeException("Failed verifying owner password", e);
        }
    }

    /**
     * Gets the bookmarks of the document. They are extracted on first call and cached.
     * @return the bookmarks
     */
    public List getBookmarks()
    {
        if (bookmarks_ == null) {
            bookmarks_ = extractBookmarks();
        }
        return bookmarks_;
    }

    /**
     * Walks through the top level outline items; each one is added to the result,
     * directly followed by what its <code>getAllChildren()</code> returns.
     * @return the bookmarks (empty if the document has no outline)
     */
    private List extractBookmarks()
    {
        final List result = new ArrayList();
        final PDDocumentOutline outline = getPDFDocument().getDocumentCatalog().getDocumentOutline();
        if (outline == null) {
            return result;
        }

        for (PDOutlineItem child = outline.getFirstChild(); child != null; child = child.getNextSibling()) {
            final PdfBoxPDFBookmark topBookmark = new PdfBoxPDFBookmark(child, null);
            result.add(topBookmark);
            result.addAll(topBookmark.getAllChildren());
        }
        return result;
    }

    /**
     * Gets the fonts found in the resources of each page, together with the (1-based) page number.
     * @return the fonts
     * @throws RuntimeException if the fonts of a page can't be read
     */
    public List getFonts() {
        final List fonts = new ArrayList();
        final List pages = getPDFDocument().getDocumentCatalog().getAllPages();
        for (final ListIterator iter = pages.listIterator(); iter.hasNext();) {
            final PDPage page = (PDPage) iter.next();
            // nextIndex() *after* next() because page numbers start with 1 not 0
            final int pageNumber = iter.nextIndex();
            try {
                if (page.findResources() == null) {
                    continue; // a page without resources has no fonts
                }
                final Map pageFonts = page.findResources().getFonts();
                if (pageFonts == null) {
                    continue;
                }
                for (final Iterator fontIterator = pageFonts.values().iterator(); fontIterator.hasNext();) {
                    final PDFont font = (PDFont) fontIterator.next();
                    fonts.add(new PDFBoxPDFFont(font, pageNumber));
                }
            }
            catch (final IOException e) {
                throw new RuntimeException("Failed retrieving the fonts on page " + pageNumber, e);
            }
        }
        return fonts;
    }

    /**
     * Gets the fields of the given page.
     * @param pageNumber the page number
     * @return the matching fields
     */
    public List getFields(int pageNumber) {
        return getFields(PdfBoxPDFField.FieldPredicate.buildPageNumberPredicate(pageNumber));
    }

    /**
     * Gets the fields with the given name on the given page.
     * @param name the field name
     * @param pageNumber the page number
     * @return the matching fields
     */
    public List getFields(final String name, final int pageNumber) {
        final Predicate predicateName = PdfBoxPDFField.FieldPredicate.buildNamePredicate(name);
        final Predicate predicatePage = PdfBoxPDFField.FieldPredicate.buildPageNumberPredicate(pageNumber);
        return getFields(new AndPredicate(predicateName, predicatePage));
    }

    /**
     * Gets the fields with the given name.
     * @param name the field name
     * @return the matching fields
     */
    public List getFields(final String name) {
        return getFields(PdfBoxPDFField.FieldPredicate.buildNamePredicate(name));
    }

    /**
     * Gets the links from the document
     * @return the links
     */
    public List getLinks() {
        final List result = new ArrayList();
        final List allPages = getPDFDocument().getDocumentCatalog().getAllPages();
        for (final ListIterator iter = allPages.listIterator(); iter.hasNext();) {
            final PDPage page = (PDPage) iter.next();
            processPage(result, page, iter.nextIndex()); // 1-based page number
        }
        return result;
    }

    /**
     * Adds the URI links of one page to the result. Extraction is best effort: if the page
     * can't be read, a warning is logged and the page contributes no link.
     * @param result the list receiving the links
     * @param page the page to inspect
     * @param pageNum the (1-based) number of the page
     */
    private static void processPage(final List result, final PDPage page, final int pageNum) {
        try {
            final PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            final List linkAnnotations = new ArrayList();
            final List linkRegions = new ArrayList();
            extractAnnotations(page, stripper, linkAnnotations, linkRegions);
            stripper.extractRegions(page);

            final Map uriMap = new HashMap();
            final Map textMap = new HashMap();
            collateLinks(linkAnnotations, linkRegions, uriMap, textMap, stripper);

            // only regions having an URI produce a link; their text (if any) comes from textMap
            for (final Iterator it = uriMap.entrySet().iterator(); it.hasNext();) {
                final Map.Entry entry = (Map.Entry) it.next();
                result.add(new PDFBoxPDFLink((String) textMap.get(entry.getKey()), (String) entry.getValue(), pageNum));
            }
        }
        catch (final IOException e) {
            // don't fail the whole extraction for one page, but don't hide the problem either
            LogFactory.getLog(PdfBoxPDFPage.class).warn(
                    "Failed extracting links from page " + pageNum + ": " + e.getMessage(), e);
        }
    }

    /**
     * Fills the maps (both keyed by link region) with the URI and the text of the links.
     * The text of the i-th link is read from the stripper region named <code>i</code>,
     * as registered by {@link #extractAnnotations}.
     * @param linkAnnotations the link annotations of the page
     * @param linkRegions the regions of the links, in the same order as <code>linkAnnotations</code>
     * @param uriMap receives region -> URI for links with an URI action
     * @param textMap receives region -> text for links (URI or goto action) having some text
     * @param stripper the stripper holding the text extracted for each region
     */
    private static void collateLinks(final List linkAnnotations, final List linkRegions, final Map uriMap, final Map textMap, final PDFTextStripperByArea stripper) throws IOException {
        for (int j = 0; j < linkAnnotations.size(); j++) {
            final PDAnnotationLink link = (PDAnnotationLink) linkAnnotations.get(j);
            final PDAction action = link.getAction();
            final Object region = linkRegions.get(j);
            final String urlText = stripper.getTextForRegion(Integer.toString(j));
            final boolean hasText = urlText != null && urlText.length() > 0;

            if (action instanceof PDActionURI) {
                // internal links have no text
                if (hasText) {
                    textMap.put(region, urlText);
                }
                uriMap.put(region, ((PDActionURI) action).getURI());
            }
            else if (action instanceof PDActionGoTo) {
                // internal link text associated with goto
                if (hasText) {
                    textMap.put(region, urlText);
                }
            }
        }
    }

    /**
     * Collects the link annotations of a page and registers one stripper region per link.
     * The region of a link is named after the index of the link in <code>linkAnnotations</code>
     * (not after its index among all annotations), which is the name {@link #collateLinks}
     * uses to read the text back.
     * @param page the page to inspect
     * @param stripper the stripper in which the regions get registered
     * @param linkAnnotations receives the link annotations
     * @param linkRegions receives the region of each link, in the same order
     * @return all the annotations of the page
     */
    private static List extractAnnotations(final PDPage page, final PDFTextStripperByArea stripper, final List linkAnnotations, final List linkRegions) throws IOException {
        final List annotations = page.getAnnotations();
        for (int j = 0; j < annotations.size(); j++) {
            final PDAnnotation annot = (PDAnnotation) annotations.get(j);
            if (!(annot instanceof PDAnnotationLink)) {
                continue;
            }

            final PDRectangle rect = annot.getRectangle();
            //need to reposition link rectangle to match text space plus add
            //a little to account for descenders and the like
            final float x = rect.getLowerLeftX() - 1;
            float y = rect.getUpperRightY() - 1;
            final float width = rect.getWidth() + 2;
            final float height = rect.getHeight() + rect.getHeight() / 4;
            final int rotation = page.findRotation();
            if (rotation == 0) {
                final PDRectangle pageSize = page.findMediaBox();
                y = pageSize.getHeight() - y;
            }

            final Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
            // name the region by the position of the link in the link list: using j here
            // would break the lookup as soon as a non-link annotation precedes a link
            stripper.addRegion(Integer.toString(linkAnnotations.size()), awtRect);
            linkAnnotations.add(annot);
            linkRegions.add(awtRect);
        }
        return annotations;
    }

    /**
     * Extracts the text of the whole document.
     * @param fragmentSeparator the separator placed between the fragments of a line (not used in normal mode)
     * @param lineSeparator the separator placed after each line
     * @param pageSeparator the separator placed after each page
     * @param mode the extraction mode; <code>MODE_NORMAL</code> uses the plain text stripper,
     * anything else regroups the text fragments by position
     * @return the text
     */
    public String getText(final String fragmentSeparator, final String lineSeparator, final String pageSeparator, final String mode) {
        // the stripper counts pages from 1: starting at 0 selects nothing more, it only
        // costs a useless pass over the document in fragment mode
        return getText(1, getNumberOfPages(), fragmentSeparator, lineSeparator, pageSeparator, mode);
    }

    /**
     * Extracts the text of a range of pages, see
     * {@link #getText(String, String, String, String)} for the meaning of the other parameters.
     * In fragment mode, pages producing no text are left out (no page separator is written for them).
     */
    private String getText(final int startPage, final int endPage, final String fragmentSeparator,
            final String lineSeparator, final String pageSeparator, final String mode)
    {
        if (MODE_NORMAL.equals(mode)) {
            return getTextInternal(startPage, endPage, lineSeparator, pageSeparator);
        }

        final StringBuffer buf = new StringBuffer();
        for (int page = startPage; page <= endPage; page++) {
            final List fragments = getFragments(page, fragmentSeparator, lineSeparator);
            final String pageText = collateFragments(fragments, fragmentSeparator, lineSeparator);
            if (pageText.length() > 0) {
                buf.append(pageText);
                buf.append(pageSeparator);
            }
        }
        return buf.toString();
    }

    /**
     * Extracts the text of a range of pages with the plain text stripper, using custom separators.
     * @throws RuntimeException if anything goes wrong during the extraction
     */
    private String getTextInternal(final int startPage, final int endPage,
            final String lineSeparator, final String pageSeparator)
    {
        final StringWriter output = new StringWriter();
        try {
            final PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setPageSeparator(pageSeparator);
            textStripper.setLineSeparator(lineSeparator);
            textStripper.setStartPage(startPage);
            textStripper.setEndPage(endPage);
            textStripper.writeText(getPDFDocument(), output);
            return output.toString();
        }
        catch (final Exception e) {
            throw new RuntimeException("Error while extracting text from document.", e);
        }
        finally {
            IOUtils.closeQuietly(output);
        }
    }

    /**
     * Collects the {@link TextPosition}s that the text stripper reports for one page.
     * @param page the number of the page
     * @param fragmentSeparator not used by this method
     * @param lineSeparator the line separator configured on the stripper
     * @return the text positions, in the order reported by the stripper
     * @throws RuntimeException if anything goes wrong during the extraction
     */
    public List getFragments(int page, final String fragmentSeparator, final String lineSeparator)
    {
        final List fragments = new ArrayList();
        final StringWriter output = new StringWriter();
        try {
            // the stripper is only used to walk through the page: each text position is
            // captured instead of being written, the writer's content is never read
            final PDFTextStripper textStripper = new PDFTextStripper()
            {
                protected void showCharacter(TextPosition textPosition) {
                    fragments.add(textPosition);
                }
            };
            textStripper.setLineSeparator(lineSeparator);
            textStripper.setStartPage(page);
            textStripper.setEndPage(page);
            textStripper.writeText(getPDFDocument(), output);
            return fragments;
        }
        catch (final Exception e) {
            throw new RuntimeException("Error while extracting text from document.", e);
        }
        finally {
            IOUtils.closeQuietly(output);
        }
    }

    /**
     * Builds the text of a page from its fragments: group by line, merge adjacent
     * fragments, then join everything with the separators.
     */
    private String collateFragments(List fragments, String fragmentSeparator, String lineSeparator) {
        final Map linesOfText = new TreeMap();
        regroup(fragments, linesOfText);
        final Map linesOfString = new TreeMap();
        coalesce(linesOfText, linesOfString);
        return fragmentsToString(linesOfString, fragmentSeparator, lineSeparator);
    }

    /**
     * Converts each line of text positions into a line of strings, see {@link #coalesceLine(Map)}.
     * @param linesOfText y -> (x -> TextPosition)
     * @param linesOfString receives y -> (x -> String)
     */
    private void coalesce(Map linesOfText, Map linesOfString) {
        for (final Iterator it = linesOfText.entrySet().iterator(); it.hasNext();) {
            final Map.Entry line = (Map.Entry) it.next();
            linesOfString.put(line.getKey(), coalesceLine((Map) line.getValue()));
        }
    }

    /**
     * Merges the adjacent text positions of a line into strings.
     * @param input x -> TextPosition, iterated in key order
     * @return x of the first fragment of a run -> concatenated characters of the run
     */
    private Map coalesceLine(Map input) {
        final Map output = new TreeMap();
        TextPosition lastFragment = null;
        StringBuffer run = null;
        Object runKey = null;
        for (final Iterator it = input.entrySet().iterator(); it.hasNext();) {
            final Map.Entry entry = (Map.Entry) it.next();
            final TextPosition thisFragment = (TextPosition) entry.getValue();
            if (lastFragment != null && adjacent(lastFragment, thisFragment)) {
                run.append(thisFragment.getCharacter());
            }
            else {
                if (run != null) {
                    output.put(runKey, run.toString()); // previous run is complete
                }
                run = new StringBuffer().append(thisFragment.getCharacter());
                runKey = entry.getKey();
            }
            lastFragment = thisFragment;
        }
        if (run != null) {
            output.put(runKey, run.toString()); // flush the last run
        }
        return output;
    }

    /**
     * Tells whether a fragment starts less than 2 units after the end of the previous one
     * (end = x + width * x scale).
     */
    private boolean adjacent(final TextPosition lastFragment, final TextPosition thisFragment)
    {
        final int TOLERANCE = 2;
        final float lastEnd = lastFragment.getX() + lastFragment.getWidth() * lastFragment.getXScale();
        return thisFragment.getX() - lastEnd < TOLERANCE;
    }

    /**
     * Groups the fragments by line and position, using the integer part of their coordinates:
     * y -> (x -> TextPosition). A fragment replaces an earlier one having the same integer x and y.
     * @param fragments the text positions
     * @param lines receives the grouped fragments
     */
    private void regroup(final List fragments, final Map lines)
    {
        for (int i = 0; i < fragments.size(); i++) {
            final TextPosition textPosition = (TextPosition) fragments.get(i);
            final Integer y = new Integer((int) textPosition.getY());
            final Integer x = new Integer((int) textPosition.getX());
            Map pieces = (Map) lines.get(y);
            if (pieces == null) {
                pieces = new TreeMap();
                lines.put(y, pieces);
            }
            pieces.put(x, textPosition);
        }
    }

    /**
     * Joins the pieces of each line with the fragment separator and terminates
     * each line with the line separator.
     * @param linesOfString y -> (x -> String)
     */
    private String fragmentsToString(Map linesOfString, String fragmentSeparator, String lineSeparator) {
        final StringBuffer buf = new StringBuffer();
        for (final Iterator lit = linesOfString.values().iterator(); lit.hasNext();) {
            final Map pieces = (Map) lit.next();
            for (final Iterator pit = pieces.values().iterator(); pit.hasNext();) {
                buf.append((String) pit.next());
                if (pit.hasNext()) {
                    buf.append(fragmentSeparator);
                }
            }
            buf.append(lineSeparator);
        }
        return buf.toString();
    }
}