Source Code of de.oio.jpdfunit.document.pdflibimpl.PdfBoxAnalyser

/*
 * JPdfUnit- make your pdf green
 * Copyright (C) 2005 Orientation in Objects GmbH
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the Apache License Version 2.0.
 * There is a copy of this license along with this library.
 * Otherwise see terms of license at apache.org
 *
 * Feel free to contact us:
 *
 * jpdfunit-users@lists.sourceforge.net
 *
 * $Id: PdfBoxAnalyser.java,v 1.1 2009/12/14 12:07:10 sschaefe Exp $
 */


package de.oio.jpdfunit.document.pdflibimpl;


import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.LinkedList;
import java.util.List;


import org.pdfbox.cos.COSDocument;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDPage;
import org.pdfbox.pdmodel.PDResources;
import org.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
import org.pdfbox.pdmodel.font.PDFont;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.pdfbox.util.PDFTextStripper;


import de.oio.jpdfunit.document.util.PdfImplUtilResourceBundle;
import de.oio.jpdfunit.document.util.TextSearchType;
import de.oio.jpdfunit.document.util.TextSearcher;


/**
 * The PdfBoxAnalyser class is the adaptor class to the pdf library PDFBox. It
 * extends the PDFTextStripper class of the PDFBox and provides via the
 * PDDocument of PDFBox and the PDFTextStripper the user the functionalities to
 * get the different informations and content of the pdf document.
 * 
 * @author bbratkus
 * 
 */
class PdfBoxAnalyser extends PDFTextStripper {
  private static final String PARAMETER = PdfImplUtilResourceBundle
      .getString("PdfBoxAnalyser.inital");


  private static final String CANTDECRYPT = PdfImplUtilResourceBundle
      .getString("PdfBoxAnalyser.decrypt");


  private static final String NOCONTENT = PdfImplUtilResourceBundle
      .getString("PdfBoxAnalyser.content");


  private static final String NODOCINIT = PdfImplUtilResourceBundle
      .getString("PdfBoxAnalyser.docInital");


  private final transient PDDocument pdDocument;


  private transient StringBuffer textbuffer = null;


  private transient LinkedList fonts;


  private transient ArrayList bookMarkList;


  /**
   * This constructor uses a String parameter to instanciate the PDDocument.
   * 
   * @param file
   *            The path and the file as String. I.e.
   *            "/home/bbratkus/test.pdf".
   * @throws IOException
   */
  public PdfBoxAnalyser(String file) throws IOException {
    if ((file.equals("") || (file == null))) { //$NON-NLS-1$
      throw new IllegalArgumentException(PARAMETER);
    }
    try {
      pdDocument = PDDocument.load(file);
    } catch (IOException ioe) {
      throw new IllegalArgumentException(NODOCINIT);
    }
    setContentAsStringBuffer();
    getDocumentFonts();
  }


  /**
   * This constructor uses a InputStream as parameter to instanciate the
   * PDDocument.
   * 
   * @param pdfFileStream
   *            The Stream which the pdf file is within.
   * @throws IOException
   */
  public PdfBoxAnalyser(InputStream pdfStream) throws IOException {
    if (pdfStream == null) {
      throw new IllegalArgumentException(PARAMETER);
    }
    try {
      pdDocument = PDDocument.load(pdfStream);
    } catch (IOException ioe) {
      throw new IllegalArgumentException(NODOCINIT);
    }
    setContentAsStringBuffer();
    getDocumentFonts();
  }


  /**
   * This method sets the Content of the the PDDocument to a StringBuffer.
   * 
   * 
   * @throws IllegalArgumentException
   *             The method throws an IllegalArgumentException if if is not
   *             possible to get the content as StringBuffer of the
   *             PDDocument.
   */
  private void setContentAsStringBuffer() {
    if (!pdDocument.isEncrypted()) {
      try {
        this.textbuffer = new StringBuffer(getText(pdDocument));
      } catch (IOException e) {
        throw new IllegalArgumentException(NOCONTENT);
      }
    }
  }


  /**
   * 
   * Close the document.
   * 
   * @throws IllegalStateException
   *             The method throws an IllegalStateException if the framework
   *             is not able to close the document.
   */
  public void closeDocument() {
    try {
      pdDocument.close();
    } catch (IOException ioe) {
      throw new IllegalStateException(PdfImplUtilResourceBundle
          .getString("PdfBoxAnalyser.close"));
    }
  }


  /**
   * This method gets the number of pages of the PDDocument.
   * 
   * @return The page number of the document.
   * @throws IllegalStateException
   *             The method throws an IllegalStateException if it is not
   *             possible to get the page count.
   */
  public int countPages() {


    return pdDocument.getNumberOfPages();
  }


  /**
   * This method tries to decrypt the document via the user or the owner
   * password.
   * 
   * @param passwd
   *            The supposed owner or user password of the document.
   * @throws IllegalStateException
   *             The method throws an IllegalStateException if it is not
   *             possible to decrypt the document.
   * @throws IllegalArgumentException
   *             if the supposed pasword do not match the password of the pdf
   *             document.
   */
  public void decryptDocument(final String passwd) {
    try {
      if (passwd == null) {
        throw new IllegalArgumentException(PARAMETER);
      }
      pdDocument.decrypt(passwd);
      if (pdDocument.isEncrypted()) {
        throw new IllegalStateException(PdfImplUtilResourceBundle
            .getString("PdfBoxAnalyser.encrypted"));
      }
    } catch (CryptographyException e) {
      throw new IllegalStateException(CANTDECRYPT);
    } catch (IOException e) {
      throw new IllegalStateException(CANTDECRYPT);
    } catch (InvalidPasswordException e) {
      throw new IllegalArgumentException(PdfImplUtilResourceBundle
          .getString("PdfBoxAnalyser.passwd"));


    }
  }


  /**
   * The method gets the authors name of the PDDocumentInformation of the
   * PDDocument.
   * 
   * @return The authors name which is set in the document information.
   */
  public String getAuthor() {
    return pdDocument.getDocumentInformation().getAuthor();
  }


  /**
   * This methods files a String with the StringBuffer which holds the
   * content.
   * 
   * @return The hole content of a pdf document.
   */
  public String getContent() {
    setContentAsStringBuffer();
    return textbuffer.toString();
  }


  /**
   * This method get the content for a certain page.
   * 
   * @param page
   *            The page number of the page which content should be returned.
   *            The page count is 1-based.
   * @return The content of the selected page.
   * @throws IllegalArgumentException
   *             The method throws an IllegalArgumentException if the page
   *             number is smaller or equals 0.
   */
  public String getContentOnPage(final int page) {


    if ((page == 0) || (page < 0)) {
      throw new IllegalArgumentException(PARAMETER);
    }
    StringBuffer tmpBuffer = null;
    super.setStartPage(page);
    super.setEndPage(page);
    try {
      tmpBuffer = new StringBuffer(super.getText(pdDocument));
    } catch (IOException ioe) {
      throw new IllegalArgumentException(NOCONTENT);
    }
    return tmpBuffer.toString();
  }


  /**
   * The method gets the creator of the PDDocumentInformation of the
   * PDDocument.
   * 
   * @return The creator of the document.
   */
  public String getCreator() {
    return pdDocument.getDocumentInformation().getCreator();
  }


  /**
   * The method gets the creation date of the PDDocumentInformation of the
   * PDDocument.
   * 
   * @return The creation date which is set in the document information.
   */
  public Calendar getCreationDate() {
    try {
      return pdDocument.getDocumentInformation().getCreationDate();
    } catch (IOException e) {
      throw new IllegalStateException();
    }
  }


  /**
   * The method gets the encryption length of the PDEncryptionDictionary of
   * the PDDocument.
   * 
   * @return The actual length of the choosen encryption.
   */
  public int getEncryptionLength() {
    int length = 0;
    try {
      if (pdDocument.isEncrypted()) {
        final PDEncryptionDictionary dicen = pdDocument
            .getEncryptionDictionary();
        length = dicen.getLength();
      }
    } catch (IOException ioe) {
      length = 0;
    }
    return length;
  }


  /**
   * The method gets the first page where the text appears.
   * 
   * @param text
   *            The string or regular expression (constructs a regular
   *            expression matcher from a String by compiling it using a new
   *            instance of RECompiler) to search for in the document. Be
   *            carefull regex are greedy.
   * @param type
   *            The kind of search.
   * @return The first page the text was found. Returns -1 if the text can not
   *         be found
   */
  public int getFirstPageForContent(final String text,
      final TextSearchType type) {
    boolean isContent = false;
    if ((text == null) || (type == null)) {
      throw new IllegalArgumentException(PARAMETER);
    }


    String docContent = null;
    int value = -1;
    final TextSearcher textsearcher = type.getSearcher();
    final int pageCount = pdDocument.getNumberOfPages();
    for (int i = 1; i <= pageCount; i++) {
      docContent = this.getContentOnPage(i);
      isContent = textsearcher.isTextContent(text, docContent);
      if (isContent) {
        value = i;
      }
    }


    return value;
  }


  /**
   * The method gets the keywords of the PDDocumentInformation of the
   * PDDocument.
   * 
   * @return The keywords which are set in the document information.
   */
  public String getKeywords() {
    return pdDocument.getDocumentInformation().getKeywords();
  }


  /**
   * The method gets all the pages where the text appears.
   * 
   * @param text
   *            The string or regular expression (Constructs a regular
   *            expression matcher from a String by compiling it using a new
   *            instance of RECompiler.) to search for in the document. Be
   *            carefull regex are greedy.
   * @param type
   *            The kind of search. The type is related to the pages of the
   *            document.
   * @return The hole page numbers where the text was found in the document.
   *         If the text was found one time returns -1.
   * @see     de.oio.jpdfunit.document.Content
     * @deprecated As of version 0.93,
     * replaced by <code>Content.getListOfPagesForContent(String text, TextSearchType type)</code>.
     */
  public int[] getPagesForContent(final String text, final TextSearchType type) {
    int[] result;
    boolean isContent = false;
    if ((text == null) || (type == null)) {
      throw new IllegalArgumentException(PARAMETER);
    }
    String docContent = null;
    final TextSearcher textsearcher = type.getSearcher();
    final int pageCount = pdDocument.getNumberOfPages();
    result = new int[pageCount];
    boolean positive = false;
    for (int i = 1; i <= pageCount; i++) {
      docContent = this.getContentOnPage(i);
      isContent = textsearcher.isTextContent(text, docContent);
      if (isContent) {
        result[i] = i;
        positive = true;
      }
    }
    if(!positive) {
      result = new int[0];
    }
    return result;
  }
  /**
   * The method gets all the pages where the text appears.
   * 
   * @param text
   *            The string or regular expression (Constructs a regular
   *            expression matcher from a String by compiling it using a new
   *            instance of RECompiler.) to search for in the document. Be
   *            carefull regex are greedy.
   * @param type
   *            The kind of search. The type is related to the pages of the
   *            document.
   * @return The hole page numbers where the text was found in the document.
   *         If the text was found one time returns -1.
   */
  public List getListOfPagesForContent(final String text, final TextSearchType type) {
    boolean isContent = false;
    if ((text == null) || (type == null)) {
      throw new IllegalArgumentException(PARAMETER);
    }
    String docContent = null;
    final TextSearcher textsearcher = type.getSearcher();
    final int pageCount = pdDocument.getNumberOfPages();
    List pages = new ArrayList();
    for (int i = 1; i <= pageCount; i++) {
      docContent = this.getContentOnPage(i);
      isContent = textsearcher.isTextContent(text, docContent);
      if (isContent) {
        pages.add(new Integer(i));
      }
    }
    return pages;
  }
  
  /**
   * The method gets the producer of the PDDocumentInformation of the
   * PDDocument.
   * 
   * @return The producer which is set in the in information of the document.
   */
  public String getProducer() {
    return pdDocument.getDocumentInformation().getProducer();
  }


  /**
   * The method gets the subject of the PDDocumentInformation of the
   * PDDocument.
   * 
   * @return The subject of the document which is set in the document
   *         information.
   */
  public String getSubject() {
    return pdDocument.getDocumentInformation().getSubject();


  }


  /**
   * The method gets the title of the PDDocumentInformation of the PDDocument.
   * 
   * @return The title of the document which is set in the document
   *         information.
   */
  public String getTitle() {
    return pdDocument.getDocumentInformation().getTitle();
  }


  /**
   * The method gets the version of the COSDocument of the PDDocument.
   * 
   * @return The version of the tested pdf file.
   */
  public float getVersion() {
    final COSDocument cosDoc = pdDocument.getDocument();
    return cosDoc.getVersion();
  }


  /**
   * The method ckecks if the text is content.
   * 
   * @param text
   *            The expected text which should be content of the document,
   *            even here you can use a regular expression .
   * @param type
   *            The kind of search. The type is related to the pages of the
   *            document.
   * @return The method returns true if the text is found the first time.
   */
  public boolean isTextContent(final String text, final TextSearchType type) {
    boolean isContent = false;
    if ((text == null) || (type == null)) {
      throw new IllegalArgumentException(PARAMETER);
    }


    String docContent = null;


    final TextSearcher textsearcher = type.getSearcher();
    final int pageCount = pdDocument.getNumberOfPages();
    boolean returnBool = false;
    for (int i = 1; i <= pageCount; i++) {
      docContent = this.getContentOnPage(i);
      isContent = textsearcher.isTextContent(text, docContent);
      if (isContent) {
        returnBool = true;
      }
    }


    return returnBool;
  }


  /**
   * The method ckecks if the text is content on a certain page.
   * 
   * @param text
   *            The expected text or regular expression which should be
   *            content of the document.
   * @param type
   *            The kind of search. The type is related to the pages of the
   *            document.
   * @param page
   *            The page of the document which should be searched for the
   *            text. The page count is 1-based.
   * @return The method returns true if the text is found on the suggested
   *         page.
   */
  public boolean isTextContentOnPage(final String text,
      final TextSearchType type, final int page) {
    boolean isContent = false;
    if ((text == null) || (type == null) || (page == 0) || (page < 0)) {
      throw new IllegalArgumentException(PARAMETER);
    }


    String docContent = null;
    final TextSearcher textsearcher = type.getSearcher();
    docContent = this.getContentOnPage(page);
    isContent = textsearcher.isTextContent(text, docContent);
    return isContent;
  }


  /**
   * The method checks if the PDDocument is encrypted.
   * 
   * @return the method returns true if the document is encrypted.
   */
  public boolean isDocumentEncrypted() {
    return pdDocument.isEncrypted();
  }


  /**
   * The method checks if the supposed password is a password of the
   * PDDocument.
   * 
   * @param expected
   *            The expected owner password. The owner password is required
   *            for changing the attributes of the document.
   * 
   * @return The method returns true, if the expected password matches to the
   *         owner password of the document.
   */
  public boolean isOwnerPasswd(final String expected) {
    if (expected == null) {
      throw new IllegalArgumentException(PARAMETER);
    }
    boolean isPasswd = false;
    try {
      if (pdDocument.isEncrypted()) {
        isPasswd = pdDocument.isOwnerPassword(expected);
      }
    } catch (IOException e) {
      throw new IllegalStateException(CANTDECRYPT);
    } catch (CryptographyException e) {
      throw new IllegalStateException(CANTDECRYPT);
    }
    return isPasswd;
  }


  /**
   * The method checks if the supposed password is a password of the
   * PDDocument.
   * 
   * @param expected
   *            The expected user password. The user password is required for
   *            i.e. reading the document.
   * @return The method returns true, if the expected password matches to the
   *         user password of the document.
   */
  public boolean isUserPasswd(final String expected) {
    if (expected == null) {
      throw new IllegalArgumentException(PARAMETER);
    }
    boolean isPasswd = false;
    try {
      if (pdDocument.isEncrypted()) {
        isPasswd = pdDocument.isUserPassword(expected);
      }
    } catch (IOException e) {
      throw new IllegalStateException(CANTDECRYPT);
    } catch (CryptographyException e) {
      throw new IllegalStateException(CANTDECRYPT);
    }
    return isPasswd;
  }


  /**
   * 
   * @return A LinkedList with containing all Fonts of the Document, means the
   *         "names" and the types of a font relative to the page.
   * 
   * 
   */
  public LinkedList getAllFontsInDocument() {
    getDocumentFonts();
    return fonts;
  }


  private void getDocumentFonts() {
    fonts = new LinkedList();
    PDResources ress;
    LinkedList myFontList = null;
    if (!pdDocument.isEncrypted()) {
      try {
        for (int page = 0; page < pdDocument.getNumberOfPages(); page++) {
          ress = ((PDPage) (pdDocument.getDocumentCatalog()
              .getAllPages().get(page))).findResources();
          myFontList = new LinkedList(ress.getFonts().values());
          for (int i = 0; i < myFontList.size(); i++) {
            fonts.add(new PdfBoxFontAdapter(((PDFont) (myFontList
                .get(i))).getBaseFont(), ((PDFont) (myFontList
                .get(i))).getSubType(), page));
          }


        }
      } catch (IOException e) {
        throw new IllegalArgumentException(NOCONTENT);
      }
    }
  }


  /**
   * This method returns all bookmarks of a pdf document.
   * 
   * @return The llst containing the bookmarks.
   */
  public List getAllBookmarks() {
    getBookmarks();
    return bookMarkList;
  }


  private void getBookmarks() {
    bookMarkList = new ArrayList();
    PDDocumentOutline root = pdDocument.getDocumentCatalog()
        .getDocumentOutline();
    if (root != null) {
      PDOutlineItem item = root.getFirstChild();
      rekursionBookmarks(item);
    }
  }
  private void rekursionBookmarks(PDOutlineItem bla) {
    while (bla != null) {
      bookMarkList.add(bla.getTitle());
      PDOutlineItem child = bla.getFirstChild();
      rekursionBookmarks(child);
      bla = bla.getNextSibling();
    }
  }
}
Source Code of de.oio.jpdfunit.document.pdflibimpl.PdfBoxAnalyser

Related Classes of de.oio.jpdfunit.document.pdflibimpl.PdfBoxAnalyser