package org.sf.mustru.filters;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import org.apache.log4j.Logger;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.encryption.DocumentEncryption;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.util.PDFTextStripper;
import org.sf.mustru.docs.IndexableDoc;
//import org.sf.mustru.docs.TextDoc;
import org.sf.mustru.utils.*;
/**
 * Extract text and metadata from a PDF file using the PDFBox library and
 * store the results in an {@link IndexableDoc}.
 */
public class PdfHandler implements HandlerInterface
{
  static Logger logger = Logger.getLogger(PdfHandler.class.getName());

  /** Password tried when the PDF reports that it is encrypted. */
  private static final String BLANK_PASSWORD = "";

  /**
   * empty constructor
   */
  public PdfHandler() { super(); }

  /**
   * Convert a PDF file into text and save PDF fields in an IndexableDoc object.
   * <p>
   * The file type ("text") and file name are always set on <code>doc</code>.
   * When parsing, decryption (with a blank password) or text extraction fails,
   * the failure is logged and the method returns without setting the contents.
   * A failure while reading the metadata is logged at info level only; contents
   * that were already extracted are kept.
   *
   * @param ifile path of the PDF file to read
   * @param doc   document object that receives the contents, author, title,
   *              keywords (stored as metadata) and subject (stored as summary)
   */
  public void getDocument(String ifile, IndexableDoc doc)
  {
    doc.setFileType("text");
    doc.setFileName(ifile);
    logger.info("Extracting text from PDF file " + ifile);

    COSDocument cosDoc = openDocument(ifile);
    if (cosDoc == null)
    { return; } // the failure has already been logged by openDocument

    // One PDDocument wrapper is shared by the text and metadata steps so that
    // it can be closed exactly once in the finally block below.
    PDDocument pdDoc = null;
    try
    {
      // decrypt the PDF document, if it is encrypted -- use a blank password
      try
      {
        if (cosDoc.isEncrypted())
        { new DocumentEncryption(cosDoc).decryptDocument(BLANK_PASSWORD); }
      }
      catch (CryptographyException e)
      { logger.error("Could not decrypt PDF doc: " + ifile + " " + e.getMessage(), e);
        return; }
      catch (InvalidPasswordException e)
      { logger.error("Could not decrypt PDF doc: " + ifile + " " + e.getMessage(), e);
        return; }
      catch (IOException e)
      { logger.error("Could not decrypt PDF doc: " + ifile + " " + e.getMessage(), e);
        return; }

      //*-- extract PDF document's textual content
      String docText = null;
      try
      {
        pdDoc = new PDDocument(cosDoc);
        docText = new PDFTextStripper().getText(pdDoc);
      }
      catch (OutOfMemoryError exc)
      { // Stop here, like every other extraction failure, instead of going on
        // to read metadata from a document that is about to be closed.
        logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage(), exc);
        return; }
      catch (Exception e)
      { logger.error("Cannot get text from PDF document " + ifile + " " + e.getMessage(), e);
        return; }

      //*-- Extract the entire text and save in the contents
      if (docText != null)
      { doc.setContents(new StringBuffer(StringTools.filterChars(docText))); }

      //*-- Extract PDF document's meta-data
      try
      {
        logger.info("Extracting metadata from PDF file " + ifile);
        PDDocumentInformation docInfo = pdDoc.getDocumentInformation();
        String author = StringTools.filterChars(docInfo.getAuthor());
        String title = StringTools.filterChars(docInfo.getTitle());
        String keywords = StringTools.filterChars(docInfo.getKeywords());
        String summary = StringTools.filterChars(docInfo.getSubject());
        if ((author != null) && (!author.equals(""))) { doc.setAuthor(author); }
        if ((title != null) && (!title.equals(""))) { doc.setTitle(title); }
        if ((keywords != null) && (!keywords.equals(""))) { doc.setMetadata(keywords); }
        if ((summary != null) && (!summary.equals(""))) { doc.setSummary(summary); }
      }
      catch (OutOfMemoryError e)
      { logger.info("Ran out of memory for PDF file metadata: " + ifile + " " + e.getMessage()); }
      catch (Exception e)
      { logger.info("Did not get PDF document metadata: " + ifile + " " + e.getMessage()); }
    }
    finally
    {
      // Single clean-up point for every exit path above.
      closePDDocument(pdDoc);
      closeCOSDocument(cosDoc);
    }
  } //*-- end of getDocument

  /**
   * Open and parse the PDF file. The input stream is always closed before
   * returning, whether or not parsing succeeded.
   *
   * @param ifile path of the PDF file
   * @return the parsed document, or null when the file could not be read or
   *         parsed (the reason is logged)
   */
  private COSDocument openDocument(String ifile)
  {
    InputStream in = null;
    try
    {
      in = new FileInputStream(new File(ifile));
      COSDocument parsed = parseDocument(in);
      if (parsed == null)
      { logger.error("Could not parse PDF document " + ifile + " parser returned no document"); }
      return parsed;
    }
    catch (OutOfMemoryError exc)
    { logger.error("Ran out of memory for " + ifile + " or could be corrupt file " + exc.getMessage(), exc);
      return null; }
    catch (IOException e)
    { logger.error("Cannot read PDF document " + ifile + " " + e.getMessage(), e);
      return null; }
    catch (Exception e)
    { logger.error("Could not parse PDF document " + ifile + " " + e.getMessage(), e);
      return null; }
    finally
    {
      if (in != null)
      { try { in.close(); }
        catch (IOException e)
        { logger.warn("Could not close input stream for " + ifile + " " + e.getMessage(), e); }
      }
    }
  }

  /**
   * Run the PDFBox parser over the stream and return the resulting document.
   *
   * @param is stream positioned at the start of the PDF data
   * @return the document produced by the parser
   * @throws IOException if the parser fails
   */
  private static COSDocument parseDocument(InputStream is) throws IOException
  {
    PDFParser parser = new PDFParser(is);
    parser.parse();
    return parser.getDocument();
  }

  /**
   * Close the low-level document if it is not null; a failure to close is
   * logged and otherwise ignored.
   */
  private void closeCOSDocument(COSDocument cosDoc)
  {
    if (cosDoc != null)
    { try { cosDoc.close(); }
      catch (IOException e)
      { logger.warn("Could not close COSDocument: " + e.getMessage(), e); }
    }
  }

  /**
   * Close the high-level document if it is not null; a failure to close is
   * logged and otherwise ignored.
   */
  private void closePDDocument(PDDocument pdDoc)
  {
    if (pdDoc != null)
    { try { pdDoc.close(); }
      catch (IOException e)
      { logger.warn("Could not close PDDocument: " + e.getMessage(), e); }
    }
  }
}