/*
************************************************************************************
* Copyright (C) 2001-2011 encuestame: system online surveys Copyright (C) 2009
* encuestame Development Team.
* Licensed under the Apache Software License version 2.0
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
************************************************************************************
*/
package org.encuestame.business.search;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
import java.util.Iterator;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLException;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRichTextString;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
/**
* Indexer File description.
* @author Morales, Diana Paola paolaATencuestame.org
* @since Apr 5, 2011
*/
public class IndexerFile {
/** Attachment text content. **/
protected static final String CONTENT = "content";
/** Attachment full path. **/
protected static final String FULLPATH = "fullpath";
/** Attachment file name. **/
protected static final String FILENAME = "filename";
/** Attachment Id. **/
protected static final String DOCUMENTID = "documentId";
/** Attachment upload date. **/
protected static final String UPLOAD_DATE = "uploadDate";
/** Attachment type. **/
protected static final String DOCUMENT_TYPE = "documentType";
/** Attachment title. **/
protected static final String ATTACHMENT_TITLE = "title";
/** Log. **/
private static final Log log = LogFactory.getLog(IndexerFile.class);
/** Auto commit option. **/
private boolean autoCommit = true;
/**
* Create standard lucene document
* @param attachFile
* @return {@link Document} doc
*/
public static Document createStandardLuceneDocument(AttachmentIndex attachFile) {
Document doc = new Document();
doc.add(new Field(CONTENT, attachFile.getContent(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(FULLPATH, attachFile.getFilepath(), Field.Store.YES, Field.Index.NO));
doc.add(new Field(FILENAME, attachFile.getFilename(), Field.Store.YES, Field.Index.NO));
doc.add(new Field(DOCUMENTID, attachFile.getDocumentId().toString(), Field.Store.YES, Field.Index.NO));
doc.add(new Field(UPLOAD_DATE, attachFile.getUploadDate().toString(), Field.Store.YES, Field.Index.NO));
doc.add(new Field(DOCUMENT_TYPE, attachFile.getDocumentType(), Field.Store.YES, Field.Index.NO));
doc.add(new Field(ATTACHMENT_TITLE, attachFile.getTitle(), Field.Store.YES, Field.Index.NO));
return doc;
}
/**
* Add files to index
* @param attachment
*/
public static void addToIndex(final AttachmentIndex attachment, final IndexWriterManager indexWriter) {
try {
long start = System.currentTimeMillis();
indexWriter.openIndexWriter();
IndexerFile.addDocumentToIndex(attachment, indexWriter);
log.debug("Add to search index for topic " + attachment.getFilename() + " in " + ((System.currentTimeMillis() - start) / 1000.000) + " s.");
} catch (Exception e) {
log.error("Exception while adding topic " + attachment.getFilename(), e);
}
}
/**
* Add document to index.
* @param documentAttachment
* @throws IOException
*/
private static void addDocumentToIndex(final AttachmentIndex documentAttachment, final IndexWriterManager indexWriter)
throws IOException {
Document standardLuceneDocument = createStandardLuceneDocument(documentAttachment);
indexWriter.getIndexWriter().addDocument(standardLuceneDocument);
}
/**
* Delete attachment from index.
* @param topic
*/
public void deleteAttachmentFromIndex(AttachmentIndex attachmentIndex, final IndexWriterManager indexWriter ) {
try {
long start = System.currentTimeMillis();
// delete the current document
indexWriter.getIndexWriter();
this.deleteFromIndex(attachmentIndex, indexWriter);
log.debug("Delete from search index for topic " + attachmentIndex.getFilename() + " in " + ((System.currentTimeMillis() - start) / 1000.000) + " s.");
} catch (Exception e) {
log.error("Exception while adding topic " + attachmentIndex.getFilename(), e);
}
}
/**
* Delete Document from index
* @param topic
* @throws IOException
*/
private void deleteFromIndex(AttachmentIndex attachmentIndex, final IndexWriterManager indexWriter) throws IOException {
indexWriter.getIndexWriter().deleteDocuments(new Term(FILENAME, attachmentIndex.getFilename()));
}
/**
* Commit into lucene index.
* @param commitNow
* @throws IOException
*/
private void commit(final boolean commitNow, final IndexWriterManager indexWriter) throws IOException {
if (commitNow) {
indexWriter.getIndexWriter().commit();
}
}
/**
* Create Attachment Document.
* @param file
* @return
* @throws IOException
*/
public static AttachmentIndex createAttachmentDocument(final File file, final Long attachmentId) throws IOException{
final String path = file.getCanonicalPath();
final String fileExtension = SearchUtils.getExtension(path);
final String filename = file.getName();
String contentText = "";
AttachmentIndex attachmentIndexBean = new AttachmentIndex();
log.debug("Creating attachment document type --> "+fileExtension);
if ("docx".equals(fileExtension)) {
XWPFWordExtractor parserDoc;
try {
//1- Parsear word Document
parserDoc = IndexerFile.parseWordDocument(file);
//2- Extract word document content
contentText = IndexerFile.extractContentWordDocument(parserDoc);
//3- Set values to Attachment Index
} catch (POIXMLException e) {
log.error("Fail createAttachmentDocument POIXMLException --> " + e);
} catch (Exception e) {
log.error("Fail createAttachmentDocument Exception --> " + e);
}
} else if ("pdf".equals(fileExtension)) {
PDDocument parsePdf;
parsePdf = IndexerFile.parsePdfDocument(file);
try {
contentText = IndexerFile.extractContentPdfDocument(parsePdf);
} catch (Exception e) {
log.error("Fail createAttachmentDocument PDF Exception --> " + e);
}
}
else if ("xls".equals(fileExtension) ) {
HSSFWorkbook parseSpreadsheets;
try {
parseSpreadsheets = IndexerFile.parseSpreadsheetsDocument(file);
contentText = extractContentSpreadsheetsDocument(parseSpreadsheets);
} catch (Exception e) {
log.error("Fail createAttachmentDocument spreadsheets Exception --> " + e);
}
}
else if ("txt".equals(fileExtension) ) {
contentText = "Document text file";
}
attachmentIndexBean.setContent(contentText);
attachmentIndexBean.setFilepath(path);
attachmentIndexBean.setFilename(filename);
attachmentIndexBean.setDocumentId(attachmentId);
attachmentIndexBean.setUploadDate(new Date());
attachmentIndexBean.setDocumentType(fileExtension);
attachmentIndexBean.setTitle("ENCUESTAME - TITLE");
return attachmentIndexBean;
}
/**
* Parse Word Document.
* @param file
* @return
* @throws POIXMLException
* @throws Exception
*/
public static XWPFWordExtractor parseWordDocument(final File file) throws POIXMLException, Exception {
InputStream is = new FileInputStream(file);
XWPFWordExtractor wde = null;
try {
XWPFDocument wd = new XWPFDocument(is);
wde = new XWPFWordExtractor(wd);
log.debug("Parse Word Document --------------------------> ");
} catch (Exception e) {
log.error("ERROR parse Word Document-------->"+ e);
}
return wde;
}
/**
* Extract word document content.
* @param wde
* @return
*/
public static String extractContentWordDocument(final XWPFWordExtractor wde){
String bodyText = null;
try {
bodyText = wde.getText();
} catch (Exception e) {
log.error("ERROR extracting content Word Document-------->"+ e);
}
return bodyText;
}
/**
* Parse pdf Document.
* @param file
* @return
* @throws IOException
*/
public static PDDocument parsePdfDocument(final File file) throws IOException {
InputStream is = new FileInputStream(file);
COSDocument cosDoc = null;
PDDocument pdDoc = null;
try {
cosDoc = SearchUtils.parseDocument(is);
pdDoc = new PDDocument(cosDoc);
} catch (IOException e) {
// TODO Auto-generated catch block
log.error(e);
}
finally {
if( pdDoc == null ) {
log.error("PdDocument is null");
} else {
pdDoc.close();
}
}
return pdDoc;
}
/**
* Extract content in PDF Document.
* @param pdfDoc
* @return
* @throws Exception
*/
public static String extractContentPdfDocument(final PDDocument pdfDoc) throws Exception {
String docText = null;
try {
PDFTextStripper stripper = new PDFTextStripper();
docText = stripper.getText(pdfDoc);
log.debug("Extract content pdf document leng ----> "+ docText.length());
}
finally {
if( docText == null ) {
log.error("**************** PDF content is null *********************");
}
}
return docText;
}
/**
*
* @param author
* @param title
* @param producer
* @param subject
* @return
*/
public static AttachmentIndex addMetadatatoBean(final String author, final String title, final String producer,
final String subject){
AttachmentIndex attachmentPdfMetadata = new AttachmentIndex();
if (StringUtils.isNotEmpty(author)) {
attachmentPdfMetadata.setAuthor(author);
}
if (StringUtils.isNotEmpty(title)) {
attachmentPdfMetadata.setTitle(title);
}
if (StringUtils.isNotEmpty(producer)) {
attachmentPdfMetadata.setProducer(producer);
}
if (StringUtils.isNotEmpty(subject)) {
attachmentPdfMetadata.setSubject(subject);
}
return attachmentPdfMetadata;
}
/**
* Parse spreadsheets documents.
* @param file
* @return
* @throws Exception
*/
public static HSSFWorkbook parseSpreadsheetsDocument(final File file) throws Exception {
InputStream is = new FileInputStream(file);
POIFSFileSystem fileSystem = new POIFSFileSystem(is);
HSSFWorkbook workBook = new HSSFWorkbook(fileSystem);
return workBook;
}
/**
* Extract spreadsheets content.
* @param workBook
* @return
* @throws Exception
*/
public static String extractContentSpreadsheetsDocument(final HSSFWorkbook workBook) throws Exception {
StringBuilder contents = new StringBuilder();
for (int i = 0; i < workBook.getNumberOfSheets(); i++) {
HSSFSheet sheet = workBook.getSheetAt(i);
Iterator<Row> rows = sheet.rowIterator();
while (rows.hasNext()) {
HSSFRow row = (HSSFRow) rows.next();
// Display the row number
log.debug(row.getRowNum());
Iterator<Cell> cells = row.cellIterator();
while (cells.hasNext()) {
HSSFCell cell = (HSSFCell) cells.next();
// Display the cell number of the current Row
switch (cell.getCellType()) {
case HSSFCell.CELL_TYPE_NUMERIC: {
log.debug(String.valueOf(cell
.getNumericCellValue()));
contents.append(
String.valueOf(cell.getNumericCellValue()))
.append(" ");
break;
}
case HSSFCell.CELL_TYPE_STRING: {
HSSFRichTextString richTextString = cell
.getRichStringCellValue();
log.debug(richTextString.toString());
contents.append(richTextString.toString()).append(" ");
break;
}
case HSSFCell.CELL_TYPE_BOOLEAN: {
contents.append(
String.valueOf(cell.getBooleanCellValue()))
.append(" ");
break;
}
}
}
}
}
return contents.toString();
}
}