package org.sf.mustru.filters;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.log4j.Logger;
import org.sf.mustru.docs.IndexableDoc;
//import org.sf.mustru.docs.TextDoc;
import org.sf.mustru.utils.*;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
/**
 * A class to extract text from OpenOffice files using the Xerces XML parser.
 *
 * The parsed text is collected in an instance level buffer that is cleared at the
 * start of every getDocument call, so one instance should not run getDocument
 * from several threads at the same time.
 */
public class SxwHandler implements HandlerInterface
{
  static Logger logger = Logger.getLogger(SxwHandler.class.getName());

  /** Text collected by the SAX handler for the document currently being processed. */
  StringBuffer content = new StringBuffer();

  /**
   * empty constructor
   */
  public SxwHandler() { super(); }

  /**
   * Extract text from an OpenOffice file -
   * a. Open the zipped OpenOffice file.
   * b. Copy its content.xml entry to a temporary file in Constants.OFFICEDIR.
   * c. Use an XML parser to extract the text.
   *
   * If the file cannot be opened an error is logged and doc is left untouched.
   * If content.xml cannot be extracted or parsed the error is logged and doc
   * receives whatever text was collected up to that point (possibly none).
   *
   * @param ifile path of the OpenOffice file
   * @param doc document that receives the extracted text, the file type "text"
   *            and the file name
   */
  public void getDocument (String ifile, IndexableDoc doc)
  {
    //*-- start with an empty buffer so that text from an earlier call is not returned again
    content.setLength(0);

    //*-- a. open the OpenOffice file
    logger.info("Extracting from OpenOffice file " + ifile);
    ZipFile zFile;
    try
    { zFile = new ZipFile(new File(ifile)); }
    catch (IOException e)
    { logger.error("Could not open OpenOffice file " + ifile + " " + e.getMessage(), e);
      return; }

    //*-- use the hash code of the current thread in the temporary file name
    int threadCode = Thread.currentThread().hashCode();
    String outfile = Constants.OFFICEDIR + File.separator + "TEMP_content_" + threadCode + ".xml";

    try
    {
      //*-- b. extract the content.xml file and write to the temporary file
      boolean extracted;
      try
      { extracted = extractContentXml(zFile, ifile, outfile); }
      finally
      {
        try { zFile.close(); }
        catch (IOException exc)
        { logger.warn("Could not close OpenOffice file " + ifile + " " + exc.getMessage(), exc); }
      }

      //*-- c. parse the temporary file, but only if it was written completely
      if (extracted) { parseContentXml(outfile); }
    }
    finally
    {
      //*-- remove the temporary content.xml file, even when extraction or parsing failed
      File tfile = new File(outfile);
      if (tfile.exists() && !tfile.delete())
      { logger.warn("Could not delete temporary file " + outfile); }
    }

    //*-- return the contents
    String bodyText = StringTools.filterChars(content.toString());
    doc.setContents ( new StringBuffer(bodyText) );
    doc.setFileType("text"); doc.setFileName(ifile);
  }

  /**
   * Copy the content.xml entry of the zip file, byte for byte, to outfile.
   * The bytes are not decoded, so the temporary file keeps the encoding of the entry.
   *
   * @param zFile   the opened OpenOffice file; it is not closed by this method
   * @param ifile   name of the OpenOffice file, used in log messages only
   * @param outfile name of the file to write
   * @return true if the entry was found and written completely, false otherwise
   */
  private boolean extractContentXml(ZipFile zFile, String ifile, String outfile)
  {
    ZipEntry zEntry = zFile.getEntry("content.xml");
    if (zEntry == null)
    { logger.error("Could not read text from OpenOffice file: " + ifile + " no content.xml entry found");
      return false; }

    InputStream xmlStream = null; OutputStream outp = null;
    boolean copied = false;
    try
    {
      xmlStream = zFile.getInputStream(zEntry);
      outp = new FileOutputStream(outfile);
      byte[] buffer = new byte[8192];
      int count;
      while ( (count = xmlStream.read(buffer)) != -1 ) { outp.write(buffer, 0, count); }
      outp.flush();
      copied = true;
    }
    catch (IOException e)
    { logger.error("Could not read text from OpenOffice file: " + ifile + " " + e.getMessage(), e); }
    finally
    {
      //*-- close the streams separately so that a failure on one does not leak the other
      if (outp != null)
      {
        try { outp.close(); }
        catch (IOException exc)
        { //*-- a failed close may mean that the file is incomplete
          copied = false;
          logger.error("Could not close temporary file " + outfile + " " + exc.getMessage(), exc); }
      }
      if (xmlStream != null)
      {
        try { xmlStream.close(); }
        catch (IOException exc)
        { logger.warn("Could not close content.xml stream of " + ifile + " " + exc.getMessage(), exc); }
      }
    }
    return copied;
  }

  /**
   * Parse the extracted content.xml file with the Xerces SAX parser. The text is
   * appended to the content buffer by OOHandler. Failures are logged and not rethrown.
   *
   * NOTE(review): the parser configuration is left at its defaults; presumably the
   * OpenOffice DTDs in Constants.OFFICEDIR must stay resolvable - confirm before
   * restricting DTDs or external entities for untrusted input.
   *
   * @param outfile name of the XML file to parse
   */
  private void parseContentXml(String outfile)
  {
    try
    {
      XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
      parser.setContentHandler(new OOHandler());
      parser.parse(outfile);
    }
    catch (OutOfMemoryError oe)
    { logger.error("Ran out of memory for " + outfile + " or could be corrupt file " + oe.getMessage(), oe); }
    catch (SAXException se)
    { logger.error("Could not parse XML file " + outfile + " " + se.getMessage(), se); }
    catch (IOException ie)
    { logger.error("Could not read XML file " + outfile + " " + ie.getMessage(), ie); }
  }

  /**
   * Inner class to append the text of the OpenOffice file
   */
  class OOHandler extends DefaultHandler
  {
    /**
     * Append a single space followed by the reported characters to the content buffer.
     */
    public void characters(char[] ch, int start, int length)
    { content.append(' ').append(ch, start, length); }
  }
}