*/
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    POIFSFileSystem filesystem = new POIFSFileSystem(stream);

    // The summary streams are handled before anything else so that the
    // metadata they carry is populated ahead of the text content.
    SummaryExtractor summaryExtractor = new SummaryExtractor(metadata);
    summaryExtractor.parseSummaries(filesystem);

    // Walk the root-level entries; the entry name decides which
    // format-specific extractor is run and which media type is recorded.
    boolean outlookExtracted = false;
    for (Iterator<?> it = filesystem.getRoot().getEntries(); it.hasNext();) {
        Entry entry = (Entry) it.next();
        String name = entry.getName();

        if (entry instanceof DirectoryEntry) {
            // The only directory of interest is "Quill" (Publisher).
            if ("Quill".equals(name)) {
                setType(metadata, "application/x-mspublisher");
                PublisherTextExtractor publisher =
                        new PublisherTextExtractor(filesystem);
                String publisherText = publisher.getText();
                xhtml.element("p", publisherText);
            }
            continue;
        }

        if (!(entry instanceof DocumentEntry)) {
            // Neither a directory nor a document: nothing to extract.
            continue;
        }

        if ("WordDocument".equals(name)) {
            setType(metadata, "application/msword");
            WordExtractor word = new WordExtractor(filesystem);

            // Output order: header, body paragraphs, footnotes,
            // comments, endnotes, footer.
            addTextIfAny(xhtml, "header", word.getHeaderText());
            for (String bodyText : word.getParagraphText()) {
                xhtml.element("p", bodyText);
            }
            for (String footnote : word.getFootnoteText()) {
                xhtml.element("p", footnote);
            }
            for (String comment : word.getCommentsText()) {
                xhtml.element("p", comment);
            }
            for (String endnote : word.getEndnoteText()) {
                xhtml.element("p", endnote);
            }
            addTextIfAny(xhtml, "footer", word.getFooterText());
        } else if ("PowerPoint Document".equals(name)) {
            setType(metadata, "application/vnd.ms-powerpoint");
            PowerPointExtractor powerPoint =
                    new PowerPointExtractor(filesystem);
            String slideText = powerPoint.getText(true, true);
            xhtml.element("p", slideText);
        } else if ("Workbook".equals(name)) {
            setType(metadata, "application/vnd.ms-excel");
            // A Locale supplied through the parse context wins over the
            // JVM default.
            Locale fallback = Locale.getDefault();
            Locale locale = context.get(Locale.class, fallback);
            new ExcelExtractor().parse(filesystem, xhtml, locale);
        } else if ("VisioDocument".equals(name)) {
            setType(metadata, "application/vnd.visio");
            VisioTextExtractor visio = new VisioTextExtractor(filesystem);
            for (String shapeText : visio.getAllText()) {
                xhtml.element("p", shapeText);
            }
        } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
            // TODO: Cleaner mechanism for detecting Outlook
            // The flag makes sure the Outlook extractor runs at most once,
            // on the first entry whose name carries this prefix.
            outlookExtracted = true;
            setType(metadata, "application/vnd.ms-outlook");
            OutlookExtractor outlook = new OutlookExtractor(filesystem);
            outlook.parse(xhtml, metadata);
        }
    }

    xhtml.endDocument();
}