// Dispatch on the name of the current entry: each recognised stream name
// sets the document's content type in the metadata and writes the text
// obtained from the matching extractor to the XHTML output.
String name = entry.getName();
if (!(entry instanceof DocumentEntry)) {
// Skip directory entries
} else if ("WordDocument".equals(name)) {
// Word: header first, then body paragraphs, footnotes, comments and
// endnotes (each string as its own <p>), and finally the footer.
setType(metadata, "application/msword");
WordExtractor extractor = new WordExtractor(filesystem);
// NOTE(review): addTextIfAny presumably emits the element only when the
// text is non-empty -- confirm against its definition.
addTextIfAny(xhtml, "header", extractor.getHeaderText());
for (String paragraph : extractor.getParagraphText()) {
xhtml.element("p", paragraph);
}
for (String paragraph : extractor.getFootnoteText()) {
xhtml.element("p", paragraph);
}
for (String paragraph : extractor.getCommentsText()) {
xhtml.element("p", paragraph);
}
for (String paragraph : extractor.getEndnoteText()) {
xhtml.element("p", paragraph);
}
addTextIfAny(xhtml, "footer", extractor.getFooterText());
} else if ("PowerPoint Document".equals(name)) {
// PowerPoint: the whole extracted text is written as a single <p>.
setType(metadata, "application/vnd.ms-powerpoint");
PowerPointExtractor extractor =
new PowerPointExtractor(filesystem);
// NOTE(review): the two flags are believed to request slide text and
// notes text respectively -- confirm against PowerPointExtractor.getText.
xhtml.element("p", extractor.getText(true, true));
} else if ("Workbook".equals(name)) {
// Excel: ExcelExtractor writes to the XHTML output itself, using the
// Locale looked up in the parse context (Locale.getDefault() is passed
// as the second argument, presumably as the fallback -- TODO confirm).
setType(metadata, "application/vnd.ms-excel");
Locale locale = context.get(Locale.class, Locale.getDefault());
new ExcelExtractor().parse(filesystem, xhtml, locale);
} else if ("VisioDocument".equals(name)) {
// Visio: every string returned by getAllText() becomes its own <p>.
setType(metadata, "application/vnd.visio");
VisioTextExtractor extractor =
new VisioTextExtractor(filesystem);
for (String text : extractor.getAllText()) {
xhtml.element("p", text);
}
} else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
// Outlook is recognised by the "__substg1.0_" entry-name prefix. The
// flag is set on the first match so this branch is not taken again
// while outlookExtracted remains true.
// TODO: Cleaner mechanism for detecting Outlook
outlookExtracted = true;