// Is it an embedded OLE2 document, or an embedded OOXML document?
try {
Entry ooxml = dir.getEntry("Package");
// It's OOXML
TikaInputStream ooxmlStream = TikaInputStream.get(
new DocumentInputStream((DocumentEntry)ooxml)
);
ZipContainerDetector detector = new ZipContainerDetector();
MediaType type = detector.detect(ooxmlStream, new Metadata());
handleEmbeddedResource(ooxmlStream, null, type.toString(), xhtml, true);
return;
} catch(FileNotFoundException e) {
// It's regular OLE2
}
// Need to dump the directory out to a new temp file, so
// it's standalone
POIFSFileSystem newFS = new POIFSFileSystem();
copy(dir, newFS.getRoot());
File tmpFile = File.createTempFile("tika", ".ole2");
try {
FileOutputStream out = new FileOutputStream(tmpFile);
newFS.writeFilesystem(out);
out.close();
// What kind of document is it?
Metadata metadata = new Metadata();
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
// Trigger for the document itself
TikaInputStream embedded = TikaInputStream.get(tmpFile);
try {
if (extractor.shouldParseEmbedded(metadata)) {
extractor.parseEmbedded(embedded, xhtml, metadata, true);
}
} finally {
embedded.close();
}
} finally {
tmpFile.delete();
}
}