// Is it an embedded OLE2 document, or an embedded OOXML document?
try {
Entry ooxml = dir.getEntry("Package");
// It's OOXML
TikaInputStream stream = TikaInputStream.get(
new DocumentInputStream((DocumentEntry) ooxml));
try {
ZipContainerDetector detector = new ZipContainerDetector();
MediaType type = detector.detect(stream, new Metadata());
handleEmbeddedResource(stream, null, type.toString(), xhtml, true);
return;
} finally {
stream.close();
}
} catch(FileNotFoundException e) {
// It's regular OLE2
}
// Need to dump the directory out to a new temp file, so
// it's standalone
POIFSFileSystem newFS = new POIFSFileSystem();
copy(dir, newFS.getRoot());
File tmpFile = File.createTempFile("tika", ".ole2");
try {
FileOutputStream out = new FileOutputStream(tmpFile);
newFS.writeFilesystem(out);
out.close();
// What kind of document is it?
Metadata metadata = new Metadata();
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
TikaInputStream embedded;
if (type==POIFSDocumentType.OLE10_NATIVE) {
Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
ByteArrayOutputStream bos = new ByteArrayOutputStream();
IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
byte[] data = bos.toByteArray();
try {
Ole10Native ole = new Ole10Native(data, 0);
byte[] dataBuffer = ole.getDataBuffer();
metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());
embedded = TikaInputStream.get(dataBuffer);
} catch (Ole10NativeException ex) {
embedded = TikaInputStream.get(data);
}
} else {
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
embedded = TikaInputStream.get(tmpFile);
}
try {
if (extractor.shouldParseEmbedded(metadata)) {
extractor.parseEmbedded(embedded, xhtml, metadata, true);
}
} finally {
embedded.close();
}
} finally {
tmpFile.delete();
}
}