// A "Package" entry marks an embedded OOXML document (i.e. a zip file):
// detect its concrete type and hand it over as an embedded resource.
if (dir.hasEntry("Package")) {
    DocumentEntry packageEntry = (DocumentEntry) dir.getEntry("Package");
    TikaInputStream packageStream =
            TikaInputStream.get(new DocumentInputStream(packageEntry));
    try {
        // Let the zip container detector work out the media type.
        MediaType packageType =
                new ZipContainerDetector().detect(packageStream, new Metadata());
        handleEmbeddedResource(packageStream, null, packageType.toString(), xhtml, true);
    } finally {
        // Release the stream whether or not the handling succeeded.
        packageStream.close();
    }
    // The OOXML case is fully handled; skip the OLE2 processing below.
    return;
}
// It's regular OLE2:
// Work out what kind of document it is, build the metadata (resource name
// and, where known, content type) describing it, and pass it to the
// embedded document extractor if the extractor wants it.
Metadata metadata = new Metadata();
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
TikaInputStream embedded = null;
try {
    if (type == POIFSDocumentType.OLE10_NATIVE) {
        try {
            // Try to un-wrap the OLE10Native record and expose its payload
            // as the embedded stream:
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
            metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
            byte[] data = ole.getDataBuffer();
            embedded = TikaInputStream.get(data);
        } catch (Ole10NativeException ignored) {
            // Not a valid OLE10Native record: deliberately ignored. No stream
            // or metadata is set here, so if the extractor accepts it below
            // the directory itself is passed on via an empty stream.
        }
    } else if (type == POIFSDocumentType.COMP_OBJ) {
        try {
            // Grab the contents and process
            DocumentEntry contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
            byte[] contents;
            DocumentInputStream inp = new DocumentInputStream(contentsEntry);
            try {
                contents = new byte[contentsEntry.getSize()];
                inp.readFully(contents);
            } finally {
                // The bytes are fully buffered (or reading failed), so the
                // entry stream is no longer needed.
                inp.close();
            }
            embedded = TikaInputStream.get(contents);

            // Try to work out what it is
            MediaType mediaType = getDetector().detect(embedded, new Metadata());

            // Fall back to the POIFS type's extension when the detected type
            // is unknown to the mime type registry. The '.' is added here
            // because the POIFS extension is stored without one (see the
            // final else branch), whereas MimeType.getExtension() is appended
            // as-is below.
            String extension = '.' + type.getExtension();
            try {
                MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                extension = mimeType.getExtension();
            } catch (MimeTypeException mte) {
                // No details on this type are known
            }

            // Record what we can do about it. The full media type string is
            // stored; MediaType.getType() would give only the top-level type.
            metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
        } catch (Exception e) {
            // Any failure while reading or identifying the contents is
            // reported as an invalid embedded resource, keeping the cause.
            throw new TikaException("Invalid embedded resource", e);
        }
    } else {
        // Any other type: describe it using the detected POIFS document type.
        metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
        metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
    }

    // Should we parse it?
    if (extractor.shouldParseEmbedded(metadata)) {
        if (embedded == null) {
            // Make a TikaInputStream that just
            // passes the root directory of the
            // embedded document, and is otherwise
            // empty (byte[0]):
            embedded = TikaInputStream.get(new byte[0]);
            embedded.setOpenContainer(dir);
        }
        extractor.parseEmbedded(embedded, xhtml, metadata, true);
    }
} finally {
    // Close whichever stream was created above, if any.
    if (embedded != null) {
        embedded.close();
    }
}
}