public void extractText(POIFSFileSystem fsys, Appendable appendable)
throws IOException, TikaException {
// load our POIFS document streams.
DocumentEntry headerProps =
(DocumentEntry) fsys.getRoot().getEntry("WordDocument");
DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
byte[] header = new byte[headerProps.getSize()];
din.read(header);
din.close();
int info = LittleEndian.getShort(header, 0xa);
if ((info & 0x4) != 0) {
throw new TikaException("Fast-saved files are unsupported");
}
if ((info & 0x100) != 0) {
throw new TikaException("This document is password protected");
}
// determine the version of Word this document came from.
int nFib = LittleEndian.getShort(header, 0x2);
switch (nFib) {
case 101:
case 102:
case 103:
case 104:
// This is a Word 6.0 document; send it to the extractor for that version.
Word6Extractor oldExtractor = new Word6Extractor(appendable);
oldExtractor.extractText(header);
}
//get the location of the piece table
int complexOffset = LittleEndian.getInt(header, 0x1a2);
// Determine which table stream we must use, based on the flag bits
// read from the header.
String tableName = null;
boolean useTable1 = (info & 0x200) != 0;
if (useTable1) {
tableName = "1Table";
} else {
tableName = "0Table";
}
DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
byte[] tableStream = new byte[table.getSize()];
din = fsys.createDocumentInputStream(tableName);
din.read(tableStream);
din.close();
int chpOffset = LittleEndian.getInt(header, 0xfa);
int chpSize = LittleEndian.getInt(header, 0xfe);
int fcMin = LittleEndian.getInt(header, 0x18);
CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);