tworker = new TaskWorker(yoshikoder){
protected void doWork() throws Exception {
// TODO remove redundant code here!
// FIRST DOC
YKDocument doc1 = (YKDocument)docs.get(0);
// tokenize the document
TokenizationCache tcache = yoshikoder.getTokenizationCache();
TokenList tl1 = tcache.getTokenList(doc1);
if (tl1 == null)
tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
// for _all_ categories
EntryFrequencyMap efm1 = new EntryFrequencyMap(dict.getDictionaryRoot(), tl1);
List lkeys = efm1.getSortedCategoryEntries();
Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
int[] counts = new int[keys.length+1];
for (int ii=0; ii<keys.length; ii++) {
Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
counts[ii] = cnt.intValue();
}
// add N
counts[keys.length] = efm1.getTokenTotal();
HSSFWorkbook wb = new HSSFWorkbook();
HSSFRow row;
HSSFCell cell;
HSSFSheet sheet = wb.createSheet("Category frequencies");
// header
row = sheet.createRow((short)0);
for (int c=0; c<keys.length; c++){
cell = row.createCell((short)(c+1));
cell.setEncoding(HSSFCell.ENCODING_UTF_16);
String nodepath = efm1.getEntryPath(keys[c]);
cell.setCellValue(nodepath);
}
cell = row.createCell((short)(keys.length+1));
cell.setEncoding(HSSFCell.ENCODING_UTF_16);
cell.setCellValue("Total");
int rownum = 1;
for (Iterator iter = docs.iterator(); iter.hasNext();) {
YKDocument d = (YKDocument) iter.next();
TokenList tl2 = tcache.getTokenList(d);
if (tl2 == null)
tl2 = TokenizationService.getTokenizationService().tokenize(d);
Concordance conc = dict.getConcordance(tl2, catnode, wsize);
// note _all_categories counted (implicitly around catnode matches)
counts = getDocumentStats(d.getTitle(), conc, keys, dict.getDictionaryRoot());
row = sheet.createRow((short)rownum);
cell = row.createCell((short)0);
cell.setEncoding(HSSFCell.ENCODING_UTF_16);
cell.setCellValue(d.getTitle());
for (int ii = 0; ii < keys.length; ii++) {
cell = row.createCell((short)(ii+1));
cell.setCellValue((double)counts[ii]);
}