* @param rules The head rules for the parses.
* @param cutoff The minimum number of entries required for the n-gram to be saved as part of the dictionary.
* @return A dictionary object.
*/
private static MutableDictionary buildDictionary(DataStream data, HeadRules rules, int cutoff) {
MutableDictionary mdict = new MutableDictionary(cutoff);
while(data.hasNext()) {
String parseStr = (String) data.nextToken();
Parse p = Parse.parseParse(parseStr);
p.updateHeads(rules);
Parse[] pwords = p.getTagNodes();
String[] words = new String[pwords.length];
//add all uni-grams
for (int wi=0;wi<words.length;wi++) {
words[wi] = pwords[wi].toString();
}
mdict.add(words,1,true);
//add tri-grams and bi-grams for inital sequence
Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p),rules.getPunctuationTags());
String[] cwords = new String[chunks.length];
for (int wi=0;wi<cwords.length;wi++) {
cwords[wi] = chunks[wi].getHead().toString();
}
mdict.add(cwords,3,false);
//emulate reductions to produce additional n-grams
int ci = 0;
while (ci < chunks.length) {
//System.err.println("chunks["+ci+"]="+chunks[ci].getHead().toString()+" chunks.length="+chunks.length);
if (lastChild(chunks[ci], chunks[ci].getParent(),rules.getPunctuationTags())) {
//perform reduce
int reduceStart = ci;
while (reduceStart >=0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
reduceStart--;
}
reduceStart++;
chunks = ParserEventStream.reduceChunks(chunks,ci,chunks[ci].getParent());
ci = reduceStart;
if (chunks.length != 0) {
String[] window = new String[5];
int wi = 0;
if (ci-2 >= 0) window[wi++] = chunks[ci-2].getHead().toString();
if (ci-1 >= 0) window[wi++] = chunks[ci-1].getHead().toString();
window[wi++] = chunks[ci].getHead().toString();
if (ci+1 < chunks.length) window[wi++] = chunks[ci+1].getHead().toString();
if (ci+2 < chunks.length) window[wi++] = chunks[ci+2].getHead().toString();
if (wi < 5) {
String[] subWindow = new String[wi];
for (int swi=0;swi<wi;swi++) {
subWindow[swi]=window[swi];
}
window = subWindow;
}
if (window.length >=3) {
mdict.add(window,3,false);
}
else if (window.length == 2) {
mdict.add(window,2,false);
}
}
ci=reduceStart-1; //ci will be incremented at end of loop
}
ci++;