float sumFLengs = 0, sumELengs = 0;
try {
BufferedReader dis1 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(eReadFile)), "UTF-8"));
BufferedReader dis2 = new BufferedReader(new InputStreamReader(new FileInputStream(new File(fReadFile)), "UTF-8"));
HMapSIW fDoc = new HMapSIW();
HMapSIW eDoc = new HMapSIW();
String eLine = null, fLine = null;
int cntEDocs = 0, cntFDocs = 0, lastDocLenE = 0, lastDocLenF = 0, numSents = 0;
while ((eLine = dis1.readLine()) != null) {
// Read the f-side line that pairs with the e-side line just read by the loop condition.
// NOTE(review): BufferedReader.readLine() returns null at end of stream, so the
// .trim() call throws NullPointerException if the f-side file has fewer lines
// than the e-side file.
fLine = dis2.readLine().trim();
eLine = eLine.trim();
// Tokenize the f-side line and add its token count to the running length of the current document.
String[] tokens = fTokenizer.processContent(fLine);
lastDocLenF += tokens.length;
for (String token : tokens) {
if (!fDoc.containsKey(token)) { // first occurrence of token in the current fDoc (which accumulates sentences until numSents reaches sentsPerDoc)
// dfD is bumped at most once per distinct token per fDoc.
dfD.increment(token);
}
// Count of this token within the current fDoc.
fDoc.increment(token);
}
// Tokenize the e-side line (reusing the tokens variable) and add its token count
// to the running length of the current document.
tokens = eTokenizer.processContent(eLine);
lastDocLenE += tokens.length;
for (String token : tokens) {
// eDoc accumulates sentences until numSents reaches sentsPerDoc, so this check
// means dfE is bumped at most once per distinct token per eDoc.
if (!eDoc.containsKey(token)) {
dfE.increment(token);
}
// Count of this token within the current eDoc.
eDoc.increment(token);
}
numSents++;
// Once sentsPerDoc sentence pairs have been accumulated, close out the current
// e/f document pair and start a new one.
if (numSents == sentsPerDoc) {
// Running totals of document lengths (in tokens) for each side.
sumFLengs += lastDocLenF;
sumELengs += lastDocLenE;
// Despite the "SentLengths" names, the values stored here are token totals over
// all sentences of the document (they equal sentence lengths only when sentsPerDoc is 1).
enSentLengths.add(lastDocLenE);
deSentLengths.add(lastDocLenF);
// Store the per-document token count maps.
eDocTfs.add(eDoc);
fDocTfs.add(fDoc);
// Both counters advance together: one e document and one f document per flush.
cntEDocs++;
cntFDocs++;
// Reset per-document state. New maps are allocated because the previous
// instances were just stored in eDocTfs / fDocTfs.
fDoc = new HMapSIW();
eDoc = new HMapSIW();
numSents = 0;
lastDocLenE = 0;
lastDocLenF = 0;
}
eSents.add(eLine);