/**
 * Builds a weighted term vector for a single document from its term-frequency table.
 *
 * <p>For every term in {@code tfTable} the term id is looked up in {@code dict}; terms whose
 * id is below 1 are treated as out-of-vocabulary and skipped. For the remaining terms the
 * document frequency is read from {@code dfTable} and a weight is computed by the scoring
 * model from (tf, df, docLen). Only strictly positive weights are stored in the result.
 * When {@code isNormalize} is set, the stored weights are divided by their Euclidean norm.
 *
 * @param docLen       document length handed to the scoring model
 * @param tfTable      term string to term frequency for the document
 * @param scoringModel scoring model; it is cast to {@code Bm25} when a term is scored
 * @param dict         dictionary used to map term strings to term ids
 * @param dfTable      source of document frequencies, indexed by term id
 * @param isNormalize  whether to length-normalize the resulting vector
 * @param sLogger      logger to report to; if {@code null}, the class-level logger is used
 * @return term string to weight, containing only terms with a positive weight
 * @throws ClassCastException if an in-vocabulary term is scored and {@code scoringModel}
 *                            is not a {@code Bm25}
 */
public static HMapSFW createTermDocVector(int docLen, HMapSIW tfTable, ScoringModel scoringModel, FrequencySortedDictionary dict, DfTableArray dfTable, boolean isNormalize, Logger sLogger) {
  final Logger log = (sLogger != null) ? sLogger : logger;
  // Checked once up front so the per-term debug message is only concatenated when it
  // will actually be emitted; this loop runs once per distinct term of the document.
  final boolean debugEnabled = log.isDebugEnabled();

  HMapSFW v = new HMapSFW();
  float normalization = 0;
  for (edu.umd.cloud9.util.map.MapKI.Entry<String> entry : tfTable.entrySet()) {
    // retrieve term string, tf and df
    String eTerm = entry.getKey();
    float tf = entry.getValue();
    int eId = dict.getId(eTerm);
    if (eId < 1) { // OOV
      continue;
    }
    int df = dfTable.getDf(eId);

    // compute score via scoring model
    float score = ((Bm25) scoringModel).computeDocumentWeight(tf, df, docLen);
    if (df < 1) {
      // An in-vocabulary term with no document frequency is unexpected; report it but keep going.
      log.warn("Suspicious DF WARNING = " + eTerm + " " + tf + " " + df + " " + score);
    }
    if (debugEnabled) {
      log.debug(eTerm + " " + tf + " " + df + " " + score);
    }

    if (score > 0) {
      v.put(eTerm, score);
      if (isNormalize) {
        normalization += Math.pow(score, 2);
      }
    }
  }

  // length-normalize doc vector
  if (isNormalize) {
    normalization = (float) Math.sqrt(normalization);
    // The norm can only be zero here if the vector is empty or the squared weights
    // underflowed; skip the division in that case so no weight turns into Infinity.
    if (normalization > 0) {
      // put() only overwrites values of keys that are already present in v.
      for (Entry<String> e : v.entrySet()) {
        v.put(e.getKey(), e.getValue() / normalization);
      }
    }
  }
  return v;
}