long sTime, eTime;
sTime = System.currentTimeMillis();
int maxid = reader.maxDoc();
TermFreqVector tv;
String[] terms;
String term = "";
Term t;
int tfreq = 0;
float idf;
float tf;
float tfidf;
double inlinkBoost;
double sum;
int wikiID;
int hashInt;
int numDocs = reader.numDocs();
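// Overview (assumption: "reader" is a Lucene IndexReader and "inlinkMap" a
// page-id -> boost map, both opened/built elsewhere in the surrounding code):
// the extraction runs in two passes over the index. Pass 1 walks the term
// dictionary to collect IDF values; pass 2 walks the stored term vectors and
// writes one TF-IDF vector per document to vector.txt.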
TermEnum tnum = reader.terms();
HashMap<String, Float> idfMap = new HashMap<String, Float>(500000);
HashMap<String, Float> tfidfMap = new HashMap<String, Float>(5000);
HashMap<String, Integer> termHash = new HashMap<String, Integer>(500000);
FileOutputStream fos = new FileOutputStream("vector.txt");
OutputStreamWriter osw = new OutputStreamWriter(fos,"UTF-8");
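// Pass 1: enumerate every term in the index, keep those with document
// frequency >= 3, store their IDF in idfMap, and assign each surviving
// term a compact integer id in termHash.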
hashInt = 0;
while (tnum.next()) {
    t = tnum.term();
    term = t.text();
    tfreq = tnum.docFreq(); // document frequency (DF) of the term
    // skip rare terms (DF < 3)
    if (tfreq < 3) {
        continue;
    }
    // IDF = ln(numDocs / DF); the commented variants add smoothing or use log base 2
    // idf = (float)(Math.log(numDocs / (double)(tfreq + 1)) + 1.0);
    idf = (float)(Math.log(numDocs / (double)(tfreq)));
    // idf = (float)(Math.log(numDocs / (double)(tfreq)) / Math.log(2));
    idfMap.put(term, idf);
    termHash.put(term, hashInt++);
}
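// Pass 2: for every live (non-deleted) document, read its stored term
// frequency vector for the "contents" field and accumulate per-term
// TF-IDF scores into tfidfMap.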
for (int i = 0; i < maxid; i++) {
    if (!reader.isDeleted(i)) {
        wikiID = Integer.parseInt(reader.document(i).getField("id").stringValue());
        // inlinkMap is assumed to contain an entry for every page id
        inlinkBoost = inlinkMap.get(wikiID);
        // term vector of the "contents" field; null if none was stored for this doc
        tv = reader.getTermFreqVector(i, "contents");
        try {
            terms = tv.getTerms();
            int[] fq = tv.getTermFrequencies();
            sum = 0.0;
            tfidfMap.clear();