// Build a JobConf for the test job and make the index available to it via the Hadoop configuration
JobConf jc = HadoopPlugin.getJobFactory("testSplits").newJob();
HadoopUtility.toHConfiguration(index, jc);
// read the "direct" bit posting structure, using the "document" structure to look up entry offsets
BitPostingIndexInputFormat.setStructures(jc, "direct", "document");
// ask the input format for (a hint of) two splits over the direct index
InputSplit[] splits = informat.getSplits(jc, 2);
// accumulators for the statistics observed while reading all splits
TIntHashSet termIds = new TIntHashSet();
long tokens = 0;
long pointers = 0;
int docid = 0;
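// Walk each split: each record is one document, keyed by docid, whose value wraps
// the document's direct index posting list together with its pointer count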
for(InputSplit split : splits)
{
	RecordReader<IntWritable, IntObjectWrapper<IterablePosting>> rr = informat.getRecordReader(split, jc, null);
	IntWritable key = rr.createKey();
	IntObjectWrapper<IterablePosting> value = rr.createValue();
	while(rr.next(key, value))
	{
		docid = key.get();
		int doclen = 0; int docpointers = 0;
		IterablePosting ip = value.getObject();
assertEquals("Number of pointers for docid " + docid + " is incorrect", documentPointers[docid], value.getInt());
		while(ip.next() != IterablePosting.EOL)
		{
			//System.err.println("termid" +ip.getId() + " f=" + ip.getFrequency());
			// accumulate per-document and global counts from this posting
			termIds.add(ip.getId());
			tokens += ip.getFrequency();
			doclen += ip.getFrequency();
			pointers++; docpointers++;
			if (numberOfTerms > 0)
				assertTrue("Got too big a termid (" + ip.getId() + ") from direct index input stream, numTerms=" + numberOfTerms + ", maxTermId=" + maxTermId, ip.getId() < maxTermId);
		}
		// per-document checks: pointer count (when expected values are supplied) and document length
		if (documentPointers.length > 0)
			assertEquals("Number of pointers for docid " + docid + " is incorrect", documentPointers[docid], docpointers);
		assertEquals("Document length for docid " + docid + " is incorrect", documentLengths[docid], doclen);
	}
	rr.close();
}
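// The totals accumulated across all splits should agree with the index's collection statistics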
CollectionStatistics cs = index.getCollectionStatistics();
assertEquals("Number of documents is incorrect", cs.getNumberOfDocuments(), docid + 1);
assertEquals("Number of pointers is incorrect", cs.getNumberOfPointers(), pointers);
assertEquals("Number of tokens is incorrect", cs.getNumberOfTokens(), tokens);
if (numberOfTerms > 0)
{
	assertEquals("Not all termIds found in direct index", numberOfTerms, termIds.size());
}
}