// Obtain the factory that (de)serialises fixed-size LexiconEntry values for the
// first source index; the merged lexicon reuses the same entry format.
FixedSizeWriteableFactory<LexiconEntry> lvf =
(FixedSizeWriteableFactory<LexiconEntry>)srcIndex1.getIndexStructure("lexicon-valuefactory");
// Open the lexicon output stream for the destination index, keyed by term
// string, using the same value-factory class as source index 1.
// NOTE(review): the Class cast below is unchecked — lvf.getClass() erases the
// generic parameter; safe only because the factory class is passed by name.
LexiconOutputStream<String> lexOutStream =
new FSOMapFileLexiconOutputStream(destIndex, "lexicon", (Class <FixedSizeWriteableFactory<LexiconEntry>>) lvf.getClass());
// Term ids for terms that occur ONLY in the second index are allocated from
// here upwards, i.e. appended after index 1's existing term-id space
// (see newCodes++ in the merge loop and the tail loop below).
int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms();
PostingIndex inverted1 = srcIndex1.getInvertedIndex();
PostingIndex inverted2 = srcIndex2.getInvertedIndex();
// Reflectively construct the inverted-file writer: the field-aware variant when
// the indices carry field information, the basic one otherwise. Both are
// expected to expose a (String filename) constructor.
DirectInvertedOutputStream invOS =null;
try{
invOS = (fieldCount > 0 ? fieldInvertedFileOutputStreamClass : invertedFileOutputStreamClass)
.getConstructor(String.class)
.newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR +
destIndex.getPrefix() + ".inverted"+ BitIn.USUAL_EXTENSION);
} catch (Exception e) {
// Cannot proceed without an inverted-file writer; log and abort the merge.
logger.error("Couldn't create specified DirectInvertedOutputStream", e);
return;
}
// --- Three-way merge of the two sorted lexicon streams -------------------
// lee1/lee2 hold the current (term, LexiconEntry) from each stream; hasMore1/2
// track whether that entry is valid. Classic merge-sort style advance.
boolean hasMore1 = false;
boolean hasMore2 = false;
String term1;
String term2;
Map.Entry<String,LexiconEntry> lee1 = null;
Map.Entry<String,LexiconEntry> lee2 = null;
hasMore1 = lexInStream1.hasNext();
if (hasMore1)
lee1 = lexInStream1.next();
hasMore2 = lexInStream2.hasNext();
if (hasMore2)
lee2 = lexInStream2.next();
while (hasMore1 && hasMore2) {
term1 = lee1.getKey();
term2 = lee2.getKey();
// Lexicon streams are in lexicographical term order, so compareTo decides
// which side(s) to consume.
int lexicographicalCompare = term1.compareTo(term2);
if (lexicographicalCompare < 0) {
//write to inverted file postings for the term that only occurs in 1st index
// Postings and term id are copied unchanged; only the bit-file pointer
// is rewritten to the new inverted file.
BitIndexPointer newPointer = invOS.writePostings(inverted1.getPostings(lee1.getValue()));
lee1.getValue().setPointer(newPointer);
numberOfPointers+=newPointer.getNumberOfEntries();
lexOutStream.writeNextEntry(term1, lee1.getValue());
hasMore1 = lexInStream1.hasNext();
if (hasMore1)
lee1 = lexInStream1.next();
} else if (lexicographicalCompare > 0) {
//write to inverted file postings for the term that only occurs in 2nd index
//docids are transformed as we go.
// NOTE(review): the second writePostings argument appears to be the
// "previous id" seed for delta-gap encoding; -(numberOfDocs1+1) shifts
// index 2's docids up by numberOfDocs1 — confirm against
// DirectInvertedOutputStream.writePostings semantics.
BitIndexPointer newPointer =
invOS.writePostings(inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
lee2.getValue().setPointer(newPointer);
numberOfPointers+=newPointer.getNumberOfEntries();
// This term is new to the merged lexicon: allocate the next free term id,
// optionally remembering old->new for later direct-index remapping.
int newCode = newCodes++;
if (keepTermCodeMap)
termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
lee2.getValue().setTermId(newCode);
lexOutStream.writeNextEntry(term2, lee2.getValue());
hasMore2 = lexInStream2.hasNext();
if (hasMore2)
lee2 = lexInStream2.next();
} else {
//write to postings for a term that occurs in both indices
//1. postings from the first index are unchanged
IterablePosting ip1 = inverted1.getPostings(lee1.getValue());
BitIndexPointer newPointer1 = invOS.writePostings(ip1);
//2. postings from the 2nd index have their docids transformed
// NOTE(review): ip1.getId() here is the last docid consumed from index 1's
// postings; using it (minus numberOfDocs1) as the seed continues the
// delta-gap chain so index 2's docids land after index 1's — TODO confirm.
IterablePosting ip2 = inverted2.getPostings(lee2.getValue());
BitIndexPointer newPointer2 = invOS.writePostings(ip2, ip1.getId() - numberOfDocs1);
numberOfPointers+= newPointer1.getNumberOfEntries() + newPointer2.getNumberOfEntries();
//don't set numberOfEntries, as LexiconEntry.add() will take care of this.
lee1.getValue().setPointer(newPointer1);
// Merged term keeps index 1's term id; map index 2's old id onto it.
if (keepTermCodeMap)
termcodeHashmap.put(lee2.getValue().getTermId(), lee1.getValue().getTermId());
// Fold index 2's statistics (TF, Nt, ...) into index 1's entry.
lee1.getValue().add(lee2.getValue());
lexOutStream.writeNextEntry(term1, lee1.getValue());
hasMore1 = lexInStream1.hasNext();
if (hasMore1)
lee1 = lexInStream1.next();
hasMore2 = lexInStream2.hasNext();
if (hasMore2)
lee2 = lexInStream2.next();
}
}
// --- Drain whichever stream still has entries (at most one does) ----------
if (hasMore1) {
lee2 = null;
while (hasMore1) {
//write to inverted file as well.
// Same handling as the "<" branch above: postings and term id unchanged.
BitIndexPointer newPointer = invOS.writePostings(
inverted1.getPostings(lee1.getValue()));
lee1.getValue().setPointer(newPointer);
numberOfPointers+=newPointer.getNumberOfEntries();
lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
hasMore1 = lexInStream1.hasNext();
if (hasMore1)
lee1 = lexInStream1.next();
}
} else if (hasMore2) {
lee1 = null;
while (hasMore2) {
//write to inverted file as well.
// Same handling as the ">" branch above: shift docids, assign a fresh
// term id from the tail of index 1's id space.
BitIndexPointer newPointer = invOS.writePostings(
inverted2.getPostings(lee2.getValue()), -(numberOfDocs1+1));
lee2.getValue().setPointer(newPointer);
numberOfPointers+=newPointer.getNumberOfEntries();
int newCode = newCodes++;
if (keepTermCodeMap)
termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
lee2.getValue().setTermId(newCode);
lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
hasMore2 = lexInStream2.hasNext();
if (hasMore2)
lee2 = lexInStream2.next();
}
}
// Release source readers and the merged inverted-file writer.
IndexUtil.close(lexInStream1);
IndexUtil.close(lexInStream2);
inverted1.close();
inverted2.close();
invOS.close();
// Record the merged collection size and register the new inverted structure
// (random-access and input-stream forms), choosing the field-aware posting
// iterator when fields are present.
destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments);
destIndex.addIndexStructure(
"inverted",
invertedFileInputClass,
"org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class",
"index,structureName,document,"+
(fieldCount > 0
? fieldInvertedIndexPostingIteratorClass
: basicInvertedIndexPostingIteratorClass ));
destIndex.addIndexStructureInputStream(
"inverted",
invertedFileInputStreamClass,
"org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
"index,structureName,lexicon-entry-inputstream,"+
(fieldCount > 0
? fieldInvertedIndexPostingIteratorClass
: basicInvertedIndexPostingIteratorClass ));
destIndex.setIndexProperty("index.inverted.fields.count", ""+fieldCount);
lexOutStream.close();
if (fieldCount > 0)
{
// Field indices need the field-aware lexicon entry factory; its constructor
// takes the field count, resolved lazily from the property set above.
destIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
}
// Persist all property/structure registrations to disk.
destIndex.flush();