//NOTE(review): this chunk is the body of a split-generation method (signature not visible here);
//it partitions a bit posting index structure into Hadoop InputSplits, each sized roughly to the
//filesystem block size of the underlying file, using a lookup structure to find entry boundaries.
HadoopUtility.loadTerrierJob(job);
//job configuration names the lookup structure (per-entry pointers) and the target posting structure
final String lookupStructureName = job.get(BITPOSTING_LOOKUP_STRUCTURE_KEY);
final String bitPostingStructureName = job.get(BITPOSTING_STRUCTURE_KEY);
//load the index with the non-retrieval profile, presumably to avoid loading retrieval-only structures -- TODO confirm
Index.setIndexLoadingProfileAsRetrieval(false);
final Index index = HadoopUtility.fromHConfiguration(job);
//the posting structure may be spread over several data files; default is a single file
final byte fileCount = Byte.parseByte(index.getIndexProperty("index." + bitPostingStructureName + ".data-files", "1"));
final Path bitPostingStructureFiles[] = new Path[fileCount];
final FileStatus[] fss = new FileStatus[fileCount];
//target size for each split, per file (taken from the filesystem block size)
final long[] bitPostingStructureFSBlockSizes = new long[fileCount];
//logger.info("Calculating splits of structure " + bitPostingStructureName);
FileSystem fs = FileSystem.get(job);
//resolve each data file's path, status and block size up front
for(byte i=0;i<fileCount;i++)
{
bitPostingStructureFiles[i] = new Path(BitPostingIndexInputStream.getFilename(index, bitPostingStructureName, fileCount, i));
fss[i] = fs.getFileStatus(bitPostingStructureFiles[i]);
bitPostingStructureFSBlockSizes[i] = getBlockSize(bitPostingStructureFiles[i], fss[i]);
//logger.info("File " + i + " approx splits=" + ((double)fss[i].getLen() /(double)bitPostingStructureFSBlockSizes[i]));
}
//this smells of a hack, because we dont have a strategy for naming various index structures streams
//prefer the "-entry" variant of the lookup stream if present, otherwise fall back to the plain name
final Iterator<? extends BitIndexPointer> offsetIterator =
index.hasIndexStructureInputStream(lookupStructureName+ "-entry")
? (Iterator<? extends BitIndexPointer>)index.getIndexStructureInputStream(lookupStructureName+ "-entry")
: (Iterator<? extends BitIndexPointer>)index.getIndexStructureInputStream(lookupStructureName);
if (offsetIterator == null)
throw new IOException("No such stream structure called " + lookupStructureName+ "-entry or "+lookupStructureName+" found in index");
final List<InputSplit> splitList = new ArrayList<InputSplit>();
//id (ordinal position in the lookup stream) of the entry currently being examined
int currentId = 0;
//size of the current split of each file
final long[] blockSize = new long[fileCount];
//location of the last split for each file
final long[] bitPostingStructureSplitEndOffsets = new long[fileCount];
//how many entries will be in this split, for each file
final int[] entriesInBlock = new int[fileCount];
//what is the starting id of the next entry split, for each file
final int[] firstEntryOfNextSplit = new int[fileCount];
//number of splits per file, for logging only
final int[] splitsPerFile = new int[fileCount];
//MAX_VALUE sentinel means "no entry seen yet for the in-progress split of this file"
Arrays.fill(firstEntryOfNextSplit, Integer.MAX_VALUE);
BitIndexPointer currentPointer = null;
//iterate through the lookup iterator
//split the target bit posting index structure into chunks of size bitPostingStructureFSBlockSize
while(offsetIterator.hasNext())
{
//ok, where is the next pointer to
currentPointer = offsetIterator.next();
//each pointer says which data file and at what byte offset this entry's postings start
final byte fileId = currentPointer.getFileNumber();
//what is the first entry of the next split of this file?
//(Math.min with the MAX_VALUE sentinel records the first id that lands in this split)
firstEntryOfNextSplit[fileId] = Math.min(currentId, firstEntryOfNextSplit[fileId]);
//this split will have one more entry
entriesInBlock[fileId]++;
//what is our current offset?
long offset = currentPointer.getOffset();
//System.err.println("Offset" + offset);
//if we made the split here, how big would it be?
blockSize[fileId] = offset - bitPostingStructureSplitEndOffsets[fileId];
//is this block is large enough
if (blockSize[fileId] > bitPostingStructureFSBlockSizes[fileId])
{
//yes, its big enough
//block will be from bitPostingStructureSplitEndOffsets[fileId] to offset, which is blockSize[fileId]
//ask HDFS which hosts hold this byte range, so the split can be scheduled locally
BlockLocation[] blkLocations = fs.getFileBlockLocations(
fss[fileId],
bitPostingStructureSplitEndOffsets[fileId],
blockSize[fileId]);
//NOTE(review): only blkLocations[0] is consulted; for a range spanning several HDFS blocks
//the hosts of the later blocks are ignored -- presumably acceptable as a locality hint
splitList.add(
new BitPostingIndexInputSplit(
bitPostingStructureFiles[fileId], //path
bitPostingStructureSplitEndOffsets[fileId], //start
blockSize[fileId], //length
blkLocations[0].getHosts(), //hosts
firstEntryOfNextSplit[fileId], //first entry in this split
entriesInBlock[fileId]) //number of entries in this split
);
//logger.info("File "+ fileId + " split " +(splitList.size()-1)
//	+ " "+ splitList.get(splitList.size() -1).toString());
//record another split for this file (for logging only)
splitsPerFile[fileId]++;
//update recording of last offset for this file
bitPostingStructureSplitEndOffsets[fileId] = offset;
//reset size of split for this file
blockSize[fileId] = 0;
//reset counter of entries in split of this file
entriesInBlock[fileId] = 0;
//reset the first offset of this split
firstEntryOfNextSplit[fileId] = Integer.MAX_VALUE;
}
//ids always increment
currentId++;
}
//find any files which have trailing blocks
//(entries seen since the last emitted split that never reached a full block size)
for(byte fileId=0;fileId<fileCount;fileId++)
{
if (entriesInBlock[fileId] == 0)
continue;
//if entries were counted, at least one id must have been recorded
assert(firstEntryOfNextSplit[fileId] != Integer.MAX_VALUE);
//block will be from bitPostingStructureSplitEndOffsets[fileId], with length blockSize[fileId]
//NOTE(review): blockSize here is measured up to the LAST entry's start offset, so the final
//entry's own payload appears to lie beyond the split's stated length -- presumably the reader
//treats length as a hint and reads entriesInBlock entries regardless; verify against the reader
BlockLocation[] blkLocations = fs.getFileBlockLocations(fss[fileId], bitPostingStructureSplitEndOffsets[fileId], blockSize[fileId]);
splitList.add(
new BitPostingIndexInputSplit(
bitPostingStructureFiles[fileId], //path of file for split
bitPostingStructureSplitEndOffsets[fileId], //start offset of this split
blockSize[fileId], //size of this split
blkLocations[0].getHosts(), //hosts for this split
firstEntryOfNextSplit[fileId], //first entry id for this split
entriesInBlock[fileId]) //number of entries in this split
);
//logger.info("File "+ fileId + " trailing split "+ (splitList.size() -1)
//	+ " " + splitList.get(splitList.size() -1).toString());
//record another split for this file (for logging only)
splitsPerFile[fileId]++;
}
//logger.info("Split "+ bitPostingStructureName+ " (of "+currentId+" entries) into " + splitList.size() + " splits");
if (fileCount > 1)
{
//logger.info("Multiple files of " + bitPostingStructureName + " were split as follows: " + ArrayUtils.join(splitsPerFile, ","));
}
assert(splitList.size() > 0);
//NOTE(review): index.close() is not in a finally block, so the index leaks if an IOException
//is thrown above -- consider try/finally; left unchanged here
index.close();
return splitList.toArray(new InputSplit[splitList.size()]);
}