final int [] tokenImagesOrder = IndirectSort.mergesort(tokenImages, 0,
tokenImages.length, CharArrayComparators.NORMALIZING_CHAR_ARRAY_COMPARATOR);
// Create holders for new arrays
final List<char []> normalizedWordImages = Lists.newArrayList();
final IntArrayList normalizedWordTf = new IntArrayList();
final List<int []> wordTfByDocumentList = Lists.newArrayList();
final ByteArrayList fieldIndexList = new ByteArrayList();
final ShortArrayList types = new ShortArrayList();
final int [] wordIndexes = new int [tokenCount];
Arrays.fill(wordIndexes, -1);
// Initial values for counters
int tf = 1;
int maxTf = 1;
int maxTfVariantIndex = tokenImagesOrder[0];
int totalTf = 1;
int variantStartIndex = 0;
// A bit set for tracking the fields in which the current word occurs
final BitSet fieldIndices = new BitSet(context.allFields.name.length);
// A stack for pushing information about the term's documents.
final IntStack wordDocuments = new IntStack();
if (documentIndexesArray[tokenImagesOrder[0]] >= 0)
{
wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]);
}
// Go through the ordered token images
for (int i = 0; i < tokenImagesOrder.length - 1; i++)
{
final char [] image = tokenImages[tokenImagesOrder[i]];
final char [] nextImage = tokenImages[tokenImagesOrder[i + 1]];
final int tokenType = tokenTypesArray[tokenImagesOrder[i]];
final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]];
// Reached the end of non-null tokens?
if (image == null)
{
break;
}
// Check if we want to index this token at all
if (isNotIndexed(tokenType))
{
variantStartIndex = i + 1;
maxTfVariantIndex = tokenImagesOrder[i + 1];
resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
fieldIndices, wordDocuments, i);
continue;
}
fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]);
// Now check if image case is changing
final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR
.compare(image, nextImage) == 0;
if (sameCase)
{
// Case has not changed, just increase counters
tf++;
totalTf++;
wordDocuments.push(documentIndex);
continue;
}
// Case (or even token image) has changed. Update most frequent case
// variant
if (maxTf < tf)
{
maxTf = tf;
maxTfVariantIndex = tokenImagesOrder[i];
tf = 1;
}
final boolean sameImage = CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR
.compare(image, nextImage) == 0;
// Check if token image has changed
if (sameImage)
{
totalTf++;
wordDocuments.push(documentIndex);
}
else
{
// The image has changed completely.
// Before we start processing the new image, we need to
// see if we want to store the previous image, and if so
// we need to add some data about it to the arrays.
// wordDocuments may contain duplicate entries from the same document, so its size
// is only an upper bound on df; this check is faster than deduping, so we do it first.
if (wordDocuments.size() >= dfThreshold)
{
// Flatten the list of documents this term occurred in.
final int [] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments);
final int df = (sparseEncoding.length >> 1);
if (df >= dfThreshold)
{
wordTfByDocumentList.add(sparseEncoding);
// Add the word to the word list
normalizedWordImages.add(tokenImages[maxTfVariantIndex]);
types.add(tokenTypesArray[maxTfVariantIndex]);
normalizedWordTf.add(totalTf);
fieldIndexList.add((byte) fieldIndices.bits[0]);
// Add this word's index in AllWords to all its instances
// in the AllTokens multiarray
for (int j = variantStartIndex; j < i + 1; j++)
{
wordIndexes[tokenImagesOrder[j]] = normalizedWordImages.size() - 1;
}
}
}
// Reinitialize counters
totalTf = 1;
tf = 1;
maxTf = 1;
maxTfVariantIndex = tokenImagesOrder[i + 1];
variantStartIndex = i + 1;
// Re-initialize the field bit set and the document stack used for document frequency calculation
resetForNewTokenImage(documentIndexesArray, tokenImagesOrder,
fieldIndices, wordDocuments, i);
}
}
// Mapping from allTokens
context.allTokens.wordIndex = wordIndexes;
context.allWords.image = normalizedWordImages
.toArray(new char [normalizedWordImages.size()] []);
context.allWords.tf = normalizedWordTf.toArray();
context.allWords.tfByDocument =
wordTfByDocumentList.toArray(new int [wordTfByDocumentList.size()] []);
context.allWords.fieldIndices = fieldIndexList.toArray();
context.allWords.type = types.toArray();
}