* Perform clustering for a given language.
*/
protected void cluster(LanguageCode language)
{
// Preprocessing of documents
final PreprocessingContext preprocessingContext =
preprocessingPipeline.preprocess(documents, null, language);
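// Add trivial AllLabels (single-word labels only: each stem's most frequent original
// word, skipping common, query and numeric words) so that we can reuse the common
// term-document matrix builder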
final int [] stemsMfow = preprocessingContext.allStems.mostFrequentOriginalWordIndex;
final short [] wordsType = preprocessingContext.allWords.type;
final IntArrayList featureIndices = new IntArrayList(stemsMfow.length);
for (int i = 0; i < stemsMfow.length; i++)
{
final short flag = wordsType[stemsMfow[i]];
if ((flag & (ITokenizer.TF_COMMON_WORD | ITokenizer.TF_QUERY_WORD | ITokenizer.TT_NUMERIC)) == 0)
{
featureIndices.add(stemsMfow[i]);
}
}
preprocessingContext.allLabels.featureIndex = featureIndices.toArray();
preprocessingContext.allLabels.firstPhraseIndex = -1;
// Further processing only if preprocessing produced any labels (feature words) to work with
clusters = Lists.newArrayList();
if (preprocessingContext.hasLabels())
{
// Term-document matrix building and reduction
final VectorSpaceModelContext vsmContext = new VectorSpaceModelContext(
preprocessingContext);
final ReducedVectorSpaceModelContext reducedVsmContext = new ReducedVectorSpaceModelContext(