/*
* Carrot2 project.
*
* Copyright (C) 2002-2014, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;
import org.carrot2.core.Document;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.LanguageModel;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.text.util.TabularOutput;
import com.carrotsearch.hppc.*;
/**
* Document preprocessing context provides low-level (usually integer-coded) data
* structures useful for further processing.
*
* <p><img src="doc-files/preprocessing-arrays.png"
* alt="Internals of PreprocessingContext"></p>
*/
public final class PreprocessingContext
{
/**
* Text returned by the <code>toString()</code> methods of the nested data holders when
* their arrays have not been populated yet.
*/
private static final String UNINITIALIZED = "[uninitialized]\n";
/** Query used to perform processing, may be <code>null</code>. */
public final String query;
/** A list of documents to process. */
public final List<Document> documents;
/** Language model to be used. */
public final LanguageModel language;
/**
* Token interning cache. Token images are interned to save memory and allow reference
* comparisons.
* <p>
* Populated by {@link #intern(MutableCharArray)} and set to <code>null</code> by
* {@link #preprocessingFinished()}.
*/
private ObjectOpenHashSet<MutableCharArray> tokenCache = ObjectOpenHashSet.newInstance();
/**
 * Builds a context holding the inputs of a preprocessing run. The arguments are stored
 * by reference; no copies are made.
 *
 * @param languageModel language model, exposed as {@link #language}
 * @param documents documents to process, exposed as {@link #documents}
 * @param query query the documents were retrieved for, exposed as {@link #query};
 *        may be <code>null</code>
 */
public PreprocessingContext(LanguageModel languageModel, List<Document> documents,
    String query)
{
    this.language = languageModel;
    this.documents = documents;
    this.query = query;
}
/**
* Information about all tokens of the input {@link PreprocessingContext#documents}.
* Each element of each of the arrays corresponds to one individual token from the
* input or a synthetic separator inserted between documents, fields and sentences.
* Last element of this array is a special terminator entry.
* <p>
* All arrays in this class have the same length and values across different arrays
* correspond to each other for the same index.
*/
public class AllTokens
{
/**
* Token image as it appears in the input. On positions where {@link #type} is
* equal to one of {@link ITokenizer#TF_TERMINATOR},
* {@link ITokenizer#TF_SEPARATOR_DOCUMENT} or
* {@link ITokenizer#TF_SEPARATOR_FIELD} , image is <code>null</code>.
* <p>
* This array is produced by {@link Tokenizer}.
*/
public char [][] image;
/**
* Token's {@link ITokenizer} bit flags.
* <p>
* This array is produced by {@link Tokenizer}.
*/
public short [] type;
/**
* Document field the token came from. The index points to arrays in
* {@link AllFields}, equal to <code>-1</code> for document and field separators.
* <p>
* This array is produced by {@link Tokenizer}.
*/
public byte [] fieldIndex;
/**
* Index of the document this token came from, points to elements of
* {@link PreprocessingContext#documents}. Equal to <code>-1</code> for document
* separators.
* <p>
* This array is produced by {@link Tokenizer}.
* </p>
* <p>
* This array is accessed in {@link CaseNormalizer} and {@link PhraseExtractor}
* to compute by-document statistics, e.g. tf-by document, which are then needed
* to build a VSM or assign documents to labels. An alternative to this representation
* would be creating an <code>AllDocuments</code> holder and keep there an array
* of start token indexes for each document and then refactor the model building code
* to do a binary search to determine the document index given token index. This is
* likely to be a significant performance hit because model building code accesses
* the documentIndex array pretty much randomly (in the suffix order), so we'd be
* doing twice-the-number-of-tokens binary searches. Unless there's some other
* data structure that can help us here.
* </p>
*/
public int [] documentIndex;
/**
* A pointer to {@link AllWords} arrays for this token. Equal to <code>-1</code>
* for document, field and {@link ITokenizer#TT_PUNCTUATION} tokens (including
* sentence separators).
* <p>
* This array is produced by {@link CaseNormalizer}.
*/
public int [] wordIndex;
/**
* The suffix order of tokens. Suffixes starting with a separator come at the end
* of the array.
* <p>
* This array is produced by {@link PhraseExtractor}.
*/
public int [] suffixOrder;
/**
* The Longest Common Prefix for the adjacent suffix-sorted token sequences.
* <p>
* This array is produced by {@link PhraseExtractor}.
*/
public int [] lcp;
/**
 * Renders the token arrays as text tables, for debugging purposes.
 * <p>
 * The first table has one row per token: its image, type, field index (with the field
 * name resolved through {@link AllFields#name}), document index and word index (with
 * the word image resolved through {@link AllWords#image}). If {@link #suffixOrder} is
 * available, a second table follows with one row per suffix: the suffix order entry,
 * the {@link #lcp} value and the images of up to 5 words starting at that suffix.
 *
 * @return the rendered tables, or the "uninitialized" marker if {@link #image} has
 *         not been assigned yet
 */
@Override
public String toString()
{
    if (image == null)
    {
        return UNINITIALIZED;
    }

    StringWriter sw = new StringWriter();
    TabularOutput t = new TabularOutput(sw);
    t.flushEvery(Integer.MAX_VALUE);
    t.addColumn("#");
    t.addColumn("token").alignLeft();
    t.addColumn("type");
    t.addColumn("fieldIndex");
    t.addColumn("=>field").alignLeft();
    t.addColumn("docIdx");
    t.addColumn("wordIdx");
    t.addColumn("=>word").alignLeft();

    // image and wordIndex are produced by different components (Tokenizer and
    // CaseNormalizer respectively), so wordIndex may not have been assigned yet.
    final boolean hasWordIndex = wordIndex != null;
    for (int i = 0; i < image.length; i++, t.nextRow())
    {
        final int word = hasWordIndex ? wordIndex[i] : -1;
        t.rowData(
            i,
            image[i] == null ? "<null>" : new String(image[i]),
            type[i],
            fieldIndex[i],
            fieldIndex[i] >= 0 ? allFields.name[fieldIndex[i]] : null,
            documentIndex[i],
            hasWordIndex ? Integer.valueOf(word) : null,
            word >= 0 ? new String(allWords.image[word]) : null);
    }

    if (suffixOrder != null)
    {
        // The token table was set up with flushEvery(Integer.MAX_VALUE), so its rows
        // are presumably held back until flush(). Write them out before 't' is replaced,
        // otherwise they would never reach 'sw'.
        t.flush();

        t = new TabularOutput(sw);
        t.addColumn("#");
        t.addColumn("sa");
        t.addColumn("lcp");
        t.addColumn("=>words").alignLeft();
        sw.append("\n");

        // Number of words shown for each suffix.
        final int windowLength = 5;
        final StringBuilder suffixImage = new StringBuilder();
        for (int i = 0; i < suffixOrder.length; i++, t.nextRow())
        {
            t.rowData(
                i,
                suffixOrder[i],
                lcp[i]);

            for (int j = suffixOrder[i], max = Math.min(suffixOrder[i] + windowLength, wordIndex.length); j < max;)
            {
                // Tokens without a word entry are shown as "|".
                suffixImage.append(
                    wordIndex[j] >= 0 ? new String(allWords.image[wordIndex[j]]) : "|").append(" ");

                // Mark suffixes that continue past the displayed window.
                if (++j == max && j != wordIndex.length)
                {
                    suffixImage.append(" [...]");
                }
            }
            t.rowData(suffixImage.toString());
            suffixImage.setLength(0);
        }
        sw.append("\n");
    }

    t.flush();
    sw.append("\n");
    return sw.toString();
}
}
/**
* Information about all tokens of the input {@link PreprocessingContext#documents}.
*/
public final AllTokens allTokens = new AllTokens();
/**
* Information about all fields processed for the input
* {@link PreprocessingContext#documents}.
*/
public static class AllFields
{
/**
* Name of the document field. Entries of {@link AllTokens#fieldIndex} point to
* this array.
* <p>
* This array is produced by {@link Tokenizer}.
*/
public String [] name;
/**
 * Lists the field names together with their indexes, for debugging purposes.
 *
 * @return a text table with one row per entry of {@link #name}, or the
 *         "uninitialized" marker if {@link #name} has not been assigned yet
 */
@Override
public String toString()
{
    if (name == null)
    {
        return UNINITIALIZED;
    }

    final StringWriter buffer = new StringWriter();
    final TabularOutput table = new TabularOutput(buffer);
    table.flushEvery(Integer.MAX_VALUE);
    table.addColumn("#");
    table.addColumn("name").format("%-10s").alignLeft();

    for (int index = 0; index < name.length; index++)
    {
        table.rowData(index, name[index]).nextRow();
    }

    table.flush();
    buffer.append("\n");
    return buffer.toString();
}
}
/**
* Information about all fields processed for the input
* {@link PreprocessingContext#documents}.
*/
public final AllFields allFields = new AllFields();
/**
* Information about all unique words found in the input
* {@link PreprocessingContext#documents}. An entry in each parallel array corresponds to one
* conflated form of a word. For example, <em>data</em> and <em>DATA</em> will most likely become
* a single entry in the words table. However, different grammatical forms of a single lemma
* (like <em>computer</em> and <em>computers</em>) will have different entries in the
* words table. See {@link AllStems} for inflection-conflated versions.
* <p>
* All arrays in this class have the same length and values across different arrays
* correspond to each other for the same index.
*/
public class AllWords
{
/**
* The most frequently appearing variant of the word with respect to case. E.g. if
* a token <em>MacOS</em> appeared 12 times in the input and <em>macos</em>
* appeared 3 times, the image will be equal to <em>MacOS</em>.
* <p>
* This array is produced by {@link CaseNormalizer}.
*/
public char [][] image;
/**
* Token type of this word copied from {@link AllTokens#type}. Additional
* flags are set for each word by
* {@link CaseNormalizer} and {@link LanguageModelStemmer}.
*
* <p>
* This array is produced by {@link CaseNormalizer}.
* This array is modified by {@link LanguageModelStemmer}.
*
* @see ITokenizer
*/
public short [] type;
/**
* Term Frequency of the word, aggregated across all variants with respect to
* case. Frequencies for each variant separately are not available.
* <p>
* This array is produced by {@link CaseNormalizer}.
*/
public int [] tf;
/**
* Term Frequency of the word for each document. The length of this array is equal
* to the number of documents this word appeared in (Document Frequency)
* multiplied by 2. Elements at even indices contain document indices pointing to
* {@link PreprocessingContext#documents}, elements at odd indices contain the
* frequency of the word in the document. For example, an array with 4 values:
* <code>[2, 15, 138, 7]</code> means that the word appeared 15 times in document
* at index 2 and 7 times in document at index 138.
* <p>
* This array is produced by {@link CaseNormalizer}. The order of documents in this
* array is not defined.
*/
public int [][] tfByDocument;
/**
* A pointer to the {@link AllStems} arrays for this word.
* <p>
* This array is produced by {@link LanguageModelStemmer}.
*/
public int [] stemIndex;
/**
* Bit-packed indices of all fields in which this word appears at least once.
* Indexes (positions) of selected bits are pointers to the
* {@link AllFields} arrays. Fast conversion between the bit-packed representation
* and <code>byte[]</code> with index values is done by {@link #toFieldIndexes(byte)}
* <p>
* This array is produced by {@link CaseNormalizer}.
*/
public byte [] fieldIndices;
/**
 * Renders the word arrays as a text table, for debugging purposes. Each row shows the
 * word image, type, term frequency, per-document frequencies and field indexes. The
 * stem index and stem image columns are added only when {@link #stemIndex} is available.
 *
 * @return the rendered table, or the "uninitialized" marker if {@link #image} has not
 *         been assigned yet
 */
@Override
public String toString()
{
    if (image == null)
    {
        return UNINITIALIZED;
    }

    final StringWriter out = new StringWriter();
    final TabularOutput table = new TabularOutput(out);
    table.flushEvery(Integer.MAX_VALUE);

    table.addColumn("#");
    table.addColumn("image").alignLeft();
    table.addColumn("type");
    table.addColumn("tf");
    table.addColumn("tfByDocument").alignLeft();
    table.addColumn("fieldIndices");
    if (stemIndex != null)
    {
        table.addColumn("stemIndex");
        table.addColumn("=>stem").alignLeft();
    }

    for (int w = 0; w < image.length; w++)
    {
        table.rowData(
            w,
            image[w] == null ? "<null>" : new String(image[w]),
            type[w],
            tf[w],
            SparseArray.sparseToString(tfByDocument[w]));

        final int [] fields = toFieldIndexes(fieldIndices[w]);
        table.rowData(Arrays.toString(fields).replace(" ", ""));

        if (stemIndex != null)
        {
            final int stem = stemIndex[w];
            table.rowData(stem);
            table.rowData(new String(allStems.image[stem]));
        }

        table.nextRow();
    }

    table.flush();
    out.append("\n");
    return out.toString();
}
}
/**
* Information about all unique words found in the input
* {@link PreprocessingContext#documents}.
*/
public final AllWords allWords = new AllWords();
/**
* Information about all unique stems found in the input
* {@link PreprocessingContext#documents}. Each entry in each array corresponds to one
* base form different words can be transformed to by the {@link IStemmer} used while
* processing. E.g. the English <em>mining</em> and <em>mine</em> will be aggregated
* to one entry in the arrays, while they will have separate entries in
* {@link AllWords}.
* <p>
* All arrays in this class have the same length and values across different arrays
* correspond to each other for the same index.
*/
public class AllStems
{
/**
* Stem image as produced by the {@link IStemmer}, may not correspond to any
* correct word.
* <p>
* This array is produced by {@link LanguageModelStemmer}.
*/
public char [][] image;
/**
* Pointer to the {@link AllWords} arrays, to the most frequent original form of
* the stem. Pointers to the less frequent variants are not available.
* <p>
* This array is produced by {@link LanguageModelStemmer}.
*/
public int [] mostFrequentOriginalWordIndex;
/**
* Term frequency of the stem, i.e. the sum of all {@link AllWords#tf} values
* for which the {@link AllWords#stemIndex} points to this stem.
* <p>
* This array is produced by {@link LanguageModelStemmer}.
*/
public int [] tf;
/**
* Term frequency of the stem for each document. For the encoding of this array,
* see {@link AllWords#tfByDocument}.
* <p>
* This array is produced by {@link LanguageModelStemmer}. The order of documents in this
* array is not defined.
*/
public int [][] tfByDocument;
/**
* Bit-packed indices of all fields in which this stem appears at least once.
* Indexes (positions) of selected bits are pointers to the
* {@link AllFields} arrays. Fast conversion between the bit-packed representation
* and <code>byte[]</code> with index values is done by {@link #toFieldIndexes(byte)}
* <p>
* This array is produced by {@link LanguageModelStemmer}
*/
public byte [] fieldIndices;
/**
 * Renders the stem arrays as a text table, for debugging purposes. Each row shows the
 * stem image, the index and image of its most frequent original word, the term
 * frequency, the per-document frequencies and the field indexes.
 *
 * @return the rendered table, or the "uninitialized" marker if {@link #image} has not
 *         been assigned yet
 */
@Override
public String toString()
{
    if (image == null)
    {
        return UNINITIALIZED;
    }

    final StringWriter out = new StringWriter();
    final TabularOutput table = new TabularOutput(out);
    table.flushEvery(Integer.MAX_VALUE);

    table.addColumn("#");
    table.addColumn("stem");
    table.addColumn("mostFrqWord");
    table.addColumn("=>mostFrqWord").alignLeft();
    table.addColumn("tf");
    table.addColumn("tfByDocument").alignLeft();
    table.addColumn("fieldIndices");

    for (int s = 0; s < image.length; s++)
    {
        table.rowData(
            s,
            image[s] == null ? "<null>" : new String(image[s]),
            mostFrequentOriginalWordIndex[s],
            new String(allWords.image[mostFrequentOriginalWordIndex[s]]),
            tf[s],
            SparseArray.sparseToString(tfByDocument[s]),
            Arrays.toString(toFieldIndexes(fieldIndices[s])).replace(" ", ""));
        table.nextRow();
    }

    table.flush();
    out.append("\n");
    return out.toString();
}
}
/**
* Information about all unique stems found in the input
* {@link PreprocessingContext#documents}.
*/
public final AllStems allStems = new AllStems();
/**
* Information about all frequently appearing sequences of words found in the input
* {@link PreprocessingContext#documents}. Each entry in each array corresponds to one
* sequence.
* <p>
* All arrays in this class have the same length and values across different arrays
* correspond to each other for the same index.
*/
public class AllPhrases
{
/**
* Pointers to {@link AllWords} for each word in the phrase sequence.
* <p>
* This array is produced by {@link PhraseExtractor}.
*/
public int [][] wordIndices;
/**
* Term frequency of the phrase.
* <p>
* This array is produced by {@link PhraseExtractor}.
*/
public int [] tf;
/**
* Term frequency of the phrase for each document. The encoding of this
* array is similar to {@link AllWords#tfByDocument}: consecutive pairs of:
* document index, frequency.
* <p>
* This array is produced by {@link PhraseExtractor}. The order of documents in this
* array is not defined.
*/
public int [][] tfByDocument;
/**
 * Renders the phrase arrays as a text table, for debugging purposes. Each row shows
 * the word indexes of the phrase, the phrase text as returned by
 * {@link #getPhrase(int)}, the term frequency and the per-document frequencies.
 *
 * @return the rendered table, or the "uninitialized" marker if {@link #wordIndices}
 *         has not been assigned yet
 */
@Override
public String toString()
{
    if (wordIndices == null)
    {
        return UNINITIALIZED;
    }

    final StringWriter out = new StringWriter();
    final TabularOutput table = new TabularOutput(out);
    table.flushEvery(Integer.MAX_VALUE);

    table.addColumn("#");
    table.addColumn("wordIndices");
    table.addColumn("=>words").alignLeft();
    table.addColumn("tf");
    table.addColumn("tfByDocument").alignLeft();

    for (int p = 0; p < wordIndices.length; p++)
    {
        table.rowData(
            p,
            Arrays.toString(wordIndices[p]).replace(" ", ""),
            getPhrase(p),
            tf[p],
            SparseArray.sparseToString(tfByDocument[p]));
        table.nextRow();
    }

    table.flush();
    out.append("\n");
    return out.toString();
}
/**
 * Builds the text of a phrase by joining the images of its words with single spaces.
 *
 * @param index index of the phrase in {@link #wordIndices}
 * @return a newly created character sequence with the space-separated word images
 */
public CharSequence getPhrase(int index)
{
    final StringBuilder phrase = new StringBuilder();
    final int [] words = wordIndices[index];
    for (int position = 0; position < words.length; position++)
    {
        if (position != 0)
        {
            phrase.append(' ');
        }
        phrase.append(allWords.image[words[position]]);
    }
    return phrase;
}
/**
* Returns length of all arrays in this {@link AllPhrases}.
*
* @return the length of {@link #wordIndices}; the other arrays of this class are
* documented to have the same length
* @throws NullPointerException if {@link #wordIndices} has not been assigned yet
*/
public int size()
{
return wordIndices.length;
}
}
/**
* Information about all frequently appearing sequences of words found in the input
* {@link PreprocessingContext#documents}.
*/
public AllPhrases allPhrases = new AllPhrases();
/**
* Information about words and phrases that might be good cluster label candidates.
* Each entry in each array corresponds to one label candidate.
* <p>
* All arrays in this class have the same length and values across different arrays
* correspond to each other for the same index.
*/
public class AllLabels
{
/**
* Feature index of the label candidate. Features whose values are less than the
* size of {@link AllWords} arrays are single word features and point to entries
* in {@link AllWords}. Features whose values are larger or equal to the size of
* {@link AllWords}, after subtracting the size of {@link AllWords}, point to
* {@link AllPhrases}.
* <p>
* This array is produced by {@link LabelFilterProcessor}.
*/
public int [] featureIndex;
/**
* Indices of documents assigned to the label candidate.
* <p>
* This array is produced by {@link DocumentAssigner}.
*/
public BitSet [] documentIndices;
/**
* The first index in {@link #featureIndex} which
* points to {@link AllPhrases}, or -1 if there are no phrases
* in {@link #featureIndex}.
* <p>
* This value is set by {@link LabelFilterProcessor}.
*
* @see #featureIndex
*/
public int firstPhraseIndex;
/**
 * Renders the label candidate arrays as a text table, for debugging purposes. Each row
 * shows the feature index, the label text it resolves to and, if
 * {@link #documentIndices} is available, the documents assigned to the label.
 *
 * @return the rendered table, or the "uninitialized" marker if {@link #featureIndex}
 *         has not been assigned yet
 */
@Override
public String toString()
{
    if (featureIndex == null)
    {
        return UNINITIALIZED;
    }

    StringWriter sw = new StringWriter();
    TabularOutput t = new TabularOutput(sw);
    t.flushEvery(Integer.MAX_VALUE);
    t.addColumn("#");
    t.addColumn("featureIdx");
    t.addColumn("=>feature").alignLeft();
    t.addColumn("documentIdx").alignLeft();

    for (int i = 0; i < featureIndex.length; i++, t.nextRow())
    {
        t.rowData(
            i,
            featureIndex[i],
            getLabel(i),
            documentIndices != null ? documentIndices[i].toString().replace(" ", "") : "");
    }

    t.flush();
    sw.append("\n");

    // The table is written into 'sw'; return the writer's contents, as the other
    // holders do, rather than the string form of the TabularOutput object itself.
    return sw.toString();
}
/**
 * Resolves the text of a label candidate. Feature indexes below the number of words
 * point to {@link AllWords#image}; the remaining ones, after subtracting the number of
 * words, point to {@link AllPhrases}.
 *
 * @param index index of the label candidate in {@link #featureIndex}
 * @return the word image or the space-separated phrase text
 */
private CharSequence getLabel(int index)
{
    final int wordCount = allWords.image.length;
    final int feature = featureIndex[index];
    if (feature >= wordCount)
    {
        return allPhrases.getPhrase(feature - wordCount);
    }
    return new String(allWords.image[feature]);
}
}
/**
* Information about words and phrases that might be good cluster label candidates.
*/
public final AllLabels allLabels = new AllLabels();
/**
 * Returns <code>true</code> if this context contains any words.
 *
 * @return <code>true</code> if {@link AllWords#image} has been assigned and is not
 *         empty; <code>false</code> otherwise, including when it is still
 *         <code>null</code>
 */
public boolean hasWords()
{
    // Null-safe, consistently with hasLabels(): the array stays null until it is assigned.
    final char [][] wordImages = allWords.image;
    return wordImages != null && wordImages.length > 0;
}
/**
 * Tells whether any label candidates are available in this context.
 *
 * @return <code>true</code> if {@link AllLabels#featureIndex} has been assigned and is
 *         not empty
 */
public boolean hasLabels()
{
    final int [] features = allLabels.featureIndex;
    if (features == null)
    {
        return false;
    }
    return features.length > 0;
}
/**
 * Dumps the whole context for debugging purposes: a header line with this object's
 * hash code followed by the string forms of the fields, tokens, words, stems, phrases
 * and labels holders, each under its own heading.
 */
@Override
public String toString()
{
    final StringBuilder dump = new StringBuilder();
    dump.append("PreprocessingContext 0x").append(Integer.toHexString(hashCode())).append("\n");
    dump.append("== Fields:\n").append(allFields.toString());
    dump.append("== Tokens:\n").append(allTokens.toString());
    dump.append("== Words:\n").append(allWords.toString());
    dump.append("== Stems:\n").append(allStems.toString());
    dump.append("== Phrases:\n").append(allPhrases.toString());
    dump.append("== Labels:\n").append(allLabels.toString());
    return dump.toString();
}
/**
 * Lookup table from every possible byte value (taken as unsigned, 0..255) to the
 * ascending positions of the bits set in that value.
 */
private final static int [][] bitsCache;
static
{
    final int [][] cache = new int [0x100][];
    for (int value = 0; value < cache.length; value++)
    {
        final int [] positions = new int [Integer.bitCount(value)];
        int next = 0;
        for (int bit = 0; bit < 8; bit++)
        {
            if ((value & (1 << bit)) != 0)
            {
                positions[next++] = bit;
            }
        }
        cache[value] = positions;
    }
    bitsCache = cache;
}

/**
 * Converts the bits set in a byte to an array of their positions.
 *
 * @param b bit-packed value; it is read as an unsigned quantity
 * @return positions of the set bits in ascending order, position 0 being the least
 *         significant bit. The returned array is the shared lookup table entry, not a
 *         copy, so callers must not modify it.
 */
public static int [] toFieldIndexes(byte b)
{
    final int unsigned = b & 0xff;
    return bitsCache[unsigned];
}
/*
* These should really be package-private, shouldn't they? We'd need to move classes under pipeline
* here for accessibility.
*/
/**
* This method should be invoked after all preprocessing contributors have been executed
* to release temporary data structures.
* <p>
* Drops the reference to the token interning cache. {@link #intern(MutableCharArray)}
* dereferences that cache without a null check, so it must not be called afterwards.
*/
public void preprocessingFinished()
{
// Release the interning cache; no other field is touched.
this.tokenCache = null;
}
/**
 * Returns the canonical char buffer for the given character sequence. If an equal
 * sequence is already in the token cache, the buffer of the cached entry is returned.
 * Otherwise the characters are copied into a new, exactly-sized array, which is added
 * to the cache and returned.
 *
 * @param chs character sequence to intern; only read, never stored
 * @return the interned buffer for the sequence
 */
public char [] intern(MutableCharArray chs)
{
    if (!tokenCache.contains(chs))
    {
        // First occurrence: store a private copy so that the cache does not depend on
        // the caller's (mutable) buffer.
        final int length = chs.length();
        final char [] copy = new char [length];
        System.arraycopy(chs.getBuffer(), chs.getStart(), copy, 0, length);
        tokenCache.add(new MutableCharArray(copy));
        return copy;
    }

    // contains() reported a hit; lkey() is expected to return the key it matched.
    return tokenCache.lkey().getBuffer();
}
}