// Compute the IDF scores for this dataset before any feature is registered.
// NOTE(review): presumably this produces the word-idf files that the
// IDF-weighted measures configured further down point at -- confirm.
WordIdfValuesGenerator.computeIdfScores(dataset);

// ** FEATURES **

// String features

// Greedy String Tiling, configured with a minimum match length of 3.
configs.add(new FeatureConfig(
        createExternalResourceDescription(
                GreedyStringTilingMeasureResource.class,
                GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
        Document.class.getName(),
        false,
        "string",
        "GreedyStringTiling_3"));

// Longest common subsequence / substring measures. They share one resource
// class in "text" mode and differ only in the comparator that is plugged in.
// Each row is { comparator class name, measure name }.
for (String[] lcsMeasure : new String[][] {
        { LongestCommonSubsequenceComparator.class.getName(), "LongestCommonSubsequenceComparator" },
        { LongestCommonSubsequenceNormComparator.class.getName(), "LongestCommonSubsequenceNormComparator" },
        { LongestCommonSubstringComparator.class.getName(), "LongestCommonSubstringComparator" } })
{
    configs.add(new FeatureConfig(
            createExternalResourceDescription(
                    SimpleTextSimilarityResource.class,
                    SimpleTextSimilarityResource.PARAM_MODE, "text",
                    SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, lcsMeasure[0]),
            Document.class.getName(),
            false,
            "string",
            lcsMeasure[1]));
}

// Cosine similarity, registered on the lemma "value" feature path rather
// than on whole documents.
configs.add(new FeatureConfig(
        createExternalResourceDescription(
                CosineSimilarityResource.class),
        Lemma.class.getName() + "/value",
        false,
        "string",
        "CosineSimilarity"));
// n-gram models

// Character n-gram measures for n = 2..10. Each measure is handed the path of
// a per-n, per-dataset IDF values file below UTILS_DIR/character-ngrams-idf.
ngrams_n = new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 };
for (int n : ngrams_n)
{
    configs.add(new FeatureConfig(
            createExternalResourceDescription(
                    CharacterNGramResource.class,
                    // Integer.toString(int) yields the same text as the deprecated
                    // new Integer(n).toString() without allocating a box.
                    CharacterNGramResource.PARAM_N, Integer.toString(n),
                    CharacterNGramResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/character-ngrams-idf/" + n + "/" + dataset.toString() + ".txt"),
            Document.class.getName(),
            false,
            "n-grams",
            "CharacterNGramMeasure_" + n));
}
// Word n-gram measures for n = 1..5 on Token annotations. A single array
// serves every loop below; none of them modifies it.
ngrams_n = new int[] { 1, 2, 3, 4, 5 };

// Containment: first all plain variants, then all variants registered with the
// boolean flag set to true (named with the "_stopword-filtered" suffix).
for (boolean stopwordFiltered : new boolean[] { false, true })
{
    for (int n : ngrams_n)
    {
        configs.add(new FeatureConfig(
                createExternalResourceDescription(
                        WordNGramContainmentResource.class,
                        // Integer.toString(int) replaces the deprecated Integer(int) constructor.
                        WordNGramContainmentResource.PARAM_N, Integer.toString(n)),
                Token.class.getName(),
                stopwordFiltered,
                "n-grams",
                "WordNGramContainmentMeasure_" + n + (stopwordFiltered ? "_stopword-filtered" : "")));
    }
}

// Jaccard: same registration order as above (plain variants, then filtered).
for (boolean stopwordFiltered : new boolean[] { false, true })
{
    for (int n : ngrams_n)
    {
        configs.add(new FeatureConfig(
                createExternalResourceDescription(
                        WordNGramJaccardResource.class,
                        WordNGramJaccardResource.PARAM_N, Integer.toString(n)),
                Token.class.getName(),
                stopwordFiltered,
                "n-grams",
                "WordNGramJaccardMeasure_" + n + (stopwordFiltered ? "_stopword-filtered" : "")));
    }
}
/* TODO: If you plan to use the following measures, make sure that you have the
 * necessary resources installed.
 * Details on obtaining and installing them can be found here:
 * http://code.google.com/p/dkpro-similarity-asl/wiki/SettingUpTheResources
 *
 * NOTE(review): the link above points at Google Code, which has been retired;
 * confirm where the setup instructions live now.
 */
// Resnik word similarity measure, aggregated according to Mihalcea et al. (2006).
// The aggregate wraps a Resnik relatedness resource configured with resource
// name "wordnet" and language "en", and is handed the per-dataset word IDF file
// below UTILS_DIR/word-idf. It is registered on the lemma "value" feature path.
configs.add(new FeatureConfig(
createExternalResourceDescription(
MCS06AggregateResource.class,
MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
ResnikRelatednessResource.class,
ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",
ResnikRelatednessResource.PARAM_RESOURCE_LANGUAGE, "en"
),
MCS06AggregateResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/word-idf/" + dataset.toString() + ".txt"),
Lemma.class.getName() + "/value",
false,
"word-sim",
"MCS06_Resnik_WordNet"
));
// Lexical Substitution System wrapper for
// Resnik word similarity measure, aggregated according to Mihalcea et al. (2006).
// The wrapped text similarity resource is configured exactly like the
// MCS06/Resnik aggregate registered just above.
// NOTE(review): this FeatureConfig call passes only three arguments (resource,
// "word-sim", measure name); every other call in this section also passes an
// annotation path and a boolean flag. Confirm that a matching constructor
// exists and that omitting those two arguments is intended.
configs.add(new FeatureConfig(
createExternalResourceDescription(
TWSISubstituteWrapperResource.class,
TWSISubstituteWrapperResource.PARAM_TEXT_SIMILARITY_RESOURCE, createExternalResourceDescription(
MCS06AggregateResource.class,
MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
ResnikRelatednessResource.class,
ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",
ResnikRelatednessResource.PARAM_RESOURCE_LANGUAGE, "en"
),
MCS06AggregateResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/word-idf/" + dataset.toString() + ".txt")),
"word-sim",
"TWSI_MCS06_Resnik_WordNet"
));
// Explicit Semantic Analysis
// One measure per vector index, all registered on the lemma "value" feature
// path. Each row is { index location relative to the DKPro workspace,
// measure name }; the workspace path is resolved for every registration.
for (String[] esaIndex : new String[][] {
        { "/ESA/VectorIndexes/wordnet_eng_lem_nc_c", "ESA_WordNet" },
        { "/ESA/VectorIndexes/wiktionary_en", "ESA_Wiktionary" },
        { "/ESA/VectorIndexes/wp_eng_lem_nc_c", "ESA_Wikipedia" } })
{
    configs.add(new FeatureConfig(
            createExternalResourceDescription(
                    VectorIndexSourceRelatednessResource.class,
                    VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DkproContext.getContext().getWorkspace().getAbsolutePath() + esaIndex[0]),
            Lemma.class.getName() + "/value",
            false,
            "esa",
            esaIndex[1]));
}

// LSA
// Latent Semantic Analysis over Token annotations; its input directory is the
// per-dataset folder below UTILS_DIR/plaintexts.
configs.add(new FeatureConfig(
        createExternalResourceDescription(
                LatentSemanticAnalysisResource.class,
                LatentSemanticAnalysisResource.PARAM_INPUT_DIR, UTILS_DIR + "/plaintexts/" + dataset.toString()),
        Token.class.getName(),
        false,
        "lsa",
        "LSA"));
// ** Structure **

// Token pair distance and ordering measures, both on Token annotations.
configs.add(new FeatureConfig(
        createExternalResourceDescription(
                TokenPairDistanceResource.class),
        Token.class.getName(),
        false,
        "structure",
        "TokenPairDistanceMeasure"));
configs.add(new FeatureConfig(
        createExternalResourceDescription(
                TokenPairOrderingResource.class),
        Token.class.getName(),
        false,
        "structure",
        "TokenPairOrderingMeasure"));

// Stopword n-gram containment for n = 2..7, once per stopword list.
// Each row is { stopword list location, measure name suffix }. The outer loop
// runs over the lists so that all measures of one list are registered before
// the next list starts.
for (String[] stopwordList : new String[][] {
        { "classpath:/stopwords/stopwords_english_punctuation.txt", "_english-punctuation" },
        { "classpath:/stopwords/function-words-mosteller-wallace.txt", "_mosteller-wallace" },
        { "classpath:/stopwords/stopwords-bnc-stamatatos.txt", "_stamatatos" } })
{
    for (int n = 2; n <= 7; n++) {
        configs.add(new FeatureConfig(
                createExternalResourceDescription(
                        StopwordNGramContainmentMeasureResource.class,
                        // Integer.toString(int) replaces the deprecated Integer(int) constructor.
                        StopwordNGramContainmentMeasureResource.PARAM_N, Integer.toString(n),
                        StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, stopwordList[0]),
                Token.class.getName(),
                false,
                "structure",
                "StopwordNGramContainmentMeasure_" + n + stopwordList[1]));
    }
}
// Part-of-speech n-gram measures for n = 1..7 on POS annotations:
// first all Jaccard variants, then all containment variants.
for (int n = 1; n <= 7; n++)
{
    configs.add(new FeatureConfig(
            createExternalResourceDescription(
                    PosNGramJaccardResource.class,
                    // Integer.toString(int) replaces the deprecated Integer(int) constructor.
                    PosNGramJaccardResource.PARAM_N, Integer.toString(n)),
            POS.class.getName(),
            false,
            "structure",
            "PosNGramJaccardMeasure_" + n));
}
for (int n = 1; n <= 7; n++)
{
    configs.add(new FeatureConfig(
            createExternalResourceDescription(
                    PosNGramContainmentResource.class,
                    PosNGramContainmentResource.PARAM_N, Integer.toString(n)),
            POS.class.getName(),
            false,
            "structure",
            "PosNGramContainmentMeasure_" + n));
}
// ** Style **
// The six measures below are created without resource parameters and pass
// null where the other sections pass an annotation type name or feature path.
// NOTE(review): presumably these measures do not need a segment annotation;
// confirm how FeatureConfig treats a null path.
configs.add(new FeatureConfig(
createExternalResourceDescription(
MTLDResource.class),
null,
false,
"style",
"MTLDComparator"
));
configs.add(new FeatureConfig(
createExternalResourceDescription(
TypeTokenRatioResource.class),
null,
false,
"style",
"TypeTokenRatioComparator"
));
configs.add(new FeatureConfig(
createExternalResourceDescription(
AvgCharactersPerTokenResource.class),
null,
false,
"style",
"AvgCharactersPerTokenComparator"
));
configs.add(new FeatureConfig(
createExternalResourceDescription(
AvgTokensPerSentenceResource.class),
null,
false,
"style",
"AvgTokensPerSentenceComparator"
));
configs.add(new FeatureConfig(
createExternalResourceDescription(
SentenceRatioResource.class),
null,
false,
"style",
"SentenceRatioComparator"
));
configs.add(new FeatureConfig(
createExternalResourceDescription(
TokenRatioResource.class),
null,
false,
"style",
"TokenRatioComparator"
));
// Function word frequency measures on Token annotations, one per word list.
// The list locations are the same classpath files that the stopword n-gram
// containment measures in the structure section use.
configs.add(new FeatureConfig(
createExternalResourceDescription(
FunctionWordFrequenciesMeasureResource.class,
FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "classpath:/stopwords/stopwords_english_punctuation.txt"),
Token.class.getName(),
false,
"style",
"FunctionWordFrequenciesMeasure_english-punctuation"
));
configs.add(new FeatureConfig(
createExternalResourceDescription(
FunctionWordFrequenciesMeasureResource.class,
FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "classpath:/stopwords/function-words-mosteller-wallace.txt"),
Token.class.getName(),
false,
"style",
"FunctionWordFrequenciesMeasure_mosteller-wallace"
));
configs.add(new FeatureConfig(
createExternalResourceDescription(
FunctionWordFrequenciesMeasureResource.class,
FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
Token.class.getName(),
false,