// Invoke the word-IDF generator for this dataset before any feature is registered.
WordIdfValuesGenerator.computeIdfScores(dataset);

// Feature set registered for the Wikipedia Rewrite Corpus.
// NOTE(review): the trailing two string arguments of FeatureConfig look like a
// feature group and a feature name -- confirm against the FeatureConfig constructor.
if (dataset.equals(Dataset.WikipediaRewriteCorpus))
{
    // "string" group: longest common subsequence, run in "text" mode.
    configs.add(new FeatureConfig(
            createExternalResourceDescription(
                    SimpleTextSimilarityResource.class,
                    SimpleTextSimilarityResource.PARAM_MODE, "text",
                    SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
            Document.class.getName(),
            false,
            "string",
            "LongestCommonSubsequenceComparator"
    ));

    // "structure" group: stopword n-gram containment with PARAM_N "10".
    configs.add(new FeatureConfig(
            createExternalResourceDescription(
                    StopwordNGramContainmentMeasureResource.class,
                    StopwordNGramContainmentMeasureResource.PARAM_N, "10",
                    StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
            Token.class.getName(),
            false,
            "structure",
            "StopwordNGramContainmentMeasure_10"
    ));

    // "n-grams" group: one character n-gram feature per entry of ngrams_n.
    // The IDF values file is resolved per n and per dataset below UTILS_DIR.
    ngrams_n = new int[] { 5 };
    for (int n : ngrams_n)
    {
        configs.add(new FeatureConfig(
                createExternalResourceDescription(
                        CharacterNGramResource.class,
                        // Integer.toString(n) replaces the deprecated new Integer(n).toString();
                        // the resulting string is identical.
                        CharacterNGramResource.PARAM_N, Integer.toString(n),
                        CharacterNGramResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/character-ngrams-idf/" + n + "/" + dataset.toString() + ".txt"),
                Document.class.getName(),
                false,
                "n-grams",
                "CharacterNGramMeasure_" + n
        ));
    }
}
else if (dataset.equals(Dataset.MeterCorpus))
{
// Feature set registered for the METER corpus: one "string", one "structure"
// and one "style" feature.
// "string": Greedy String Tiling with PARAM_MIN_MATCH_LENGTH "3".
configs.add(new FeatureConfig(
createExternalResourceDescription(
GreedyStringTilingMeasureResource.class,
GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
Document.class.getName(),
false,
"string",
"GreedyStringTiling_3"
));
// "structure": stopword n-gram containment with PARAM_N "12", using the same
// stopword list as the other dataset branches.
configs.add(new FeatureConfig(
createExternalResourceDescription(
StopwordNGramContainmentMeasureResource.class,
StopwordNGramContainmentMeasureResource.PARAM_N, "12",
StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
Token.class.getName(),
false,
"structure",
"StopwordNGramContainmentMeasure_12"
));
// "style": MTLDResource with no parameters. Note that the feature is
// registered under the name "SequentialTTR", not "MTLD".
configs.add(new FeatureConfig(
createExternalResourceDescription(
MTLDResource.class),
Document.class.getName(),
false,
"style",
"SequentialTTR"
));
}
else if (dataset.equals(Dataset.WebisCrowdParaphraseCorpus))
{
// Feature set registered for the Webis Crowd Paraphrase Corpus.
// Content
// "string": Greedy String Tiling with PARAM_MIN_MATCH_LENGTH "3".
configs.add(new FeatureConfig(
createExternalResourceDescription(
GreedyStringTilingMeasureResource.class,
GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
Document.class.getName(),
false,
"string",
"GreedyStringTiling_3"
));
// The next four features all use SimpleTextSimilarityResource in "text" mode
// and differ only in the comparator class passed as the similarity measure.
// Comparator: longest common subsequence.
configs.add(new FeatureConfig(
createExternalResourceDescription(
SimpleTextSimilarityResource.class,
SimpleTextSimilarityResource.PARAM_MODE, "text",
SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
Document.class.getName(),
false,
"string",
"LongestCommonSubsequenceComparator"
));
// Comparator: longest common subsequence, "Norm" variant.
configs.add(new FeatureConfig(
createExternalResourceDescription(
SimpleTextSimilarityResource.class,
SimpleTextSimilarityResource.PARAM_MODE, "text",
SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceNormComparator.class.getName()),
Document.class.getName(),
false,
"string",
"LongestCommonSubsequenceNormComparator"
));
// Comparator: longest common substring.
configs.add(new FeatureConfig(
createExternalResourceDescription(
SimpleTextSimilarityResource.class,
SimpleTextSimilarityResource.PARAM_MODE, "text",
SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubstringComparator.class.getName()),
Document.class.getName(),
false,
"string",
"LongestCommonSubstringComparator"
));
// Comparator: JaroSecondStringComparator, registered under the short name "Jaro".
configs.add(new FeatureConfig(
createExternalResourceDescription(
SimpleTextSimilarityResource.class,
SimpleTextSimilarityResource.PARAM_MODE, "text",
SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, JaroSecondStringComparator.class.getName()),
Document.class.getName(),
false,
"string",
"Jaro"
));
// "n-grams" group: one word n-gram Jaccard feature for each n in { 6, 14, 15 }.
ngrams_n = new int[] { 6, 14, 15 };
for (int n : ngrams_n)
{
    configs.add(new FeatureConfig(
            createExternalResourceDescription(
                    WordNGramJaccardResource.class,
                    // Integer.toString(n) replaces the deprecated new Integer(n).toString();
                    // the resulting string is identical.
                    WordNGramJaccardResource.PARAM_N, Integer.toString(n)),
            Token.class.getName(),
            false,
            "n-grams",
            "WordNGramJaccardMeasure_" + n
    ));
}
// Structure
// Token pair ordering; unlike the other features here it is keyed on
// Lemma.class.getName() + "/value" rather than on Document or Token.
configs.add(new FeatureConfig(
createExternalResourceDescription(
TokenPairOrderingResource.class),
Lemma.class.getName() + "/value",
false,
"structure",
"TokenPairOrdering"
));
// Stopword n-gram containment with PARAM_N "6", using the same stopword list
// as the other dataset branches.
configs.add(new FeatureConfig(
createExternalResourceDescription(
StopwordNGramContainmentMeasureResource.class,
StopwordNGramContainmentMeasureResource.PARAM_N, "6",
StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
Token.class.getName(),
false,
"structure",
"StopwordNGramContainmentMeasure_6"
));
// Style
// Function word frequencies, with the word list loaded from the classpath.
configs.add(new FeatureConfig(
createExternalResourceDescription(
FunctionWordFrequenciesMeasureResource.class,
FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "classpath:/stopwords/function-words-mosteller-wallace.txt"),
Document.class.getName(),
false,
"style",
"FunctionWordFrequencies"
));
// MTLDResource with no parameters. Note that the feature is registered under
// the name "SequentialTTR", not "MTLD".
configs.add(new FeatureConfig(
createExternalResourceDescription(
MTLDResource.class),
Document.class.getName(),
false,
"style",
"SequentialTTR"
));
// Token ratio, no parameters.
configs.add(new FeatureConfig(
createExternalResourceDescription(
TokenRatioResource.class),
Document.class.getName(),
false,
"style",
"TokenRatio"
));
// Content, again
/* TODO: If you plan to use the following measures, make sure that you have the
 * necessary resources installed.
 * Details on obtaining and installing them can be found here:
 * http://code.google.com/p/dkpro-similarity-asl/wiki/SettingUpTheResources
 */
// Both ESA features below load the same vector index, located under the DKPro
// workspace directory at /ESA/VectorIndexes/wordnet_eng_lem_nc_c, and are keyed
// on Lemma.class.getName() + "/value".
configs.add(new FeatureConfig(
createExternalResourceDescription(
VectorIndexSourceRelatednessResource.class,
VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DKProContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wordnet_eng_lem_nc_c"),
Lemma.class.getName() + "/value",
false,
"esa",
"ESA_WordNet"
));
// Same resource as above; only the boolean argument (true instead of false)
// and the feature name differ.
// NOTE(review): given the "_stopword-filtered" suffix, the boolean presumably
// switches on stopword filtering -- confirm against the FeatureConfig constructor.
configs.add(new FeatureConfig(
createExternalResourceDescription(
VectorIndexSourceRelatednessResource.class,
VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DKProContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wordnet_eng_lem_nc_c"),
Lemma.class.getName() + "/value",
true,
"esa",
"ESA_WordNet_stopword-filtered"
));
// Resnik word similarity measure, aggregated according to Mihalcea et al. (2006)
configs.add(new FeatureConfig(
createExternalResourceDescription(
MCS06AggregateResource.class,
MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
ResnikRelatednessResource.class,
ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",