Package dkpro.similarity.ml

Examples of dkpro.similarity.ml.FeatureConfig


  {
    // Define the features: every FeatureConfig appended to this list pairs an
    // external resource description with the settings passed alongside it.
    // NOTE(review): the five FeatureConfig arguments look like (resource,
    // annotation type name or feature path, stopword-filter flag, group,
    // measure name) -- confirm against the FeatureConfig constructor.
    List<FeatureConfig> configs = new ArrayList<FeatureConfig>();

    // String features: each entry wraps one comparator class in a
    // SimpleTextSimilarityResource with PARAM_MODE "text", passes the Document
    // type name, and is grouped under "content/string".

    // Jaro comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, JaroSecondStringComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "Jaro"
        ));

    // Jaro-Winkler comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, JaroWinklerSecondStringComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "JaroWinkler"
        ));

    // Monge-Elkan comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, MongeElkanSecondStringComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "MongeElkan"
        ));

    // Levenshtein comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LevenshteinComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "Levenshtein"
        ));

    // Disabled: the Greedy String Tiling measure (minimum match length 3) is
    // not registered in this configuration.
//    configs.add(new FeatureConfig(
//        createExternalResourceDescription(
//              GreedyStringTilingMeasureResource.class,
//              GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
//        Document.class.getName(),
//        false,
//        "content/string",
//        "GreedyStringTiling_3"
//        ));

    // Longest common subsequence comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "LongestCommonSubsequenceComparator"
        ));

    // Longest common subsequence comparator, "Norm" variant
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceNormComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "LongestCommonSubsequenceNormComparator"
        ));

    // Longest common substring comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubstringComparator.class.getName()),
        Document.class.getName(),     
        false,
        "content/string",
        "LongestCommonSubstringComparator"
        ));

    // N-Grams
    for (int i = 1; i <= 5; i++)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramContainmentResource.class,
                WordNGramContainmentResource.PARAM_N, new Integer(i).toString()),
          Token.class.getName(),
          false,
          "content/n-grams",
          "WordNGramContainmentMeasure_" + i
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramJaccardResource.class,
                WordNGramJaccardResource.PARAM_N, new Integer(i).toString()),
          Token.class.getName(),
          false,
          "content/n-grams",
          "WordNGramJaccardMeasure_" + i
          ));
    }
   
    // ESA: three VectorIndexSourceRelatednessResource entries, one per vector
    // index directory, each passed with the Lemma type name plus "/value" and
    // grouped under "content/esa".
    // NOTE(review): every model location below is an absolute path inside one
    // user's home directory (/home/danielb/...), so these entries only resolve
    // on that machine -- consider deriving them from a configurable base
    // directory.

    // Index directory "wp_eng_lem_nc_c"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, "/home/danielb/Projekte/DKPro/Resources/ESA/VectorIndexes/wp_eng_lem_nc_c"),
        Lemma.class.getName() + "/value",
        false,
        "content/esa",
        "ESA_WP"
        ));

    // Index directory "wiktionary_en"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, "/home/danielb/Projekte/DKPro/Resources/ESA/VectorIndexes/wiktionary_en"),
        Lemma.class.getName() + "/value",
        false,
        "content/esa",
        "ESA_WK"
        ));

    // Index directory "wordnet_eng_lem_nc_c"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, "/home/danielb/Projekte/DKPro/Resources/ESA/VectorIndexes/wordnet_eng_lem_nc_c"),
        Lemma.class.getName() + "/value",
        false,
        "content/esa",
        "ESA_WN"
        ));
   
    // Structure: a stopword n-gram containment measure with n fixed to "3",
    // reading its stopword list from the stamatatos file named below and
    // grouped under "structure".
    // NOTE(review): the stopword list location is an absolute path into one
    // developer's workspace; it will not exist on other machines -- confirm
    // whether a classpath or configurable location should be used instead.

    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              StopwordNGramContainmentMeasureResource.class,
              StopwordNGramContainmentMeasureResource.PARAM_N, "3",
              StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "/home/danielb/Projekte/Similarity/workspace/de.tudarmstadt.ukp.similarity-asl/de.tudarmstadt.ukp.similarity.dkpro.data-asl/src/main/resources/stopwords/stopwords-bnc-stamatatos.txt"),
        Lemma.class.getName() + "/value",
        false,
        "structure",
        "StopwordNGramContainmentMeasure_3_stamatatos"
        ));
   
    // Style
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              FunctionWordFrequenciesMeasureResource.class,
              FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "/home/danielb/Projekte/Similarity/workspace/de.tudarmstadt.ukp.similarity-asl/de.tudarmstadt.ukp.similarity.dkpro.data-asl/src/main/resources/stopwords/function-words-mosteller-wallace.txt"),
        Lemma.class.getName() + "/value",
        false,
View Full Code Here


//        "custom",
//        "MyTextSimilarityMeasure_3"
//        ));
   
    // String features: four entries grouped under "string", each passed with
    // the Document type name.
    // NOTE(review): the FeatureConfig arguments look like (resource, annotation
    // type name, stopword-filter flag, group, measure name) -- confirm against
    // the FeatureConfig constructor.

    // Greedy String Tiling with a minimum match length of "3"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              GreedyStringTilingMeasureResource.class,
              GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
        Document.class.getName(),
        false,
        "string",
        "GreedyStringTiling_3"
        ));

    // Longest common subsequence comparator, wrapped in "text" mode
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
        Document.class.getName(),
        false,
        "string",
        "LongestCommonSubsequenceComparator"
        ));

    // Longest common subsequence comparator, "Norm" variant
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceNormComparator.class.getName()),
        Document.class.getName(),
        false,
        "string",
        "LongestCommonSubsequenceNormComparator"
        ));

    // Longest common substring comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubstringComparator.class.getName()),
        Document.class.getName(),     
        false,
        "string",
        "LongestCommonSubstringComparator"
        ));
   
    ngrams_n = new int[] { 2, 3, 4 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
              CharacterNGramResource.class,
              CharacterNGramResource.PARAM_N, new Integer(n).toString(),
              CharacterNGramResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/character-ngrams-idf/" + mode.toString().toLowerCase() + "/" + n + "/" + dataset.toString() + ".txt"),
          Document.class.getName(),
          false,
          "n-grams",
          "CharacterNGramMeasure_" + n
          ));
    }
   
    ngrams_n = new int[] { 1, 2 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramContainmentResource.class,
                WordNGramContainmentResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          true,
          "n-grams",
          "WordNGramContainmentMeasure_" + n + "_stopword-filtered"
          ));
    }
   
    ngrams_n = new int[] { 1, 3, 4 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramJaccardResource.class,
                WordNGramJaccardResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          false,
          "n-grams",
          "WordNGramJaccardMeasure_" + n
          ));     
    }
   
    ngrams_n = new int[] { 2, 4 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramJaccardResource.class,
                WordNGramJaccardResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          true,
View Full Code Here

  {
    // Define the features: every FeatureConfig appended to this list pairs an
    // external resource description with the settings passed alongside it.
    // NOTE(review): the five FeatureConfig arguments look like (resource,
    // annotation type name or feature path, stopword-filter flag, group,
    // measure name) -- confirm against the FeatureConfig constructor.
    List<FeatureConfig> configs = new ArrayList<FeatureConfig>();

    // String features: each entry wraps one comparator class in a
    // SimpleTextSimilarityResource with PARAM_MODE "text", passes the Document
    // type name, and is grouped under "content/string".

    // Jaro comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, JaroSecondStringComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "Jaro"
        ));

    // Jaro-Winkler comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, JaroWinklerSecondStringComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "JaroWinkler"
        ));

    // Monge-Elkan comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, MongeElkanSecondStringComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "MongeElkan"
        ));

    // Levenshtein comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LevenshteinComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "Levenshtein"
        ));

    // Disabled: the Greedy String Tiling measure (minimum match length 3) is
    // not registered in this configuration.
//    configs.add(new FeatureConfig(
//        createExternalResourceDescription(
//              GreedyStringTilingMeasureResource.class,
//              GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
//        Document.class.getName(),
//        false,
//        "content/string",
//        "GreedyStringTiling_3"
//        ));

    // Longest common subsequence comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "LongestCommonSubsequenceComparator"
        ));

    // Longest common subsequence comparator, "Norm" variant
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceNormComparator.class.getName()),
        Document.class.getName(),
        false,
        "content/string",
        "LongestCommonSubsequenceNormComparator"
        ));

    // Longest common substring comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubstringComparator.class.getName()),
        Document.class.getName(),     
        false,
        "content/string",
        "LongestCommonSubstringComparator"
        ));

    // N-Grams
    for (int i = 1; i <= 5; i++)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramContainmentResource.class,
                WordNGramContainmentResource.PARAM_N, new Integer(i).toString()),
          Token.class.getName(),
          false,
          "content/n-grams",
          "WordNGramContainmentMeasure_" + i
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramJaccardResource.class,
                WordNGramJaccardResource.PARAM_N, new Integer(i).toString()),
          Token.class.getName(),
          false,
          "content/n-grams",
          "WordNGramJaccardMeasure_" + i
          ));
    }
   
    // ESA: three VectorIndexSourceRelatednessResource entries, one per vector
    // index directory, each passed with the Lemma type name plus "/value" and
    // grouped under "content/esa".
    // NOTE(review): every model location below is an absolute path inside one
    // user's home directory (/home/danielb/...), so these entries only resolve
    // on that machine -- consider deriving them from a configurable base
    // directory.

    // Index directory "wp_eng_lem_nc_c"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, "/home/danielb/Projekte/DKPro/Resources/ESA/VectorIndexes/wp_eng_lem_nc_c"),
        Lemma.class.getName() + "/value",
        false,
        "content/esa",
        "ESA_WP"
        ));

    // Index directory "wiktionary_en"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, "/home/danielb/Projekte/DKPro/Resources/ESA/VectorIndexes/wiktionary_en"),
        Lemma.class.getName() + "/value",
        false,
        "content/esa",
        "ESA_WK"
        ));

    // Index directory "wordnet_eng_lem_nc_c"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, "/home/danielb/Projekte/DKPro/Resources/ESA/VectorIndexes/wordnet_eng_lem_nc_c"),
        Lemma.class.getName() + "/value",
        false,
        "content/esa",
        "ESA_WN"
        ));
   
    // Structure: a stopword n-gram containment measure with n fixed to "3",
    // reading its stopword list from the stamatatos file named below and
    // grouped under "structure".
    // NOTE(review): the stopword list location is an absolute path into one
    // developer's workspace; it will not exist on other machines -- confirm
    // whether a classpath or configurable location should be used instead.

    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              StopwordNGramContainmentMeasureResource.class,
              StopwordNGramContainmentMeasureResource.PARAM_N, "3",
              StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "/home/danielb/Projekte/Similarity/workspace/de.tudarmstadt.ukp.similarity-asl/de.tudarmstadt.ukp.similarity.dkpro.data-asl/src/main/resources/stopwords/stopwords-bnc-stamatatos.txt"),
        Lemma.class.getName() + "/value",
        false,
        "structure",
        "StopwordNGramContainmentMeasure_3_stamatatos"
        ));
   
    // Style
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              FunctionWordFrequenciesMeasureResource.class,
              FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "/home/danielb/Projekte/Similarity/workspace/de.tudarmstadt.ukp.similarity-asl/de.tudarmstadt.ukp.similarity.dkpro.data-asl/src/main/resources/stopwords/function-words-mosteller-wallace.txt"),
        Lemma.class.getName() + "/value",
        false,
View Full Code Here

        }
   
    // Invoke the word IDF computation for the current mode and dataset before
    // any features are registered.
    // NOTE(review): presumably this writes the word-idf files that IDF-based
    // measures read later -- confirm in WordIdfValuesGenerator.
    WordIdfValuesGenerator.computeIdfScores(mode, dataset);

    // String features: four entries grouped under "string", each passed with
    // the Document type name.

    // Greedy String Tiling with a minimum match length of "3"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              GreedyStringTilingMeasureResource.class,
              GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
        Document.class.getName(),
        false,
        "string",
        "GreedyStringTiling_3"
        ));

    // Longest common subsequence comparator, wrapped in "text" mode
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
        Document.class.getName(),
        false,
        "string",
        "LongestCommonSubsequenceComparator"
        ));

    // Longest common subsequence comparator, "Norm" variant
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceNormComparator.class.getName()),
        Document.class.getName(),
        false,
        "string",
        "LongestCommonSubsequenceNormComparator"
        ));

    // Longest common substring comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubstringComparator.class.getName()),
        Document.class.getName(),     
        false,
        "string",
        "LongestCommonSubstringComparator"
        ));
   
    ngrams_n = new int[] { 2, 3, 4 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
              CharacterNGramResource.class,
              CharacterNGramResource.PARAM_N, new Integer(n).toString(),
              CharacterNGramResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/character-ngrams-idf/" + mode.toString().toLowerCase() + "/" + n + "/" + dataset.toString() + ".txt"),
          Document.class.getName(),
          false,
          "n-grams",
          "CharacterNGramMeasure_" + n
          ));
    }
   
    ngrams_n = new int[] { 1, 2 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramContainmentResource.class,
                WordNGramContainmentResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          true,
          "n-grams",
          "WordNGramContainmentMeasure_" + n + "_stopword-filtered"
          ));
    }
   
    ngrams_n = new int[] { 1, 3, 4 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramJaccardResource.class,
                WordNGramJaccardResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          false,
          "n-grams",
          "WordNGramJaccardMeasure_" + n
          ));     
    }
   
    ngrams_n = new int[] { 2, 4 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramJaccardResource.class,
                WordNGramJaccardResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          true,
          "n-grams",
          "WordNGramJaccardMeasure_" + n + "_stopword-filtered"
          ));     
    }
   
    /* TODO: If you plan to use the following measures, make sure that you have the
     * necessary resources installed.
     * Details on obtaining and installing them can be found here:
     * http://code.google.com/p/dkpro-similarity-asl/wiki/SettingUpTheResources
     */

    // Resnik word similarity measure, aggregated according to Mihalcea et al. (2006)
    // The MCS06 aggregate nests a Resnik resource (resource name "wordnet",
    // language "en") as its term similarity resource and reads IDF values
    // from a file under UTILS_DIR/word-idf/<lower-cased mode>/<dataset>.txt.
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              MCS06AggregateResource.class,
              MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
                  ResnikRelatednessResource.class,
                  ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",
                  ResnikRelatednessResource.PARAM_RESOURCE_LANGUAGE, "en"
                  ),
              MCS06AggregateResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/word-idf/" + mode.toString().toLowerCase() + "/" + dataset.toString() + ".txt"),
        Lemma.class.getName() + "/value",
        false,
        "word-sim",
        "MCS06_Resnik_WordNet"
        ));

    // Lexical Substitution System wrapper for
    // Resnik word similarity measure, aggregated according to Mihalcea et al. (2006)
    // Same nested MCS06/Resnik description as above, wrapped once more in a
    // TWSISubstituteWrapperResource. Note that this call passes only three
    // arguments (resource, group, measure name), unlike the five-argument
    // calls used for the other features.
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            TWSISubstituteWrapperResource.class,
            TWSISubstituteWrapperResource.PARAM_TEXT_SIMILARITY_RESOURCE, createExternalResourceDescription(
                  MCS06AggregateResource.class,
                  MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
                      ResnikRelatednessResource.class,
                      ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",
                      ResnikRelatednessResource.PARAM_RESOURCE_LANGUAGE, "en"
                      ),
                  MCS06AggregateResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/word-idf/" + mode.toString().toLowerCase() + "/" + dataset.toString() + ".txt")),
        "word-sim",
        "TWSI_MCS06_Resnik_WordNet"
        ));
   
    // Bing SMT wrapper for
    // Resnik word similarity measure, aggregated according to Mihalcea et al. (2006)
    // In our original system, we used MOSES for machine translation.
//    configs.add(new FeatureConfig(
//        createExternalResourceDescription(
//            BingSMTWrapperResource.class,
//            BingSMTWrapperResource.PARAM_TEXT_SIMILARITY_RESOURCE, createExternalResourceDescription(
//                  MCS06AggregateResource.class,
//                  MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
//                      ResnikRelatednessResource.class,
//                      ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",
//                      ResnikRelatednessResource.PARAM_RESOURCE_LANGUAGE, "en"
//                      ),
//                  MCS06AggregateResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/word-idf/" + mode.toString().toLowerCase() + "/" + dataset.toString() + ".txt"
//                  ),
//            BingSMTWrapperResource.PARAM_ORIGINAL_LANGUAGE, Language.EN.toString(),
//            BingSMTWrapperResource.PARAM_BRIDGE_LANGUAGE, Language.ES.toString()),
//        "word-sim",
//        "BingSMT_MCS06_Resnik_WordNet"
//        ));
       
    // Explicit Semantic Analysis
    // The model location is resolved at runtime: the absolute path of the
    // DkproContext workspace with "/ESA/VectorIndexes/wordnet_eng_lem_nc_c"
    // appended. The entry is passed the Lemma type name plus "/value" and is
    // grouped under "esa".
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DkproContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wordnet_eng_lem_nc_c"),
        Lemma.class.getName() + "/value",
        false,
        "esa",
        "ESA_WordNet"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DkproContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wiktionary_en"),
        Lemma.class.getName() + "/value",
        false,
View Full Code Here

    // Invoke the word IDF computation for the current dataset before any
    // features are registered.
    // NOTE(review): presumably this writes the word-idf files that IDF-based
    // measures read later -- confirm in WordIdfValuesGenerator.
    WordIdfValuesGenerator.computeIdfScores(dataset);

    // ** FEATURES **

    // String features: five entries grouped under "string". The first four
    // are passed the Document type name; the cosine entry is passed the
    // Lemma type name plus "/value".

    // Greedy String Tiling with a minimum match length of "3"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              GreedyStringTilingMeasureResource.class,
              GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
        Document.class.getName(),
        false,
        "string",
        "GreedyStringTiling_3"
        ));

    // Longest common subsequence comparator, wrapped in "text" mode
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
        Document.class.getName(),
        false,
        "string",
        "LongestCommonSubsequenceComparator"
        ));

    // Longest common subsequence comparator, "Norm" variant
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceNormComparator.class.getName()),
        Document.class.getName(),
        false,
        "string",
        "LongestCommonSubsequenceNormComparator"
        ));

    // Longest common substring comparator
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              SimpleTextSimilarityResource.class,
              SimpleTextSimilarityResource.PARAM_MODE, "text",
              SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubstringComparator.class.getName()),
        Document.class.getName(),     
        false,
        "string",
        "LongestCommonSubstringComparator"
        ));

    // Cosine similarity resource, created without any extra parameters
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              CosineSimilarityResource.class),
        Lemma.class.getName() + "/value",
        false,
        "string",
        "CosineSimilarity"
        ));
   
    // n-gram models
    ngrams_n = new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
              CharacterNGramResource.class,
              CharacterNGramResource.PARAM_N, new Integer(n).toString(),
              CharacterNGramResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/character-ngrams-idf/" + n + "/" + dataset.toString() + ".txt"),
          Document.class.getName(),
          false,
          "n-grams",
          "CharacterNGramMeasure_" + n
          ));
    }
   
    ngrams_n = new int[] { 1, 2, 3, 4, 5 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramContainmentResource.class,
                WordNGramContainmentResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          false,
          "n-grams",
          "WordNGramContainmentMeasure_" + n
          ));
    }
   
    ngrams_n = new int[] { 1, 2, 3, 4, 5 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramContainmentResource.class,
                WordNGramContainmentResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          true,
          "n-grams",
          "WordNGramContainmentMeasure_" + n + "_stopword-filtered"
          ));
    }
   
    ngrams_n = new int[] { 1, 2, 3, 4, 5 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramJaccardResource.class,
                WordNGramJaccardResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          false,
          "n-grams",
          "WordNGramJaccardMeasure_" + n
          ));     
    }
   
    ngrams_n = new int[] { 1, 2, 3, 4, 5 };
    for (int n : ngrams_n)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                WordNGramJaccardResource.class,
                WordNGramJaccardResource.PARAM_N, new Integer(n).toString()),
          Token.class.getName(),
          true,
          "n-grams",
          "WordNGramJaccardMeasure_" + n + "_stopword-filtered"
          ));     
    }
   
    /* TODO: If you plan to use the following measures, make sure that you have the
     * necessary resources installed.
     * Details on obtaining and installing them can be found here:
     * http://code.google.com/p/dkpro-similarity-asl/wiki/SettingUpTheResources
     */

    // Resnik word similarity measure, aggregated according to Mihalcea et al. (2006)
    // The MCS06 aggregate nests a Resnik resource (resource name "wordnet",
    // language "en") as its term similarity resource and reads IDF values
    // from a file under UTILS_DIR/word-idf/<dataset>.txt.
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              MCS06AggregateResource.class,
              MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
                  ResnikRelatednessResource.class,
                  ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",
                  ResnikRelatednessResource.PARAM_RESOURCE_LANGUAGE, "en"
                  ),
              MCS06AggregateResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/word-idf/" + dataset.toString() + ".txt"),
        Lemma.class.getName() + "/value",
        false,
        "word-sim",
        "MCS06_Resnik_WordNet"
        ));

    // Lexical Substitution System wrapper for
    // Resnik word similarity measure, aggregated according to Mihalcea et al. (2006)
    // Same nested MCS06/Resnik description as above, wrapped once more in a
    // TWSISubstituteWrapperResource. Note that this call passes only three
    // arguments (resource, group, measure name), unlike the five-argument
    // calls used for the other features.
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            TWSISubstituteWrapperResource.class,
            TWSISubstituteWrapperResource.PARAM_TEXT_SIMILARITY_RESOURCE, createExternalResourceDescription(
                  MCS06AggregateResource.class,
                  MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
                      ResnikRelatednessResource.class,
                      ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",
                      ResnikRelatednessResource.PARAM_RESOURCE_LANGUAGE, "en"
                      ),
                  MCS06AggregateResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/word-idf/" + dataset.toString() + ".txt")),
        "word-sim",
        "TWSI_MCS06_Resnik_WordNet"
        ));
       
    // Explicit Semantic Analysis
    // Three VectorIndexSourceRelatednessResource entries grouped under "esa".
    // Each model location is resolved at runtime from the absolute path of the
    // DkproContext workspace plus an "/ESA/VectorIndexes/<index>" suffix, and
    // each entry is passed the Lemma type name plus "/value".

    // Index directory "wordnet_eng_lem_nc_c"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DkproContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wordnet_eng_lem_nc_c"),
        Lemma.class.getName() + "/value",
        false,
        "esa",
        "ESA_WordNet"
        ));

    // Index directory "wiktionary_en"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DkproContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wiktionary_en"),
        Lemma.class.getName() + "/value",
        false,
        "esa",
        "ESA_Wiktionary"
        ));

    // Index directory "wp_eng_lem_nc_c"
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              VectorIndexSourceRelatednessResource.class,
              VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DkproContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wp_eng_lem_nc_c"),
        Lemma.class.getName() + "/value",
        false,
        "esa",
        "ESA_Wikipedia"
        ));
   
    // LSA
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
              LatentSemanticAnalysisResource.class,
              LatentSemanticAnalysisResource.PARAM_INPUT_DIR, UTILS_DIR + "/plaintexts/" + dataset.toString()),
        Token.class.getName(),
        false,
        "lsa",
        "LSA"
        ));
   
    // ** Structure **
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            TokenPairDistanceResource.class),
        Token.class.getName(),
        false,
        "structure",
        "TokenPairDistanceMeasure"
        ));
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            TokenPairOrderingResource.class),
        Token.class.getName(),
        false,
        "structure",
        "TokenPairOrderingMeasure"
        ));
   
    for (int n = 2; n <= 7; n++) {
            configs.add(new FeatureConfig(
          createExternalResourceDescription(
              StopwordNGramContainmentMeasureResource.class,
              StopwordNGramContainmentMeasureResource.PARAM_N, new Integer(n).toString(),
              StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/stopwords_english_punctuation.txt"),
          Token.class.getName(),
          false,
          "structure",
          "StopwordNGramContainmentMeasure_" + n + "_english-punctuation"
          ));
        }
   
    for (int n = 2; n <= 7; n++) {
            configs.add(new FeatureConfig(
          createExternalResourceDescription(
              StopwordNGramContainmentMeasureResource.class,
              StopwordNGramContainmentMeasureResource.PARAM_N, new Integer(n).toString(),
              StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/function-words-mosteller-wallace.txt"),
          Token.class.getName(),
          false,
          "structure",
          "StopwordNGramContainmentMeasure_" + n + "_mosteller-wallace"
          ));
        }
   
    for (int n = 2; n <= 7; n++) {
            configs.add(new FeatureConfig(
          createExternalResourceDescription(
              StopwordNGramContainmentMeasureResource.class,
              StopwordNGramContainmentMeasureResource.PARAM_N, new Integer(n).toString(),
              StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
          Token.class.getName(),
          false,
          "structure",
          "StopwordNGramContainmentMeasure_" + n + "_stamatatos"
          ));
        }
   
    for (int n = 1; n <= 7; n++)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
              PosNGramJaccardResource.class,
              PosNGramJaccardResource.PARAM_N, new Integer(n).toString()),
          POS.class.getName(),
          false,
          "structure",
          "PosNGramJaccardMeasure_" + n
          ));
    }
     
    for (int n = 1; n <= 7; n++)
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
              PosNGramContainmentResource.class,
              PosNGramContainmentResource.PARAM_N, new Integer(n).toString()),
          POS.class.getName(),
          false,
          "structure",
          "PosNGramContainmentMeasure_" + n
          ));
    }
   
    // ** Style **

    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            MTLDResource.class),
        null,
        false,
        "style",
        "MTLDComparator"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            TypeTokenRatioResource.class),
        null,
        false,
        "style",
        "TypeTokenRatioComparator"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            AvgCharactersPerTokenResource.class),
        null,
        false,
        "style",
        "AvgCharactersPerTokenComparator"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            AvgTokensPerSentenceResource.class),
        null,
        false,
        "style",
        "AvgTokensPerSentenceComparator"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            SentenceRatioResource.class),
        null,
        false,
        "style",
        "SentenceRatioComparator"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            TokenRatioResource.class),
        null,
        false,
        "style",
        "TokenRatioComparator"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            FunctionWordFrequenciesMeasureResource.class,
            FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "classpath:/stopwords/stopwords_english_punctuation.txt"),
        Token.class.getName(),
        false,
        "style",
        "FunctionWordFrequenciesMeasure_english-punctuation"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            FunctionWordFrequenciesMeasureResource.class,
            FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "classpath:/stopwords/function-words-mosteller-wallace.txt"),
        Token.class.getName(),
        false,
        "style",
        "FunctionWordFrequenciesMeasure_mosteller-wallace"
        ));
   
    configs.add(new FeatureConfig(
        createExternalResourceDescription(
            FunctionWordFrequenciesMeasureResource.class,
            FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
        Token.class.getName(),
        false,
View Full Code Here

TOP

Related Classes of dkpro.similarity.ml.FeatureConfig

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc., owned by Oracle Inc. Contact: coftware#gmail.com.