Package de.tudarmstadt.ukp.similarity.ml

Examples of de.tudarmstadt.ukp.similarity.ml.FeatureConfig


   
    WordIdfValuesGenerator.computeIdfScores(dataset);
   
    if (dataset.equals(Dataset.WikipediaRewriteCorpus))
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                SimpleTextSimilarityResource.class,
                SimpleTextSimilarityResource.PARAM_MODE, "text",
                SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
          Document.class.getName(),
          false,
          "string",
          "LongestCommonSubsequenceComparator"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                StopwordNGramContainmentMeasureResource.class,
                StopwordNGramContainmentMeasureResource.PARAM_N, "10",
                StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
          Token.class.getName(),
          false,
          "structure",
          "StopwordNGramContainmentMeasure_10"
          ));
     
      ngrams_n = new int[] { 5 };
      for (int n : ngrams_n)
      {
        configs.add(new FeatureConfig(
            createExternalResourceDescription(
                CharacterNGramResource.class,
                CharacterNGramResource.PARAM_N, new Integer(n).toString(),
                CharacterNGramResource.PARAM_IDF_VALUES_FILE, UTILS_DIR + "/character-ngrams-idf/" + n + "/" + dataset.toString() + ".txt"),
            Document.class.getName(),
            false,
            "n-grams",
            "CharacterNGramMeasure_" + n
            ));
      }
    }
    else if (dataset.equals(Dataset.MeterCorpus))
    {
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                GreedyStringTilingMeasureResource.class,
                GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
          Document.class.getName(),
          false,
          "string",
          "GreedyStringTiling_3"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                StopwordNGramContainmentMeasureResource.class,
                StopwordNGramContainmentMeasureResource.PARAM_N, "12",
                StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
          Token.class.getName(),
          false,
          "structure",
          "StopwordNGramContainmentMeasure_12"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                MTLDResource.class),
          Document.class.getName(),
          false,
          "style",
          "SequentialTTR"
          ));
    }
    else if (dataset.equals(Dataset.WebisCrowdParaphraseCorpus))
    {
      // Content
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                GreedyStringTilingMeasureResource.class,
                GreedyStringTilingMeasureResource.PARAM_MIN_MATCH_LENGTH, "3"),
          Document.class.getName(),
          false,
          "string",
          "GreedyStringTiling_3"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                SimpleTextSimilarityResource.class,
                SimpleTextSimilarityResource.PARAM_MODE, "text",
                SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceComparator.class.getName()),
          Document.class.getName(),
          false,
          "string",
          "LongestCommonSubsequenceComparator"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                SimpleTextSimilarityResource.class,
                SimpleTextSimilarityResource.PARAM_MODE, "text",
                SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubsequenceNormComparator.class.getName()),
          Document.class.getName(),
          false,
          "string",
          "LongestCommonSubsequenceNormComparator"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                SimpleTextSimilarityResource.class,
                SimpleTextSimilarityResource.PARAM_MODE, "text",
                SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, LongestCommonSubstringComparator.class.getName()),
          Document.class.getName(),     
          false,
          "string",
          "LongestCommonSubstringComparator"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                SimpleTextSimilarityResource.class,
                SimpleTextSimilarityResource.PARAM_MODE, "text",
                SimpleTextSimilarityResource.PARAM_TEXT_SIMILARITY_MEASURE, JaroSecondStringComparator.class.getName()),
          Document.class.getName(),     
          false,
          "string",
          "Jaro"
          ));
     
      ngrams_n = new int[] { 6, 14, 15 };
      for (int n : ngrams_n)
      {
        configs.add(new FeatureConfig(
            createExternalResourceDescription(
                  WordNGramJaccardResource.class,
                  WordNGramJaccardResource.PARAM_N, new Integer(n).toString()),
            Token.class.getName(),
            false,
            "n-grams",
            "WordNGramJaccardMeasure_" + n
            ))
      }
     
      // Structure
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                TokenPairOrderingResource.class),
          Lemma.class.getName() + "/value",
          false,
          "structure",
          "TokenPairOrdering"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                StopwordNGramContainmentMeasureResource.class,
                StopwordNGramContainmentMeasureResource.PARAM_N, "6",
                StopwordNGramContainmentMeasureResource.PARAM_STOPWORD_LIST_LOCATION, "classpath:/stopwords/stopwords-bnc-stamatatos.txt"),
          Token.class.getName(),
          false,
          "structure",
          "StopwordNGramContainmentMeasure_6"
          ));
     
      // Style
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                FunctionWordFrequenciesMeasureResource.class,
                FunctionWordFrequenciesMeasureResource.PARAM_FUNCTION_WORD_LIST_LOCATION, "classpath:/stopwords/function-words-mosteller-wallace.txt"),
          Document.class.getName(),
          false,
          "style",
          "FunctionWordFrequencies"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                MTLDResource.class),
          Document.class.getName(),
          false,
          "style",
          "SequentialTTR"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                TokenRatioResource.class),
          Document.class.getName(),
          false,
          "style",
          "TokenRatio"
          ));
     
      // Content, again
       
      /* TODO: If you plan to use the following measures, make sure that you have the
       * necessary resources installed.
       * Details on obtaining and installing them can be found here:
       * http://code.google.com/p/dkpro-similarity-asl/wiki/SettingUpTheResources
       */
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                VectorIndexSourceRelatednessResource.class,
                VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DKProContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wordnet_eng_lem_nc_c"),
          Lemma.class.getName() + "/value",
          false,
          "esa",
          "ESA_WordNet"
          ));
     
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                VectorIndexSourceRelatednessResource.class,
                VectorIndexSourceRelatednessResource.PARAM_MODEL_LOCATION, DKProContext.getContext().getWorkspace().getAbsolutePath() + "/ESA/VectorIndexes/wordnet_eng_lem_nc_c"),
          Lemma.class.getName() + "/value",
          true,
          "esa",
          "ESA_WordNet_stopword-filtered"
          ));
     
      // Resnik word similarity measure, aggregated according to Mihalcea et al. (2006)
      configs.add(new FeatureConfig(
          createExternalResourceDescription(
                MCS06AggregateResource.class,
                MCS06AggregateResource.PARAM_TERM_SIMILARITY_RESOURCE, createExternalResourceDescription(
                    ResnikRelatednessResource.class,
                    ResnikRelatednessResource.PARAM_RESOURCE_NAME, "wordnet",
View Full Code Here

TOP

Related Classes of de.tudarmstadt.ukp.similarity.ml.FeatureConfig

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.