Package org.apache.ctakes.ytex.kernel.metric

Source Code of org.apache.ctakes.ytex.kernel.metric.ConceptSimilarityServiceImpl

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.ytex.kernel.metric;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.ctakes.ytex.kernel.ImputedFeatureEvaluator;
import org.apache.ctakes.ytex.kernel.InfoContentEvaluator;
import org.apache.ctakes.ytex.kernel.IntrinsicInfoContentEvaluator;
import org.apache.ctakes.ytex.kernel.OrderedPair;
import org.apache.ctakes.ytex.kernel.SimSvcContextHolder;
import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao;
import org.apache.ctakes.ytex.kernel.dao.ConceptDao;
import org.apache.ctakes.ytex.kernel.model.ConcRel;
import org.apache.ctakes.ytex.kernel.model.ConceptGraph;
import org.apache.ctakes.ytex.kernel.model.FeatureRank;
import org.apache.ctakes.ytex.kernel.pagerank.PageRankService;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.TransactionStatus;
import org.springframework.transaction.support.TransactionCallback;
import org.springframework.transaction.support.TransactionTemplate;

import com.google.common.collect.ImmutableMap;

/**
* compute concept similarity
*
* @author vijay
*
*/
public class ConceptSimilarityServiceImpl implements ConceptSimilarityService {
  private static final Log log = LogFactory
      .getLog(ConceptSimilarityServiceImpl.class);

  private static String formatPaths(List<LCSPath> lcsPaths) {
    StringBuilder b = new StringBuilder();
    Iterator<LCSPath> lcsPathIter = lcsPaths.iterator();
    while (lcsPathIter.hasNext()) {
      LCSPath lcsPath = lcsPathIter.next();
      String lcs = lcsPath.getLcs();
      b.append(lcs);
      b.append("=");
      b.append(lcsPath.toString());
      if (lcsPathIter.hasNext())
        b.append("|");
    }
    return b.toString();
  }

  @SuppressWarnings("static-access")
  public static void main(String args[]) throws IOException {
    Options options = new Options();
    options.addOption(OptionBuilder
        .withArgName("concepts")
        .hasArg()
        .withDescription(
            "concept pairs or a file containing concept pairs.  To specify pairs on command line, separate concepts by comma, concept pairs by semicolon.  For file, separate concepts by comma or tab, each concept pair on a new line.")
        .isRequired(true).create("concepts"));
    options.addOption(OptionBuilder
        .withArgName("metrics")
        .hasArg()
        .withDescription(
            "comma-separated list of metrics.  Valid metrics: "
                + Arrays.asList(SimilarityMetricEnum.values()))
        .isRequired(true).create("metrics"));
    options.addOption(OptionBuilder
        .withArgName("out")
        .hasArg()
        .withDescription(
            "file to write oputput to.  if not specified, output sent to stdout.")
        .create("out"));
    options.addOption(OptionBuilder.withArgName("lcs")
        .withDescription("output lcs and path for each concept pair")
        .create("lcs"));
    try {
      CommandLineParser parser = new GnuParser();
      CommandLine line = parser.parse(options, args);
      String concepts = line.getOptionValue("concepts");
      String metrics = line.getOptionValue("metrics");
      String out = line.getOptionValue("out");
      boolean lcs = line.hasOption("lcs");
      PrintStream os = null;
      try {
        if (out != null) {
          os = new PrintStream(new BufferedOutputStream(
              new FileOutputStream(out)));
        } else {
          os = System.out;
        }
        List<ConceptPair> conceptPairs = parseConcepts(concepts);
        List<SimilarityMetricEnum> metricList = parseMetrics(metrics);
        ConceptSimilarityService simSvc = SimSvcContextHolder
            .getApplicationContext().getBean(
                ConceptSimilarityService.class);
        List<SimilarityInfo> simInfos = lcs ? new ArrayList<SimilarityInfo>(
            conceptPairs.size()) : null;
        List<ConceptPairSimilarity> conceptSimMap = simSvc.similarity(
            conceptPairs, metricList, null, lcs);
        printSimilarities(conceptPairs, conceptSimMap, metricList,
            simInfos, lcs, os);
        // try {
        // Thread.sleep(60*1000);
        // } catch (InterruptedException e) {
        // e.printStackTrace();
        // }
      } finally {
        if (out != null) {
          try {
            os.close();
          } catch (Exception e) {
          }
        }
      }
    } catch (ParseException pe) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(
          "java " + ConceptSimilarityServiceImpl.class.getName()
              + " get concept similiarity", options);
    }
  }

  private static List<ConceptPair> parseConcepts(String concepts)
      throws IOException {
    BufferedReader r = null;
    try {
      List<ConceptPair> conceptPairs = new ArrayList<ConceptPair>();
      File f = new File(concepts);
      if (f.exists()) {
        r = new BufferedReader(new FileReader(f));
      } else {
        r = new BufferedReader(new StringReader(concepts));
      }
      String line = null;
      while ((line = r.readLine()) != null) {
        // for command line, split pairs by semicolon
        String lines[] = line.split(";");
        for (String subline : lines) {
          String pair[] = subline.split(",|\\t");
          if (pair.length != 2) {
            System.err.println("cannot parse concept pair: "
                + subline);
          } else {
            conceptPairs.add(new ConceptPair(pair[0], pair[1]));
          }
        }
      }
      return conceptPairs;
    } finally {
      if (r != null)
        r.close();
    }
  }

  private static List<SimilarityMetricEnum> parseMetrics(String metrics) {
    String ms[] = metrics.split(",");
    List<SimilarityMetricEnum> metricSet = new ArrayList<SimilarityMetricEnum>();
    for (String metric : ms) {
      SimilarityMetricEnum m = SimilarityMetricEnum.valueOf(metric);
      if (m == null)
        System.err.println("invalid metric: " + ms);
      else
        metricSet.add(m);
    }
    return metricSet;
  }

  private static void printSimilarities(List<ConceptPair> conceptPairs,
      List<ConceptPairSimilarity> conceptSimList,
      List<SimilarityMetricEnum> metricList,
      List<SimilarityInfo> simInfos, boolean lcs, PrintStream os) {
    // print header
    os.print("Concept 1\tConcept 2");
    for (SimilarityMetricEnum metric : metricList) {
      os.print("\t");
      os.print(metric);
    }
    if (lcs) {
      os.print("\tlcs(s)\tcorpus lcs\tintrinsic lcs\tpaths");
    }
    os.println();
    // print content
    for (ConceptPairSimilarity csim : conceptSimList) {
      ConceptPair p = csim.getConceptPair();
      os.print(p.getConcept1());
      os.print("\t");
      os.print(p.getConcept2());
      for (Double sim : csim.getSimilarities()) {
        os.print("\t");
        if (sim != null)
          os.print(String.format("%6f", sim));
        else
          os.print(0d);
      }
      if (lcs) {
        SimilarityInfo simInfo = csim.getSimilarityInfo();
        os.print("\t");
        Iterator<String> lcsIter = simInfo.getLcses().iterator();
        while (lcsIter.hasNext()) {
          os.print(lcsIter.next());
          if (lcsIter.hasNext())
            os.print('|');
        }
        os.print("\t");
        os.print(simInfo.getCorpusLcs() == null ? "" : simInfo
            .getCorpusLcs());
        os.print("\t");
        os.print(simInfo.getIntrinsicLcs() == null ? "" : simInfo
            .getIntrinsicLcs());
        os.print("\t");
        os.print(formatPaths(simInfo.getLcsPaths()));
      }
      os.println();
    }
  }

  /** ehcache manager; in the visible code it is only stored and exposed by its accessors. */
  private CacheManager cacheManager;

  /** concept graph loaded by init() via conceptDao; stays null if no graph is found. */
  private ConceptGraph cg = null;

  /** dao used to load info content, cui-tui pairs and feature ranks. */
  private ClassifierEvaluationDao classifierEvaluationDao;

  /** dao used by init() to load the concept graph. */
  private ConceptDao conceptDao;
  /** name of the concept graph to load; also passed to the dao queries. */
  private String conceptGraphName;

  /** concept set name passed to the dao queries. */
  private String conceptSetName;

  // /**
  // * information concept cache
  // */
  // private Map<String, Double> corpusICMap = null;

  /** corpus name passed to the dao queries. */
  private String corpusName;

  /**
   * concept id to tui bitset, built by initCuiTuiMapFromCorpus(). Bit
   * positions presumably correspond to indices of tuiList - confirm against
   * tuiListToBitset.
   */
  private Map<String, BitSet> cuiTuiMap;

  // private Map<String, ConceptInfo> conceptInfoMap = null;
  // private ConceptInfo[] conceptInfoCache;

  /**
   * cache to hold lcs's. Keyed by the ordered pair of concept ids; the value
   * is an Object[] of {distance, set of lcs concept ids}, or null when
   * getLeastCommonConcept returned a negative distance.
   */
  private Cache lcsCache;
  /**
   * measure name prefix; loadConceptFilter() appends the imputed/propagated
   * suffixes to it. Defaults to the infogain measure name.
   */
  private String lcsImputedType = ImputedFeatureEvaluator.MeasureType.INFOGAIN
      .getName();

  /** page rank service handed to the PageRankMetric. */
  private PageRankService pageRankService;

  /**
   * if true, init() loads corpus info content and the cui-tui map up front;
   * if false, getIC() queries the corpus info content on demand.
   */
  private boolean preload = true;
  /** corpus info content by concept id; filled by initInfoContent(). */
  private Map<String, Double> corpusICMap;

  /** metric implementations by metric type; filled by initSimilarityMetricMap(). */
  private Map<SimilarityMetricEnum, SimilarityMetric> similarityMetricMap = null;
  /** transaction manager used by init() to run the preload in a new transaction. */
  private PlatformTransactionManager transactionManager;

  /** sorted tuis; the position in this list is the tui's bitset index. */
  private List<String> tuiList;

  private void addCuiTuiToMap(Map<String, Set<String>> cuiTuiMap,
      Map<String, String> tuiMap, String cui, String tui) {
    // get 'the' tui string
    if (tuiMap.containsKey(tui))
      tui = tuiMap.get(tui);
    else
      tuiMap.put(tui, tui);
    Set<String> tuis = cuiTuiMap.get(cui);
    if (tuis == null) {
      tuis = new HashSet<String>();
      cuiTuiMap.put(cui, tuis);
    }
    tuis.add(tui);
  }

  @Override
  public Object[] getBestLCS(Set<String> lcses, boolean intrinsicIC,
      Map<String, Double> conceptFilter) {
    Map<String, Double> lcsICMap = new HashMap<String, Double>(lcses.size());
    // if (isPreload()) {
    // look in conceptInfoMap for info content
    for (String lcs : lcses) {
      lcsICMap.put(lcs, getIC(lcs, intrinsicIC));
      // }
      // } else {
      // // load info content on demand
      // Map<String, FeatureRank> frMap = getICOnDemand(lcses,
      // intrinsicIC);
      // for (Map.Entry<String, FeatureRank> frMapEntry :
      // frMap.entrySet()) {
      // lcsICMap.put(frMapEntry.getKey(), frMapEntry.getValue()
      // .getEvaluation());
      // }
    }
    if (conceptFilter != null) {
      double currentBest = -1;
      Set<String> bestLcses = new HashSet<String>();
      for (String lcs : lcses) {
        if (conceptFilter.containsKey(lcs)) {
          double lcsEval = conceptFilter.get(lcs);
          if (currentBest == -1 || lcsEval > currentBest) {
            bestLcses.clear();
            bestLcses.add(lcs);
            currentBest = lcsEval;
          } else if (currentBest == lcsEval) {
            bestLcses.add(lcs);
          }
        }
      }
      if (currentBest < 0)
        currentBest = 0d;
      if (bestLcses.size() > 0) {
        return this.getBestLCS(bestLcses, lcsICMap);
      } else {
        // no lcses made the cut
        return null;
      }
    } else {
      // unfiltered - get the lowest ic
      return this.getBestLCS(lcses, lcsICMap);
    }
  }

  public Object[] getBestLCS(Set<String> lcses, Map<String, Double> icMap) {
    double ic = -1;
    String bestLCS = null;
    for (String lcs : lcses) {
      Double ictmp = icMap.get(lcs);
      if (ictmp != null && ic < ictmp.doubleValue()) {
        ic = ictmp;
        bestLCS = lcs;
      }
    }
    if (ic < 0)
      ic = 0d;
    return new Object[] { bestLCS, ic };
  }

  // /**
  // * return lin measure. optionally filter lin measure so that only concepts
  // * that have an lcs that is relevant to the classification task have a
  // * non-zero lin measure.
  // *
  // * relevant concepts are those whose evaluation wrt the label exceeds a
  // * threshold.
  // *
  // * @param concept1
  // * @param concept2
  // * @param label
  // * if not null, then filter lcses.
  // * @param lcsMinEvaluation
  // * if gt; 0, then filter lcses. this is the threshold.
  // * @return 0 - no lcs, or no lcs that meets the threshold.
  // */
  // @Override
  // public double filteredLin(String concept1, String concept2,
  // Map<String, Double> conceptFilter) {
  // double ic1 = getIC(concept1);
  // double ic2 = getIC(concept2);
  // // lin not defined if one of the concepts doesn't exist in the corpus
  // if (ic1 == 0 || ic2 == 0)
  // return 0;
  // double denom = getIC(concept1) + getIC(concept2);
  // if (denom != 0) {
  // ConcRel cr1 = cg.getConceptMap().get(concept1);
  // ConcRel cr2 = cg.getConceptMap().get(concept2);
  // if (cr1 != null && cr2 != null) {
  // Set<String> lcses = new HashSet<String>();
  // int dist = getLCSFromCache(cr1, cr2, lcses);
  // if (dist > 0) {
  // double ic = getBestIC(lcses, conceptFilter);
  // return 2 * ic / denom;
  // }
  // }
  // }
  // return 0;
  // }

  // /**
  // * get the information content for the concept with the highest evaluation
  // * greater than a specified threshold.
  // *
  // * If threshold 0, get the lowest IC of all the lcs's.
  // *
  // * @param lcses
  // * the least common subsumers of a pair of concepts
  // * @param label
  // * label against which feature was evaluated
  // * @param lcsMinEvaluation
  // * threshold that the feature has to exceed. 0 for no filtering.
  // * @return 0 if no lcs that makes the cut. else find the lcs(es) with the
  // * maximal evaluation, and return getIC on these lcses.
  // *
  // * @see #getIC(Iterable)
  // */
  // private double getBestIC(Set<String> lcses,
  // Map<String, Double> conceptFilter) {
  // if (conceptFilter != null) {
  // double currentBest = -1;
  // Set<String> bestLcses = new HashSet<String>();
  // for (String lcs : lcses) {
  // if (conceptFilter.containsKey(lcs)) {
  // double lcsEval = conceptFilter.get(lcs);
  // if (currentBest == -1 || lcsEval > currentBest) {
  // bestLcses.clear();
  // bestLcses.add(lcs);
  // currentBest = lcsEval;
  // } else if (currentBest == lcsEval) {
  // bestLcses.add(lcs);
  // }
  // }
  // }
  // if (bestLcses.size() > 0) {
  // return this.getIC(bestLcses);
  // }
  // } else {
  // // unfiltered - get the lowest ic
  // return this.getIC(lcses);
  // }
  // return 0;
  // }

  // private ConceptInfo getPreloadedConceptInfo(String conceptId) {
  // ConcRel cr = cg.getConceptMap().get(conceptId);
  // if (cr != null) {
  // return this.conceptInfoCache[cr.getNodeIndex()];
  // }
  // return null;
  // }

  public CacheManager getCacheManager() {
    return cacheManager;
  }

  public ClassifierEvaluationDao getClassifierEvaluationDao() {
    return classifierEvaluationDao;
  }

  public ConceptDao getConceptDao() {
    return conceptDao;
  }

  // private String createKey(String c1, String c2) {
  // if (c1.compareTo(c2) < 0) {
  // return new StringBuilder(c1).append("-").append(c2).toString();
  // } else {
  // return new StringBuilder(c2).append("-").append(c1).toString();
  // }
  // }

  @Override
  public ConceptGraph getConceptGraph() {
    return cg;
  }

  public String getConceptGraphName() {
    return conceptGraphName;
  }

  public String getConceptSetName() {
    return conceptSetName;
  }

  public String getCorpusName() {
    return corpusName;
  }

  @Override
  public Map<String, BitSet> getCuiTuiMap() {
    return cuiTuiMap;
  }

  @Override
  public int getDepth(String concept) {
    // if (isPreload()) {
    // // preloaded all concept info - depth should be there
    // ConceptInfo ci = this.getPreloadedConceptInfo(concept);
    // if (ci != null)
    // return (int) ci.getDepth();
    // } else {
    // // get the feature ranks for the intrinsic infocontent -
    // // rank = depth
    // Map<String, FeatureRank> frMap = getICOnDemand(new HashSet<String>(
    // Arrays.asList(concept)), true);
    // if (frMap.containsKey(concept))
    // return frMap.get(concept).getRank();
    // }
    ConcRel cr = this.cg.getConceptMap().get(concept);
    if (cr != null)
      return cr.getDepth();
    return 0;
  }

  @Override
  public double getIC(String concept, boolean intrinsicICMap) {
    double ic = 0d;
    if (intrinsicICMap) {
      ConcRel cr = this.cg.getConceptMap().get(concept);
      if (cr != null)
        ic = cr.getIntrinsicInfoContent();
    } else {
      Double icC = null;
      if (isPreload()) {
        // we preloaded all ic - just look in the cache
        icC = this.corpusICMap.get(concept);
      } else {
        // we need to load the ic from the database on demand
        Map<String, FeatureRank> frMap = getICOnDemand(
            new HashSet<String>(Arrays.asList(concept)), false);
        if (frMap.containsKey(concept))
          return frMap.get(concept).getEvaluation();
      }
      if (icC != null)
        ic = icC;
    }
    return ic;
    // if (isPreload()) {
    // ConceptInfo ci = this.getPreloadedConceptInfo(concept);
    // if (ci != null)
    // return intrinsicICMap ? ci.getIntrinsicIC() : ci.getCorpusIC();
    // } else {
    // Map<String, FeatureRank> frMap = getICOnDemand(new HashSet<String>(
    // Arrays.asList(concept)), intrinsicICMap);
    // if (frMap.containsKey(concept))
    // return frMap.get(concept).getEvaluation();
    // }
    // return 0d;
  }

  private Map<String, FeatureRank> getICOnDemand(Set<String> lcses,
      boolean intrinsicIC) {
    if (lcses == null || lcses.isEmpty())
      return new HashMap<String, FeatureRank>(0);
    Map<String, FeatureRank> lcsICMap;
    lcsICMap = this.classifierEvaluationDao
        .getFeatureRanks(
            lcses,
            intrinsicIC ? null : this.corpusName,
            intrinsicIC ? null : this.conceptSetName,
            null,
            intrinsicIC ? IntrinsicInfoContentEvaluator.INTRINSIC_INFOCONTENT
                : InfoContentEvaluator.INFOCONTENT, null, 0d,
            this.getConceptGraphName());
    return lcsICMap;
  }

  // /**
  // * get the concept with the lowest Information Content of all the LCSs.
  // * Functionality copied from umls interface.
  // *
  // * @todo make this configurable/add a parameter - avg/min/max/median?
  // * @param lcses
  // * @return
  // */
  // public double getIC(Iterable<String> lcses) {
  // double ic = 0;
  // for (String lcs : lcses) {
  // double ictmp = getIC(lcs);
  // if (ic < ictmp)
  // ic = ictmp;
  // }
  // return ic;
  // }
  //
  // public double getIC(String concept1) {
  // Double dRetVal = corpusICMap.get(concept1);
  // if (dRetVal != null)
  // return (double) dRetVal;
  // else
  // return 0;
  // }

  public int getLCS(String concept1, String concept2, Set<String> lcses,
      List<LCSPath> lcsPaths) {
    int lcsDist = 0;
    ConcRel cr1 = getConceptGraph().getConceptMap().get(concept1);
    ConcRel cr2 = getConceptGraph().getConceptMap().get(concept2);
    if (cr1 != null && cr2 != null) {
      lcses.clear();
      if (lcsPaths == null) {
        // no need to get paths which we don't cache - look in the cache
        lcsDist = getLCSFromCache(cr1, cr2, lcses);
      } else {
        lcsPaths.clear();
        // need to get paths - compute the lcses and their paths
        lcsDist = lcs(concept1, concept2, lcsPaths);
        for (LCSPath lcsPath : lcsPaths) {
          lcses.add(lcsPath.getLcs());
        }
      }
    } else {
      if (log.isDebugEnabled()) {
        if (cr1 == null)
          log.debug("could not find concept:" + concept1);
        if (cr2 == null)
          log.debug("could not find concept:" + concept2);
      }
    }
    return lcsDist;
  }

  public Cache getLcsCache() {
    return lcsCache;
  }

  @SuppressWarnings("unchecked")
  private int getLCSFromCache(ConcRel cr1, ConcRel cr2, Set<String> lcses) {
    OrderedPair<String> cacheKey = new OrderedPair<String>(
        cr1.getConceptID(), cr2.getConceptID());
    Element e = this.lcsCache.get(cacheKey);
    if (e != null) {
      // hit the cache - unpack the lcs
      if (e.getObjectValue() != null) {
        Object[] val = (Object[]) e.getObjectValue();
        lcses.addAll((Set<String>) val[1]);
        return (Integer) val[0];
      } else {
        return -1;
      }
    } else {
      // missed the cache - save the lcs
      Object[] val = null;
      Set<ConcRel> lcsCRSet = new HashSet<ConcRel>(2);
      int dist = ConcRel.getLeastCommonConcept(cr1, cr2, lcsCRSet, null);
      if (dist >= 0) {
        val = new Object[2];
        val[0] = dist;
        for (ConcRel cr : lcsCRSet) {
          lcses.add(cr.getConceptID());
        }
        val[1] = lcses;
      }
      e = new Element(cacheKey, val);
      this.lcsCache.put(e);
      return dist;
    }
  }

  public String getLcsImputedType() {
    return lcsImputedType;
  }

  public PageRankService getPageRankService() {
    return pageRankService;
  }

  public Map<SimilarityMetricEnum, SimilarityMetric> getSimilarityMetricMap() {
    return similarityMetricMap;
  }

  public PlatformTransactionManager getTransactionManager() {
    return transactionManager;
  }

  @Override
  public List<String> getTuiList() {
    return this.tuiList;
  }

  public void init() {
    log.info("begin initialization for concept graph: " + conceptGraphName);
    cg = conceptDao.getConceptGraph(conceptGraphName);
    if (cg == null) {
      log.warn("concept graph null, name: " + conceptGraphName);
    } else {
      initSimilarityMetricMap();
      if (isPreload()) {
        try {
          TransactionTemplate t = new TransactionTemplate(
              this.transactionManager);
          t.setPropagationBehavior(TransactionTemplate.PROPAGATION_REQUIRES_NEW);
          t.execute(new TransactionCallback<Object>() {
            @Override
            public Object doInTransaction(TransactionStatus arg0) {
              initInfoContent();
              initCuiTuiMapFromCorpus();
              return null;
            }
          });
        } catch (Exception e) {
          log.info("could not initialize cui-tui map: "
              + e.getMessage()
              + ".  This is expected if you do not have umls installed in your db.");
        }
      }
    }
    log.info("end initialization for concept graph: " + conceptGraphName);
  }

  /**
   * load cui-tui for the specified corpus from the MRSTY table
   */
  public void initCuiTuiMapFromCorpus() {
    // don't duplicate tui strings to save memory
    SortedMap<String, String> tuiMap = new TreeMap<String, String>();
    Map<String, Set<String>> tmpTuiCuiMap = new HashMap<String, Set<String>>();
    List<Object[]> listCuiTui = this.classifierEvaluationDao
        .getCorpusCuiTuis(this.getCorpusName(),
            this.getConceptGraphName(), this.getConceptSetName());
    for (Object[] cuiTui : listCuiTui) {
      String cui = (String) cuiTui[0];
      String tui = (String) cuiTui[1];
      addCuiTuiToMap(tmpTuiCuiMap, tuiMap, cui, tui);
    }
    // map of tui - bitset index
    SortedMap<String, Integer> mapTuiIndex = new TreeMap<String, Integer>();
    // list of tuis corresponding to bitset indices
    List<String> tmpTuiList = new ArrayList<String>(tuiMap.size());
    int index = 0;
    for (String tui : tuiMap.keySet()) {
      mapTuiIndex.put(tui, index++);
      tmpTuiList.add(tui);
    }
    this.tuiList = Collections.unmodifiableList(tmpTuiList);
    // convert list of cuis into bitsets
    // Map<String, BitSet> tmpCuiTuiBitsetMap = new HashMap<String,
    // BitSet>();
    ImmutableMap.Builder<String, BitSet> cuiTuiBitsetMapBuilder = new ImmutableMap.Builder<String, BitSet>();
    for (Map.Entry<String, Set<String>> cuiTuiMapEntry : tmpTuiCuiMap
        .entrySet()) {
      // tmpCuiTuiBitsetMap.put(cuiTuiMapEntry.getKey(),
      // tuiListToBitset(cuiTuiMapEntry.getValue(), mapTuiIndex));
      cuiTuiBitsetMapBuilder.put(cuiTuiMapEntry.getKey(),
          tuiListToBitset(cuiTuiMapEntry.getValue(), mapTuiIndex));
    }
    // this.cuiTuiMap = Collections.unmodifiableMap(tmpCuiTuiBitsetMap);
    this.cuiTuiMap = cuiTuiBitsetMapBuilder.build();
  }

  /**
   * initialize information content caches TODO replace strings with concept
   * ids from conceptGraph to save memory
   */
  private void initInfoContent() {
    // log.info("loading intrinsic infocontent for concept graph: "
    // + conceptGraphName);
    // List<ConceptInfo> listConceptInfo = classifierEvaluationDao
    // .getIntrinsicInfoContent(conceptGraphName);
    // if (listConceptInfo.isEmpty()) {
    // log.warn("intrinsic info content not available! most similarity measures will not work");
    // }
    // this.conceptInfoCache = new ConceptInfo[cg.getConceptMap().size()];
    // for (ConceptInfo ci : listConceptInfo) {
    // ConcRel cr = cg.getConceptMap().get(ci.getConceptId());
    // if (cr != null) {
    // // save a little memory by reusing the string
    // ci.setConceptId(cr.getConceptID());
    // conceptInfoCache[cr.getNodeIndex()] = ci;
    // }
    // }
    // fill intrinsicIC
    // Map<String, FeatureRank> intrinsicICMap = classifierEvaluationDao
    // .getIntrinsicInfoContent(conceptGraphName);
    // for (Map.Entry<String, FeatureRank> icMapEntry : intrinsicICMap
    // .entrySet()) {
    // FeatureRank r = icMapEntry.getValue();
    // ConcRel cr = cg.getConceptMap().get(r.getFeatureName());
    // if (cr != null) {
    // ConceptInfo ci = new ConceptInfo();
    // ci.setConceptId(cr.getConceptID());
    // ci.setDepth(r.getRank());
    // ci.setIntrinsicIC(r.getEvaluation());
    // conceptInfoMap.put(ci.getConceptId(), ci);
    // }
    // }
    // fill corpusIC
    log.info("loading corpus infocontent for corpusName=" + corpusName
        + ", conceptGraphName=" + conceptGraphName
        + ", conceptSetName=" + conceptSetName);
    Map<String, Double> corpusICMap = classifierEvaluationDao
        .getInfoContent(corpusName, conceptGraphName,
            this.conceptSetName);
    if (corpusICMap == null || corpusICMap.isEmpty()) {
      log.warn("IC not found");
    }
    ImmutableMap.Builder<String, Double> mb = new ImmutableMap.Builder<String, Double>();
    for (Map.Entry<String, Double> corpusICEntry : corpusICMap.entrySet()) {
      ConcRel cr = cg.getConceptMap().get(corpusICEntry.getKey());
      if (cr != null) {
        mb.put(cr.getConceptID(), corpusICEntry.getValue());
      }
    }
    this.corpusICMap = mb.build();
    // ConceptInfo ci = this.conceptInfoCache[cr.getNodeIndex()];
    // if (ci == null) {
    // // this shouldn't happen! there should be intrinsic ic for
    // // this concept
    // ci = new ConceptInfo();
    // ci.setConceptId(cr.getConceptID());
    // this.conceptInfoCache[cr.getNodeIndex()] = ci;
    // }
    // ci.setCorpusIC(corpusICEntry.getValue());
    // }
    // }
  }

  /**
   * initialize the metrics
   */
  private void initSimilarityMetricMap() {
    log.info("initializing similarity measures");
    // Double maxIC = this.classifierEvaluationDao.getMaxFeatureEvaluation(
    // null, null, null,
    // IntrinsicInfoContentEvaluator.INTRINSIC_INFOCONTENT, 0, 0,
    // conceptGraphName);
    // Integer maxDepth = this.classifierEvaluationDao
    // .getMaxDepth(conceptGraphName);
    double maxIC = this.cg.getIntrinsicICMax();
    int maxDepth = this.cg.getDepthMax();
    this.similarityMetricMap = new HashMap<SimilarityMetricEnum, SimilarityMetric>(
        SimilarityMetricEnum.values().length);
    if (maxDepth > 0) {
      this.similarityMetricMap.put(SimilarityMetricEnum.LCH,
          new LCHMetric(this, maxDepth));
      this.similarityMetricMap.put(SimilarityMetricEnum.LIN,
          new LinMetric(this, false));
      this.similarityMetricMap.put(SimilarityMetricEnum.INTRINSIC_LIN,
          new LinMetric(this, true));
      this.similarityMetricMap.put(SimilarityMetricEnum.INTRINSIC_LCH,
          new IntrinsicLCHMetric(this, maxIC));
      this.similarityMetricMap.put(SimilarityMetricEnum.PATH,
          new PathMetric(this));
      this.similarityMetricMap.put(SimilarityMetricEnum.INTRINSIC_PATH,
          new IntrinsicPathMetric(this, maxIC));
      this.similarityMetricMap.put(SimilarityMetricEnum.RADA,
          new RadaMetric(this, maxDepth));
      this.similarityMetricMap.put(SimilarityMetricEnum.INTRINSIC_RADA,
          new IntrinsicRadaMetric(this, maxIC));
      this.similarityMetricMap.put(SimilarityMetricEnum.SOKAL,
          new SokalSneathMetric(this));
      this.similarityMetricMap.put(SimilarityMetricEnum.JACCARD,
          new JaccardMetric(this));
      this.similarityMetricMap.put(SimilarityMetricEnum.WUPALMER,
          new WuPalmerMetric(this));
    } else {
      this.similarityMetricMap.put(SimilarityMetricEnum.PAGERANK,
          new PageRankMetric(this, this.getPageRankService()));
    }
  }

  public boolean isPreload() {
    return preload;
  }

  public int lcs(String concept1, String concept2, List<LCSPath> lcsPaths) {
    ConcRel cr1 = cg.getConceptMap().get(concept1);
    ConcRel cr2 = cg.getConceptMap().get(concept2);
    int dist = -1;
    if (cr1 != null && cr2 != null) {
      Set<ConcRel> crlcses = new HashSet<ConcRel>();
      Map<ConcRel, LCSPath> crpaths = new HashMap<ConcRel, LCSPath>();
      dist = ConcRel.getLeastCommonConcept(cr1, cr2, crlcses, crpaths);
      lcsPaths.addAll(crpaths.values());
    }
    return dist;
  }

  // /*
  // * (non-Javadoc)
  // *
  // * @see
  // org.apache.ctakes.ytex.kernel.ConceptSimilarity#lch(java.lang.String,
  // * java.lang.String)
  // */
  // public double lch(String concept1, String concept2) {
  // double dm = 2 * cg.getDepthMax() + 1.0;
  // ConcRel cr1 = cg.getConceptMap().get(concept1);
  // ConcRel cr2 = cg.getConceptMap().get(concept2);
  // if (cr1 != null && cr2 != null) {
  // Set<String> lcses = new HashSet<String>();
  // int lcsDist = getLCSFromCache(cr1, cr2, lcses);
  // // leacock is defined as -log([path length]/(2*[depth])
  // double lch = -Math.log(((double) lcsDist + 1.0) / dm);
  // // scale to depth
  // return lch / Math.log(dm);
  // } else {
  // if (log.isDebugEnabled()) {
  // if (cr1 == null)
  // log.debug("could not find concept:" + concept1);
  // if (cr2 == null)
  // log.debug("could not find concept:" + concept2);
  // }
  // return 0;
  // }
  // }

  /**
   * For the given label and cutoff, get the corresponding concepts whose
   * propagated ig meets the threshold. Used by lin kernel to find concepts
   * that actually have a non-trivial similarity
   *
   * @param label
   *            label
   * @param rankCutoff
   *            cutoff
   * @param conceptFilter
   *            set to fill with concepts
   * @return double minimum evaluation
   */
  @Override
  public double loadConceptFilter(String label, int rankCutoff,
      Map<String, Double> conceptFilter) {
    List<FeatureRank> imputedConcepts = this.classifierEvaluationDao
        .getImputedFeaturesByPropagatedCutoff(corpusName,
            conceptSetName, label, lcsImputedType
                + ImputedFeatureEvaluator.SUFFIX_IMPUTED,
            conceptGraphName, lcsImputedType
                + ImputedFeatureEvaluator.SUFFIX_PROP,
            rankCutoff);
    double minEval = 1d;
    for (FeatureRank r : imputedConcepts) {
      conceptFilter.put(r.getFeatureName(), r.getEvaluation());
      if (minEval >= r.getEvaluation())
        minEval = r.getEvaluation();
    }
    return minEval;
  }

  // public double lin(String concept1, String concept2) {
  // return filteredLin(concept1, concept2, null);
  // }

  /**
   * Sets the ehcache {@link CacheManager} held by this service.
   *
   * @param cacheManager
   *            cache manager to store; kept by reference, not validated
   */
  public void setCacheManager(CacheManager cacheManager) {
    this.cacheManager = cacheManager;
  }

  /**
   * Sets the DAO that {@code loadConceptFilter} queries for imputed feature
   * ranks.
   *
   * @param classifierEvaluationDao
   *            DAO to store; kept by reference, not validated
   */
  public void setClassifierEvaluationDao(
      ClassifierEvaluationDao classifierEvaluationDao) {
    this.classifierEvaluationDao = classifierEvaluationDao;
  }

  /**
   * Sets the {@code ConceptDao} held by this service.
   *
   * @param conceptDao
   *            DAO to store; kept by reference, not validated
   */
  public void setConceptDao(ConceptDao conceptDao) {
    this.conceptDao = conceptDao;
  }

  /**
   * Sets the concept graph name. {@code loadConceptFilter} passes this value
   * to the classifier evaluation DAO query.
   *
   * @param conceptGraphName
   *            name to store; not validated
   */
  public void setConceptGraphName(String conceptGraphName) {
    this.conceptGraphName = conceptGraphName;
  }

  /**
   * Sets the concept set name. {@code loadConceptFilter} passes this value
   * to the classifier evaluation DAO query.
   *
   * @param conceptSetName
   *            name to store; not validated
   */
  public void setConceptSetName(String conceptSetName) {
    this.conceptSetName = conceptSetName;
  }

  /**
   * Sets the corpus name. {@code loadConceptFilter} passes this value to the
   * classifier evaluation DAO query.
   *
   * @param corpusName
   *            name to store; not validated
   */
  public void setCorpusName(String corpusName) {
    this.corpusName = corpusName;
  }

  /**
   * Sets the ehcache {@link Cache} stored in the {@code lcsCache} field.
   * NOTE(review): presumably caches least-common-subsumer results - confirm
   * against the code that reads this field.
   *
   * @param lcsCache
   *            cache to store; kept by reference, not validated
   */
  public void setLcsCache(Cache lcsCache) {
    this.lcsCache = lcsCache;
  }

  /**
   * Sets the imputed type prefix. {@code loadConceptFilter} concatenates it
   * with {@code ImputedFeatureEvaluator.SUFFIX_IMPUTED} and
   * {@code ImputedFeatureEvaluator.SUFFIX_PROP} to build the arguments of
   * its DAO query.
   *
   * @param lcsImputedType
   *            prefix to store; not validated
   */
  public void setLcsImputedType(String lcsImputedType) {
    this.lcsImputedType = lcsImputedType;
  }

  // double minEval = 1d;
  // List<FeatureRank> listPropagatedConcepts = classifierEvaluationDao
  // .getTopFeatures(corpusName, conceptSetName, label,
  // ImputedFeatureEvaluator.MeasureType.INFOGAIN.toString()
  // + ImputedFeatureEvaluator.SUFFIX_PROP, 0, 0,
  // conceptGraphName, rankCutoff);
  // for (FeatureRank r : listPropagatedConcepts) {
  // ConcRel cr = cg.getConceptMap().get(r.getFeatureName());
  // if (cr != null) {
  // addSubtree(conceptFilterSet, cr);
  // }
  // if (r.getEvaluation() < minEval)
  // minEval = r.getEvaluation();
  // }
  // return minEval;
  // }
  //
  // /**
  // * add all children of parent to conceptSet. Limit only to children that
  // * actually appear in the corpus
  // *
  // * @param conceptSet
  // * set of concepts to add ids to
  // * @param parent
  // * parent which will be added to the conceptSet
  // * @param corpusICSet
  // * set of concepts and hypernyms contained in corpus
  // */
  // private void addSubtree(Map<String, Double> conceptSet, ConcRel parent) {
  // if (!conceptSet.containsKey(parent.getConceptID())
  // && conceptFreq.containsKey(parent.getConceptID())) {
  // conceptSet.put(parent.getConceptID(), 0d);
  // for (ConcRel child : parent.getChildren()) {
  // addSubtree(conceptSet, child);
  // }
  // }
  // }

  /**
   * Sets the {@code PageRankService} held by this service.
   *
   * @param pageRankService
   *            service to store; kept by reference, not validated
   */
  public void setPageRankService(PageRankService pageRankService) {
    this.pageRankService = pageRankService;
  }

  /**
   * Sets the {@code preload} flag.
   * NOTE(review): presumably controls whether data is loaded eagerly during
   * initialization - confirm against the code that reads this field.
   *
   * @param preload
   *            new value of the flag
   */
  public void setPreload(boolean preload) {
    this.preload = preload;
  }

  /**
   * Sets the map from metric identifier to metric implementation. The
   * single-pair {@code similarity} method looks up every requested metric in
   * this map.
   *
   * @param similarityMetricMap
   *            map to store; kept by reference, no defensive copy is made
   */
  public void setSimilarityMetricMap(
      Map<SimilarityMetricEnum, SimilarityMetric> similarityMetricMap) {
    this.similarityMetricMap = similarityMetricMap;
  }

  /**
   * Sets the {@code PlatformTransactionManager} held by this service.
   *
   * @param transactionManager
   *            transaction manager to store; kept by reference, not
   *            validated
   */
  public void setTransactionManager(
      PlatformTransactionManager transactionManager) {
    this.transactionManager = transactionManager;
  }

  @Override
  public List<ConceptPairSimilarity> similarity(
      List<ConceptPair> conceptPairs, List<SimilarityMetricEnum> metrics,
      Map<String, Double> conceptFilter, boolean lcs) {
    List<ConceptPairSimilarity> conceptSimMap = new ArrayList<ConceptPairSimilarity>(
        conceptPairs.size());
    for (ConceptPair conceptPair : conceptPairs) {
      conceptSimMap.add(similarity(metrics, conceptPair.getConcept1(),
          conceptPair.getConcept2(), conceptFilter, lcs));
    }
    return conceptSimMap;
  }

  /**
   *
   */
  @Override
  public ConceptPairSimilarity similarity(List<SimilarityMetricEnum> metrics,
      String concept1, String concept2,
      Map<String, Double> conceptFilter, boolean lcs) {
    // allocate simInfo if this isn't provided
    SimilarityInfo simInfo = new SimilarityInfo();
    if (lcs)
      simInfo.setLcsPaths(new ArrayList<LCSPath>(1));
    // allocate result map
    List<Double> similarities = new ArrayList<Double>(metrics.size());
    if (cg != null) {
      // iterate over metrics, compute, stuff in map
      for (SimilarityMetricEnum metric : metrics) {
        double sim = this.similarityMetricMap.get(metric).similarity(
            concept1, concept2, conceptFilter, simInfo);
        similarities.add(sim);
      }
    }
    ConceptPairSimilarity csim = new ConceptPairSimilarity();
    csim.setConceptPair(new ConceptPair(concept1, concept2));
    csim.setSimilarities(similarities);
    csim.setSimilarityInfo(simInfo);
    return csim;
  }

  /**
   * convert the list of tuis into a bitset
   *
   * @param tuis
   * @param mapTuiIndex
   * @return
   */
  private BitSet tuiListToBitset(Set<String> tuis,
      SortedMap<String, Integer> mapTuiIndex) {
    BitSet bs = new BitSet(mapTuiIndex.size());
    for (String tui : tuis) {
      bs.set(mapTuiIndex.get(tui));
    }
    return bs;
  }
}
TOP

Related Classes of org.apache.ctakes.ytex.kernel.metric.ConceptSimilarityServiceImpl

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.