/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.ytex.kernel.metric;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.ctakes.ytex.kernel.ImputedFeatureEvaluator;
import org.apache.ctakes.ytex.kernel.InfoContentEvaluator;
import org.apache.ctakes.ytex.kernel.IntrinsicInfoContentEvaluator;
import org.apache.ctakes.ytex.kernel.OrderedPair;
import org.apache.ctakes.ytex.kernel.SimSvcContextHolder;
import org.apache.ctakes.ytex.kernel.dao.ClassifierEvaluationDao;
import org.apache.ctakes.ytex.kernel.dao.ConceptDao;
import org.apache.ctakes.ytex.kernel.model.ConcRel;
import org.apache.ctakes.ytex.kernel.model.ConceptGraph;
import org.apache.ctakes.ytex.kernel.model.FeatureRank;
import org.apache.ctakes.ytex.kernel.pagerank.PageRankService;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.TransactionStatus;
import org.springframework.transaction.support.TransactionCallback;
import org.springframework.transaction.support.TransactionTemplate;
import com.google.common.collect.ImmutableMap;
/**
* compute concept similarity
*
* @author vijay
*
*/
public class ConceptSimilarityServiceImpl implements ConceptSimilarityService {
/** Commons-logging logger shared by all instances of this service. */
private static final Log log = LogFactory
.getLog(ConceptSimilarityServiceImpl.class);
/**
 * Render a list of LCS paths as one string of the form
 * {@code lcs1=path1|lcs2=path2}. Each path is rendered with its own
 * {@code toString()}; an empty list produces an empty string.
 *
 * @param lcsPaths
 *            paths to format, must not be null
 * @return the pipe-separated representation
 */
private static String formatPaths(List<LCSPath> lcsPaths) {
    StringBuilder formatted = new StringBuilder();
    // empty before the first entry, "|" before every following one
    String separator = "";
    for (LCSPath path : lcsPaths) {
        formatted.append(separator);
        formatted.append(path.getLcs()).append("=").append(path.toString());
        separator = "|";
    }
    return formatted.toString();
}
/**
 * Command-line entry point. Parses the {@code concepts}, {@code metrics},
 * {@code out} and {@code lcs} options, obtains the
 * {@link ConceptSimilarityService} bean from the application context,
 * computes the requested similarities and prints them as a tab-separated
 * table to the {@code out} file, or to stdout when {@code out} is absent.
 * If the options cannot be parsed, the parse error and usage help are
 * printed instead.
 *
 * @param args
 *            command-line arguments
 * @throws IOException
 *             if the concepts file or the output file cannot be opened or
 *             read
 */
@SuppressWarnings("static-access")
public static void main(String args[]) throws IOException {
    Options options = new Options();
    options.addOption(OptionBuilder
            .withArgName("concepts")
            .hasArg()
            .withDescription(
                    "concept pairs or a file containing concept pairs. To specify pairs on command line, separate concepts by comma, concept pairs by semicolon. For file, separate concepts by comma or tab, each concept pair on a new line.")
            .isRequired(true).create("concepts"));
    options.addOption(OptionBuilder
            .withArgName("metrics")
            .hasArg()
            .withDescription(
                    "comma-separated list of metrics. Valid metrics: "
                            + Arrays.asList(SimilarityMetricEnum.values()))
            .isRequired(true).create("metrics"));
    options.addOption(OptionBuilder
            .withArgName("out")
            .hasArg()
            .withDescription(
                    "file to write oputput to. if not specified, output sent to stdout.")
            .create("out"));
    options.addOption(OptionBuilder.withArgName("lcs")
            .withDescription("output lcs and path for each concept pair")
            .create("lcs"));
    try {
        CommandLineParser parser = new GnuParser();
        CommandLine line = parser.parse(options, args);
        String concepts = line.getOptionValue("concepts");
        String metrics = line.getOptionValue("metrics");
        String out = line.getOptionValue("out");
        boolean lcs = line.hasOption("lcs");
        PrintStream os = null;
        try {
            if (out != null) {
                os = new PrintStream(new BufferedOutputStream(
                        new FileOutputStream(out)));
            } else {
                os = System.out;
            }
            List<ConceptPair> conceptPairs = parseConcepts(concepts);
            List<SimilarityMetricEnum> metricList = parseMetrics(metrics);
            ConceptSimilarityService simSvc = SimSvcContextHolder
                    .getApplicationContext().getBean(
                            ConceptSimilarityService.class);
            List<SimilarityInfo> simInfos = lcs ? new ArrayList<SimilarityInfo>(
                    conceptPairs.size()) : null;
            List<ConceptPairSimilarity> conceptSimMap = simSvc.similarity(
                    conceptPairs, metricList, null, lcs);
            printSimilarities(conceptPairs, conceptSimMap, metricList,
                    simInfos, lcs, os);
        } finally {
            // os is still null when opening the output file failed; check
            // for that explicitly instead of hiding the resulting
            // NullPointerException behind an empty catch-all
            if (os != null) {
                if (out != null) {
                    // we own the file stream, so close it
                    os.close();
                } else {
                    // never close System.out, just push out what we wrote
                    os.flush();
                }
            }
        }
    } catch (ParseException pe) {
        // tell the user what was wrong before showing the usage text
        System.err.println(pe.getMessage());
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(
                "java " + ConceptSimilarityServiceImpl.class.getName()
                        + " get concept similiarity", options);
    }
}
/**
 * Parse concept pairs from the {@code concepts} command-line value. If the
 * value names an existing file, the pairs are read from that file;
 * otherwise the value itself is parsed. Every line is split into pairs on
 * semicolons, and every pair into two concepts on a comma or tab.
 * Surrounding whitespace is ignored and blank entries are skipped;
 * entries that do not yield exactly two non-empty concepts are reported on
 * stderr and dropped.
 *
 * @param concepts
 *            file name or inline list of concept pairs
 * @return the pairs that could be parsed, in input order
 * @throws IOException
 *             if the file cannot be read
 */
private static List<ConceptPair> parseConcepts(String concepts)
        throws IOException {
    BufferedReader r = null;
    try {
        List<ConceptPair> conceptPairs = new ArrayList<ConceptPair>();
        File f = new File(concepts);
        if (f.exists()) {
            r = new BufferedReader(new FileReader(f));
        } else {
            r = new BufferedReader(new StringReader(concepts));
        }
        String line = null;
        while ((line = r.readLine()) != null) {
            // for command line, split pairs by semicolon
            for (String subline : line.split(";")) {
                String trimmed = subline.trim();
                if (trimmed.length() == 0) {
                    // blank line or empty entry between semicolons
                    continue;
                }
                String pair[] = trimmed.split(",|\\t");
                String first = pair.length == 2 ? pair[0].trim() : "";
                String second = pair.length == 2 ? pair[1].trim() : "";
                if (first.length() == 0 || second.length() == 0) {
                    System.err.println("cannot parse concept pair: "
                            + subline);
                } else {
                    conceptPairs.add(new ConceptPair(first, second));
                }
            }
        }
        return conceptPairs;
    } finally {
        if (r != null)
            r.close();
    }
}
/**
 * Parse a comma-separated list of metric names into
 * {@link SimilarityMetricEnum} values. Whitespace around a name is
 * ignored. Names that are not enum constants are reported on stderr and
 * skipped.
 *
 * @param metrics
 *            comma-separated metric names
 * @return the recognized metrics, in input order
 */
private static List<SimilarityMetricEnum> parseMetrics(String metrics) {
    List<SimilarityMetricEnum> metricSet = new ArrayList<SimilarityMetricEnum>();
    for (String metric : metrics.split(",")) {
        String name = metric.trim();
        try {
            metricSet.add(SimilarityMetricEnum.valueOf(name));
        } catch (IllegalArgumentException e) {
            // valueOf never returns null - it throws for unknown names
            System.err.println("invalid metric: " + name);
        }
    }
    return metricSet;
}
/**
 * Write the similarity results as a tab-separated table: a header row,
 * then one row per entry of {@code conceptSimList} holding both concepts
 * and one column per similarity value. When {@code lcs} is set, four more
 * columns are written: the pipe-separated LCSs, the corpus LCS, the
 * intrinsic LCS and the formatted paths.
 *
 * @param conceptPairs
 *            not read by this method
 * @param conceptSimList
 *            results to print
 * @param metricList
 *            metrics, used for the header columns
 * @param simInfos
 *            not read by this method
 * @param lcs
 *            whether to print the LCS columns
 * @param os
 *            stream to write to
 */
private static void printSimilarities(List<ConceptPair> conceptPairs,
        List<ConceptPairSimilarity> conceptSimList,
        List<SimilarityMetricEnum> metricList,
        List<SimilarityInfo> simInfos, boolean lcs, PrintStream os) {
    // header row
    StringBuilder header = new StringBuilder("Concept 1\tConcept 2");
    for (SimilarityMetricEnum metric : metricList) {
        header.append("\t").append(metric);
    }
    if (lcs) {
        header.append("\tlcs(s)\tcorpus lcs\tintrinsic lcs\tpaths");
    }
    os.println(header.toString());
    // one row per concept pair
    for (ConceptPairSimilarity row : conceptSimList) {
        ConceptPair pair = row.getConceptPair();
        os.print(pair.getConcept1());
        os.print("\t");
        os.print(pair.getConcept2());
        for (Double score : row.getSimilarities()) {
            os.print("\t");
            // a missing value is printed the same way print(0d) would
            os.print(score == null ? String.valueOf(0d) : String.format(
                    "%6f", score));
        }
        if (lcs) {
            SimilarityInfo info = row.getSimilarityInfo();
            os.print("\t");
            Iterator<String> lcsIds = info.getLcses().iterator();
            if (lcsIds.hasNext()) {
                os.print(lcsIds.next());
                while (lcsIds.hasNext()) {
                    os.print('|');
                    os.print(lcsIds.next());
                }
            }
            os.print("\t");
            if (info.getCorpusLcs() == null) {
                os.print("");
            } else {
                os.print(info.getCorpusLcs());
            }
            os.print("\t");
            if (info.getIntrinsicLcs() == null) {
                os.print("");
            } else {
                os.print(info.getIntrinsicLcs());
            }
            os.print("\t");
            os.print(formatPaths(info.getLcsPaths()));
        }
        os.println();
    }
}
/** Ehcache manager; within this class it is only stored and returned by its accessor pair. */
private CacheManager cacheManager;
/** Concept graph loaded by {@link #init()}; stays null if the DAO returns no graph. */
private ConceptGraph cg = null;
/** DAO used to load information content, feature ranks, imputed features and cui-tui pairs. */
private ClassifierEvaluationDao classifierEvaluationDao;
/** DAO from which {@link #init()} obtains the concept graph. */
private ConceptDao conceptDao;
/** Name of the concept graph to load; also passed to the evaluation DAO queries. */
private String conceptGraphName;
/** Concept set name passed to the evaluation DAO queries. */
private String conceptSetName;
// /**
// * information concept cache
// */
// private Map<String, Double> corpusICMap = null;
/** Corpus name passed to the evaluation DAO queries. */
private String corpusName;
/**
 * Concept id to bitset of semantic types, built by
 * {@link #initCuiTuiMapFromCorpus()}; bit positions are indices into
 * {@link #tuiList}.
 */
private Map<String, BitSet> cuiTuiMap;
// private Map<String, ConceptInfo> conceptInfoMap = null;
// private ConceptInfo[] conceptInfoCache;
/**
 * Cache of least common subsumers, keyed by an {@link OrderedPair} of
 * concept ids. The value is an {@code Object[]} holding the distance and
 * the set of LCS concept ids, or null when no LCS was found.
 */
private Cache lcsCache;
/**
 * Measure name that {@link #loadConceptFilter} combines with the
 * {@link ImputedFeatureEvaluator} suffixes; defaults to the INFOGAIN
 * measure name.
 */
private String lcsImputedType = ImputedFeatureEvaluator.MeasureType.INFOGAIN
.getName();
/** Page rank service handed to the PAGERANK metric. */
private PageRankService pageRankService;
/**
 * When true (the default), {@link #init()} preloads the corpus
 * information content and the cui-tui map; when false, corpus information
 * content is fetched from the DAO on demand.
 */
private boolean preload = true;
/** Corpus information content per concept id, built by {@link #initInfoContent()}. */
private Map<String, Double> corpusICMap;
/** Metric implementations per metric enum, built by {@link #initSimilarityMetricMap()}. */
private Map<SimilarityMetricEnum, SimilarityMetric> similarityMetricMap = null;
/** Transaction manager used by {@link #init()} to run the preload in a new transaction. */
private PlatformTransactionManager transactionManager;
/** Unmodifiable list of semantic types; the list index is the bit position used in {@link #cuiTuiMap}. */
private List<String> tuiList;
/**
 * Record that {@code cui} has semantic type {@code tui}. The tui string is
 * first replaced by the instance already registered in {@code tuiMap} (or
 * registered there if new), so that equal tuis share one String object.
 *
 * @param cuiTuiMap
 *            cui to set of tuis; a new set is created for an unseen cui
 * @param tuiMap
 *            maps every tui seen so far to its shared instance
 * @param cui
 *            concept id
 * @param tui
 *            semantic type of the concept
 */
private void addCuiTuiToMap(Map<String, Set<String>> cuiTuiMap,
        Map<String, String> tuiMap, String cui, String tui) {
    String sharedTui;
    if (!tuiMap.containsKey(tui)) {
        // first time we see this tui - it becomes the shared instance
        tuiMap.put(tui, tui);
        sharedTui = tui;
    } else {
        sharedTui = tuiMap.get(tui);
    }
    Set<String> tuisOfCui = cuiTuiMap.get(cui);
    if (tuisOfCui == null) {
        tuisOfCui = new HashSet<String>();
        cuiTuiMap.put(cui, tuisOfCui);
    }
    tuisOfCui.add(sharedTui);
}
/**
 * Pick the best least common subsumer from a set of candidates.
 * <p>
 * Without a filter, the candidate with the highest information content
 * wins. With a filter, only candidates that are keys of
 * {@code conceptFilter} are considered; those with the highest filter
 * value are kept, and among them the one with the highest information
 * content wins.
 *
 * @param lcses
 *            candidate LCS concept ids
 * @param intrinsicIC
 *            use intrinsic (true) or corpus (false) information content
 * @param conceptFilter
 *            optional map of concept id to evaluation; null for no
 *            filtering
 * @return two-element array holding the chosen concept id and its
 *         information content, or null if a filter was given and no
 *         candidate is contained in it
 */
@Override
public Object[] getBestLCS(Set<String> lcses, boolean intrinsicIC,
        Map<String, Double> conceptFilter) {
    // information content of every candidate
    Map<String, Double> lcsICMap = new HashMap<String, Double>(lcses.size());
    for (String lcs : lcses) {
        lcsICMap.put(lcs, getIC(lcs, intrinsicIC));
    }
    if (conceptFilter == null) {
        // unfiltered - choose among all candidates
        return this.getBestLCS(lcses, lcsICMap);
    }
    // keep only the candidates with the highest filter evaluation
    double topEval = -1;
    Set<String> topLcses = new HashSet<String>();
    for (String lcs : lcses) {
        if (!conceptFilter.containsKey(lcs)) {
            continue;
        }
        double lcsEval = conceptFilter.get(lcs);
        if (topEval == -1 || lcsEval > topEval) {
            topLcses.clear();
            topLcses.add(lcs);
            topEval = lcsEval;
        } else if (topEval == lcsEval) {
            topLcses.add(lcs);
        }
    }
    // null signals that no lcs made the cut
    return topLcses.isEmpty() ? null : this.getBestLCS(topLcses, lcsICMap);
}
/**
 * Choose the candidate with the highest information content.
 *
 * @param lcses
 *            candidate LCS concept ids
 * @param icMap
 *            information content per concept id; candidates without an
 *            entry are ignored
 * @return two-element array: the chosen concept id (null if no candidate
 *         has an entry) and its information content as a Double, with
 *         negative values replaced by 0
 */
public Object[] getBestLCS(Set<String> lcses, Map<String, Double> icMap) {
    String winner = null;
    double highestIc = -1;
    for (String candidate : lcses) {
        Double candidateIc = icMap.get(candidate);
        if (candidateIc == null) {
            continue;
        }
        double value = candidateIc.doubleValue();
        if (value > highestIc) {
            highestIc = value;
            winner = candidate;
        }
    }
    return new Object[] { winner, highestIc < 0 ? 0d : highestIc };
}
// /**
// * return lin measure. optionally filter lin measure so that only concepts
// * that have an lcs that is relevant to the classification task have a
// * non-zero lin measure.
// *
// * relevant concepts are those whose evaluation wrt the label exceeds a
// * threshold.
// *
// * @param concept1
// * @param concept2
// * @param label
// * if not null, then filter lcses.
// * @param lcsMinEvaluation
// * if gt; 0, then filter lcses. this is the threshold.
// * @return 0 - no lcs, or no lcs that meets the threshold.
// */
// @Override
// public double filteredLin(String concept1, String concept2,
// Map<String, Double> conceptFilter) {
// double ic1 = getIC(concept1);
// double ic2 = getIC(concept2);
// // lin not defined if one of the concepts doesn't exist in the corpus
// if (ic1 == 0 || ic2 == 0)
// return 0;
// double denom = getIC(concept1) + getIC(concept2);
// if (denom != 0) {
// ConcRel cr1 = cg.getConceptMap().get(concept1);
// ConcRel cr2 = cg.getConceptMap().get(concept2);
// if (cr1 != null && cr2 != null) {
// Set<String> lcses = new HashSet<String>();
// int dist = getLCSFromCache(cr1, cr2, lcses);
// if (dist > 0) {
// double ic = getBestIC(lcses, conceptFilter);
// return 2 * ic / denom;
// }
// }
// }
// return 0;
// }
// /**
// * get the information content for the concept with the highest evaluation
// * greater than a specified threshold.
// *
// * If threshold 0, get the lowest IC of all the lcs's.
// *
// * @param lcses
// * the least common subsumers of a pair of concepts
// * @param label
// * label against which feature was evaluated
// * @param lcsMinEvaluation
// * threshold that the feature has to exceed. 0 for no filtering.
// * @return 0 if no lcs that makes the cut. else find the lcs(es) with the
// * maximal evaluation, and return getIC on these lcses.
// *
// * @see #getIC(Iterable)
// */
// private double getBestIC(Set<String> lcses,
// Map<String, Double> conceptFilter) {
// if (conceptFilter != null) {
// double currentBest = -1;
// Set<String> bestLcses = new HashSet<String>();
// for (String lcs : lcses) {
// if (conceptFilter.containsKey(lcs)) {
// double lcsEval = conceptFilter.get(lcs);
// if (currentBest == -1 || lcsEval > currentBest) {
// bestLcses.clear();
// bestLcses.add(lcs);
// currentBest = lcsEval;
// } else if (currentBest == lcsEval) {
// bestLcses.add(lcs);
// }
// }
// }
// if (bestLcses.size() > 0) {
// return this.getIC(bestLcses);
// }
// } else {
// // unfiltered - get the lowest ic
// return this.getIC(lcses);
// }
// return 0;
// }
// private ConceptInfo getPreloadedConceptInfo(String conceptId) {
// ConcRel cr = cg.getConceptMap().get(conceptId);
// if (cr != null) {
// return this.conceptInfoCache[cr.getNodeIndex()];
// }
// return null;
// }
/**
 * @return the configured cache manager
 */
public CacheManager getCacheManager() {
    return this.cacheManager;
}
/**
 * @return the DAO used for information content, feature ranks and cui-tui
 *         lookups
 */
public ClassifierEvaluationDao getClassifierEvaluationDao() {
    return this.classifierEvaluationDao;
}
/**
 * @return the DAO from which the concept graph is loaded
 */
public ConceptDao getConceptDao() {
    return this.conceptDao;
}
// private String createKey(String c1, String c2) {
// if (c1.compareTo(c2) < 0) {
// return new StringBuilder(c1).append("-").append(c2).toString();
// } else {
// return new StringBuilder(c2).append("-").append(c1).toString();
// }
// }
/**
 * @return the concept graph loaded by {@link #init()}, or null if none
 *         has been loaded
 */
@Override
public ConceptGraph getConceptGraph() {
    return this.cg;
}
/**
 * @return name of the concept graph this service works on
 */
public String getConceptGraphName() {
    return this.conceptGraphName;
}
/**
 * @return concept set name passed to the evaluation DAO queries
 */
public String getConceptSetName() {
    return this.conceptSetName;
}
/**
 * @return corpus name passed to the evaluation DAO queries
 */
public String getCorpusName() {
    return this.corpusName;
}
/**
 * @return map of concept id to semantic-type bitset built by
 *         {@link #initCuiTuiMapFromCorpus()}; null if that method has not
 *         completed
 */
@Override
public Map<String, BitSet> getCuiTuiMap() {
    return this.cuiTuiMap;
}
/**
 * Look up the depth of a concept in the concept graph.
 *
 * @param concept
 *            concept id
 * @return the depth stored on the concept's graph node, or 0 if the
 *         concept is not in the graph
 */
@Override
public int getDepth(String concept) {
    ConcRel node = this.cg.getConceptMap().get(concept);
    return node == null ? 0 : node.getDepth();
}
/**
 * Get the information content of a concept.
 * <p>
 * Intrinsic information content is read from the concept's node in the
 * concept graph. Corpus information content is read from the preloaded
 * map when preloading is enabled, otherwise it is fetched from the DAO on
 * demand.
 *
 * @param concept
 *            concept id
 * @param intrinsicICMap
 *            true for intrinsic, false for corpus information content
 * @return the information content, or 0 if none is known for the concept
 *         (including the case where preloading is enabled but the corpus
 *         map could not be loaded)
 */
@Override
public double getIC(String concept, boolean intrinsicICMap) {
    if (intrinsicICMap) {
        ConcRel cr = this.cg.getConceptMap().get(concept);
        return cr != null ? cr.getIntrinsicInfoContent() : 0d;
    }
    if (isPreload()) {
        // init() logs and swallows preload failures, so the map may still
        // be null here - treat that like an unknown concept
        Map<String, Double> preloaded = this.corpusICMap;
        Double icC = preloaded != null ? preloaded.get(concept) : null;
        return icC != null ? icC.doubleValue() : 0d;
    }
    // we need to load the ic from the database on demand
    Map<String, FeatureRank> frMap = getICOnDemand(new HashSet<String>(
            Arrays.asList(concept)), false);
    FeatureRank rank = frMap.get(concept);
    return rank != null ? rank.getEvaluation() : 0d;
}
/**
 * Fetch information content for a set of concepts from the evaluation
 * DAO. For intrinsic information content the corpus and concept set names
 * are passed as null.
 *
 * @param lcses
 *            concept ids to look up
 * @param intrinsicIC
 *            true for intrinsic, false for corpus information content
 * @return feature ranks as returned by the DAO; a new empty map if
 *         {@code lcses} is null or empty
 */
private Map<String, FeatureRank> getICOnDemand(Set<String> lcses,
        boolean intrinsicIC) {
    if (lcses == null || lcses.isEmpty())
        return new HashMap<String, FeatureRank>(0);
    // corpus and concept set only apply to corpus information content
    String corpus = intrinsicIC ? null : this.corpusName;
    String conceptSet = intrinsicIC ? null : this.conceptSetName;
    return this.classifierEvaluationDao.getFeatureRanks(
            lcses,
            corpus,
            conceptSet,
            null,
            intrinsicIC ? IntrinsicInfoContentEvaluator.INTRINSIC_INFOCONTENT
                    : InfoContentEvaluator.INFOCONTENT, null, 0d,
            this.getConceptGraphName());
}
// /**
// * get the concept with the lowest Information Content of all the LCSs.
// * Functionality copied from umls interface.
// *
// * @todo make this configurable/add a parameter - avg/min/max/median?
// * @param lcses
// * @return
// */
// public double getIC(Iterable<String> lcses) {
// double ic = 0;
// for (String lcs : lcses) {
// double ictmp = getIC(lcs);
// if (ic < ictmp)
// ic = ictmp;
// }
// return ic;
// }
//
// public double getIC(String concept1) {
// Double dRetVal = corpusICMap.get(concept1);
// if (dRetVal != null)
// return (double) dRetVal;
// else
// return 0;
// }
/**
 * Find the least common subsumers of two concepts.
 * <p>
 * If both concepts are in the graph, {@code lcses} is cleared and filled
 * with the LCS concept ids. When {@code lcsPaths} is null the LCS cache
 * is used; otherwise {@code lcsPaths} is cleared, the LCSs are computed
 * together with their paths, and the paths are added to {@code lcsPaths}.
 * If either concept is missing, both collections are left untouched.
 *
 * @param concept1
 *            first concept id
 * @param concept2
 *            second concept id
 * @param lcses
 *            set to fill with LCS concept ids
 * @param lcsPaths
 *            list to fill with paths, or null if paths are not needed
 * @return the distance reported by the LCS computation, or 0 if either
 *         concept is not in the graph
 */
public int getLCS(String concept1, String concept2, Set<String> lcses,
        List<LCSPath> lcsPaths) {
    ConcRel first = getConceptGraph().getConceptMap().get(concept1);
    ConcRel second = getConceptGraph().getConceptMap().get(concept2);
    if (first == null || second == null) {
        if (log.isDebugEnabled()) {
            if (first == null)
                log.debug("could not find concept:" + concept1);
            if (second == null)
                log.debug("could not find concept:" + concept2);
        }
        return 0;
    }
    lcses.clear();
    if (lcsPaths == null) {
        // paths are not cached, so the cache only serves path-less lookups
        return getLCSFromCache(first, second, lcses);
    }
    // paths requested - compute the lcses and their paths
    lcsPaths.clear();
    int distance = lcs(concept1, concept2, lcsPaths);
    for (LCSPath path : lcsPaths) {
        lcses.add(path.getLcs());
    }
    return distance;
}
/**
 * @return the cache holding least common subsumers per concept pair
 */
public Cache getLcsCache() {
    return this.lcsCache;
}
/**
 * Get the least common subsumers of two concepts, consulting the LCS
 * cache first and computing and caching them on a miss.
 * <p>
 * The cached value is an {@code Object[]} holding the distance and a
 * private set of LCS concept ids, or null when no LCS exists. The set
 * stored in the cache is never the caller's {@code lcses} instance, so
 * callers may clear and reuse their set without altering cached entries.
 *
 * @param cr1
 *            first concept node
 * @param cr2
 *            second concept node
 * @param lcses
 *            set to which the LCS concept ids are added
 * @return the distance; negative if the concepts have no LCS (-1 when
 *         that result is served from the cache)
 */
@SuppressWarnings("unchecked")
private int getLCSFromCache(ConcRel cr1, ConcRel cr2, Set<String> lcses) {
    OrderedPair<String> cacheKey = new OrderedPair<String>(
            cr1.getConceptID(), cr2.getConceptID());
    Element e = this.lcsCache.get(cacheKey);
    if (e != null) {
        // hit the cache - unpack the lcs
        if (e.getObjectValue() != null) {
            Object[] val = (Object[]) e.getObjectValue();
            lcses.addAll((Set<String>) val[1]);
            return (Integer) val[0];
        } else {
            return -1;
        }
    }
    // missed the cache - compute and save the lcs
    Object[] val = null;
    Set<ConcRel> lcsCRSet = new HashSet<ConcRel>(2);
    int dist = ConcRel.getLeastCommonConcept(cr1, cr2, lcsCRSet, null);
    if (dist >= 0) {
        // cache a set of our own: storing the caller's set would let a
        // later clear() or reuse by the caller corrupt the cached entry
        Set<String> cachedIds = new HashSet<String>();
        for (ConcRel cr : lcsCRSet) {
            cachedIds.add(cr.getConceptID());
        }
        lcses.addAll(cachedIds);
        val = new Object[2];
        val[0] = dist;
        val[1] = cachedIds;
    }
    e = new Element(cacheKey, val);
    this.lcsCache.put(e);
    return dist;
}
/**
 * @return measure name that {@link #loadConceptFilter} combines with the
 *         imputed and propagated suffixes
 */
public String getLcsImputedType() {
    return this.lcsImputedType;
}
/**
 * @return the page rank service handed to the PAGERANK metric
 */
public PageRankService getPageRankService() {
    return this.pageRankService;
}
/**
 * @return the metric implementations keyed by metric enum; null until
 *         the map has been initialized or set
 */
public Map<SimilarityMetricEnum, SimilarityMetric> getSimilarityMetricMap() {
    return this.similarityMetricMap;
}
/**
 * @return the transaction manager used while preloading in
 *         {@link #init()}
 */
public PlatformTransactionManager getTransactionManager() {
    return this.transactionManager;
}
/**
 * @return unmodifiable list of semantic types whose indices are the bit
 *         positions used in the cui-tui bitsets; null if
 *         {@link #initCuiTuiMapFromCorpus()} has not completed
 */
@Override
public List<String> getTuiList() {
    return tuiList;
}
/**
 * Initialize the service: load the concept graph, set up the similarity
 * metrics and, if preloading is enabled, load the corpus information
 * content and the cui-tui map inside a new transaction. Any exception
 * thrown while preloading is logged at info level and otherwise ignored.
 * If no concept graph is found, only a warning is logged.
 */
public void init() {
    log.info("begin initialization for concept graph: " + conceptGraphName);
    cg = conceptDao.getConceptGraph(conceptGraphName);
    if (cg != null) {
        initSimilarityMetricMap();
        if (isPreload()) {
            // work to run inside the transaction
            TransactionCallback<Object> preloadWork = new TransactionCallback<Object>() {
                @Override
                public Object doInTransaction(TransactionStatus arg0) {
                    initInfoContent();
                    initCuiTuiMapFromCorpus();
                    return null;
                }
            };
            try {
                TransactionTemplate template = new TransactionTemplate(
                        this.transactionManager);
                template.setPropagationBehavior(TransactionTemplate.PROPAGATION_REQUIRES_NEW);
                template.execute(preloadWork);
            } catch (Exception e) {
                log.info("could not initialize cui-tui map: "
                        + e.getMessage()
                        + ". This is expected if you do not have umls installed in your db.");
            }
        }
    } else {
        log.warn("concept graph null, name: " + conceptGraphName);
    }
    log.info("end initialization for concept graph: " + conceptGraphName);
}
/**
 * Load the cui-tui pairs of the configured corpus from the evaluation DAO
 * and build two structures from them: {@code tuiList}, an unmodifiable
 * list of all tuis in sorted order, and {@code cuiTuiMap}, an immutable
 * map from cui to a bitset in which bit <i>i</i> is set when the cui has
 * the tui at index <i>i</i> of {@code tuiList}.
 */
public void initCuiTuiMapFromCorpus() {
    // one shared String instance per tui, to save memory
    SortedMap<String, String> sharedTuis = new TreeMap<String, String>();
    Map<String, Set<String>> tuisByCui = new HashMap<String, Set<String>>();
    List<Object[]> cuiTuiRows = this.classifierEvaluationDao
            .getCorpusCuiTuis(this.getCorpusName(),
                    this.getConceptGraphName(), this.getConceptSetName());
    for (Object[] row : cuiTuiRows) {
        addCuiTuiToMap(tuisByCui, sharedTuis, (String) row[0],
                (String) row[1]);
    }
    // assign every tui its bit position, in sorted order
    SortedMap<String, Integer> bitIndexByTui = new TreeMap<String, Integer>();
    List<String> orderedTuis = new ArrayList<String>(sharedTuis.size());
    for (String tui : sharedTuis.keySet()) {
        bitIndexByTui.put(tui, orderedTuis.size());
        orderedTuis.add(tui);
    }
    this.tuiList = Collections.unmodifiableList(orderedTuis);
    // turn every cui's set of tuis into a bitset
    ImmutableMap.Builder<String, BitSet> bitsets = new ImmutableMap.Builder<String, BitSet>();
    for (Map.Entry<String, Set<String>> entry : tuisByCui.entrySet()) {
        bitsets.put(entry.getKey(),
                tuiListToBitset(entry.getValue(), bitIndexByTui));
    }
    this.cuiTuiMap = bitsets.build();
}
/**
 * Load the corpus information content from the evaluation DAO into the
 * {@code corpusICMap} field. Only concepts present in the concept graph
 * are kept, and the graph's own concept id string is used as the key. If
 * the DAO returns null or an empty map, a warning is logged and the field
 * is set to an empty map.
 * <p>
 * TODO replace strings with concept ids from conceptGraph to save memory
 */
private void initInfoContent() {
    log.info("loading corpus infocontent for corpusName=" + corpusName
            + ", conceptGraphName=" + conceptGraphName
            + ", conceptSetName=" + conceptSetName);
    // named differently from the field so the two cannot be confused
    Map<String, Double> loadedIC = classifierEvaluationDao
            .getInfoContent(corpusName, conceptGraphName,
                    this.conceptSetName);
    ImmutableMap.Builder<String, Double> mb = new ImmutableMap.Builder<String, Double>();
    if (loadedIC == null || loadedIC.isEmpty()) {
        // nothing to copy; iterating a null map here would throw
        log.warn("IC not found");
    } else {
        for (Map.Entry<String, Double> corpusICEntry : loadedIC.entrySet()) {
            ConcRel cr = cg.getConceptMap().get(corpusICEntry.getKey());
            // the immutable map builder rejects null values
            if (cr != null && corpusICEntry.getValue() != null) {
                mb.put(cr.getConceptID(), corpusICEntry.getValue());
            }
        }
    }
    this.corpusICMap = mb.build();
}
/**
 * Create the metric implementations and register them in
 * {@code similarityMetricMap}. When the concept graph reports a maximum
 * depth greater than 0, all depth- and information-content-based metrics
 * are registered; otherwise only the PAGERANK metric is registered.
 */
private void initSimilarityMetricMap() {
    log.info("initializing similarity measures");
    double maxIC = this.cg.getIntrinsicICMax();
    int maxDepth = this.cg.getDepthMax();
    Map<SimilarityMetricEnum, SimilarityMetric> registry = new HashMap<SimilarityMetricEnum, SimilarityMetric>(
            SimilarityMetricEnum.values().length);
    // publish the map first, then fill it through the local alias
    this.similarityMetricMap = registry;
    if (maxDepth <= 0) {
        registry.put(SimilarityMetricEnum.PAGERANK, new PageRankMetric(
                this, this.getPageRankService()));
        return;
    }
    registry.put(SimilarityMetricEnum.LCH, new LCHMetric(this, maxDepth));
    registry.put(SimilarityMetricEnum.LIN, new LinMetric(this, false));
    registry.put(SimilarityMetricEnum.INTRINSIC_LIN, new LinMetric(this,
            true));
    registry.put(SimilarityMetricEnum.INTRINSIC_LCH,
            new IntrinsicLCHMetric(this, maxIC));
    registry.put(SimilarityMetricEnum.PATH, new PathMetric(this));
    registry.put(SimilarityMetricEnum.INTRINSIC_PATH,
            new IntrinsicPathMetric(this, maxIC));
    registry.put(SimilarityMetricEnum.RADA, new RadaMetric(this, maxDepth));
    registry.put(SimilarityMetricEnum.INTRINSIC_RADA,
            new IntrinsicRadaMetric(this, maxIC));
    registry.put(SimilarityMetricEnum.SOKAL, new SokalSneathMetric(this));
    registry.put(SimilarityMetricEnum.JACCARD, new JaccardMetric(this));
    registry.put(SimilarityMetricEnum.WUPALMER, new WuPalmerMetric(this));
}
/**
 * @return true if corpus information content and the cui-tui map are
 *         loaded up front by {@link #init()} rather than on demand
 */
public boolean isPreload() {
    return this.preload;
}
/**
 * Compute the least common subsumers of two concepts together with their
 * paths, bypassing the LCS cache.
 *
 * @param concept1
 *            first concept id
 * @param concept2
 *            second concept id
 * @param lcsPaths
 *            list to which the path of every LCS is added
 * @return the distance reported by the LCS computation, or -1 if either
 *         concept is not in the graph
 */
public int lcs(String concept1, String concept2, List<LCSPath> lcsPaths) {
    ConcRel first = cg.getConceptMap().get(concept1);
    ConcRel second = cg.getConceptMap().get(concept2);
    if (first == null || second == null) {
        return -1;
    }
    Set<ConcRel> lcsNodes = new HashSet<ConcRel>();
    Map<ConcRel, LCSPath> pathByLcs = new HashMap<ConcRel, LCSPath>();
    int distance = ConcRel.getLeastCommonConcept(first, second, lcsNodes,
            pathByLcs);
    lcsPaths.addAll(pathByLcs.values());
    return distance;
}
// /*
// * (non-Javadoc)
// *
// * @see
// org.apache.ctakes.ytex.kernel.ConceptSimilarity#lch(java.lang.String,
// * java.lang.String)
// */
// public double lch(String concept1, String concept2) {
// double dm = 2 * cg.getDepthMax() + 1.0;
// ConcRel cr1 = cg.getConceptMap().get(concept1);
// ConcRel cr2 = cg.getConceptMap().get(concept2);
// if (cr1 != null && cr2 != null) {
// Set<String> lcses = new HashSet<String>();
// int lcsDist = getLCSFromCache(cr1, cr2, lcses);
// // leacock is defined as -log([path length]/(2*[depth])
// double lch = -Math.log(((double) lcsDist + 1.0) / dm);
// // scale to depth
// return lch / Math.log(dm);
// } else {
// if (log.isDebugEnabled()) {
// if (cr1 == null)
// log.debug("could not find concept:" + concept1);
// if (cr2 == null)
// log.debug("could not find concept:" + concept2);
// }
// return 0;
// }
// }
/**
 * For the given label and cutoff, load the imputed concepts selected by
 * the propagated cutoff and put each concept with its evaluation into
 * {@code conceptFilter}. Used by lin kernel to find concepts that
 * actually have a non-trivial similarity.
 *
 * @param label
 *            label
 * @param rankCutoff
 *            cutoff
 * @param conceptFilter
 *            map to fill with concept id and evaluation
 * @return the smallest evaluation among the loaded concepts, capped at 1;
 *         1 if no concepts were loaded
 */
@Override
public double loadConceptFilter(String label, int rankCutoff,
        Map<String, Double> conceptFilter) {
    String imputedType = lcsImputedType
            + ImputedFeatureEvaluator.SUFFIX_IMPUTED;
    String propagatedType = lcsImputedType
            + ImputedFeatureEvaluator.SUFFIX_PROP;
    List<FeatureRank> ranked = this.classifierEvaluationDao
            .getImputedFeaturesByPropagatedCutoff(corpusName,
                    conceptSetName, label, imputedType, conceptGraphName,
                    propagatedType, rankCutoff);
    double lowest = 1d;
    for (FeatureRank rank : ranked) {
        conceptFilter.put(rank.getFeatureName(), rank.getEvaluation());
        if (rank.getEvaluation() <= lowest)
            lowest = rank.getEvaluation();
    }
    return lowest;
}
// public double lin(String concept1, String concept2) {
// return filteredLin(concept1, concept2, null);
// }
/**
 * @param cacheManager
 *            the cache manager to store on this service
 */
public void setCacheManager(CacheManager cacheManager) {
this.cacheManager = cacheManager;
}
/**
 * @param classifierEvaluationDao
 *            DAO used for information content, feature ranks and cui-tui
 *            lookups
 */
public void setClassifierEvaluationDao(
ClassifierEvaluationDao classifierEvaluationDao) {
this.classifierEvaluationDao = classifierEvaluationDao;
}
/**
 * @param conceptDao
 *            DAO from which {@link #init()} loads the concept graph
 */
public void setConceptDao(ConceptDao conceptDao) {
this.conceptDao = conceptDao;
}
/**
 * @param conceptGraphName
 *            name of the concept graph that {@link #init()} loads
 */
public void setConceptGraphName(String conceptGraphName) {
this.conceptGraphName = conceptGraphName;
}
/**
 * @param conceptSetName
 *            concept set name passed to the evaluation DAO queries
 */
public void setConceptSetName(String conceptSetName) {
this.conceptSetName = conceptSetName;
}
/**
 * @param corpusName
 *            corpus name passed to the evaluation DAO queries
 */
public void setCorpusName(String corpusName) {
this.corpusName = corpusName;
}
/**
 * @param lcsCache
 *            cache in which least common subsumers are stored per concept
 *            pair
 */
public void setLcsCache(Cache lcsCache) {
this.lcsCache = lcsCache;
}
/**
 * @param lcsImputedType
 *            measure name that {@link #loadConceptFilter} combines with
 *            the imputed and propagated suffixes
 */
public void setLcsImputedType(String lcsImputedType) {
this.lcsImputedType = lcsImputedType;
}
// double minEval = 1d;
// List<FeatureRank> listPropagatedConcepts = classifierEvaluationDao
// .getTopFeatures(corpusName, conceptSetName, label,
// ImputedFeatureEvaluator.MeasureType.INFOGAIN.toString()
// + ImputedFeatureEvaluator.SUFFIX_PROP, 0, 0,
// conceptGraphName, rankCutoff);
// for (FeatureRank r : listPropagatedConcepts) {
// ConcRel cr = cg.getConceptMap().get(r.getFeatureName());
// if (cr != null) {
// addSubtree(conceptFilterSet, cr);
// }
// if (r.getEvaluation() < minEval)
// minEval = r.getEvaluation();
// }
// return minEval;
// }
//
// /**
// * add all children of parent to conceptSet. Limit only to children that
// * actually appear in the corpus
// *
// * @param conceptSet
// * set of concepts to add ids to
// * @param parent
// * parent which will be added to the conceptSet
// * @param corpusICSet
// * set of concepts and hypernyms contained in corpus
// */
// private void addSubtree(Map<String, Double> conceptSet, ConcRel parent) {
// if (!conceptSet.containsKey(parent.getConceptID())
// && conceptFreq.containsKey(parent.getConceptID())) {
// conceptSet.put(parent.getConceptID(), 0d);
// for (ConcRel child : parent.getChildren()) {
// addSubtree(conceptSet, child);
// }
// }
// }
/**
 * @param pageRankService
 *            service handed to the PAGERANK metric when the metrics are
 *            initialized
 */
public void setPageRankService(PageRankService pageRankService) {
this.pageRankService = pageRankService;
}
/**
 * @param preload
 *            true to have {@link #init()} load corpus information content
 *            and the cui-tui map up front; false to fetch corpus
 *            information content from the DAO on demand
 */
public void setPreload(boolean preload) {
this.preload = preload;
}
/**
 * Replace the metric implementations. Note that {@link #init()} builds a
 * fresh map whenever a concept graph is loaded.
 *
 * @param similarityMetricMap
 *            metric implementations keyed by metric enum
 */
public void setSimilarityMetricMap(
Map<SimilarityMetricEnum, SimilarityMetric> similarityMetricMap) {
this.similarityMetricMap = similarityMetricMap;
}
/**
 * @param transactionManager
 *            transaction manager used by {@link #init()} to run the
 *            preload in a new transaction
 */
public void setTransactionManager(
PlatformTransactionManager transactionManager) {
this.transactionManager = transactionManager;
}
/**
 * Compute the requested similarity metrics for every concept pair.
 *
 * @param conceptPairs
 *            pairs to compare
 * @param metrics
 *            metrics to compute for each pair
 * @param conceptFilter
 *            optional concept filter passed on to the metrics; may be null
 * @param lcs
 *            whether LCS paths should be collected
 * @return one result per input pair, in input order
 */
@Override
public List<ConceptPairSimilarity> similarity(
        List<ConceptPair> conceptPairs, List<SimilarityMetricEnum> metrics,
        Map<String, Double> conceptFilter, boolean lcs) {
    List<ConceptPairSimilarity> results = new ArrayList<ConceptPairSimilarity>(
            conceptPairs.size());
    for (ConceptPair pair : conceptPairs) {
        ConceptPairSimilarity pairResult = similarity(metrics,
                pair.getConcept1(), pair.getConcept2(), conceptFilter, lcs);
        results.add(pairResult);
    }
    return results;
}
/**
 * Compute the requested similarity metrics for one concept pair.
 * <p>
 * A fresh {@link SimilarityInfo} is shared by all metrics of the pair;
 * when {@code lcs} is set it is given an empty path list beforehand. If
 * no concept graph is loaded, the list of similarities stays empty.
 *
 * @param metrics
 *            metrics to compute
 * @param concept1
 *            first concept id
 * @param concept2
 *            second concept id
 * @param conceptFilter
 *            optional concept filter passed on to the metrics; may be null
 * @param lcs
 *            whether LCS paths should be collected
 * @return the pair, its similarities in the order of {@code metrics}, and
 *         the similarity info
 */
@Override
public ConceptPairSimilarity similarity(List<SimilarityMetricEnum> metrics,
        String concept1, String concept2,
        Map<String, Double> conceptFilter, boolean lcs) {
    SimilarityInfo info = new SimilarityInfo();
    if (lcs) {
        info.setLcsPaths(new ArrayList<LCSPath>(1));
    }
    List<Double> scores = new ArrayList<Double>(metrics.size());
    if (cg != null) {
        // one score per requested metric, in request order
        for (SimilarityMetricEnum metric : metrics) {
            SimilarityMetric implementation = this.similarityMetricMap
                    .get(metric);
            double score = implementation.similarity(concept1, concept2,
                    conceptFilter, info);
            scores.add(score);
        }
    }
    ConceptPairSimilarity result = new ConceptPairSimilarity();
    result.setConceptPair(new ConceptPair(concept1, concept2));
    result.setSimilarities(scores);
    result.setSimilarityInfo(info);
    return result;
}
/**
 * Convert a set of tuis into a bitset, setting for every tui the bit at
 * the position given by {@code mapTuiIndex}.
 *
 * @param tuis
 *            semantic types to encode; each must be a key of
 *            {@code mapTuiIndex}
 * @param mapTuiIndex
 *            tui to bit position
 * @return bitset with one bit set per tui
 */
private BitSet tuiListToBitset(Set<String> tuis,
        SortedMap<String, Integer> mapTuiIndex) {
    BitSet bits = new BitSet(mapTuiIndex.size());
    Iterator<String> tuiIter = tuis.iterator();
    while (tuiIter.hasNext()) {
        int bitIndex = mapTuiIndex.get(tuiIter.next());
        bits.set(bitIndex);
    }
    return bits;
}
}