*/
private double[] computeFeatureValues(Candidate candidate,
boolean training, HashMap<String, Counter> hashKeyphrases,
HashMap<String, Candidate> candidates) {
Article candidateArticle = candidate.getArticle();
// Compute feature values
double[] newInst = new double[numFeatures + 1];
String name = candidate.getName();
String original = candidate.getBestFullForm();
String title = candidate.getTitle();
// Compute TFxIDF
Counter counterGlobal = (Counter) globalDictionary.get(name);
double globalVal = 0;
if (counterGlobal != null) {
globalVal = counterGlobal.value();
if (training) {
globalVal = globalVal - 1;
}
}
double tf = candidate.getTermFrequency();
double idf = -Math.log((globalVal + 1) / ((double) numDocs + 1));
// System.out.println(candidate + " count: " + candidate.getFrequency() + "
// tf: " + tf + " glob val: " + globalVal + " numDocs: " + numDocs + " idf:
// " + idf);
if (useBasicFeatures) {
newInst[tfidfIndex] = tf * idf;
newInst[firstOccurIndex] = candidate.getFirstOccurrence();
}
if (useFrequencyFeatures) {
newInst[tfIndex] = tf;
newInst[idfIndex] = idf;
}
if (usePositionsFeatures) {
newInst[lastOccurIndex] = candidate.getLastOccurrence();
newInst[spreadOccurIndex] = candidate.getSpread();
}
if (useKeyphrasenessFeature) {
if (vocabularyName.equals("wikipedia")) {
name = title;
}
Counter domainKeyphr = keyphraseDictionary.get(name);
if ((training) && (hashKeyphrases != null)
&& (hashKeyphrases.containsKey(name))) {
newInst[domainKeyphIndex] = domainKeyphr.value() - 1;
} else {
if (domainKeyphr != null) {
newInst[domainKeyphIndex] = domainKeyphr.value();
} else {
newInst[domainKeyphIndex] = 0;
}
}
}
if (useLengthFeature) {
if (original == null) {
System.err.println("Warning! Problem with candidate " + name);
newInst[lengthIndex] = 1.0;
} else {
String[] words = original.split(" ");
newInst[lengthIndex] = (double) words.length;
}
}
if (useNodeDegreeFeature) {
int nodeDegree = 0;
if (vocabularyName.equals("wikipedia")) {
try {
for (int relatedID : candidateArticle.getLinksInIds()) {
if (candidates.containsKey(relatedID + "")) {
nodeDegree++;
}
}
for (int relatedID : candidateArticle.getLinksOutIds()) {
if (candidates.containsKey(relatedID + "")) {
nodeDegree++;
}
}
} catch (SQLException e) {
System.err.println("Error retrieving ids for candidate "+ candidate);
}
} else if (vocabulary != null) {
Vector<String> relatedTerms = vocabulary.getRelated(name);
if (relatedTerms != null) {
for (String relatedTerm : relatedTerms) {
if (candidates.get(relatedTerm) != null)
nodeDegree++;
}
}
}
// if (nodeDegree != 0) {
// System.out.println(candidate + " has node degree " + nodeDegree);
// }
newInst[nodeDegreeIndex] = (double) nodeDegree;
}
if (useBasicWikipediaFeatures && wikipedia != null) {
double wikipKeyphraseness = 0;
if (vocabularyName.equals("wikipedia")) {
wikipKeyphraseness = candidate.getWikipKeyphraseness();
} else {
Anchor anchor = null;
try {
anchor = new Anchor(wikipedia.getDatabase()
.addEscapes(original), null, wikipedia.getDatabase());
if (anchor != null) {
if (anchor.getLinkProbability() != 0) {
wikipKeyphraseness = anchor.getLinkProbability();
}
}
} catch (SQLException e) {
System.err.println("Error retrieving the anchor for " + candidate);
// e.printStackTrace();
}
}
newInst[wikipKeyphrIndex] = wikipKeyphraseness;
newInst[totalWikipKeyphrIndex] = candidate.getTotalWikipKeyphraseness();
// System.out.println(candidate + "\t wikip Keyphr " + newInst[wikipKeyphrIndex] + "\t total wikip Keyphr " + newInst[totalWikipKeyphrIndex]);
}
if (useAllWikipediaFeatures) {
if (candidateArticle == null) {
try {
candidateArticle = wikipedia.getMostLikelyArticle(original,
new CaseFolder());
} catch (SQLException e) {
e.printStackTrace();
}
}
double wikipFrequency = 0;
double generality = 0;
double semRelatedness = 0;
if (candidateArticle != null) {
try {
double pageCount = candidateArticle.getLinksInCount();
wikipFrequency = -Math.log(pageCount / 2000000);
generality = candidateArticle.getGenerality();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (vocabularyName.equals("wikipedia") && candidateArticle != null) {
for (Candidate c : candidates.values()) {
if (!c.equals(candidate)) {
double relatedness = 0;
Article article = c.getArticle();
try {
relatedness = candidateArticle.getRelatednessTo(article);
} catch (SQLException e) {
e.printStackTrace();