* Converts an instance. Produces one output instance per candidate
* phrase extracted from the document text, attaching the classifier's
* keyphrase probability to each, and stable-sorts the results by
* distance, then TFxIDF, then probability.
*/
private FastVector convertInstance(Instance instance, boolean training)
throws Exception {
FastVector vector = new FastVector();
String fileName = instance.stringValue(fileNameAtt);
if (debugMode) {
System.err.println("-- Converting instance for document "
+ fileName);
}
// Get the key phrases for the document
HashMap<String, Counter> hashKeyphrases = null;
if (!instance.isMissing(keyphrasesAtt)) {
String keyphrases = instance.stringValue(keyphrasesAtt);
hashKeyphrases = getGivenKeyphrases(keyphrases);
}
// Get the document text
String documentText = instance.stringValue(documentAtt);
// Compute the candidate topics
HashMap<String, Candidate> candidateList;
if (allCandidates != null && allCandidates.containsKey(instance)) {
candidateList = allCandidates.get(instance);
} else {
candidateList = getCandidates(documentText);
}
System.err.println(candidateList.size() + " candidates ");
// Set indices for key attributes
int tfidfAttIndex = documentAtt + 2;
int distAttIndex = documentAtt + 3;
int probsAttIndex = documentAtt + numFeatures;
int countPos = 0;
int countNeg = 0;
// Go through the phrases and convert them into instances
for (Candidate candidate : candidateList.values()) {
if (candidate.getFrequency() < minOccurFrequency) {
continue;
}
String name = candidate.getName();
String orig = candidate.getBestFullForm();
if (!vocabularyName.equals("none")) {
orig = candidate.getTitle();
}
double[] vals = computeFeatureValues(candidate, training,
hashKeyphrases, candidateList);
Instance inst = new Instance(instance.weight(), vals);
inst.setDataset(classifierData);
// Get probability of a phrase being key phrase
double[] probs = classifier.distributionForInstance(inst);
double prob = probs[0];
if (nominalClassValue) {
prob = probs[1];
}
// Compute attribute values for final instance
double[] newInst = new double[instance.numAttributes()
+ numFeatures + 2];
int pos = 0;
for (int i = 1; i < instance.numAttributes(); i++) {
if (i == documentAtt) {
// output of values for a given phrase:
// Add phrase
int index = outputFormatPeek().attribute(pos)
.addStringValue(name);
newInst[pos++] = index;
// Add original version
if (orig != null) {
index = outputFormatPeek().attribute(pos)
.addStringValue(orig);
} else {
index = outputFormatPeek().attribute(pos)
.addStringValue(name);
}
newInst[pos++] = index;
// Add features
newInst[pos++] = inst.value(tfIndex);
newInst[pos++] = inst.value(idfIndex);
newInst[pos++] = inst.value(tfidfIndex);
newInst[pos++] = inst.value(firstOccurIndex);
newInst[pos++] = inst.value(lastOccurIndex);
newInst[pos++] = inst.value(spreadOccurIndex);
newInst[pos++] = inst.value(domainKeyphIndex);
newInst[pos++] = inst.value(lengthIndex);
newInst[pos++] = inst.value(generalityIndex);
newInst[pos++] = inst.value(nodeDegreeIndex);
newInst[pos++] = inst.value(semRelIndex);
newInst[pos++] = inst.value(wikipKeyphrIndex);
newInst[pos++] = inst.value(invWikipFreqIndex);
newInst[pos++] = inst.value(totalWikipKeyphrIndex);
// Add probability
probsAttIndex = pos;
newInst[pos++] = prob;
// Set rank to missing (computed below)
newInst[pos++] = Instance.missingValue();
} else if (i == keyphrasesAtt) {
newInst[pos++] = inst.classValue();
} else {
newInst[pos++] = instance.value(i);
}
}
Instance ins = new Instance(instance.weight(), newInst);
ins.setDataset(outputFormatPeek());
vector.addElement(ins);
if (inst.classValue() == 0) {
countNeg++;
} else {
countPos++;
}
}
System.err.println(countPos + " positive; " + countNeg
+ " negative instances");
// Sort phrases according to their distance (stable sort)
double[] vals = new double[vector.size()];
for (int i = 0; i < vals.length; i++) {
vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
}
FastVector newVector = new FastVector(vector.size());
int[] sortedIndices = Utils.stableSort(vals);
for (int i = 0; i < vals.length; i++) {
newVector.addElement(vector.elementAt(sortedIndices[i]));
}
vector = newVector;
// Sort phrases according to their tfxidf value (stable sort)
for (int i = 0; i < vals.length; i++) {
vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
}
newVector = new FastVector(vector.size());
sortedIndices = Utils.stableSort(vals);
for (int i = 0; i < vals.length; i++) {
newVector.addElement(vector.elementAt(sortedIndices[i]));
}
vector = newVector;
// Sort phrases according to their probability (stable sort)
for (int i = 0; i < vals.length; i++) {
vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
}
newVector = new FastVector(vector.size());
sortedIndices = Utils.stableSort(vals);
for (int i = 0; i < vals.length; i++) {
newVector.addElement(vector.elementAt(sortedIndices[i]));
}
vector = newVector;
// Compute rank of phrases. Check for subphrases that are ranked
// lower than superphrases and assign probability -1 and set the