results = executeInitialStage();
cascadeStage++;
} else {
String featureID = null;
ScoringFunction scoringFunction = null;
int mSize = -1;
String[][] concepts_this_stage = new String[totalCnt][];
float[] clique_wgts = new float[concepts_this_stage.length];
int cntConcepts = 0;
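// Collect per-clique settings for this cascade stage: pruning function and parameter,
// feature type, scoring function, window size, weight, and the clique's terms.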
for (CascadeClique c : cascadeStages.get(cascadeStage)) {
cnt++;
pruningFunction = c.getPruningFunction();
pruningParameter = c.getPruningParameter();
featureID = c.getParamID().trim(); // termWt, orderedWt, unorderedWt
scoringFunction = c.getScoringFunction();
mSize = c.getWindowSize(); // window width
if (mSize == -1 && !featureID.equals("termWt")) {
throw new RetrievalException("Window-based features require a window size, but none was set for feature " + featureID);
}
concepts_this_stage[cntConcepts] = c.getSingleTerms();
clique_wgts[cntConcepts] = c.getWeight();
cntConcepts++;
subTotal_cascadeCost += c.cost;
}
// for use in pruning
// score-based
float max_score = results[0].score;
float min_score = results[results.length - 1].score;
float score_threshold = (max_score - min_score) * pruningParameter + min_score;
float mean_max_score_threshold = pruningParameter * max_score + (1.0f - pruningParameter) * meanScore;
// rank-based
int retainSize = (int) ((1.0 - pruningParameter) * results.length);
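// Example: with pruningParameter = 0.3, scores spanning [2.0, 10.0], and 100 results,
// score_threshold = (10.0 - 2.0) * 0.3 + 2.0 = 4.4 and retainSize = (int) (0.7 * 100) = 70.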
int size = 0;
// Clear priority queue.
mSortedAccumulators.clear();
float[] termCollectionFreqs = new float[cntConcepts];
float[] termDFs = new float[cntConcepts];
int[][] termIndexes = new int[cntConcepts][];
float sumScore = 0;
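// Gather per-concept statistics: collection frequency, document frequency, and term index
// for term features; the pair of term indexes for proximity features.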
for (int j = 0; j < cntConcepts; j++) {
String[] singleTerms = concepts_this_stage[j];
int termIndex1 = termToCliqueNumber.get(singleTerms[0]);
if (featureID.equals("termWt")) {
float termCollectionFreq = cf.get(singleTerms[0]);
termCollectionFreqs[j] = termCollectionFreq;
float termDF = df.get(singleTerms[0]);
termDFs[j] = termDF;
termIndexes[j] = new int[1];
termIndexes[j][0] = termIndex1;
if (singleTerms.length != 1) {
throw new RetrievalException("Term feature should have exactly 1 term, not " + singleTerms.length);
}
} else {
int termIndex2 = termToCliqueNumber.get(singleTerms[1]);
termIndexes[j] = new int[2];
termIndexes[j][0] = termIndex1;
termIndexes[j][1] = termIndex2;
if (singleTerms.length != 2) {
throw new RetrievalException("Proximity feature should have exactly 2 terms, not " + singleTerms.length);
}
}
}
// iterate over result documents, which are sorted by score in descending order
for (int i = 0; i < results.length; i++) {
// prune first; if the document survives, score it and update pruning stats for the next cascade stage
boolean passedPruning = false;
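// Every pruner retains at least mK documents (unless mK equals defaultNumDocs),
// even when a document fails its rank or score test.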
if (pruningFunction.equals("rank")) {
if (i < retainSize) {
passedPruning = true;
} else {
if (size < mK && mK != defaultNumDocs) {
passedPruning = true;
} else {
break;
}
}
} else if (pruningFunction.equals("score")) {
if (results[i].score > score_threshold) {
passedPruning = true;
} else {
if (size < mK && mK != defaultNumDocs) {
passedPruning = true;
} else {
break;
}
}
} else if (pruningFunction.equals("mean-max")) {
if (results[i].score > mean_max_score_threshold) {
passedPruning = true;
} else {
if (size < mK && mK != defaultNumDocs) {
passedPruning = true;
} else {
break;
}
}
} else {
throw new RetrievalException("Not supported pruner! "+pruningFunction);
}
if (passedPruning) {
size++;
int docIndex = results[i].index_into_keptDocs;
int docLen = keptDocLengths[docIndex];
float docScore_cascade = 0;
for (int j = 0; j < cntConcepts; j++) {
if (featureID.equals("termWt")) {
int termIndex1 = termIndexes[j][0];
int[] positions1 = keptDocs[docIndex][termIndex1];
int tf = 0;
if (positions1 != null) {
tf = positions1.length;
}
docScore_cascade += clique_wgts[j] * scoringFunction.getScore(tf, docLen);
} else { // term proximity
// Merge the two position lists into a single stream and count window matches.
// Assumes there are exactly two terms.
int termIndex1 = termIndexes[j][0];
int termIndex2 = termIndexes[j][1];
int[] positions1 = keptDocs[docIndex][termIndex1];
int[] positions2 = keptDocs[docIndex][termIndex2];
int matches = 0;
if (positions1 != null && positions2 != null) { // both query terms are in the doc
termMatches++;
int[] ids = new int[positions1.length]; // zero-initialized by Java: id 0 marks positions of the first term
int length = positions1.length;
int length2 = positions2.length;
int[] newPositions = new int[length + length2];
int[] newIds = new int[length + length2];
int posA = 0;
int posB = 0;
int ii = 0;
while (ii < length + length2) {
if (posB == length2 || (posA < length && positions1[posA] <= positions2[posB])) {
newPositions[ii] = positions1[posA];
newIds[ii] = ids[posA];
posA++;
} else {
newPositions[ii] = positions2[posB];
newIds[ii] = 1;
posB++;
}
ii++;
}
int[] positions = newPositions;
ids = newIds;
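// Example: positions1 = {3, 9} (id 0) merged with positions2 = {5} (id 1)
// yields positions = {3, 5, 9} and ids = {0, 1, 0}.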
BitSet mMatchedIds = new BitSet(2); // assumes exactly two term ids (0 and 1)
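// Ordered-window matching: from each start position, scan forward until the other term id
// appears; count a match only if it follows the start term in id order within a gap of at most mSize.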
if (featureID.equals("orderedWt")) {
for (ii = 0; ii < positions.length; ii++) {
mMatchedIds.clear();
int maxGap = 0;
boolean ordered = true;
mMatchedIds.set(ids[ii]);
int matchedIDCounts = 1;
int lastMatchedID = ids[ii];
int lastMatchedPos = positions[ii];
for (int jj = ii + 1; jj < positions.length; jj++) {
int curID = ids[jj];
int curPos = positions[jj];
if (!mMatchedIds.get(curID)) {
mMatchedIds.set(curID);
matchedIDCounts++;
if (curID < lastMatchedID) {
ordered = false;
}
if (curPos - lastMatchedPos > maxGap) {
maxGap = curPos - lastMatchedPos;
}
}
// stop looking if the maximum gap is too large
// or the terms appear out of order
if (maxGap > mSize || !ordered) {
break;
}
// did we match all the terms, and in order?
if (matchedIDCounts == 2 && ordered) {
matches++;
break;
}
}
}
} else if (featureID.equals("unorderedWt")) {
for (ii = 0; ii < positions.length; ii++) {
mMatchedIds.clear();
mMatchedIds.set(ids[ii]);
int matchedIDCounts = 1;
int startPos = positions[ii];
for (int jj = ii + 1; jj < positions.length; jj++) {
int curID = ids[jj];
int curPos = positions[jj];
int windowSize = curPos - startPos + 1;
if (!mMatchedIds.get(curID)) {
mMatchedIds.set(curID);
matchedIDCounts++;
}
// stop looking if we've exceeded the maximum window size
if (windowSize > mSize) {
break;
}
// did we match all the terms?
if (matchedIDCounts == 2) {
matches++;
break;
}
}
}
} else {
throw new RetrievalException("Invalid featureID " + featureID);
}
} // end if this is a match, i.e., both query terms are in the doc
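// Re-initialize the scoring function with collection-default cf/df before scoring the proximity matches.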
GlobalTermEvidence termEvidence = scoringFunction.getGlobalTermEvidence();
termEvidence.cf = RetrievalEnvironment.defaultCf;
termEvidence.df = RetrievalEnvironment.defaultDf;
scoringFunction.initialize(termEvidence, scoringFunction.getGlobalEvidence());
docScore_cascade += clique_wgts[j] * scoringFunction.getScore(matches, docLen);
} // end else it's proximity feature
} // end for (each concept)
// accumulate doc score in results[i] across cascade stages