// TODO: check that phrases in the query are matched in the fragment
Token[] tokens = getTokens(text); // parse text to token array
if (tokens.length == 0)
return new Summary();
String[] terms = query.getTerms();
HashSet highlight = new HashSet(); // put query terms in table
for (int i = 0; i < terms.length; i++)
highlight.add(terms[i]);
// A list to store document's excerpts.
// (An excerpt is a Vector full of Fragments and Highlights)
List excerpts = new ArrayList();
//
// Iterate through all terms in the document
//
int lastExcerptPos = 0;
for (int i = 0; i < tokens.length; i++) {
//
// If we find a term that's in the query...
//
if (highlight.contains(tokens[i].term())) {
//
// Start searching at a point SUM_CONTEXT terms back,
// and move SUM_CONTEXT terms into the future.
//
int startToken = (i > sumContext) ? i - sumContext : 0;
int endToken = Math.min(i + sumContext, tokens.length);
int offset = tokens[startToken].startOffset();
int j = startToken;
//
// Iterate from the start point to the finish, adding
// terms all the way. The end of the passage is always
// SUM_CONTEXT beyond the last query-term.
//
Excerpt excerpt = new Excerpt(i);
if (i != 0) {
excerpt.add(new Summary.Ellipsis());
}
//
// Iterate through as long as we're before the end of
// the document and we haven't hit the max-number-of-items
// -in-a-summary.
//
while ((j < endToken) && (j - startToken < sumLength)) {
//
// Now grab the hit-element, if present
//
Token t = tokens[j];
if (highlight.contains(t.term())) {
excerpt.addToken(t.term());
excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
offset = t.endOffset();
endToken = Math.min(j + sumContext, tokens.length);
}
j++;
}
lastExcerptPos = endToken;
//
// We found the series of search-term hits and added
// them (with intervening text) to the excerpt. Now
// we need to add the trailing edge of text.
//
// So if (j < tokens.length) then there is still trailing
// text to add. (We haven't hit the end of the source doc.)
// Add the words since the last hit-term insert.
//
if (j < tokens.length) {
excerpt.add(new Fragment(text.substring(offset,tokens[j].endOffset())));
}
//
// Remember how many terms are in this excerpt
//
excerpt.setNumTerms(j - startToken);
//
// Store the excerpt for later sorting
//
excerpts.add(excerpt);
//
// Start SUM_CONTEXT places away. The next
// search for relevant excerpts begins at i-SUM_CONTEXT
//
i = j + sumContext;
}
}
// Sort the excerpts based on their score
Collections.sort(excerpts, SCORE_COMPARATOR);
//
// If the target text doesn't appear, then we just
// excerpt the first SUM_LENGTH words from the document.
//
if (excerpts.size() == 0) {
Excerpt excerpt = new Excerpt(0);
int excerptLen = Math.min(sumLength, tokens.length);
lastExcerptPos = excerptLen;
excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
excerpt.setNumTerms(excerptLen);
excerpts.add(excerpt);
}
//
// Now choose the best items from the excerpt set.
// Stop when we have enough excerpts to build our Summary.
//
double tokenCount = 0;
int numExcerpt = excerpts.size()-1;
List bestExcerpts = new ArrayList();
while (tokenCount <= sumLength && numExcerpt >= 0) {
Excerpt excerpt = (Excerpt) excerpts.get(numExcerpt--);
bestExcerpts.add(excerpt);
tokenCount += excerpt.getNumTerms();
}
// Sort the best excerpts based on their natural order
Collections.sort(bestExcerpts, ORDER_COMPARATOR);
//
// Now build our Summary from the best excerpts.
//
tokenCount = 0;
numExcerpt = 0;
Summary s = new Summary();
while (tokenCount <= sumLength && numExcerpt < bestExcerpts.size()) {
Excerpt excerpt = (Excerpt) bestExcerpts.get(numExcerpt++);
double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
Fragment f = (Fragment) e.nextElement();
// Don't add fragments if it takes us over the max-limit
if (tokenCount + tokenFraction <= sumLength) {
s.add(f);
}
tokenCount += tokenFraction;
}
}
if (tokenCount > 0 && lastExcerptPos < tokens.length)
s.add(new Ellipsis());
return s;
}