// Reads the topics file named by queryfilename: each topic ("document" in
// tokenizer terms) yields one query string and one query identifier, which
// are appended to vecStringQueries and vecStringIds respectively.
// Bail out early (returning false) when the file is missing or unreadable.
if (! Files.exists(queryfilename) || ! Files.canRead(queryfilename)) {
logger.error("The topics file " + queryfilename + " does not exist, or it cannot be read.");
return false;
} else {
br = Files.openFileReader(queryfilename,desiredEncoding);
// The first tag set is built from TREC_QUERY_TAGS, the second from
// EMPTY_TAGS; presumably these are the topic tags and the field tags
// respectively -- confirm against the TRECFullTokenizer constructor.
TRECFullTokenizer queryTokenizer = new TRECFullTokenizer(
new TagSet(TagSet.TREC_QUERY_TAGS),
new TagSet(TagSet.EMPTY_TAGS),
br);
// Topic files frequently omit closing tags, so ask the tokenizer to tolerate that.
queryTokenizer.setIgnoreMissingClosingTags(true);
// Outer loop: one iteration per topic in the file.
while (!queryTokenizer.isEndOfFile()) {
// Query identifier of the current topic; remains null until a token is
// seen inside the docno tag.
String docnoToken = null;
// Accumulates the space-separated tokens that make up the query text.
StringBuilder query = new StringBuilder();
// Both flags start as the negation of IGNORE_DESC_NARR_NAME_TOKENS and are
// never updated in this loop. So when that option is true, EVERY
// "description"/"narrative" heading word in the matching tag is dropped,
// and when it is false nothing is dropped.
boolean seenDescriptionToken = ! IGNORE_DESC_NARR_NAME_TOKENS;
boolean seenNarrativeToken = ! IGNORE_DESC_NARR_NAME_TOKENS;
// Inner loop: one iteration per token of the current topic.
while (!queryTokenizer.isEndOfDocument()) {
String token = queryTokenizer.nextToken();
// Ignore missing/empty tokens and anything inside a tag marked to be skipped.
if (token == null
|| token.length() == 0
|| queryTokenizer.inTagToSkip())
continue;
if (queryTokenizer.inDocnoTag()) {
// The StringTokenizer is built from the trimmed contents of the query
// number tag and split on single spaces; the loop below keeps only the
// LAST piece, so the identifier is always the final word in the tag
// and never an empty string.
StringTokenizer docnoTokens =
new StringTokenizer(token.trim(), " ");
while (docnoTokens.hasMoreTokens())
docnoToken = docnoTokens.nextToken().trim();
} else if (queryTokenizer.inTagToProcess()) {
// Drop the literal heading words "description" (inside a DESC tag) and
// "narrative" (inside a NARR tag), compared case-insensitively, when
// IGNORE_DESC_NARR_NAME_TOKENS is enabled; every other token is appended
// to the query.
// NOTE(review): the previous comment here claimed this check had been
// removed, warned that this would hurt retrieval performance, and
// recommended adding these words to the stopword list instead. The check
// is present below, so that note looks stale -- confirm, and keep the
// stopword-list advice only for the case where the option is disabled.
if (!seenDescriptionToken && queryTokenizer
.currentTag()
.toUpperCase()
.equals("DESC")
&& token.toUpperCase().equals("DESCRIPTION"))
continue;
if (!seenNarrativeToken && queryTokenizer
.currentTag()
.toUpperCase()
.equals("NARR")
&& token.toUpperCase().equals("NARRATIVE"))
continue;
query.append(token);
query.append(' ');
}
}
// Advance to the next topic BEFORE the emptiness check, so that skipping
// an empty topic via continue cannot stall the outer loop on it.
queryTokenizer.nextDocument();
// Topics that produced no query text are not recorded at all.
if (query.length() == 0)
continue;
vecStringQueries.add(query.toString().trim());
// NOTE(review): docnoToken is still null if the topic had query text but no
// token inside the docno tag; trim() would then throw NullPointerException.
// Confirm whether such topics can occur and whether a caller handles it.
vecStringIds.add(docnoToken.trim());