* @throws IOException
*/
static public DefaultQuerySet readTopics(BufferedReader reader,
final boolean quoteCommas) throws IOException {
logger.debug("Reading a topic file");
final DefaultQuerySet querySet = new DefaultQuerySet();
BulletParser bulletParser = new BulletParser(
TRECParsingFactory.INSTANCE);
bulletParser.setCallback(new DefaultCallback() {
TRECTopic topic = null;
MutableString curText = new MutableString();
Element curElement;
/**
 * Appends the character data reported by the parser to the text buffer
 * of the element currently being read.
 *
 * @param text       array holding the characters
 * @param offset     index of the first character to take from {@code text}
 * @param length     number of characters to take
 * @param flowBroken not used by this callback
 * @return always {@code true}
 */
@Override
public boolean characters(char[] text, int offset, int length, boolean flowBroken) {
    // The text is only interpreted later, when the next tag is met (see process()).
    curText.append(text, offset, length);
    return true;
}
/**
 * Handles an opening tag. If a topic is currently open, the text
 * gathered so far is first stored through {@code process()}. A
 * {@code TRECParsingFactory.ELEMENT_TOP} element then opens a fresh
 * topic, and in every case the element becomes the current one.
 *
 * @param element       the element whose opening tag was read
 * @param attrMapUnused the attributes of the tag (ignored)
 * @return always {@code true}
 */
@Override
public boolean startElement(Element element,
        Map<Attribute, MutableString> attrMapUnused) {
    // A new tag terminates the text of the previous element: flush it
    // before anything else so it is attributed to the right field.
    final boolean insideTopic = topic != null;
    if (insideTopic) {
        process();
    }

    final boolean opensTopic = element == TRECParsingFactory.ELEMENT_TOP;
    if (opensTopic) {
        topic = new TRECTopic();
    }

    curElement = element;
    return true;
}
/**
 * Removes {@code prefix} from the beginning of {@code text}, in place,
 * when the text starts with it; otherwise the text is left untouched.
 *
 * @param prefix the leading label to strip (e.g. {@code "Topic: "})
 * @param text   the mutable string to modify
 */
void removePrefix(String prefix, MutableString text) {
    if (!text.startsWith(prefix)) {
        return;
    }
    final int prefixLength = prefix.length();
    text.delete(0, prefixLength);
}
/**
 * Stores the text accumulated since the last tag into the field of the
 * open topic that matches the current element, then clears both the
 * text buffer and the current element.
 *
 * <p>
 * The text is first trimmed, its newlines are replaced by spaces and
 * {@code squeezeSpaces(false)} is applied. The label that starts the
 * text of each field ("Topic: ", "Number: ", "Description: ", ...) is
 * removed when present. Text belonging to any other element (or to no
 * element) is simply discarded.
 *
 * <p>
 * Must only be called while a topic is open ({@code topic != null}),
 * which both callers check before invoking it.
 *
 * @throws NumberFormatException if the text of a number element is not
 *                               a valid decimal integer
 */
private void process() {
    curText.trim();
    curText.replace('\n', ' ');
    curText.squeezeSpaces(false);

    if (curElement == TRECParsingFactory.ELEMENT_TITLE) {
        removePrefix("Topic: ", curText);
        final String title = curText.toString();
        topic.title = quoteCommas ? quoteCommaSeparatedParts(title) : title;
    } else if (curElement == TRECParsingFactory.ELEMENT_NUM) {
        removePrefix("Number: ", curText);
        // Normalise the number by parsing and re-printing it, so that
        // e.g. "051" becomes "51". Integer.parseInt replaces the
        // deprecated Integer(String) constructor and fails in the
        // same way on malformed input.
        topic.id = Integer.toString(Integer.parseInt(curText.toString()));
    } else if (curElement == TRECParsingFactory.ELEMENT_DESC) {
        removePrefix("Description: ", curText);
        topic.description = curText.toString();
    } else if (curElement == TRECParsingFactory.ELEMENT_NARR) {
        // TREC
        removePrefix("Narrative: ", curText);
        topic.narrative = curText.toString();
    } else if (curElement == TRECParsingFactory.ELEMENT_SMRY) {
        removePrefix("Summary: ", curText);
        topic.summary = curText.toString();
    } else if (curElement == TRECParsingFactory.ELEMENT_CON) {
        // TREC 1
        removePrefix("Concepts: ", curText);
        topic.concepts = curText.toString();
    } else if (curElement == TRECParsingFactory.ELEMENT_DEF) {
        // TREC 1: both spellings of the label are accepted
        removePrefix("Definition(s): ", curText);
        removePrefix("Definition: ", curText);
        topic.definitions = curText.toString();
    }

    // Start afresh for the next element
    curElement = null;
    curText.delete(0, curText.length());
}

/**
 * Rewrites a comma-separated title: the parts are joined with a single
 * space, and every part that contains a space is surrounded by double
 * quotes.
 *
 * @param text the title, with parts separated by commas (optionally
 *             surrounded by whitespace)
 * @return the rewritten title
 */
private String quoteCommaSeparatedParts(String text) {
    final StringBuilder builder = new StringBuilder();
    boolean first = true;
    for (String part : text.split("\\s*,\\s*")) {
        if (first) {
            first = false;
        } else {
            builder.append(' ');
        }

        if (part.indexOf(' ') >= 0) {
            builder.append('"').append(part).append('"');
        } else {
            builder.append(part);
        }
    }
    return builder.toString();
}
/**
 * Handles a closing tag. While a topic is open, the pending text is
 * stored through {@code process()}; when the closed element is
 * {@code TRECParsingFactory.ELEMENT_TOP}, the topic is added to the
 * query set (unless it has no identifier, in which case it is skipped
 * with a warning) and the callback goes back to the "no open topic"
 * state.
 *
 * <p>
 * A closing tag met while no topic is open is ignored; previously a
 * stray closing top tag dereferenced the {@code null} topic and threw
 * a {@code NullPointerException}.
 *
 * @param element the element whose closing tag was read
 * @return always {@code true}
 */
@Override
public boolean endElement(Element element) {
    final boolean closesTopic = element == TRECParsingFactory.ELEMENT_TOP;

    if (topic == null) {
        // Nothing to flush or store outside of a topic; a closing top
        // tag here means the input is malformed.
        if (closesTopic) {
            logger.warn("Closing topic tag without an open topic - ignoring");
        }
        return true;
    }

    // Flush the text of the element that just ended
    process();

    if (closesTopic) {
        if (topic.id == null) {
            logger.warn("Topic had no identifier - skipping");
        } else {
            logger.debug(new LazyString("Adding topic %s with title [%s]",
                    topic.id, topic.title));
            querySet.put(topic.id, topic);
        }
        topic = null;
    }
    return true;
}