//Important: ignore empty annotations!
if (annotation.has("annotation") && !annotation.getString("annotation").equals("")) {
SpotClass userAnnotation;
if (annotation.getString("annotation").contains("c"))
userAnnotation = SpotClass.common;
else if(annotation.getString("annotation").contains("p"))
userAnnotation = SpotClass.part;
else
userAnnotation = SpotClass.valid;
addInstance(annotation.getString("@surfaceForm"),
annotation.getInt("@offset"),
taggedText,
annotation.getString("@URI"),
null, null,
userAnnotation
);
}
}
}
break;
case TSV:
// Tab-separated input: every row is handed to addInstance column by column.
CSVReader reader = new CSVReader(new FileReader(file), '\t');
String[] row;
try {
    while ((row = reader.readNext()) != null) {
        try {
            // Column 5 equal to "t" marks the row as valid; any other value is treated as common.
            SpotClass annotation = row[5].equals("t") ? SpotClass.valid : SpotClass.common;
            addInstance(row[0], Integer.parseInt(row[1]), new TaggedText(row[2], getTaggedTokenProvider()), row[3], row[4], row[5], annotation);
        } catch (ArrayIndexOutOfBoundsException ignored) {
            // Deliberate best-effort: a row that triggers an index error
            // (e.g. too few columns) is skipped and reading continues.
        }
    }
} finally {
    // Close the reader even if a row fails (e.g. Integer.parseInt throws
    // NumberFormatException), so the underlying file handle is not leaked.
    reader.close();
}
break;
case CSAW:
// Read and tag the crawled documents found in the "crawledDocs" sub-directory.
File crawledDocs = new File(file, "crawledDocs");
Map<String, TaggedText> textMap = new HashMap<String, TaggedText>();
String[] crawledDocNames = crawledDocs.list();
if (crawledDocNames == null) {
    // File.list() returns null when the path is not a directory or an I/O error occurs;
    // fail with a descriptive error instead of a bare NullPointerException.
    throw new IOException("Cannot list crawled documents in " + crawledDocs);
}
for (String crawledDoc : crawledDocNames) {
    // These entries are excluded from tagging (presumably index/metadata files
    // rather than document texts -- TODO confirm).
    if (crawledDoc.equals("CZdata1") || crawledDoc.equals("docPaths.txt")
            || crawledDoc.equals("13Oct08_allUrls.txt.txt")) {
        continue;
    }

    // Read the whole text file into memory.
    File crawledDocFile = new File(crawledDocs, crawledDoc);
    byte[] buffer = new byte[(int) crawledDocFile.length()];
    int bytesRead = 0;
    BufferedInputStream f = null;
    try {
        f = new BufferedInputStream(new FileInputStream(crawledDocFile));
        // A single read() call may deliver fewer bytes than requested, so keep
        // reading until the buffer is full or the end of the stream is reached.
        while (bytesRead < buffer.length) {
            int chunk = f.read(buffer, bytesRead, buffer.length - bytesRead);
            if (chunk < 0) {
                break;
            }
            bytesRead += chunk;
        }
    } finally {
        if (f != null) try { f.close(); } catch (IOException ignored) { }
    }

    // Decode only the bytes actually read (platform default charset, as before),
    // then register the tagged text under its file name.
    TaggedText text = new TaggedText(new String(buffer, 0, bytesRead), getTaggedTokenProvider());
    textMap.put(crawledDoc, text);
    texts.add(text);
}
/**
* Read the annotations:
*/
File annotationFile = new File(file, "CSAW_Annotations.xml");
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = null;
try {
docBuilder = docBuilderFactory.newDocumentBuilder();
Document doc = docBuilder.parse (annotationFile);
doc.getDocumentElement().normalize();
NodeList annotations = doc.getElementsByTagName("annotation");
for(int i = 0; i < annotations.getLength(); i++) {
Node annotation = annotations.item(i);
NodeList childNodes = annotation.getChildNodes();
String docName = null;
int length = 0;
String wikiName = null;
int offset = 0;