package edu.jhu.agiga;
import static edu.jhu.agiga.AgigaSentenceReader.require;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import com.ximpleware.VTDException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
/**
 * StreamingVtdXmlReader is an abstract class that enables efficient reading of
 * large Annotated Gigaword files by extracting snippets of XML containing only
 * a single document and passing that XML to an appropriate object iterator such
 * as AgigaDocumentReader or AgigaSentenceReader.
 *
 * This implementation using VTD-XML should handle XML files up to 2GB in size.
 * For larger files, we can switch to extended VTD-XML as described here:
 * <a href="http://vtd-xml.sourceforge.net/codeSample/cs12.html">http://vtd-xml.sourceforge.net/codeSample/cs12.html</a>
 *
 * <p>
 * The reader is single-pass: {@link #iterator()} returns this same object, so
 * the input can be traversed only once. The underlying file is closed
 * automatically when the end of the input is reached; call {@link #close()} to
 * release it earlier if iteration is abandoned.
 *
 * @author mgormley
 *
 * @param <T> the type of item produced by the per-document iterator
 */
public abstract class StreamingVtdXmlReader<T> implements Iterable<T>, Iterator<T>, Closeable {

    private static final Logger log = Logger.getLogger(StreamingVtdXmlReader.class.getName());

    /** Read-ahead limit (in chars) used while peeking at the first line of the file. */
    private static final int HEADER_READ_AHEAD_LIMIT = 1024;

    private String fileId;
    /** Running sum of {@link #getNumSents(Object)} over the items returned by {@link #next()}. */
    private int numSents = 0;
    /** Number of {@code <DOC>} snippets handed to {@link #getIteratorInstance(byte[])} so far. */
    private int numDocs = 0;
    private BufferedReader reader;
    /** Iterator over the items of the most recently parsed document; null before the first one. */
    private Iterator<T> vtdReader;
    /** Set once {@link #reader} has been closed; no further documents are read afterwards. */
    private boolean closed = false;

    /**
     * Opens the given file (transparently gunzipping it when its name ends in
     * {@code .gz}), decodes it as UTF-8 and reads the file id from its first line.
     *
     * @param inputFile path of the file to read
     * @throws RuntimeException wrapping the {@link IOException} or
     *             {@link VTDException} raised while opening the file or parsing
     *             its first line
     */
    public StreamingVtdXmlReader(String inputFile) {
        InputStream inputStream = null;
        boolean success = false;
        try {
            inputStream = new FileInputStream(inputFile);
            if (inputFile.endsWith(".gz")) {
                inputStream = new GZIPInputStream(inputStream);
            }
            reader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
            fileId = getFileId(reader);
            success = true;
        } catch (IOException e) {
            throw new RuntimeException(e);
        } catch (VTDException e) {
            throw new RuntimeException(e);
        } finally {
            if (!success) {
                // Construction failed, so nobody else can ever close the
                // stream: release the file handle here instead of leaking it.
                closeQuietly(inputStream);
            }
        }
    }

    /** Closes the given resource, ignoring a null argument and any failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Only used on a path where another exception is already
            // propagating; that one is the failure worth reporting.
        }
    }

    /**
     * Peeks at the first line of the input, parses it as a stand-alone XML
     * element and returns its file id attribute. The reader is reset afterwards
     * so that the line is seen again by subsequent reads.
     *
     * @param reader the reader positioned at the start of the input
     * @return the value of the {@code AgigaConstants.FILE_ID} attribute of the root element
     * @throws IOException if the input is empty, cannot be read, or cannot be reset
     * @throws VTDException if the first line cannot be parsed or navigated
     */
    private String getFileId(BufferedReader reader) throws IOException, VTDException {
        reader.mark(HEADER_READ_AHEAD_LIMIT);
        String str = reader.readLine();
        if (str == null) {
            // Without this check the literal text "null</FILE>" would be
            // handed to the XML parser.
            throw new IOException("Empty input: expected a header line containing the file id");
        }
        reader.reset();
        // The opening tag alone is not well formed; close it so it can be parsed.
        str += "</FILE>";
        byte[] b = str.getBytes("UTF-8");
        VTDGen vg = new VTDGen();
        vg.setDoc(b);
        vg.parse(false);
        VTDNav vn = vg.getNav();
        require(vn.toElement(VTDNav.ROOT));
        return vn.toString(vn.getAttrVal(AgigaConstants.FILE_ID));
    }

    /**
     * Reads forward to the next complete document, i.e. the lines from one
     * starting with {@code <DOC} through the next one starting with
     * {@code </DOC}, and installs a fresh iterator over it. Lines outside of a
     * document are skipped. When the end of the input is reached the reader is
     * closed.
     *
     * @return true if a document was read, false if the input is exhausted or
     *         the reader has been closed
     * @throws RuntimeException wrapping any {@link IOException} from the
     *             underlying reader
     */
    private boolean nextDoc() {
        if (closed) {
            return false;
        }
        try {
            StringBuilder sb = new StringBuilder();
            boolean isBuilding = false;
            String line;
            while ((line = reader.readLine()) != null) {
                if (isBuilding) {
                    sb.append(line).append("\n");
                    if (line.startsWith("</DOC")) {
                        // Hand the complete snippet to the subclass as UTF-8 bytes.
                        vtdReader = getIteratorInstance(sb.toString().getBytes("UTF-8"));
                        numDocs++;
                        return true;
                    }
                } else if (line.startsWith("<DOC")) {
                    sb.append(line).append("\n");
                    isBuilding = true;
                }
            }
            if (isBuilding) {
                log.warning("Reached end of input inside an unterminated <DOC> element;"
                        + " discarding the partial document (file id: " + fileId + ")");
            }
            // End of input: release the file handle now rather than never.
            close();
            return false;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Creates the iterator over the items of a single document.
     *
     * @param b the UTF-8 encoded text of one document, from its {@code <DOC}
     *            line through its {@code </DOC} line, each line terminated by
     *            a newline character
     * @return an iterator over the items of that document
     */
    protected abstract Iterator<T> getIteratorInstance(byte[] b);

    /**
     * Returns the number of sentences the given item contributes to the running
     * total reported by {@link #getNumSents()}.
     *
     * @param item an item returned by the per-document iterator
     * @return the number of sentences to count for the item
     */
    protected abstract int getNumSents(T item);

    /**
     * Returns this reader itself, so the items can be traversed only once.
     */
    @Override
    public Iterator<T> iterator() {
        return this;
    }

    /**
     * Advances past exhausted or empty documents until one with a remaining
     * item is found.
     *
     * @return true if another item is available, false once the input is
     *         exhausted
     */
    @Override
    public boolean hasNext() {
        while (vtdReader == null || !vtdReader.hasNext()) {
            if (!nextDoc()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the next item and adds its sentence count to the running total.
     *
     * <p>
     * If no document has been loaded, this returns null; otherwise the call is
     * delegated to the current document's iterator, whatever its state.
     * NOTE(review): the {@link Iterator} contract calls for a
     * NoSuchElementException when no element remains; the existing behaviour is
     * kept because callers may depend on it.
     *
     * @return the next item, or null as described above
     */
    @Override
    public T next() {
        // Called for its side effect: moves on to the next non-empty document.
        hasNext();
        T item = vtdReader != null ? vtdReader.next() : null;
        if (item != null) {
            numSents += getNumSents(item);
        }
        return item;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("not implemented");
    }

    /**
     * Closes the underlying file. Calling this more than once has no further
     * effect. After closing, no further documents are read; items of the
     * document that was already parsed remain available.
     *
     * @throws IOException if closing the underlying reader fails
     */
    @Override
    public void close() throws IOException {
        if (!closed) {
            closed = true;
            reader.close();
        }
    }

    /**
     * @return the number of documents parsed so far
     */
    public int getNumDocs() {
        return numDocs;
    }

    /**
     * @return the total sentence count of the items returned by {@link #next()} so far
     */
    public int getNumSents() {
        return numSents;
    }

    /**
     * @return the file id read from the first line of the input
     */
    public String getFileId() {
        return fileId;
    }
}