package edu.umd.cloud9.collection.pmc;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import edu.umd.cloud9.collection.IndexableFileInputFormatOld;
import edu.umd.cloud9.collection.XMLInputFormatOld;
import edu.umd.cloud9.collection.XMLInputFormatOld.XMLRecordReader;
/**
 * Hadoop {@code InputFormat} for PMC article collections, using the old
 * {@code org.apache.hadoop.mapred} API. Each record is a single article,
 * delimited by the XML start and end tags defined in {@link PmcArticle}.
 */
public class PmcArticleInputFormat extends
    IndexableFileInputFormatOld<LongWritable, PmcArticle> {

  /**
   * No job-level configuration is needed; the XML tag keys are set per
   * reader in {@link PmcArticleRecordReader}.
   */
  public void configure(JobConf conf) {
  }

  /**
   * Returns a {@code RecordReader} that deserializes individual PMC articles
   * from the given input split.
   */
  public RecordReader<LongWritable, PmcArticle> getRecordReader(InputSplit inputSplit,
      JobConf conf, Reporter reporter) throws IOException {
    return new PmcArticleRecordReader((FileSplit) inputSplit, conf);
  }

  /**
   * {@code RecordReader} that wraps {@link XMLRecordReader}: the wrapped
   * reader extracts each raw XML block between the article start and end
   * tags, and this class parses that block into a {@link PmcArticle}.
   */
  public static class PmcArticleRecordReader implements
      RecordReader<LongWritable, PmcArticle> {
    private XMLRecordReader mReader;
    private Text mText = new Text();
    private LongWritable mLong = new LongWritable();

    public PmcArticleRecordReader(FileSplit split, JobConf conf) throws IOException {
      // Tell the underlying XML reader which tags delimit a single article.
      conf.set(XMLInputFormatOld.START_TAG_KEY, PmcArticle.XML_START_TAG);
      conf.set(XMLInputFormatOld.END_TAG_KEY, PmcArticle.XML_END_TAG);
      mReader = new XMLRecordReader(split, conf);
    }

    /**
     * Advances to the next article; returns {@code false} at the end of the
     * split. The key is the position reported by the underlying XML reader.
     */
    public boolean next(LongWritable key, PmcArticle value) throws IOException {
      if (!mReader.next(mLong, mText)) {
        return false;
      }
      key.set(mLong.get());
      PmcArticle.readArticle(value, mText.toString());
      return true;
    }
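
    // Illustrative sketch (not from the original source): how a caller
    // typically drives this reader. 'reader' stands for any
    // PmcArticleRecordReader instance.
    //
    //   LongWritable key = reader.createKey();
    //   PmcArticle article = reader.createValue();
    //   while (reader.next(key, article)) {
    //     // ... process one article ...
    //   }
    //   reader.close();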

    public LongWritable createKey() {
      return new LongWritable();
    }

    public PmcArticle createValue() {
      return new PmcArticle();
    }

    public long getPos() throws IOException {
      return mReader.getPos();
    }

    public void close() throws IOException {
      mReader.close();
    }

    /**
     * Fraction of the split consumed so far. Guards against a zero-length
     * split to avoid division by zero.
     */
    public float getProgress() throws IOException {
      if (mReader.getEnd() == mReader.getStart()) {
        return 1.0f;
      }
      return ((float) (mReader.getPos() - mReader.getStart()))
          / ((float) (mReader.getEnd() - mReader.getStart()));
    }
  }
}
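
// Usage sketch (an assumption, not part of the original source): wiring this
// input format into an old-API MapReduce job. MyPmcMapper and the input/output
// paths below are hypothetical placeholders.
//
//   JobConf conf = new JobConf(MyPmcMapper.class);
//   conf.setJobName("pmc-demo");
//   conf.setInputFormat(PmcArticleInputFormat.class);
//   FileInputFormat.setInputPaths(conf, new Path("/path/to/pmc/xml"));
//   FileOutputFormat.setOutputPath(conf, new Path("/path/to/output"));
//   conf.setMapperClass(MyPmcMapper.class);
//   conf.setNumReduceTasks(0);
//   JobClient.runJob(conf);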