package wikipedia;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import xml.XmlPartReaderImpl2;
import java.io.IOException;
/**
 * Record reader that emits one XML fragment per record from a file split.
 *
 * <p>The key of every record is the configured tag name (see {@link #XML_READER_TAG});
 * the value is the text returned by {@code XmlPartReaderImpl2.getNextXmlPart()}.
 * Fragment extraction itself is delegated to {@code XmlPartReaderImpl2}; this class only
 * tracks the split boundaries and the current stream position.
 */
public class WikipediaXmlRecorderReader2 extends RecordReader<Text, Text> {

    /** Configuration key naming the XML tag to extract; defaults to {@code "page"}. */
    public static final String XML_READER_TAG = "xml.reader.tag";

    private FileSplit split;
    /** Byte offset at which this split begins. */
    private long start;
    /** Byte offset one past the last byte of this split. */
    private long end;
    /** Last observed stream position, used for the end-of-split check and for progress. */
    private long position;
    private Text valueText;
    private Text keyText;
    private XmlPartReaderImpl2 xmlPartReader;
    private FSDataInputStream fsDataInputStream;

    /**
     * Opens the split's file and prepares the XML part reader.
     *
     * @param inputSplit the split to read; must be a {@link FileSplit}
     * @param context    task context supplying the configuration
     * @throws IOException if the file cannot be opened
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
        this.split = (FileSplit) inputSplit;
        setStartEndPosition();
        String tagName = getTagName(context);
        fsDataInputStream = getInputStream(context);
        xmlPartReader = new XmlPartReaderImpl2(tagName, fsDataInputStream, start);
        initializeKeyValue(tagName);
    }

    /** Derives the split boundaries and resets the position to the split start. */
    private void setStartEndPosition() {
        start = split.getStart();
        end = start + split.getLength();
        position = start;
    }

    /** Creates the reusable key (fixed to the tag name) and value holders. */
    private void initializeKeyValue(String tagName) {
        keyText = new Text(tagName);
        valueText = new Text();
    }

    /** Reads the tag name from the job configuration, falling back to {@code "page"}. */
    private String getTagName(TaskAttemptContext context) {
        return context.getConfiguration().get(XML_READER_TAG, "page");
    }

    /** Opens the file backing the split using the file system resolved from its path. */
    private FSDataInputStream getInputStream(TaskAttemptContext context) throws IOException {
        Path filePath = split.getPath();
        FileSystem fileSystem = filePath.getFileSystem(context.getConfiguration());
        return fileSystem.open(filePath);
    }

    /**
     * Advances to the next XML fragment, if the split is not exhausted.
     *
     * @return {@code true} if a new value was loaded, {@code false} when the position has
     *         reached the end of the split or the part reader has nothing more to offer
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (position < end && xmlPartReader.hasNext()) {
            valueText.set(xmlPartReader.getNextXmlPart());
            position = fsDataInputStream.getPos() - 1; // minus 1 otherwise misses last part
            return true;
        }
        return false;
    }

    /** @return the key holder, whose content is the configured tag name */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return keyText;
    }

    /** @return the value holder containing the most recently read XML fragment */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return valueText;
    }

    /**
     * Reports how far through the split the reader has advanced.
     *
     * @return a fraction in {@code [0, 1]}; {@code 0} for a zero-length split
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        long length = end - start;
        if (length <= 0) {
            // Zero-length split: avoid 0/0, which would yield NaN.
            return 0.0f;
        }
        // Subtract in long arithmetic before converting so large absolute offsets do not
        // lose precision, then clamp: position is derived from getPos() - 1 and may fall
        // just below start or run past end when a fragment straddles the split boundary.
        float fraction = (position - start) / (float) length;
        return Math.max(0.0f, Math.min(1.0f, fraction));
    }

    /**
     * Releases the XML part reader and the underlying input stream.
     *
     * <p>Safe to call when {@link #initialize} failed part-way or was never invoked.
     * Whether {@code XmlPartReaderImpl2.close()} already closes the stream is not visible
     * from this class; the stream is closed here as well, which is harmless because
     * {@link java.io.Closeable#close()} is specified to have no effect on an
     * already-closed stream.
     */
    @Override
    public void close() throws IOException {
        try {
            if (xmlPartReader != null) {
                xmlPartReader.close();
            }
        } finally {
            if (fsDataInputStream != null) {
                fsDataInputStream.close();
            }
        }
    }
}