Package wikipedia

Source Code of wikipedia.WikipediaXmlRecorderReader2

package wikipedia;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import xml.XmlPartReaderImpl2;

import java.io.IOException;

public class WikipediaXmlRecorderReader2 extends RecordReader<Text, Text> {

    public static final String XML_READER_TAG = "xml.reader.tag";

    private FileSplit split;
    private long start;
    private long end;
    private long position;
    private Text valueText;
    private Text keyText;
    private XmlPartReaderImpl2 xmlPartReader;
    private FSDataInputStream fsDataInputStream;

    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
        this.split = (FileSplit) inputSplit;

        setStartEndPosition();
        String tagName = getTagName(context);
        fsDataInputStream = getInputStream(context);
        xmlPartReader = new XmlPartReaderImpl2(tagName, fsDataInputStream, start);
        initializeKeyValue(tagName);
    }

    private void setStartEndPosition() {
        start = split.getStart();
        end = start + split.getLength();
        position = start;
    }

    private void initializeKeyValue(String tagName) {
        keyText = new Text(tagName);
        valueText = new Text();
    }

    private String getTagName(TaskAttemptContext context) {
        return context.getConfiguration().get(XML_READER_TAG, "page");
    }

    private FSDataInputStream getInputStream(TaskAttemptContext context) throws IOException {
        Path filePath = split.getPath();
        FileSystem fileSystem = filePath.getFileSystem(context.getConfiguration());
        return fileSystem.open(filePath);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if(position < end && xmlPartReader.hasNext()) {

            valueText.set(xmlPartReader.getNextXmlPart());
            position = fsDataInputStream.getPos() - 1; // minus 1 otherwise misses last part

            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return keyText;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return valueText;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return (position - start) / ((float)end - start);
    }

    @Override
    public void close() throws IOException {
        xmlPartReader.close();
    }
}
TOP

Related Classes of wikipedia.WikipediaXmlRecorderReader2

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.