Package pignlproc.storage

Source Code of pignlproc.storage.RawWikipediaLoader

package pignlproc.storage;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import pignlproc.format.WikipediaPageInputFormat;
import pignlproc.format.WikipediaPageInputFormat.WikipediaRecordReader;
import pignlproc.markup.AnnotatingMarkupParser;

/**
* LoadFunc to load the title and raw markup of wikipedia articles from a pig
* script.
*/
public class RawWikipediaLoader extends LoadFunc {

    protected WikipediaRecordReader reader;

    protected TupleFactory tupleFactory;

    protected BagFactory bagFactory;

    protected String languageCode = "en";

    public RawWikipediaLoader() {
    }

    public RawWikipediaLoader(String languageCode) {
        if (languageCode != null && !languageCode.isEmpty()) {
            this.languageCode = languageCode;
        }
    }

    @SuppressWarnings("rawtypes")
    @Override
    public InputFormat getInputFormat() throws IOException {
        return new WikipediaPageInputFormat();
    }

    @Override
    public Tuple getNext() throws IOException {
        try {
            boolean next = reader.nextKeyValue();
            if (!next) {
                return null;
            }

            String title = reader.getCurrentKey().toString();
            String rawMarkup = reader.getCurrentValue().toString();
            String uri = AnnotatingMarkupParser.titleToUri(title, languageCode);

            return tupleFactory.newTupleNoCopy(Arrays.asList(new DataByteArray(
                    title), new DataByteArray(uri),
                    new DataByteArray(rawMarkup)));
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    @SuppressWarnings("rawtypes")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split)
            throws IOException {
        this.reader = (WikipediaPageInputFormat.WikipediaRecordReader) reader;
        tupleFactory = TupleFactory.getInstance();
        bagFactory = BagFactory.getInstance();
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, location);
    }

}
TOP

Related Classes of pignlproc.storage.RawWikipediaLoader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.