Package com.twitter.elephantbird.pig.load

Source Code of com.twitter.elephantbird.pig.load.LzoBaseRegexLoader

package com.twitter.elephantbird.pig.load;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.elephantbird.mapreduce.input.LzoTextInputFormat;

/**
* Serves as the base class for all regex-based lzo loading functions.  Each deriving
* class just needs to implement getPattern in some way.
*/

public abstract class LzoBaseRegexLoader extends LzoBaseLoadFunc {
  private static final Logger LOG = LoggerFactory.getLogger(LzoBaseRegexLoader.class);

  protected static final TupleFactory tupleFactory_ = TupleFactory.getInstance();
  protected static final Charset UTF8 = Charset.forName("UTF-8");
  protected static final byte RECORD_DELIMITER = (byte)'\n';
  // This loader tracks the number of matched and unmatched lines as Hadoop counters.
  protected enum LzoBaseRegexLoaderCounters { MatchedRegexLines, UnmatchedRegexLines }

  public LzoBaseRegexLoader() {
    LOG.info("LzoBaseRegexLoader created.");
  }

  /**
   * The regex pattern must be filled out by the inheritor.
   */
  public abstract Pattern getPattern();

  /**
   * Read the file line by line, returning lines the match the regex in Tuples
   * based on the regex match groups.
   */
  @Override
  public Tuple getNext() throws IOException {
    if (reader == null) {
      return null;
    }

    Pattern pattern = getPattern();
    Matcher matcher = pattern.matcher("");
    Object lineObj;
    String line;
    Tuple t = null;
    // Read lines until a match is found, making sure there's no reading past the
    // end of the assigned byte range.
    try {
      while (reader.nextKeyValue()) {

        lineObj = reader.getCurrentValue();

        if (lineObj == null) {
          break;
        }
        line = lineObj.toString();
        matcher = matcher.reset(line);
        // Increment counters for the number of matched and unmatched lines.
        if (matcher.find()) {

          incrCounter(LzoBaseRegexLoaderCounters.MatchedRegexLines, 1L);
          t = tupleFactory_.newTuple(matcher.groupCount());
          for (int i = 1; i <= matcher.groupCount(); i++) {
            if(matcher.group(i) != null) {
              t.set(i - 1, matcher.group(i));
            } else {
              t.set(i - 1, "");
            }
          }
          break;
        } else {
          incrCounter(LzoBaseRegexLoaderCounters.UnmatchedRegexLines, 1L);
          // TODO: stop doing this, as it can slow down the job.
          LOG.debug("No match for line " + line);
        }

        // If the read has walked beyond the end of the split, move on.

      }
    } catch (InterruptedException e) {
      int errCode = 6018;
      String errMsg = "Error while reading input";
      throw new ExecException(errMsg, errCode,
          PigException.REMOTE_ENVIRONMENT, e);
    }

    return t;
  }

  @Override
  public void setLocation(String location, Job job)
  throws IOException {
    FileInputFormat.setInputPaths(job, location);
  }

  @Override
  public InputFormat getInputFormat() {
      return new LzoTextInputFormat();
  }

}
TOP

Related Classes of com.twitter.elephantbird.pig.load.LzoBaseRegexLoader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.