Source Code of com.cloudera.flume.handlers.hdfs.CustomDfsSink

/**
* Licensed to Cloudera, Inc. under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  Cloudera, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.flume.handlers.hdfs;

import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.http.entity.StringEntity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.conf.SinkFactory.SinkBuilder;
import com.cloudera.flume.core.Event;
import com.cloudera.flume.core.EventSink;
import com.cloudera.flume.handlers.hive.MarkerStore;
import com.cloudera.flume.handlers.text.FormatFactory;
import com.cloudera.flume.handlers.text.output.OutputFormat;
import com.cloudera.flume.reporter.ReportEvent;
import com.google.common.base.Preconditions;

/**
* This creates a raw hadoop dfs file that outputs data formatted by the
* provided OutputFormat. It is assumed that the output is a file of some sort.
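* This variant can additionally register the written file as a Hive partition and
* forward events to Elasticsearch (via MarkerStore) when configured to do so.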
*/
public class CustomDfsSink extends EventSink.Base {
  static final Logger LOG = LoggerFactory.getLogger(CustomDfsSink.class);

  private static final String A_OUTPUTFORMAT = "recordformat";
  private static final DateFormat dateFormatDay = new SimpleDateFormat("yyyy-MM-dd");
  private static final DateFormat dateFormatHourMinuteSecond = new SimpleDateFormat("HH-mm-ss");
  private static final DateFormat dateFormatHourMinute = new SimpleDateFormat("HH-mm");
  private static final DateFormat dateFormatHour = new SimpleDateFormat("HH");


  List<StringEntity> stringEntities = new ArrayList<StringEntity>();
  boolean compressOutput, hiveOutput = false;
  OutputFormat format;
  OutputStream writer;
  AtomicLong count = new AtomicLong();
  String path;
  Path dstPath;
  String hiveTableName;
  String machineHostName;
  Calendar cal;
  Event localEvent;
  FlumeConfiguration conf;
  MarkerStore hup;
  String hiveMarkerFolder, hiveMarkerPath;


  StringBuilder sb = new StringBuilder();
  String elasticIndex, elasticType, elasticSearchUrl;
  boolean runMarkerQueries = false;

  public CustomDfsSink(String path, OutputFormat format) {
    Preconditions.checkArgument(path != null);
    Preconditions.checkArgument(format != null);
    this.path = path;
    this.format = format;
    this.writer = null;
    // FlumeConfiguration is needed by deleteEmptyFile() when this sink is closed.
    this.conf = FlumeConfiguration.get();
  }

  public CustomDfsSink(String path, OutputFormat format, Event event) {

    Preconditions.checkArgument(path != null);
    Preconditions.checkArgument(format != null);
    this.path = path;
    this.format = format;
    this.writer = null;
    this.localEvent = event;
    cal = Calendar.getInstance();
    cal.setTimeInMillis(localEvent.getTimestamp());     
    this.conf = FlumeConfiguration.get();
    try {
      machineHostName = InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException e) {
      LOG.error("Error getting hostname for local machine: " + e.getMessage());
    }


  }

  public CustomDfsSink(String path, OutputFormat format, Event event, String hiveTableName) {
    sb = new StringBuilder();   
   

    Preconditions.checkArgument(path != null);
    Preconditions.checkArgument(format != null);
    this.path = path;
    this.localEvent = event;

    cal = Calendar.getInstance();
    cal.setTimeInMillis(localEvent.getTimestamp());     
    this.format = format;
    this.writer = null;
    this.conf = FlumeConfiguration.get();
    this.hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();

    if (StringUtils.isNotBlank(hiveTableName)) {
      this.hiveOutput = true;
      this.hiveTableName = hiveTableName;   
      hup = new MarkerStore(hiveTableName, null, false);
    }

    try {
      machineHostName = InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException e) {
      LOG.error("Error getting hostname for local machine: " + e.getMessage());
    }
  }

  public CustomDfsSink(String path, OutputFormat format, Event event, String hiveTableName, String elasticSearchUrl, String elasticIndex, String elasticType, boolean runMarkerQueries) {
    LOG.info("inside CUSTOMDFSSINK: URL: " + elasticSearchUrl + " Index: " + elasticIndex + " Type: " + elasticType + " RunMarkerQueries: " + runMarkerQueries);

    sb = new StringBuilder();   
    this.elasticSearchUrl = elasticSearchUrl;
    //    if (StringUtils.indexOf(this.elasticSearchUrl, "/", this.elasticSearchUrl.length() - 1) > -1) {
    //      elasticSearchUrl = StringUtils.replaceOnce(StringUtils.reverse(elasticSearchUrl), "/", "");
    //    }
    this.elasticIndex = elasticIndex;
    this.elasticType = elasticType;
    this.runMarkerQueries = runMarkerQueries;

    Preconditions.checkArgument(path != null);
    Preconditions.checkArgument(format != null);
    this.path = path;
    this.localEvent = event;

    cal = Calendar.getInstance();
    cal.setTimeInMillis(localEvent.getTimestamp());     
    this.format = format;
    this.writer = null;
    this.conf = FlumeConfiguration.get();
    this.hiveMarkerFolder = conf.getHiveDefaultMarkerFolder();

    if (StringUtils.isNotBlank(hiveTableName)) {
      this.hiveOutput = true;
      this.hiveTableName = hiveTableName;   
      this.elasticSearchUrl = elasticSearchUrl;
      hup = new MarkerStore(hiveTableName, elasticSearchUrl, runMarkerQueries);
    }

    try {
      machineHostName = InetAddress.getLocalHost().getHostName();
    } catch (UnknownHostException e) {
      LOG.error("Error getting hostname for local machine: " + e.getMessage());
    }
  }

  @Override
  public void append(Event e) throws IOException {
    if (writer == null) {
      throw new IOException("Append failed, did you open the writer?");
    }
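    // Accumulate an Elasticsearch bulk-API entry (action line plus source document) for
    // this event; the buffered batch is sent on close() when ES settings are present.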
    sb.append("{ \"index\" : { \"_index\" : \"" + elasticIndex + "\", \"_type\" : \"" + elasticType + "\" } }\n{ \"" + elasticType + "\" : " + new String(e.getBody()) + " }\n");

    format.format(writer, e);
    count.getAndIncrement();
    super.append(e);

  }

  @Override
  public void close()  {
    try {
      LOG.info("Closing HDFS file: " + dstPath);
      writer.flush();
      LOG.info("done writing raw file to hdfs");
      writer.close();

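      // Ship the accumulated Elasticsearch bulk payload if ES settings were provided.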
      if (StringUtils.isNotBlank(elasticSearchUrl) && StringUtils.isNotBlank(elasticIndex) && StringUtils.isNotBlank(elasticType)) {
        hup.sendESQuery(elasticSearchUrl, sb.toString());
      }


      if (!deleteEmptyFile(dstPath)) {
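        // Register the file's parent directory as a Hive partition; if the HQL fails,
        // leave a marker file behind so the partition can be added later.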
        if (localEvent != null && hiveOutput) {
          String dataFolder = StringUtils.substringBeforeLast(dstPath.toString(),"/");

          //String hqlQuery = "ALTER TABLE " + hiveTableName + " ADD IF NOT EXISTS PARTITION (ds='" + dateFormatDay.format(cal.getTime()) + "', ts='" + dateFormatHour.format(cal.getTime()) + "') LOCATION '" + dataFolder + "'";
          String hqlQuery = "ALTER TABLE " + hiveTableName + " ADD IF NOT EXISTS PARTITION (ds='" + dateFormatDay.format(localEvent.getTimestamp()) + "', ts='" + dateFormatHour.format(localEvent.getTimestamp()) + "') LOCATION '" + dataFolder + "'";         

          LOG.info("HQL Query: " + hqlQuery + "\n\n\n\n\n");

          hiveMarkerPath = hiveMarkerFolder + "/" + machineHostName + "-" + localEvent.getTimestamp() + ".marker";

          if (!hup.runHiveQuery(hqlQuery)) {
            writeHiveMarker(hqlQuery, dstPath.toString(), hiveMarkerFolder, hiveMarkerPath);
          }
          //      boolean hiveMarkerStatus = writeHiveMarker(hqlQuery, dstPath.toString(), hiveMarkerFolder, hiveMarkerPath);
          //
          //      if (hiveMarkerStatus) {
          //        if (hup.runHiveQuery(hqlQuery)) {
          //          hup.cleanHiveMarker(hiveMarkerPath);
          //        }
          //      }
        }
      } else {
        LOG.info("deleted empty file: " + dstPath);
      }
      localEvent = null;
      cal = null;

      writer = null;
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
 
  private boolean deleteEmptyFile(Path dstPath) {
    try {
      FileSystem fs = dstPath.getFileSystem(conf);
      if (fs.getFileStatus(dstPath).getLen() == 0) { //empty file, needs to be deleted
        LOG.info("empty file: " + dstPath);
        return fs.delete(dstPath, false);
      }

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }
    LOG.info("non-empty file: " + dstPath);
    return false;
  }



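  /**
   * Records a Hive partition-registration query that failed, by writing a marker file
   * to HDFS (one tab-separated line: data file path, HQL statement) so the partition
   * can be added later.
   */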
  private boolean writeHiveMarker(String hqlQuery, String filePath, String hiveMarkerFolder, String hiveMarkerPath) {
    LOG.info("writing to hiveMarker: " + hiveMarkerFolder);
    LOG.info("hiveMarkerPath: " + hiveMarkerPath);


    FileSystem hdfs;   
    dstPath = new Path(hiveMarkerFolder);
    try {
      hdfs = dstPath.getFileSystem(conf);

      if (!hdfs.exists(dstPath)) {
        hdfs.mkdirs(dstPath);
      }
      dstPath = new Path(hiveMarkerPath);
      FSDataOutputStream writer_marker = hdfs.create(dstPath);
      writer_marker.writeBytes(filePath + "\t" + hqlQuery + "\n");
      writer_marker.close();
      dstPath = null;
      writer_marker = null;

    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

    return true;


  }




  @Override
  public void open() throws IOException {
    FlumeConfiguration conf = FlumeConfiguration.get();
    FileSystem hdfs;

    // use v0.9.1 compression settings
    if (conf.getCollectorDfsCompressGzipStatus()) {
      LOG.warn("Config property "
          + FlumeConfiguration.COLLECTOR_DFS_COMPRESS_GZIP
          + " is deprecated, please use "
          + FlumeConfiguration.COLLECTOR_DFS_COMPRESS_CODEC
          + " set to GzipCodec instead");
      CompressionCodec gzipC = new GzipCodec();
      Compressor gzCmp = gzipC.createCompressor();
      dstPath = new Path(path + gzipC.getDefaultExtension());
      hdfs = dstPath.getFileSystem(conf);
      writer = hdfs.create(dstPath);
      writer = gzipC.createOutputStream(writer, gzCmp);
      LOG.info("Creating HDFS gzip compressed file: " + dstPath.toString());
      return;
    }

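    // Look up the configured codec by simple class name among the codecs Hadoop can load.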
    String codecName = conf.getCollectorDfsCompressCodec();
    List<Class<? extends CompressionCodec>> codecs = CompressionCodecFactory
    .getCodecClasses(FlumeConfiguration.get());
    CompressionCodec codec = null;
    ArrayList<String> codecStrs = new ArrayList<String>();
    codecStrs.add("None");
    for (Class<? extends CompressionCodec> cls : codecs) {
      codecStrs.add(cls.getSimpleName());

      if (cls.getSimpleName().equals(codecName)) {
        try {
          codec = cls.newInstance();
        } catch (InstantiationException e) {
          LOG.error("Unable to instantiate " + codecName + " class");
        } catch (IllegalAccessException e) {
          LOG.error("Unable to access " + codecName + " class");
        }
      }
    }

    if (codec == null) {
      if (!codecName.equals("None")) {
        LOG.warn("Unsupported compression codec " + codecName
            + ".  Please choose from: " + codecStrs);
      }
      dstPath = new Path(path);
      hdfs = dstPath.getFileSystem(conf);
      writer = hdfs.create(dstPath);
      LOG.info("Creating HDFS file: " + dstPath.toString());
      return;
    }

    Compressor cmp = codec.createCompressor();
    dstPath = new Path(path + codec.getDefaultExtension());
    hdfs = dstPath.getFileSystem(conf);
    writer = hdfs.create(dstPath);
    try {
      writer = codec.createOutputStream(writer, cmp);
    } catch (NullPointerException npe) {
      // Hadoop first tries to find the "native" version of the codec; if that fails, it
      // tries to find the Java version. If there is no Java version, createOutputStream
      // exits via NPE. We catch this and convert it into an IOException with a more
      // useful error message.
      LOG.error("Unable to load compression codec " + codec);
      throw new IOException("Unable to load compression codec " + codec);
    }
    LOG.info("Creating " + codec + " compressed HDFS file: "
        + dstPath.toString());
  }

  public static SinkBuilder builder() {
    return new SinkBuilder() {
      @Override
      public EventSink build(Context context, String... args) {
        if (args.length != 2 && args.length != 1) {
          // TODO (jon) make this message easier.
          throw new IllegalArgumentException(
          "usage: customdfs(\"[(hdfs|file|s3n|...)://namenode[:port]]/path\", \"format\")");
        }

        String format = (args.length == 1) ? null : args[1];
        OutputFormat fmt;
        try {
          fmt = FormatFactory.get().getOutputFormat(format);
        } catch (FlumeSpecException e) {
          LOG.error("failed to load format " + format, e);
          throw new IllegalArgumentException("failed to load format " + format);
        }
        return new CustomDfsSink(args[0], fmt);
      }
    };
  }

  @Override
  public String getName() {
    return "CustomDfs";
  }

  @Override
  public ReportEvent getReport() {
    ReportEvent rpt = super.getReport();
    rpt.setStringMetric(A_OUTPUTFORMAT, format.getBuilder().getName());
    rpt.setLongMetric(ReportEvent.A_COUNT, count.get());
    return rpt;
  }
}
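
For reference, a minimal usage sketch (not part of the original source). It assumes the same imports as the listing above, a reachable HDFS, and that "raw" names an output format registered with FormatFactory; the path, format name, and helper method are illustrative only, mirroring the builder's customdfs("path", "format") spec form.

  // Hypothetical helper, shown only to illustrate the open/append/close lifecycle.
  public static void writeOneFile(Event event) throws IOException, FlumeSpecException {
    OutputFormat fmt = FormatFactory.get().getOutputFormat("raw");
    CustomDfsSink sink = new CustomDfsSink("hdfs://namenode:8020/flume/data-00000", fmt);
    sink.open();         // creates the HDFS file, applying any configured compression codec
    sink.append(event);  // formats the event and writes it to the open file
    sink.close();        // flushes and closes the stream
  }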