Source Code of org.apache.flume.sink.kite.DatasetSink

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flume.sink.kite;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.security.PrivilegedExceptionAction;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DecoderFactory;
import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.View;
import org.kitesdk.data.spi.Registration;
import org.kitesdk.data.spi.URIBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Experimental sink that writes events to a Kite Dataset. This sink
 * deserializes the body of each incoming event and stores the resulting
 * record in a Kite Dataset. It determines the target Dataset either from a
 * full dataset URI, {@code kite.dataset.uri}, or by opening a repository URI,
 * {@code kite.repo.uri}, and loading a Dataset by name,
 * {@code kite.dataset.name}.
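 * <p>
 * For illustration only, a minimal agent configuration sketch (the agent,
 * sink, channel, and host names below are hypothetical):
 * <pre>
 * a1.sinks.k1.type = org.apache.flume.sink.kite.DatasetSink
 * a1.sinks.k1.channel = c1
 * a1.sinks.k1.kite.repo.uri = repo:hdfs://namenode:8020/data
 * a1.sinks.k1.kite.dataset.name = events
 * </pre>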
*/
public class DatasetSink extends AbstractSink implements Configurable {

  private static final Logger LOG = LoggerFactory.getLogger(DatasetSink.class);

  static Configuration conf = new Configuration();

  private String datasetName = null;
  private long batchSize = DatasetSinkConstants.DEFAULT_BATCH_SIZE;

  private URI target = null;
  private Schema targetSchema = null;
  private DatasetWriter<GenericRecord> writer = null;
  private UserGroupInformation login = null;
  private SinkCounter counter = null;

  // for rolling files at a given interval
  private int rollIntervalS = DatasetSinkConstants.DEFAULT_ROLL_INTERVAL;
  private long lastRolledMs = 0L;

  // for working with avro serialized records
  private GenericRecord datum = null;
  // TODO: remove this after PARQUET-62 is released
  private boolean reuseDatum = true;
  private BinaryDecoder decoder = null;
  private LoadingCache<Schema, DatumReader<GenericRecord>> readers =
      CacheBuilder.newBuilder()
      .build(new CacheLoader<Schema, DatumReader<GenericRecord>>() {
        @Override
        public DatumReader<GenericRecord> load(Schema schema) {
          // the target dataset's schema must be used as the reader schema so
          // that the resulting records can be stored in the dataset
          return new GenericDatumReader<GenericRecord>(
            schema, targetSchema);
        }
      });
  private static LoadingCache<String, Schema> schemasFromLiteral = CacheBuilder
      .newBuilder()
      .build(new CacheLoader<String, Schema>() {
        @Override
        public Schema load(String literal) {
          Preconditions.checkNotNull(literal,
              "Schema literal cannot be null without a Schema URL");
          return new Schema.Parser().parse(literal);
        }
      });
  private static LoadingCache<String, Schema> schemasFromURL = CacheBuilder
      .newBuilder()
      .build(new CacheLoader<String, Schema>() {
        @Override
        public Schema load(String url) throws IOException {
          Schema.Parser parser = new Schema.Parser();
          InputStream is = null;
          try {
            if (url.toLowerCase(Locale.ENGLISH).startsWith("hdfs:/")) {
              // resolve HDFS URLs through the Hadoop FileSystem API; other
              // schemes are opened as plain URLs
              FileSystem fs = FileSystem.get(URI.create(url), conf);
              is = fs.open(new Path(url));
            } else {
              is = new URL(url).openStream();
            }
            return parser.parse(is);
          } finally {
            if (is != null) {
              is.close();
            }
          }
        }
      });

  protected List<String> allowedFormats() {
    return Lists.newArrayList("avro", "parquet");
  }

  @Override
  public void configure(Context context) {
    // initialize login credentials
    this.login = KerberosUtil.login(
        context.getString(DatasetSinkConstants.AUTH_PRINCIPAL),
        context.getString(DatasetSinkConstants.AUTH_KEYTAB));
    String effectiveUser =
        context.getString(DatasetSinkConstants.AUTH_PROXY_USER);
    if (effectiveUser != null) {
      this.login = KerberosUtil.proxyAs(effectiveUser, login);
    }

    String datasetURI = context.getString(
      DatasetSinkConstants.CONFIG_KITE_DATASET_URI);
    if (datasetURI != null) {
      this.target = URI.create(datasetURI);
      this.datasetName = uriToName(target);
    } else {
      String repositoryURI = context.getString(
        DatasetSinkConstants.CONFIG_KITE_REPO_URI);
      Preconditions.checkNotNull(repositoryURI, "Repository URI is missing");
      this.datasetName = context.getString(
        DatasetSinkConstants.CONFIG_KITE_DATASET_NAME);
      Preconditions.checkNotNull(datasetName, "Dataset name is missing");

      this.target = new URIBuilder(repositoryURI, datasetName).build();
    }

    this.setName(target.toString());

    // other configuration
    this.batchSize = context.getLong(
        DatasetSinkConstants.CONFIG_KITE_BATCH_SIZE,
        DatasetSinkConstants.DEFAULT_BATCH_SIZE);
    this.rollIntervalS = context.getInteger(
        DatasetSinkConstants.CONFIG_KITE_ROLL_INTERVAL,
        DatasetSinkConstants.DEFAULT_ROLL_INTERVAL);

    this.counter = new SinkCounter(datasetName);
  }

  @Override
  public synchronized void start() {
    this.lastRolledMs = System.currentTimeMillis();
    counter.start();
    // signal that this sink is ready to process
    LOG.info("Started DatasetSink " + getName());
    super.start();
  }

  /**
   * Causes the sink to roll at the next {@link #process()} call.
   */
  @VisibleForTesting
  public void roll() {
    this.lastRolledMs = 0L;
  }

  @Override
  public synchronized void stop() {
    counter.stop();

    if (writer != null) {
      // any write problems invalidate the writer, which is immediately closed
      writer.close();
      this.writer = null;
      this.lastRolledMs = System.currentTimeMillis();
    }

    // signal that this sink has stopped
    LOG.info("Stopped dataset sink: " + getName());
    super.stop();
  }

  @Override
  public Status process() throws EventDeliveryException {
    if (writer == null) {
      try {
        this.writer = newWriter(login, target);
      } catch (DatasetException e) {
        // DatasetException includes DatasetNotFoundException
        throw new EventDeliveryException(
          "Cannot write to " + getName(), e);
      }
    }

    // handle file rolling
    if ((System.currentTimeMillis() - lastRolledMs) / 1000 > rollIntervalS) {
      // close the current writer and get a new one
      writer.close();
      this.writer = newWriter(login, target);
      this.lastRolledMs = System.currentTimeMillis();
      LOG.info("Rolled writer for " + getName());
    }

    Channel channel = getChannel();
    Transaction transaction = null;
    try {
      long processedEvents = 0;

      transaction = channel.getTransaction();
      transaction.begin();
      for (; processedEvents < batchSize; processedEvents += 1) {
        Event event = channel.take();
        if (event == null) {
          // no events available in the channel
          break;
        }

        this.datum = deserialize(event, reuseDatum ? datum : null);

        // writeEncoded would be an optimization in some cases, but HBase
        // will not support it and partitioned Datasets need to get partition
        // info from the deserialized entity Object; otherwise we could
        // avoid the serialization round-trip.
        writer.write(datum);
      }

      // TODO: Add option to sync, depends on CDK-203
      writer.flush();

      // commit after data has been written and flushed
      transaction.commit();

      if (processedEvents == 0) {
        counter.incrementBatchEmptyCount();
        return Status.BACKOFF;
      } else if (processedEvents < batchSize) {
        counter.incrementBatchUnderflowCount();
      } else {
        counter.incrementBatchCompleteCount();
      }

      counter.addToEventDrainSuccessCount(processedEvents);

      return Status.READY;

    } catch (Throwable th) {
      // catch-all for any unhandled Throwable so that the transaction is
      // correctly rolled back.
      if (transaction != null) {
        try {
          transaction.rollback();
        } catch (Exception ex) {
          LOG.error("Transaction rollback failed", ex);
          throw Throwables.propagate(ex);
        }
      }

      // close the writer and remove its reference
      writer.close();
      this.writer = null;
      this.lastRolledMs = System.currentTimeMillis();

      // handle the exception
      Throwables.propagateIfInstanceOf(th, Error.class);
      Throwables.propagateIfInstanceOf(th, EventDeliveryException.class);
      throw new EventDeliveryException(th);

    } finally {
      if (transaction != null) {
        transaction.close();
      }
    }
  }

  private DatasetWriter<GenericRecord> newWriter(
    final UserGroupInformation login, final URI uri) {
    View<GenericRecord> view = KerberosUtil.runPrivileged(login,
      new PrivilegedExceptionAction<Dataset<GenericRecord>>() {
        @Override
        public Dataset<GenericRecord> run() {
          return Datasets.load(uri);
        }
      });

    DatasetDescriptor descriptor = view.getDataset().getDescriptor();
    String formatName = descriptor.getFormat().getName();
    Preconditions.checkArgument(allowedFormats().contains(formatName),
      "Unsupported format: " + formatName);

    Schema newSchema = descriptor.getSchema();
    if (targetSchema == null || !newSchema.equals(targetSchema)) {
      this.targetSchema = descriptor.getSchema();
      // target dataset schema has changed, invalidate all readers based on it
      readers.invalidateAll();
    }

    this.reuseDatum = !("parquet".equals(formatName));
    this.datasetName = view.getDataset().getName();

    return view.newWriter();
  }

  /**
   * Deserializes the Avro-encoded body of an event into a GenericRecord.
   * Not thread-safe.
   *
   * @param event the event whose body holds an Avro-serialized record
   * @param reuse a previously allocated record to reuse, or null
   * @return the deserialized record
   * @throws EventDeliveryException if the event body cannot be read
   */
  private GenericRecord deserialize(Event event, GenericRecord reuse)
      throws EventDeliveryException {
    decoder = DecoderFactory.get().binaryDecoder(event.getBody(), decoder);
    // no checked exception is thrown in the CacheLoader
    DatumReader<GenericRecord> reader = readers.getUnchecked(schema(event));
    try {
      return reader.read(reuse, decoder);
    } catch (IOException ex) {
      throw new EventDeliveryException("Cannot deserialize event", ex);
    }
  }

  private static Schema schema(Event event) throws EventDeliveryException {
    Map<String, String> headers = event.getHeaders();
    String schemaURL = headers.get(
        DatasetSinkConstants.AVRO_SCHEMA_URL_HEADER);
    try {
      if (schemaURL != null) {
        return schemasFromURL.get(schemaURL);
      } else {
        return schemasFromLiteral.get(
            headers.get(DatasetSinkConstants.AVRO_SCHEMA_LITERAL_HEADER));
      }
    } catch (ExecutionException ex) {
      throw new EventDeliveryException("Cannot get schema", ex.getCause());
    }
  }
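
  /*
   * For reference only: a sketch of how an upstream client might attach the
   * schema information this method looks for. The header constants come from
   * DatasetSinkConstants in this package; the schema location and the
   * avroEncodedBody variable are illustrative assumptions.
   *
   *   Map<String, String> headers = new HashMap<String, String>();
   *   headers.put(DatasetSinkConstants.AVRO_SCHEMA_URL_HEADER,
   *       "hdfs://namenode:8020/schemas/event.avsc");
   *   // or inline the schema JSON instead:
   *   // headers.put(DatasetSinkConstants.AVRO_SCHEMA_LITERAL_HEADER,
   *   //     schema.toString());
   *   Event event = EventBuilder.withBody(avroEncodedBody, headers);
   *
   * Here avroEncodedBody is the Avro binary encoding of a record, e.g.
   * written with a GenericDatumWriter and EncoderFactory#binaryEncoder, and
   * EventBuilder is org.apache.flume.event.EventBuilder.
   */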

  private static String uriToName(URI uri) {
    return Registration.lookupDatasetUri(URI.create(
      uri.getRawSchemeSpecificPart())).second().get("dataset");
  }
}