Package com.liveramp.cascading_ext

Source Code of com.liveramp.cascading_ext.CascadingUtil

/**
*  Copyright 2012 LiveRamp
*
*  Licensed under the Apache License, Version 2.0 (the "License");
*  you may not use this file except in compliance with the License.
*  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package com.liveramp.cascading_ext;

import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.FlowStepStrategy;
import cascading.flow.hadoop.HadoopFlowProcess;
import com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy;
import com.liveramp.cascading_ext.bloom.BloomProps;
import com.liveramp.cascading_ext.flow.LoggingFlowConnector;
import com.liveramp.cascading_ext.flow_step_strategy.FlowStepStrategyFactory;
import com.liveramp.cascading_ext.flow_step_strategy.MultiFlowStepStrategy;
import com.liveramp.cascading_ext.flow_step_strategy.RenameJobStrategy;
import com.liveramp.cascading_ext.flow_step_strategy.SimpleFlowStepStrategyFactory;
import com.liveramp.cascading_ext.util.OperationStatsUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.mapred.JobConf;

import java.util.*;

public class CascadingUtil {

  public static final String CASCADING_RUN_ID = "cascading_ext.cascading.run.id";

  private static final CascadingUtil INSTANCE = new CascadingUtil();

  public static CascadingUtil get() {
    return INSTANCE;
  }

  protected CascadingUtil() {
    addDefaultFlowStepStrategy(RenameJobStrategy.class);
    addDefaultFlowStepStrategy(BloomAssemblyStrategy.class);

    defaultProperties.putAll(BloomProps.getDefaultProperties());
  }

  private final Map<Object, Object> defaultProperties = new HashMap<Object, Object>();
  private final List<FlowStepStrategyFactory<JobConf>> defaultFlowStepStrategies = new ArrayList<FlowStepStrategyFactory<JobConf>>();
  private final Set<Class<? extends Serialization>> serializations = new HashSet<Class<? extends Serialization>>();
  private final Map<Integer, Class<?>> serializationTokens = new HashMap<Integer, Class<?>>();

  private transient JobConf conf = null;

  public void setDefaultProperty(Object key, Object value) {
    defaultProperties.put(key, value);
    conf = null;
  }

  public void addDefaultFlowStepStrategy(FlowStepStrategyFactory<JobConf> flowStepStrategyFactory) {
    defaultFlowStepStrategies.add(flowStepStrategyFactory);
  }

  public void addDefaultFlowStepStrategy(Class<? extends FlowStepStrategy<JobConf>> klass) {
    defaultFlowStepStrategies.add(new SimpleFlowStepStrategyFactory(klass));
  }

  public void clearDefaultFlowStepStrategies() {
    defaultFlowStepStrategies.clear();
  }

  public void addSerialization(Class<? extends Serialization> serialization) {
    serializations.add(serialization);
    conf = null;
  }

  public void addSerializationToken(int token, Class<?> klass) {
    if (token < 128) {
      throw new IllegalArgumentException("Serialization tokens must be >= 128 (lower numbers are reserved by Cascading)");
    }

    if (serializationTokens.containsKey(token) && !serializationTokens.get(token).equals(klass)) {
      throw new IllegalArgumentException("Token " + token + " is already assigned to class " + serializationTokens.get(token));
    }

    serializationTokens.put(token, klass);
  }

  private Map<String, String> getSerializationsProperty() {
    // Get the existing serializations
    List<String> strings = new ArrayList<String>();

    String existing = new JobConf().get("io.serializations");
    if (existing != null) {
      strings.add(existing);
    }

    // Append our custom serializations
    for (Class<? extends Serialization> klass : serializations) {
      strings.add(klass.getName());
    }

    return Collections.singletonMap("io.serializations", StringUtils.join(strings, ","));
  }

  private Map<String, String> getSerializationTokensProperty() {
    List<String> strings = new ArrayList<String>();
    for (Map.Entry<Integer, Class<?>> entry : serializationTokens.entrySet()) {
      strings.add(entry.getKey() + "=" + entry.getValue().getName());
    }
    if (strings.isEmpty()) {
      return Collections.emptyMap();
    } else {
      return Collections.singletonMap("cascading.serialization.tokens", StringUtils.join(strings, ","));
    }
  }

  public Map<Object, Object> getDefaultProperties() {
    Map<Object, Object> properties = new HashMap<Object, Object>();
    properties.putAll(getSerializationsProperty());
    properties.putAll(getSerializationTokensProperty());
    properties.putAll(defaultProperties);
    return properties;
  }

  public JobConf getJobConf() {
    if (conf == null) {
      conf = new JobConf();
      setAll(conf, getSerializationsProperty());
      setAll(conf, getSerializationTokensProperty());
    }
    return new JobConf(conf);
  }

  public FlowConnector getFlowConnector() {
    return realGetFlowConnector(Collections.<Object, Object>emptyMap(),
        Collections.<FlowStepStrategy<JobConf>>emptyList());
  }

  public FlowConnector getFlowConnector(Map<Object, Object> properties) {
    return realGetFlowConnector(properties,
        Collections.<FlowStepStrategy<JobConf>>emptyList());
  }

  public FlowConnector getFlowConnector(List<FlowStepStrategy<JobConf>> flowStepStrategies) {
    return realGetFlowConnector(Collections.<Object, Object>emptyMap(),
        flowStepStrategies);
  }

  public FlowConnector getFlowConnector(Map<Object, Object> properties,
                                        List<FlowStepStrategy<JobConf>> flowStepStrategies) {
    return realGetFlowConnector(properties, flowStepStrategies);
  }

  // We extract this method so that the default name based on the stack position makes sense
  private FlowConnector realGetFlowConnector(Map<Object, Object> properties,
                                             List<FlowStepStrategy<JobConf>> flowStepStrategies) {
    //Add in default properties
    Map<Object, Object> combinedProperties = getDefaultProperties();
    combinedProperties.putAll(properties);

    //Add in default flow step strategies
    List<FlowStepStrategy<JobConf>> combinedStrategies = new ArrayList<FlowStepStrategy<JobConf>>(flowStepStrategies);
    for (FlowStepStrategyFactory<JobConf> flowStepStrategyFactory : defaultFlowStepStrategies) {
      combinedStrategies.add(flowStepStrategyFactory.getFlowStepStrategy());
    }

    return new LoggingFlowConnector(combinedProperties,
        new MultiFlowStepStrategy(combinedStrategies),
        OperationStatsUtils.formatStackPosition(OperationStatsUtils.getStackPosition(2)));
  }

  public FlowProcess<JobConf> getFlowProcess() {
    return getFlowProcess(getJobConf());
  }

  public FlowProcess<JobConf> getFlowProcess(JobConf jobConf) {
    return new HadoopFlowProcess(jobConf);
  }

  private void setAll(Configuration conf, Map<String, String> properties) {
    for (Map.Entry<String, String> property : properties.entrySet()) {
      conf.set(property.getKey(), property.getValue());
    }
  }
}
TOP

Related Classes of com.liveramp.cascading_ext.CascadingUtil

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.