package com.ontology2.haruhi;
import static com.google.common.collect.Iterables.*;
import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Maps.newHashMap;
import static com.ontology2.centipede.errors.ExitCodeException.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;
import com.amazonaws.services.ec2.AmazonEC2Client;
import com.amazonaws.services.ec2.model.*;
import com.amazonaws.services.elasticmapreduce.model.*;
import com.amazonaws.services.elasticmapreduce.util.BootstrapActions;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.google.common.base.Joiner;
import com.ontology2.bakemono.pse3.PSE3Options;
import com.ontology2.bakemono.util.CommonOptions;
import com.ontology2.centipede.parser.HasOptions;
import com.ontology2.centipede.parser.OptionParser;
import com.ontology2.haruhi.alert.AlertService;
import com.ontology2.haruhi.emr.NodeType;
import com.ontology2.haruhi.fetchLogs.FetchLogs;
import com.ontology2.haruhi.flows.*;
import com.ontology2.haruhi.ssh.HadoopConfigurationVariable;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.beans.factory.annotation.Autowired;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce;
import com.google.common.collect.Sets;
import com.ontology2.centipede.errors.ExitCodeException;
import org.springframework.context.ApplicationContext;
import javax.annotation.PostConstruct;
import javax.annotation.Resource;
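/**
 * A {@link Cluster} implementation backed by Amazon Elastic MapReduce:  it
 * submits job flows to EMR and polls until they reach a terminal state,
 * fetching the step logs and raising alerts along the way.
 *
 * <p>A rough usage sketch, assuming the surrounding Spring context wires in
 * the AWS clients and a bean for this cluster (the bean names here are
 * hypothetical):
 *
 * <pre>
 * AmazonEMRCluster cluster = context.getBean("smallAwsCluster", AmazonEMRCluster.class);
 * cluster.runJob(bakemonoJar, jarArgs);
 * </pre>
 */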
public class AmazonEMRCluster implements Cluster {
private static Log logger = LogFactory.getLog(AmazonEMRCluster.class);
private final JobFlowInstancesConfig instances;
@Autowired private AmazonEC2Client ec2Client;
@Autowired private AmazonS3Client s3Client;
@Autowired private String awsSoftwareBucket;
@Autowired private AmazonElasticMapReduce emrClient;
@Autowired private StepConfig debugStep;
@Autowired private String awsLogUri;
@Autowired private ApplicationContext applicationContext;
@Autowired private FetchLogs fetchLogs;
@Autowired private AlertService alertService;
@Resource private Map<String,NodeType> awsInstanceMap;
@Resource private String keyPairName;
private final Set<String> doneStates=Sets.newHashSet("COMPLETED","FAILED","TERMINATED");
public AmazonEMRCluster(JobFlowInstancesConfig instances) {
this.instances = instances;
}
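/**
 * If a key pair was configured, attach it to the instance configuration so
 * that (typically) the master node of the cluster can be reached over ssh.
 */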
@PostConstruct
public void init() {
if(this.keyPairName!=null && !this.keyPairName.isEmpty())
instances.setEc2KeyName(keyPairName);
}
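/**
 * Starts a cluster that stays alive when it has no steps to run;  blocks
 * until the cluster reaches the WAITING state (or dies) and returns the
 * job flow id.
 */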
public String createPersistentCluster(String clusterName) throws Exception {
StepConfig[] steps = {
debugStep
};
instances.setKeepJobFlowAliveWhenNoSteps(true);
RunJobFlowRequest that=new RunJobFlowRequest()
.withName(clusterName)
.withSteps(steps)
.withLogUri(awsLogUri)
.withInstances(instances);
RunJobFlowResult result = runJob(that);
pollClusterForCompletion(result, Sets.union(doneStates, Sets.newHashSet("WAITING")));
return result.getJobFlowId();
}
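/**
 * Runs a single Hadoop job on a transient EMR cluster:  validates the jar
 * arguments, launches the job flow, blocks until it completes, and then
 * fetches the logs for the job flow.
 */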
@Override
public void runJob(MavenManagedJar defaultJar,List<String> jarArgs)
throws Exception {
String jarLocation=defaultJar.s3JarLocation(awsSoftwareBucket);
// the first two arguments pick the application inside the jar;  only the
// remaining application options get validated
List<String> appArgs=newArrayList(skip(jarArgs,2));
if(!validateJarArgs(appArgs)) {
throw new Exception("Arguments to JAR were not valid");
}
StepConfig[] steps = {
debugStep,
new StepConfig("main",new HadoopJarStepConfig(jarLocation).withArgs(jarArgs))
};
String jobName = computeJobName(jarArgs);
RunJobFlowRequest that=new RunJobFlowRequest()
.withName(jobName)
.withBootstrapActions(bootstrapActions())
.withSteps(steps)
.withLogUri(awsLogUri)
.withInstances(instances);
RunJobFlowResult result = runJob(that);
pollClusterForCompletion(result);
fetchLogs.run(new String[] {result.getJobFlowId()});
}
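/**
 * Does a cheap sanity check on the jar arguments before paying to spin up
 * a cluster;  only arguments that parse as {@link CommonOptions} can be
 * validated.
 */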
boolean validateJarArgs(List<String> jarArgs) throws IllegalAccessException, URISyntaxException {
HasOptions options=extractOptions(jarArgs);
if(options instanceof CommonOptions) {
return validateCommonOptions((CommonOptions) options);
}
return true; // don't know how to validate anything else
}
private boolean validateCommonOptions(CommonOptions options) throws URISyntaxException {
if(options.input.isEmpty()) {
logger.fatal("No input paths given to jar");
return false;
}
for(String inputPath:options.input) {
if(!validateInputPath(inputPath)) {
logger.fatal("Could not resolve input path:"+inputPath);
return false;
}
}
if(!validateOutputPath(options.output)) {
logger.fatal("Could not resolve output path:"+options.output);
return false;
}
return true;
}
private boolean validateOutputPath(String output) throws URISyntaxException {
if(!output.startsWith("s3n:")) {
logger.warn("Cannot validate input path not in S3: ["+output+"]");
return true;
}
return validateS3NOutputPath(output);
}
private boolean validateInputPath(String inputPath) throws URISyntaxException {
if(!inputPath.startsWith("s3n:")) {
logger.warn("Cannot validate input path not in S3: ["+inputPath+"]");
return true;
}
return validateS3NInputPath(inputPath);
}
private boolean validateS3NInputPath(String inputPath) throws URISyntaxException {
    URI uri=new URI(inputPath);
    String bucketName=uri.getHost();
    String path=uri.getPath();
    return fileExistsInS3(bucketName, path); // input is valid only if it already exists
}
private boolean validateS3NOutputPath(String outputPath) throws URISyntaxException {
    URI uri=new URI(outputPath);
    String bucketName=uri.getHost();
    String path=uri.getPath();
    return !fileExistsInS3(bucketName, path); // output is valid only if nothing is there yet
}
private boolean fileExistsInS3(String bucketName, String path) {
    // list at most one key under the prefix, dropping the leading slash;
    // any match means something already exists at this path
    return !s3Client.listObjects(new ListObjectsRequest()
        .withBucketName(bucketName)
        .withPrefix(path.substring(1))
        .withMaxKeys(1)).getObjectSummaries().isEmpty();
}
private HasOptions extractOptions(List<String> strings) throws IllegalAccessException {
return createOptionParser(getOptionsClass()).parse(strings);
}
//
// TODO: move this factory into the Spring configuration
//
private OptionParser createOptionParser(Class optionsClass) {
OptionParser parser=new OptionParser(optionsClass);
applicationContext.getAutowireCapableBeanFactory().autowireBean(parser);
return parser;
}
private Class getOptionsClass() {
return PSE3Options.class; // currently hard-wired to the options of the PSE3 application
}
private void pollClusterForCompletion(RunJobFlowResult result) throws Exception {
pollClusterForCompletion(result,doneStates);
}
String computeJobName(List<String> jarArgs) {
String jobName = Joiner.on(" ").join(jarArgs);
if (jobName.length()>255) {
jobName=jobName.substring(0,255); // EMR limits the length of job flow names
}
return jobName;
}
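/**
 * Blocks until the job flow reaches one of the given completion states,
 * alerting on start and finish and converting failure states into
 * {@link ExitCodeException}s.
 */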
private void pollClusterForCompletion(RunJobFlowResult result,Set<String> completionStates)
    throws Exception {
    String jobFlowId=result.getJobFlowId();
    logger.info("Created job flow in AWS with id "+jobFlowId);
    alertService.alert("Cluster execution starting for job flow id "+jobFlowId);
    //
    // make it synchronous with polling
    //
    final long checkInterval=60*1000; // poll once a minute
    String state="";
    boolean tagged=false;
    while(true) {
        List<JobFlowDetail> flows=emrClient
            .describeJobFlows(
                new DescribeJobFlowsRequest()
                    .withJobFlowIds(jobFlowId))
            .getJobFlows();
        if(flows.isEmpty()) {
            logger.error("The job flow "+jobFlowId+" can't be seen in the AWS flow list");
            throw new Exception("Job flow "+jobFlowId+" disappeared from the AWS flow list");
        }
        state=flows.get(0).getExecutionStatusDetail().getState();
        if(completionStates.contains(state)) {
            logger.info("Job flow "+jobFlowId+" ended in state "+state);
            break;
        }
        if(state.equals("RUNNING") && !tagged) {
            tagInstancesForJob(jobFlowId);
            tagged=true; // monitor the instances only once
        }
        logger.info("Job flow "+jobFlowId+" reported status "+state);
        Thread.sleep(checkInterval);
    }
    alertService.alert("Cluster execution ending for job flow id "+jobFlowId);
    if(state.isEmpty()) {
        logger.error("We never established communication with the cluster");
        throw ExitCodeException.create(EX_UNAVAILABLE);
    } else if(state.equals("COMPLETED")) {
        logger.info("AWS believes that "+jobFlowId+" successfully completed");
    } else if(state.equals("WAITING")) {
        logger.info("The cluster at "+jobFlowId+" is waiting for job steps");
    } else if(state.equals("FAILED")) {
        logger.error("AWS reports failure of "+jobFlowId);
        throw ExitCodeException.create(EX_SOFTWARE);
    } else {
        logger.error("Process completed in state "+state+", which is not on the done list");
        throw ExitCodeException.create(EX_SOFTWARE);
    }
}
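/**
 * Despite the name, this doesn't add tags:  it looks up the EC2 instances
 * that belong to the job flow and turns on detailed CloudWatch monitoring
 * for them.
 */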
private void tagInstancesForJob(String jobFlowId) {
    // find the EC2 instances that EMR started for this job flow
    DescribeInstancesResult r=ec2Client.describeInstances(
        new DescribeInstancesRequest().withFilters(
            new Filter().withName("tag:aws:elasticmapreduce:job-flow-id").withValues(jobFlowId)
        )
    );
    List<String> clusterInstances=newArrayList();
    for(Reservation that:r.getReservations())
        for(Instance instance:that.getInstances())
            clusterInstances.add(instance.getInstanceId());
    ec2Client.monitorInstances(new MonitorInstancesRequest(clusterInstances));
}
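/**
 * Expands a {@link Flow} into a list of EMR steps and runs them all in a
 * single job flow, polling until completion and fetching the logs.
 */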
@Override
public void runFlow(MavenManagedJar jar, Flow f,List<String> flowArgs) throws Exception {
String jarLocation=jar.s3JarLocation(awsSoftwareBucket);
List<StepConfig> steps = createEmrSteps(f, flowArgs, jarLocation);
String jobName = computeJobName(flowArgs);
RunJobFlowRequest that=new RunJobFlowRequest()
.withName(jobName)
.withBootstrapActions(bootstrapActions())
.withSteps(steps)
.withLogUri(awsLogUri)
.withInstances(instances);
RunJobFlowResult result = runJob(that);
String jobFlowId=result.getJobFlowId();
pollClusterForCompletion(result);
fetchLogs.run(new String[] {jobFlowId});
alertService.alert("Cluster execution complete:\n"+jobName);
}
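/**
 * Builds the configure-hadoop bootstrap action that applies the Hadoop
 * settings appropriate for the slave instance type, or no bootstrap
 * actions at all if we have no settings for that type.
 */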
private Collection<BootstrapActionConfig> bootstrapActions() throws Exception {
Map<HadoopConfigurationVariable, String> params = getHadoopConfigurationVariableStringMap();
if(params.isEmpty()) {
return newArrayList();
}
BootstrapActions.ConfigureHadoop a=
new BootstrapActions().newConfigureHadoop();
for(Map.Entry<HadoopConfigurationVariable,String> that:params.entrySet())
a.withKeyValue(
that.getKey().getConfigFile(),
that.getKey().getKey(),
that.getValue());
return newArrayList(
a.build()
);
}
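/**
 * Looks up Hadoop configuration overrides keyed on the slave instance
 * type;  returns an empty map (meaning "use the AWS defaults") for
 * unknown instance types.
 */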
Map<HadoopConfigurationVariable, String> getHadoopConfigurationVariableStringMap() throws Exception {
Map<HadoopConfigurationVariable,String> out=newHashMap();
String instanceType=instances.getSlaveInstanceType();
if(!awsInstanceMap.containsKey(instanceType)) {
logger.warn("I don't have hadoop configuration settings for ["+instanceType+"] using AWS defaults");
return out;
}
Map<String,String> that=awsInstanceMap.get(instanceType).getHadoopParameters();
for(Map.Entry<String,String> item:that.entrySet())
out.put(new HadoopConfigurationVariable(item.getKey()),item.getValue());
return out;
}
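/**
 * Converts a {@link Flow} into EMR steps, prepending the configured debug
 * step and starting with an empty variable scope.
 */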
List<StepConfig> createEmrSteps(
Flow f,
List<String> flowArgs,
String jarLocation) {
List<StepConfig> steps= newArrayList(debugStep);
steps.addAll(createEmrSteps(
f.generateSteps(flowArgs),
flowArgs,
jarLocation,
new HashMap<String,Object>()
));
return steps;
}
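/**
 * Recursively expands flow steps into EMR steps:  job steps become jar
 * invocations, assignment steps update the variable scope, and foreach
 * steps re-expand their bodies once per loop value.  Each recursion gets
 * a copy of the enclosing scope so assignments don't leak upward.
 */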
List<StepConfig> createEmrSteps(
List<FlowStep> innerSteps,
List<String> flowArgs,
String jarLocation,
Map<String,Object> upperScopeVariables
) {
List<StepConfig> steps= newArrayList();
Map<String,Object> local= newHashMap(upperScopeVariables);
for(FlowStep that:innerSteps) {
    if(that instanceof JobStep) {
        JobStep j=(JobStep) that;
        steps.add(new StepConfig(
            "main"
            ,new HadoopJarStepConfig(jarLocation)
                .withArgs(j.getStepArgs(local,flowArgs)))
        );
    } else if(that instanceof AssignmentStep) {
        AssignmentStep assignment=(AssignmentStep) that;
        local = assignment.process(local, flowArgs);
    } else if(that instanceof ForeachStep) {
        ForeachStep step=(ForeachStep) that;
        for(Object v:step.getValues()) {
            local.put(step.getLoopVar(),v);
            steps.addAll(createEmrSteps(step.getFlowSteps(),flowArgs,jarLocation,local));
        }
    } else {
        throw new RuntimeException("Could not process step of type "+that.getClass());
    }
}
return steps;
}
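/**
 * The single point where job flows actually get submitted to EMR.
 */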
RunJobFlowResult runJob(RunJobFlowRequest that) {
    logger.info("about to create job flow");
    RunJobFlowResult result=emrClient.runJobFlow(that);
    logger.info("got job flow id "+result.getJobFlowId());
    // tagging the job flow is disabled for now:
    // emrClient.addTags(new AddTagsRequest(result.getJobFlowId(), Lists.newArrayList(
    //     new Tag("com.ontology2.jobFlowId", result.getJobFlowId())
    // )));
    return result;
}
}