JobCreationException,
Exception {
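// Poll interval, in milliseconds, between checks on the progress of the submitted jobs.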
long sleepTime = 500;
aggregateWarning = "true".equalsIgnoreCase(pc.getProperties().getProperty("aggregate.warning"));
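// Compile the physical plan into a plan of map-reduce operators and initialize the
// stats object that will accumulate results across all runs.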
MROperPlan mrp = compile(php, pc);
PigStats stats = new PigStats();
stats.setMROperatorPlan(mrp);
stats.setExecType(pc.getExecType());
stats.setPhysicalPlan(php);
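// Validate the Pig properties, build the Hadoop configuration, and create the
// JobClient and JobControlCompiler used to submit and track the map-reduce jobs.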
ExecutionEngine exe = pc.getExecutionEngine();
ConfigurationValidator.validatePigProperties(exe.getConfiguration());
Configuration conf = ConfigurationUtil.toConfiguration(exe.getConfiguration());
JobClient jobClient = new JobClient(((HExecutionEngine)exe).getJobConf());
JobControlCompiler jcc = new JobControlCompiler(pc, conf);
List<Job> failedJobs = new LinkedList<Job>();
List<Job> completeFailedJobsInThisRun = new LinkedList<Job>();
List<Job> succJobs = new LinkedList<Job>();
JobControl jc;
int totalMRJobs = mrp.size();
int numMRJobsCompl = 0;
double lastProg = -1;
// Create the exception handler for the job control thread;
// it is registered with each job control thread below, before the thread is started.
JobControlThreadExceptionHandler jctExceptionHandler = new JobControlThreadExceptionHandler();
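// Compile and run the plan iteratively: each pass compiles the currently runnable
// map-reduce operators into a JobControl, waits for those jobs to finish, and then
// trims the completed operators from the plan before the next pass.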
while((jc = jcc.compile(mrp, grpName)) != null) {
// Initially, all jobs are in wait state.
List<Job> jobsWithoutIds = jc.getWaitingJobs();
log.info(jobsWithoutIds.size() +" map-reduce job(s) waiting for submission.");
String jobTrackerAdd;
String port;
String jobTrackerLoc;
JobConf jobConf = jobsWithoutIds.get(0).getJobConf();
try {
port = jobConf.get("mapred.job.tracker.http.address");
jobTrackerAdd = jobConf.get(HExecutionEngine.JOB_TRACKER_LOCATION);
jobTrackerLoc = jobTrackerAdd.substring(0,jobTrackerAdd.indexOf(":")) + port.substring(port.indexOf(":"));
}
catch(Exception e){
// Could not get the job tracker location; most probably we are running in local mode.
// In that case we don't print the job tracker location, because it is meaningless
// for local mode.
jobTrackerLoc = null;
log.debug("Failed to get job tracker location.");
}
completeFailedJobsInThisRun.clear();
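// Run the JobControl in its own thread so this thread can poll for progress below;
// any uncaught exception from that thread is captured by the handler and examined
// once the jobs are finished.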
Thread jcThread = new Thread(jc);
jcThread.setUncaughtExceptionHandler(jctExceptionHandler);
// All the setup is done, now let's launch the jobs.
jcThread.start();
// Now wait until all the jobs in this batch have finished.
while(!jc.allFinished()){
try { Thread.sleep(sleepTime); }
catch (InterruptedException e) { /* interrupted while sleeping; ignore and keep polling */ }
List<Job> jobsAssignedIdInThisRun = new ArrayList<Job>();
for(Job job : jobsWithoutIds){
if (job.getAssignedJobID() != null){
jobsAssignedIdInThisRun.add(job);
log.info("HadoopJobId: "+job.getAssignedJobID());
if(jobTrackerLoc != null){
log.info("More information at: http://"+ jobTrackerLoc+
"/jobdetails.jsp?jobid="+job.getAssignedJobID());
}
}
else{
// This job has not been assigned an id yet; check it again on the next poll.
}
}
jobsWithoutIds.removeAll(jobsAssignedIdInThisRun);
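// Overall progress = number of completed map-reduce jobs plus the fractional progress
// of the jobs in the current JobControl, divided by the total number of jobs; log it
// whenever it advances by at least one percent.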
double prog = (numMRJobsCompl+calculateProgress(jc, jobClient))/totalMRJobs;
if(prog>=(lastProg+0.01)){
int perCom = (int)(prog * 100);
if(perCom!=100)
log.info( perCom + "% complete");
}
lastProg = prog;
}
//check for the jobControlException first
//if the job controller fails before launching the jobs then there are
//no jobs to check for failure
if(jobControlException != null) {
if(jobControlException instanceof PigException) {
if(jobControlExceptionStackTrace != null) {
LogUtils.writeLog("Error message from job controller", jobControlExceptionStackTrace,
pc.getProperties().getProperty("pig.logfile"),
log);
}
throw jobControlException;
} else {
int errCode = 2117;
String msg = "Unexpected error when launching map reduce job.";
throw new ExecException(msg, errCode, PigException.BUG, jobControlException);
}
}
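// Some jobs in this batch failed. With stop.on.failure=true we abort right away;
// otherwise we record the failures, mark single-store jobs as complete failures so
// their dependents are not run, and continue with the remaining jobs.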
if (!jc.getFailedJobs().isEmpty()) {
if ("true".equalsIgnoreCase(
pc.getProperties().getProperty("stop.on.failure","false"))) {
int errCode = 6017;
StringBuilder msg = new StringBuilder();
for (int i=0;i<jc.getFailedJobs().size();i++) {
Job j = jc.getFailedJobs().get(i);
msg.append(getFirstLineFromMessage(j.getMessage()));
if (i!=jc.getFailedJobs().size()-1)
msg.append("\n");
}
throw new ExecException(msg.toString(),
errCode, PigException.REMOTE_ENVIRONMENT);
}
// If a job has only one store and it fails, then we know the job failed completely, and we should stop dependent jobs.
for (Job job : jc.getFailedJobs()) {
List<POStore> sts = jcc.getStores(job);
if (sts.size()==1)
completeFailedJobsInThisRun.add(job);
}
failedJobs.addAll(jc.getFailedJobs());
}
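// Update the MR plan for the next pass: operators that no longer need to run are
// removed, and the number removed counts towards the completed-job total used for
// progress reporting.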
int removedMROp = jcc.updateMROpPlan(completeFailedJobsInThisRun);
numMRJobsCompl += removedMROp;
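// Move the outputs of this pass's successful jobs into place, remember them for final
// reporting, fold their statistics into the accumulated PigStats, and stop the
// JobControl before the next compile pass.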
List<Job> jobs = jc.getSuccessfulJobs();
jcc.moveResults(jobs);
succJobs.addAll(jobs);
stats.setJobClient(jobClient);
stats.setJobControl(jc);
stats.accumulateStats();
jc.stop();
}
log.info( "100% complete");
boolean failed = false;
int finalStores = 0;
// Look to see if any jobs failed. If so, we need to report that.
if (failedJobs != null && failedJobs.size() > 0) {
log.error(failedJobs.size()+" map reduce job(s) failed!");
Exception backendException = null;
for (Job fj : failedJobs) {
try {
getStats(fj, jobClient, true, pc);
} catch (Exception e) {
backendException = e;
}
List<POStore> sts = jcc.getStores(fj);
for (POStore st: sts) {
if (!st.isTmpStore()) {
finalStores++;
log.error("Failed to produce result in: \""+st.getSFile().getFileName()+"\"");
}
failedStores.add(st);
failureMap.put(st.getSFile(), backendException);
}
}
failed = true;
}
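// When aggregate.warning is enabled, backend warnings from all successful jobs are
// collected into this map and logged once, after all jobs have been processed.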
Map<Enum, Long> warningAggMap = new HashMap<Enum, Long>();
if(succJobs!=null) {
for(Job job : succJobs){
List<POStore> sts = jcc.getStores(job);
for (POStore st: sts) {
// Currently (as of Feb 3 2010), Hadoop's local mode does not
// call cleanupJob on OutputCommitter (see https://issues.apache.org/jira/browse/MAPREDUCE-1447).
// So to work around that bug, we call setStoreSchema here on
// StoreFuncs which implement StoreMetadata.
/**********************************************************/
// NOTE: THE FOLLOWING IF SHOULD BE REMOVED ONCE MAPREDUCE-1447
// IS FIXED - TestStore.testSetStoreSchema() should fail at
// that time and removing this code should fix it.
/**********************************************************/
if(pc.getExecType() == ExecType.LOCAL) {
storeSchema(job, st);
}
if (!st.isTmpStore()) {
succeededStores.add(st);
finalStores++;
log.info("Successfully stored result in: \""+st.getSFile().getFileName()+"\"");
}
else
log.debug("Successfully stored result in: \""+st.getSFile().getFileName()+"\"");
}
getStats(job,jobClient, false, pc);
if(aggregateWarning) {
computeWarningAggregate(job, jobClient, warningAggMap);
}
}
}
if(aggregateWarning) {
CompilationMessageCollector.logAggregate(warningAggMap, MessageType.Warning, log) ;
}
// Report records and bytes written. Only do this in the single store case. Multi-store
// scripts mess up the stats reporting from hadoop.
List<String> rji = stats.getRootJobIDs();
if ( (rji != null && rji.size() == 1 && finalStores == 1) || pc.getExecType() == ExecType.LOCAL ) {
// currently counters are not working in local mode - see PIG-1286
if(stats.getRecordsWritten()==-1 || pc.getExecType() == ExecType.LOCAL) {
log.info("Records written : Unable to determine number of records written");
} else {
log.info("Records written : " + stats.getRecordsWritten());
}
if(stats.getBytesWritten()==-1 || pc.getExecType() == ExecType.LOCAL) {
log.info("Bytes written : Unable to determine number of bytes written");
} else {
log.info("Bytes written : " + stats.getBytesWritten());
}
if(stats.getSMMSpillCount()==-1) {
log.info("Spillable Memory Manager spill count : Unable to determine spillable memory manager spill count");
} else {
log.info("Spillable Memory Manager spill count : " + stats.getSMMSpillCount());
}
if(stats.getProactiveSpillCount() == -1) {
log.info("Proactive spill count : Unable to determine proactive spill count");
} else {
log.info("Proactive spill count : " + stats.getProactiveSpillCount());
}
}
if (!failed) {
log.info("Success!");