/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.berkeley.chukwa_xtrace;
import org.apache.hadoop.chukwa.ChunkImpl;
import org.apache.hadoop.chukwa.ChukwaArchiveKey;
import org.apache.hadoop.chukwa.extraction.demux.processor.mapper.AbstractProcessor;
import org.apache.hadoop.chukwa.extraction.engine.ChukwaRecord;
import org.apache.hadoop.chukwa.extraction.engine.ChukwaRecordKey;
import org.apache.hadoop.chukwa.extraction.engine.Record;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
import edu.berkeley.xtrace.reporting.Report;
import edu.berkeley.xtrace.*;
/**
 * MapReduce job to reconstruct X-Trace traces from reports that have come
 * through Chukwa demux (or straight out of the Chukwa archive).
 *
 * The map phase unwraps the Chukwa records; the reduce phase does trace
 * reconstruction.
 *
 * We use the X-Trace task ID as the reduce key, so each reduce invocation
 * sees all the reports for exactly one trace.
*
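 * Typical invocation (the jar name here is illustrative):
 * <pre>
 *   hadoop jar chukwa-xtrace.jar edu.berkeley.chukwa_xtrace.XtrExtract \
 *       [input dir] [output dir]
 * </pre>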
*/
public class XtrExtract extends Configured implements Tool {
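  /** Synthetic field added to reports during reconstruction to record out-links (children). */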
public static final String OUTLINK_FIELD = "__xtr_outlinks";
static Logger log = Logger.getLogger(XtrExtract.class);
  /**
   * With more than 50,000 reports in a single trace, we would switch to an
   * on-disk sort instead of the in-memory topological sort. (The on-disk
   * path is not implemented yet; oversized traces are currently skipped.)
   */
  static final int MAX_IN_MEMORY_REPORTS = 50 * 1000;
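  /**
   * Map phase: accepts either raw archive pairs (ChukwaArchiveKey, ChunkImpl)
   * or demux output pairs (ChukwaRecordKey, ChukwaRecord); emits the X-Trace
   * task ID (as bytes) mapped to the raw report text.
   */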
  public static class MapClass extends Mapper<Object, Object, BytesWritable, Text> {
public MapClass() {
      log.info("starting xtrace map");
}
@Override
    protected void map(Object k, Object v,
        Mapper<Object, Object, BytesWritable, Text>.Context context)
throws IOException, InterruptedException
{
Text t;
BytesWritable bw;
if(k instanceof ChukwaArchiveKey && v instanceof ChunkImpl) {
ChunkImpl value = (ChunkImpl) v;
Report xtrReport = Report.createFromString(new String(value.getData()));
bw = new BytesWritable(xtrReport.getMetadata().getTaskId().get());
//FIXME: can probably optimize the above lines by doing a search in the raw bytes
        t = new Text(value.getData());
} else if(k instanceof ChukwaRecordKey && v instanceof ChukwaRecord){
ChukwaRecord value = (ChukwaRecord) v;
Report xtrReport = Report.createFromString(value.getValue(Record.bodyField));
bw = new BytesWritable(xtrReport.getMetadata().getTaskId().get());
//FIXME: can probably optimize the above lines by doing a search in the raw bytes
        t = new Text(value.getValue(Record.bodyField));
} else {
log.error("unexpected key/value types: "+ k.getClass().getCanonicalName()
+ " and " + v.getClass().getCanonicalName() );
return;
}
context.write(bw, t);
}
}
  public static class Reduce extends Reducer<BytesWritable, Text, BytesWritable, ArrayWritable> {
public Reduce() {}
    /**
     * Reconstructs the trace for one task: topologically sorts the task's
     * reports and writes them out as a single ArrayWritable.
     *
     * Note that loading everything into hash tables means we implicitly
     * suppress duplicate-but-identical reports.
     */
    @Override
    protected void reduce(BytesWritable taskID, Iterable<Text> values,
        Reducer<BytesWritable, Text, BytesWritable, ArrayWritable>.Context context)
throws IOException, InterruptedException
{
      // BytesWritable's backing array may be longer than the logical length;
      // copy only the valid bytes.
      String taskIDString = new String(taskID.getBytes(), 0, taskID.getLength());
      // reports within this trace, keyed by OpId string
      HashMap<String, Report> reports = new LinkedHashMap<String, Report>();
Counter reportCounter = context.getCounter("app", "distinct reports");
Counter edgeCounter = context.getCounter("app", "edges");
Counter badEdgeCounter = context.getCounter("app", "reference to missing report");
Counter dupCounter = context.getCounter("app", "duplicate report");
int edgeCount = 0, dups = 0, numReports = 0;
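      // First pass: parse each report in this trace, deduplicating by OpId.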
      for(Text repText: values) {
        Report r = Report.createFromString(repText.toString());
        numReports++;
        if(numReports < MAX_IN_MEMORY_REPORTS) {
          if(reports.containsKey(r.getMetadata().getOpIdString()))
            dups++;
          reports.put(r.getMetadata().getOpIdString(), r);
        } else {
          // The external (on-disk) sort suggested above is not implemented
          // yet; rather than risk running out of memory, give up on this
          // oversized trace.
          log.warn(taskIDString + ": more than " + MAX_IN_MEMORY_REPORTS
              + " reports in one trace; skipping it");
          return;
        }
      }
      HashMap<String, Integer> counts = new HashMap<String, Integer>();
      Queue<Report> zeroInlinkReports = new LinkedList<Report>();
      reportCounter.increment(reports.size());
      dupCounter.increment(dups);
      // Second pass: count each report's in-edges (parents) and record the
      // corresponding out-link on each parent.
for(Report r: reports.values()){
String myOpID = r.getMetadata().getOpIdString();
int parentCount = 0;
for(String inLink: r.get("Edge")) {
//sanitize data from old, nonconformant C++ implementation
if(inLink.contains(","))
inLink = inLink.substring(0, inLink.indexOf(','));
Report parent = reports.get(inLink);
if(parent != null) {
parent.put(OUTLINK_FIELD, myOpID);
parentCount++;
edgeCount++;
}
          else { // no matching report in this trace
            // An all-zero op ID is X-Trace's "no parent" marker for the
            // start of a trace; anything else is a genuinely dangling edge.
            if(!inLink.equals("0000000000000000")) {
              log.info("no sign of parent: " + inLink);
              badEdgeCounter.increment(1);
            }
          }
}
        //if this report has no parents, it's a root of the sort; enqueue it
if(parentCount == 0)
zeroInlinkReports.add(r);
else
counts.put(myOpID, parentCount);
}
log.debug(taskIDString+": " + edgeCount + " total edges");
edgeCounter.increment(edgeCount);
      //At this point we have a map from op ID to report, and a map from
      //op ID to in-link count; next step is the topological sort.
Text[] finalOutput = new Text[reports.size()];
log.debug(taskIDString+": expecting to sort " + finalOutput.length + " reports");
int i=0;
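      // Kahn's algorithm: repeatedly emit a report whose parents have all
      // been emitted, decrementing each of its children's pending counts.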
while(!zeroInlinkReports.isEmpty()) {
Report r = zeroInlinkReports.poll();
if(r == null) {
log.warn(taskIDString+": poll returned null but list not empty. This is probably a bug"
+ " fairly deep down");
break;
}
finalOutput[i++] = new Text(r.toString());
List<String> outLinks = r.get(OUTLINK_FIELD);
if(outLinks != null) {
for(String outLink: outLinks) {
            Integer oldCount = counts.get(outLink);
            if(oldCount == null) {
              oldCount = 0; //FIXME: can this happen?
              log.warn(taskIDString+": found an in-edge where none was expected");
            }
            if(oldCount == 1) { // last pending parent emitted; child is now a root
              zeroInlinkReports.add(reports.get(outLink));
            }
            counts.put(outLink, oldCount - 1);
}
}
}
      if(i != finalOutput.length) {
        if(i > 0)
          log.warn(taskIDString+": only sorted " + i + " items but expected "
              + finalOutput.length + "; is your graph cyclic?");
        else
          log.warn(taskIDString+": every event in graph has a predecessor; perhaps "
              + "the start event isn't in the input set?");
        // Truncate before writing: ArrayWritable.write() would throw a
        // NullPointerException on the trailing null entries.
        finalOutput = Arrays.copyOf(finalOutput, i);
      }
      context.write(taskID, new ArrayWritable(Text.class, finalOutput));
} //end reduce
}//end reduce class
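  /**
   * Configures and submits the job: sequence-file input of Chukwa records,
   * sequence-file output of (task ID, topologically sorted reports) pairs.
   */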
@Override
  public int run(String[] arg) throws Exception {
    if(arg.length != 2) {
      System.err.println("usage: XtrExtract [input dir] [output dir]");
      return 1;
    }
    Job extractor = new Job(getConf());
extractor.setMapperClass(MapClass.class);
extractor.setReducerClass(Reduce.class);
extractor.setJobName("x-trace reconstructor");
extractor.setJarByClass(this.getClass());
extractor.setMapOutputKeyClass(BytesWritable.class);
extractor.setMapOutputValueClass(Text.class);
extractor.setOutputKeyClass(BytesWritable.class);
extractor.setOutputValueClass(ArrayWritable.class);
extractor.setInputFormatClass(SequenceFileInputFormat.class);
extractor.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));
System.out.println("looks OK. Submitting.");
    extractor.submit();
    // Fire-and-forget: we return without waiting for completion, so the
    // exit code does not reflect job success.
    return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(),
new XtrExtract(), args);
System.exit(res);
}
}