package cbcb.kmulus;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import cbcb.kmulus.db.cluster.ClusterPresenceVectors;
import cbcb.kmulus.db.processing.GenerateSequencePresenceVectors;
import cbcb.kmulus.db.processing.PrepareClusteringOutput;
import cbcb.kmulus.db.processing.UnionClusterPresenceVectors;
import cbcb.kmulus.db.processing.WriteClusterSequencesToHDFS;
import cbcb.kmulus.db.processing.WriteSequencesToCluster;
import com.google.common.collect.ImmutableMap;
/**
* The pipeline for generating clustered database partitions from a single database.
*
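* <p>Example invocation (the jar name, paths, and argument values are illustrative
* only; see the USAGE string below for the exact argument definitions):
* <pre>
* hadoop jar kmulus.jar cbcb.kmulus.PartitionDatabase \
*     /data/proteins.fasta /output/protein_db 1000000 50 r:u 3
* </pre>
*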
* @author CH Albach
*/
public class PartitionDatabase {
private static final String USAGE =
"PartitionDatabase DATABASE_SEQS OUTPUT_DIR NUM_SEQ NUM_CLUSTERS\n" +
"\t[[START]:[STOP]] [KMER_LEN]\n" +
"\tSTART and STOP indicate which range of steps should be run:\n" +
"\t{ r-repeat mask | t-transform to PV | c-cluster\n" +
"\t p-prepare output | w-write partitions | u-union centers }";
/* Final output directories. */
private static final String PARTITIONS_SUFFIX = "partitions";
private static final String CENTERS_SUFFIX = "centers";
/* Intermediate output directories. */
private static final String TEMP_SUFFIX = "temp";
private static final String GENERATE_SUFFIX = "gen";
private static final String CLUSTER_SUFFIX = "cluster";
private static final String PREP_SUFFIX = "prep";
private static final String DEFAULT_KMER_LEN = "3";
private static final String STEP_DELIM = ":";
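/* The pipeline steps, in the order they are executed. */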
private enum PipeStep {REPEAT_MASK, TRANSFORM_PV, CLUSTER, PREP, WRITE_PARTITIONS, UNION_CENTERS}
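/* Maps the single-character step codes from the command line onto pipeline steps. */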
private static final Map<Character, PipeStep> stepMap = ImmutableMap.<Character, PipeStep>builder()
.put('r', PipeStep.REPEAT_MASK)
.put('t', PipeStep.TRANSFORM_PV)
.put('c', PipeStep.CLUSTER)
.put('p', PipeStep.PREP)
.put('w', PipeStep.WRITE_PARTITIONS)
.put('u', PipeStep.UNION_CENTERS).build();
public static void main(String[] args) {
if (args.length < 4) {
System.err.println(USAGE);
return;
}
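// Positional arguments (see USAGE): input sequences, output directory, number of sequences, number of clusters.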
String dbInput = args[0];
String tempOut = args[1] + Path.SEPARATOR + TEMP_SUFFIX;
String finalOut = args[1];
String numSeq = args[2];
String numClusters = args[3];
String kmerLen = args.length > 5 ? args[5] : DEFAULT_KMER_LEN;
PipeStep start = PipeStep.REPEAT_MASK;
PipeStep end = PipeStep.UNION_CENTERS;
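// Parse the optional START:STOP step range, e.g. "t:c" runs the transform through clustering steps.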
if (args.length > 4) {
if (!args[4].contains(STEP_DELIM)) {
System.err.println(USAGE);
return;
}
String[] chunks = args[4].split(STEP_DELIM);
// A lone ":" splits to an empty array; both defaults are kept in that case.
if (chunks.length > 0) {
String startStr = chunks[0];
if (startStr.length() > 0) {
start = stepMap.get(startStr.charAt(0));
}
}
if (chunks.length > 1) {
String endStr = chunks[1];
if (endStr.length() > 0) {
end = stepMap.get(endStr.charAt(0));
}
}
if (start == null || end == null) {
System.err.println(USAGE);
return;
}
}
// Fail fast (NumberFormatException) if any of the numeric arguments is not a valid integer.
Integer.parseInt(numSeq);
Integer.parseInt(numClusters);
Integer.parseInt(kmerLen);
try {
/* Define all intermediate and final output directories. */
String pvOut = tempOut + Path.SEPARATOR + GENERATE_SUFFIX;
String clusterOut = tempOut + Path.SEPARATOR + CLUSTER_SUFFIX + Path.SEPARATOR + ClusterPresenceVectors.FINAL_DIR;
String prepClusterOut = tempOut + Path.SEPARATOR + PREP_SUFFIX;
String partitionsOut = finalOut + Path.SEPARATOR + PARTITIONS_SUFFIX;
String centersOut = finalOut + Path.SEPARATOR + CENTERS_SUFFIX;
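// Each case falls through into the next, so every step from 'start' through 'end' is executed.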
switch (start) {
case REPEAT_MASK:
Configuration conf = new Configuration();
conf.set("mapred.child.java.opts", "-Xmx1024m");
// Delete any pre-existing output directories.
FileSystem fs = FileSystem.get(conf);
fs.delete(new Path(tempOut), true);
fs.delete(new Path(finalOut), true);
// TODO(calbach): Repeat masking.
if (end == PipeStep.REPEAT_MASK) {
break;
}
// Transform the database sequences into PresenceVectors.
case TRANSFORM_PV:
int result = ToolRunner.run(
new GenerateSequencePresenceVectors(),
new String[]{dbInput, pvOut, kmerLen});
if (result < 0) {
System.err.println(GenerateSequencePresenceVectors.class.getName() + " failed.");
System.exit(result);
}
if (end == PipeStep.TRANSFORM_PV) {
break;
}
// Cluster the PresenceVectors.
case CLUSTER:
// Reset to the clustering job's output directory; FINAL_DIR is appended after the job completes.
clusterOut = tempOut + Path.SEPARATOR + CLUSTER_SUFFIX;
int runIter = 0;
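// Re-run the clustering job for as long as it returns CODE_LOOP.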
do {
result = ToolRunner.run(new Configuration(),
new ClusterPresenceVectors(runIter),
new String[]{pvOut, clusterOut, numSeq, numClusters, kmerLen});
runIter++;
} while (result == ClusterPresenceVectors.CODE_LOOP);
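// After convergence, run one final clustering pass.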
if (result == ClusterPresenceVectors.CODE_CONVERGED) {
result = ToolRunner.run(new Configuration(),
new ClusterPresenceVectors(),
new String[]{pvOut, clusterOut, numSeq, numClusters, kmerLen});
}
if (result < 0) {
System.err.println(ClusterPresenceVectors.class.getName() + " failed.");
System.exit(result);
}
clusterOut += Path.SEPARATOR + ClusterPresenceVectors.FINAL_DIR;
if (end == PipeStep.CLUSTER) {
break;
}
// Reformat the clustering output for partitioning.
case PREP:
result = ToolRunner.run(
new PrepareClusteringOutput(),
new String[]{clusterOut, prepClusterOut});
if (result < 0) {
System.err.println(PrepareClusteringOutput.class.getName() + " failed.");
System.exit(result);
}
if (end == PipeStep.PREP) {
break;
}
// Generate the database partitions.
case WRITE_PARTITIONS:
result = ToolRunner.run(
new WriteSequencesToCluster(),
new String[]{prepClusterOut, dbInput, partitionsOut, tempOut + Path.SEPARATOR + "part", numClusters});
if (result < 0) {
System.err.println(WriteSequencesToCluster.class.getName() + " failed.");
System.exit(result);
}
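// Write the per-cluster sequence files out to HDFS.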
result = ToolRunner.run(
new WriteClusterSequencesToHDFS(),
new String[]{tempOut + Path.SEPARATOR + "part", partitionsOut, tempOut + Path.SEPARATOR + "null", numClusters});
if (result < 0) {
System.err.println(WriteClusterSequencesToHDFS.class.getName() + " failed.");
System.exit(result);
}
if (end == PipeStep.WRITE_PARTITIONS) {
break;
}
// Generate a union vector as the center for each cluster.
case UNION_CENTERS:
result = ToolRunner.run(
new UnionClusterPresenceVectors(),
new String[]{clusterOut, centersOut, kmerLen});
if (result < 0) {
System.err.println(UnionClusterPresenceVectors.class.getName() + " failed.");
System.exit(result);
}
if (end == PipeStep.UNION_CENTERS) {
break;
}
default:
break;
}
} catch (Exception e) {
e.printStackTrace();
System.out.println("Pipeline failed.");
}
}
}