Source Code of org.apache.mahout.df.mapreduce.partial.InterResults

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.mahout.df.mapreduce.partial;


import java.io.IOException;


import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.df.node.Node;


/**
 * Stores/Loads the intermediate results of step1 needed by step2.<br>
 * This class should not be needed outside of the partial package, so all its
 * methods are protected.<br>
 */
public class InterResults {
  private InterResults() {
  }


  /**
   * Load the trees and the keys returned from the first step
   * 
   * @param fs forest path file system
   * @param forestPath file path to the (key,tree) file
   * @param numMaps number of map tasks
   * @param numTrees total number of trees in the forest
   * @param partition current partition
   * @param keys array of size numTrees, will contain the loaded keys
   * @param trees array of size numTrees, will contain the loaded trees
   * @return number of instances in the current partition
   * @throws IOException
   */
  public static int load(FileSystem fs, Path forestPath, int numMaps,
      int numTrees, int partition, TreeID[] keys, Node[] trees)
      throws IOException {
    if (keys.length != trees.length) {
      throw new IllegalArgumentException("keys.length != trees.length");
    }


    FSDataInputStream in = fs.open(forestPath);


    TreeID key = new TreeID();
    int numInstances = -1;


    try {
      // get current partition's size
      for (int p = 0; p < numMaps; p++) {
        if (p == partition) {
          numInstances = in.readInt();
        } else {
          in.readInt();
        }
      }


      // load (key, tree)
      int current = 0;
      for (int index = 0; index < numTrees; index++) {
        key.readFields(in);


        if (key.partition() != partition) {
          keys[current] = key.clone();
          trees[current] = Node.read(in);


          current++;
        } else {
          // skip the trees of the current partition
          Node.read(in);
        }
      }


      if (current != keys.length) {
        throw new IllegalStateException("loaded less keys/trees than expected");
      }
    } finally {
      in.close();
    }


    return numInstances;
  }


  /**
   * Write the forest trees into a file
   * 
   * @param fs File System
   * @param keys keys returned by the first step
   * @param trees trees returned by the first step
   * @param sizes partitions' sizes in hadoop order
   * @throws IOException
   */
  public static void store(FileSystem fs, Path forestPath,
      TreeID[] keys, Node[] trees, int[] sizes) throws IOException {
    if (keys.length != trees.length) {
      throw new IllegalArgumentException("keys.length != trees.length");
    }


    int numTrees = keys.length;
    int numMaps = sizes.length;


    FSDataOutputStream out = fs.create(forestPath);


    // write partitions' sizes
    for (int p = 0; p < numMaps; p++) {
      out.writeInt(sizes[p]);
    }


    // write the data
    for (int index = 0; index < numTrees; index++) {
      keys[index].write(out);
      trees[index].write(out);
    }


    out.close();
  }


}
Source Code of org.apache.mahout.df.mapreduce.partial.InterResults

Related Classes of org.apache.mahout.df.mapreduce.partial.InterResults