Package org.apache.mahout.h2obindings

Source Code of org.apache.mahout.h2obindings.H2OHdfs

/*
*  Licensed to the Apache Software Foundation (ASF) under one or more
*  contributor license agreements.  See the NOTICE file distributed with
*  this work for additional information regarding copyright ownership.
*  The ASF licenses this file to You under the Apache License, Version 2.0
*  (the "License"); you may not use this file except in compliance with
*  the License.  You may obtain a copy of the License at
*
*      http://www.apache.org/licenses/LICENSE-2.0
*
*  Unless required by applicable law or agreed to in writing, software
*  distributed under the License is distributed on an "AS IS" BASIS,
*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*  See the License for the specific language governing permissions and
*  limitations under the License.
*/

package org.apache.mahout.h2obindings;

import java.io.File;
import java.net.URI;

import water.fvec.Frame;
import water.fvec.Vec;
import water.Futures;
import water.parser.ValueString;
import water.util.FrameUtils;

import org.apache.mahout.math.Vector;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.h2obindings.drm.H2ODrm;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.ReflectionUtils;


/**
* SequenceFile I/O class (on HDFS)
*/
public class H2OHdfs {
  /**
   * Predicate to check if a given filename is a SequenceFile.
   *
   * Inspect the first three bytes to determine the format of the file.
   *
   * @param filename Name of the file to check.
   * @return True if file is of SequenceFile format.
   */
  public static boolean isSeqfile(String filename) {
    try {
      String uri = filename;
      Configuration conf = new Configuration();
      Path path = new Path(uri);
      FileSystem fs = FileSystem.get(URI.create(uri), conf);
      FSDataInputStream fin = fs.open(path);
      byte seq[] = new byte[3];

      fin.read(seq);
      fin.close();

      if (seq[0] == 'S' && seq[1] == 'E' && seq[2] == 'Q') {
        return true;
      } else {
        return false;
      }
    } catch (java.io.IOException e) {
      return false;
    }
  }

  /**
   * Create DRM from SequenceFile.
   *
   * Create a Mahout DRM backed on H2O from the specified SequenceFile.
   *
   * @param filename Name of the sequence file.
   * @param parMin Minimum number of data partitions in the DRM.
   * @return DRM object created.
   */
  public static H2ODrm drmFromFile(String filename, int parMin) {
    try {
      if (isSeqfile(filename)) {
        return drmFromSeqfile(filename, parMin);
      } else {
        return new H2ODrm(FrameUtils.parseFrame(null,new File(filename)));
      }
    } catch (java.io.IOException e) {
      return null;
    }
  }

  /**
   * Internal method called from <code>drmFromFile</code> if format verified.
   */
  public static H2ODrm drmFromSeqfile(String filename, int parMin) {
    long rows = 0;
    int cols = 0;
    Frame frame = null;
    Vec labels = null;

    SequenceFile.Reader reader = null;
    try {
      String uri = filename;
      Configuration conf = new Configuration();
      Path path = new Path(uri);
      FileSystem fs = FileSystem.get(URI.create(uri), conf);
      Vec.Writer writers[];
      Vec.Writer labelwriter = null;
      boolean isIntKey = false, isLongKey = false, isStringKey = false;

      reader = new SequenceFile.Reader(fs, path, conf);

      if (reader.getValueClass() != VectorWritable.class) {
        System.out.println("ValueClass in file " + filename +
                           "must be VectorWritable, but found " +
                           reader.getValueClassName());
        return null;
      }

      Writable key = (Writable)
        ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      VectorWritable value = (VectorWritable)
        ReflectionUtils.newInstance(reader.getValueClass(), conf);

      long start = reader.getPosition();

      if (reader.getKeyClass() == Text.class) {
        isStringKey = true;
      } else if (reader.getKeyClass() == LongWritable.class) {
        isLongKey = true;
      } else {
        isIntKey = true;
      }

      while (reader.next(key, value)) {
        if (cols == 0) {
          Vector v = value.get();
          cols = Math.max(v.size(), cols);
        }
        if (isLongKey) {
          rows = Math.max(((LongWritable)(key)).get()+1, rows);
        }
        if (isIntKey) {
          rows = Math.max(((IntWritable)(key)).get()+1, rows);
        }
        if (isStringKey) {
          rows++;
        }
      }
      reader.seek(start);

      frame = H2OHelper.emptyFrame(rows, cols, parMin, -1);
      writers = new Vec.Writer[cols];
      for (int i = 0; i < writers.length; i++) {
        writers[i] = frame.vecs()[i].open();
      }

      if (reader.getKeyClass() == Text.class) {
        labels = frame.anyVec().makeZero();
        labelwriter = labels.open();
      }

      long r = 0;
      while (reader.next(key, value)) {
        Vector v = value.get();
        if (isLongKey) {
          r = ((LongWritable)(key)).get();
        }
        if (isIntKey) {
          r = ((IntWritable)(key)).get();
        }
        for (int c = 0; c < v.size(); c++) {
          writers[c].set(r, v.getQuick(c));
        }
        if (labels != null) {
          labelwriter.set(r, ((Text)key).toString());
        }
        if (isStringKey) {
          r++;
        }
      }

      Futures fus = new Futures();
      for (Vec.Writer w : writers) {
        w.close(fus);
      }
      if (labelwriter != null) {
        labelwriter.close(fus);
      }
      fus.blockForPending();
    } catch (java.io.IOException e) {
      return null;
    } finally {
      IOUtils.closeStream(reader);
    }
    return new H2ODrm(frame, labels);
  }

  /**
   * Create SequenceFile on HDFS from DRM object.
   *
   * @param filename Filename to create and store DRM data in.
   * @param drm DRM object storing Matrix data in memory.
   */
  public static void drmToFile(String filename, H2ODrm drm) throws java.io.IOException {
    Frame frame = drm.frame;
    Vec labels = drm.keys;
    String uri = filename;
    Configuration conf = new Configuration();
    Path path = new Path(uri);
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    SequenceFile.Writer writer = null;
    boolean isSparse = H2OHelper.isSparse(frame);
    ValueString vstr = new ValueString();

    if (labels != null) {
      writer = SequenceFile.createWriter(fs, conf, path, Text.class, VectorWritable.class);
    } else {
      writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, VectorWritable.class);
    }

    for (long r = 0; r < frame.anyVec().length(); r++) {
      Vector v = null;
      if (isSparse) {
        v = new SequentialAccessSparseVector(frame.numCols());
      } else {
        v = new DenseVector(frame.numCols());
      }

      for (int c = 0; c < frame.numCols(); c++) {
        v.setQuick(c, frame.vecs()[c].at(r));
      }

      if (labels != null) {
        writer.append(new Text(labels.atStr(vstr, r).toString()), new VectorWritable(v));
      } else {
        writer.append(new IntWritable((int)r), new VectorWritable(v));
      }
    }

    writer.close();
  }
}
TOP

Related Classes of org.apache.mahout.h2obindings.H2OHdfs

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by Oracle Inc. Contact coftware#gmail.com.