Package org.apache.crunch.types

Examples of org.apache.crunch.types.PTypeFamily


   *          The right-side table of the join, whose contents will be fully
   *          read into memory
   * @return A table keyed on the join key, containing pairs of joined values
   */
  public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
    PTypeFamily tf = left.getTypeFamily();
    Iterable<Pair<K, V>> iterable = right.materialize();

    if (iterable instanceof MaterializableIterable) {
      MaterializableIterable<Pair<K, V>> mi = (MaterializableIterable<Pair<K, V>>) iterable;
      MapsideJoinDoFn<K, U, V> mapJoinDoFn = new MapsideJoinDoFn<K, U, V>(mi.getPath().toString(),
          right.getPType());
      ParallelDoOptions.Builder optionsBuilder = ParallelDoOptions.builder();
      if (mi.isSourceTarget()) {
        optionsBuilder.sourceTargets((SourceTarget) mi.getSource());
      }
      return left.parallelDo("mapjoin", mapJoinDoFn,
          tf.tableOf(left.getKeyType(), tf.pairs(left.getValueType(), right.getValueType())),
          optionsBuilder.build());
    } else { // in-memory pipeline
      return left.parallelDo(new InMemoryJoinFn<K, U, V>(iterable),
          tf.tableOf(left.getKeyType(), tf.pairs(left.getValueType(), right.getValueType())));
    }
  }
View Full Code Here


   */
  public static <T> PCollection<T> reservorSample(
      PCollection<T> input,
      int sampleSize,
      Long seed) {
    PTypeFamily ptf = input.getTypeFamily();
    PType<Pair<T, Integer>> ptype = ptf.pairs(input.getPType(), ptf.ints());
    return weightedReservoirSample(
        input.parallelDo(new MapFn<T, Pair<T, Integer>>() {
          public Pair<T, Integer> map(T t) { return Pair.of(t, 1); }
        }, ptype),
        sampleSize,
View Full Code Here

   */
  public static <T, N extends Number> PCollection<T> weightedReservoirSample(
      PCollection<Pair<T, N>> input,
      int sampleSize,
      Long seed) {
    PTypeFamily ptf = input.getTypeFamily();
    PTable<Integer, Pair<T, N>> groupedIn = input.parallelDo(
        new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
          @Override
          public Pair<Integer, Pair<T, N>> map(Pair<T, N> p) {
            return Pair.of(0, p);
          }
        }, ptf.tableOf(ptf.ints(), input.getPType()));
    int[] ss = new int[] { sampleSize };
    return groupedWeightedReservoirSample(groupedIn, ss, seed)
        .parallelDo(new MapFn<Pair<Integer, T>, T>() {
          @Override
          public T map(Pair<Integer, T> p) {
View Full Code Here

   */
  public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
      PTable<Integer, Pair<T, N>> input,
      int[] sampleSizes,
      Long seed) {
    PTypeFamily ptf = input.getTypeFamily();
    PType<T> ttype = (PType<T>) input.getPTableType().getValueType().getSubTypes().get(0);
    PTableType<Integer, Pair<Double, T>> ptt = ptf.tableOf(ptf.ints(),
        ptf.pairs(ptf.doubles(), ttype));
   
    return input.parallelDo(new ReservoirSampleFn<T, N>(sampleSizes, seed, ttype), ptt)
        .groupByKey(1)
        .combineValues(new WRSCombineFn<T>(sampleSizes, ttype))
        .parallelDo(new MapFn<Pair<Integer, Pair<Double, T>>, Pair<Integer, T>>() {
          @Override
          public Pair<Integer, T> map(Pair<Integer, Pair<Double, T>> p) {
            return Pair.of(p.first(), p.second().second());
          }
        }, ptf.pairs(ptf.ints(), ttype));
  }
View Full Code Here

   * @return A new {@code PCollection} that contains the unique elements of the input
   */
  public static <S> PCollection<S> distinct(PCollection<S> input, int flushEvery) {
    Preconditions.checkArgument(flushEvery > 0);
    PType<S> pt = input.getPType();
    PTypeFamily ptf = pt.getFamily();
    return input
        .parallelDo("pre-distinct", new PreDistinctFn<S>(flushEvery, pt), ptf.tableOf(pt, ptf.nulls()))
        .groupByKey()
        .parallelDo("post-distinct", new PostDistinctFn<S>(), pt);
  }
View Full Code Here

  /**
   * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
   * of their occurrences.
   */
  public static <S> PTable<S, Long> count(PCollection<S> collect) {
    PTypeFamily tf = collect.getTypeFamily();
    return collect.parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
      public Pair<S, Long> map(S input) {
        return Pair.of(input, 1L);
      }
    }, tf.tableOf(collect.getPType(), tf.longs())).groupByKey()
        .combineValues(Aggregators.SUM_LONGS());
  }
View Full Code Here

   * @param collect The PCollection whose elements should be counted.
   * @param <S> The type of the PCollection.
   * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
   */
  public static <S> PObject<Long> length(PCollection<S> collect) {
    PTypeFamily tf = collect.getTypeFamily();
    PTable<Integer, Long> countTable = collect
        .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
          public Pair<Integer, Long> map(S input) {
            return Pair.of(1, 1L);
          }
        }, tf.tableOf(tf.ints(), tf.longs()))
        .groupByKey(GroupingOptions.builder().numReducers(1).build())
        .combineValues(Aggregators.SUM_LONGS());
    PCollection<Long> count = countTable.values();
    return new FirstElementPObject<Long>(count);
  }
View Full Code Here

      }
    }
  }

  public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
    PTypeFamily ptf = ptable.getTypeFamily();
    PTableType<K, V> base = ptable.getPTableType();
    PType<Pair<K, V>> pairType = ptf.pairs(base.getKeyType(), base.getValueType());
    PTableType<Integer, Pair<K, V>> inter = ptf.tableOf(ptf.ints(), pairType);
    return ptable.parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), inter)
        .groupByKey(1).combineValues(new TopKCombineFn<K, V>(limit, maximize))
        .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
          public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
            emitter.emit(input.second());
View Full Code Here

    Class<S> clazz = collect.getPType().getTypeClass();
    if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
      throw new IllegalArgumentException("Can only get max for Comparable elements, not for: "
          + collect.getPType().getTypeClass());
    }
    PTypeFamily tf = collect.getTypeFamily();
    PCollection<S> maxCollect = PTables.values(collect
        .parallelDo("max", new DoFn<S, Pair<Boolean, S>>() {
          private transient S max = null;

          public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
            if (max == null || ((Comparable<S>) max).compareTo(input) < 0) {
              max = input;
            }
          }

          public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
            if (max != null) {
              emitter.emit(Pair.of(true, max));
            }
          }
        }, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1)
        .combineValues(new CombineFn<Boolean, S>() {
          public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
            S max = null;
            for (S v : input.second()) {
              if (max == null || ((Comparable<S>) max).compareTo(v) < 0) {
View Full Code Here

    Class<S> clazz = collect.getPType().getTypeClass();
    if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
      throw new IllegalArgumentException("Can only get min for Comparable elements, not for: "
          + collect.getPType().getTypeClass());
    }
    PTypeFamily tf = collect.getTypeFamily();
    PCollection<S> minCollect = PTables.values(collect
        .parallelDo("min", new DoFn<S, Pair<Boolean, S>>() {
          private transient S min = null;

          public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
            if (min == null || ((Comparable<S>) min).compareTo(input) > 0) {
              min = input;
            }
          }

          public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
            if (min != null) {
              emitter.emit(Pair.of(false, min));
            }
          }
        }, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1)
        .combineValues(new CombineFn<Boolean, S>() {
          public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
            S min = null;
            for (S v : input.second()) {
              if (min == null || ((Comparable<S>) min).compareTo(v) > 0) {
View Full Code Here

TOP

Related Classes of org.apache.crunch.types.PTypeFamily

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.