Source Code of org.apache.drill.exec.physical.impl.mergereceiver.MergingRecordBatch$Node

package org.apache.drill.exec.physical.impl.mergereceiver;


/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;


import io.netty.buffer.ByteBuf;
import org.apache.drill.common.expression.ErrorCollector;
import org.apache.drill.common.expression.ErrorCollectorImpl;
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.common.logical.data.Order.Ordering;
import org.apache.drill.exec.compile.sig.GeneratorMapping;
import org.apache.drill.exec.compile.sig.MappingSet;
import org.apache.drill.exec.exception.ClassTransformationException;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.ClassGenerator;
import org.apache.drill.exec.expr.ClassGenerator.HoldingContainer;
import org.apache.drill.exec.expr.CodeGenerator;
import org.apache.drill.exec.expr.ExpressionTreeMaterializer;
import org.apache.drill.exec.expr.TypeHelper;
import org.apache.drill.exec.expr.fn.FunctionGenerationHelper;
import org.apache.drill.exec.memory.OutOfMemoryException;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.MetricDef;
import org.apache.drill.exec.physical.config.MergingReceiverPOP;
import org.apache.drill.exec.proto.BitControl.FinishedReceiver;
import org.apache.drill.exec.proto.ExecProtos.FragmentHandle;
import org.apache.drill.exec.proto.GeneralRPCProtos.Ack;
import org.apache.drill.exec.proto.UserBitShared;
import org.apache.drill.exec.record.AbstractRecordBatch;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode;
import org.apache.drill.exec.record.ExpandableHyperContainer;
import org.apache.drill.exec.record.RawFragmentBatch;
import org.apache.drill.exec.record.RawFragmentBatchProvider;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.record.RecordBatchLoader;
import org.apache.drill.exec.record.SchemaBuilder;
import org.apache.drill.exec.record.TypedFieldId;
import org.apache.drill.exec.record.VectorAccessible;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.record.WritableBatch;
import org.apache.drill.exec.record.selection.SelectionVector2;
import org.apache.drill.exec.record.selection.SelectionVector4;
import org.apache.drill.exec.rpc.RpcException;
import org.apache.drill.exec.rpc.RpcOutcomeListener;
import org.apache.drill.exec.vector.CopyUtil;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.allocator.VectorAllocator;
import org.eigenbase.rel.RelFieldCollation.Direction;


import parquet.Preconditions;


import com.google.common.collect.Lists;
import com.sun.codemodel.JConditional;
import com.sun.codemodel.JExpr;




/**
 * The MergingRecordBatch merges pre-sorted record batches from remote senders.
 */
public class MergingRecordBatch extends AbstractRecordBatch<MergingReceiverPOP> implements RecordBatch {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(MergingRecordBatch.class);


  private static final long ALLOCATOR_INITIAL_RESERVATION = 1*1024*1024;
  private static final long ALLOCATOR_MAX_RESERVATION = 20L*1000*1000*1000;


  private RecordBatchLoader[] batchLoaders;
  private RawFragmentBatchProvider[] fragProviders;
  private FragmentContext context;
  private BatchSchema schema;
  private VectorContainer outgoingContainer;
  private MergingReceiverGeneratorBase merger;
  private MergingReceiverPOP config;
  private boolean hasRun = false;
  private boolean prevBatchWasFull = false;
  private boolean hasMoreIncoming = true;


  private int outgoingPosition = 0;
  private int senderCount = 0;
  private RawFragmentBatch[] incomingBatches;
  private int[] batchOffsets;
  private PriorityQueue <Node> pqueue;
  private RawFragmentBatch emptyBatch = null;
  private boolean done = false;


  public static enum Metric implements MetricDef{
    BYTES_RECEIVED,
    NUM_SENDERS,
    NEXT_WAIT_NANOS;


    @Override
    public int metricId() {
      return ordinal();
    }
  }


  public MergingRecordBatch(FragmentContext context,
                            MergingReceiverPOP config,
                            RawFragmentBatchProvider[] fragProviders) throws OutOfMemoryException {


    super(config, context);
    this.fragProviders = fragProviders;
    this.context = context;
    this.outgoingContainer = new VectorContainer();
    this.stats.setLongStat(Metric.NUM_SENDERS, config.getNumSenders());
    this.config = config;
  }


  private RawFragmentBatch getNext(RawFragmentBatchProvider provider) throws IOException{
    stats.startWait();
    try {
      RawFragmentBatch b = provider.getNext();
      if(b != null){
        stats.addLongStat(Metric.BYTES_RECEIVED, b.getByteCount());
        stats.batchReceived(0, b.getHeader().getDef().getRecordCount(), false);
      }
      return b;
    } finally {
      stats.stopWait();
    }
  }


  @Override
  public IterOutcome innerNext() {
    if (fragProviders.length == 0) {
      return IterOutcome.NONE;
    }
    if (done) {
      return IterOutcome.NONE;
    }
    boolean schemaChanged = false;


    if (prevBatchWasFull) {
      logger.debug("Outgoing vectors were full on last iteration");
      allocateOutgoing();
      outgoingPosition = 0;
      prevBatchWasFull = false;
    }


    if (hasMoreIncoming == false) {
      logger.debug("next() was called after all values have been processed");
      outgoingPosition = 0;
      return IterOutcome.NONE;
    }


    // lazy initialization
    if (!hasRun) {
      schemaChanged = true; // first iteration is always a schema change


      // set up each (non-empty) incoming record batch
      List<RawFragmentBatch> rawBatches = Lists.newArrayList();
      boolean firstBatch = true;
      for (RawFragmentBatchProvider provider : fragProviders) {
        RawFragmentBatch rawBatch = null;
        try {
          rawBatch = getNext(provider);
          if (rawBatch == null && context.isCancelled()) {
            return IterOutcome.STOP;
          }
        } catch (IOException e) {
          context.fail(e);
          return IterOutcome.STOP;
        }
        if (rawBatch.getHeader().getDef().getRecordCount() != 0) {
          rawBatches.add(rawBatch);
        } else {
          if (emptyBatch == null) {
            emptyBatch = rawBatch;
          }
          try {
            while ((rawBatch = getNext(provider)) != null && rawBatch.getHeader().getDef().getRecordCount() == 0);
            if (rawBatch == null && context.isCancelled()) {
              return IterOutcome.STOP;
            }
          } catch (IOException e) {
            context.fail(e);
            return IterOutcome.STOP;
          }
          if (rawBatch != null) {
            rawBatches.add(rawBatch);
          } else {
            rawBatches.add(emptyBatch);
          }
        }
      }


      // allocate the incoming record batch loaders
      senderCount = rawBatches.size();
      if (senderCount == 0) {
        if (firstBatch) {
          RecordBatchLoader loader = new RecordBatchLoader(oContext.getAllocator());
          try {
            loader.load(emptyBatch.getHeader().getDef(), emptyBatch.getBody());
          } catch (SchemaChangeException e) {
            throw new RuntimeException(e);
          }
          for (VectorWrapper w : loader) {
            outgoingContainer.add(w.getValueVector());
          }
          outgoingContainer.buildSchema(SelectionVectorMode.NONE);
          done = true;
          return IterOutcome.OK_NEW_SCHEMA;
        }
        return IterOutcome.NONE;
      }
      incomingBatches = new RawFragmentBatch[senderCount];
      batchOffsets = new int[senderCount];
      batchLoaders = new RecordBatchLoader[senderCount];
      for (int i = 0; i < senderCount; ++i) {
        incomingBatches[i] = rawBatches.get(i);
        batchLoaders[i] = new RecordBatchLoader(oContext.getAllocator());
      }


      int i = 0;
      for (RawFragmentBatch batch : incomingBatches) {
        // initialize the incoming batchLoaders
        UserBitShared.RecordBatchDef rbd = batch.getHeader().getDef();
        try {
          batchLoaders[i].load(rbd, batch.getBody());
        } catch(SchemaChangeException e) {
          logger.error("MergingReceiver failed to load record batch from remote host.  {}", e);
          context.fail(e);
          return IterOutcome.STOP;
        }
        batch.release();
        ++batchOffsets[i];
        ++i;
      }


      // Canonicalize each incoming batch, so that vectors are alphabetically sorted based on SchemaPath.
      for (RecordBatchLoader loader : batchLoaders) {
        loader.canonicalize();
      }


      // Ensure all the incoming batches have the identical schema.
      if (!isSameSchemaAmongBatches(batchLoaders)) {
        logger.error("Incoming batches for merging receiver have diffferent schemas!");
        context.fail(new SchemaChangeException("Incoming batches for merging receiver have diffferent schemas!"));
        return IterOutcome.STOP;
      }


      // create the outgoing schema and vector container, and allocate the initial batch
      SchemaBuilder bldr = BatchSchema.newBuilder().setSelectionVectorMode(BatchSchema.SelectionVectorMode.NONE);
      int vectorCount = 0;
      for (VectorWrapper<?> v : batchLoaders[0]) {


        // add field to the output schema
        bldr.addField(v.getField());


        // allocate a new value vector
        ValueVector outgoingVector = TypeHelper.getNewVector(v.getField(), oContext.getAllocator());
        outgoingVector.allocateNew();
        outgoingContainer.add(outgoingVector);
        ++vectorCount;
      }




      schema = bldr.build();
      if (schema != null && !schema.equals(schema)) {
        // TODO: handle case where one or more batches implicitly indicate schema change
        logger.debug("Initial state has incoming batches with different schemas");
      }
      outgoingContainer.buildSchema(BatchSchema.SelectionVectorMode.NONE);


      // generate code for merge operations (copy and compare)
      try {
        merger = createMerger();
      } catch (SchemaChangeException e) {
        logger.error("Failed to generate code for MergingReceiver.  {}", e);
        context.fail(e);
        return IterOutcome.STOP;
      }


      // allocate the priority queue with the generated comparator
      this.pqueue = new PriorityQueue<Node>(fragProviders.length, new Comparator<Node>() {
        public int compare(Node node1, Node node2) {
          int leftIndex = (node1.batchId << 16) + node1.valueIndex;
          int rightIndex = (node2.batchId << 16) + node2.valueIndex;
          return merger.doEval(leftIndex, rightIndex);
        }
      });


      // populate the priority queue with initial values
      for (int b = 0; b < senderCount; ++b) {
        while (batchLoaders[b] != null && batchLoaders[b].getRecordCount() == 0) {
          try {
            RawFragmentBatch batch = getNext(fragProviders[b]);
            incomingBatches[b] = batch;
            if (batch != null) {
              batchLoaders[b].load(batch.getHeader().getDef(), batch.getBody());
            } else {
              batchLoaders[b].clear();
              batchLoaders[b] = null;
              if (context.isCancelled()) {
                return IterOutcome.STOP;
              }
            }
          } catch (IOException | SchemaChangeException e) {
            context.fail(e);
            return IterOutcome.STOP;
          }
        }
        if (batchLoaders[b] != null) {
          pqueue.add(new Node(b, 0));
        }
      }


      hasRun = true;
      // finished lazy initialization
    }


    while (!pqueue.isEmpty()) {
      // pop next value from pq and copy to outgoing batch
      Node node = pqueue.peek();
      if (!copyRecordToOutgoingBatch(node)) {
        logger.debug("Outgoing vectors space is full; breaking");
        prevBatchWasFull = true;
        break;
      }
      pqueue.poll();


//      if (isOutgoingFull()) {
//        // set a flag so that we reallocate on the next iteration
//        logger.debug("Outgoing vectors record batch size reached; breaking");
//        prevBatchWasFull = true;
//      }


      if (node.valueIndex == batchLoaders[node.batchId].getRecordCount() - 1) {
        // reached the end of an incoming record batch
        RawFragmentBatch nextBatch = null;
        try {
          nextBatch = getNext(fragProviders[node.batchId]);


          while (nextBatch != null && nextBatch.getHeader().getDef().getRecordCount() == 0) {
            nextBatch = getNext(fragProviders[node.batchId]);
          }
          if (nextBatch == null && context.isCancelled()) {
            return IterOutcome.STOP;
          }
        } catch (IOException e) {
          context.fail(e);
          return IterOutcome.STOP;
        }


        incomingBatches[node.batchId] = nextBatch;


        if (nextBatch == null) {
          // batch is empty
          boolean allBatchesEmpty = true;


          for (RawFragmentBatch batch : incomingBatches) {
            // see if all batches are empty so we can return OK_* or NONE
            if (batch != null) {
              allBatchesEmpty = false;
              break;
            }
          }


          if (allBatchesEmpty) {
            hasMoreIncoming = false;
            break;
          }


          // this batch is empty; since the pqueue no longer references this batch, it will be
          // ignored in subsequent iterations.
          continue;
        }


        UserBitShared.RecordBatchDef rbd = incomingBatches[node.batchId].getHeader().getDef();
        try {
          batchLoaders[node.batchId].load(rbd, incomingBatches[node.batchId].getBody());
        } catch(SchemaChangeException ex) {
          context.fail(ex);
          return IterOutcome.STOP;
        }
        incomingBatches[node.batchId].release();
        batchOffsets[node.batchId] = 0;


        // add front value from batch[x] to priority queue
        if (batchLoaders[node.batchId].getRecordCount() != 0)
          pqueue.add(new Node(node.batchId, 0));


      } else {
        pqueue.add(new Node(node.batchId, node.valueIndex + 1));
      }


      if (prevBatchWasFull) break;
    }


    // set the value counts in the outgoing vectors
    for (VectorWrapper vw : outgoingContainer) {
      vw.getValueVector().getMutator().setValueCount(outgoingPosition);
    }


    if (pqueue.isEmpty()) {
      if (!done) {
        done = !done;
      }
    }


    if (schemaChanged) {
      return IterOutcome.OK_NEW_SCHEMA;
    }
    else {
      return IterOutcome.OK;
    }
  }


  @Override
  public FragmentContext getContext() {
    return context;
  }


  @Override
  public BatchSchema getSchema() {
    return outgoingContainer.getSchema();
  }


  @Override
  public int getRecordCount() {
    return outgoingPosition;
  }


  @Override
  public void kill(boolean sendUpstream) {
    if (sendUpstream) {
      informSenders();
    } else {
      cleanup();
      for (RawFragmentBatchProvider provider : fragProviders) {
        provider.kill(context);
      }
    }
  }


  private void informSenders() {
    FragmentHandle handlePrototype = FragmentHandle.newBuilder()
            .setMajorFragmentId(config.getOppositeMajorFragmentId())
            .setQueryId(context.getHandle().getQueryId())
            .build();
    for (int i = 0; i < config.getNumSenders(); i++) {
      FragmentHandle sender = FragmentHandle.newBuilder(handlePrototype)
              .setMinorFragmentId(i)
              .build();
      FinishedReceiver finishedReceiver = FinishedReceiver.newBuilder()
              .setReceiver(context.getHandle())
              .setSender(sender)
              .build();
      context.getControlTunnel(config.getProvidingEndpoints().get(i)).informReceiverFinished(new OutcomeListener(), finishedReceiver);
    }
  }


  private class OutcomeListener implements RpcOutcomeListener<Ack> {


    @Override
    public void failed(RpcException ex) {
      logger.warn("Failed to inform upstream that receiver is finished");
    }


    @Override
    public void success(Ack value, ByteBuf buffer) {
      // Do nothing
    }
  }


  @Override
  protected void killIncoming(boolean sendUpstream) {
    //No op
  }


  @Override
  public Iterator<VectorWrapper<?>> iterator() {
    return outgoingContainer.iterator();
  }


  @Override
  public SelectionVector2 getSelectionVector2() {
    throw new UnsupportedOperationException();
  }


  @Override
  public SelectionVector4 getSelectionVector4() {
    throw new UnsupportedOperationException();
  }


  @Override
  public TypedFieldId getValueVectorId(SchemaPath path) {
    return outgoingContainer.getValueVectorId(path);
  }


  @Override
  public VectorWrapper<?> getValueAccessorById(Class<?> clazz, int... ids) {
    return outgoingContainer.getValueAccessorById(clazz, ids);
  }


  @Override
  public WritableBatch getWritableBatch() {
    return WritableBatch.get(this);
  }


  private boolean isSameSchemaAmongBatches(RecordBatchLoader[] batchLoaders) {
    Preconditions.checkArgument(batchLoaders.length > 0, "0 batch is not allowed!");


    BatchSchema schema = batchLoaders[0].getSchema();


    for (int i = 1; i < batchLoaders.length; i++) {
      if (!schema.equals(batchLoaders[i].getSchema())) {
        logger.error("Schemas are different. Schema 1 : " + schema + ", Schema 2: " + batchLoaders[i].getSchema() );
        return false;
      }
    }
    return true;
  }


  private void allocateOutgoing() {
    outgoingContainer.allocateNew();
  }


//  private boolean isOutgoingFull() {
//    return outgoingPosition == DEFAULT_ALLOC_RECORD_COUNT;
//  }


  /**
   * Creates a generate class which implements the copy and compare methods.
   *
   * @return instance of a new merger based on generated code
   * @throws SchemaChangeException
   */
  private MergingReceiverGeneratorBase createMerger() throws SchemaChangeException {


    try {
      CodeGenerator<MergingReceiverGeneratorBase> cg = CodeGenerator.get(MergingReceiverGeneratorBase.TEMPLATE_DEFINITION, context.getFunctionRegistry());
      ClassGenerator<MergingReceiverGeneratorBase> g = cg.getRoot();


      ExpandableHyperContainer batch = null;
      boolean first = true;
      for (RecordBatchLoader loader : batchLoaders) {
        if (first) {
          batch = new ExpandableHyperContainer(loader);
          first = false;
        } else {
          batch.addBatch(loader);
        }
      }


      generateComparisons(g, batch);


      g.setMappingSet(COPIER_MAPPING_SET);
      CopyUtil.generateCopies(g, batch, true);
      g.setMappingSet(MAIN_MAPPING);
      MergingReceiverGeneratorBase merger = context.getImplementationClass(cg);


      merger.doSetup(context, batch, outgoingContainer);
      return merger;
    } catch (ClassTransformationException | IOException e) {
      throw new SchemaChangeException(e);
    }
  }


  public final MappingSet MAIN_MAPPING = new MappingSet( (String) null, null, ClassGenerator.DEFAULT_SCALAR_MAP, ClassGenerator.DEFAULT_SCALAR_MAP);
  public final MappingSet LEFT_MAPPING = new MappingSet("leftIndex", null, ClassGenerator.DEFAULT_SCALAR_MAP, ClassGenerator.DEFAULT_SCALAR_MAP);
  public final MappingSet RIGHT_MAPPING = new MappingSet("rightIndex", null, ClassGenerator.DEFAULT_SCALAR_MAP, ClassGenerator.DEFAULT_SCALAR_MAP);
  GeneratorMapping COPIER_MAPPING = new GeneratorMapping("doSetup", "doCopy", null, null);
  public final MappingSet COPIER_MAPPING_SET = new MappingSet(COPIER_MAPPING, COPIER_MAPPING);


  private void generateComparisons(ClassGenerator g, VectorAccessible batch) throws SchemaChangeException {
    g.setMappingSet(MAIN_MAPPING);


    for(Ordering od : popConfig.getOrderings()){
      // first, we rewrite the evaluation stack for each side of the comparison.
      ErrorCollector collector = new ErrorCollectorImpl();
      final LogicalExpression expr = ExpressionTreeMaterializer.materialize(od.getExpr(), batch, collector,context.getFunctionRegistry());
      if(collector.hasErrors()) throw new SchemaChangeException("Failure while materializing expression. " + collector.toErrorString());
      g.setMappingSet(LEFT_MAPPING);
      HoldingContainer left = g.addExpr(expr, false);
      g.setMappingSet(RIGHT_MAPPING);
      HoldingContainer right = g.addExpr(expr, false);
      g.setMappingSet(MAIN_MAPPING);


      // next we wrap the two comparison sides and add the expression block for the comparison.
      LogicalExpression fh = FunctionGenerationHelper.getComparator(left, right, context.getFunctionRegistry());
      HoldingContainer out = g.addExpr(fh, false);
      JConditional jc = g.getEvalBlock()._if(out.getValue().ne(JExpr.lit(0)));


      if(od.getDirection() == Direction.ASCENDING){
        jc._then()._return(out.getValue());
      }else{
        jc._then()._return(out.getValue().minus());
      }
    }


    g.getEvalBlock()._return(JExpr.lit(0));
  }


  /**
   * Copy the record referenced by the supplied node to the next output position.
   * Side Effect: increments outgoing position if successful
   *
   * @param node Reference to the next record to copy from the incoming batches
   */
  private boolean copyRecordToOutgoingBatch(Node node) {
    int inIndex = (node.batchId << 16) + node.valueIndex;
    if (!merger.doCopy(inIndex, outgoingPosition)) {
      return false;
    } else {
      outgoingPosition++;
      return true;
    }
  }


  /**
   * A Node contains a reference to a single value in a specific incoming batch.  It is used
   * as a wrapper for the priority queue.
   */
  public class Node {
    public int batchId;      // incoming batch
    public int valueIndex;   // value within the batch
    Node(int batchId, int valueIndex) {
      this.batchId = batchId;
      this.valueIndex = valueIndex;
    }
  }


  @Override
  public void cleanup() {
    outgoingContainer.clear();
    if (batchLoaders != null) {
      for(RecordBatchLoader rbl : batchLoaders){
        if (rbl != null) {
          rbl.clear();
        }
      }
    }
    oContext.close();
    if (fragProviders != null) {
      for (RawFragmentBatchProvider f : fragProviders) {
        f.cleanup();
      }
    }
  }


}
Source Code of org.apache.drill.exec.physical.impl.mergereceiver.MergingRecordBatch$Node

Related Classes of org.apache.drill.exec.physical.impl.mergereceiver.MergingRecordBatch$Node