Package cascading.tap.partition

Source Code of cascading.tap.partition.BasePartitionTap$PartitionScheme

/*
* Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading.tap.partition;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tap.type.FileType;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterableChainIterator;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.TupleEntrySchemeCollector;
import cascading.tuple.TupleEntrySchemeIterator;
import cascading.tuple.util.TupleViews;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
*
*/
public abstract class BasePartitionTap<Config, Input, Output> extends Tap<Config, Input, Output>
  {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger( BasePartitionTap.class );
  /** Field OPEN_FILES_THRESHOLD_DEFAULT */
  protected static final int OPEN_WRITES_THRESHOLD_DEFAULT = 300;

  private class PartitionIterator extends TupleEntryIterableChainIterator
    {
    public PartitionIterator( final FlowProcess<? extends Config> flowProcess, Input input ) throws IOException
      {
      super( getSourceFields() );

      List<Iterator<Tuple>> iterators = new ArrayList<Iterator<Tuple>>();

      if( input != null )
        {
        String identifier = parent.getFullIdentifier( flowProcess );
        iterators.add( createPartitionEntryIterator( flowProcess, input, identifier, getCurrentIdentifier( flowProcess ) ) );
        }
      else
        {
        String[] childIdentifiers = getChildPartitionIdentifiers( flowProcess, false );

        for( String childIdentifier : childIdentifiers )
          iterators.add( createPartitionEntryIterator( flowProcess, null, parent.getIdentifier(), childIdentifier ) );
        }

      reset( iterators );
      }

    private PartitionTupleEntryIterator createPartitionEntryIterator( FlowProcess<? extends Config> flowProcess, Input input, String parentIdentifier, String childIdentifier ) throws IOException
      {
      TupleEntrySchemeIterator schemeIterator = createTupleEntrySchemeIterator( flowProcess, parent, childIdentifier, input );

      return new PartitionTupleEntryIterator( getSourceFields(), partition, parentIdentifier, childIdentifier, schemeIterator );
      }
    }

  public class PartitionCollector extends TupleEntryCollector
    {
    private final FlowProcess<? extends Config> flowProcess;
    private final Config conf;
    private final Fields parentFields;
    private final Fields partitionFields;
    private TupleEntry partitionEntry;
    private final Tuple partitionTuple;
    private final Tuple parentTuple;

    public PartitionCollector( FlowProcess<? extends Config> flowProcess )
      {
      super( Fields.asDeclaration( getSinkFields() ) );
      this.flowProcess = flowProcess;
      this.conf = flowProcess.getConfigCopy();
      this.parentFields = parent.getSinkFields();
      this.partitionFields = ( (PartitionScheme) getScheme() ).partitionFields;
      this.partitionEntry = new TupleEntry( this.partitionFields );

      this.partitionTuple = TupleViews.createNarrow( getSinkFields().getPos( this.partitionFields ) );
      this.parentTuple = TupleViews.createNarrow( getSinkFields().getPos( this.parentFields ) );

      this.partitionEntry.setTuple( partitionTuple );
      }

    TupleEntryCollector getCollector( String path )
      {
      TupleEntryCollector collector = collectors.get( path );

      if( collector != null )
        return collector;

      try
        {
        LOG.debug( "creating collector for parent: {}, path: {}", parent.getFullIdentifier( conf ), path );

        collector = createTupleEntrySchemeCollector( flowProcess, parent, path, openedCollectors );

        openedCollectors++;
        flowProcess.increment( Counters.Paths_Opened, 1 );
        }
      catch( IOException exception )
        {
        throw new TapException( "unable to open partition path: " + path, exception );
        }

      if( collectors.size() > openWritesThreshold )
        purgeCollectors();

      collectors.put( path, collector );

      if( LOG.isInfoEnabled() && collectors.size() % 100 == 0 )
        LOG.info( "caching {} open Taps", collectors.size() );

      return collector;
      }

    private void purgeCollectors()
      {
      int numToClose = Math.max( 1, (int) ( openWritesThreshold * .10 ) );

      if( LOG.isInfoEnabled() )
        LOG.info( "removing {} open Taps from cache of size {}", numToClose, collectors.size() );

      Set<String> removeKeys = new HashSet<String>();
      Set<String> keys = collectors.keySet();

      for( String key : keys )
        {
        if( numToClose-- == 0 )
          break;

        removeKeys.add( key );
        }

      for( String removeKey : removeKeys )
        {
        closeCollector( removeKey );
        collectors.remove( removeKey );
        }

      flowProcess.increment( Counters.Path_Purges, 1 );
      }

    @Override
    public void close()
      {
      super.close();

      try
        {
        for( String path : new ArrayList<String>( collectors.keySet() ) )
          closeCollector( path );
        }
      finally
        {
        collectors.clear();
        }
      }

    public void closeCollector( String path )
      {
      TupleEntryCollector collector = collectors.get( path );
      if( collector == null )
        return;
      try
        {
        collector.close();

        flowProcess.increment( Counters.Paths_Closed, 1 );
        }
      catch( Exception exception )
        {
        LOG.error( "exception while closing TupleEntryCollector {}", path, exception );

        boolean failOnError = false;
        Object failProperty = flowProcess.getProperty( PartitionTapProps.FAIL_ON_CLOSE );

        if( failProperty != null )
          failOnError = Boolean.parseBoolean( failProperty.toString() );

        if( failOnError )
          throw new TapException( exception );
        }
      }

    protected void collect( TupleEntry tupleEntry ) throws IOException
      {
      // reset select views
      TupleViews.reset( partitionTuple, tupleEntry.getTuple() ); // partitionTuple is inside partitionEntry
      TupleViews.reset( parentTuple, tupleEntry.getTuple() );

      String path = partition.toPartition( partitionEntry );

      getCollector( path ).add( parentTuple );
      }
    }

  /** Field parent */
  protected Tap parent;
  /** Field partition */
  protected Partition partition;
  /** Field keepParentOnDelete */
  protected boolean keepParentOnDelete = false;
  /** Field openTapsThreshold */
  protected int openWritesThreshold = OPEN_WRITES_THRESHOLD_DEFAULT;

  /** Field openedCollectors */
  private long openedCollectors = 0;
  /** Field collectors */
  private final Map<String, TupleEntryCollector> collectors = new LinkedHashMap<String, TupleEntryCollector>( 1000, .75f, true );

  protected abstract TupleEntrySchemeCollector createTupleEntrySchemeCollector( FlowProcess<? extends Config> flowProcess, Tap parent, String path, long sequence ) throws IOException;

  protected abstract TupleEntrySchemeIterator createTupleEntrySchemeIterator( FlowProcess<? extends Config> flowProcess, Tap parent, String path, Input input ) throws IOException;

  public enum Counters
    {
      Paths_Opened, Paths_Closed, Path_Purges
    }

  protected BasePartitionTap( Tap parent, Partition partition, int openWritesThreshold )
    {
    super( new PartitionScheme( parent.getScheme(), partition.getPartitionFields() ), parent.getSinkMode() );
    this.parent = parent;
    this.partition = partition;
    this.openWritesThreshold = openWritesThreshold;
    }

  protected BasePartitionTap( Tap parent, Partition partition, SinkMode sinkMode )
    {
    super( new PartitionScheme( parent.getScheme(), partition.getPartitionFields() ), sinkMode );
    this.parent = parent;
    this.partition = partition;
    }

  protected BasePartitionTap( Tap parent, Partition partition, SinkMode sinkMode, boolean keepParentOnDelete, int openWritesThreshold )
    {
    super( new PartitionScheme( parent.getScheme(), partition.getPartitionFields() ), sinkMode );
    this.parent = parent;
    this.partition = partition;
    this.keepParentOnDelete = keepParentOnDelete;
    this.openWritesThreshold = openWritesThreshold;
    }

  /**
   * Method getParent returns the parent Tap of this PartitionTap object.
   *
   * @return the parent (type Tap) of this PartitionTap object.
   */
  public Tap getParent()
    {
    return parent;
    }

  /**
   * Method getPartition returns the {@link Partition} instance used by this PartitionTap
   *
   * @return the partition instance
   */
  public Partition getPartition()
    {
    return partition;
    }

  /**
   * Method getChildPartitionIdentifiers returns an array of all identifiers for all available partitions.
   * <p/>
   * This method is used internally to set all incoming paths, override to limit applicable partitions.
   * <p/>
   * Note the returns array may be large.
   *
   * @param flowProcess    of type FlowProcess
   * @param fullyQualified of type boolean
   * @return a String[] of partition identifiers
   * @throws IOException
   */
  public String[] getChildPartitionIdentifiers( FlowProcess<? extends Config> flowProcess, boolean fullyQualified ) throws IOException
    {
    return ( (FileType) parent ).getChildIdentifiers( flowProcess.getConfigCopy(), partition.getPathDepth(), fullyQualified );
    }

  @Override
  public String getIdentifier()
    {
    return parent.getIdentifier();
    }

  protected abstract String getCurrentIdentifier( FlowProcess<? extends Config> flowProcess );

  /**
   * Method getOpenWritesThreshold returns the openTapsThreshold of this PartitionTap object.
   *
   * @return the openTapsThreshold (type int) of this PartitionTap object.
   */
  public int getOpenWritesThreshold()
    {
    return openWritesThreshold;
    }

  @Override
  public TupleEntryCollector openForWrite( FlowProcess<? extends Config> flowProcess, Output output ) throws IOException
    {
    return new PartitionCollector( flowProcess );
    }

  @Override
  public TupleEntryIterator openForRead( FlowProcess<? extends Config> flowProcess, Input input ) throws IOException
    {
    return new PartitionIterator( flowProcess, input );
    }

  @Override
  public boolean createResource( Config conf ) throws IOException
    {
    return parent.createResource( conf );
    }

  @Override
  public boolean deleteResource( Config conf ) throws IOException
    {
    return keepParentOnDelete || parent.deleteResource( conf );
    }

  @Override
  public boolean commitResource( Config conf ) throws IOException
    {
    return parent.commitResource( conf );
    }

  @Override
  public boolean rollbackResource( Config conf ) throws IOException
    {
    return parent.rollbackResource( conf );
    }

  @Override
  public boolean resourceExists( Config conf ) throws IOException
    {
    return parent.resourceExists( conf );
    }

  @Override
  public long getModifiedTime( Config conf ) throws IOException
    {
    return parent.getModifiedTime( conf );
    }

  @Override
  public boolean equals( Object object )
    {
    if( this == object )
      return true;
    if( object == null || getClass() != object.getClass() )
      return false;
    if( !super.equals( object ) )
      return false;

    BasePartitionTap that = (BasePartitionTap) object;

    if( parent != null ? !parent.equals( that.parent ) : that.parent != null )
      return false;
    if( partition != null ? !partition.equals( that.partition ) : that.partition != null )
      return false;

    return true;
    }

  @Override
  public int hashCode()
    {
    int result = super.hashCode();
    result = 31 * result + ( parent != null ? parent.hashCode() : 0 );
    result = 31 * result + ( partition != null ? partition.hashCode() : 0 );
    return result;
    }

  @Override
  public String toString()
    {
    return getClass().getSimpleName() + "[\"" + parent + "\"]" + "[\"" + partition + "\"]";
    }

  public static class PartitionScheme<Config, Input, Output> extends Scheme<Config, Input, Output, Void, Void>
    {
    private final Scheme scheme;
    private final Fields partitionFields;

    public PartitionScheme( Scheme scheme )
      {
      this.scheme = scheme;
      this.partitionFields = null;
      }

    public PartitionScheme( Scheme scheme, Fields partitionFields )
      {
      this.scheme = scheme;

      if( partitionFields == null || partitionFields.isAll() )
        this.partitionFields = null;
      else if( partitionFields.isDefined() )
        this.partitionFields = partitionFields;
      else
        throw new IllegalArgumentException( "partitionFields must be defined or the ALL substitution, got: " + partitionFields.printVerbose() );
      }

    public Fields getSinkFields()
      {
      if( partitionFields == null || scheme.getSinkFields().isAll() )
        return scheme.getSinkFields();

      return Fields.merge( scheme.getSinkFields(), partitionFields );
      }

    public void setSinkFields( Fields sinkFields )
      {
      scheme.setSinkFields( sinkFields );
      }

    public Fields getSourceFields()
      {
      if( partitionFields == null || scheme.getSourceFields().isUnknown() )
        return scheme.getSourceFields();

      return Fields.merge( scheme.getSourceFields(), partitionFields );
      }

    public void setSourceFields( Fields sourceFields )
      {
      scheme.setSourceFields( sourceFields );
      }

    public int getNumSinkParts()
      {
      return scheme.getNumSinkParts();
      }

    public void setNumSinkParts( int numSinkParts )
      {
      scheme.setNumSinkParts( numSinkParts );
      }

    @Override
    public void sourceConfInit( FlowProcess<? extends Config> flowProcess, Tap<Config, Input, Output> tap, Config conf )
      {
      scheme.sourceConfInit( flowProcess, tap, conf );
      }

    @Override
    public void sourcePrepare( FlowProcess<? extends Config> flowProcess, SourceCall<Void, Input> sourceCall ) throws IOException
      {
      scheme.sourcePrepare( flowProcess, sourceCall );
      }

    @Override
    public boolean source( FlowProcess<? extends Config> flowProcess, SourceCall<Void, Input> sourceCall ) throws IOException
      {
      throw new UnsupportedOperationException( "should never be called" );
      }

    @Override
    public void sourceCleanup( FlowProcess<? extends Config> flowProcess, SourceCall<Void, Input> sourceCall ) throws IOException
      {
      scheme.sourceCleanup( flowProcess, sourceCall );
      }

    @Override
    public void sinkConfInit( FlowProcess<? extends Config> flowProcess, Tap<Config, Input, Output> tap, Config conf )
      {
      scheme.sinkConfInit( flowProcess, tap, conf );
      }

    @Override
    public void sinkPrepare( FlowProcess<? extends Config> flowProcess, SinkCall<Void, Output> sinkCall ) throws IOException
      {
      scheme.sinkPrepare( flowProcess, sinkCall );
      }

    @Override
    public void sink( FlowProcess<? extends Config> flowProcess, SinkCall<Void, Output> sinkCall ) throws IOException
      {
      throw new UnsupportedOperationException( "should never be called" );
      }

    @Override
    public void sinkCleanup( FlowProcess<? extends Config> flowProcess, SinkCall<Void, Output> sinkCall ) throws IOException
      {
      scheme.sinkCleanup( flowProcess, sinkCall );
      }
    }
  }
TOP

Related Classes of cascading.tap.partition.BasePartitionTap$PartitionScheme

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.
div>