Package com.icentris.sql

Source Code of com.icentris.sql.BulkLoad

package com.icentris.sql;

import java.io.*;
import java.sql.*;
import java.util.*;

// Comment_next_line_to_compile_with_Java_1.3
import java.util.regex.Matcher;
// Comment_next_line_to_compile_with_Java_1.3
import java.util.regex.Pattern;
// Comment_next_line_to_compile_with_Java_1.3
//import java.util.regex.PatternSyntaxException;

// Uncomment_next_line_to_compile_with_Java_1.3
//import fake.java.util.regex.Matcher;
// Uncomment_next_line_to_compile_with_Java_1.3
//import fake.java.util.regex.Pattern;
// Uncomment_next_line_to_compile_with_Java_1.3
//import fake.java.util.regex.PatternSyntaxException;

import com.icentris.util.CharSequencer;
import com.icentris.util.DateUtil;
import com.icentris.util.StringUtil;

/** A class intended to accept an InputStream and pass each line through a
* regexp pattern to generate an INSERT statement then send that insert to the db. 
* Uses Statement.executeBatch() to increase efficiency.  You can specify the
* batch size with setBatchSize(), default is 5000.
*/
public class BulkLoad {
  // USER-DEFINED instance variables
  private Reader reader;
  private Connection connection;
  private int batchSize = 5000;
  private String pattern;
  private String insertSql;
  private HashMap<Integer,BulkLoadType> types = new HashMap<Integer,BulkLoadType>();
  private boolean trackNonMatchingLines = true;

  // GENERATED instance variables
  private int linesProcessed = -1;
  private int linesMatched   = -1;
  private String[] nonMatchingLines = null;
  private long dbTime;
  private long totalTime;

  /** The default BulkLoadType (used when any BulkLoadType
   * is null), handles java.sql.Types.VARCHAR (java.lang.String) types
   */
  public static final BulkLoadType VARCHAR =
    new BulkLoadType() {
      public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
        throws SQLException
      {
        ps.setString( pos, value );
      }
    };

  /** Handles java.sql.Types.DATE (java.sql.Date) types */
  public static final BulkLoadType DATE =
    new BulkLoadType() {
      public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
        throws SQLException
      {
        java.util.Date date = DateUtil.parseDate(value);
        if ( date == null ) {
          ps.setDate( pos, null );
        } else {
          ps.setDate( pos, new java.sql.Date(date.getTime()) );
        }
      }
    };

  /** Handles java.sql.Types.INTEGER (int) types */
  public static final BulkLoadType INTEGER =
    new BulkLoadType() {
      public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
        throws SQLException
      {
        ps.setInt( pos, Integer.parseInt(value) );
      }
    };

  /** Handles java.sql.Types.CLOB (int) types */
  public static final BulkLoadType CLOB =
    new BulkLoadType() {
      public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
        throws SQLException
      {
        ps.setCharacterStream( pos, new StringReader(value), value.length() );
      }
    };

  /** A placeholder type to use when a match is not to be set through a ? PreparedStatement placeholder. */
  public static final BulkLoadType SKIP_ME =
    new BulkLoadType() {
      public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
        throws SQLException
      {}
    };


  /** You must first call setConnection(), setReader(), setType(), and setInsertSql().
   * setType() is semi-optional, however, since any submatch not given a type will be assumed to be VARCHAR,
   * see {@link #setType} for a full discussion.
   *
   * <p>Similar to executeBatch() which optionally allows a driver to continue processing when
   * an error occurs, this class _always_ continues when errors occur, and only throws a
   * {@link BulkLoadException} after all processing is completed.  The thrown BulkLoadException
   * will be populated with updateCounts and BatchUpdateExceptions for all batches.
   *
   * @return an array of int[] arrays that were returned by each executeBatch() call
   * @throws BulkLoadException if any executeBatch() threw a BatchUpdateException,
   *  or none of the lines matched
   */
  public int[][] executeBatches() throws IOException, SQLException, BulkLoadException {
    if ( connection == null ) {
      throw new IllegalStateException("getConnection() returns null, did you forget to call setConnection() before execute()?");
    }
    if ( reader == null ) {
      throw new IllegalStateException("getReader() returns null, did you forget to call setReader() before execute?");
    }
    if ( insertSql == null ) {
      throw new IllegalStateException("getInsertSql() returns null, did you forget to call getInsertSql() before execute()?");
    }

    ArrayList<int[]> allUpdateCounts = new ArrayList<int[]>();
    Hashtable<Integer,BatchUpdateException> batchUpdateExceptions = new Hashtable<Integer,BatchUpdateException>();
    linesProcessed  = 0;
    linesMatched    = 0;
    totalTime  = 0;
    long totalStart = System.currentTimeMillis();
    dbTime          = 0;
    long dbStart     = -1;
    ArrayList<String> nonMatchingLinesList = new ArrayList<String>();

    if ( trackNonMatchingLines() == true ) {
      nonMatchingLines = new String[0];
    } else {
      nonMatchingLines = null;
    }

    BufferedReader br = new BufferedReader(reader);
    String line;

    Pattern re = Pattern.compile(pattern);

    boolean hasRegexReplace = (insertSql.indexOf("$") > -1);
    // split on ? (question marks) to find out how many there are in this sql statement
    int numQuestionMarks = StringUtil.split(insertSql, '?').length - 1;
    // do we have at least one question mark?
    boolean hasQuestionMark = (numQuestionMarks > 0);

    PreparedStatement ps = null;
    Statement statement = null;
    try {
      if (        hasRegexReplace == false && hasQuestionMark == true ) {
        ps = connection.prepareStatement( insertSql );
      } else if ( hasRegexReplace == true  && hasQuestionMark == true ) {
        // SLOWEST: a new ps will have to be created for each line
        // so this is expected to be the slowest option!
      } else if ( hasRegexReplace == true  && hasQuestionMark == false ) {
        statement = connection.createStatement();
      } else {
        throw new IllegalStateException("I expect either $ or ? in the insertSql!");
      }

      boolean hasBatch = false;
      for ( ; (line = br.readLine()) != null; linesProcessed++ ) {
        CharSequencer charSequence = new CharSequencer(line);
        Matcher matcher = re.matcher( charSequence );
        if ( matcher.matches() == true ) {
          linesMatched++;
          if ( hasRegexReplace == true ) {
            String sql = matcher.replaceFirst( getInsertSql() );
            if ( sql == null ) {
              throw new IOException("At least the first row should match pattern:(" + pattern + ")");
            }
            if ( statement != null ) {
              statement.addBatch( sql );
              hasBatch = true;
            } else {
              ps = connection.prepareStatement( sql );
            }
          }
          if ( hasQuestionMark == true ) {
            int numMatches = matcher.groupCount();
            if ( numMatches < types.size() ) {
              throw new IllegalStateException("You gave me " + types.size() + " types, but I only matched " +
                numMatches + " sub-groups in the regex.  I need at least as many sub-matches as I have types.");
            }
            if ( numMatches < numQuestionMarks ) {
              throw new IllegalStateException("Your statement has " + numQuestionMarks +
                " ? placeholders, but I only matched " + numMatches +
                " sub-groups in the regex.  I need at least as many sub-matches as I have ? placeholders.");
            }
            String[] matches = new String[numMatches];
            for ( int i=0; i < matches.length; i++ ) {
              matches[i] = matcher.group(i);
            }
            int paramCount = 1;
            for ( int j=1; j <= numMatches; j++ ) {
              BulkLoadType type = (BulkLoadType) types.get(new Integer(j));
              if ( type == null ) type = BulkLoad.VARCHAR;
              if ( ! type.equals(BulkLoad.SKIP_ME) ) {
                String submatch = matcher.group( j );
                type.parseAndSet( ps, paramCount, submatch, matches );
                paramCount++;
              }
            }
            // this does not take advantage of batching, thus is expected to be the slowest option
            // as noted above at SLOWEST:
            if ( hasRegexReplace == true ) {
              dbStart = System.currentTimeMillis();
              int[] updateCounts = new int[] { ps.executeUpdate() };
              dbTime += System.currentTimeMillis() - dbStart;
              dbStart = -1;
              allUpdateCounts.add( updateCounts );
            } else {
              ps.addBatch();
              //ps.clearParameters();
              hasBatch = true;
            }
          }
        } else if ( trackNonMatchingLines() == true ) {
          nonMatchingLinesList.add( line );
        }
        // is it time to send a batch?
        if ( ((linesProcessed + 1) % batchSize) == 0 &&
             hasBatch == true )
        {
          try {
            dbStart = System.currentTimeMillis();
            if ( statement != null ) {
              int[] updateCounts = statement.executeBatch();
              allUpdateCounts.add( updateCounts );
              statement.clearBatch();
            } else if ( ps != null && hasRegexReplace == false ) {
              int[] updateCounts = ps.executeBatch();
              allUpdateCounts.add( updateCounts );
              ps.clearBatch();
            }
            dbTime += System.currentTimeMillis() - dbStart;
            dbStart = -1;
          } catch (BatchUpdateException e) {
            if ( dbStart > -1 ) dbTime += System.currentTimeMillis() - dbStart;
            int[] updateCounts = e.getUpdateCounts();
            allUpdateCounts.add( updateCounts );
            int batchNumber = allUpdateCounts.size();
            batchUpdateExceptions.put( new Integer(batchNumber), e);
          }
          hasBatch = false;
        }
      }
      // we're all done, send over anything we haven't yet!
      if ( hasBatch == true ) {
        try {
          dbStart = System.currentTimeMillis();
          if ( statement != null ) {
            int[] updateCounts = statement.executeBatch();
            allUpdateCounts.add( updateCounts );
            statement.clearBatch();
          } else if ( ps != null && hasRegexReplace == false ) {
            int[] updateCounts = ps.executeBatch();
            allUpdateCounts.add( updateCounts );
            ps.clearBatch();
          }
          dbTime += System.currentTimeMillis() - dbStart;
          dbStart = -1;
        } catch (BatchUpdateException e) {
          if ( dbStart > -1 ) dbTime += System.currentTimeMillis() - dbStart;
          int[] updateCounts = e.getUpdateCounts();
          allUpdateCounts.add( updateCounts );
          int batchNumber = allUpdateCounts.size();
          batchUpdateExceptions.put( new Integer(batchNumber), e);
        }
      }
    } finally {
      totalTime = System.currentTimeMillis() - totalStart;
      if ( ps != null        ) try { ps.close();        } catch (Exception e) {}
      if ( statement != null ) try { statement.close(); } catch (Exception e) {}
    }

    if ( trackNonMatchingLines() == true ) {
      nonMatchingLines = (String[]) nonMatchingLinesList.toArray(new String[0]);
    }
    int[][] returnVal = (int[][]) allUpdateCounts.toArray(new int[0][]);
    totalTime = System.currentTimeMillis() - totalStart;
    if ( batchUpdateExceptions.size() > 0 || linesMatched < linesProcessed ) {
      throw new BulkLoadException( returnVal, batchUpdateExceptions, linesProcessed, linesMatched, nonMatchingLines, totalTime, dbTime );
    } else {
      return returnVal;
    }
  }

  public String toString() {
    StringBuffer sb = new StringBuffer();
    sb.append("Lines Processed=[" + linesProcessed + "], ");
    sb.append("Lines Matched=[" + linesMatched + "], ");
    sb.append("DB Time=[" + dbTime + "], ");
    sb.append("Total Time=[" + totalTime + "]");
    return sb.toString();
  }

  /** Clears out info set by executeBatches() method
   * (e.g. linesProcessed, linesMatched, nonMatchingLines)
   */
  public void clear() {
    linesProcessed   = -1;
    linesMatched     = -1;
    nonMatchingLines = null;
    totalTime   = -1;
    dbTime           = -1;
  }

  public int getLinesProcessed() {
    return linesProcessed;
  }

  public int getLinesMatched() {
    return linesMatched;
  }

  /** An array of lines that didn't match the pattern.  This is not
   * tracked if trackNonMatchingLines is false, and will thus return null.
   */
  public String[] getNonMatchingLines() {
    return nonMatchingLines;
  }

  public long getDbTime() {
    return dbTime;
  }

  public long getTotalTime() {
    return totalTime;
  }

  public Reader getReader() {
    return reader;
  }

  public void setReader(Reader reader) {
    this.reader = reader;
  }

  public Connection getConnection() {
    return connection;
  }

  public void setConnection(Connection connection) {
    this.connection = connection;
  }

  public int getBatchSize() {
    return batchSize;
  }

  public void setBatchSize(int batchSize) {
    this.batchSize = batchSize;
  }

  public String getPattern() {
    return pattern;
  }

  /** The pattern to look for on each line of the input. */
  public void setPattern(String pattern) {
    this.pattern = pattern;
  }

  public String getInsertSql() {
    return insertSql;
  }

  /** The INSERT INTO statement with ? where the submatches
   * should be in the insert statement.
   */
  public void setInsertSql(String insertSql) {
    this.insertSql = insertSql;
  }

  /** @see #setType */
  public HashMap getTypes() {
    return (HashMap) types.clone();
  }

  /** @see #setType */
  public BulkLoadType getType(int submatchPos) {
    return (BulkLoadType) types.get( new Integer(submatchPos) );
  }

  /** bulkLoadTypes give the information needed for a PreparedStatement.
   * Each bulkLoadType can be one of:
   * <ol>
   *   <li><code>null</code> the ? in that position is a VARCHAR (or java.lang.String)
   *   <li>one of the standard BulkLoadType's pre-specified as fields
   *    of this class, (e.g. {@link #VARCHAR} is used where there are <code>null</code>'s)
   *   <li>your own implementation of a {@link BulkLoadType}.
   *   <li>{@link #SKIP_ME} if you have a submatch that does not have a matching ? in the PreparedStatement
   * </ol>
   *
   * @param submatchPos corresponds to the submatch number, not the ? number
   * @param bulkLoadType is the type to use for translating this submatch into the correct type for this ?
   *  placeholder in the PreparedStatement
   */
  public void setType(int submatchPos, BulkLoadType bulkLoadType) {
    types.put( new Integer(submatchPos), bulkLoadType );
  }

  /** @see #setTrackNonMatchingLines */
  public boolean trackNonMatchingLines() {
    return trackNonMatchingLines;
  }

  /** Default: true, tracks lines that didn't match the pattern.  In cases
   * where things are working, this should be very few lines.  If you know
   * there will be many lines not matching, you may want to set this to false
   * to save resources used in tracking.
   */
  public void setTrackNonMatchingLines(boolean trackNonMatchingLines) {
    this.trackNonMatchingLines = trackNonMatchingLines;
  }
}
TOP

Related Classes of com.icentris.sql.BulkLoad

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.