package com.icentris.sql;
import java.io.*;
import java.sql.*;
import java.util.*;
// Comment_next_line_to_compile_with_Java_1.3
import java.util.regex.Matcher;
// Comment_next_line_to_compile_with_Java_1.3
import java.util.regex.Pattern;
// Comment_next_line_to_compile_with_Java_1.3
//import java.util.regex.PatternSyntaxException;
// Uncomment_next_line_to_compile_with_Java_1.3
//import fake.java.util.regex.Matcher;
// Uncomment_next_line_to_compile_with_Java_1.3
//import fake.java.util.regex.Pattern;
// Uncomment_next_line_to_compile_with_Java_1.3
//import fake.java.util.regex.PatternSyntaxException;
import com.icentris.util.CharSequencer;
import com.icentris.util.DateUtil;
import com.icentris.util.StringUtil;
/** A class intended to accept an InputStream and pass each line through a
* regexp pattern to generate an INSERT statement then send that insert to the db.
* Uses Statement.executeBatch() to increase efficiency. You can specify the
* batch size with setBatchSize(), default is 5000.
*/
public class BulkLoad {
// USER-DEFINED instance variables
private Reader reader;
private Connection connection;
private int batchSize = 5000;
private String pattern;
private String insertSql;
private HashMap<Integer,BulkLoadType> types = new HashMap<Integer,BulkLoadType>();
private boolean trackNonMatchingLines = true;
// GENERATED instance variables
private int linesProcessed = -1;
private int linesMatched = -1;
private String[] nonMatchingLines = null;
private long dbTime;
private long totalTime;
/** The default BulkLoadType (used when any BulkLoadType
* is null), handles java.sql.Types.VARCHAR (java.lang.String) types
*/
public static final BulkLoadType VARCHAR =
new BulkLoadType() {
public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
throws SQLException
{
ps.setString( pos, value );
}
};
/** Handles java.sql.Types.DATE (java.sql.Date) types */
public static final BulkLoadType DATE =
new BulkLoadType() {
public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
throws SQLException
{
java.util.Date date = DateUtil.parseDate(value);
if ( date == null ) {
ps.setDate( pos, null );
} else {
ps.setDate( pos, new java.sql.Date(date.getTime()) );
}
}
};
/** Handles java.sql.Types.INTEGER (int) types */
public static final BulkLoadType INTEGER =
new BulkLoadType() {
public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
throws SQLException
{
ps.setInt( pos, Integer.parseInt(value) );
}
};
/** Handles java.sql.Types.CLOB (int) types */
public static final BulkLoadType CLOB =
new BulkLoadType() {
public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
throws SQLException
{
ps.setCharacterStream( pos, new StringReader(value), value.length() );
}
};
/** A placeholder type to use when a match is not to be set through a ? PreparedStatement placeholder. */
public static final BulkLoadType SKIP_ME =
new BulkLoadType() {
public void parseAndSet(PreparedStatement ps, int pos, String value, String[] allValues)
throws SQLException
{}
};
/** You must first call setConnection(), setReader(), setType(), and setInsertSql().
* setType() is semi-optional, however, since any submatch not given a type will be assumed to be VARCHAR,
* see {@link #setType} for a full discussion.
*
* <p>Similar to executeBatch() which optionally allows a driver to continue processing when
* an error occurs, this class _always_ continues when errors occur, and only throws a
* {@link BulkLoadException} after all processing is completed. The thrown BulkLoadException
* will be populated with updateCounts and BatchUpdateExceptions for all batches.
*
* @return an array of int[] arrays that were returned by each executeBatch() call
* @throws BulkLoadException if any executeBatch() threw a BatchUpdateException,
* or none of the lines matched
*/
public int[][] executeBatches() throws IOException, SQLException, BulkLoadException {
if ( connection == null ) {
throw new IllegalStateException("getConnection() returns null, did you forget to call setConnection() before execute()?");
}
if ( reader == null ) {
throw new IllegalStateException("getReader() returns null, did you forget to call setReader() before execute?");
}
if ( insertSql == null ) {
throw new IllegalStateException("getInsertSql() returns null, did you forget to call getInsertSql() before execute()?");
}
ArrayList<int[]> allUpdateCounts = new ArrayList<int[]>();
Hashtable<Integer,BatchUpdateException> batchUpdateExceptions = new Hashtable<Integer,BatchUpdateException>();
linesProcessed = 0;
linesMatched = 0;
totalTime = 0;
long totalStart = System.currentTimeMillis();
dbTime = 0;
long dbStart = -1;
ArrayList<String> nonMatchingLinesList = new ArrayList<String>();
if ( trackNonMatchingLines() == true ) {
nonMatchingLines = new String[0];
} else {
nonMatchingLines = null;
}
BufferedReader br = new BufferedReader(reader);
String line;
Pattern re = Pattern.compile(pattern);
boolean hasRegexReplace = (insertSql.indexOf("$") > -1);
// split on ? (question marks) to find out how many there are in this sql statement
int numQuestionMarks = StringUtil.split(insertSql, '?').length - 1;
// do we have at least one question mark?
boolean hasQuestionMark = (numQuestionMarks > 0);
PreparedStatement ps = null;
Statement statement = null;
try {
if ( hasRegexReplace == false && hasQuestionMark == true ) {
ps = connection.prepareStatement( insertSql );
} else if ( hasRegexReplace == true && hasQuestionMark == true ) {
// SLOWEST: a new ps will have to be created for each line
// so this is expected to be the slowest option!
} else if ( hasRegexReplace == true && hasQuestionMark == false ) {
statement = connection.createStatement();
} else {
throw new IllegalStateException("I expect either $ or ? in the insertSql!");
}
boolean hasBatch = false;
for ( ; (line = br.readLine()) != null; linesProcessed++ ) {
CharSequencer charSequence = new CharSequencer(line);
Matcher matcher = re.matcher( charSequence );
if ( matcher.matches() == true ) {
linesMatched++;
if ( hasRegexReplace == true ) {
String sql = matcher.replaceFirst( getInsertSql() );
if ( sql == null ) {
throw new IOException("At least the first row should match pattern:(" + pattern + ")");
}
if ( statement != null ) {
statement.addBatch( sql );
hasBatch = true;
} else {
ps = connection.prepareStatement( sql );
}
}
if ( hasQuestionMark == true ) {
int numMatches = matcher.groupCount();
if ( numMatches < types.size() ) {
throw new IllegalStateException("You gave me " + types.size() + " types, but I only matched " +
numMatches + " sub-groups in the regex. I need at least as many sub-matches as I have types.");
}
if ( numMatches < numQuestionMarks ) {
throw new IllegalStateException("Your statement has " + numQuestionMarks +
" ? placeholders, but I only matched " + numMatches +
" sub-groups in the regex. I need at least as many sub-matches as I have ? placeholders.");
}
String[] matches = new String[numMatches];
for ( int i=0; i < matches.length; i++ ) {
matches[i] = matcher.group(i);
}
int paramCount = 1;
for ( int j=1; j <= numMatches; j++ ) {
BulkLoadType type = (BulkLoadType) types.get(new Integer(j));
if ( type == null ) type = BulkLoad.VARCHAR;
if ( ! type.equals(BulkLoad.SKIP_ME) ) {
String submatch = matcher.group( j );
type.parseAndSet( ps, paramCount, submatch, matches );
paramCount++;
}
}
// this does not take advantage of batching, thus is expected to be the slowest option
// as noted above at SLOWEST:
if ( hasRegexReplace == true ) {
dbStart = System.currentTimeMillis();
int[] updateCounts = new int[] { ps.executeUpdate() };
dbTime += System.currentTimeMillis() - dbStart;
dbStart = -1;
allUpdateCounts.add( updateCounts );
} else {
ps.addBatch();
//ps.clearParameters();
hasBatch = true;
}
}
} else if ( trackNonMatchingLines() == true ) {
nonMatchingLinesList.add( line );
}
// is it time to send a batch?
if ( ((linesProcessed + 1) % batchSize) == 0 &&
hasBatch == true )
{
try {
dbStart = System.currentTimeMillis();
if ( statement != null ) {
int[] updateCounts = statement.executeBatch();
allUpdateCounts.add( updateCounts );
statement.clearBatch();
} else if ( ps != null && hasRegexReplace == false ) {
int[] updateCounts = ps.executeBatch();
allUpdateCounts.add( updateCounts );
ps.clearBatch();
}
dbTime += System.currentTimeMillis() - dbStart;
dbStart = -1;
} catch (BatchUpdateException e) {
if ( dbStart > -1 ) dbTime += System.currentTimeMillis() - dbStart;
int[] updateCounts = e.getUpdateCounts();
allUpdateCounts.add( updateCounts );
int batchNumber = allUpdateCounts.size();
batchUpdateExceptions.put( new Integer(batchNumber), e);
}
hasBatch = false;
}
}
// we're all done, send over anything we haven't yet!
if ( hasBatch == true ) {
try {
dbStart = System.currentTimeMillis();
if ( statement != null ) {
int[] updateCounts = statement.executeBatch();
allUpdateCounts.add( updateCounts );
statement.clearBatch();
} else if ( ps != null && hasRegexReplace == false ) {
int[] updateCounts = ps.executeBatch();
allUpdateCounts.add( updateCounts );
ps.clearBatch();
}
dbTime += System.currentTimeMillis() - dbStart;
dbStart = -1;
} catch (BatchUpdateException e) {
if ( dbStart > -1 ) dbTime += System.currentTimeMillis() - dbStart;
int[] updateCounts = e.getUpdateCounts();
allUpdateCounts.add( updateCounts );
int batchNumber = allUpdateCounts.size();
batchUpdateExceptions.put( new Integer(batchNumber), e);
}
}
} finally {
totalTime = System.currentTimeMillis() - totalStart;
if ( ps != null ) try { ps.close(); } catch (Exception e) {}
if ( statement != null ) try { statement.close(); } catch (Exception e) {}
}
if ( trackNonMatchingLines() == true ) {
nonMatchingLines = (String[]) nonMatchingLinesList.toArray(new String[0]);
}
int[][] returnVal = (int[][]) allUpdateCounts.toArray(new int[0][]);
totalTime = System.currentTimeMillis() - totalStart;
if ( batchUpdateExceptions.size() > 0 || linesMatched < linesProcessed ) {
throw new BulkLoadException( returnVal, batchUpdateExceptions, linesProcessed, linesMatched, nonMatchingLines, totalTime, dbTime );
} else {
return returnVal;
}
}
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append("Lines Processed=[" + linesProcessed + "], ");
sb.append("Lines Matched=[" + linesMatched + "], ");
sb.append("DB Time=[" + dbTime + "], ");
sb.append("Total Time=[" + totalTime + "]");
return sb.toString();
}
/** Clears out info set by executeBatches() method
* (e.g. linesProcessed, linesMatched, nonMatchingLines)
*/
public void clear() {
linesProcessed = -1;
linesMatched = -1;
nonMatchingLines = null;
totalTime = -1;
dbTime = -1;
}
public int getLinesProcessed() {
return linesProcessed;
}
public int getLinesMatched() {
return linesMatched;
}
/** An array of lines that didn't match the pattern. This is not
* tracked if trackNonMatchingLines is false, and will thus return null.
*/
public String[] getNonMatchingLines() {
return nonMatchingLines;
}
public long getDbTime() {
return dbTime;
}
public long getTotalTime() {
return totalTime;
}
public Reader getReader() {
return reader;
}
public void setReader(Reader reader) {
this.reader = reader;
}
public Connection getConnection() {
return connection;
}
public void setConnection(Connection connection) {
this.connection = connection;
}
public int getBatchSize() {
return batchSize;
}
public void setBatchSize(int batchSize) {
this.batchSize = batchSize;
}
public String getPattern() {
return pattern;
}
/** The pattern to look for on each line of the input. */
public void setPattern(String pattern) {
this.pattern = pattern;
}
public String getInsertSql() {
return insertSql;
}
/** The INSERT INTO statement with ? where the submatches
* should be in the insert statement.
*/
public void setInsertSql(String insertSql) {
this.insertSql = insertSql;
}
/** @see #setType */
public HashMap getTypes() {
return (HashMap) types.clone();
}
/** @see #setType */
public BulkLoadType getType(int submatchPos) {
return (BulkLoadType) types.get( new Integer(submatchPos) );
}
/** bulkLoadTypes give the information needed for a PreparedStatement.
* Each bulkLoadType can be one of:
* <ol>
* <li><code>null</code> the ? in that position is a VARCHAR (or java.lang.String)
* <li>one of the standard BulkLoadType's pre-specified as fields
* of this class, (e.g. {@link #VARCHAR} is used where there are <code>null</code>'s)
* <li>your own implementation of a {@link BulkLoadType}.
* <li>{@link #SKIP_ME} if you have a submatch that does not have a matching ? in the PreparedStatement
* </ol>
*
* @param submatchPos corresponds to the submatch number, not the ? number
* @param bulkLoadType is the type to use for translating this submatch into the correct type for this ?
* placeholder in the PreparedStatement
*/
public void setType(int submatchPos, BulkLoadType bulkLoadType) {
types.put( new Integer(submatchPos), bulkLoadType );
}
/** @see #setTrackNonMatchingLines */
public boolean trackNonMatchingLines() {
return trackNonMatchingLines;
}
/** Default: true, tracks lines that didn't match the pattern. In cases
* where things are working, this should be very few lines. If you know
* there will be many lines not matching, you may want to set this to false
* to save resources used in tracking.
*/
public void setTrackNonMatchingLines(boolean trackNonMatchingLines) {
this.trackNonMatchingLines = trackNonMatchingLines;
}
}