Source Code of org.apache.ctakes.core.cr.JdbcCollectionReader

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.core.cr;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.sql.Clob;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Types;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;


import org.apache.log4j.Logger;


import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;


import org.apache.ctakes.core.resource.FileResource;
import org.apache.ctakes.core.resource.JdbcConnectionResource;
import org.apache.ctakes.typesystem.type.structured.DocumentID;


/**
 * Collection Reader that pulls documents to be processed from a database.
 * 
 * @author Mayo Clinic
 */
public class JdbcCollectionReader extends CollectionReader_ImplBase
{


    // LOG4J logger based on class name
    private Logger logger = Logger.getLogger(getClass().getName());


    /**
     * SQL statement to retrieve the document.
     */
    public static final String PARAM_SQL = "SqlStatement";


    /**
     * Name of column from resultset that contains the document text. Supported
     * column types are CHAR, VARCHAR, and CLOB.
     */
    public static final String PARAM_DOCTEXT_COL = "DocTextColName";


    /**
     * Name of external resource for database connection.
     */
    public static final String PARAM_DB_CONN_RESRC = "DbConnResrcName";


    /**
     * Optional parameter. Specifies column names that will be used to form a
     * document ID.
     */
    public static final String PARAM_DOCID_COLS = "DocIdColNames";


    /**
     * Optional parameter. Specifies delimiter used when document ID is built.
     */
    public static final String PARAM_DOCID_DELIMITER = "DocIdDelimiter";


    /**
     * Optional parameter. Name of external resource for prepared statement
     * value file. Each line of this file represents prepared statement values
     * that will be used to substitute for the "?" placeholders. TAB character
     * \t is used to delimit the values on a single line. The prepared statement
     * will be called once for each line in this file.
     */
    public static final String PARAM_VALUE_FILE_RESRC = "ValueFileResrcName";


    private PreparedStatement queryPrepStmt;
    private ResultSet rs;


    private String docTextColName;
    private int docColType;
    private String docColTypeName;


    // optional, will remain null if not set
    private String[] docIdColNames = null;


    // default is underscore
    private String docIdDelimiter = "_";


    private int totalRowCount = 0;
    private int currRowCount = 0;


    // optional, will remain null if not set
    // Array of List objects. Each List objects represents a list of prepared
    // stmt values.
    private List<String>[] prepStmtValArr = null;
    private int prepStmtValArrIdx = 0;
    private boolean usePrepStmtVals = false;


    public void initialize() throws ResourceInitializationException
    {
        try
        {
            String sql = (String) getConfigParameterValue(PARAM_SQL);
            docTextColName = (String) getConfigParameterValue(PARAM_DOCTEXT_COL);
            String resrcName = (String) getConfigParameterValue(PARAM_DB_CONN_RESRC);
            JdbcConnectionResource resrc = (JdbcConnectionResource) getUimaContext().getResourceObject(resrcName);


            docIdColNames = (String[]) getConfigParameterValue(PARAM_DOCID_COLS);
            if (getConfigParameterValue(PARAM_DOCID_DELIMITER) != null)
            {
                docIdDelimiter = (String) getConfigParameterValue(PARAM_DOCID_DELIMITER);
            }


            Connection conn = resrc.getConnection();
            queryPrepStmt = conn.prepareStatement(sql);


            String fileResrcName = (String) getConfigParameterValue(PARAM_VALUE_FILE_RESRC);
            if ((fileResrcName != null) && (fileResrcName.trim().length() > 0))
            {
                FileResource fileResrc = (FileResource) getUimaContext().getResourceObject(fileResrcName);
                if (fileResrc != null)
                {
                    loadValueFile(fileResrc.getFile());
                    usePrepStmtVals = true;
                } else
                {
                    throw new Exception("Failed to get " + fileResrcName
                            + " from ResourceManager");
                }
            }


            totalRowCount = getRowCount(conn, sql);
        } catch (Exception e)
        {
            throw new ResourceInitializationException(e);
        }
    }


    /**
     * Loads the prepared statement value file.
     * 
     * @param valueFile
     * @throws IOException
     */
    private void loadValueFile(File valueFile) throws IOException
    {
        List<String> lineList = new ArrayList<String>();
        BufferedReader br = new BufferedReader(new FileReader(valueFile));
        String line = br.readLine();
        while (line != null && line.trim().length()>0)
        {
            lineList.add(line);
            line = br.readLine();
        }
        br.close();


        prepStmtValArr = new List[lineList.size()];
        for (int i = 0; i < lineList.size(); i++)
        {
            String currLine = (String) lineList.get(i);
            List<String> valList = new ArrayList<String>();
            StringTokenizer st = new StringTokenizer(currLine, "\t");
            while (st.hasMoreTokens())
            {
                String token = st.nextToken().trim();
                valList.add(token);
            }
            prepStmtValArr[i] = valList;
        }
        logger.info("Loaded " + lineList.size() + " lines from value file: "
                + valueFile.getAbsolutePath());
    }


    /**
     * Slice up the query SQL and rebuild a SQL statement that gets a row count;
     * 
     * @param querySql
     * @return
     */
    private int getRowCount(Connection conn, String querySql)
            throws SQLException
    {
        StringBuffer sb = new StringBuffer();
        sb.append("SELECT COUNT(*) ");
        int idx = querySql.indexOf("FROM");
        sb.append(querySql.subSequence(idx, querySql.length()));
        PreparedStatement cntStmt = conn.prepareStatement(sb.toString());


        if (usePrepStmtVals)
        {
            int totalCnt = 0;
            for (int i = 0; i < prepStmtValArr.length; i++)
            {
                List<String> valList = prepStmtValArr[i];
                setPrepStmtValues(cntStmt, valList);
                ResultSet rs = cntStmt.executeQuery();
                rs.next();
                totalCnt += rs.getInt(1);
            }
            return totalCnt;
        } else
        {
            ResultSet rs = cntStmt.executeQuery();
            rs.next();
            return rs.getInt(1);
        }
    }


    /**
     * Helper method that sets the prepared statement values.
     * 
     * @param prepStmt
     * @param valList
     */
    private void setPrepStmtValues(PreparedStatement prepStmt, List<String> valList)
            throws SQLException
    {
        prepStmt.clearParameters();
        for (int i = 0; i < valList.size(); i++)
        {
            Object valObj = valList.get(i);
            prepStmt.setObject(i + 1, valObj);
        }
    }


    /*
     * (non-Javadoc)
     * 
     * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
     */
    public void getNext(CAS cas) throws IOException, CollectionException
    {
        currRowCount++;
        try
        {
            // pull doc text from resultset
            String document = null;
            if ((docColType == Types.CHAR)
                    || (docColType == Types.VARCHAR))
            {
                document = rs.getString(docTextColName);
            } else if (docColType == Types.CLOB)
            {
                document = convertToString(rs.getClob(docTextColName));
            } else
            {
                throw new Exception("Unsupported document text column type: "
                        + docColTypeName);
            }


            try
            {
                // if there's a CAS Initializer, call it
                if (getCasInitializer() != null)
                {
                    Reader reader = new StringReader(document);
                    getCasInitializer().initializeCas(reader, cas);
                } else
                {
                    // No CAS Initiliazer, so set document text ourselves.
                    // put document in CAS (assume CAS)
                    cas.getJCas().setDocumentText(document);
                }


                DocumentID docIdAnnot = new DocumentID(cas
                        .getJCas());
                docIdAnnot.setDocumentID(getDocumentID(rs));
                docIdAnnot.addToIndexes();


                logger.info("Reading document with ID="
                        + docIdAnnot.getDocumentID());
            } catch (Exception e)
            {
                logger.error("CasInitializer failed to process document: ");
                logger.error(document);
                throw e;
            }
        } catch (Exception e)
        {
            throw new CollectionException(e);
        }
    }


    /**
     * Builds a document ID from one or more pieces of query data. If the query
     * data is not specified, the current row # is used.
     * 
     * @param rs
     * @return
     */
    private String getDocumentID(ResultSet rs) throws SQLException
    {
        if (docIdColNames != null)
        {
            StringBuffer sb = new StringBuffer();
            for (int i = 0; i < docIdColNames.length; i++)
            {
                String val = rs.getObject(docIdColNames[i]).toString();
                sb.append(val);
                if (i != (docIdColNames.length - 1))
                {
                    sb.append(docIdDelimiter);
                }
            }
            return sb.toString();
        } else
        {
            // default is to return row num
            return String.valueOf(currRowCount);
        }
    }


    /**
     * Loads the clob data into a String object.
     * 
     * @param clob
     * @return
     * @throws SQLException
     * @throws IOException
     */
    private String convertToString(Clob clob) throws SQLException, IOException
    {
        StringBuffer sb = new StringBuffer();
        BufferedReader br = new BufferedReader(clob.getCharacterStream());
        String line = br.readLine();
        while (line != null)
        {
            sb.append(line);
            sb.append('\n');
            line = br.readLine();
        }
        br.close();
        return sb.toString();
    }


    /*
     * (non-Javadoc)
     * 
     * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#hasNext()
     */
    public boolean hasNext() throws IOException, CollectionException
    {
        try
        {


            if (rs == null)
            {
                if (usePrepStmtVals)
                {
                    List<String> valList = prepStmtValArr[prepStmtValArrIdx];
                    setPrepStmtValues(queryPrepStmt, valList);
                    prepStmtValArrIdx++;
                }


                rs = queryPrepStmt.executeQuery();


                // TODO only needs to be done once
                ResultSetMetaData rsMetaData = rs.getMetaData();
                int colIdx = rs.findColumn(docTextColName);
                docColType = rsMetaData.getColumnType(colIdx);
                docColTypeName = rsMetaData.getColumnTypeName(1);
            }


            boolean hasAnotherRow = rs.next();
            if (hasAnotherRow == false)
            {
                // it's important to close ResultSets as they can accumlate
                // in the JVM heap. Too many open result sets can inadvertently
                // cause the DB conn to be closed by the server.
                rs.close();
            }


            if (usePrepStmtVals)
            {
                if ((hasAnotherRow == false)
                        && (prepStmtValArrIdx < prepStmtValArr.length))
                {
                    // the results for the previous prepared statement execution
                    // have been exhausted, so the statement needs to be
                    // executed with the next set of values


                    // reset the rs instance variable to NULL
                    rs = null;
                    // re-invoke the hasNext() method so the prepared
                    // statement gets executed again with the next set of values
                    return this.hasNext();
                }
            }


            return hasAnotherRow;
        } catch (Exception e)
        {
            throw new CollectionException(e);
        }
    }


    /*
     * (non-Javadoc)
     * 
     * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
     */
    public Progress[] getProgress()
    {
        Progress p = new ProgressImpl(currRowCount, totalRowCount,
                Progress.ENTITIES);
        return new Progress[] { p };
    }


    /*
     * (non-Javadoc)
     * 
     * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
     */
    public void close() throws IOException
    {
        try
        {
            queryPrepStmt.close();
        } catch (Exception e)
        {
            throw new IOException(e.getMessage());
        }
    }
}
Source Code of org.apache.ctakes.core.cr.JdbcCollectionReader

Related Classes of org.apache.ctakes.core.cr.JdbcCollectionReader