Package com.ikanow.infinit.e.harvest.extraction.document.database

Source Code of com.ikanow.infinit.e.harvest.extraction.document.database.NoSqlHarvester

/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.harvest.extraction.document.database;

import java.io.IOException;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.log4j.Logger;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.CommitType;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.harvest.HarvestContext;
import com.ikanow.infinit.e.harvest.extraction.document.HarvesterInterface;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager;
import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.ikanow.infinit.e.harvest.utils.TextEncryption;

//TODO going to use Kundera (may not bother including JARs, force user to do that)

/**
* NoSqlHarvester
* Used to harvest data from NoSQL data stores (currently mirrors the RDBMS harvester logic)
* @author cmorgan
*/
public class NoSqlHarvester implements HarvesterInterface
{

  // Private class variables
  private RdbmsManager rdbms = null;
  private PropertiesManager properties = null;
  private String jdbcUrl = null;
  private List<DocumentPojo> docsToAdd = null;
  @SuppressWarnings("unused")
  private List<DocumentPojo> docsToUpdate = null;
  private List<DocumentPojo> docsToDelete = null;
  private Set<Integer> sourceTypesCanHarvest = new HashSet<Integer>();
  private int maxDocsPerCycle = Integer.MAX_VALUE;
 
  private HarvestContext _context;

  // Initialize the Logger
  private static final Logger logger = Logger.getLogger(NoSqlHarvester.class);
 
  /**
   * Default Constructor
   */
  public NoSqlHarvester()
  {     
    sourceTypesCanHarvest.add(InfiniteEnums.DATABASE);
    PropertiesManager pm = new PropertiesManager();
    maxDocsPerCycle = pm.getMaxDocsPerSource();
  }
 
  /**
   * executeHarvest
   * Execute the harvest against a single source object
   * @param context
   * @param source
   * @param toAdd
   * @param toUpdate
   * @param toRemove
   */
  public void executeHarvest(HarvestContext context, SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove)
  {
    _context = context;
    docsToAdd = toAdd;
    docsToUpdate = toUpdate;
    docsToDelete = toRemove;

    if (_context.isStandalone()) {
      maxDocsPerCycle = _context.getStandaloneMaxDocs();
    }
    try
    {
      processDatabase(source);
    }
    catch (Exception e)
    {
      _context.getHarvestStatus().update(source, new Date(), HarvestEnum.error, "Error when harvesting DB: " + e.getMessage(), false, false);
      logger.error("Exception Message: " + e.getMessage(), e);
    }
  }
 
  /**
   * processDatabase
   * Connect to the database referenced in the SourcePojo and execute sql query to fetch
   * a result set, pass result set to add records to this.docs
   * @param source
   * @throws ClassNotFoundException
   * @throws SQLException
   * @throws IOException
   */
  private void processDatabase(SourcePojo source) throws ClassNotFoundException, SQLException, IOException
  {
    // Set up properties and RDBMS Manager
    properties = new PropertiesManager();
    rdbms = new RdbmsManager();

    // Get the type of database to access from the source object
    String dt = source.getDatabaseConfig().getDatabaseType();
   
    // Create the jdbcUrl connection string for the manager
    jdbcUrl = rdbms.getConnectionString(dt, 
        source.getDatabaseConfig().getHostname(), source.getDatabaseConfig().getPort(),
        source.getDatabaseConfig().getDatabaseName());
    rdbms.setUrl(jdbcUrl);
   
    // Set the class name to load for JDBC, i.e. com.mysql.jdbc.Driver for MySql
    rdbms.setClassName(properties.getJdbcClass(source.getDatabaseConfig().getDatabaseType()));

    // If authentication is required to access the database set the user/pass here
    if ( source.getAuthentication() != null )
    {
      String userName = source.getAuthentication().getUsername();
      String password = source.getAuthentication().getPassword();
     
      if (userName != null && userName.length() > 0) rdbms.setUser(userName);
      if (password != null && password.length() > 0) rdbms.setPassword(new TextEncryption().decrypt(password));
    }
   
    // Some horrible logic to handle a DB cycle being >max records
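    // (The harvest status message encodes resume state as "query:<N>." or "deltaquery:<N>.",
    //  where <N> is the number of records already ingested in a truncated cycle,
    //  e.g. "query:5000." - illustrative value)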
    int nNumRecordsToSkip = 0;
    SourceHarvestStatusPojo lastHarvestInfo = source.getHarvestStatus();
    if (null != lastHarvestInfo) { // either in delta query mode, or truncation mode of initial query
      if (null != lastHarvestInfo.getHarvest_message()) {
        if (lastHarvestInfo.getHarvest_message().startsWith("query:")) { // Initial query
          try {
            int nEndOfNum = lastHarvestInfo.getHarvest_message().indexOf('.', 6);
            if (nEndOfNum > 0) {
              nNumRecordsToSkip = Integer.parseInt(lastHarvestInfo.getHarvest_message().substring(6, nEndOfNum));
            }
          }
          catch (Exception e) {} // Do nothing, just default to 0
          lastHarvestInfo = null; // (go through initial query branch below)
        }
        else { // Delta query or error!
          if (lastHarvestInfo.getHarvest_message().startsWith("deltaquery:")) { // Delta query, phew
            try {
              int nEndOfNum = lastHarvestInfo.getHarvest_message().indexOf('.', 11);
              if (nEndOfNum > 0) {
                nNumRecordsToSkip = Integer.parseInt(lastHarvestInfo.getHarvest_message().substring(11, nEndOfNum));
              }
            }
            catch (Exception e) {} // Do nothing, just default to 0             
          }         
        }
      }
    }//TESTED ("query:" by eye, "deltaquery:" implicitly as cut and paste) //TOTEST with "." and with other fields added
   
    // If the query has been performed before run delta and delete queries
    if (lastHarvestInfo != null)
    {
      String deltaQuery = null;
      String deleteQuery = null;
      String deltaDate = null;
     
      // Get the date last harvested from the harvest object and then convert the date
      // to the proper format for the database being queried
      if (source.getHarvestStatus().getHarvested() != null)
      {
        deltaDate = getDatabaseDateString(dt, source.getHarvestStatus().getHarvested());
      }
           
      // Delta Query - get new data that has appeared in our source since the harvest last run
      // Important Note: The query should be against a "last modified" field not publishedDate unless they are
      // equal to ensure new records get added properly and not ignored by the harvester
      if ((source.getDatabaseConfig().getDeltaQuery() != null) && !source.getDatabaseConfig().getDeltaQuery().isEmpty())
      {
        deltaQuery = source.getDatabaseConfig().getDeltaQuery();
        // Replace '?' in the delta query with the date value retrieved via getHarvested()
        if ((source.getHarvestStatus().getHarvested() != null) && (deltaQuery.contains("?")))
        {
          deltaQuery = deltaQuery.replace("?", "'" + deltaDate + "'").toString();
        }
      }
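      // Illustrative example (assumed query): a deltaQuery of
      //   "SELECT * FROM news WHERE last_modified > ?"
      // would become "SELECT * FROM news WHERE last_modified > '2012-06-01 13:00:00'" on MySQL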
      // Skip logic (guard against a null deltaQuery when none is configured):
      if ((deltaQuery != null) && deltaQuery.contains("!SKIP!")) {
        deltaQuery = deltaQuery.replace("!SKIP!", Integer.toString(nNumRecordsToSkip));
      }
      else {
        rdbms.setRecordsToSkip(nNumRecordsToSkip);
      }
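      // e.g. (illustrative, MySQL syntax) "SELECT * FROM news LIMIT !SKIP!, 1000" resumed
      // after 5000 records becomes "SELECT * FROM news LIMIT 5000, 1000"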
     
      // Delete docs from the index if they have been deleted from the source
      // database. Designed to check a "deleted" table to get records that have been
      // deleted to identify the records to remove from the index.
      if ((source.getDatabaseConfig().getDeleteQuery() != null) && !source.getDatabaseConfig().getDeleteQuery().isEmpty())
      {
        deleteQuery = source.getDatabaseConfig().getDeleteQuery();
        // Replace '?' in the delete query with the date value retrieved via getHarvested()
        if ((source.getHarvestStatus().getHarvested() != null) && (deleteQuery.contains("?")))
        {
          deleteQuery = deleteQuery.replace("?", "'" + deltaDate + "'").toString();
        }
        rdbms.setQuery(deleteQuery, deltaDate);
        rdbms.executeQuery();
        deleteRecords(rdbms.getResultSet(), source);
      }
     
      // Set the rdbms query = deltaQuery
      if ((deltaQuery != null) && (deltaDate != null))
      {
        rdbms.setQuery(deltaQuery, deltaDate);
      }
      else
      {
        rdbms.setQuery(source.getDatabaseConfig().getQuery());
      }
    }
    else // Very first import (though might be in horrible initial query mode)
    {
      String query = source.getDatabaseConfig().getQuery();
      if (query.contains("!SKIP!")) {
        query = query.replace("!SKIP!", Integer.toString(nNumRecordsToSkip));
      }
      else {
        rdbms.setRecordsToSkip(nNumRecordsToSkip);
      }
      // Set rdbms query = default source.database.query value
      rdbms.setQuery(query);
    }
   
    // Execute the specified query
    String sErr = rdbms.executeQuery();
   
    if (null != sErr) {
      _context.getHarvestStatus().update(source, new Date(), HarvestEnum.error, "Error when harvesting DB: " + sErr, false, false);
    }
    else {
      // Build the list of docs using the result set from the query
      boolean bTruncated = addRecords(rdbms.getResultSet(), rdbms.getMetaData(), source);
     
      // Update source document with harvest success message
      String truncationMode = null;
      if (bTruncated) {
        source.setReachedMaxDocs();
        if (null == lastHarvestInfo) { // Was an initial import
          truncationMode = new StringBuffer("query:").append(nNumRecordsToSkip + this.docsToAdd.size()).append('.').toString();
        }
        else { // Was a delta query
          truncationMode = new StringBuffer("deltaquery:").append(nNumRecordsToSkip + this.docsToAdd.size()).append('.').toString();         
        }
      }//TESTED (by hand in truncation and non-truncation cases)
      _context.getHarvestStatus().update(source, new Date(), HarvestEnum.in_progress, truncationMode, false, false);
    }     
    // Close the connection
    try {
      rdbms.closeConnection();
      rdbms = null;
    }
    catch (Exception e) {
      // Do nothing, we've already updated the harvest
    }
  }
 

  /**
   * addRecords
   * Add records from the query's result set to the list of documents to add
   *
   * @param rs
   * @param md
   * @param source
   * @return true if the result set was truncated (ie more records remain), false otherwise
   */
  private boolean addRecords(ResultSet rs, ResultSetMetaData md, SourcePojo source)
  { 
    LinkedList<String> duplicateSources = new LinkedList<String>();    
      try
      {
        int nAdded = 0;
        while (rs.next() && (nAdded < this.maxDocsPerCycle))
        {
        /**
         * Check for existing doc by checking the doc.url field which will contain
         * the following information for a database record:
         * source.url + primary key fields. See example below
         * jdbc:mysql://<IP ADDRESS>:3306/washingtondc/987863
         */
        try
        {
          DuplicateManager qr = _context.getDuplicateManager();
         
            String primaryKey = null;
            if (null != source.getDatabaseConfig().getPrimaryKey()) {
              primaryKey = rs.getString(source.getDatabaseConfig().getPrimaryKey());
            }
            if (null == primaryKey) { // Just pick something unique, to avoid URL collisions
              primaryKey = new ObjectId().toString();
            }
            String docUrl = source.getUrl() + "/" + primaryKey;
           
          // Check to see if the record has already been added
          // If it has been added then we need to update it with the new information
          if (!qr.isDuplicate_Url(docUrl, source, duplicateSources))
          {
            nAdded++;
            DocumentPojo newDoc = createDoc(CommitType.insert, rs, md, source, docUrl);
            if (!duplicateSources.isEmpty()) {
              newDoc.setDuplicateFrom(duplicateSources.getFirst());
            }
            this.docsToAdd.add(newDoc);
          }
          else {
            //TODO (INF-1300): update, I guess need to check if the record has changed?
            // If not, do nothing; if so,
            //newOrUpdatedDoc.setId(qr.getLastDuplicateId()); // (set _id to doc we're going to replace)
            //this.docsToUpdate.add(newOrUpdatedDoc);
          }         
        }
        catch (Exception e)
        {
          logger.error("Error on record: Exception Message: " + e.getMessage(), e);
          _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage("Error on record: ", e).toString(), true);
        }        
        }//end loop over records
       
        return rs.next(); // (ie true if there are more records...)
      }
      catch (Exception e)
      {
        logger.error("Exception Message: " + e.getMessage(), e);
        _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage("Error on addRecords: ", e).toString(), true);
      }
      return false;
  }
 
 
  /**
   * deleteRecords
   * @param rs
   * @param source
   */
  private void deleteRecords(ResultSet rs, SourcePojo source)
  {
      try
      {
      while(rs.next())
      {
        // Set up the primary variables
          String primaryKey = rs.getString(source.getDatabaseConfig().getPrimaryKey());
          String docUrl = source.getUrl() + "/" + primaryKey;
         
          // Get our system id from the record
          DocumentPojo docToRemove = new DocumentPojo();
          docToRemove.setUrl(docUrl);
          docsToDelete.add(docToRemove);
      }
    }
      catch (SQLException e)
      {
      logger.error("Error on delete: Exception Message: " + e.getMessage(), e);
      _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage("Error on delete: ", e).toString(), true);
    }
  }
 

 
  /**
   * createDoc
   * @param commitType
   * @param rs
   * @param md
   * @param source
   * @param docUrl
   * @return the newly created DocumentPojo (null on error)
   */
  private DocumentPojo createDoc(CommitType commitType, ResultSet rs, ResultSetMetaData md,
      SourcePojo source, String docUrl)
  {
    // Set up Feed object to be used to contain information
    DocumentPojo doc = null;
 
    try
    {
      // Check to see if the commit type is an insert or update
      if (commitType == CommitType.insert)
      {
        // create the doc pojo
        doc = new DocumentPojo();
        doc.setUrl(docUrl);
        doc.setCreated(new Date());
      }
      else
      {
        //TODO (INF-1300): support for updated docs (will want to save old creation time, zap everything else?)
      }
     
      doc.setModified(new Date());
     
      // Strip out html if it is present
      if (null != source.getDatabaseConfig().getTitle()) {
        if (rs.getString(source.getDatabaseConfig().getTitle()) != null)
        {
          doc.setTitle(rs.getString(source.getDatabaseConfig().getTitle()).replaceAll("\\<.*?\\>", ""));
        }
      }
     
      if (null != source.getDatabaseConfig().getSnippet()) {
        if (rs.getString(source.getDatabaseConfig().getSnippet()) != null)
        {
          doc.setDescription(rs.getString(source.getDatabaseConfig().getSnippet()).replaceAll("\\<.*?\\>", ""));
        }
      }

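      // Use the configured published-date column when present and populated;
      // otherwise fall back to the harvest time so every document has a published date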
      if (null != source.getDatabaseConfig().getPublishedDate()) {
        if (rs.getString(source.getDatabaseConfig().getPublishedDate()) != null)
        {
          Object d = null;
          try
          {
            Object o = rs.getDate(source.getDatabaseConfig().getPublishedDate());
            d = convertJdbcTypes(null, o);
          }
          catch (Exception e)
          {
            d = new Date();
          }
          doc.setPublishedDate((Date) d);
        }
        else
        {
          doc.setPublishedDate(new Date());
        }
      }
      else
      {
        doc.setPublishedDate(new Date());
      }
     
      // Create a list of metadata to be added to the doc
      for ( int i = 1; i <= md.getColumnCount(); i++ )
      {
        String column = md.getColumnLabel(i);
        Object value = rs.getObject(i);
       
        // Convert value to standard Java type from JDBC type if needed
        value = convertJdbcTypes(column, value);
       
        if (  (column != null) && (value != null) )
        {
          if (!source.getDatabaseConfig().getPreserveCase()) {
            column = column.toLowerCase();
          }
          doc.addToMetadata(column, value);
        }
      }
    }
    catch (SQLException e)
    {
      logger.error("Error on add: Exception Message: " + e.getMessage(), e);
      _context.getHarvestStatus().logMessage(HarvestExceptionUtils.createExceptionMessage("Error on add: ", e).toString(), true);
    }
   
    return doc;
  }
 

  /**
   * convertJdbcTypes
   *
   * @param columnName
   * @param o
   * @return the converted value (null if o was null)
   */
  private static Object convertJdbcTypes(String columnName, Object o)
  {
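    // Normalize java.sql temporal types (Timestamp/Date/Time) to java.util.Date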
    if (o instanceof java.sql.Timestamp)
    {
      java.sql.Timestamp t = (java.sql.Timestamp)o;
      Date d = new Date(t.getTime());
      o = d;
    }
    else if (o instanceof java.sql.Date)
    {
      java.sql.Date t = (java.sql.Date)o;
      Date d = new Date(t.getTime());
      o = d;
    }
    else if (o instanceof java.sql.Time)
    {
      java.sql.Time t = (java.sql.Time)o;
      Date d = new Date(t.getTime());
      o = d;
    }
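    // Convention (inferred from the check below): column labels containing "__" keep their
    // converted Java type, eg an assumed "myfield__date" naming style; all other columns
    // are flattened to strings before being stored as metadata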
    if ((null == columnName) || (columnName.indexOf("__") > 0)) { // (ie has a __suffix)
      return o;
    }
    else { // If type not explicitly specified then treat as string
      return (null != o) ? o.toString() : null;
    }
  }
 
 
  /**
   * getDatabaseDateString
   * Convert the date (d) to a format suitable for the database
   * being accessed using SimpleDateFormat.format(d)
   * @param   dt       - InfiniteEnums.DatabaseType
   * @param   d       - Date
   * @return          - String
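   * e.g. (illustrative) getDatabaseDateString("mysql", d) returns "2012-06-01 13:00:00"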
   */
  private static String getDatabaseDateString(String dt, Date d)
  {
    if (dt.equalsIgnoreCase("mysql")) {
      // yyyy-MM-dd HH:mm:ss
      return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(d);     
    }
    else if (dt.equalsIgnoreCase("oracle")) {
      // dd-MMM-yyyy HH:mm:ss
      return new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss").format(d);     
    }
    else {
      java.sql.Timestamp t = new Timestamp(d.getTime());
      return t.toString();     
    }
  }
 


  @Override
  public boolean canHarvestType(int sourceType)
  {
    return sourceTypesCanHarvest.contains(sourceType);
  }
}
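
A minimal driver sketch, based only on the public methods above; the HarvestContext and configured SourcePojo are assumed to be supplied by the wider harvester framework:

  // Illustrative usage only - "context" and "source" come from the framework
  List<DocumentPojo> toAdd = new LinkedList<DocumentPojo>();
  List<DocumentPojo> toUpdate = new LinkedList<DocumentPojo>();
  List<DocumentPojo> toRemove = new LinkedList<DocumentPojo>();

  NoSqlHarvester harvester = new NoSqlHarvester();
  if (harvester.canHarvestType(InfiniteEnums.DATABASE)) {
    harvester.executeHarvest(context, source, toAdd, toUpdate, toRemove);
    // toAdd and toRemove are now populated; toUpdate is currently unused (see TODO INF-1300)
  }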