Source Code of com.ikanow.infinit.e.harvest.HarvestController

/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
/**
*
*/
package com.ikanow.infinit.e.harvest;

import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;
import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDocumentLevelException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorDailyLimitExceededException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelMajorException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.ExtractorSourceLevelTransientException;
import com.ikanow.infinit.e.data_model.InfiniteEnums.HarvestEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.EntityExtractorEnum;
import com.ikanow.infinit.e.data_model.interfaces.harvest.IEntityExtractor;
import com.ikanow.infinit.e.data_model.interfaces.harvest.ITextExtractor;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.social.sharing.SharePojo;
import com.ikanow.infinit.e.data_model.store.social.sharing.SharePojo.ShareCommunityPojo;
import com.ikanow.infinit.e.data_model.utils.GeoOntologyMapping;
import com.ikanow.infinit.e.data_model.utils.IkanowSecurityManager;
import com.ikanow.infinit.e.data_model.utils.TrustManagerManipulator;
import com.ikanow.infinit.e.harvest.enrichment.custom.StructuredAnalysisHarvester;
import com.ikanow.infinit.e.harvest.enrichment.custom.UnstructuredAnalysisHarvester;
import com.ikanow.infinit.e.harvest.enrichment.legacy.TextRankExtractor;
import com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.ExtractorAlchemyAPI;
import com.ikanow.infinit.e.harvest.enrichment.legacy.alchemyapi.ExtractorAlchemyAPI_Metadata;
import com.ikanow.infinit.e.harvest.enrichment.legacy.opencalais.ExtractorOpenCalais;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Integrated;
import com.ikanow.infinit.e.harvest.extraction.document.DuplicateManager_Standalone;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus_Integrated;
import com.ikanow.infinit.e.harvest.extraction.document.HarvestStatus_Standalone;
import com.ikanow.infinit.e.harvest.extraction.document.HarvesterInterface;
import com.ikanow.infinit.e.harvest.extraction.document.database.DatabaseHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.file.FileHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.logstash.LogstashHarvester;
import com.ikanow.infinit.e.harvest.extraction.document.rss.FeedHarvester;
import com.ikanow.infinit.e.harvest.extraction.text.boilerpipe.TextExtractorBoilerpipe;
import com.ikanow.infinit.e.harvest.extraction.text.legacy.TextExtractorTika;
import com.ikanow.infinit.e.harvest.utils.AuthUtils;
import com.ikanow.infinit.e.harvest.utils.HarvestExceptionUtils;
import com.ikanow.infinit.e.harvest.utils.PropertiesManager;
import com.mongodb.BasicDBObject;
import com.mongodb.gridfs.GridFSDBFile;

/**
* @author cmorgan
*
* Used to process all incoming sources in the system
*/
public class HarvestController implements HarvestContext
{
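  // ------------------------------------------------------------------------------------------------------------
  // Illustrative usage sketch (not part of the original source): a standalone/test caller might drive the
  // controller roughly as follows. Obtaining the SourcePojo (eg from the API or the config store) is assumed to
  // happen elsewhere.
  //
  //   HarvestController hc = new HarvestController(true); // (true == enable all harvester types, test mode)
  //   hc.setStandaloneMode(10);                           // cap at 10 docs, standalone dedup/status handling
  //   List<DocumentPojo> toAdd = new LinkedList<DocumentPojo>();
  //   List<DocumentPojo> toUpdate = new LinkedList<DocumentPojo>();
  //   List<DocumentPojo> toRemove = new LinkedList<DocumentPojo>();
  //   hc.harvestSource(source, toAdd, toUpdate, toRemove);
  //   HarvestController.logHarvesterStats();
  // ------------------------------------------------------------------------------------------------------------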
  private HarvestControllerPipeline procPipeline = null;
  private IkanowSecurityManager _securityManager = null;
  public IkanowSecurityManager getSecurityManager() { return _securityManager; }
 
  private PropertiesManager pm = new PropertiesManager();
  private IEntityExtractor default_entity_extractor = null;
  private ITextExtractor default_text_extractor = null;
  private ArrayList<HarvesterInterface> harvesters = new ArrayList<HarvesterInterface>();
  private static Set<String> urlsThatError = new TreeSet<String>();
  private static final Logger logger = Logger.getLogger(HarvestController.class);

  private HashMap<String, IEntityExtractor> entity_extractor_mappings = null;
  private HashMap<String, ITextExtractor> text_extractor_mappings = null;
  private HashSet<String> failedDynamicExtractors = null;
  private static HashMap<String, Class<?> > dynamicExtractorClassCache = null;

  private int _nMaxDocs = Integer.MAX_VALUE;
  private DuplicateManager _duplicateManager = new DuplicateManager_Integrated();
  private HarvestStatus _harvestStatus = new HarvestStatus_Integrated(); // (can either be standalone or integrated, defaults to integrated)
  public DuplicateManager getDuplicateManager() { return _duplicateManager; }
  public HarvestStatus getHarvestStatus() { return _harvestStatus; }
  boolean _bIsStandalone = false;
  public boolean isStandalone() { return _bIsStandalone; }
  public void setStandaloneMode(int nMaxDocs) {
    setStandaloneMode(nMaxDocs, false); // (by default don't dedup, however you may want to test updates)
  }
  public void setStandaloneMode(int nMaxDocs, boolean bRealDedup) {
    _bIsStandalone = true;
    urlsThatError.clear(); // (for api testing, obviously don't want to stop trying if we get an error)
    if (nMaxDocs >= 0) {
      _nMaxDocs = nMaxDocs;
    }
    if (!bRealDedup) {
      _duplicateManager = new DuplicateManager_Standalone();
    }
    _harvestStatus = new HarvestStatus_Standalone();

    if (null != dynamicExtractorClassCache) { // (standalone so don't cache extractors)
      dynamicExtractorClassCache.clear();
    }   
  }
  public int getStandaloneMaxDocs() {
    return _nMaxDocs;
  }
  private long nBetweenFeedDocs_ms = 10000; // (default 10s)

  //statistics variables
  private static AtomicInteger num_sources_harvested = new AtomicInteger(0);
  private static AtomicInteger num_docs_extracted = new AtomicInteger(0);
  private static AtomicInteger num_errors_source = new AtomicInteger(0);
  private static AtomicInteger num_error_url = new AtomicInteger(0);
  private static AtomicInteger num_error_px = new AtomicInteger(0);
  private static AtomicInteger num_ent_extracted = new AtomicInteger(0);
  private static AtomicInteger num_event_extracted = new AtomicInteger(0);

  private int nUrlErrorsThisSource = 0;

  /**
   * Used to find out whether the harvest of the source's information was successful
   * @return
   */
  public boolean isSuccessful() {
    return true;
  }

  // Handle clean shutdown of harvester
  private static boolean bIsKilled = false;
  public static void killHarvester() { bIsKilled = true; }
  public static boolean isHarvestKilled() { return bIsKilled; }

  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  // TOP LEVEL LOGICAL

  // Utility objects for loading custom text and entity extractors across all threads just once
  @SuppressWarnings("rawtypes")
  private static HashMap<String, Class> customExtractors = null;
  private static ClassLoader customExtractorClassLoader = HarvestController.class.getClassLoader();

  /**
   *  Constructor for Harvest Controller class
   * 
   * @throws IOException
   */
  public HarvestController() throws IOException { this(false); }
 
  private static boolean _initializedSSL = false;
 
  @SuppressWarnings("rawtypes")
  public HarvestController(boolean overrideTypeSettings) throws IOException
  {
    if (!_initializedSSL) {
      _initializedSSL = true;
      try {
        // Ensure we don't have any self-signed cert debacles:
        TrustManagerManipulator.allowAllSSL();   
      }
      finally {}
    }
   
    PropertiesManager props = new PropertiesManager();
    String sTypes = props.getHarvesterTypes();
    if (overrideTypeSettings) { // (override API settings in test mode)
      sTypes = "Feed,File,Database,Logstash";
    }
    String sType[] = sTypes.split("\\s*,\\s*");

   
    // Add a harvester for each data type
    for (String s: sType) {
      if (s.equalsIgnoreCase("database")) {
        try {
          this.harvesters.add(new DatabaseHarvester());
        }
        catch (Exception e) {
          logger.error(s + " not supported: " + e.getMessage());
        }
        catch(NoClassDefFoundError e) {
          logger.error(s + " not supported: " + e.getMessage());
        }       
      }
      else if (s.equalsIgnoreCase("logstash")) {
        try {
          this.harvesters.add(new LogstashHarvester());
        }
        catch (Exception e) {
          logger.error(s + " not supported: " + e.getMessage());
        }
        catch(NoClassDefFoundError e) {
          logger.error(s + " not supported: " + e.getMessage());
        }               
      }
      else if (s.equalsIgnoreCase("file")) {

        // According to http://www.ryanchapin.com/fv-b-4-648/java-lang-OutOfMemoryError--unable-to-create-new-native-thread-Exception-When-Using-SmbFileInputStream.html
        // this is needed to avoid java.lang.OutOfMemoryError (intermittent - for me at least, it's happened for exactly 1 source, but consistently when it does)
        System.setProperty("jcifs.resolveOrder", "DNS");
        System.setProperty("jcifs.smb.client.dfs.disabled", "true");

        try {
          this.harvesters.add(new FileHarvester());
        }
        catch (Exception e) {
          logger.error(s + " not supported: " + e.getMessage());
        }
        catch(NoClassDefFoundError e) {
          logger.error(s + " not supported: " + e.getMessage());
        }       
      }
      else if (s.equalsIgnoreCase("feed")) {
        try {
          this.harvesters.add(new FeedHarvester());
        }
        catch (Exception e) {
          logger.error(s + " not supported: " + e.getMessage());
        }
        catch(NoClassDefFoundError e) {
          logger.error(s + " not supported: " + e.getMessage());
        }       
      }
    }

    // Load all the extractors, set up defaults
    entity_extractor_mappings = new HashMap<String, IEntityExtractor>();
    text_extractor_mappings = new HashMap<String, ITextExtractor>();

    // Load custom text/entity extractors
    synchronized (HarvestController.class) {
      if (null == customExtractors) {
        customExtractors = new HashMap<String, Class>();
        customExtractorClassLoader = HarvestController.class.getClassLoader();
      }
      // Text extractors:
      String customTextList = props.getCustomTextExtractors();
      if (null != customTextList) {
        String customTextArray[] = customTextList.split("\\s*,\\s*");
        for (String customText: customTextArray) {
          if (!customExtractors.containsKey(customText)) {
            // (else already have this extractor)
            try {
              Class customTextExtractor = customExtractorClassLoader.loadClass(customText);
              ITextExtractor obj = (ITextExtractor)customTextExtractor.newInstance();
              text_extractor_mappings.put(obj.getName().toLowerCase(), obj);
              customExtractors.put(customText, customTextExtractor);
            }
            catch (Exception e) {
              logger.error("ITextExtractor: Couldn't load " + customText +": " + e.getMessage(), e);
            }
            catch(NoClassDefFoundError e) {
              logger.error("ITextExtractor: Couldn't load " + customText +": " + e.getMessage(), e);
            }       
          }       
          else { // Already loaded, put in again
            try {
              Class customTextExtractor = customExtractors.get(customText);
              ITextExtractor obj = (ITextExtractor)customTextExtractor.newInstance();
              text_extractor_mappings.put(obj.getName().toLowerCase(), obj);           
            }
            catch (Exception e) {
              logger.error("ITextExtractor: Couldn't use already loaded " + customText +": " + e.getMessage(), e);
            }
            catch(NoClassDefFoundError e) {
              logger.error("ITextExtractor: Couldn't use already loaded " + customText +": " + e.getMessage(), e);
            }       
          }
        }
      }//TESTED
      // Entity extractors
      String customEntityList = props.getCustomEntityExtractors();
      if (null != customEntityList) {
        String customEntityArray[] = customEntityList.split("\\s*,\\s*");
        for (String customEntity: customEntityArray) {
          if (!customExtractors.containsKey(customEntity)) {
            // (else already have this extractor - but may have it for text, so some work to do)
            try {
              Class customEntityExtractor = customExtractorClassLoader.loadClass(customEntity);
              IEntityExtractor obj = (IEntityExtractor)customEntityExtractor.newInstance();
              entity_extractor_mappings.put(obj.getName().toLowerCase(), obj);
              customExtractors.put(customEntity, customEntityExtractor);
            }
            catch (Exception e) {
              logger.error("IEntityExtractor: Couldn't load " + customEntity +": " + e.getMessage(), e);
            }
            catch(NoClassDefFoundError e) {
              logger.error("IEntityExtractor: Couldn't load " + customEntity +": " + e.getMessage(), e);
            }       
          }
          else { // If this object exists and if it's a text extractor, then see if it's also an entity extractor
            try {
              Class customEntityExtractor = customExtractors.get(customEntity);           
              IEntityExtractor obj = (IEntityExtractor)customEntityExtractor.newInstance();
              entity_extractor_mappings.put(obj.getName().toLowerCase(), obj);
            }
            catch (Exception e) {
              logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity +": " + e.getMessage(), e);           
            }
            catch(NoClassDefFoundError e) {
              logger.error("IEntityExtractor: Couldn't use already loaded " + customEntity +": " + e.getMessage(), e);           
            }       
          }
        }
      }//TESTED
    }

    try {
      entity_extractor_mappings.put("opencalais", new ExtractorOpenCalais());
    }
    catch (Exception e) {
      logger.warn("Can't use OpenCalais as entity extractor: " + e.getMessage());     
    }
    try {
      entity_extractor_mappings.put("textrank", new TextRankExtractor());
    }
    catch (Exception e) {
      logger.warn("Can't use textrank as entity extractor: " + e.getMessage());     
    }

    try {
      ExtractorAlchemyAPI both = new ExtractorAlchemyAPI();
      entity_extractor_mappings.put("alchemyapi", both);
      text_extractor_mappings.put("alchemyapi", both)
      ExtractorAlchemyAPI_Metadata both_metadata = new ExtractorAlchemyAPI_Metadata();
      entity_extractor_mappings.put("alchemyapi-metadata", both_metadata);
      text_extractor_mappings.put("alchemyapi-metadata", both_metadata);       
    }
    catch (Exception e) {
      logger.warn("Can't use AlchemyAPI as entity/text extractor: " + e.getMessage());     
    }
    try {
      text_extractor_mappings.put("boilerpipe", new TextExtractorBoilerpipe());
    }
    catch (Exception e) {
      logger.warn("Can't use Boilerpipe as text extractor: " + e.getMessage());     
    }
    try {
      text_extractor_mappings.put("tika", new TextExtractorTika());
    }
    catch (Exception e) {
      logger.warn("Can't use Tika as text extractor: " + e.getMessage());     
    }
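    // (At this point the extractor registries contain, where available: entity extractors keyed by "opencalais",
    //  "textrank", "alchemyapi", "alchemyapi-metadata", and text extractors keyed by "alchemyapi",
    //  "alchemyapi-metadata", "boilerpipe", "tika", plus any custom extractors registered above under their
    //  lower-cased getName(). These lower-case keys are what a source's useExtractor()/useTextExtractor() fields
    //  are matched against; share-based extractors are resolved dynamically later, in lookForDynamicExtractor.)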

    if (null != pm.getDefaultEntityExtractor()) {
      default_entity_extractor = entity_extractor_mappings.get(pm.getDefaultEntityExtractor().toLowerCase());
    }
    else {
      default_entity_extractor = null;
    }
    if (null != pm.getDefaultTextExtractor()) {
      default_text_extractor = text_extractor_mappings.get(pm.getDefaultTextExtractor().toLowerCase());
    }
    else {
      try {
        default_text_extractor = new TextExtractorBoilerpipe();     
      }
      catch (Exception e) {
        logger.warn("Can't use BoilerPlate as default text extractor: " + e.getMessage());
      }
    }
    nBetweenFeedDocs_ms = props.getWebCrawlWaitTime();
   
    // Set up security manager - basically always needed so might as well create here
   
    _securityManager = new IkanowSecurityManager();               
  }

  /**
   * Handles going through what to do with a source for harvesting.
   * The process currently is:
   * 1. Extract documents from the source
   * 2. Enrich the documents in toAdd with metadata (entities, full text, events, etc)
   * 3. Complete the enrichment process (ontology mapping, logging, harvest status updates)
   *
   * @param source The source to harvest
   * @param toAdd Filled with newly extracted documents
   * @param toUpdate Filled with documents that have changed and need re-processing
   * @param toRemove Filled with documents to be deleted
   */
  public void harvestSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove)
  {
    nUrlErrorsThisSource = 0;

    // New Harvest Pipeline logic
    if (null != source.getProcessingPipeline()) {
      if (null == procPipeline) {
        procPipeline = new HarvestControllerPipeline();
      }
      procPipeline.extractSource_preProcessingPipeline(source, this);
      //(just copy the config into the legacy source fields since the
      // actual processing is the same in both cases)
    }//TESTED

    // Can override the default (feed) wait time from within the source (eg for sites that we know
    // don't get upset about getting hammered)
    if (null != source.getRssConfig()) {
      if (null != source.getRssConfig().getWaitTimeOverride_ms()) {
        nBetweenFeedDocs_ms = source.getRssConfig().getWaitTimeOverride_ms();
      }
    }
    LinkedList<DocumentPojo> toDuplicate = new LinkedList<DocumentPojo>();

    // Reset any state that might have been generated from the previous source
    getDuplicateManager().resetForNewSource();
    getHarvestStatus().resetForNewSource();

    //First up, Source Extraction (could spawn off some threads to do source extraction)
    // Updates will be treated as follows:
    // - extract etc etc (since they have changed)
    // [and then in generic processing
    // - remove them (including their child objects, eg events) ...
    //   ... - but retain "created" date (and in the future artefacts like comments)]
    extractSource(source, toAdd, toUpdate, toRemove, toDuplicate);
    // (^^^ this adds toUpdate to toAdd)

    if (null != source.getProcessingPipeline()) {
      procPipeline.setInterDocDelayTime(nBetweenFeedDocs_ms);
      try {
        procPipeline.enrichSource_processingPipeline(source, toAdd, toUpdate, toRemove);
      }
      finally { // (ensure can clear memory)
        procPipeline.clearState();
      }
    }
    else { // Old logic (more complex, less functional)
      enrichSource(source, toAdd, toUpdate, toRemove);
    }
    completeEnrichmentProcess(source, toAdd, toUpdate, toRemove);

    // (Now we've completed enrichment either normally or by cloning, add the dups back to the normal documents for generic processing)
    LinkedList<DocumentPojo> groupedDups = new LinkedList<DocumentPojo>(); // (ie clones)
    DocumentPojo masterDoc = null; // (just looking for simple pointer matching here)
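    // (Note, a sketch of the grouping assumption rather than anything enforced here: clones of the same master are
    //  added consecutively to toDuplicate by extractSource, and each clone copies its master's URL, so a change of
    //  URL in the loop below marks the start of a new clone group)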

    for (DocumentPojo dupDoc: toDuplicate) {
      if (null == dupDoc.getCloneFrom()) {
        toAdd.add(dupDoc);       
      }
      else if (null != dupDoc.getCloneFrom().getTempSource()) { //(Else doc was removed from toAdd list due to extraction errors)
        if (null == masterDoc) { // First time through
          masterDoc = dupDoc.getCloneFrom();
        }
        else if (!masterDoc.getUrl().equals(dupDoc.getUrl())) { // New group!
          groupedDups = enrichDocByCloning(groupedDups);
          if (null != groupedDups) {
            toAdd.addAll(groupedDups);
            groupedDups.clear();
          }
          else {
            groupedDups = new LinkedList<DocumentPojo>();
          }
          masterDoc = dupDoc.getCloneFrom();
        }
        groupedDups.add(dupDoc);         
      }
    }//end loop over duplicates
    //TESTED, included case where the master doc errors during extraction (by good fortune!)

    if (null != groupedDups) { // (Leftover group)
      groupedDups = enrichDocByCloning(groupedDups);
      if (null != groupedDups) {
        toAdd.addAll(groupedDups);
      }     
    }//TESTED (as above)
  }
  /**
   * Figures out which source extractor to use and then fills the toAdd list
   * with DocumentPojo objects from the extractor.
   *
   * @param source The source to extract documents from
   * @param toAdd A reference to the list that should be filled with what the source extracts
   * @param toUpdate Filled with documents that need updating
   * @param toRemove Filled with documents to be removed
   * @param toDup Filled with cloned/duplicated documents that bypass normal enrichment
   */
  @SuppressWarnings("unchecked")
  private void extractSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove, List<DocumentPojo> toDup)
  {
    boolean normalCase = true;
    normalCase = (1 == source.getCommunityIds().size()) || // (normal case..)
            ((2 == source.getCommunityIds().size()) && source.getCommunityIds().contains(source.getOwnerId()));
              // (test case..)
   
    //determine which source extractor to use
    for ( HarvesterInterface harvester : harvesters)
    {
      if ( harvester.canHarvestType(InfiniteEnums.castExtractType(source.getExtractType())) )
      {
        try {
          List<DocumentPojo> tmpToAdd = new LinkedList<DocumentPojo>();
          List<DocumentPojo> tmpToUpdate = new LinkedList<DocumentPojo>();
          List<DocumentPojo> tmpToRemove = new LinkedList<DocumentPojo>();
          harvester.executeHarvest(this, source, tmpToAdd, tmpToUpdate, tmpToRemove);

          int nDocs = 0;
          for (List<DocumentPojo> docList: Arrays.asList(tmpToAdd, tmpToUpdate)) {
            for (DocumentPojo doc : docList) {
              if (++nDocs > _nMaxDocs) {
                break;
              }
              // Handle cloning on "duplicate docs" from different sources
              boolean bDuplicated = false;
              if (null != doc.getDuplicateFrom() && (null == doc.getUpdateId())) {
                DocumentPojo newDoc = enrichDocByDuplicating(doc);
                // (Note this is compatible with the cloning case whose logic is below:
                //  this document gets fully populated here then added to dup list (with dupFrom==null), with a set of slaves
                //  with dupFrom==sourceKey. When the dup list is traversed (after bypassing enrichment), the slaves are
                //  then created from this master)
                if (null != newDoc) {
                  doc = newDoc;
                  bDuplicated = true;
                }
              }
              else { // if the update id is non-null then ignore the above logic
                doc.setDuplicateFrom(null);
              }
              // Copy over material from source pojo:
              doc.setSource(source.getTitle());
              doc.setTempSource(source);
              doc.setMediaType(source.getMediaType());
              if ((null == source.getAppendTagsToDocs()) || source.getAppendTagsToDocs()) {
                doc.setTags(source.getTags());
              }
              ObjectId sCommunityId = source.getCommunityIds().iterator().next(); // (multiple communities handled below)
              String sIndex = new StringBuffer("doc_").append(sCommunityId.toString()).toString();
              doc.setCommunityId(sCommunityId);               
              doc.setIndex(sIndex);
              if (normalCase) { // Normal case (or test case)
                doc.setSourceKey(source.getKey());
              }
              else { // Many communities for a single source, not a pleasant case
                String sMasterDocSourceKey = null;
                for (ObjectId id: source.getCommunityIds()) {
                  if (null == sMasterDocSourceKey) {
                    sMasterDocSourceKey = (source.getKey());
                    doc.setSourceKey(sMasterDocSourceKey);
                  }
                  else { // Will defer these until after the master doc has been added to the database
                    DocumentPojo cloneDoc = new DocumentPojo();

                    // Will need these fields
                    cloneDoc.setIndex(new StringBuffer("doc_").append(id).toString());
                    cloneDoc.setCommunityId(id);
                    cloneDoc.setSourceKey(source.getKey());
                    cloneDoc.setSource(source.getTitle());
                    cloneDoc.setUrl(doc.getUrl());
                    if ((null == source.getAppendTagsToDocs()) || source.getAppendTagsToDocs()) {
                      cloneDoc.setTags(source.getTags());
                    }

                    cloneDoc.setCloneFrom(doc);
                    toDup.add(cloneDoc);
                  }
                }//TESTED (both in clone and clone+duplicate)
              }             
              // Normally add to enrichment list (for duplicates, bypass this)
              if (bDuplicated) {
                toDup.add(doc); // (Already enriched by duplication process)
              }
              else {
                toAdd.add(doc);
              }
            }
          }//(end loop over docs to add/update)

          num_docs_extracted.addAndGet(tmpToAdd.size() > _nMaxDocs ? _nMaxDocs : tmpToAdd.size());
          toUpdate.addAll(tmpToUpdate);
          toRemove.addAll(tmpToRemove);
        }
        catch (Exception e) {

          //DEBUG
          //e.printStackTrace();
          logger.error("Error extracting source=" + source.getKey() + ", type=" + source.getExtractType() + ", reason=" + e.getMessage());         
          _harvestStatus.update(source, new Date(), HarvestEnum.error, "Extraction error: " + e.getMessage(), false, false);         
        }
        break; //exit for loop, source is extracted
      }
    }
  }

  //
  // (LEGACY) Gets metadata using the extractors and appends to documents
  //

  private void enrichSource(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove)
  {
    StructuredAnalysisHarvester sah = null;
    UnstructuredAnalysisHarvester usah = null;

    // Create metadata from the text using regex (also calculate header/footer information if desired)
    if (source.getUnstructuredAnalysisConfig() != null)
    {
      usah = new UnstructuredAnalysisHarvester();

      // If performing structured analysis also then need to mux them
      // since the UAH will run on the body/description potentially created by the SAH
      // and the SAH will take the metadata generated by UAH to create entities and events
      if (source.getStructuredAnalysisConfig() != null) {
        sah = new StructuredAnalysisHarvester();
        sah.addUnstructuredHandler(usah);
      }
      else {
        toAdd = usah.executeHarvest(this, source, toAdd);
      }
    }

    // For sources that generate structured data, we can turn that into entities and events
    // and fill in document fields from the metadata (that can be used by entity extraction)
    if (source.getStructuredAnalysisConfig() != null)
    {
      if (null == sah) {
        sah = new StructuredAnalysisHarvester();
      }
      toAdd = sah.executeHarvest(this, source, toAdd);
      // (if usah exists then this runs usah)
    }

    // Perform text and entity extraction
    if (source.getStructuredAnalysisConfig() == null) // (Else is performed during SAH above)
    {
      if (isEntityExtractionRequired(source))
      {
        // Text/Entity Extraction
        try {
          extractTextAndEntities(toAdd, source, false, false);
        }
        catch (Exception e) {
          handleExtractError(e, source); //handle extractor error if need be       
        }
      }
    } // (end if no SAH)

    // Finish processing:
    // Complete batches
    if (isEntityExtractionRequired(source))
    {
      try {
        extractTextAndEntities(null, source, true, false);
      }
      catch (Exception e) {}
    }   
  }

  private void completeEnrichmentProcess(SourcePojo source, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate, List<DocumentPojo> toRemove)
  {
    // Map ontologies:

    completeDocumentBuilding(toAdd, toUpdate);

    int pxErrors = getHarvestStatus().getNumMessages();   
    num_error_px.addAndGet(pxErrors);
   
    // Log the number of feeds extracted for the current source
    if ((toAdd.size() > 0) || (toUpdate.size() > 0) || (toRemove.size() > 0) || (nUrlErrorsThisSource > 0) || (pxErrors > 0)) {
      StringBuffer sLog = new StringBuffer("source=").append((null==source.getUrl()?source.getKey():source.getUrl())).append(" ");
        // (only need this for the log, not the source harvest message)

      if ((null != source.getHarvestStatus()) && (null != source.getHarvestStatus().getHarvest_message() && !source.getHarvestStatus().getHarvest_message().isEmpty()))
      {
        String message = source.getHarvestStatus().getHarvest_message().replace("\n", " ");
        if (message.length() > 512) {
          sLog.append("extracterr='").append(message.substring(0, 512)).append("...' ");         
        }
        else {
          sLog.append("extracterr='").append(message).append("' ");
        }
      }//TESTED
     
      StringBuffer sLog2 = new StringBuffer();

      // Extraction stats:
      sLog2.append("extracted=").append(toAdd.size()).append(" updated=").append(toUpdate.size()).
          append(" deleted=").append(toRemove.size()).append(" urlerrors=").append(nUrlErrorsThisSource).append(" pxerrors=").append(pxErrors);
     
      getHarvestStatus().logMessage(sLog2.toString(), false);
      sLog.append(sLog2);
     
      // Other error info for the log only:
      String mostCommonMessage = getHarvestStatus().getMostCommonMessage();
      if (null != mostCommonMessage) {
        if (mostCommonMessage.length() > 256) {
          mostCommonMessage = mostCommonMessage.substring(0, 253) + "...'";
        }
        sLog.append(mostCommonMessage); // (don't need this in the harvest status since we already have all of them)
      }
      logger.info(sLog.toString());
    }//TESTED

    // May need to update status again (eg any extractor errors or successes - in the harvesters or immediately above):
    if (getHarvestStatus().moreToLog()) {
      getHarvestStatus().update(source, new Date(), source.getHarvestStatus().getHarvest_status(), "", false, false);
    }
    // (note: the harvest status is updated 3 times:
    //  1) inside the source-type harvester (which: 1.1) resets the message 1.2) wipes the messages, but sets prevStatus.getHarvest_message() above)
    //  2) above (the update call, which occurs if logMessage() has been called at any point)
    //  3) after store/index manager, which normally just sets the status unless any errors occurred during indexing

    num_sources_harvested.incrementAndGet();   
  }

  // Quick utility to return whether entity/text extraction is required (ie specified by the user, or available via the configured defaults)

  public boolean isEntityExtractionRequired(SourcePojo source) {
    return (((null == source.useExtractor()) && (null != default_entity_extractor))
        || ((null != source.useExtractor()) && !source.useExtractor().equalsIgnoreCase("none")))
        ||
        (((null == source.useTextExtractor()) && (null != default_text_extractor))
            || ((null != source.useTextExtractor()) && !source.useTextExtractor().equalsIgnoreCase("none")))
            ;   
  }

  /**
   * Takes a list of toAdd and extracts each one's full text and entities/events/sentiment (metadata)
   *
   * @param toAdd The list of documents (without metadata) to extract on
   * @param source The source being harvested
   * @param bFinalizeBatchOnly If true, just flushes any documents batched inside the entity extractor
   * @param calledFromPipeline True when invoked from the harvest processing pipeline
   * @throws ExtractorSourceLevelTransientException (and the other extractor exceptions) on failure
   */
  public void extractTextAndEntities(List<DocumentPojo> toAdd, SourcePojo source, boolean bFinalizeBatchOnly, boolean calledFromPipeline)
  throws ExtractorDocumentLevelException, ExtractorSourceLevelException,
  ExtractorDailyLimitExceededException, ExtractorSourceLevelMajorException, ExtractorSourceLevelTransientException
  {
    IEntityExtractor currentEntityExtractor = null;
    try {
      int error_on_feed_count = 0, feed_count = 0;

      // EXTRACTOR SELECTION LOGIC

      if (null != source.useExtractor()) {
        currentEntityExtractor = entity_extractor_mappings.get(source.useExtractor().toLowerCase());
        if (null == currentEntityExtractor) { // (second chance)
          currentEntityExtractor = (IEntityExtractor) lookForDynamicExtractor(source, false);
        }
      }
      if (currentEntityExtractor == null) // none specified or didn't find it (<-latter is error)
      {
        if ((null != source.useExtractor()) && !source.useExtractor().equalsIgnoreCase("none")) {         

          // ie specified one but it doesn't exist....
          StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_extractor=").append(source.useExtractor());
          logger.warn(errMsg.toString());

          // No point trying this for the rest of the day
          throw new ExtractorSourceLevelException(errMsg.toString());         
        }
        else if (null == source.useExtractor()) { // Didn't specify one, just use default:
          currentEntityExtractor = default_entity_extractor;
        }
      }//TESTED         

      if (bFinalizeBatchOnly) {
        try {
          currentEntityExtractor.extractEntities(null);
        }
        catch (Exception e) {} // do nothing, eg handle entity extractors that don't handle things well
        return;
      }

      // A teeny bit of complex logic:
      // toAdd by default use a text extractor
      // DB/Files by default don't (but can override)
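      // Selection cascade (summarising the code below): an explicitly named text extractor is looked up in
      // text_extractor_mappings, then (second chance) as a dynamically loaded share; if it still can't be found,
      // the source is skipped unless a SAH/UAH/processing pipeline can work on the raw content. If no text
      // extractor was named at all, feeds fall back to the default text extractor (unless the entity extractor
      // advertises URLTextExtraction), while DB/file sources just keep their existing fullText.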

      ITextExtractor currentTextExtractor = null;
      boolean bUseRawContentWhereAvailable = false; // (only applies for feeds)
      if (null != source.useTextExtractor()) {
        currentTextExtractor = text_extractor_mappings.get(source.useTextExtractor().toLowerCase());
        if (null == currentTextExtractor) { // (second chance)
          currentTextExtractor = (ITextExtractor) lookForDynamicExtractor(source, true);
        }
      }
      if (null == currentTextExtractor) { // none specified or didn't find it (<-latter is error)       
        if (null != source.useTextExtractor()) {                   

          if ((null == source.getStructuredAnalysisConfig()) && (null == source.getUnstructuredAnalysisConfig())
              && (null == source.getProcessingPipeline()))
          {
            //(UAH and SAH get raw access to the data if they need it, so can carry on - ditto processing pipeline)

            StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" no_txt_extractor=").append(source.useTextExtractor());
            logger.warn(errMsg.toString());

            // No point trying this for the rest of the day
            throw new ExtractorSourceLevelException(errMsg.toString());
          }
          else {
            bUseRawContentWhereAvailable = true; // (only checked for feeds)           
          }//TESTED
        }
        else if (source.getExtractType().equalsIgnoreCase("feed")) // (DB/files just use their existing fullText)
        {
          if (null != currentEntityExtractor) {
            String selfExtraction = currentEntityExtractor.getCapability(EntityExtractorEnum.URLTextExtraction);
            // Leave as null unless have no built-in capability
            if ((null == selfExtraction) || !selfExtraction.equals("true"))
            {
              currentTextExtractor = default_text_extractor;
            }
          }
          else {
            currentTextExtractor = default_text_extractor;           
          }
        }//TESTED   
      }

      // EXTRACTION
      Iterator<DocumentPojo> i = toAdd.iterator(); //iterator created so that elements in the toAdd list can be
      // removed within the loop
      while ( i.hasNext() )
      {
        long nTime_ms = System.currentTimeMillis();
        DocumentPojo doc = i.next();
        boolean bExtractedText = false;

        // If I've been stopped then just remove all remaining documents
        // (pick them up next time through)
        if (bIsKilled) {
          i.remove();
          if (!calledFromPipeline) {
            doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
          }
          continue;
        }

        if ( calledFromPipeline || !urlsThatError.contains(doc.getUrl()) ) //only attempt if url is okay
        {       
          feed_count++;

          try {
            // (Check for truncation)
            if ((null != currentEntityExtractor) && (null != doc.getFullText())) {
              try {
                String s = currentEntityExtractor.getCapability(EntityExtractorEnum.MaxInputBytes);
                if (null != s) {
                  int maxLength = Integer.parseInt(s);
                  if (doc.getFullText().length() > maxLength) { //just warn, it's up to the extractor to sort it out
                    getHarvestStatus().logMessage("Warning: truncating document to max length: " + s, false);
                  }
                }
              }
              catch (Exception e) {} // max length not reported just carry on
            }
           
            if (null != currentTextExtractor)
            { 
              bExtractedText = true;
              currentTextExtractor.extractText(doc);
              if (null != currentEntityExtractor) {
                currentEntityExtractor.extractEntities(doc);
              }

            }//TESTED
            else //db/filesys should already have full text extracted (unless otherwise specified)
            {
              if (source.getExtractType().equalsIgnoreCase("feed")) { // Need full text so get from current
               
                if ((null == doc.getFullText()) || !bUseRawContentWhereAvailable) {
                  bExtractedText = true;
                  if (null != currentEntityExtractor) {
                    currentEntityExtractor.extractEntitiesAndText(doc);
                  }
                }//TESTED (AlchemyAPI case)
                else { // Feed for which we've already extracted data
                  if (null != currentEntityExtractor) {
                    currentEntityExtractor.extractEntities(doc);
                  }
                }//TESTED
              }
              else { // DB/File => use full text
                if (null != currentEntityExtractor) {
                  currentEntityExtractor.extractEntities(doc);
                }
              }//TESTED
            }

            //statistics counting
            if ( doc.getEntities() != null )
              num_ent_extracted.addAndGet(doc.getEntities().size());
            if ( doc.getAssociations() != null )
              num_event_extracted.addAndGet(doc.getAssociations().size());

          }
          catch (ExtractorDailyLimitExceededException e) {

            //extractor can't do anything else today, return
            i.remove();
            if (!calledFromPipeline) {
              doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
            }

            // Source error, ignore all other documents
            while (i.hasNext()) {
              doc = i.next();
              if (!calledFromPipeline) {
                doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
              }
              i.remove();
            }
            //TESTED

            throw e; // (ie stop processing this source)
          }//TESTED
          catch (Exception e) { // Anything except daily limit exceeded, expect it to be ExtractorDocumentLevelException

            //TODO (INF-1922): put this in a separate function and call that from pipeline on failure...
            // (not sure what to do about error_on_feed_count though, need to maintain a separate one of those in pipeline?)
           
            // This can come from (sort-of/increasingly) "user" code so provide a bit more information
            StringBuffer errMessage = HarvestExceptionUtils.createExceptionMessage(e);
            _harvestStatus.logMessage(errMessage.toString(), true);
            num_error_url.incrementAndGet();
            nUrlErrorsThisSource++;
           
            if (!calledFromPipeline) {
              urlsThatError.add(doc.getUrl());
            }

            error_on_feed_count++;
            i.remove();
            if (!calledFromPipeline) {
              doc.setTempSource(null); // (can safely corrupt this doc since it's been removed)
            }
          }
          //TESTED
        }
        // (note this is only ever called in legacy mode - it's handled in the HarvestControllerPipeline)
        if ((null != source.getExtractType()) && (source.getExtractType().equalsIgnoreCase("feed"))) {
          if (i.hasNext() && bExtractedText) {
            nTime_ms = nBetweenFeedDocs_ms - (System.currentTimeMillis() - nTime_ms); // (ie delay time - processing time)
            if (nTime_ms > 0) {
              try { Thread.sleep(nTime_ms); } catch (Exception e) {};
              // (wait 10s between web-site accesses for politeness)
            }
          }
        }//(TESTED)

      } // end loop over documents 
      //check if all toAdd were erroring, or more than 20 (arbitrary number)
      //NOTE: this is duplicated in HarvestControllerPipeline for non-legacy cases
      if ((error_on_feed_count == feed_count) && (feed_count > 5))
      {
        String errorMsg = new StringBuffer().append(feed_count).append(" docs, ").append(error_on_feed_count).append(" errors").toString();
        if (error_on_feed_count > 20) {
          throw new ExtractorSourceLevelMajorException(errorMsg);
        }
        else {
          throw new ExtractorSourceLevelException(errorMsg);
        }//TESTED
      }
    }
    catch (ExtractorDailyLimitExceededException e) {
      // Percolate upwards!
      throw e;
    }
    catch (ExtractorSourceLevelException e) {
      // Percolate upwards!
      throw e;
    }
    catch (ExtractorSourceLevelMajorException e) {
      // Percolate upwards!
      throw e;
    }
    catch (Exception e) { // Misc internal error
      StringBuffer errMsg = new StringBuffer("Skipping source=").append(source.getKey()).append(" error=").append(e.getMessage());
      logger.error(errMsg.toString(), e);
      throw new ExtractorSourceLevelTransientException(errMsg.toString());
    }//TESTED

  }//TESTED

  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  // UTILITY FUNCTIONS

  /**
   * Decides what to do with a source when an error is returned from the
   * extractor process.
   *
   * @param error The error that was returned from extractor
   * @param source The source that the extractor was working on
   */
  public void handleExtractError(Exception error, SourcePojo source)
  {
    if ( null != error)
    {
      if ( error instanceof ExtractorDocumentLevelException)
      {
        num_error_url.incrementAndGet();
        nUrlErrorsThisSource++;
      }
      else if ( error instanceof ExtractorSourceLevelException)
      {
        num_errors_source.incrementAndGet();
        //We flag the source in mongo and temp disable
        _harvestStatus.update(source, new Date(), HarvestEnum.error, "Source Level extraction error: " + error.getMessage(), true, false);
      }//TESTED
      else if ( error instanceof ExtractorSourceLevelMajorException)
      {
        num_errors_source.incrementAndGet();
        //We flag the source in mongo and perma disable
        _harvestStatus.update(source, new Date(), HarvestEnum.error, "Major source level Extraction error: " + error.getMessage(), true, true);
      }//TESTED
      else if ( error instanceof ExtractorSourceLevelTransientException)
      {
        num_errors_source.incrementAndGet();
        //We flag the source in mongo
        _harvestStatus.update(source, new Date(), HarvestEnum.error, "Transient source level extraction error: " + error.getMessage(), false, false);       
      }//TESTED
      else if ( error instanceof ExtractorDailyLimitExceededException)
      {
        //We flag the source in mongo and temp disable
        _harvestStatus.update(source, new Date(), HarvestEnum.success, "Extractor daily limit error.", true, false);       
      }//TESTED
    }
  }//TESTED (just that the instanceofs work)

  /**
   * Prints out some quick info about how the harvester performed
   */
  public static void logHarvesterStats()
  {
    StringBuilder sb = new StringBuilder();
    sb.append("num_of_sources_harvested=" + num_sources_harvested.get());
    sb.append(" num_of_docs_extracted=" + num_docs_extracted.get());
    sb.append(" num_of_entities_extracted=" + num_ent_extracted.get());
    sb.append(" num_of_events_extracted=" + num_event_extracted.get());
    sb.append(" num_of_source_errors=" + num_errors_source.get());
    sb.append(" num_of_url_errors=" + num_error_url.get());
    sb.append(" num_of_px_errors=" + num_error_px.get());
    logger.info(sb.toString());
  }

  // Utility to handle the various multiple community problems:
  // - Different sources, same URL ("duplicates") ... get the doc from the DB (it's there by definition)
  // - Same source, multiple communities ("clones") ... get the doc from the first community processed
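  // For example (an illustration, not from the original comments): two sources in different communities that both
  // crawl the same RSS feed produce "duplicates" (same URL, different sourceKey), whereas a single source shared
  // across several communities produces "clones" (same sourceKey, different communityId/index).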

  private static DocumentPojo enrichDocByDuplicating(DocumentPojo docToReplace) {
    DocumentPojo newDoc = null;
    BasicDBObject dbo = getDocumentMetadataFromWhichToDuplicate(docToReplace);
    if (null != dbo) {
      String sContent = getDocumentContentFromWhichToDuplicate(docToReplace);
      if (null != sContent) {
        newDoc = duplicateDocument(docToReplace, dbo, sContent, false);
        // (Note this erases the "duplicateFrom" field - this is important because it distinguishes "clones" and "duplicates")
      }
    }
    return newDoc;   
  }//TESTED

  private static LinkedList<DocumentPojo> enrichDocByCloning(List<DocumentPojo> docsToReplace) {
    DocumentPojo newDoc = null;
    BasicDBObject dbo = null;
    String sContent = null;
    LinkedList<DocumentPojo> newDocs = new LinkedList<DocumentPojo>();
    for (DocumentPojo docToReplace: docsToReplace) {

      if (null == dbo) { // First time through...
        sContent = docToReplace.getCloneFrom().getFullText();
        docToReplace.getCloneFrom().setFullText(null);
        dbo = (BasicDBObject) docToReplace.getCloneFrom().toDb();
        docToReplace.getCloneFrom().setFullText(sContent);
      }
      newDoc = duplicateDocument(docToReplace, dbo, sContent, true);
      newDocs.add(newDoc);
    }
    return newDocs;

  }//TESTED

  // Sub-utility

  private static BasicDBObject getDocumentMetadataFromWhichToDuplicate(DocumentPojo docToReplace) {
    BasicDBObject query = new BasicDBObject("url", docToReplace.getUrl());
    query.put("sourceKey", docToReplace.getDuplicateFrom());
    BasicDBObject dbo = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(query);

    return dbo;
  }//TESTED

  private static String getDocumentContentFromWhichToDuplicate(DocumentPojo docToReplace) {
    try {
      // Get the full text:
      byte[] storageArray = new byte[200000];
      BasicDBObject contentQ = new BasicDBObject("url", docToReplace.getUrl());
      contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, docToReplace.getSourceKey())));
      BasicDBObject fields = new BasicDBObject(CompressedFullTextPojo.gzip_content_, 1);
      BasicDBObject dboContent = (BasicDBObject) DbManager.getDocument().getContent().findOne(contentQ, fields);
      if (null != dboContent) {
        byte[] compressedData = ((byte[])dboContent.get(CompressedFullTextPojo.gzip_content_));       
        ByteArrayInputStream in = new ByteArrayInputStream(compressedData);
        GZIPInputStream gzip = new GZIPInputStream(in);       
        int nRead = 0;
        StringBuffer output = new StringBuffer();
        while (nRead >= 0) {
          nRead = gzip.read(storageArray, 0, 200000);
          if (nRead > 0) {
            String s = new String(storageArray, 0, nRead, "UTF-8");
            output.append(s);
          }
        }
        return output.toString();
      }   
      else { // Will just need to-reprocess this document
        return null;
      }
    }
    catch (Exception e) {
      // Do nothing, just carry on
      e.printStackTrace();
    }   
    return null;
  }//TESTED

  private static DocumentPojo duplicateDocument(DocumentPojo docToReplace, BasicDBObject dbo, String content, boolean bClone) {
    DocumentPojo newDoc = DocumentPojo.fromDb(dbo, DocumentPojo.class);
    newDoc.setFullText(content);
    newDoc.setId(null); // (ie ensure it's unique)

    if (bClone) { // Cloned docs have special source key formats (and also need to update their community)
      ObjectId docCommunity = docToReplace.getCommunityId();
      newDoc.setSourceKey(docToReplace.getSourceKey());
      newDoc.setCommunityId(docCommunity);
      newDoc.setIndex(new StringBuffer("doc_").append(docCommunity).toString());     
    }   
    else { // (Cloned documents can take published/created/modified from the master document, ie newDoc is already accurate)
      // For duplicates, copy over the timing details from the new document (set by the harvesters)
      newDoc.setPublishedDate(docToReplace.getPublishedDate());
      newDoc.setCreated(docToReplace.getCreated());
      newDoc.setModified(docToReplace.getModified());     
    }
    return newDoc;
  }//TESTED

  //
  // Any documents that have got this far are going to get processed
  //

  // Processing:
  //Attempt to map entity types to set of ontology types
  //eventually the plan is to allow extractors to set the ontology_type of
  //entities to anything found in the opencyc ontology 

  static public void completeDocumentBuilding(List<DocumentPojo> docs, List<DocumentPojo> updateDocs)
  {   
    // Handle documents to be added
    // Currently, just set ontology type
    if ( docs != null )
    {
      for ( DocumentPojo doc : docs )
      {
        if ( doc.getEntities() != null )
        {
          num_ent_extracted.addAndGet(doc.getEntities().size());
          for ( EntityPojo entity : doc.getEntities() )
          {
            if ( entity.getGeotag() != null )
            {
              if (null == entity.getOntology_type()) {
                entity.setOntology_type(GeoOntologyMapping.mapEntityToOntology(entity.getType()));
              }
            }
          }
        }
        if ( doc.getAssociations() != null )
        {
          num_event_extracted.addAndGet(doc.getAssociations().size());
        }
      }
    }
    // Remove any docs from update list that didn't get updated
    if ( updateDocs != null )
    {
      Iterator<DocumentPojo> it = updateDocs.iterator();
      while (it.hasNext()) {
        DocumentPojo d = it.next();
        if (null == d.getTempSource()) { //this doc got deleted
          it.remove();
        }
      }
    }
  }

  ///////////////////////////////////////////////////////////////
  ///////////////////////////////////////////////////////////////

  // Dynamic extraction utilities
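  // A rough summary of the flow below (descriptive only): if a source's useExtractor()/useTextExtractor() value is
  // an ObjectId - optionally in the form "/<id>/free text" - it is treated as a share containing an extractor jar.
  // The share must be owned by an admin and shared with at least one of the source's communities; its jar is cached
  // under java.io.tmpdir as "<shareId>.cache.jar", loaded via a URLClassLoader, and the class named by the share's
  // title is instantiated and registered in the relevant extractor mapping. Failures are cached so the share is not
  // retried. For example (hypothetical id), a source might configure useTextExtractor = "507f1f77bcf86cd799439011".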

  private synchronized Object lookForDynamicExtractor(SourcePojo source, boolean bTextExtractor)
  {
    String extractorName = bTextExtractor ? source.useTextExtractor() : source.useExtractor();
    if (null == extractorName) {
      return null;
    }
    Object outClassInstance = null;
   
    if (null != failedDynamicExtractors) { // (cache for failed shares)
      if (failedDynamicExtractors.contains(extractorName)) {
        return null;
      }
    }
    ClassLoader savedClassLoader = null;
    try {
      ObjectId extractorId = null;
      if (extractorName.startsWith("/")) { // allow /<id>/free text..
        extractorName = extractorName.substring(1).replaceFirst("/.*", "");
      }//TESTED
      try {
        extractorId = new ObjectId(extractorName);
      }
      catch (Exception e) { // not a dynamic share that's fine, just exit no harm done
        return null;
      }
      // If we're here then it was a share

      BasicDBObject query = new BasicDBObject("_id", extractorId);
      SharePojo extractorInfo = SharePojo.fromDb(MongoDbManager.getSocial().getShare().findOne(query), SharePojo.class);
      if ((null != extractorInfo) && (null != extractorInfo.getBinaryId())) {
        // Check share owned by an admin:
        if (!AuthUtils.isAdmin(extractorInfo.getOwner().get_id())) {
          throw new RuntimeException("Extractor share owner must be admin");
        }//TESTED
        // Check >0 source communities are in the share communities
        int nMatches = 0;
        for (ShareCommunityPojo commObj: extractorInfo.getCommunities()) {
          if (source.getCommunityIds().contains(commObj.get_id())) {
            nMatches++;
            break;
          }
        }
        if (0 == nMatches) {
          throw new RuntimeException("Extractor not shared across source communities");         
        }//TESTED
       
        savedClassLoader = Thread.currentThread().getContextClassLoader();
       
        //HashMap<String, Class<?> > dynamicExtractorClassCache = null;
        if (null == dynamicExtractorClassCache) {
          dynamicExtractorClassCache = new HashMap<String, Class<?> >();
        }

        URL[] cachedJarFile = { new File(maintainJarFileCache(extractorInfo)).toURI().toURL() };       
       
        Class<?> classToLoad = dynamicExtractorClassCache.get(extractorInfo.getTitle());       
        if (null == classToLoad) {       
          URLClassLoader child = new URLClassLoader(cachedJarFile, savedClassLoader);
         
          Thread.currentThread().setContextClassLoader(child);
          classToLoad = Class.forName(extractorInfo.getTitle(), true, child);
          dynamicExtractorClassCache.put(extractorInfo.getTitle(), classToLoad);
        }

        if (bTextExtractor) {
          ITextExtractor txtExtractor = (ITextExtractor )classToLoad.newInstance();
          text_extractor_mappings.put(source.useTextExtractor(), txtExtractor);
          outClassInstance = txtExtractor;
        }
        else {
          IEntityExtractor entExtractor = (IEntityExtractor)classToLoad.newInstance();
          entity_extractor_mappings.put(source.useExtractor(), entExtractor);         
          outClassInstance = entExtractor;
        }
      }
    }
    catch (Exception e) {
      getHarvestStatus().logMessage("custom extractor error: " + e.getMessage(), false);
      if (null == failedDynamicExtractors) {
        failedDynamicExtractors = new HashSet<String>();
      }
      failedDynamicExtractors.add(extractorName);
      //e.printStackTrace();
    } // General fail just carry on
    catch (Error err) {
      getHarvestStatus().logMessage("custom extractor error: " + err.getMessage(), false);
      if (null == failedDynamicExtractors) {
        failedDynamicExtractors = new HashSet<String>();
      }
      failedDynamicExtractors.add(extractorName);
      //err.printStackTrace();
     
    } // General fail just carry on
    finally {
      if (null != savedClassLoader) {
        Thread.currentThread().setContextClassLoader(savedClassLoader);       
      }
    }
    return outClassInstance;
  }//TOTEST

  /**
   * Finds the gridfile given by id and returns the bytes
   *
   * @param id the object id of the gridfile to lookup (stored in sharepojo)
   * @return bytes of file stored in gridfile
   */ 
//  private static byte[] getGridFile(ObjectId id)
//  {
//    ByteArrayOutputStream out = new ByteArrayOutputStream();
//    try
//    {
//      GridFSDBFile file = DbManager.getSocial().getShareBinary().find(id);           
//      file.writeTo(out);
//      byte[] toReturn = out.toByteArray();
//      out.close();
//      return toReturn;
//    }
//    catch (Exception ex){}   
//    return null;
//  }

  /**
   * Maintains a local cache of the jar file referenced by a share.  Typically
   * the jar files will be kept in our /share store, so the binary is fetched
   * from the share store (GridFS or inline binary data) whenever the cached
   * copy is missing or older than the share.
   *
   * @param share the share containing (or pointing to) the extractor jar
   * @return the path of the locally cached jar file
   * @throws Exception
   */
  public static String maintainJarFileCache(SharePojo share) throws Exception
  {   
    String tempFileName = System.getProperty("java.io.tmpdir") + "/" + share.get_id() + ".cache.jar";
    File tempFile = new File(tempFileName);

    // Compare dates (if it exists) to see if we need to update the cache
   
    if (!tempFile.exists() || (tempFile.lastModified() < share.getModified().getTime())) {
      OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFileName));
      try {
        if ( share.getBinaryId() != null )
        {
          GridFSDBFile file = DbManager.getSocial().getShareBinary().find(share.getBinaryId());
          file.writeTo(out);
        }
        else
        {
          out.write(share.getBinaryData());
        }
      }
      finally {
        out.close(); // (ensure the cached jar is fully flushed to disk and the file handle released)
      }
    }//TESTED
   
    return tempFileName;
  }
}