Source Code of com.ikanow.infinit.e.processing.generic.GenericProcessingController

/*******************************************************************************
 * Copyright 2012, The Infinit.e Open Source Project.
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
package com.ikanow.infinit.e.processing.generic;


import java.util.HashMap;
import java.util.List;










//import org.apache.log4j.Logger;
import org.bson.types.ObjectId;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;


import com.google.gson.Gson;
import com.ikanow.infinit.e.data_model.InfiniteEnums;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.IndexManager;
import com.ikanow.infinit.e.data_model.index.document.DocumentPojoIndexMap;
import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.index.feature.event.AssociationFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourceHarvestStatusPojo;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.custom.mapreduce.CustomMapReduceJobPojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocCountPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo;
import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.data_model.store.social.community.CommunityPojo;
import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager;
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
import com.ikanow.infinit.e.processing.generic.utils.PropertiesManager;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;


import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.cluster.metadata.AliasMetaData;
import org.elasticsearch.common.collect.CrossVersionImmutableMapOfImmutableMaps;


//DEBUG (alias corruption)
//import org.elasticsearch.action.admin.indices.status.IndexStatus;
//import org.elasticsearch.action.admin.indices.status.IndicesStatusRequest;
//import org.elasticsearch.action.admin.indices.status.IndicesStatusResponse;


public class GenericProcessingController {


  //NOTE THIS FUNCTION SHOULD CONTAIN NO STATE SINCE IT CAN BE RUN ACROSS MULTIPLE THREADS
  
  //(Nothing currently to log)
  //private static final Logger logger = Logger.getLogger(GenericProcessingController.class);


  ///////////////////////////////////////////////////////////////////////////////////////
  //
  // Set up the databases and indexes
  
  public void Initialize() {
    InitializeDatabase();
    InitializeIndex(false, false, false);
      // (Don't delete anything, obviously)
  }
  
  public void InitializeDatabase() {
    // Add indices:
    try 
    {
      PropertiesManager pm = new PropertiesManager();
      
      ////////////////////////
      //
      // Remove old indexes, mostly just old code that is no longer needed
      //
      dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.url_, 1);
      dropIndexIfItExists(DbManager.getDocument().getContent(), CompressedFullTextPojo.sourceKey_, 2);
      dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceUrl_, 1);
      dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.sourceKey_, 1);
      dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.title_, 1);
        // (Title simply not needed, that was a mistake from an early iteration)
      dropIndexIfItExists(DbManager.getDocument().getMetadata(), DocumentPojo.updateId_, 1);
      dropIndexIfItExists(DbManager.getSocial().getShare(), "type", 1);
      dropIndexIfItExists(DbManager.getSocial().getCookies(), "apiKey", 1);
      dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.jobidS_, 2);
      dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.waitingOn_, 2);
      // (see shard keys below, these legacy ones can appear if the DB is restored from a different machine's backup)
      dropIndexIfNotNeeded(DbManager.getDocument().getContent(), "sourceKey_1_url_1", 0, "sourceKey_2_url_2", 0);
      dropIndexIfNotNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_1", 0, "sourceKey_1__id_-1", 0);
      
      ////////////////////////
      //
      // Indexes needed for sharding:
      //
      // ** Content (has changed a bit)      
      BasicDBObject compIndex = new BasicDBObject(CompressedFullTextPojo.sourceKey_, 1);
      compIndex.put(CompressedFullTextPojo.url_, 1);
      addIndexIfNeeded(DbManager.getDocument().getContent(), "sourceKey_2_url_2", 0, compIndex); // (remove legacy 2_2 and replace with 1_1, which supports shards)
      // ** Metadata
      // Add {_id:1} to "standalone" sourceKey, sort docs matching source key by "time" (sort of!) 
      compIndex = new BasicDBObject(DocumentPojo.sourceKey_, 1);
      compIndex.put(DocumentPojo._id_, 1);
      addIndexIfNeeded(DbManager.getDocument().getMetadata(), "sourceKey_1__id_-1", 0, compIndex); // (remove legacy 1_-1 and replace with 1_1, which supports shards)
      // ** Entities and associations
      DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.index_, 1));
      DbManager.getFeature().getAssociation().ensureIndex(new BasicDBObject(AssociationFeaturePojo.index_, 1));
      
      ////////////////////////
      //
      // Other indexes
      //
      // Needed to handle updates of large files containing many URLs:
      DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.sourceUrl_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
      // Needed for duplicate checking
      // (Compound index lets me access {url, sourceKey}, {url} efficiently ... but need sourceKey separately to do {sourceKey})
      compIndex = new BasicDBObject(DocumentPojo.url_, 1);
      compIndex.put(DocumentPojo.sourceKey_, 1);
      DbManager.getDocument().getMetadata().ensureIndex(compIndex);
      // Needed to handle document updates
      DbManager.getDocument().getMetadata().ensureIndex(new BasicDBObject(DocumentPojo.updateId_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
      // Needed to update documents' entities' doc counts
      if (!pm.getAggregationDisabled()) {
        compIndex = new BasicDBObject(EntityPojo.docQuery_index_, 1);
        compIndex.put(DocumentPojo.communityId_, 1);
        DbManager.getDocument().getMetadata().ensureIndex(compIndex);
      }
      // Needed for keeping source/community doc counts
      compIndex = new BasicDBObject(DocCountPojo._id_, 1);
      compIndex.put(DocCountPojo.doccount_, 1);
      DbManager.getDocument().getCounts().ensureIndex(compIndex);
      // Needed for keep tracking of entities
      DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.disambiguated_name_, 1));
      DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.alias_, 1));
      // Needed for background re-calculation
      DbManager.getFeature().getEntity().ensureIndex(new BasicDBObject(EntityFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
      DbManager.getFeature().getAssociation().ensureIndex(new BasicDBObject(AssociationFeaturePojo.db_sync_prio_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
      // Needed for geo-location in the entity pipeline
      DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("country", 1));
      DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("search_field", 1));
      DbManager.getFeature().getGeo().ensureIndex(new BasicDBObject("geoindex", "2d"));
      // Needed for source management
      DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.key_, 1));
      DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.communityIds_, 1));
      DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_harvested_, 1));
      DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_synced_, 1));
      // Federated query engine
      DbManager.getIngest().getSource().ensureIndex(new BasicDBObject(SourcePojo.federatedQueryCommunityIds_, 1), new BasicDBObject(MongoDbManager.sparse_, true));
      
      // Searching shares
      // Compound index lets me access {type, communities._id}, {type} efficiently
      compIndex = new BasicDBObject("type", 1);
      compIndex.put("communities._id", 1);      
      DbManager.getSocial().getShare().ensureIndex(compIndex);
      // User logins
      DbManager.getSocial().getCookies().ensureIndex(new BasicDBObject("apiKey", 2), new BasicDBObject(MongoDbManager.sparse_, true));
      // Custom job scheduling
      DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobtitle_, 1));
      //TODO (): MOVE THESE TO SPARSE INDEXES AFTER YOU'VE UPDATED THE LOGIC (SWAP THE 1 AND 2)
      DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 1), new BasicDBObject(MongoDbManager.sparse_, false));
//      DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.jobidS_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
//      dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.jobidS_, 1);
      DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 1), new BasicDBObject(MongoDbManager.sparse_, false));
//      DbManager.getCustom().getLookup().ensureIndex(new BasicDBObject(CustomMapReduceJobPojo.waitingOn_, 2), new BasicDBObject(MongoDbManager.sparse_, true));
//      dropIndexIfItExists(DbManager.getCustom().getLookup(),CustomMapReduceJobPojo.waitingOn_, 1);
    }    
    catch (Exception e)  {
      e.printStackTrace();
      throw new RuntimeException(e.getMessage());
    }
  }//TESTED (not changed since by-eye test in Beta)
      
  // Some *DB* index utilities (note: not the Lucene index)
  
  private static void addIndexIfNeeded(DBCollection coll, String indexToCheck, int nIndexIndex, BasicDBObject newIndex)
  {
    StringBuffer indexNameStrBuff = new StringBuffer(indexToCheck);
    if (0 != nIndexIndex) {
      indexNameStrBuff.append("_").append(nIndexIndex);
    }
    String indexName2 = indexNameStrBuff.toString();
    List<DBObject> list = coll.getIndexInfo();
    for (DBObject dbo: list) {        
      String name = (String) dbo.get("name");
      if (indexName2.equalsIgnoreCase(name)) {
        return; // no need to create a new index
      }
    }
    // If we're here then we didn't find the index so create a new index
    try { 
      coll.ensureIndex(newIndex); 
    } 
    catch (Exception e) {} 
  }//TESTED
  
  private static void dropIndexIfNotNeeded(DBCollection coll, String indexToCheck, int nIndexToCheckIndex, String indexToDelete, int nIndexToDeleteIndex)
  {
    StringBuffer indexNameStrBuff = new StringBuffer(indexToCheck);
    if (0 != nIndexToCheckIndex) {
      indexNameStrBuff.append("_").append(nIndexToCheckIndex);
    }
    String indexToCheck2 = indexNameStrBuff.toString();
    indexNameStrBuff.setLength(0);
    indexNameStrBuff.append(indexToDelete);
    if (0 != nIndexToDeleteIndex) {
      indexNameStrBuff.append("_").append(nIndexToDeleteIndex);
    }
    
    boolean foundIndexToDelete = false;
    boolean foundIndexToCheck = false;
    String indexToDelete2 = indexNameStrBuff.toString();
    List<DBObject> list = coll.getIndexInfo();
    for (DBObject dbo: list) {        
      String name = (String) dbo.get("name");
      if (indexToCheck2.equalsIgnoreCase(name)) {
        foundIndexToCheck = true;
      }
      else if (indexToDelete2.equalsIgnoreCase(name)) {
        foundIndexToDelete = true;        
      }
    }
    if (foundIndexToCheck && foundIndexToDelete) {
      try { 
        coll.dropIndex(indexToDelete2);
      } 
      catch (Exception e) {}
    }
  }//TESTED


  private void dropIndexIfItExists(DBCollection coll, String indexName, int nIndexIndex)
  {
    StringBuffer indexNameStrBuff = new StringBuffer(indexName);
    if (0 != nIndexIndex) {
      indexNameStrBuff.append("_").append(nIndexIndex);
    }
    String indexName2 = indexNameStrBuff.toString();
    List<DBObject> list = coll.getIndexInfo();
    for (DBObject dbo: list) {
      String name = (String) dbo.get("name");
      if (indexName2.equalsIgnoreCase(name)) {
        try { 
          coll.dropIndex(name);
        } 
        catch (Exception e) {} 
      }
    }
  }//TESTED
  
  /////////////////////////////////////////////////////////


  // Lucene index initialization
  
  // (Note some of the code below is duplicated in MongoDocumentTxfer, so make sure you sync changes)
  public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature) {
    InitializeIndex(bDeleteDocs, bDeleteEntityFeature, bDeleteEventFeature, false);
  }  
  public void InitializeIndex(boolean bDeleteDocs, boolean bDeleteEntityFeature, boolean bDeleteEventFeature, boolean bRebuildDocsIndex) {
    
    try { //create elasticsearch indexes
      
      PropertiesManager pm = new PropertiesManager();
      
      if (!pm.getAggregationDisabled()) {
        
        boolean languageNormalization = pm.getNormalizeEncoding();
        
        Builder localSettingsEvent = ImmutableSettings.settingsBuilder();
        localSettingsEvent.put("number_of_shards", 1).put("number_of_replicas", 0);
        localSettingsEvent.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
        if (languageNormalization) {
          localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer","icu_folding","standard","lowercase");
        }
        else {
          localSettingsEvent.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard","lowercase");          
        }
  
        Builder localSettingsGaz = ImmutableSettings.settingsBuilder();
        localSettingsGaz.put("number_of_shards", 1).put("number_of_replicas", 0);
        localSettingsGaz.put("index.analysis.analyzer.suggestAnalyzer.tokenizer", "standard");
        if (languageNormalization) {
          localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "icu_normalizer","icu_folding","standard","lowercase");
        }
        else {
          localSettingsGaz.putArray("index.analysis.analyzer.suggestAnalyzer.filter", "standard","lowercase");          
        }
      
        //event feature
        String eventGazMapping = new Gson().toJson(new AssociationFeaturePojoIndexMap.Mapping(), AssociationFeaturePojoIndexMap.Mapping.class);  
        ElasticSearchManager eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent);
        if (null == eventIndex) { // (if has been previously referenced in this process space)
          eventIndex = IndexManager.getIndex(AssociationFeaturePojoIndexMap.indexName_);
        }
        eventIndex.createAlias(AssociationFeaturePojoIndexMap.indexCollectionName_);
        if (bDeleteEventFeature) {
          eventIndex.deleteMe();
          eventIndex = IndexManager.createIndex(AssociationFeaturePojoIndexMap.indexName_, null, false, null, eventGazMapping, localSettingsEvent);
        }
        //entity feature
        String gazMapping = new Gson().toJson(new EntityFeaturePojoIndexMap.Mapping(), EntityFeaturePojoIndexMap.Mapping.class);  
        ElasticSearchManager entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz);
        if (null == entityIndex) { // (if has been previously referenced in this process space)
          entityIndex = IndexManager.getIndex(EntityFeaturePojoIndexMap.indexName_);
        }
        entityIndex.createAlias(EntityFeaturePojoIndexMap.indexCollectionName_);
        if (bDeleteEntityFeature) {
          entityIndex.deleteMe();
          entityIndex = IndexManager.createIndex(EntityFeaturePojoIndexMap.indexName_, null, false, null, gazMapping, localSettingsGaz);
        }
      }
      
      //DOCS - much more complicated than anything else 


      boolean bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_); 
        // (ie if main doc index doesn't exist then always rebuild all indexes)
      
      if (bPingMainIndexFailed) { // extra level of robustness... sleep for a minute then double check the index is really missing...
        try { Thread.sleep(60000); } catch (Exception e) {}
        bPingMainIndexFailed = !ElasticSearchManager.pingIndex(DocumentPojoIndexMap.globalDocumentIndex_);
      }
      bRebuildDocsIndex |= bPingMainIndexFailed;
      
      // check the main index has the "collection" alias - if not then rebuild everything


      if (!bPingMainIndexFailed && (null == _aliasInfo)) { 
        ElasticSearchManager docIndex = ElasticSearchManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_);
        ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
        _aliasInfo = CrossVersionImmutableMapOfImmutableMaps.getAliases(clusterState.getState().getMetaData());
        if (!_aliasInfo.containsKey(DocumentPojoIndexMap.globalDocumentIndexCollection_)) {
          bRebuildDocsIndex = true;
        }
      } //TESTED
      
      createCommunityDocIndex(DocumentPojoIndexMap.globalDocumentIndex_, null, false, true, bDeleteDocs);
      createCommunityDocIndex(DocumentPojoIndexMap.manyGeoDocumentIndex_, null, false, false, bDeleteDocs);
      
      // Some hardwired dummy communities
      createCommunityDocIndex("4e3706c48d26852237078005", null, true, false, bDeleteDocs); // (admin)
      createCommunityDocIndex("4e3706c48d26852237079004", null, true, false, bDeleteDocs); // (test user)
        // (create dummy index used to keep personal group aliases)
            
      if (bRebuildDocsIndex || bDeleteDocs) {


        // OK, going to have different shards for different communities:
        // Get a list of all the communities:
        
        BasicDBObject query = new BasicDBObject();
        BasicDBObject fieldsToDrop = new BasicDBObject("members", 0);
        fieldsToDrop.put("communityAttributes", 0);
        fieldsToDrop.put("userAttributes", 0);
        DBCursor dbc = DbManager.getSocial().getCommunity().find(query, fieldsToDrop);
        
        List<DBObject> tmparray = dbc.toArray(); // (brings the entire thing into memory so don't get cursor timeouts)
        int i = 0;
        System.out.println("Initializing " + dbc.size() + " indexes:");
        for (int j = 0; j < 2; ++j) {
          for (DBObject dbotmp: tmparray) {
            if ((++i % 100) == 0) {
              System.out.println("Initialized " + i + " indexes.");
            }
            BasicDBObject dbo = (BasicDBObject) dbotmp;
            
            // OK, going to see if there are any sources with this group id, create a new index if so:
            // (Don't use CommunityPojo data model here for performance reasons....
            //  (Also, haven't gotten round to porting CommunityPojo field access to using static fields))
            ObjectId communityId = (ObjectId) dbo.get("_id");
            boolean bPersonalGroup = dbo.getBoolean("isPersonalCommunity", false);
            boolean bSystemGroup = dbo.getBoolean("isSystemCommunity", false);
            ObjectId parentCommunityId = (ObjectId) dbo.get("parentId");
            
            createCommunityDocIndex(communityId.toString(), parentCommunityId, bPersonalGroup, bSystemGroup, bDeleteDocs, j==0);
            
          }//end loop over communities
        }// end loop over communities - first time parents only
      } // (end if need to do big loop over all sources)
    }
    catch (Exception e) 
    {
      e.printStackTrace();
      throw new RuntimeException(e.getMessage());
    }
  }//TESTED (not changed since by-eye test in Beta - retested after moving code into createCommunityDocIndex below)
  
  ///////////////////////////////////////////////////////////////////////////////////////
  
  // Utility code for creating community indexes
  
  public static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId,  boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex)
  {
    createCommunityDocIndex(nameOrCommunityIdStr, parentCommunityId, bPersonalGroup, bSystemGroup, bClearIndex, false); 
  }
  
  public static void createCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, 
      boolean bPersonalGroup, boolean bSystemGroup, boolean bClearIndex, boolean bParentsOnly)
  {
    //create elasticsearch indexes
    PropertiesManager pm = new PropertiesManager();
    boolean languageNormalization = pm.getNormalizeEncoding();
    int nPreferredReplicas = pm.getMaxIndexReplicas();
    
    String docMapping = new Gson().toJson(new DocumentPojoIndexMap.Mapping(), DocumentPojoIndexMap.Mapping.class).replace("__AMP__", "@");
    
    String sGroupIndex = null; // for indexing, ie always a single index
    String sAliasIndex = null; // for querying, ie will point to doc_commid, doc_commid_1, etc
    try {
      sGroupIndex = new StringBuffer("doc_").append(new ObjectId(nameOrCommunityIdStr).toString()).toString();
      sAliasIndex = new StringBuffer("docs_").append(new ObjectId(nameOrCommunityIdStr).toString()).toString();
    }
    catch (Exception e) {
      sGroupIndex = nameOrCommunityIdStr;
      if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) { 
        sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
      }
      else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) {
        sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;        
      }
      else { // fallback
        sAliasIndex = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_");
      }
      //TESTED
    }    
    if (!bPersonalGroup) {
            
      if (null == parentCommunityId) {
      
        int nShards = bSystemGroup? 10 : 5 ; // (system group is largest)
        
        // Remove the alias, in case it exists:
        // Then create an index with this name:
        Builder localSettingsGroupIndex = ImmutableSettings.settingsBuilder();
        localSettingsGroupIndex.put("number_of_shards", nShards).put("number_of_replicas", nPreferredReplicas);  
        if (languageNormalization) {
          localSettingsGroupIndex.put("index.analysis.analyzer.default.tokenizer","standard");
          localSettingsGroupIndex.putArray("index.analysis.analyzer.default.filter", "icu_normalizer","icu_folding","standard","lowercase","stop");
        }//TESTED


        ElasticSearchManager docIndex = IndexManager.createIndex(sGroupIndex, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
        if (null == docIndex) { // index has already been referenced, hence createIndex returns null
          docIndex = IndexManager.getIndex(sGroupIndex);
        }
        if (bClearIndex) {
          docIndex.deleteMe();
          docIndex = IndexManager.createIndex(sGroupIndex, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
        }
        if (null != docIndex) {
          try {
            docIndex.pingIndex(); // (wait until it's created itself)
          }
          catch (Exception e) {} // (just make sure this doesn't die horribly)
        }
        else {
          docIndex = IndexManager.getIndex(sGroupIndex);
        }
        if (null != docIndex) { // should always be true
          docIndex.createAlias(sAliasIndex);
          docIndex.closeIndex();
        }
      }
      else if (!bParentsOnly) { // A sub-index of a parent       
        
        parentCommunityId = getRootCommunity(parentCommunityId);
        
        if (null != parentCommunityId) {
          String parentCommunityIdStr = parentCommunityId.toString();
          
          String sParentGroupIndex = new StringBuffer("doc_").append(new ObjectId(parentCommunityIdStr).toString()).toString();
          ElasticSearchManager docIndex = IndexManager.getIndex(sParentGroupIndex);
          
          //DEBUG (alias corruption)
//          if (null == _aliasInfo) {
//            ClusterStateResponse clusterState = docIndex.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
//            _aliasInfo = CrossVersionImmutableMapOfImmutableMaps.getAliases(clusterState.getState().getMetaData());
//          }
//          else {
//            if (_aliasInfo.containsKey(sGroupIndex)) { // has no aliases, we're not good
//              return;
//            }
//            else {
//              //DEBUG
//              System.out.println("Alias " + sGroupIndex + " has no aliases (but should)");            
//              ElasticSearchManager docIndex2 = IndexManager.getIndex(sGroupIndex);
//              docIndex2.deleteMe();
//            }
//          }
        
          docIndex.createAlias(sGroupIndex); // for indexing 
            // (this is going to be tricky when the functionality is fully implemented
            //  because it will need to handle the parent index splitting)
          docIndex.createAlias(sAliasIndex); // for queries
          docIndex.closeIndex();
          // (do nothing on delete - that will be handled at the parent index level)
        }
      }
      //TESTED (parents, children, and personal + docs_ aliases)
    }
    else { // (Personal group)
      // Just create the dummy index, no different to getting it in practice
      Builder localSettingsGroupIndex = ImmutableSettings.settingsBuilder();
      localSettingsGroupIndex.put("number_of_shards", 1).put("number_of_replicas", 0); // (ie guaranteed to be local to each ES node)  
      ElasticSearchManager dummyGroupIndex = IndexManager.createIndex(DocumentPojoIndexMap.dummyDocumentIndex_, DocumentPojoIndexMap.documentType_, false, null, docMapping, localSettingsGroupIndex);
      if (null == dummyGroupIndex) {
        dummyGroupIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_);
      }      
      
      // Just create an alias, so that queries work arbitrarily:
      dummyGroupIndex.createAlias(sGroupIndex); // (at some point we should delete the sGroupIndex alias, but leave it in for bw compatibility for now)
      dummyGroupIndex.createAlias(sAliasIndex); // (never index dummy indices so only need query index)
      // (do nothing on delete since don't have any docs in here anyway)
    }
  }
  //TESTED (including new docs_ alias)


  ///////////////////////////
  
  // (this utility function is needed for the legacy case where empty communities were
  //  treated as aliases of the dummy community ... first time I encounter a community, I need
  //  to recreate it...)
  
  public static void recreateCommunityDocIndex_unknownFields(ObjectId communityId, boolean bDeleteFirst) {
    CommunityPojo cp = CommunityPojo.fromDb(MongoDbManager.getSocial().getCommunity().findOne(new BasicDBObject("_id", communityId)), CommunityPojo.class);
    if (null != cp) {
      deleteCommunityDocIndex(communityId.toString(), cp.getParentId(), true);
        // (in the legacy world this would have been treated as a "personal" ie equivalently to a dummy community ...
        //  this does nothing if it's already a real community)
      
      if (bDeleteFirst) {
        deleteCommunityDocIndex(communityId.toString(), cp.getParentId(), cp.getIsPersonalCommunity());
      }      
      createCommunityDocIndex(communityId.toString(), cp.getParentId(), cp.getIsPersonalCommunity(), cp.getIsSystemCommunity(), false);
    }
  }
  //TESTED
  
  ///////////////////////////
    
  public static void deleteCommunityDocIndex(String nameOrCommunityIdStr, ObjectId parentCommunityId, boolean bPersonalGroup) {
    
    String sGroupIndex = null; // for indexing, ie always a single index
    String sAliasIndex = null; // for querying, ie will point to doc_commid, doc_commid_1, etc
    ObjectId communityId = null;
    try {
      communityId = new ObjectId(nameOrCommunityIdStr);
      sGroupIndex = new StringBuffer("doc_").append(communityId.toString()).toString();
      sAliasIndex = new StringBuffer("docs_").append(communityId.toString()).toString();
    }
    catch (Exception e) {
      sGroupIndex = nameOrCommunityIdStr;
      if (DocumentPojoIndexMap.globalDocumentIndex_.equals(nameOrCommunityIdStr)) { 
        sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
      }
      else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(nameOrCommunityIdStr)) {
        sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;        
      }
      else { // fallback
        sAliasIndex = nameOrCommunityIdStr.replaceAll("doc(?:ument)?_", "docs_");
      }
      //TESTED
    }    
    if (bPersonalGroup) {
      ElasticSearchManager dummyGroupIndex = IndexManager.getIndex(DocumentPojoIndexMap.dummyDocumentIndex_);
      dummyGroupIndex.removeAlias(sAliasIndex);
      dummyGroupIndex.removeAlias(sGroupIndex);
    }
    else if (null != parentCommunityId) {
      
      parentCommunityId = getRootCommunity(parentCommunityId);
      if (null != parentCommunityId) {
        String sParentGroupIndex = new StringBuffer("doc_").append(parentCommunityId.toString()).toString();
        ElasticSearchManager docIndex = IndexManager.getIndex(sParentGroupIndex);
        docIndex.removeAlias(sGroupIndex);
        docIndex.removeAlias(sAliasIndex);
        docIndex.closeIndex();
      }    
    }
    else {
      ElasticSearchManager docIndex = IndexManager.getIndex(sGroupIndex);
      docIndex.deleteMe();      
    }
    //TESTED (parent, children, and personal)
    
    // Also need to delete any records indexes:
    // It's a bit more complex because we're not exactly sure which indexes exist:
    
    if (null != communityId) {
      ElasticSearchManager indexMgr = ElasticSearchManager.getIndex(DocumentPojoIndexMap.globalDocumentIndex_); 
        // (just something that's guaranteed to exist)


      String stashedIndex = "recs_" + communityId.toString();
      String liveIndicesPrefix = "recs_t_" + communityId.toString();
      
      ClusterStateResponse clusterState = indexMgr.getRawClient().admin().cluster().state(new ClusterStateRequest()).actionGet();
      String indices[] = clusterState.getState().getMetaData().getConcreteAllOpenIndices();
      for (String index: indices) {
        if (index.startsWith(stashedIndex) || index.startsWith(liveIndicesPrefix)) {
          ElasticSearchManager.getIndex(index).deleteMe();
        }
      }//TESTED
      
      // THIS CODE ONLY WORKS ON ES-1.0+ ... so have replaced with the less efficient code above
      
      // First off: stashed interface:
      
//      String stashedIndex = "recs_" + communityId.toString();
//      ClusterStateResponse retVal = indexMgr.getRawClient().admin().cluster().prepareState()
//          .setIndices(stashedIndex)
//          .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get();
//      
//      if (!retVal.getState().getMetaData().getIndices().isEmpty()) {
//        ElasticSearchManager.getIndex(stashedIndex).deleteMe();
//      }//TESTED
//      // (else doesn't exist...)
//      
//      // Second: all the time-indexed versions
//      
//      String indexPattern = new StringBuffer("recs_t_").append(communityId.toString()).append("*").toString();
//      retVal = indexMgr.getRawClient().admin().cluster().prepareState()
//          .setIndices(indexPattern)
//          .setRoutingTable(false).setNodes(false).setListenerThreaded(false).get();
//
//      for (IndexMetaData indexMetadata: retVal.getState().getMetaData()) {
//        ElasticSearchManager.getIndex(indexMetadata.index()).deleteMe();
//      }//TESTED
    }//TESTED
  }
  //TESTED (personal and system)
    
  ///////////////////////////
  
  // Utility function to get the root community of a community hierarchy, since you can't add aliases to aliases
  
  static ObjectId getRootCommunity(ObjectId parentCommunityId) {
    
    for (;;) {
      BasicDBObject query = new BasicDBObject("_id", parentCommunityId);
      BasicDBObject field = new BasicDBObject("parentId", 1);
      BasicDBObject retVal = (BasicDBObject) MongoDbManager.getSocial().getCommunity().findOne(query, field);
      if (null == retVal) { // (shouldn't ever happen)
        return parentCommunityId;
      }
      ObjectId tmp = retVal.getObjectId("parentId", null);
      if (null == tmp) { // (no more parents)
        return parentCommunityId;
      }
      if (tmp.equals(parentCommunityId)) { // (shouldn't ever happen but will prevent infinite loop)
        return parentCommunityId;         
      }
      parentCommunityId = tmp;
    }
  }//TESTED (cases where have and don't have parent id)
  
  ///////////////////////////////////////////////////////////////////////////////////////
  //
  // Interface to handle scalable indexes
  // Currently this is a dummy interface, but it will make it easy to split the indexes in the future


  // Cache used by getIndex() for document indexes: maps the lookup string (the community id as passed in,
  // or the resolved collection/fallback name) to the index name to use. Created lazily by getIndex().
  private static HashMap<String, String> _docIndexMap = null;
  // Association feature index name, filled in lazily by getIndex()
  private static String _assocIndex = null;
  // Entity feature index name, filled in lazily by getIndex()
  private static String _entityIndex = null;
  // NOTE(review): not referenced by getIndex(); presumably holds alias metadata for code elsewhere in this class - confirm
  private static CrossVersionImmutableMapOfImmutableMaps<AliasMetaData> _aliasInfo = null;
    
  //TODO (INF-1136): Test and integrate this (phase 1), then implement the index splitting code (phase 2)
  
  public static synchronized String getIndex(String communityIdOrIndexStr) {
    if (communityIdOrIndexStr == EntityFeaturePojoIndexMap.indexName_) { // pointer == intended
      if (null == _entityIndex) {
        _entityIndex = EntityFeaturePojoIndexMap.indexName_;
      }
      return _entityIndex;
    }
    else if (communityIdOrIndexStr == AssociationFeaturePojoIndexMap.indexName_) { // pointer == intended      
      if (null == _assocIndex) {
        _assocIndex = AssociationFeaturePojoIndexMap.indexName_;
      }
      return _assocIndex;
    }
    else { // Documents
      
      if (null == _docIndexMap) {
        _docIndexMap = new HashMap<String, String>();
      }
      String sAliasIndex;
      try {
        sAliasIndex = new StringBuffer("doc_").append(new ObjectId(communityIdOrIndexStr).toString()).toString();
      }      
      catch (Exception e) {
        if (DocumentPojoIndexMap.globalDocumentIndex_.equals(communityIdOrIndexStr)) { 
          communityIdOrIndexStr = sAliasIndex = DocumentPojoIndexMap.globalDocumentIndexCollection_;
        }
        else if (DocumentPojoIndexMap.manyGeoDocumentIndex_.equals(communityIdOrIndexStr)) {
          communityIdOrIndexStr = sAliasIndex = DocumentPojoIndexMap.manyGeoDocumentIndexCollection_;        
        }
        else { // fallback
          communityIdOrIndexStr = sAliasIndex = communityIdOrIndexStr.replaceAll("doc(?:ument)?_", "");
        }        
      }
      String sDocIndex = _docIndexMap.get(communityIdOrIndexStr);
      if (null == sDocIndex) {
        sDocIndex = sAliasIndex;
        _docIndexMap.put(communityIdOrIndexStr, sAliasIndex);
      }
      return sDocIndex;
    }
  }
  //TOTEST (lots of cases)
  
  ///////////////////////////////////////////////////////////////////////////////////////
  //
  // Enrich and store documents (source is optional - can choose not to index if set)
  // (and remove any documents)
  
  public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete)
  {
    processDocuments(harvestType, toAdd, toUpdate_subsetOfAdd, toDelete, null);
  }
  public void processDocuments(int harvestType, List<DocumentPojo> toAdd, List<DocumentPojo> toUpdate_subsetOfAdd, List<DocumentPojo> toDelete, SourcePojo source)
  {
    PropertiesManager props = new PropertiesManager();
    
    // Note: toAdd = toAdd(old) + toUpdate
    // Need to treat updates as follows:
    // - Delete (inc children, eg events) but get fields to keep (currently _id, created; in the future comments etc)


    // Delete toUpdate and toAdd (also overwriting "created" for updated docs, well all actually...)
    toDelete.addAll(toUpdate_subsetOfAdd);
    StoreAndIndexManager storageManager = new StoreAndIndexManager();
    storageManager.removeFromDatastore_byURL(toDelete);
      // (note: expands toDelete if any sourceUrl "docs" are present, see FileHarvester)


    // (Storing docs messes up the doc/event/entity objects, so don't do that just yet...)
    
    // Aggregation:
    // 1+2. Create aggregate entities/events ("features") and write them to the DB
    // (then can store feeds - doesn't matter that the event/entities have been modified by the aggregation)
    // 3. (Scheduled for efficiency) Update all documents' frequencies based on new entities and events
    // 4. (Scheduled for efficiency) Synchronize with index [after this, queries can find them - so (2) must have happened]
      // (Syncronization currently "corrupts" the entities so needs to be run last)


    AggregationManager perSourceAggregation = null;
    
    if (!props.getAggregationDisabled()) {
      perSourceAggregation = new AggregationManager();
    }
    
    // 1+2]
    if (null != perSourceAggregation) {
      perSourceAggregation.doAggregation(toAdd, toDelete);
      perSourceAggregation.createOrUpdateFeatureEntries();
    }
    
    // Save feeds to feeds collection in MongoDb
    // (second field determines if content gets saved)
    if (null != perSourceAggregation) {
      perSourceAggregation.applyAggregationToDocs(toAdd);
        // (First save aggregated statistics back to the docs' entity/event instances)
    }
    storeFeeds(toAdd, (harvestType != InfiniteEnums.DATABASE), source);


    // Then finish aggregation:
    
    if (null != perSourceAggregation) {
      // 3]  
      perSourceAggregation.runScheduledDocumentUpdates();
      
      // 4] This needs to happen last because it "corrupts" the entities and events
      perSourceAggregation.runScheduledSynchronization();
    }
    
  }//TESTED (by eye - logic is v simple)
  
  ///////////////////////////////////////////////////////////////////////////////////////
  //
  // STORAGE AND INDEXING
  //
  //////////////////////////////////////////////////////////////////////////////////////


  /**
   * Writes the feeds to the DB and index
   * 
   * @param feeds list of feeds to be added to db
   */
  private void storeFeeds(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source)
  {
    if ( null != docs && docs.size() > 0 )
    {
      StoreAndIndexManager store = new StoreAndIndexManager();
      store.addToDatastore(docs, bSaveContent, source);
    }
  }//TESTED (by eye)
  
  // See StoreAndIndexManager
  
  ///////////////////////////////////////////////////////////////////////////////////////
  //
  // AGGREGATION
  //
  //////////////////////////////////////////////////////////////////////////////////////


  // See AggregationManager
  
}
Source Code of com.ikanow.infinit.e.processing.generic.GenericProcessingController

Related Classes of com.ikanow.infinit.e.processing.generic.GenericProcessingController