
/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.processing.generic.synchronization;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.bson.types.ObjectId;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.action.search.SearchRequestBuilder;
import org.elasticsearch.index.get.GetField;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.sort.SortOrder;

import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo;
import com.ikanow.infinit.e.data_model.store.document.CompressedFullTextPojo;
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;

public class SynchronizationManager {

  private static Logger logger = Logger.getLogger(SynchronizationManager.class);
  private static boolean bKillMeNow = false;
  public static void killMe() {
    bKillMeNow = true;
  }
 
  private List<SourcePojo> sources = null;
  public void setSources(List<SourcePojo> sources) {
    this.sources = sources;
  }
 
  /**
   * Does the DB sync: pulls all Mongo docs that occurred after the
   * cleanseStartTime for the given sources and makes sure they are in the search db.
   *
   * @param cleanseStartTime 1 hour before this harvester started
   * @param dbCache filled in with the _id of every doc checked, for later use by syncSearch
   * @return The number of errors fixed (docs deleted)
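   *
   * <p>A minimal usage sketch (hypothetical caller code, not part of this class; assumes a
   * previously-built {@code List<SourcePojo> sources}):
   * <pre>{@code
   *   SynchronizationManager syncManager = new SynchronizationManager();
   *   syncManager.setSources(sources);
   *   Set<String> dbCache = new HashSet<String>();
   *   long cleanseStartTime = System.currentTimeMillis() - 3600L * 1000L; // 1 hour ago
   *   int fixes = syncManager.syncDB(cleanseStartTime, dbCache);   // Mongo -> search index
   *   fixes += syncManager.syncSearch(cleanseStartTime, dbCache);  // search index -> Mongo
   * }</pre>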
   */
  // DON'T USE THIS UNTIL REWRITTEN - IT SHOULD TRANSFER DOCS ACROSS, NOT LEAVE THEM ALONE
  @Deprecated
  public int syncDB(long cleanseStartTime, Set<String> dbCache)
  {
    dbCache.clear();
   
    int fixcount = 0;
    DBCollection contentDb = DbManager.getDocument().getContent();
    DBCollection documentDb = DbManager.getDocument().getMetadata();
    StoreAndIndexManager storeManager = new StoreAndIndexManager();
   
    for ( SourcePojo sp : sources ) {
      // Don't combine the sources (apart from unusual multi-community case), because
      // that prevents you from using the compound sourceKey/_id index
     
      List<String> sourceKeyList = new ArrayList<String>();
      sourceKeyList.add(sp.getKey());
      if (sp.getCommunityIds().size() > 1) { // Special case, need to add the communities
        for (ObjectId communityId: sp.getCommunityIds()) {
          sourceKeyList.add(new StringBuffer(sp.getKey()).append('#').append(communityId.toString()).toString());
        }
      }//(end handling rare multi-community case)
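      // (e.g. a hypothetical source key "news.rss" shared by communities A and B yields
      //  sourceKeyList = ["news.rss", "news.rss#<A>", "news.rss#<B>"])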
     
      try
      {
        List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
        //FIRST DO ALL NEW FEEDS
        BasicDBObject query = new BasicDBObject();
        query.put(DocumentPojo._id_, new BasicDBObject(MongoDbManager.gt_, new ObjectId((int)(cleanseStartTime/1000), 0, 0))); // time aspect
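        // (an ObjectId's first 4 bytes are an epoch-seconds timestamp, so "_id > ObjectId(cleanseStartTime/1000, 0, 0)"
        //  matches only docs created after cleanseStartTime)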
        query.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList) ); //source aspect
        BasicDBObject queryFields = new BasicDBObject();
        queryFields.append(DocumentPojo.url_, 1);
        queryFields.append(DocumentPojo.index_, 1);
        queryFields.append(DocumentPojo.sourceKey_, 1);
       
        DBCursor cur = documentDb.find(query, queryFields).batchSize(100);
        ElasticSearchManager esm = null;
        ElasticSearchManager esm_base = ElasticSearchManager.getIndex("document_index");
        String sIndex = null;

        while (cur.hasNext())
        {
          if (bKillMeNow) {
            return fixcount;
          }
                 
          DocumentPojo doc = DocumentPojo.fromDb(cur.next(), DocumentPojo.class);
          if (null != doc.getId()) {
            dbCache.add(doc.getId().toString());
          }
         
          // Get index of doc to check in:
          String sNewIndex = doc.getIndex();
          if (null == sNewIndex) {
            sIndex = null;
            esm = esm_base;
          }
          else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
            sIndex = sNewIndex;
            if (sNewIndex.equals("document_index")) {
              esm = esm_base;
            }
            else {
              esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
            }
          }       
         
          //Compare mongo doc to search doc
          Map<String, GetField> results = esm.getDocument(doc.getId().toString(),DocumentPojo.url_);
          if ( null == results || results.isEmpty() )
          {
            //either too many entries (duplicates) or no entry
            //delete this doc from both
            logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey() + " not found in search (or duplicate)");           
            docs_to_remove.add(doc);         
            documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));           
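            // (in Mongo, $in with a null element also matches docs where sourceKey is null or absent)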
            contentDb.remove(contentQ);
            fixcount++;
          }
        } //end loop over new docs for this source
        storeManager.removeFromSearch(docs_to_remove);
       
        //NOW VERIFY ALL OLD FEEDS
        int iteration = 1;
        boolean removedAll = true;
        docs_to_remove.clear();
        while (removedAll)
        {
          int rows = iteration*iteration*10; //10x^2, check quadratically more docs each pass
          iteration++;
          int oldfixes = 0;
          BasicDBObject queryOLD = new BasicDBObject();
          queryOLD.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList) ); //source aspect
          BasicDBObject sortOLD = new BasicDBObject(DocumentPojo._id_, 1);
         
          DBCursor curOLD = documentDb.find(queryOLD, queryFields).sort(sortOLD).limit(rows);
          while (curOLD.hasNext())
          {
            DocumentPojo doc = DocumentPojo.fromDb(curOLD.next(), DocumentPojo.class);       
            if (null != doc.getId()) {
              dbCache.add(doc.getId().toString());
            }
           
            // Get index of doc to check in:
            String sNewIndex = doc.getIndex();
            if (null == sNewIndex) {
              sIndex = null;
              esm = esm_base;
            }
            else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
              sIndex = sNewIndex;
              if (sNewIndex.equals("document_index")) {
                esm = esm_base;
              }
              else {
                esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
              }
            }
           
            //Compare mongo doc to search doc
            Map<String, GetField> results = esm.getDocument(doc.getId().toString(),DocumentPojo.url_);
            if ( null == results || results.isEmpty() )
            {
              //either too many entries (duplicates) or no entry
              //delete this doc from both
              logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey() + " not found in search (or duplicate)");           
              docs_to_remove.add(doc);           
              documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
              contentDb.remove(new BasicDBObject(DocumentPojo.url_, doc.getUrl()));
              fixcount++;
              oldfixes++;
            }         
          }
          if ( oldfixes != rows )
            removedAll = false;
        }//(end loop over old docs for this source)
        storeManager.removeFromSearch(docs_to_remove);
      }
      catch (Exception e)
      {
        // If an exception occurs log the error
        logger.error("Exception Message: " + e.getMessage(), e);
      }
    }   
    return fixcount;
  }//TESTED (unchanged from "tested" Beta version)
 
 
  /**
   * Does the search sync: pulls all search index (Elasticsearch) docs that occurred after the
   * cleanseStartTime for the given sources and makes sure they are in the Mongo db.
   *
   * @param cleanseStartTime 1 hour before this harvester started
   * @param dbCache the set of doc _ids populated by syncDB (see the INF-2239 TODO below)
   * @return The number of errors fixed (docs deleted)
   */
  //TODO INF-2239: if syncDB isn't called first then dbCache is empty and everything gets deleted...
  public int syncSearch(long cleanseStartTime, Set<String> dbCache)
  {
    int fixcount = 0;
    StoreAndIndexManager storeManager = new StoreAndIndexManager();
   
    // NO LONGER NEEDED, HAVE CACHE (EXCEPT IN ONE PLACE, THE "OLD DOCS" CHECK)
    DBCollection documentDb = DbManager.getDocument().getMetadata();
    BasicDBObject queryFields = new BasicDBObject(); // (ie just _id, basically only need to know if it exists)
    try
    {
      //get search index entries from the last cleanse point
      int source_index = 0;
      int source_count = sources.size();
      for ( SourcePojo sp : sources )
      {
        if (bKillMeNow) {
          return fixcount;
        }
        List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
       
        // Get all indexes this source might use:
        StringBuffer sb = new StringBuffer("document_index");
        for (ObjectId sCommunityId: sp.getCommunityIds()) {
          sb.append(",doc_").append(sCommunityId.toString());
        }
        sb.append("/document_index");
       
        ElasticSearchManager esm = ElasticSearchManager.getIndex(sb.toString());
               
        SearchRequestBuilder searchOptions = esm.getSearchOptions();
        BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
        boolQuery.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
        boolQuery.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey() ));
        searchOptions.setSize(200); // (note this is multiplied by the number of primary shards)
        searchOptions.setSearchType(SearchType.SCAN);
        searchOptions.setScroll("10m");
        SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
        String scrollId = rsp.getScrollId();
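        // (with SearchType.SCAN the initial response contains no hits, only a scroll id,
        //  which is why the loop below starts with doScrollingQuery)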
        int nSkip = 0;
       
        for (;;) // Until no more hits
        {
          rsp = esm.doScrollingQuery(scrollId, "10m");
          SearchHit[] docs = rsp.getHits().getHits();
          scrollId = rsp.getScrollId();
         
          if ((null == docs) || (0 == docs.length)) {
            break;
          }         
          if (docs.length > 100) { // (just display large checks)
            logger.info("Checking ES docs for large source=" + sp.getKey() + " source: " + source_index + "/" + source_count + " from " + nSkip + " to " + (nSkip+docs.length) );
          }
         
          //Check all search index docs against mongodb
         
          for (SearchHit hit: docs)
          {
            String idStr = hit.getId();
            boolean found = true; //(fail closed!)
            if (null == dbCache) {
              //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
              ObjectId id = new ObjectId(idStr);
              BasicDBObject query = new BasicDBObject(DocumentPojo._id_, id);
              query.put(DocumentPojo.sourceKey_, sp.getKey()); // (ensures uses only the right shard)
              DBObject dbo = documentDb.findOne(query, queryFields);
              found = (dbo != null);
            }//TESTED
            else {
              found = dbCache.contains(idStr);
            }//TESTED
            if (!found)
            {       
              ObjectId id = new ObjectId(idStr);
              DocumentPojo doc = new DocumentPojo();
              doc.setId(id);
              doc.setIndex(hit.getIndex() + "/document_index");
              docs_to_remove.add(doc);
              logger.info("db sync removing doc: " + id + "/" + hit.getIndex() + "/" + source_index + " not found in mongo");
              fixcount++;
            } // end if not found
          } // end loop over docs to check
         
          nSkip += docs.length;
        }// until no more hits
        if (!docs_to_remove.isEmpty()) {
          storeManager.removeFromSearch(docs_to_remove);
          docs_to_remove.clear();
        }
       
        //CHECK OLD FEEDS, 10x^2 at a time
        int iteration = 1;
        boolean removedAll = true;
        while (removedAll )
        {
          int rows = iteration*iteration*10; //quadratic scaling, 10x^2
          iteration++;
          int oldfixes = 0;
         
          //get old docs from es
          SearchRequestBuilder searchOptionsOLD = esm.getSearchOptions();
          BoolQueryBuilder boolQueryOLD = QueryBuilders.boolQuery();
          boolQueryOLD.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
          boolQueryOLD.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey()));
          searchOptionsOLD.addSort(DocumentPojo.created_, SortOrder.ASC);
          searchOptionsOLD.setSize(rows);
          SearchResponse rspOLD = esm.doQuery(boolQueryOLD, searchOptionsOLD);
          SearchHit[] docsOLD = rspOLD.getHits().getHits();
         
          //Check all search index docs against mongodb
         
          for (SearchHit hit: docsOLD)        
          {
            String idStr = hit.getId();
            boolean found = true;
            if (null == dbCache) {
              //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
              ObjectId id = new ObjectId(idStr);
              BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
              DBObject dbo = documentDb.findOne(queryOLD, queryFields);
              found = (dbo != null);
            }//TESTED
            else {
               found = dbCache.contains(idStr);
            }//TESTED
            if (!found)
            {       
              // Also need to check the DB since dbCache is not guaranteed to be populated with the same
              // number of "final" docs
              ObjectId id = new ObjectId(idStr);
              if (rows > 10) { // (dbCache always loaded with the first 10 rows)
                BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
                if (null != documentDb.findOne(queryOLD, queryFields)) { // it is actually present
                  continue;
                }
              }
              DocumentPojo doc = new DocumentPojo();
              doc.setId(id);
              doc.setIndex(hit.getIndex() + "/document_index");
              docs_to_remove.add(doc);
              logger.info("db sync removing doc: " + idStr + "/" + source_index + " not found in mongo");
              oldfixes++;
              fixcount++;
            }
          }
          if (!docs_to_remove.isEmpty()) {
            storeManager.removeFromSearch(docs_to_remove);
            docs_to_remove.clear();
          }
         
          if ( oldfixes != rows )
            removedAll = false;
        }
        source_index++;
      } // end loop over sources
    }
    catch (Exception e)
    {
      // If an exception occurs log the error
      logger.error("Exception Message: " + e.getMessage(), e);
    }
    return fixcount;
  }//TESTED (unchanged from "tested" Beta version)
}