Package com.ikanow.infinit.e.processing.generic.store_and_index

Examples of com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager


           }
           Integer count2 = sourceKeyMap.get(sourceKey);
           sourceKeyMap.put(sourceKey, (count2 == null ? 1 : count2 + 1));
        }
      }
      StoreAndIndexManager dataStore = new StoreAndIndexManager();
      dataStore.removeFromDatastore_byURL(docs);
      AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
      dataStore.removeSoftDeletedDocuments();
      AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());     
     
      // Actually update the DB counts:
      for (Map.Entry<ObjectId, Integer> communityInfo: communityMap.entrySet()) {
        System.out.println("Removed " + communityInfo.getValue() + " records from community " + communityInfo.getKey());
        DbManager.getDocument().getCounts().update(new BasicDBObject("_id", communityInfo.getKey()),
View Full Code Here
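The snippet above shows the deletion sequence that recurs throughout these examples: documents are soft-deleted through StoreAndIndexManager, the aggregation layer recomputes entity and association counts against the soft-deleted set, and only then are the documents physically purged. Below is a minimal sketch of that sequence; the class and method names match the calls in the snippet, but the import paths and the surrounding helper are assumptions for illustration.

import java.util.List;

import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;           // assumed package path
import com.ikanow.infinit.e.processing.generic.aggregation.AggregationManager; // assumed package path
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;

public class DeletionSequenceSketch {
  // Hypothetical helper: 'docs' is a batch of documents already selected for removal
  public static void deleteAndCleanUp(List<DocumentPojo> docs) {
    StoreAndIndexManager dataStore = new StoreAndIndexManager();
    dataStore.removeFromDatastore_byURL(docs);                                      // soft-delete the batch
    AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());     // recompute entity aggregates
    dataStore.removeSoftDeletedDocuments();                                         // physically purge the soft-deleted docs
    AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());  // then doc-entity aggregates
  }
}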


    // Need to treat updates as follows:
    // - Delete (inc children, eg events) but get fields to keep (currently _id, created; in the future comments etc)

    // Delete toUpdate and toAdd (also overwriting "created" for updated docs, well all actually...)
    toDelete.addAll(toUpdate_subsetOfAdd);
    StoreAndIndexManager storageManager = new StoreAndIndexManager();
    storageManager.removeFromDatastore_byURL(toDelete);
      // (note: expands toDelete if any sourceUrl "docs" are present, see FileHarvester)

    // (Storing docs messes up the doc/event/entity objects, so don't do that just yet...)
   
    // Aggregation:
View Full Code Here

   */
  private void storeFeeds(List<DocumentPojo> docs, boolean bSaveContent, SourcePojo source)
  {
    if ( null != docs && docs.size() > 0 )
    {
      StoreAndIndexManager store = new StoreAndIndexManager();
      store.addToDatastore(docs, bSaveContent, source);
    }
  }//TESTED (by eye)
View Full Code Here
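For the write path, storeFeeds above is just a null/empty guard around addToDatastore. A hedged sketch of calling it directly follows, assuming the usual DocumentPojo/SourcePojo import paths; the helper itself is illustrative.

import java.util.List;

import com.ikanow.infinit.e.data_model.store.config.source.SourcePojo; // assumed package path
import com.ikanow.infinit.e.data_model.store.document.DocumentPojo;    // assumed package path
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;

public class StoreDocsSketch {
  // Hypothetical helper mirroring storeFeeds: store a harvested batch, optionally saving the raw content
  public static void store(List<DocumentPojo> docs, SourcePojo source, boolean saveContent) {
    if ((null == docs) || docs.isEmpty()) {
      return; // nothing to store
    }
    StoreAndIndexManager store = new StoreAndIndexManager();
    store.addToDatastore(docs, saveContent, source);
  }
}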

    dbCache.clear();
   
    int fixcount = 0;
    DBCollection contentDb = DbManager.getDocument().getContent();
    DBCollection documentDb = DbManager.getDocument().getMetadata();
    StoreAndIndexManager storeManager = new StoreAndIndexManager();
   
    for ( SourcePojo sp : sources ) {
      // Don't combine the sources (apart from unusual multi-community case), because
      // that prevents you from using the compound sourceKey/_id index
     
      List<String> sourceKeyList = new ArrayList<String>();
      sourceKeyList.add(sp.getKey());
      if (sp.getCommunityIds().size() > 1) { // Special case, need to add the communities
        for (ObjectId communityId: sp.getCommunityIds()) {
          sourceKeyList.add(new StringBuffer(sp.getKey()).append('#').append(communityId.toString()).toString());
        }
      }//(end handling rare multi-community case)
     
      try
      {
        List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
        //FIRST DO ALL NEW FEEDS
        BasicDBObject query = new BasicDBObject();
        query.put(DocumentPojo._id_, new BasicDBObject(MongoDbManager.gt_, new ObjectId((int)(cleanseStartTime/1000), 0, 0))); // time aspect
        query.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList) ); //source aspect
        BasicDBObject queryFields = new BasicDBObject();
        queryFields.append(DocumentPojo.url_, 1);
        queryFields.append(DocumentPojo.index_, 1);
        queryFields.append(DocumentPojo.sourceKey_, 1);
       
        DBCursor cur = documentDb.find(query, queryFields).batchSize(100);
        ElasticSearchManager esm = null;
        ElasticSearchManager esm_base = ElasticSearchManager.getIndex("document_index");
        String sIndex = null;

        while (cur.hasNext())
        {
          if (bKillMeNow) {
            return fixcount;
          }
                 
          DocumentPojo doc = DocumentPojo.fromDb(cur.next(), DocumentPojo.class);
          if (null != doc.getId()) {
            dbCache.add(doc.getId().toString());
          }
         
          // Get index of doc to check in:
          String sNewIndex = doc.getIndex();
          if (null == sNewIndex) {
            sIndex = null;
            esm = esm_base;
          }
          else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
            sIndex = sNewIndex;
            if (sNewIndex.equals("document_index")) {
              esm = esm_base;
            }
            else {
              esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
            }
          }       
         
          //Compare mongo doc to search doc
          Map<String, GetField> results = esm.getDocument(doc.getId().toString(),DocumentPojo.url_);
          if ( null == results || results.isEmpty() )
          {
            //either too many entries (duplicates) or no entry
            //delete this doc from both
            logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey() + " not found in search (or duplicate)");           
            docs_to_remove.add(doc);         
            documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
            BasicDBObject contentQ = new BasicDBObject(CompressedFullTextPojo.url_, doc.getUrl());
            contentQ.put(CompressedFullTextPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, Arrays.asList(null, doc.getSourceKey())));           
            contentDb.remove(contentQ);
            fixcount++;
          }
        } //end loop over new docs for this source
        storeManager.removeFromSearch(docs_to_remove);
       
        //NOW VERIFY ALL OLD FEEDS
        int iteration = 1;
        boolean removedAll = true;
        docs_to_remove.clear();
        while (removedAll)
        {
          int rows = iteration*iteration*10; //10x^2, exponentially check more docs
          iteration++; // grow the window each pass (as in the equivalent loop in syncSearch)
          int oldfixes = 0;
          BasicDBObject queryOLD = new BasicDBObject();
          queryOLD.put(DocumentPojo.sourceKey_, new BasicDBObject(MongoDbManager.in_, sourceKeyList) ); //source aspect
          BasicDBObject sortOLD = new BasicDBObject(DocumentPojo._id_, 1);
         
          DBCursor curOLD = documentDb.find(queryOLD, queryFields).sort(sortOLD).limit(rows);
          while (curOLD.hasNext())
          {
            DocumentPojo doc = DocumentPojo.fromDb(curOLD.next(), DocumentPojo.class);       
            if (null != doc.getId()) {
              dbCache.add(doc.getId().toString());
            }
           
            // Get index of doc to check in:
            String sNewIndex = doc.getIndex();
            if (null == sNewIndex) {
              sIndex = null;
              esm = esm_base;
            }
            else if ((null == sIndex) || (!sNewIndex.equals(sIndex))) {
              sIndex = sNewIndex;
              if (sNewIndex.equals("document_index")) {
                esm = esm_base;
              }
              else {
                esm = ElasticSearchManager.getIndex(sNewIndex + "/document_index");
              }
            }
           
            //Compare mongo doc to search doc
            Map<String, GetField> results = esm.getDocument(doc.getId().toString(),DocumentPojo.url_);
            if ( null == results || results.isEmpty() )
            {
              //either too many entries (duplicates) or no entry
              //delete this doc from both
              logger.info("db sync removing doc: " + doc.getId() + "/" + doc.getSourceKey() + " not found in search (or duplicate)");           
              docs_to_remove.add(doc);           
              documentDb.remove(new BasicDBObject(DocumentPojo._id_, doc.getId()));
              contentDb.remove(new BasicDBObject(DocumentPojo.url_, doc.getUrl()));
              fixcount++;
              oldfixes++;
            }         
          }
          if ( oldfixes != rows )
            removedAll = false;
        }//(end loop over old docs for this source)
        storeManager.removeFromSearch(docs_to_remove);
      }
      catch (Exception e)
      {
        // If an exception occurs log the error
        logger.error("Exception Message: " + e.getMessage(), e);
View Full Code Here

   */
  //TODO INF-2239: fails if syncDB isn't called first, since dbCache is then empty and everything gets deleted...
  public int syncSearch(long cleanseStartTime, Set<String> dbCache)
  {
    int fixcount = 0;
    StoreAndIndexManager storeManager = new StoreAndIndexManager();
   
    // NO LONGER NEEDED, HAVE CACHE (EXCEPT IN ONE PLACE, THE "OLD DOCS" CHECK)
    DBCollection documentDb = DbManager.getDocument().getMetadata();
    BasicDBObject queryFields = new BasicDBObject(); // (ie just _id, basically only need to know if it exists)
    try
    {
      //get solr entries from last cleanse point 
      int source_index = 0;
      int source_count = sources.size();
      for ( SourcePojo sp : sources )
      {
        if (bKillMeNow) {
          return fixcount;
        }
        List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
       
        // Get all indexes this source might use:
        StringBuffer sb = new StringBuffer("document_index");
        for (ObjectId sCommunityId: sp.getCommunityIds()) {
          sb.append(",doc_").append(sCommunityId.toString());
        }
        sb.append("/document_index");
       
        ElasticSearchManager esm = ElasticSearchManager.getIndex(sb.toString());
               
        SearchRequestBuilder searchOptions = esm.getSearchOptions();
        BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
        boolQuery.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
        boolQuery.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey() ));
        searchOptions.setSize(200); // (note this is multiplied by the number of primary shards)
        searchOptions.setSearchType(SearchType.SCAN);
        searchOptions.setScroll("10m");
        SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
        String scrollId = rsp.getScrollId();
        int nSkip = 0;
       
        for (;;) // Until no more hits
        {
          rsp = esm.doScrollingQuery(scrollId, "10m");
          SearchHit[] docs = rsp.getHits().getHits();
          scrollId = rsp.getScrollId();
         
          if ((null == docs) || (0 == docs.length)) {
            break;
          }         
          if (docs.length > 100) { // (just display large checks)
            logger.info("Checking ES docs for large source=" + sp.getKey() + " source: " + source_index + "/" + source_count + " from " + nSkip + " to " + (nSkip+docs.length) );
          }
         
          //Check all solr docs against mongodb
         
          for (SearchHit hit: docs)
          {
            String idStr = hit.getId();
            boolean found = true; //(fail closed!)
            if (null == dbCache) {
              //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
              ObjectId id = new ObjectId(idStr);
              BasicDBObject query = new BasicDBObject(DocumentPojo._id_, id);
              query.put(DocumentPojo.sourceKey_, sp.getKey()); // (ensures uses only the right shard)
              DBObject dbo = documentDb.findOne(query, queryFields);
              found = (dbo != null);
            }//TESTED
            else {
              found = dbCache.contains(idStr);
            }//TESTED
            if (!found)
            {       
              ObjectId id = new ObjectId(idStr);
              DocumentPojo doc = new DocumentPojo();
              doc.setId(id);
              doc.setIndex(hit.getIndex() + "/document_index");
              docs_to_remove.add(doc);
              logger.info("db sync removing doc: " + id + "/" + hit.getIndex() + "/" + source_index + " not found in mongo");
              fixcount++;
            } // end if not found
          } // end loop over docs to check
         
          nSkip += docs.length;
        }// until no more hits
        if (!docs_to_remove.isEmpty()) {
          storeManager.removeFromSearch(docs_to_remove);
          docs_to_remove.clear();
        }
       
        //CHECK OLD FEEDS 10 at a time
        int iteration = 1;
        boolean removedAll = true;
        while (removedAll)
        {
          int rows = iteration*iteration*10;//exponential scaling 10x^2
          iteration++;
          int oldfixes = 0;
         
          //get old docs from es
          SearchRequestBuilder searchOptionsOLD = esm.getSearchOptions();
          BoolQueryBuilder boolQueryOLD = QueryBuilders.boolQuery();
          boolQueryOLD.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
          boolQueryOLD.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey()));
          searchOptionsOLD.addSort(DocumentPojo.created_, SortOrder.ASC);
          searchOptionsOLD.setSize(rows);
          SearchResponse rspOLD = esm.doQuery(boolQueryOLD, searchOptionsOLD);
          SearchHit[] docsOLD = rspOLD.getHits().getHits();
         
          //Check all solr docs against mongodb
         
          for (SearchHit hit: docsOLD)        
          {
            String idStr = hit.getId();
            boolean found = true;
            if (null == dbCache) {
              //OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
              ObjectId id = new ObjectId(idStr);
              BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
              DBObject dbo = documentDb.findOne(queryOLD, queryFields);
              found = (dbo != null);
            }//TESTED
            else {
               found = dbCache.contains(idStr);
            }//TESTED
            if (!found)
            {       
              // Also need to check the DB since dbCache is not guaranteed to be populated with the same
              // number of "final" docs
              ObjectId id = new ObjectId(idStr);
              if (rows > 10) { // (dbCache always loaded with the first 10 rows)
                BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
                if (null != documentDb.findOne(queryOLD, queryFields)) { // it is actually present
                  continue;
                }
              }
              DocumentPojo doc = new DocumentPojo();
              doc.setId(id);
              doc.setIndex(hit.getIndex() + "/document_index");
              docs_to_remove.add(doc);
              logger.info("db sync removing doc: " + idStr + "/" + source_index + " not found in mongo");
              oldfixes++;
              fixcount++;
            }
          }
          if (!docs_to_remove.isEmpty()) {
            storeManager.removeFromSearch(docs_to_remove);
          }
         
          if ( oldfixes != rows )
            removedAll = false;
        }
View Full Code Here
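Both sync routines above delete orphaned index entries the same way: they build lightweight DocumentPojo stubs carrying only the _id and the index name, collect them into a list, and hand the batch to removeFromSearch. A minimal sketch of that idiom, with the id list and index name supplied by a hypothetical caller:

import java.util.ArrayList;
import java.util.List;

import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.store.document.DocumentPojo; // assumed package path
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;

public class RemoveFromSearchSketch {
  // Hypothetical helper: 'orphanIds' are search-index document ids that no longer exist in MongoDB
  public static void removeOrphanedIndexEntries(List<String> orphanIds, String communityIndex) {
    List<DocumentPojo> docsToRemove = new ArrayList<DocumentPojo>();
    for (String idStr : orphanIds) {
      DocumentPojo doc = new DocumentPojo();
      doc.setId(new ObjectId(idStr));
      doc.setIndex(communityIndex + "/document_index"); // same index/type naming as the snippets above
      docsToRemove.add(doc);
    }
    if (!docsToRemove.isEmpty()) {
      new StoreAndIndexManager().removeFromSearch(docsToRemove);
    }
  }
}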

      }//TESTED
     
      nSynced++;
      docsToTransfer.add(doc);
      if (0 == (nSynced % 10000)) {
        StoreAndIndexManager manager = new StoreAndIndexManager();
       
        if (bAggregate) {
          // Loop over communities and aggregate each one then store the modified entities/assocs         
          doAggregation(communityList, docsToTransfer);
          communityList = null; // (in case the next 10,000 docs are all in the same community!)
          currCommunityId = null;

        }//TOTEST       
       
        manager.addToSearch(docsToTransfer);
        docsToTransfer.clear();
        System.out.println("(Synced " + nSynced + " records)");
      }
     
    } // (End loop over docs)
           
    // Sync remaining docs
   
    if (!docsToTransfer.isEmpty()) {
      if (bAggregate) {
        // Loop over communities and aggregate each one then store the modified entities/assocs         
        doAggregation(communityList, docsToTransfer);       
      }
     
      StoreAndIndexManager manager = new StoreAndIndexManager();
      manager.addToSearch(docsToTransfer);       
    }
   
    if (null != chunk) {
      System.out.println("Found " + nSynced + " records to sync in chunk");
    }
View Full Code Here
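The transfer loop above flushes documents to the search index in batches of 10,000 and then flushes the remainder once the loop ends. A condensed sketch of that batching pattern, with the aggregation step omitted (batch size and names are illustrative):

import java.util.ArrayList;
import java.util.List;

import com.ikanow.infinit.e.data_model.store.document.DocumentPojo; // assumed package path
import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;

public class BatchedIndexSketch {
  private static final int BATCH_SIZE = 10000; // matches the 10,000-doc flush interval above

  public static void indexAll(Iterable<DocumentPojo> allDocs) {
    StoreAndIndexManager manager = new StoreAndIndexManager();
    List<DocumentPojo> batch = new ArrayList<DocumentPojo>();
    for (DocumentPojo doc : allDocs) {
      batch.add(doc);
      if (batch.size() >= BATCH_SIZE) {
        manager.addToSearch(batch); // flush a full batch to the index
        batch.clear();
      }
    }
    if (!batch.isEmpty()) {
      manager.addToSearch(batch); // flush whatever is left over
    }
  }
}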

      }
     
     
      long nDocsDeleted = 0;
      if (null != source.getKey()) { // or may delete everything!
        StoreAndIndexManager dataStore = new StoreAndIndexManager();
        nDocsDeleted = dataStore.removeFromDatastoreAndIndex_bySourceKey(source.getKey(), null, false, communityId.toString());
       
        DbManager.getDocument().getCounts().update(new BasicDBObject(DocCountPojo._id_, new ObjectId(communityIdStr)),
            new BasicDBObject(DbManager.inc_, new BasicDBObject(DocCountPojo.doccount_, -nDocsDeleted)));
       
        if (bDocsOnly) { // Update the source harvest status (easy: no documents left!)
          try {
            DbManager.getIngest().getSource().update(queryDbo,
                new BasicDBObject(MongoDbManager.set_,
                    new BasicDBObject(SourceHarvestStatusPojo.sourceQuery_doccount_, 0L))
                );
          }
          catch (Exception e) {} // Just carry on, shouldn't ever happen and it's too late to do anything about it
          //TESTED         
        }
        else { // !bDocsOnly, ie delete source also
          DbManager.getIngest().getSource().remove(queryDbo);         
        }
        if ((null != source.getExtractType()) && source.getExtractType().equalsIgnoreCase("logstash")) {
          BasicDBObject logStashMessage = new BasicDBObject();
          logStashMessage.put("_id", source.getId());
          logStashMessage.put("deleteOnlyCommunityId", communityId);
          logStashMessage.put("sourceKey", source.getKey());
          logStashMessage.put("deleteDocsOnly", bDocsOnly);
         
          if ((null != source.getProcessingPipeline()) && !source.getProcessingPipeline().isEmpty()) {
            SourcePipelinePojo px = source.getProcessingPipeline().iterator().next();
            if ((null != px.logstash) && (null != px.logstash.distributed) && px.logstash.distributed) {
              logStashMessage.put("distributed", true);
            }
          }//TESTED (by hand)
          DbManager.getIngest().getLogHarvesterQ().save(logStashMessage);
          // (the rest of this is async, so we're done here)
        }//TESTED
       
        // Do all this last:
        // (Not so critical if we time out here, the next harvest cycle should finish it; though would like to be able to offload this
        //  also if we are doing it from the API, then need a different getUUID so we don't collide with our own harvester...)
        AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
        dataStore.removeSoftDeletedDocuments();
        AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());       
     
      }
      else if (!bDocsOnly) { // (null source key, just remove the source)
        DbManager.getIngest().getSource().remove(queryDbo);       
      }
      if (!bDocsOnly) { // Also deleting the entire source
View Full Code Here
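Deleting by source key, as above, returns the number of documents removed so the caller can decrement the community's document count. A hedged sketch of just the core call, with the second and third arguments copied from the usage above and the count bookkeeping omitted:

import org.bson.types.ObjectId;

import com.ikanow.infinit.e.processing.generic.store_and_index.StoreAndIndexManager;

public class DeleteBySourceKeySketch {
  // Hypothetical helper: remove every document a source has written into one community
  public static long deleteSourceDocs(String sourceKey, ObjectId communityId) {
    StoreAndIndexManager dataStore = new StoreAndIndexManager();
    return dataStore.removeFromDatastoreAndIndex_bySourceKey(sourceKey, null, false, communityId.toString());
  }
}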

      docFields.append(DocumentPojo.url_, 1);
      docFields.append(DocumentPojo.sourceUrl_, 1);
      docFields.append(DocumentPojo.index_, 1);
      docFields.append(DocumentPojo.sourceKey_, 1);
     
      StoreAndIndexManager dataStore = new StoreAndIndexManager();
      ObjectId nextId = null;
      while (nToPrune > 0) {
        int nToDelete = nToPrune;
        if (nToDelete > 10000) {
          nToDelete = 10000;
        }
        if (null != nextId) {
          docQuery.put(DocumentPojo._id_, new BasicDBObject(DbManager.gt_, nextId));
        }//TESTED (by hand)
       
        DBCursor dbc = DbManager.getDocument().getMetadata().find(docQuery, docFields).sort(sortField).limit(nToDelete);
          // (ie batches of 10K, ascending ordered by _id)
       
        nToPrune -= nToDelete;
        if (0 == nDocsDeleted) {
          nDocsDeleted = dbc.count();
        }
        if (0 == dbc.size()) {
          break;
        }
        List<DocumentPojo> docs = DocumentPojo.listFromDb(dbc, DocumentPojo.listType());
       
        nextId = dataStore.removeFromDatastore_byURL(docs);
      }
    }
    // No need to do anything related to soft deletion, this is all handled when the harvest ends
  }//TESTED
View Full Code Here

              SourceUtils.updateSyncStatus(source, HarvestEnum.success);             
            }
          }
         
        }//end loop over all sources
        StoreAndIndexManager dataStore = new StoreAndIndexManager();
        AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
        dataStore.removeSoftDeletedDocuments();
        AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());

    logger.info("DB fixes: " + fixes_db);
    logger.info("Search fixes: " + fixes_search);
       
    logger.info("Completed sync process at: " + new Date().toString());
View Full Code Here

        catch (InterruptedException e3) { }
      }             
        }
        com.ikanow.infinit.e.processing.generic.utils.PropertiesManager aggProps = new com.ikanow.infinit.e.processing.generic.utils.PropertiesManager();
        boolean bAggDisabled = aggProps.getAggregationDisabled();
        StoreAndIndexManager dataStore = new StoreAndIndexManager();
        boolean bResizedDB = dataStore.resizeDB();
        if (!bAggDisabled) {
          AggregationManager.updateEntitiesFromDeletedDocuments(dataStore.getUUID());
        }
        dataStore.removeSoftDeletedDocuments();
        if (!bAggDisabled) {
          AggregationManager.updateDocEntitiesFromDeletedDocuments(dataStore.getUUID());
        }
        if (bResizedDB) {
          _logger.info("(resized DB, now " + dataStore.getDatabaseSize() + " documents)");
        }
       
    HarvestController.logHarvesterStats();
    _logger.info("Completed harvest process at: " + new Date().toString());
   
View Full Code Here


