*/
// TODO INF-2239: DANGEROUS failure mode — if syncDB isn't called first then dbCache is
// empty, every ES doc appears "not found" in Mongo, and everything gets deleted from the
// search index. Callers must guarantee dbCache is populated (or null) before invoking.
//
// Reconciles the Elasticsearch index against MongoDB for each source: any ES document
// created after cleanseStartTime that has no MongoDB counterpart is removed from ES.
// @param cleanseStartTime lower bound (created_ >= this) for docs to check
// @param dbCache set of Mongo doc _id strings; if null, falls back to per-doc Mongo lookups
// @return number of ES documents queued for removal
public int syncSearch(long cleanseStartTime, Set<String> dbCache)
{
int fixcount = 0;
StoreAndIndexManager storeManager = new StoreAndIndexManager();
// NO LONGER NEEDED, HAVE CACHE (EXCEPT IN ONE PLACE, THE "OLD DOCS" CHECK)
DBCollection documentDb = DbManager.getDocument().getMetadata();
BasicDBObject queryFields = new BasicDBObject(); // (ie just _id, basically only need to know if it exists)
try
{
//get solr entries from last cleanse point
int source_index = 0;
int source_count = sources.size();
for ( SourcePojo sp : sources )
{
// Cooperative cancellation flag checked once per source
if (bKillMeNow) {
return fixcount;
}
List<DocumentPojo> docs_to_remove = new ArrayList<DocumentPojo>();
// Get all indexes this source might use:
// (alias string of the form "document_index,doc_<communityId>,.../document_index")
StringBuffer sb = new StringBuffer("document_index");
for (ObjectId sCommunityId: sp.getCommunityIds()) {
sb.append(",doc_").append(sCommunityId.toString());
}
sb.append("/document_index");
ElasticSearchManager esm = ElasticSearchManager.getIndex(sb.toString());
// Scan+scroll over all ES docs for this source created since the cleanse point
SearchRequestBuilder searchOptions = esm.getSearchOptions();
BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
boolQuery.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
boolQuery.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey() ));
searchOptions.setSize(200); // (note this is multiplied by the number of primary shards)
searchOptions.setSearchType(SearchType.SCAN);
searchOptions.setScroll("10m");
SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
// With SCAN search type the initial response carries no hits — results start
// arriving from the first doScrollingQuery call below
String scrollId = rsp.getScrollId();
int nSkip = 0;
for (;;) // Until no more hits
{
rsp = esm.doScrollingQuery(scrollId, "10m");
SearchHit[] docs = rsp.getHits().getHits();
scrollId = rsp.getScrollId();
if ((null == docs) || (0 == docs.length)) {
break;
}
if (docs.length > 100) { // (only log progress for large batches)
logger.info("Checking ES docs for large source=" + sp.getKey() + " source: " + source_index + "/" + source_count + " from " + nSkip + " to " + (nSkip+docs.length) );
}
//Check all solr docs against mongodb
for (SearchHit hit: docs)
{
String idStr = hit.getId();
// Default to "found" so that a skipped/failed check never deletes a doc
boolean found = true; //(fail closed!)
if (null == dbCache) {
//OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
ObjectId id = new ObjectId(idStr);
BasicDBObject query = new BasicDBObject(DocumentPojo._id_, id);
query.put(DocumentPojo.sourceKey_, sp.getKey()); // (ensures uses only the right shard)
DBObject dbo = documentDb.findOne(query, queryFields);
found = (dbo != null);
}//TESTED
else {
found = dbCache.contains(idStr);
}//TESTED
if (!found)
{
// In ES but not in Mongo => queue for removal from the search index
ObjectId id = new ObjectId(idStr);
DocumentPojo doc = new DocumentPojo();
doc.setId(id);
doc.setIndex(hit.getIndex() + "/document_index");
docs_to_remove.add(doc);
logger.info("db sync removing doc: " + id + "/" + hit.getIndex() + "/" + source_index + " not found in mongo");
fixcount++;
} // end if not found
} // end loop over docs to check
nSkip += docs.length;
}// until no more hits
if (!docs_to_remove.isEmpty()) {
storeManager.removeFromSearch(docs_to_remove);
docs_to_remove.clear();
}
//CHECK OLD FEEDS 10 at a time
// Second pass: walk the oldest docs (sorted by created_ ASC) in exponentially
// growing pages until a page contains at least one doc still present in Mongo
int iteration = 1;
boolean removedAll = true;
while (removedAll )
{
int rows = iteration*iteration*10;//exponential scaling 10x^2
iteration++;
int oldfixes = 0;
//get old docs from es
SearchRequestBuilder searchOptionsOLD = esm.getSearchOptions();
BoolQueryBuilder boolQueryOLD = QueryBuilders.boolQuery();
boolQueryOLD.must(QueryBuilders.rangeQuery(DocumentPojo.created_).from(cleanseStartTime));
boolQueryOLD.must(QueryBuilders.termQuery(DocumentPojo.sourceKey_, sp.getKey()));
searchOptionsOLD.addSort(DocumentPojo.created_, SortOrder.ASC);
searchOptionsOLD.setSize(rows);
SearchResponse rspOLD = esm.doQuery(boolQueryOLD, searchOptionsOLD);
SearchHit[] docsOLD = rspOLD.getHits().getHits();
//Check all solr docs against mongodb
for (SearchHit hit: docsOLD)
{
String idStr = hit.getId();
// Fail closed again: default "found" so nothing is deleted on a skipped check
boolean found = true;
if (null == dbCache) {
//OBSOLETED, USE DBCACHE INSTEAD (WHERE AVAILABLE):
ObjectId id = new ObjectId(idStr);
BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
// NOTE(review): unlike the first pass, this lookup omits the sourceKey filter —
// presumably intentional (no shard routing needed here), but verify
DBObject dbo = documentDb.findOne(queryOLD, queryFields);
found = (dbo != null);
}//TESTED
else {
found = dbCache.contains(idStr);
}//TESTED
if (!found)
{
// Also need to check the DB since dbCache is not guaranteed to be populated with the same
// number of "final" docs
ObjectId id = new ObjectId(idStr);
if (rows > 10) { // (dbCache always loaded with the first 10 rows)
BasicDBObject queryOLD = new BasicDBObject(DocumentPojo._id_, id);
if (null != documentDb.findOne(queryOLD, queryFields)) { // it is actually present
continue;
}
}
DocumentPojo doc = new DocumentPojo();
doc.setId(id);
doc.setIndex(hit.getIndex() + "/document_index");
docs_to_remove.add(doc);
logger.info("db sync removing doc: " + idStr + "/" + source_index + " not found in mongo");
oldfixes++;
fixcount++;
}
}
// NOTE(review): docs_to_remove is NOT cleared between iterations of this while
// loop (compare the .clear() after the scroll pass above), so each call to
// removeFromSearch re-submits docs already removed in earlier iterations —
// looks like a bug; confirm removeFromSearch is idempotent or add a clear()
if (!docs_to_remove.isEmpty()) {
storeManager.removeFromSearch(docs_to_remove);
}
// Stop growing the page size once a page was not entirely stale.
// NOTE(review): oldfixes counts only newly-removed docs while rows is the page
// size requested (ES may also return fewer hits than rows), so this loop
// normally terminates after the first iteration — verify intended semantics
if ( oldfixes != rows )
removedAll = false;
}