//System.out.println(" CANDIDATE=" + candidate.getKey() + " ..." + candidate.getId());
if ((null != sSourceType) && !candidate.getExtractType().equalsIgnoreCase(sSourceType)) {
HarvestEnum candidateStatus = null;
if (null != candidate.getHarvestStatus()) {
candidateStatus = candidate.getHarvestStatus().getHarvest_status();
if (bSync && (null == candidateStatus)) { // Don't sync unharvested sources, obviously!
// Checking whether to respect the searchCycle_secs for distributed sources is a bit more complex
boolean isDistributed = (null != candidate.getDistributionFactor());
boolean distributedInProcess = isDistributed &&
candidate.reachedMaxDocs() || // (<- only set inside a process)
((null != candidate.getHarvestStatus()) && // (robustness)
(null != candidate.getHarvestStatus().getDistributionTokensFree()) && // (else starting out)
(candidate.getDistributionFactor() != candidate.getHarvestStatus().getDistributionTokensFree()));
// (else this is the start)
//(TESTED - local and distributed)
if (((HarvestEnum.success_iteration != candidateStatus) && !distributedInProcess)
((null != candidate.getSearchCycle_secs()) && (candidate.getSearchCycle_secs() < 0)))
// (ie EITHER we're not in the middle of an iteration OR the source is disabled)
//(^^^ don't respect iteration status if source manually disabled)
if ((null != candidate.getSearchCycle_secs()) || (null != defaultSearchCycle_ms)) {
if (null == candidate.getSearchCycle_secs()) {
if (candidate.getSearchCycle_secs() < 0) {
continue; // negative search cycle => disabled
if ((null != candidate.getHarvestStatus()) && (null != candidate.getHarvestStatus().getHarvested())) {
//(ie the source has been harvested, and there is a non-default search cycle setting)
if ((candidate.getHarvestStatus().getHarvested().getTime() + 1000L*candidate.getSearchCycle_secs())
> now.getTime())
if ((HarvestEnum.in_progress != candidateStatus) && (null != candidateStatus) && (null == candidate.getOwnerId()))
//(^^ last test, if it's in_progress then it died recently (or hasn't started) so go ahead and harvest anyway)
// (also hacky use of getOwnerId just to see if this is a source override source or not)
continue; // (too soon since the last harvest...)
}//TESTED (including hacky use of ownerId)
//TESTED: manually disabled (ignore), not success_iteration (ignore if outside cycle), success_iteration (always process)
query.put(SourcePojo._id_, candidate.getId());
BasicDBObject modifyClause = new BasicDBObject();
modifyClause.put(SourceHarvestStatusPojo.sourceQuery_harvest_status_, HarvestEnum.in_progress.toString());
if (bSync) {
modifyClause.put(SourceHarvestStatusPojo.sourceQuery_synced_, now);
else {
modifyClause.put(SourceHarvestStatusPojo.sourceQuery_harvested_, now);
modifyClause.put(SourceHarvestStatusPojo.sourceQuery_lastHarvestedBy_, getHostname());
BasicDBObject modify = new BasicDBObject(MongoDbManager.set_, modifyClause);
try {
BasicDBObject fields = new BasicDBObject(SourcePojo.templateProcessingFlow_, 0);
BasicDBObject dbo = (BasicDBObject) DbManager.getIngest().getSource().findAndModify(query, fields, null, false, modify, false, false);
if (null != dbo) {
SourcePojo fullSource = SourcePojo.fromDb(dbo, SourcePojo.class, new SourcePojoSubstitutionDbMap());
// If distributionFactor set then grab one token and set state back to
// success_iteration, to allow other threads/processes to grab me
if ((null != fullSource.getDistributionFactor()) && !bSync)
// Get the current distribution token
int distributionToken = 0;
boolean bReset = false;
if ((null == fullSource.getHarvestStatus()) || (null == fullSource.getHarvestStatus().getDistributionTokensFree())) {
distributionToken = fullSource.getDistributionFactor();
// (also set up some parameters so don't need to worry about null checks later)
if (null == fullSource.getHarvestStatus()) {
fullSource.setHarvestStatus(new SourceHarvestStatusPojo());
else {
distributionToken = fullSource.getHarvestStatus().getDistributionTokensFree();
//Check last harvested time to ensure this isn't an old state (reset if so)
if ((distributionToken != fullSource.getDistributionFactor()) ||
(0 != fullSource.getHarvestStatus().getDistributionTokensComplete()))
if (null != fullSource.getHarvestStatus().getRealHarvested()) { // harvested is useless here because it's already been updated
if ((new Date().getTime() - fullSource.getHarvestStatus().getRealHarvested().getTime()) >
_ONEDAY) // (ie older than a day)
distributionToken = fullSource.getDistributionFactor(); // ie start again
}//(end check for any existing state)
if (distributionToken == fullSource.getDistributionFactor()) {
bReset = true; // (first time through, might as well go ahead and reset to ensure all the vars are present)
// If in error then just want to grab all remaining tokens and reset the status
if (HarvestEnum.error == fullSource.getHarvestStatus().getHarvest_status()) { // currently an error
if (distributionToken != fullSource.getDistributionFactor()) { // In the middle, ie just errored
fullSource.setDistributionTokens(new HashSet<Integer>());
while (distributionToken > 0) {
BasicDBObject dummy = new BasicDBObject();
bReset = updateHarvestDistributionState_tokenComplete(fullSource, HarvestEnum.error, dummy, dummy);
// (then finish off completion down below)
}//TESTED (error mode, 2 cases: complete and incomplete)
//System.out.println(" DIST_SOURCE=" + fullSource.getKey() + "/" + fullSource.getDistributionFactor() + ": " + distributionToken + ", " + bReset);
//(note we'll see this even if searchCycle is set because the "source" var (which still has the old
// state) is stuck back at the start of uncheckedList, so each harvester will see the source >1 time)
if (0 != distributionToken) { // (else no available tokens for this cycle)
fullSource.setDistributionTokens(new HashSet<Integer>());
// Remove one of the available tokens (they don't get reset until the source is complete)
updateHarvestDistributionState_newToken(fullSource.getId(), distributionToken, HarvestEnum.success_iteration, bReset);
// After this loop is complete, put back at the start of the unchecked list
// so another thread can pick up more tokens:
if (null == putMeBackAtTheStart_distributed) {
putMeBackAtTheStart_distributed = new LinkedList<SourcePojo>();
// Before adding back to list, set a transient field to ensure it bypasses any search cycle checks
// (for in process logic where we won't see the update status from the DB)
// Reset full source's status so we know if we started in success/error/success_iteration
if (null == candidateStatus) {
candidateStatus = HarvestEnum.success;
} // (end if available tokens)
else { // (don't process, just set back to original status)
HarvestEnum harvestStatus = HarvestEnum.success;
if (null != fullSource.getHarvestStatus()) {
if (null != fullSource.getHarvestStatus().getHarvest_status()) {
harvestStatus = fullSource.getHarvestStatus().getHarvest_status();