Package com.ikanow.infinit.e.api.knowledge.federated

Source Code of com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine$FederatedHarvest

* Copyright 2012, The Infinit.e Open Source Project.
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
package com.ikanow.infinit.e.api.knowledge.federated;

import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;

import net.minidev.json.JSONArray;

import org.bson.types.ObjectId;

import com.ikanow.infinit.e.data_model.api.ResponsePojo;
import com.ikanow.infinit.e.data_model.api.knowledge.AdvancedQueryPojo;
import com.ikanow.infinit.e.data_model.interfaces.query.IQueryExtension;
import com.ikanow.infinit.e.data_model.utils.IkanowSecurityManager;
import com.jayway.jsonpath.JsonPath;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.AsyncHttpClient.BoundRequestBuilder;
import com.ning.http.client.Response;

import org.apache.log4j.Logger;

public class SimpleFederatedQueryEngine implements IQueryExtension {

  // Guards the "DEB: ..." debug log statements in this class (flip to the commented-out line to enable)
  static private boolean _DEBUG = false;
  //static private boolean _DEBUG = true;
  // log4j logger for this class
  static Logger _logger = Logger.getLogger(SimpleFederatedQueryEngine.class);

  // Caching flag: setTestMode() keeps this as the inverse of _testMode
  private boolean _cacheMode = true;
  // When set, the query methods throw RuntimeException on unsupported queries/endpoints
  private boolean _testMode = false;
  public void setTestMode(boolean testMode) {
    _testMode = testMode;
    _cacheMode = !_testMode; // (don't cache in some places if in test mode)
  public static class FederatedRequest {
    // Endpoint version
    public SourceFederatedQueryConfigPojo.FederatedQueryEndpointUrl subRequest;
    public Future<Response> responseFuture;
    public AsyncHttpClient asyncClient;
    public String mergeKey;
    // Script import version
    public FederatedHarvest importThread;
    public AdvancedQueryPojo fullQuery;
    public Throwable errorMessage;
    public BasicDBObject scriptResult = null;
    // Joint
    public SourceFederatedQueryConfigPojo endpointInfo;
    public BasicDBObject cachedResult = null;
    public BasicDBObject cachedDoc = null;
    public BasicDBObject cachedDoc_expired = null;
    public String requestParameter;
    public String[] communityIdStrs;
    public String queryIndex;
  public void addEndpoint(SourceFederatedQueryConfigPojo newEndpoint) {
    if (null == _endpoints) {
      _endpoints = new LinkedList<SourceFederatedQueryConfigPojo>();
  // Requests prepared by preQueryActivities for the current query (reset to null at its start)
  LinkedList<FederatedRequest> _asyncRequestsPerQuery = null;
  // Endpoints registered via addEndpoint (null until the first one is added)
  LinkedList<SourceFederatedQueryConfigPojo> _endpoints = null;
  IkanowSecurityManager _scriptingSecurityManager = new IkanowSecurityManager();
  // Python script engine, looked up in preQueryActivities for endpoints that define an importScript
  private ScriptEngine _pyEngine = null;
  /**
   * Inspects the query and, when it reduces to a single entity (or to text that one of an
   * endpoint's regexes maps onto an entity type), prepares a FederatedRequest per matching
   * endpoint: either a cached document, a script import (FederatedHarvest), or one
   * asynchronous HTTP request per configured endpoint URL.
   * In test mode, unsupported queries or misconfigured endpoints raise RuntimeException.
   *
   * NOTE(review): this listing appears to have lost lines - closing braces, and presumably
   * return/continue/break statements plus the calls that add each request to
   * _asyncRequestsPerQuery. TODO restore from the upstream source before relying on it.
   *
   * @param queryId not referenced by the visible code
   * @param query the query whose terms (qt) and logic are inspected
   * @param communityIdStrs copied onto every FederatedRequest that is created
   */
  public void preQueryActivities(ObjectId queryId, AdvancedQueryPojo query, String[] communityIdStrs) {
    _asyncRequestsPerQuery = null;
    // 1] Check whether this makes sense to query, get the (sole) entity if so
    String entityType = null;
    String entityValue = null;
    String entityIndex = null;
    String textToTest = null;
    // Only queries with 1-3 terms are considered
    if ((null != query.qt) && (query.qt.size() > 0) && (query.qt.size() < 4)) {

      String logic = query.logic;
      if (null != logic) {
        logic = logic.toLowerCase();
      if ((null != logic) && (logic.contains("or") || logic.contains("not"))) {
        if (_DEBUG) _logger.debug("DEB: preQA1: Logic too complex: " + query.logic);
        if (_testMode) {
          throw new RuntimeException("Bad testQueryJson: Logic too complex: " + query.logic);
        return; // logic too complex
      }//TESTED (1.3)
      for (AdvancedQueryPojo.QueryTermPojo qt: query.qt) {
        if ((null != qt.entity) || ((null != qt.entityType) && (null != qt.entityValue))) {
          if (null == entityType) { // we now have == 1 entity
            if (null != qt.entityValue) {
              entityValue = qt.entityValue;
              entityType = qt.entityType;
              entityIndex = entityValue.toLowerCase() + "/" + entityType.toLowerCase();
            }//TESTED (1.5)
            else {
              // "value/type" form: split on the last '/'
              entityIndex = qt.entity.toLowerCase();
              int index = qt.entity.lastIndexOf('/');
              if (index > 0) {
                entityValue = qt.entity.substring(0, index);
                entityType = qt.entity.substring(index + 1).toLowerCase();
            }//TESTED (1.6)
          else { // >1 entity, not supported
            if (_DEBUG) _logger.debug("DEB: preQA2a: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi());
            if (_testMode) {
              throw new RuntimeException("Bad testQueryJson: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi());
          }//TESTED (1.4)
        else if ((null != qt.etext) && (qt.etext.equals("*"))) {
          //this is fine provided it's only ANDed together (eg above logic case)
        else if (null != qt.etext) { // Only work if it matches one of the regexes
          if (null == entityType) {
            textToTest = qt.etext;
            entityType = "etext";
          else { // >1 entity, not supported
            if (_DEBUG) _logger.debug("DEB: preQA2b: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi());
            if (_testMode) {
              throw new RuntimeException("Bad testQueryJson: >1 entity: " + qt.entity + " / " + entityType + " / " + query.toApi());
          }//TESTED (1.4)         
        else if (null == qt.time) { // temporal
          if (_DEBUG) _logger.debug("DEB: preQA3: non-entity/date " + query.toApi());
          if (_testMode) {
            throw new RuntimeException("Bad testQueryJson: non-entity/date " + query.toApi());
        }//TESTED (1.1)
      }//(end loop over query terms)
    }//TESTED (1.*)
    if (null == entityType) { // Query too complex
      if (_DEBUG) _logger.debug("DEB: preQA4: query missing entity " + query.toApi());
      if (_testMode) {
        throw new RuntimeException("Bad testQueryJson: query missing entity " + query.toApi());
    }//TESTED (1.2)
    // NOTE(review): entityType would be null here unless the block above exits the method - TODO confirm a return was lost
    entityType = entityType.toLowerCase();
    // 2] If so, query across all the endpoints
    for (SourceFederatedQueryConfigPojo endpoint: _endpoints) {
      // Endpoint validation:
      if (null == endpoint.entityTypes) {
        if (_testMode) {
          throw new RuntimeException("No entity types specified");
        else {
      if (null != textToTest) { // This is text, see if you can convert to an entity
        entityValue = null; //(reset for different endpoints - used in the check to decide whether to continue)
        for (String entityTypeRegex: endpoint.entityTypes) {
          // Regex entries have the form "/<regex>/<entityType>"
          if (entityTypeRegex.startsWith("/")) {
            int regexIndex = entityTypeRegex.lastIndexOf('/'); // (guaranteed to be >= 0)
            try {
              Pattern regex = Pattern.compile(entityTypeRegex.substring(1, regexIndex));
              if (regex.matcher(textToTest).matches()) {
                entityType = entityTypeRegex.substring(1+regexIndex);
                if (entityType.length() > 0) {
                  entityValue = textToTest;
                  entityIndex = entityValue.toLowerCase() + "/" + entityType.toLowerCase();
            catch (Exception e) { // if not in test mode, carry on
              if (_testMode) {
                throw new RuntimeException(e);
        }//(end loop over entity regexes)
      if (null == entityValue) { // None of the regexes matched
        if (_testMode) {
          throw new RuntimeException("Text specified, does not match any of the regexes: " + Arrays.toString(endpoint.entityTypes.toArray()) + " ... text = " + textToTest);
      if (_DEBUG) _logger.debug("DEB: preQA5: ENDPOINT: " + Arrays.toString(endpoint.entityTypes.toArray()) + " / " + entityType);

      // Script-based endpoints: only python is accepted
      if ((null != endpoint.importScript) && !endpoint.importScript.isEmpty()) {
        if (null == endpoint.scriptlang) {
          endpoint.scriptlang = "python"; // python ==default
        if (endpoint.scriptlang.equalsIgnoreCase("python")) {
          _pyEngine = new ScriptEngineManager().getEngineByName("python");
          if (null == _pyEngine) {
            _logger.error("Python not installed - copy jython 2.5+ into /opt/infinite-home/lib/unbundled");
            if (_testMode) {
              throw new RuntimeException("Python not installed - copy jython 2.5+ into /opt/infinite-home/lib/unbundled");
          }//TESTED (by hand, importScript != null and scriptlang: "python", jython not on classpath)
        else {
          _logger.error("Python is currently the only supported scriptlang");
          if (_testMode) {
            throw new RuntimeException("Python is currently the only supported scriptlang");
        }//TESTED (by hand, importScript != null and scriptlang: "none")
      if ((null != endpoint.bypassSimpleQueryParsing) && endpoint.bypassSimpleQueryParsing) {
        throw new RuntimeException("Currently only simple query parsing is supported");       
      if ((null != endpoint.entityTypes) && endpoint.entityTypes.contains(entityType)) {
        // Check if the *doc* (not *API response*) generated from this endpoint/entity has been cached, check expiry if so
        String cachedDocUrl = buildScriptUrl(endpoint.parentSource.getKey(), entityIndex);
        BasicDBObject cachedDoc = null;
        BasicDBObject cachedDoc_expired = null;
        if (_cacheMode && ((null == endpoint.cacheTime_days) || (endpoint.cacheTime_days >= 0))) {
          BasicDBObject cachedDocQuery = new BasicDBObject(DocumentPojo.url_, cachedDocUrl);
          cachedDocQuery.put(DocumentPojo.sourceKey_, endpoint.parentSource.getKey());
          cachedDoc = (BasicDBObject) DbManager.getDocument().getMetadata().findOne(cachedDocQuery);
          if ((null != cachedDoc) && checkDocCache_isExpired(cachedDoc, endpoint)) {
            // Keep the expired doc so its id can be carried over when the replacement is built
            cachedDoc_expired = cachedDoc;
            cachedDoc = null;
        }//TESTED (by hand)
        if (null == _asyncRequestsPerQuery) {
          // If we've got this far create a list to store the async requests
          _asyncRequestsPerQuery = new LinkedList<FederatedRequest>();

        if (null != cachedDoc) {
          // Common params:
          FederatedRequest requestOverview = new FederatedRequest();
          requestOverview.endpointInfo = endpoint;
          requestOverview.communityIdStrs = communityIdStrs;
          requestOverview.requestParameter = entityValue;
          requestOverview.queryIndex = entityIndex;
          requestOverview.mergeKey = endpoint.parentSource.getKey();
          if (_DEBUG) _logger.debug("DEB: preQA6z: Doc Cache: " + cachedDocUrl + " , " + cachedDoc);                     
          requestOverview.cachedDoc = cachedDoc;
        }//TESTED (by hand)
        else if (null != endpoint.importScript) {
          BasicDBObject cachedVal = null;
          if (_cacheMode) { // (source key not static, plus not sure it's desirable, so for simplicity just don't cache requests in test mode)
            cachedVal = this.getCache(cachedDocUrl, endpoint);
          // Common params:
          FederatedRequest requestOverview = new FederatedRequest();
          requestOverview.endpointInfo = endpoint;
          requestOverview.communityIdStrs = communityIdStrs;
          requestOverview.requestParameter = entityValue;
          requestOverview.queryIndex = entityIndex;
          requestOverview.mergeKey = endpoint.parentSource.getKey();
          requestOverview.cachedDoc_expired = cachedDoc_expired;
          if (null == cachedVal) {
            requestOverview.importThread = new FederatedHarvest();
            requestOverview.importThread.queryEngine = this;
            requestOverview.importThread.request = requestOverview;
          else {
            if (_DEBUG) _logger.debug("DEB: preQA6a: Cache: " + cachedDocUrl + " , " + cachedVal);           
            requestOverview.cachedResult = cachedVal;           
          // Launch thread
          // NOTE(review): no statement follows the "Launch thread" comment in this listing - TODO confirm what was lost
        }//TESTED (by hand)
        else {
          AsyncHttpClient asyncHttpClient = null;
          try {
            for (SourceFederatedQueryConfigPojo.FederatedQueryEndpointUrl request: endpoint.requests) {
              asyncHttpClient = new AsyncHttpClient();
              BoundRequestBuilder asyncRequest = null;
              // A "Content"/"content" http field turns the request into a POST with that body
              String postContent = null;
              if (null != request.httpFields) {
                postContent = request.httpFields.get("Content");
                if (null == postContent)
                  postContent = request.httpFields.get("content");
              }//TESTED (by hand, "", "httpFields": { "Content": "test" }
              // "$1" is replaced by the entity value in the URL, query params, headers and body
              if (null == postContent) {
                asyncRequest = asyncHttpClient.prepareGet(request.endPointUrl.replace("$1", entityValue));
              else {
                asyncRequest = asyncHttpClient.preparePost(request.endPointUrl.replace("$1", entityValue));             
              }//TESTED (by hand, "", "httpFields": { "Content": "test" }
              if (null != request.urlParams) {
                for (Map.Entry<String, String> keyValue: request.urlParams.entrySet()) {
                  asyncRequest = asyncRequest.addQueryParameter(keyValue.getKey(), keyValue.getValue().replace("$1", entityValue));
              }//TESTED (1.5, 1.6, 3.*, 4.*)
              if (null != request.httpFields) {
                for (Map.Entry<String, String> keyValue: request.httpFields.entrySet()) {
                  if (!keyValue.getKey().equalsIgnoreCase("content")) {
                    asyncRequest = asyncRequest.addHeader(keyValue.getKey(), keyValue.getValue().replace("$1", entityValue));
              }//TESTED (by hand, "", "httpFields": { "Cookie": "mycookie=test" }
              if (null != postContent) {
                asyncRequest = asyncRequest.setBody(postContent.replace("$1", entityValue));
              }//TESTED (by hand, "", "httpFields": { "Content": "$1" }
              // Common params:
              FederatedRequest requestOverview = new FederatedRequest();
              requestOverview.endpointInfo = endpoint;
              requestOverview.communityIdStrs = communityIdStrs;
              requestOverview.requestParameter = entityValue;
              requestOverview.asyncClient = asyncHttpClient;
              requestOverview.queryIndex = entityIndex;
              requestOverview.mergeKey = endpoint.parentSource.getKey();
              requestOverview.subRequest = request;
              // Now check out the cache:
              // NOTE(review): the initialiser of rawUri is missing in this listing - presumably the URI of the built request; TODO confirm
              URI rawUri =;
              String url = rawUri.toString();
              if ((null == endpoint.parentSource.getOwnedByAdmin()) || !endpoint.parentSource.getOwnedByAdmin())
                //TODO (INF-2798): Make this consistent with the how security is handled elsewhere
                // Sources not owned by an admin are restricted to ports 80/443 (or an unspecified port, -1)
                int port = rawUri.getPort();
                if ((80 != port) && (443 != port) && (-1 != port)) {
                  _logger.error("Only admin can make requests on non-standard ports: " + url + ": " + port);
                  if (_testMode) {
                    throw new RuntimeException("Only admin can make requests on non-standard ports: " + url + ": " + port);
              }//TESTED (by hand)
              BasicDBObject cachedVal = this.getCache(url, endpoint);
              requestOverview.cachedResult = cachedVal;
              requestOverview.cachedDoc_expired = cachedDoc_expired;
              // Only hit the network when there is no usable cached response
              if (null == cachedVal) {
                requestOverview.responseFuture = asyncRequest.execute();           
              else {
                if (_DEBUG) _logger.debug("DEB: preQA6b: Cache: " + url + " , " + cachedVal);
                requestOverview.responseFuture = null;
            }//(end loop over multiple requests
          catch (Exception e) {
            _logger.error("Unknown error creating federated query for " + endpoint.titlePrefix + ": " + e.getMessage());
            if (_testMode) {
              throw new RuntimeException("Unknown error creating federated query for " + endpoint.titlePrefix + ": " + e.getMessage(), e);
        }//(end cached doc vs script vs request mode for queries)
      }//(end if this request is for this entity type)
      else { // no entity matches - if in test mode then bomb out with useful error
        if (_testMode) {
          throw new RuntimeException("Specified entity: " + entityIndex + " not in set: " + Arrays.toString(endpoint.entityTypes.toArray()));
    }//(end loop over endpoints)

  /**
   * Collects the results of the requests prepared by preQueryActivities and inserts the
   * resulting documents at the front of the query's document list.
   * For each request without a cached document it waits for the script thread or the HTTP
   * response, caches the API response, and - once the next request has a different
   * mergeKey - builds a document via createDocFromJson. The document is given the scores
   * of the current top document (defaults of 100.0 otherwise).
   *
   * NOTE(review): this listing appears to have lost lines (closing braces and several
   * statements, e.g. the iterator advance, the wait inside the timeout loop, and whatever
   * fills bsonArray and increments "added"). TODO restore from the upstream source.
   *
   * @param queryId not referenced by the visible code
   * @param docs the query's documents; federated documents are inserted at index 0
   * @param response the response whose stats "found" count is increased by "added"
   */
  public void postQueryActivities(ObjectId queryId, List<BasicDBObject> docs, ResponsePojo response)
    boolean grabbedScores = false;
    double aggregateSignif = 100.0;
    double queryRelevance = 100.0;
    double score = 100.0;
    if (null != _asyncRequestsPerQuery) {
      int added = 0;
      BasicDBObject doc = null;
      BasicDBList bsonArray = new BasicDBList();
      // Peeking iterator: lets the loop compare the current request's mergeKey with the next one's
      PeekingIterator<FederatedRequest> it = Iterators.peekingIterator(_asyncRequestsPerQuery.iterator());
      while (it.hasNext()) {
        // NOTE(review): right-hand side missing in this listing - presumably the next element of "it"; TODO confirm
        FederatedRequest request =;
        if (null == request.cachedDoc) { // no cached doc
          try {
            if (null == request.cachedResult) {  // no cached value       
              if (null != request.importThread) {
                // 1) wait for the thread to finish
                if (null == request.endpointInfo.queryTimeout_secs) {
                  request.endpointInfo.queryTimeout_secs = 300;
                for (int timer = 0; timer < request.endpointInfo.queryTimeout_secs; timer++) {
                  try {
                    if (!request.importThread.isAlive()) {
                  }//TESTED (by hand)
                  catch (Exception e) {
                if (request.importThread.isAlive()) {
                  request.errorMessage = new RuntimeException("Script timed out");
                }//TESTED (by hand)
                // 2) Get the results
                if (null != request.errorMessage) {
                  if (_testMode) {
                    throw new RuntimeException(request.errorMessage);
                else if (null == request.scriptResult) {               
                  if (_testMode) {
                    throw new RuntimeException("Script mode: no cached result found from: " + request.requestParameter);
                else {
                  String url = buildScriptUrl(request.mergeKey, request.queryIndex);
                  if (_cacheMode) { // (don't cache python federated queries in test mode)
                    this.cacheApiResponse(url, request.scriptResult, request.endpointInfo);
              } // end script mode
              else { // HTTP mode
                // Blocks until the asynchronous response is available
                Response endpointResponse = request.responseFuture.get();
                request.asyncClient = null;
                String jsonStr = endpointResponse.getResponseBody();
                String url = endpointResponse.getUri().toURL().toString();
                // Normalise the parsed JSON into an object: arrays are wrapped under "array", strings under "value"
                Object bsonUnknownType = com.mongodb.util.JSON.parse(jsonStr);
                BasicDBObject bson = null;
                if (bsonUnknownType instanceof BasicDBObject) {
                  bson = (BasicDBObject) bsonUnknownType;
                else if (bsonUnknownType instanceof BasicDBList) {
                  bson = new BasicDBObject("array", bsonUnknownType);
                else if (bsonUnknownType instanceof String) {
                  bson = new BasicDBObject("value", bsonUnknownType);                 
                if (_DEBUG) _logger.debug("DEB: postQA1: " + url + ": " + jsonStr);
                if (null != bson) {
                  MongoDbUtil.enforceTypeNamingPolicy(bson, 0);
                  this.cacheApiResponse(url, bson, request.endpointInfo);
              }//(end script vs request method)
            }//TESTED (3.1, 4.2)
            else { // (just used cached value)
              if (_DEBUG) _logger.debug("DEB: postQA2: " + request.cachedResult.toString());
            }//TESTED (4.1, 4.3)
          catch (Exception e) {
            // Errors are logged; they are only rethrown in test mode
            if (null == request.subRequest) {
              _logger.error("Error with script: " + e.getMessage());
              if (_testMode) {
                throw new RuntimeException("Error with script: " + e.getMessage(), e);
            else {
              _logger.error("Error with " + request.subRequest.endPointUrl + ": " + e.getMessage());
              if (_testMode) {
                throw new RuntimeException("Error with " + request.subRequest.endPointUrl + ": " + e.getMessage(), e);
          // Build the doc once the last request for this mergeKey has been processed
          if (!it.hasNext() || (request.mergeKey != it.peek().mergeKey)) { // deliberate ptr arithmetic
            String url = buildScriptUrl(request.mergeKey, request.queryIndex);
            if (_DEBUG) _logger.debug("DEB: postQA3: " + url + ": " + bsonArray);         
            doc = createDocFromJson(bsonArray, url, request, request.endpointInfo);
        } // (end if no cached doc)
        else { // cached doc, bypass lots of processing because no merging and doc already built
          doc = request.cachedDoc;
        }//TESTED (by hand)
        if (null != doc) {
          // Cache the document unless already cached (or caching disabled)
          if ((null == request.cachedDoc) && _cacheMode &&
              ((null == request.endpointInfo.cacheTime_days) || (request.endpointInfo.cacheTime_days >= 0)))
            if (null != request.cachedDoc_expired) {
              // Carry the expired doc's updateId (or, failing that, its _id) over to the new doc
              ObjectId updateId = request.cachedDoc_expired.getObjectId(DocumentPojo.updateId_);
              if (null != updateId) {
                doc.put(DocumentPojo.updateId_, updateId);
              else {
                doc.put(DocumentPojo.updateId_, request.cachedDoc_expired.getObjectId(DocumentPojo._id_));
              BasicDBObject docUpdate = new BasicDBObject(DocumentPojo.url_, doc.getString(DocumentPojo.url_));
              docUpdate.put(DocumentPojo.sourceKey_, doc.getString(DocumentPojo.sourceKey_));
              if (_DEBUG) _logger.debug("DEB: postQA4a: re-cached ... " + docUpdate.toString() + ": " + doc.getObjectId(DocumentPojo.updateId_));
            if (_DEBUG) _logger.debug("DEB: postQA4b: cached ... " + doc);         

          }//TESTED (by hand, 3 cases: cached not expired, cached expired first time, cached expired multiple times)
          // Scores are read once, from the first doc of the existing results
          if (!grabbedScores) {
            if (!docs.isEmpty()) {
              BasicDBObject topDoc = docs.get(0);
              aggregateSignif = topDoc.getDouble(DocumentPojo.aggregateSignif_, aggregateSignif);
              queryRelevance = topDoc.getDouble(DocumentPojo.queryRelevance_, queryRelevance);
              score = topDoc.getDouble(DocumentPojo.score_, score);
              grabbedScores = true;
              // OK would also like to grab the original matching entity
              BasicDBList ents = (BasicDBList) topDoc.get(DocumentPojo.entities_);
              if (null != ents) {
                for (Object entObj: ents) {
                  BasicDBObject ent = (BasicDBObject)entObj;
                  String entIndex = ent.getString(EntityPojo.index_, "");
                  if (entIndex.equals(request.queryIndex)) {
                    ents = (BasicDBList) doc.get(DocumentPojo.entities_);
                    if (null != ents) {
              }//TESTED (by hand)
          doc.put(DocumentPojo.aggregateSignif_, aggregateSignif);
          doc.put(DocumentPojo.queryRelevance_, queryRelevance);
          doc.put(DocumentPojo.score_, score);

          // If we're returning to a query then we'll adjust the doc format (some of the atomic fields become arrays)
          if (!_testMode) {
            convertDocToQueryFormat(doc, request.communityIdStrs);             
          }//TESTED (by hand)
          docs.add(0, doc);
          doc = null; //(reset)
          //(end if built a doc from the last request/set of requests)
        }//TESTED (3.1)   
      }//(end loop over federated requests)
      if (null != response.getStats()) {
        response.getStats().found += added;
      }//TESTED (by hand)     

  /**
   * Builds a document from the JSON objects returned by an endpoint, using the endpoint's
   * docConversionMap (JSON field or JsonPath expression -> "displayUrl" or an entity type).
   * Returns null when no entities were extracted, unless in test mode, where a document is
   * always created so that the JSON can be returned.
   *
   * NOTE(review): this listing appears to have lost lines (closing braces and the statements
   * that populate candidateEntities, ents, entDedup and entVals). TODO restore from upstream.
   *
   * @param jsonList the JSON objects to convert (each element is cast to BasicDBObject)
   * @param url stored as the document's url
   * @param request supplies the request parameter (used in the title) and the community ids
   * @param endpointInfo supplies the conversion map, dimension map, title prefix and parent source
   * @return the new document, or null if no entities were found outside test mode
   */
  public BasicDBObject createDocFromJson(BasicDBList jsonList, String url, FederatedRequest request, SourceFederatedQueryConfigPojo endpointInfo) {
    BasicDBObject doc = null; // (don't create unless needed)
    BasicDBList ents = null;
    StringBuffer entVals = null;
    HashSet<String> entDedup = null;
    if (_testMode) { // In test mode, need to return the JSON even if no entities are specified
      doc = new BasicDBObject();
    if (null != endpointInfo.docConversionMap) {
      for (Map.Entry<String, String> docInfo: endpointInfo.docConversionMap.entrySet()) {
        for (Object jsonObj: jsonList) {
          BasicDBObject json = (BasicDBObject)jsonObj;
          try {
            String key = docInfo.getKey();
            // (allow user to not prepend array: if they don't want to)
            // i.e. when the JSON is just { "array": ... }, prefix the key with "array" if it is not already there
            if ((1 == json.size()) && json.containsKey((Object)"array")) {
              if (!key.startsWith("array:") &&
                  !key.startsWith(":array") && !key.startsWith("$:array") &&
                  !key.startsWith("::") && !key.startsWith("$::"))
                if (key.startsWith(":")) { // jpath
                  key = ":array" + key;
                else if (key.startsWith("$:")) { // jpath
                  key = "$:array" + key.substring(1);
                else {
                  key = "array:" + key;
            }//TESTED (by hand)
            // A leading ':' is shorthand for a JsonPath expression; normalise to the '$' prefix
            if (key.startsWith(":")) { // jpath
              key = "$" + key;
            // NOTE: *not* org.json.JSONArray
            JSONArray candidateEntities = null;
            if (key.startsWith("$")) {
              // NOTE(review): the start of this call is missing in this listing - presumably a JsonPath read of "json"; TODO confirm
              JSONArray candidateEntities_tmp =, key.replace(':', '.'));
              if (null != candidateEntities_tmp) {
                candidateEntities = new JSONArray();
                for (Object o: candidateEntities_tmp) {
                  if (o instanceof String) {
                  else if (o instanceof JSONArray) {
                }//TESTED (displayUrl vs entities, 3.2)
            }//(TESTED (permutations above by hand))
            else {
              // Plain (non-JsonPath) key: ':' separators are converted to '.' for the property lookup
              String s = (String) MongoDbUtil.getProperty(json, key.replace(':', '.'));
              if (null != s) {
                candidateEntities = new JSONArray();
            }//TESTED (3.1)                     
            if (null != candidateEntities) for (int i = 0; i < candidateEntities.size(); ++i) {
              Object o = candidateEntities.get(i);
              if (!(o instanceof String)) {
              String s = o.toString();
              if (null == doc) {
                doc = new BasicDBObject();
                //(various fields added below)
              if (docInfo.getValue().equalsIgnoreCase(DocumentPojo.displayUrl_)) {
                doc.put(DocumentPojo.displayUrl_, s);
              }//TESTED (3.1, 4.*)
              else { // Entities!
                if (null == ents) {
                  ents = new BasicDBList();
                // Entity index is lower case "value/type"
                String index = s.toLowerCase() + "/" + docInfo.getValue().toLowerCase();
                if (null == entDedup) {
                  entDedup = new HashSet<String>();
                else if (entDedup.contains(index)) { // Entity deduplication
                }//TESTED (3.2)
                if (null == entVals) {
                  entVals = new StringBuffer(": ");
                else {
                  entVals.append(", ");
                // Dimension comes from the endpoint's type->dimension map, defaulting to "What"
                String dimension = null;
                if (null != endpointInfo.typeToDimensionMap) {
                  try {
                    dimension = EntityPojo.Dimension.valueOf(endpointInfo.typeToDimensionMap.get(docInfo.getValue())).toString();
                  catch (Exception e) {}
                if (null == dimension) {
                  dimension = EntityPojo.Dimension.What.toString();
                }//TESTED (by hand)
                // (alternative to "made up" values would be to go looking in the existing docs/ents?)
                // (we'll try to avoid that for now...)
                BasicDBObject ent = new BasicDBObject();
                ent.put(EntityPojo.disambiguated_name_, s);
                ent.put(EntityPojo.type_, docInfo.getValue());
                ent.put(EntityPojo.dimension_, dimension);
                ent.put(EntityPojo.relevance_, 1.0);
                ent.put(EntityPojo.doccount_, 1L); // (ie relative to this query)
                ent.put(EntityPojo.averageFreq_, 1.0);
                ent.put(EntityPojo.datasetSignificance_, 10.0); // (ie relative to this query)
                ent.put(EntityPojo.frequency_, 1.0);
                ent.put(EntityPojo.index_, index);
                ent.put(EntityPojo.queryCoverage_, 100.0); // (ie relative to this query)
                ent.put(EntityPojo.totalfrequency_, 1.0); // (ie relative to this query)
              }//TESTED (3.1, 4.*)
          catch (Exception e) {
            //(do nothing? null or the wrong type)
        }//end loop over various JSON objects retrieved
      }//(End loop over doc conversion elements)
    }//TESTED (3.*, 4.*)
    if ((null == ents) && !_testMode) { // don't return unless there are any entities
      return null;
    else if (null != doc) {
      // Insert mandatory fields:
      // (Note the query format is a little bit different, the following fields are converted to arrays:
      //  sourceKey, source, communityId, mediaType)
      doc.put(DocumentPojo._id_, new ObjectId());
      doc.put(DocumentPojo.url_, url);
      doc.put(DocumentPojo.created_, new Date());
      doc.put(DocumentPojo.modified_, new Date());
      doc.put(DocumentPojo.publishedDate_, new Date());
      doc.put(DocumentPojo.sourceKey_, endpointInfo.parentSource.getKey());
      doc.put(DocumentPojo.source_, endpointInfo.parentSource.getTitle());
      // Only the first community id is stored on the document here
      doc.put(DocumentPojo.communityId_, new ObjectId(request.communityIdStrs[0]));
      doc.put(DocumentPojo.mediaType_, endpointInfo.parentSource.getMediaType());
      doc.put(DocumentPojo.metadata_, new BasicDBObject("json", jsonList.toArray()));
      if ((null != entVals) && (entVals.length() > 165)) { // (arbitrary length)
      // Title is "<titlePrefix>: <requestParameter><entVals>"
      doc.put(DocumentPojo.title_, new StringBuffer(endpointInfo.titlePrefix).append(": ").append(request.requestParameter).append(entVals).toString());
      doc.put(DocumentPojo.entities_, ents);
      Gson gson = new GsonBuilder().setPrettyPrinting().create();
      JsonParser jp = new JsonParser();
      JsonElement je = jp.parse(jsonList.toString());
      doc.put(DocumentPojo.description_, gson.toJson(je)); // (prettified JSON)       
    }//TESTED (3.*, 4.*)
    return doc;
  // Convert some of the doc fields to
  private static BasicDBObject convertDocToQueryFormat(BasicDBObject doc, String[] communityIdStrs) {
    doc.put(DocumentPojo.sourceKey_, Arrays.asList(doc.get(DocumentPojo.sourceKey_)));
    doc.put(DocumentPojo.source_, Arrays.asList(doc.get(DocumentPojo.source_)));
    doc.put(DocumentPojo.mediaType_, Arrays.asList(doc.get(DocumentPojo.mediaType_)));
    doc.put(DocumentPojo.communityId_, communityIdStrs);

    return doc;
  }//TESTED (by hand)
  private static String buildScriptUrl(String mergeKey, String entityIndex) {
    String url = new StringBuilder().append("inf://federated/").append(mergeKey).append("/").append(entityIndex).toString();
    return url;
  }//TESTED (by hand)

  public static boolean TEST_MODE_ONLY = false
  public static DBCollection getCacheCollection() {
    if (TEST_MODE_ONLY) {
      return MongoDbManager.getCollection("test", "fed_query_cache");
    else {
      return MongoDbManager.getIngest().getFederatedCache();
  private static boolean _staticInitializer = false;
  private static long _lastChecked = -1L;
  private BasicDBObject getCache(String url, SourceFederatedQueryConfigPojo endpoint) {
    if ((null != endpoint.cacheTime_days) && (endpoint.cacheTime_days <= 0)) { // cache disabled
      return null;
    DBCollection endpointCacheCollection = getCacheCollection();
    if (!_staticInitializer) {
      _staticInitializer = true;
      endpointCacheCollection.ensureIndex(new BasicDBObject(SimpleFederatedCache.expiryDate_, 1));
    BasicDBObject cacheObj = (BasicDBObject) endpointCacheCollection.findOne(new BasicDBObject(SimpleFederatedCache._id_, url));
    if (null == cacheObj) {
      return null;
    // else found something, means there's stuff in the DB
    // so check it's not too big:
    Date now = new Date();         
    if ((-1 == _lastChecked) || (now.getTime() > (_lastChecked + (600L*1000L)))) { // (only check every 10 minutes)
      if (endpointCacheCollection.count() > SimpleFederatedCache.QUERY_FEDERATION_CACHE_CLEANSE_SIZE) {
        _lastChecked = now.getTime();
        // Remove everything with expiry date older than now       
        endpointCacheCollection.remove(new BasicDBObject(SimpleFederatedCache.expiryDate_, new BasicDBObject(DbManager.lt_, new Date())));
    }//TESTED (4.3)
    Date expiryDate = cacheObj.getDate(SimpleFederatedCache.expiryDate_, now);
    if (now.getTime() < expiryDate.getTime()) {
      return cacheObj;
    else {
      return null;
    }//TESTED (4.2)
  }//TESTED (4.*)
  private static final int DEFAULT_CACHE_TIME_DAYS = 5;
  private void cacheApiResponse(String url, BasicDBObject toCacheJson, SourceFederatedQueryConfigPojo endpoint) {
    int cacheTime_days = DEFAULT_CACHE_TIME_DAYS;
    if (null != endpoint.cacheTime_days) {
      cacheTime_days = endpoint.cacheTime_days;
    if (cacheTime_days <= 0) { // Disable _request_ cache (to disable all caching include doc caching use -1)
    DBCollection endpointCacheCollection = getCacheCollection();
    BasicDBObject toCacheObj = new BasicDBObject(SimpleFederatedCache._id_, url);
    toCacheObj.put(SimpleFederatedCache.cachedJson_, toCacheJson);
    toCacheObj.put(SimpleFederatedCache.expiryDate_, new Date(new Date().getTime() + cacheTime_days*3600L*24L*1000L));;
  }//TESTED (3.1, 4.*)

  // Document level caching, although it effectively serves as a mostly redundant request cache,
  // It's actually used to allow users to save federated query documents in their buckets
  public static boolean checkDocCache_isExpired(BasicDBObject cachedDoc, SourceFederatedQueryConfigPojo endpoint) {
    if (null == endpoint.cacheTime_days)
      endpoint.cacheTime_days = DEFAULT_CACHE_TIME_DAYS; 
    Date now = new Date();
    long cacheThreshold = cachedDoc.getDate(DocumentPojo.created_, now).getTime() + endpoint.cacheTime_days*3600L*24L*1000L;
    if (cacheThreshold < now.getTime()) // (ie doc-creation-time + cache is earlier than now => time to decache)
      if (_DEBUG) _logger.debug("DEB: preQA6zz: Cache expired: " + cachedDoc.getString(DocumentPojo.url_) + ": " + new Date(cacheThreshold) + " vs " + now);           
      return true;
      return false;

  // (my own threading)
  public static class FederatedHarvest extends Thread {
    public FederatedRequest request;
    public SimpleFederatedQueryEngine queryEngine;
    public void run() {
      String scriptResult = null;
      LinkedList<String> debugLog = queryEngine._testMode ? new LinkedList<String>() : null;
      try {
        scriptResult = queryEngine.performImportPythonScript(request.endpointInfo.importScript, request.requestParameter, request.fullQuery, request.endpointInfo.parentSource.getOwnedByAdmin(), debugLog);
      catch (Exception e) {
        request.errorMessage = e;
      Object parsedScriptResult = null;
      try {
        parsedScriptResult = com.mongodb.util.JSON.parse(scriptResult.toString());
      catch (Exception e) {
        request.errorMessage = new RuntimeException("Error deserializing " + scriptResult + ": " + e.getMessage());
      if (parsedScriptResult instanceof BasicDBObject) {
        request.scriptResult = (BasicDBObject) parsedScriptResult;
      else if (parsedScriptResult instanceof BasicDBList) {
        request.scriptResult = new BasicDBObject("array", parsedScriptResult);
      else if (parsedScriptResult instanceof String) {
        request.scriptResult = new BasicDBObject("value", parsedScriptResult);
      else {
        request.errorMessage = new RuntimeException("Error deserializing " + scriptResult + ": " + parsedScriptResult);
      try {
        MongoDbUtil.enforceTypeNamingPolicy(request.scriptResult, 0);
        if ((null != debugLog) && !debugLog.isEmpty()) {
          request.scriptResult.put("$logs", debugLog);
      catch (Exception ee) {
        request.errorMessage = new RuntimeException("Error deserializing " + scriptResult + ": " + ee.getMessage());
  }//TESTED (by hand)

  /**
   * Evaluates an endpoint's import script in {@code _pyEngine} and returns the
   * string form of the value produced by the script's final line.
   *
   * NOTE(review): this listing appears to have lost lines (closing braces, the
   * body of the import loop, the admin branch, the SSL section) - confirm
   * against the original source before relying on the details below.
   *
   * @param importScript the script text
   * @param entityValue if non-null, exposed to the script as {@code _entityValue}
   * @param query if non-null, its {@code toApi()} form is exposed to the script as {@code _query}
   * @param isSrcAdmin when null or false the script is evaluated through {@code _scriptingSecurityManager}
   * @param debugLog if non-null, the script's {@code ikanow_log(msg)} appends to this list; otherwise it is a no-op
   * @return {@code toString()} of the value assigned by the script's last line
   * @throws RuntimeException wrapping any Exception or Error raised, including a null result
   */
  private String performImportPythonScript(String importScript, String entityValue, AdvancedQueryPojo query, Boolean isSrcAdmin, LinkedList<String> debugLog) {
    String modCode = importScript;
    try {         
      // Create a return value: a uniquely-named variable that the script's last line is assigned to (see below)
      String fnName = "var" + new ObjectId();
      // Pull all the imports, evaluate them outside the security manager
      // (matches every line that starts with "import" or "from")
      Pattern importRegex = Pattern.compile("^\\s*(?:import|from)\\s[^\n]+", Pattern.MULTILINE);
      Matcher m = importRegex.matcher(importScript);
      StringBuffer sbImports = new StringBuffer();
      // NOTE(review): loop body is missing here - presumably collects each import into sbImports and evaluates them; confirm
      while (m.find()) {
      if (null != query) {
        _pyEngine.put("_query", query.toApi()); // full query       
      if (null != entityValue) {
        _pyEngine.put("_entityValue", entityValue); // allow either entityValue       

      // Logging function: ikanow_log appends to debugLog when one is supplied, else does nothing
      if (null != debugLog) {
        // (bind the list under a unique name so the generated function can reference it)
        String logName = "log" + new ObjectId();
        _pyEngine.put(logName, debugLog);
        _pyEngine.eval("def ikanow_log(logmsg):\n   " + logName + ".add(logmsg)\n\n");
      else {
        _pyEngine.eval("def ikanow_log(logmsg):\n   pass\n\n");       
      // Enable SSL everywhere (
      //didn't work:
      // Now run the script
      // (strip the import lines matched above, then rewrite the final line as "<fnName> = <final line>")
      modCode = m.replaceAll("");
      modCode = modCode.replaceAll("\n([^\n]+)$", "\n" + fnName + " = " + "$1");
      if ((null == isSrcAdmin) || !isSrcAdmin) {
        _scriptingSecurityManager.eval(_pyEngine, modCode);
      else {
        //TODO (INF-2798): Make this consistent with the how security is handled elsewhere
        // NOTE(review): the admin-path evaluation statement is missing from this listing

      // Get return value
      Object result = _pyEngine.get(fnName.toString());
      if (_DEBUG) _logger.debug("DEB: T1: Return val from script: " + result);     
      if (null == result) {
        throw new RuntimeException("Null return from script - final line needs to evaluate expression to return, eg '' or 'varname'");
      }//TESTED (by hand)
      return result.toString();
    } catch (Exception e) {
      throw new RuntimeException(e);
    // (Errors are also converted, so callers only ever see RuntimeException)
    catch (Error ee) {
      throw new RuntimeException(ee);
  // Check 1.x test results
  public void test_CheckIfQueryLaunched(String testName, boolean shouldWork) {
    FederatedRequest request = null;
    if (null != this._asyncRequestsPerQuery) {
      if (!this._asyncRequestsPerQuery.isEmpty()) {
        request = this._asyncRequestsPerQuery.peek();
        if (this._asyncRequestsPerQuery.size() > 1) {
          System.out.println("*** " + testName + ":  + too many async requests");
    if (shouldWork) {
      if (null == request) {
        System.out.println("*** " + testName + ": Expected request, got none");       
    else {
      if (null != request) {
        if (null != request.responseFuture) {
          System.out.println("*** " + testName + ": incorrectly added " + request.requestParameter);
        else if ((null == request.cachedResult) && (null == request.cachedDoc)) {
          System.out.println("*** " + testName + ": no request, not cached " + request.cachedResult);
  }//TESTED (by hand - who tests the tester?!)

  public void test_CheckIfDocAdded(String testName, List<BasicDBObject> docs) {
    // 1) Did a doc get added?
    if (2 != docs.size()) {
      System.out.println("*** " + testName + ": didn't add doc? " + docs.size());     
    // 2) Was it the right doc?
    BasicDBObject doc = docs.get(0);

    String docToCheck =
      "{ \"displayUrl\" : \"http://test3_1\" , \"url\" : \"inf://federated/fakeendpoint.123/test3_1/testentityin\" , \"sourceKey\" : [ \"fakeendpoint.123\"] , \"source\" : [ \"fakeendpoint\"] , \"communityId\" : [ \"4c927585d591d31d7b37097a\"] , \"mediaType\" : [ \"Report\"] , \"metadata\" : { \"json\" : [ { \"test\" : { \"field\" : [ \"test3_1\" , \"test3_1\"] , \"field2\" : \"http://test3_1\"}}]} , \"title\" : \"fake endpoint: : test3_1: test3_1\" , \"entities\" : [ { \"disambiguated_name\" : \"test3_1\" , \"type\" : \"TestEntityOut\" , \"dimension\" : \"What\" , \"relevance\" : 1.0 , \"doccount\" : 1 , \"averageFreq\" : 1.0 , \"datasetSignificance\" : 10.0 , \"frequency\" : 1.0 , \"index\" : \"test3_1/testentityout\" , \"queryCoverage\" : 100.0 , \"totalfrequency\" : 1.0}] , \"description\" : \"[\\n  {\\n    \\\"test\\\": {\\n      \\\"field\\\": [\\n        \\\"test3_1\\\",\\n        \\\"test3_1\\\"\\n      ],\\n      \\\"field2\\\": \\\"http://test3_1\\\"\\n    }\\n  }\\n]\" , \"aggregateSignif\" : 115.0 , \"queryRelevance\" : 105.0 , \"score\" : 110.0}";
    if (!docToCheck.equals(doc.toString())) {
      System.out.println("*** " + testName + ": document incorrect:\n" + docToCheck + "\nVS\n" + doc.toString());
    // 3) Did the doc get cached?
    DBCollection endpointCacheCollection = getCacheCollection();
    String hardwiredCacheId = "http://localhost:8186/test/test3_1/?param1=test3_1";
    BasicDBObject cachedVal = (BasicDBObject) endpointCacheCollection.findOne(new BasicDBObject(SimpleFederatedCache._id_, hardwiredCacheId));
    if (null == cachedVal) {
      System.out.println("*** " + testName + ": no cache for: " + doc.get(DocumentPojo.url_) + " / " + hardwiredCacheId);
    else {
      Date expiryDate = cachedVal.getDate(SimpleFederatedCache.expiryDate_, null);
      if ((null == expiryDate) || ((new Date().getTime() - expiryDate.getTime()) > 10*1000L)) {
        System.out.println("*** " + testName + ": expiry date for: " + doc.get(DocumentPojo.url_) + ": " + expiryDate + " vs now= " + new Date());       
      BasicDBObject cachedJson = (BasicDBObject) cachedVal.get(SimpleFederatedCache.cachedJson_);
      Object docVal = null;
      try {
        Object[] docVals = (Object[] ) ((BasicDBObject)doc.get(DocumentPojo.metadata_)).get("json");
        docVal = docVals[0];
      catch (Exception e) {
        Collection docVals = (Collection) ((BasicDBObject)doc.get(DocumentPojo.metadata_)).get("json");
        docVal = docVals.iterator().next();
      if ((null == cachedJson) || !cachedJson.equals(docVal)) {
        System.out.println("*** " + testName + ": cache: " + doc.get(DocumentPojo.url_) + ": cached val incorrect:\n" + docVal + "\nVS\n" + cachedJson);
  }//TESTED (by hand)
  // (general test utils)
  // Reset state ready for next test
  /**
   * Test helper: resets per-query state between tests.
   *
   * NOTE(review): this listing appears to have lost lines (closing braces and
   * the body of the asyncClient check) - confirm against the original source.
   *
   * @param clearCacheAlso if true, every entry in the cache collection is also removed
   */
  public void test_queryClear(boolean clearCacheAlso) {
    if (null != this._asyncRequestsPerQuery) {
      for (FederatedRequest req: this._asyncRequestsPerQuery) {
        try {
          // NOTE(review): body missing - presumably releases/closes req.asyncClient; confirm
          if (null != req.asyncClient) {
        // (best-effort: failures during this per-request cleanup are deliberately ignored)
        catch (Exception e) {}
    if (clearCacheAlso) {
      DBCollection endpointCacheCollection = getCacheCollection();
      // (an empty query matches, and so removes, every cache entry)
      endpointCacheCollection.remove(new BasicDBObject());
  }//TESTED (by hand - who tests the tester?!)
  /**
   * Test helper: back-dates the expiry of every cache entry to one hour ago.
   *
   * NOTE(review): as listed, the modified entry is never written back to the
   * collection and the loop/method braces are unclosed - a save statement
   * appears to be missing; confirm against the original source.
   */
  public void test_cacheExpire() {
    DBCollection endpointCacheCollection = getCacheCollection();

    // (iterate over every entry currently in the cache collection)
    DBCursor dbc = endpointCacheCollection.find();
    for (DBObject cacheEntryObj: dbc) {
      BasicDBObject cacheEntry = (BasicDBObject) cacheEntryObj;
      cacheEntry.put(SimpleFederatedCache.expiryDate_, new Date(new Date().getTime() - 3600L*1000L)); // (ie expired an hour ago);
  public void test_cacheClear(boolean clearApiCache, boolean clearDocCache, String key) {
    SimpleFederatedQueryEngine.TEST_MODE_ONLY = true;   
    // Clear doc_metadata table before running
    if (clearDocCache) {
      MongoDbManager.getDocument().getMetadata().remove(new BasicDBObject(DocumentPojo.sourceKey_, key));
    if (clearApiCache) {
      // (note: this only deletes the test collection because of TEST_MODE_ONLY code)
      SimpleFederatedQueryEngine.getCacheCollection().remove(new BasicDBObject()); // (clear test collection for testing)   
  /**
   * Test helper: optionally builds more than
   * {@code QUERY_FEDERATION_CACHE_CLEANSE_SIZE} already-expired cache elements,
   * then checks the cache collection's size, printing a "***" line to stdout
   * when it is not as expected.
   *
   * NOTE(review): this listing appears to have lost lines (closing braces and,
   * judging by the stray ";;", the statement that stores each fake element) -
   * confirm against the original source.
   *
   * @param testName label for diagnostics, also used as the prefix of the fake element ids
   * @param fill if true, generate the fake elements and back-date {@code _lastChecked}
   * @param shouldBeFull if true, expect at least CLEANSE_SIZE elements; otherwise expect exactly one
   */
  public void test_cacheFill(String testName, boolean fill, boolean shouldBeFull) {
    DBCollection endpointCacheCollection = getCacheCollection();
    if (fill) {
      // (one more element than the cleanse threshold)
      for (long i = 0; i < (1 + SimpleFederatedCache.QUERY_FEDERATION_CACHE_CLEANSE_SIZE); ++i) {
        SimpleFederatedCache fakeCacheElement = new SimpleFederatedCache();
        fakeCacheElement.expiryDate = new Date(new Date().getTime() - 3600L*1000L); // (ie expired an hour ago)
        fakeCacheElement._id = testName + "_" + i;
        fakeCacheElement.cachedJson = new BasicDBObject();;
      // Back-date the last size check by 602s, ie just past the 600s (10 minute) check interval
      _lastChecked = new Date(new Date().getTime() - 602L*1000L).getTime();
    long count = endpointCacheCollection.count();
    if (shouldBeFull) {
      if (count < SimpleFederatedCache.QUERY_FEDERATION_CACHE_CLEANSE_SIZE) {
        System.out.println("*** " + testName + ": cache should just contain many elements, not: " + count);
    else {
      if (1 != count) {
        System.out.println("*** " + testName + ": cache should just contain one element, not: " + count);

Related Classes of com.ikanow.infinit.e.api.knowledge.federated.SimpleFederatedQueryEngine$FederatedHarvest

Copyright © 2018 All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact