Package com.ikanow.infinit.e.api.knowledge

Source Code of com.ikanow.infinit.e.api.knowledge.SearchHandler

/*******************************************************************************
* Copyright 2012, The Infinit.e Open Source Project.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
******************************************************************************/
package com.ikanow.infinit.e.api.knowledge;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.apache.lucene.queryParser.CrossVersionQueryParser;
import org.apache.lucene.search.CrossVersionIndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.CrossVersionIndexWriter;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.bson.types.ObjectId;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.action.search.SearchRequestBuilder;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.BaseQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHitField;
import org.elasticsearch.search.sort.SortOrder;

import com.ikanow.infinit.e.api.knowledge.aliases.AliasLookupTable;
import com.ikanow.infinit.e.api.knowledge.aliases.AliasManager;
import com.ikanow.infinit.e.api.utils.SocialUtils;
import com.ikanow.infinit.e.data_model.api.BasePojoApiMap;
import com.ikanow.infinit.e.data_model.api.ResponsePojo;
import com.ikanow.infinit.e.data_model.api.ResponsePojo.ResponseObject;
import com.ikanow.infinit.e.data_model.api.knowledge.DimensionListPojo;
import com.ikanow.infinit.e.data_model.api.knowledge.SearchSuggestPojo;
import com.ikanow.infinit.e.data_model.api.knowledge.SearchSuggestPojoApiMap;
import com.ikanow.infinit.e.data_model.index.ElasticSearchManager;
import com.ikanow.infinit.e.data_model.index.feature.entity.EntityFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.index.feature.event.AssociationFeaturePojoIndexMap;
import com.ikanow.infinit.e.data_model.store.DbManager;
import com.ikanow.infinit.e.data_model.store.MongoDbManager;
import com.ikanow.infinit.e.data_model.store.document.EntityPojo;
import com.ikanow.infinit.e.data_model.store.document.GeoPojo;
import com.ikanow.infinit.e.data_model.store.feature.association.AssociationFeaturePojo;
import com.ikanow.infinit.e.data_model.store.feature.entity.EntityFeaturePojo;
import com.ikanow.infinit.e.data_model.store.feature.geo.GeoFeaturePojo;
import com.ikanow.infinit.e.data_model.utils.ContentUtils;
import com.ikanow.infinit.e.data_model.utils.DimensionUtility;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.CommandResult;
import com.mongodb.DBCollection;


/**
* This class is for all operations related to the retrieval, addition
* or update of people within the system
*
* @author cmorgan
*
*/
//(remove this during active development - want to just depress a deprecation warning but no way of doing this for both 0.19 and 1.0)
//@SuppressWarnings("deprecation")
@SuppressWarnings("all")
public class SearchHandler
{
  private static final Logger logger = Logger.getLogger(SearchHandler.class);

  private final StringBuffer logMsg = new StringBuffer()
  private static long lastSuggestLog = 0;
  private static long lastAliasLog = 0;

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  // SEARCH SUGGEST API call

  //TODO (INF-1660): here and for assoc, should enforce doc_count>0? (or should i remove from entity feature when freq hits 0??)
  // (or both?)

  private static final String entityIndex_ = EntityFeaturePojoIndexMap.indexCollectionName_ + "/" + EntityFeaturePojoIndexMap.indexName_;

  public ResponsePojo getSuggestions(String userIdStr, String term, String communityIdStrList, boolean bIncludeGeo, boolean bIncludeLinkdata, boolean bWantNoAlias)
  {
    long nSysTime = System.currentTimeMillis();   

    ResponsePojo rp = new ResponsePojo();

    ElasticSearchManager gazIndex = ElasticSearchManager.getIndex(entityIndex_);

    // Need to do a quick decomposition of the term to fit in with analyzed strings
    String escapedterm = null;
    StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30, new StringReader(ContentUtils.stripDiacritics(term)));
    CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
    StringBuffer sb = new StringBuffer();
    try {
      try {
        st.reset();
        while (st.incrementToken()) {
          if (sb.length() > 0) {
            sb.append(" +");
          }
          else {
            sb.append('+');           
          }
          sb.append(luceneEncodeTerm(termAtt.toString()));
        }
      }
      finally {
        st.close();     
      }
    } catch (IOException e) {
      e.printStackTrace();
    }   

    if (!term.endsWith(" ") || (0 == sb.length())) { // Could be in the middle of typing, stick a * on the end
      sb.append('*');
    }//TESTED     
    escapedterm = sb.toString();     

    // Create the search query

    SearchRequestBuilder searchOptions = gazIndex.getSearchOptions();
    BaseQueryBuilder queryObj1 = QueryBuilders.queryString(escapedterm).defaultField(EntityFeaturePojoIndexMap.Mapping.RootObject.RootProperties.alias_pri_);

    String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);
    BaseQueryBuilder queryObj2 = QueryBuilders.boolQuery().should(QueryBuilders.termsQuery(EntityFeaturePojo.communityId_, communityIdStrs));

    BaseQueryBuilder queryObj = QueryBuilders.boolQuery().must(queryObj1).must(queryObj2);

    searchOptions.addSort(EntityFeaturePojo.doccount_, SortOrder.DESC);
    searchOptions.addFields(EntityFeaturePojo.disambiguated_name_, EntityFeaturePojo.doccount_,
        EntityFeaturePojo.type_, EntityFeaturePojo.dimension_);
    if (bIncludeGeo) {
      searchOptions.addFields(EntityFeaturePojo.geotag_);
      searchOptions.addFields(EntityFeaturePojo.ontology_type_);
    }
    if (bIncludeLinkdata) {
      searchOptions.addFields(EntityFeaturePojo.linkdata_);     
    }

    // Initial alias handling:

    AliasLookupTable aliasTable = null;
    HashMap<String, SearchSuggestPojo> aliasResults = null;
    if (!bWantNoAlias) {
      AliasManager aliasManager = AliasManager.getAliasManager();
      if (null != aliasManager) {
        aliasTable = aliasManager.getAliasLookupTable(communityIdStrList, communityIdStrs, null, userIdStr);
      }
    }
    //TESTED

    // Also create an internal Lucene index for aliases, in case any of them do not have actual entities representing them
    List<EntityFeaturePojo> extraEntries = null;
    if (null != aliasTable) {
      extraEntries = checkAliasMasters(aliasTable, escapedterm);
    }
    // (end initial alias handling)

    int nDesiredSize = 20;
    if (null == aliasTable) {   
      searchOptions.setSize(nDesiredSize); // will forward all 20
    }
    else {
      searchOptions.addFields(EntityFeaturePojo.index_);
      searchOptions.setSize(3*nDesiredSize); // will forward top 20 after de-aliasing

      aliasResults = new HashMap<String, SearchSuggestPojo>();
      // (We use this to ensure we only include each entity once after aliasing)
    }
    //TESTED

    // Perform the search

    SearchResponse rsp = gazIndex.doQuery(queryObj, searchOptions);

    // Format the return values

    SearchHit[] docs = rsp.getHits().getHits();     
    DimensionListPojo dimlist = new DimensionListPojo();
    int nDocsAdded = 0;
   
    if (null != extraEntries) { // Put the alias masters at the top:
      //DEBUG
      //System.out.println(Arrays.toString(extraEntries.toArray()));
      for (EntityFeaturePojo alias: extraEntries) {
        SearchSuggestPojo sp = new SearchSuggestPojo();
        if (null != alias.getDimension()) {
          sp.setDimension(alias.getDimension().toString());
        }
        else {
          sp.setDimension("What");
        }
        sp.setValue(alias.getDisambiguatedName());
        sp.setType(alias.getType());
        if (bIncludeGeo) {
          sp.setGeotag(alias.getGeotag());
        }
        sp.setOntology_type(alias.getOntology_type());
        dimlist.addSearchSuggestPojo(sp);
      }
    }//TESTED (inc geo)
   
    if (null != docs)
    {
      for (SearchHit hit: docs)
      {
        SearchHitField shf = hit.field(EntityFeaturePojo.disambiguated_name_);
        if (null == shf) { // robustness check, sometimes if the harvester goes wrong this field might be missing
          continue;
        }
        String disname = (String) shf.value();
        String type = (String) hit.field(EntityFeaturePojo.type_).value();
        String dimension = (String) hit.field(EntityFeaturePojo.dimension_).value();
        SearchSuggestPojo sp = new SearchSuggestPojo();       

        sp.setValue(disname);
        sp.setDimension(dimension);
        sp.setType(type);
        if (bIncludeGeo)
        {
          SearchHitField loc = hit.field(EntityFeaturePojo.geotag_);
          if ( loc != null )
            sp.setLocFromES((String) loc.value());
          SearchHitField ont = hit.field(EntityFeaturePojo.ontology_type_);
          if ( ont != null )
            sp.setOntology_type((String)ont.value());
        }
        if (bIncludeLinkdata) {
          SearchHitField linkdata = hit.field(EntityFeaturePojo.linkdata_);
          if ( linkdata != null )
            sp.setLinkdata(linkdata.values());
        }               

        // More alias handling
        String index = null;
        if (null != aliasTable) {
          index = (String) hit.field(EntityFeaturePojo.index_).value();
          EntityFeaturePojo alias = aliasTable.getAliasMaster(index);
          if (null != alias) { // Found!
            if (alias.getIndex().equalsIgnoreCase("discard")) { // Discard this entity
              continue;
            }
            else if ((null != alias.getDisambiguatedName()) && (null != alias.getType())) {
              // (these need to be present)

              //DEBUG (perf critical)
              //logger.debug("Alias! Replace " + index + " with " + alias.getIndex());

              index = alias.getIndex();
              disname = alias.getDisambiguatedName();
              type = alias.getType();
              if (null != alias.getDimension()) {
                dimension = alias.getDimension().toString();
              }
              else { // Guess from type
                dimension = DimensionUtility.getDimensionByType(type).toString();
              }
              // Reset values:
              sp.setValue(disname);
              sp.setDimension(dimension);
              sp.setType(type);
            }
          }
          SearchSuggestPojo existing = aliasResults.get(index);
          if (null != existing) {

            //DEBUG (perf critical)
            //logger.debug("Alias! Remove duplicate " + index);

            if ((null == existing.getGeotag()) && (null != sp.getGeotag())) {
              // (if they're both set then sigh just ignore on a first-come-first-served basis)
              existing.setGeotag(sp.getGeotag());
              existing.setOntology_type(sp.getOntology_type());
            }//TESTED
            if (null != sp.getLinkdata()) { // (here we can just combine the linkdata)
              if (null == existing.getLinkdata()) {
                existing.setLinkdata(sp.getLinkdata());
              }
              else {
                existing.getLinkdata().addAll(sp.getLinkdata());
              }
            }//TESTED
            continue; // (ie don't add this guy)
          }
          else { // add it
            aliasResults.put(index, sp);
          }
        }
        //TESTED
        // end more alias handing               

        dimlist.addSearchSuggestPojo(sp);
        // (only adds unique entries, ie handles multiple communities "ok" (only ok
        //  because it doesn't sum the doccounts across multiple communities, you'd probably
        //  want to use facets for that, but it doesn't seem worth it, especially since we're
        //  pretty short on field cache space)

        if (++nDocsAdded >= nDesiredSize) { // (can happen in the de-aliasing case)
          break;
        }//TESTED
      }     
    }
    rp.setData(dimlist);
    rp.setResponse(new ResponseObject("Suggestions",true,term));

    if (nSysTime > (lastSuggestLog + 5000)) {
      lastSuggestLog = nSysTime;
      logMsg.setLength(0);
      logMsg.append("knowledge/searchSuggest query=").append(escapedterm);
      logMsg.append(" groups=").append(communityIdStrList);
      logMsg.append(" found=").append(docs.length);
      logMsg.append(" time=").append(System.currentTimeMillis() - nSysTime).append(" ms");
      logger.info(logMsg.toString());
    }
    return rp;
 

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  // Geo suggestions code
  // (Haven't yet converted geo feature to string literals)

  public ResponsePojo getSuggestionsGeo(String userIdStr, String term, String communityIdStrList)
  {     
    ResponsePojo rp = new ResponsePojo();
   
    //validate term object to be a lat,lng or location
    if ( term == null )
      rp.setResponse(new ResponseObject("Suggestions Geo", false, "search term is required, was not provided"));
   
    boolean isLatLng = false;
    Double[] latlng = new Double[2];
    String[] terms = term.split(",");
    if ( terms.length == 2 )
    {
      try
      {
        latlng[0] = Double.parseDouble(terms[0]);
        latlng[1] = Double.parseDouble(terms[1]);
        isLatLng = true;
      }
      catch (Exception e)
      {
        //could not parse as double, treat as location
        //just fall through
      }
    }
    List<SearchSuggestPojo> locations = null;
    if ( isLatLng )
    {
      //lookup location name via lat/lng
       locations = reverseGeoLookup(latlng[0], latlng[1]);
     
    }
    else
    {
      //lookup lat/lngs via location name
      rp.setResponse(new ResponseObject("Suggestions Geo", false, "Search term provided could not be parsed as lat, lng... geotag lookup by name not yet supported."));
      return rp;
    }
   
    rp.setData(locations, new SearchSuggestPojoApiMap());
    rp.setResponse(new ResponseObject("Suggestions Geo", true, term));
    return rp;
  }
 
  private static Double MAXIMUM_DISTANCE_IN_METERS = 50000.0;
  /**
   * Performs a reverse geolookup, takes a lat/lon and returns a list of nearby
   * locations
   *
   * @param latitude
   * @param longitude
   * @return
   */
  private  List<SearchSuggestPojo> reverseGeoLookup(Double latitude, Double longitude)
  {
    List<SearchSuggestPojo> locations = null;
   
    BasicDBList results = runGeoNear(latitude, longitude);
   
    if ( results != null)
    {
      locations = new ArrayList<SearchSuggestPojo>();
      if ( results.size() > 0 )
      {
        for ( int i = 0; i < 10 && i < results.size(); i++ )
        {
          BasicDBObject result = (BasicDBObject) results.get(i);
          Double distance = result.getDouble("dis");
          BasicDBObject obj = (BasicDBObject) result.get("obj");
          locations.add( buildLocation(obj, distance) );
        }
      }
    }
    return locations;
  }
 
  /**
   * Sends a geonear command to the feature.geo database.  Returns back
   * a list of the nearest 10 locations
   * 
   * @param lat
   * @param lon
   * @return
   */
  private BasicDBList runGeoNear(Double lat, Double lon)
  {
    String location = null;
    BasicDBObject command = new BasicDBObject("geoNear", "geo");
    Double[] coordinates = {lat,lon};
    command.put("near", coordinates);
    command.put("maxDistance", MAXIMUM_DISTANCE_IN_METERS);
    CommandResult commandResult = MongoDbManager.getDB("feature").command(command);
    if ( commandResult.ok() && commandResult.containsField("results") )
    {
      BasicDBList results = (BasicDBList)commandResult.get("results");
      return results;     
    }
   
    return null;
  }
 
  /**
   * Takes a geonear result object and returns a searchsuggestpojo
   *
   * @param location
   * @param distance
   * @return
   */
  private SearchSuggestPojo buildLocation(BasicDBObject location, Double distance)
  {
    GeoFeaturePojo feature = GeoFeaturePojo.fromDb(location, GeoFeaturePojo.class);
    SearchSuggestPojo suggest = new SearchSuggestPojo();
    suggest.setOntology_type(feature.getOntology_type());
    suggest.setScore(distance);     
    suggest.setValue(buildLocation(feature));
    suggest.setGeotag(new GeoPojo(feature.getGeoindex().lat, feature.getGeoindex().lon));
    return suggest;
  }
 
  /**
   * Takes a feature.geo object from the geonear results and tries
   * to build out an object matching city, region, country or
   * search_field if all those are null.
   *
   * @param location
   * @param distance
   * @return
   */
  private String buildLocation(GeoFeaturePojo feature)
  {
    StringBuilder result = new StringBuilder();
    boolean needComma = false;
    if ( feature.getCity() != null )
    {
      result.append(feature.getCity());
      needComma = true;
    }
    if ( feature.getRegion() != null )
    {
      if ( needComma )
        result.append(", ");
      result.append(feature.getRegion());
      needComma = true;
    }
    if ( feature.getCountry() != null )
    {
      if ( needComma )
        result.append(", ");
      result.append(feature.getCountry());
      needComma = true;
    }
   
    if ( result.length() == 0 )
    {
      result.append(feature.getSearch_field());
    }
    return result.toString();
  }

  // Event suggestions code

  private static final String assocIndex_ = AssociationFeaturePojoIndexMap.indexCollectionName_ + "/" + AssociationFeaturePojoIndexMap.indexName_;

  public ResponsePojo getAssociationSuggestions(String userIdStr, String ent1, String verb, String ent2, String field, String communityIdStrList, boolean bWantNoAlias)
  {
    ResponsePojo rp = new ResponsePojo();
    try
    {
      // Community ids, needed in a couple of places
      String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);

      // Initial alias handling:
      AliasLookupTable aliasTable = null;
      // Initial alias handling:     
      if (!bWantNoAlias) {
        AliasManager aliasManager = AliasManager.getAliasManager();
        if (null != aliasManager) {
          aliasTable = aliasManager.getAliasLookupTable(communityIdStrList, communityIdStrs, null, userIdStr);
        }
      }//TESTED                   

      ElasticSearchManager esm = ElasticSearchManager.getIndex(assocIndex_);
      SearchRequestBuilder searchOptions = esm.getSearchOptions();
      BoolQueryBuilder boolQuery = QueryBuilders.boolQuery();
      boolean bExtraQueryTerms = false;
      String term = "";
      if ( !ent1.equals("null") )
      {
        if ( field.equals(AssociationFeaturePojo.entity1_) )
          term = ent1;
        else {
          bExtraQueryTerms = true;
          EntityFeaturePojo alias = null;
          if (null != aliasTable) {
            alias = aliasTable.getAliasMaster(ent1);
          }
          if (null != alias) { // Found!
            boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.entity1_index_, alias.getAlias().toArray()));
          }
          else {
            boolQuery.must(QueryBuilders.termQuery(AssociationFeaturePojo.entity1_index_, ent1));
          }//TESTED
        }
      }
      if ( !verb.equals("null") )
      {
        if ( field.equals(AssociationFeaturePojo.verb_) )
          term = verb;
        else
        {
          bExtraQueryTerms = true;
          boolQuery.must(QueryBuilders.queryString(new StringBuffer("+").append(verb.replaceAll("\\s+", " +")).toString()).
              defaultField(AssociationFeaturePojo.verb_));
        }
      }
      if ( !ent2.equals("null") )
      {
        if ( field.equals(AssociationFeaturePojo.entity2_) )
          term = ent2;
        else {
          bExtraQueryTerms = true;
          EntityFeaturePojo alias = null;
          if (null != aliasTable) {
            alias = aliasTable.getAliasMaster(ent2);
          }
          if (null != alias) { // Found!
            boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.entity2_index_, alias.getAlias().toArray()));
          }
          else {
            boolQuery.must(QueryBuilders.termQuery(AssociationFeaturePojo.entity2_index_, ent2));
          }
        }//TESTED (cut and paste from entity1)
     

      String escapedterm = null;
      StandardTokenizer st = new StandardTokenizer(Version.LUCENE_30, new StringReader(ContentUtils.stripDiacritics(term)));
      CharTermAttribute termAtt = st.addAttribute(CharTermAttribute.class);
      StringBuffer sb = new StringBuffer();
      try {
        try {
          st.reset();
          while (st.incrementToken()) {
            if (sb.length() > 0) {
              sb.append(" +");
            }
            else {
              sb.append('+');           
            }
            sb.append(luceneEncodeTerm(termAtt.toString()));
          }
        }
        finally {
          st.close();
        }
      } catch (IOException e) {
        e.printStackTrace();
      }     
      if (!term.endsWith(" ") || (0 == sb.length())) { // Could be in the middle of typing, stick a * on the end
        sb.append('*');
      }//TESTED     

      escapedterm = sb.toString();
     
      // Also create an internal Lucene index for aliases, in case any of them do not have actual entities representing them
      List<EntityFeaturePojo> extraEntries = null;
      BoolQueryBuilder extraQueryTerms = null;
      if (field.startsWith("entity")) {
        String indexField = field.startsWith("entity1") ? "entity1_index" : "entity2_index";
        if (null != aliasTable) {
          extraEntries = checkAliasMasters(aliasTable, escapedterm);
        }
        if (null != extraEntries) {
          extraQueryTerms = QueryBuilders.boolQuery();
          int nExtraTerms = 0;
          Iterator<EntityFeaturePojo> aliasIt = extraEntries.iterator();
          while (aliasIt.hasNext()) {
            EntityFeaturePojo alias = aliasIt.next();           
            nExtraTerms += alias.getAlias().size();
           
            if (!bExtraQueryTerms && (nExtraTerms > 20)) { // If not filtering on event type we'll be more aggressive
              break;
            }//TESTED
            if (bExtraQueryTerms && (nExtraTerms > 60)) { // If the number of terms gets too large bail anyway
              break;
            }//TESTED
           
            extraQueryTerms.should(QueryBuilders.termsQuery(indexField, alias.getAlias().toArray()));
            aliasIt.remove();
           
          }//end loop over entities
        }//if found new aliases
       
      }//(if this is an entity lookup) TESTED - including breaking out because of # of terms
     
      // (end initial alias handling)
     
      if (null == extraQueryTerms) {
        boolQuery.must(QueryBuilders.queryString(escapedterm).defaultField(field));
      }
      else {//(in this case combine the escaped term with the aliases
        extraQueryTerms.should(QueryBuilders.queryString(escapedterm).defaultField(field));
        boolQuery.must(extraQueryTerms);
      }//TESTED
      boolQuery.must(QueryBuilders.termsQuery(AssociationFeaturePojo.communityId_, communityIdStrs));

      searchOptions.addSort(AssociationFeaturePojo.doccount_, SortOrder.DESC);

      // Work out which fields to return:
      //TODO (INF-1234) need to work out what to do with quotations and similar here (ie entityX without entityX_index)
      String returnfield;
      boolean bReturningEntities = true;
      if ( field.equals(AssociationFeaturePojo.entity1_) ) {
        returnfield = AssociationFeaturePojo.entity1_index_;
        searchOptions.addFields( AssociationFeaturePojo.entity1_index_, AssociationFeaturePojo.doccount_);
      }
      else if ( field.equals(AssociationFeaturePojo.entity2_)) {
        returnfield = AssociationFeaturePojo.entity2_index_;
        searchOptions.addFields( AssociationFeaturePojo.entity2_index_, AssociationFeaturePojo.doccount_);
      }
      else {
        bReturningEntities = false;
        returnfield = AssociationFeaturePojo.verb_;
        searchOptions.addFields( AssociationFeaturePojo.verb_, AssociationFeaturePojo.verb_category_,  AssociationFeaturePojo.doccount_);
      }

      int nNumSuggestionsToReturn = 20;
      if (bReturningEntities && (null != aliasTable)) {
        searchOptions.setSize(3*nNumSuggestionsToReturn); // we're going to remove some duplicates so get more than we need
      }
      else { // normal case
        searchOptions.setSize(nNumSuggestionsToReturn);
      }

      SearchResponse rsp = esm.doQuery(boolQuery, searchOptions);
      SearchHit[] docs = rsp.getHits().getHits();

      //Currently this code takes the results and puts
      //them into a set so there are no duplicates
      //duplicates occur for example when you search for
      //obama you get obama/quotation/quote1 and obama/travel/spain
      //may want to work this differnt, or atleast sum up
      //frequency
      Set<String> suggestions = new HashSet<String>();

      for (SearchHit hit: docs)
      {
        SearchHitField retField = hit.field(returnfield); // (this can be null in theory/by mistake)
        if (null != retField) {
          String suggestion = (String) retField.value();
          if (bReturningEntities && (null != aliasTable))
          {
            // More alias handling
            EntityFeaturePojo alias = aliasTable.getAliasMaster(suggestion);
            if (null != alias) { // Found!
              if (alias.getIndex().equalsIgnoreCase("discard")) { // Discard this entity
                continue;
              }
              else {
                // (these need to be present)
                suggestion = alias.getIndex();
              }
            }//TESTED
          }
          else { // (old code, still valid for verbs or no aliases)
            if ( returnfield.equals(AssociationFeaturePojo.verb_) && hit.field(AssociationFeaturePojo.verb_category_) != null )
              //for some reason verb_cat can be null!?!?! i think this is broken (ent1 facebook inc/company verb *)
            {
              String verbcat = (String)hit.field(AssociationFeaturePojo.verb_category_).value();
              suggestion += " (" + verbcat + ")";
              suggestions.add(verbcat);
            }
          }
          suggestions.add(suggestion);

          if (suggestions.size() >= nNumSuggestionsToReturn) {
            break;
          }

        } // (end return string valid)
      }//end loop over suggestions

      // Add any aliases that I couldn't explicity convert to query terms
      if ((null != extraEntries) && (suggestions.size() < nNumSuggestionsToReturn)) {
        for (EntityFeaturePojo alias: extraEntries) {
          suggestions.add(alias.getIndex());
          if (suggestions.size() >= nNumSuggestionsToReturn) {
            break;
          }         
        }
      }//(end add any remaining entries)
      //TESTED     
     
      String[] suggestionArray = new String[suggestions.size()];
      rp.setData(Arrays.asList(suggestions.toArray(suggestionArray)), (BasePojoApiMap<String>)null);

      String searchTerm = "";
      if ( field.equals(AssociationFeaturePojo.entity1_))
        searchTerm = ent1;
      else if ( field.equals(AssociationFeaturePojo.verb_))
        searchTerm = verb;
      else
        searchTerm = ent2;

      rp.setResponse(new ResponseObject("Association Suggestions", true, searchTerm));
    }
    catch (Exception ex)
    {
      ex.printStackTrace();
      rp.setResponse(new ResponseObject("Association Suggestions",false,"Response returned unsuccessfully: " + ex.getMessage()));
    }
    return rp;
 

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  // Unused Alias code - returns aliases for a term
  // (The GUI code crashes or something, and anyway I'm not convinced we want to expose this to the user)

  public ResponsePojo getAliasSuggestions(String userIdStr, String term, String field, String communityIdStrList)
  {
    long nSysTime = System.currentTimeMillis();   
    ResponsePojo rp = new ResponsePojo();

    // (keep user facing data model consistent, ie index(ex gazateer_index), actual_name/alias, disambiguated_name (ex disambiguous_name))
    if (field.equalsIgnoreCase(EntityPojo.actual_name_) || field.equalsIgnoreCase(EntityFeaturePojo.alias_)) {
      field = EntityFeaturePojo.alias_;
    }
    else if (field.equalsIgnoreCase("disambiguous_name") || field.equals(EntityPojo.disambiguated_name_)
        || field.equals(EntityFeaturePojo.disambiguated_name_)) {
      //^^ (for bw compatibility from GUI)
      field = EntityFeaturePojo.disambiguated_name_;
    }
    else if (field.equalsIgnoreCase("gazateer_index") || field.equalsIgnoreCase(EntityPojo.index_)) { // (for bw compatibility from GUI)
      field = EntityFeaturePojo.index_;
    }
    else if (!field.equalsIgnoreCase(EntityFeaturePojo.index_)) {
      rp.setResponse(new ResponseObject("aliasSuggest",false, "Field " + field + " not recognized"));
      return rp;
    }

    try
    {        
      Collection<Set<String>> aliasSet = findAliases(null, field, Arrays.asList(term), userIdStr, communityIdStrList).values();
      Set<String> superSet = new HashSet<String>();
      for (Set<String> set : aliasSet )
      {
        superSet.addAll(set);
      }      
      rp.setData(superSet, (BasePojoApiMap<String>)null);
      rp.setResponse(new ResponseObject("aliasSuggest",true,"Successfully returned aliases"));

      if (nSysTime > (lastAliasLog + 5000)) {
        lastAliasLog = nSysTime;
        logMsg.setLength(0);
        logMsg.append("knowledge/aliasSuggest query=").append(term);
        logMsg.append(" found=").append(superSet.size());
        logMsg.append(" time=").append(System.currentTimeMillis() - nSysTime).append(" ms");
        logger.info(logMsg.toString());
      }         
    }
    catch (Exception e)
    {
      // If an exception occurs log the error
      logger.error("Exception Message: " + e.getMessage(), e);
      rp.setResponse(new ResponseObject("aliasSuggest",false,"Error returning aliases"));
    }
    return rp;
  }   

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  // Alias utility code - used by (unused) alias suggestions code above and also for alias expansion

  public static Map<String, Set<String>> findAliases(DBCollection entityFeatureDb, String field, Collection<String> terms, String userIdStr, String communityIdStrList)
  {
    Map<String, Set<String>> aliases = new HashMap<String, Set<String>>();   
    String[] communityIdStrs = SocialUtils.getCommunityIds(userIdStr, communityIdStrList);
    try
    {
      if (null == entityFeatureDb) {
        entityFeatureDb = DbManager.getFeature().getEntity();
      }

      // Get all the aliases in one go, will sort them out later
      BasicDBObject query = new BasicDBObject();     
      query.put(field, new BasicDBObject(MongoDbManager.in_, terms));
      ObjectId[] communityIds = new ObjectId[communityIdStrs.length];
      int i = 0;
      for (String idStr: communityIdStrs) {
        communityIds[i] = new ObjectId(idStr);
        i++;
      }
      query.put(EntityFeaturePojo.communityId_, new BasicDBObject(MongoDbManager.in_, communityIds));

      List<EntityFeaturePojo> gpl = EntityFeaturePojo.listFromDb(entityFeatureDb.find(query), EntityFeaturePojo.listType());   

      for ( String s : terms )
      {       
        aliases.put(s, new HashSet<String>());
        for (EntityFeaturePojo gpit : gpl)
        {
          if ((field.equals(EntityFeaturePojo.index_) && gpit.getIndex().equals(s)) // gazname
              ||
              (field.equals(EntityFeaturePojo.disambiguated_name_) && gpit.getDisambiguatedName().equals(s)) // alias
              ||
              (field.equals(EntityFeaturePojo.alias_) && gpit.getAlias().contains(s))) // alias
          {
            aliases.get(s).addAll(gpit.getAlias());           
          }
        }
      }
    }
    catch(Exception e)
    {
      logger.error("Exception Message: " + e.getMessage(), e);
    }
    return aliases;
  }

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
  // INTERNAL SEARCHING OF ALIAS MASTERS (USES LUCENE)
 
  private static CrossVersionIndexSearcher _aliasSearcherCache = null;
  private static Date _searcherCacheLastCreated = null;
  private static EntityFeaturePojo[] indexToSearchCacheIndexes = null;

  private synchronized void createAliasSearchCache(AliasLookupTable aliasTable)
  {
    // Check if we need to update the Lucene store:
    if ((null != _searcherCacheLastCreated) && (null != aliasTable.getLastModified())) {
      if (_searcherCacheLastCreated.getTime() >= aliasTable.getLastModified().getTime()) {
        return;
      }
    }//TESTED

    RAMDirectory idx = new RAMDirectory();

    try {
      CrossVersionIndexWriter writer = new CrossVersionIndexWriter(idx, Version.LUCENE_30, new StandardAnalyzer(Version.LUCENE_30));
      int nAdded = 0;
      indexToSearchCacheIndexes = new EntityFeaturePojo[aliasTable.masters().size()];
      for (EntityFeaturePojo alias: aliasTable.masters()) {

        if ((null != alias.getIndex()) && (null != alias.getDisambiguatedName()) && (null != alias.getAlias())
            && !alias.getIndex().equalsIgnoreCase("discard") && !alias.getAlias().contains(alias.getIndex()))
        {
          // (that last check just means there's no point in including the alias if it has itself as a sub-alias)
          writer.addSingleAnalyzedUnstoredFieldDocument("name", alias.getDisambiguatedName());
          indexToSearchCacheIndexes[nAdded] = alias;
          nAdded++;
          //System.out.println("CACHE ADD: " + alias.getDisambiguatedName() + ": " + nAdded + " - " + alias.getIndex());
        }
      }
      writer.close();
     
      if (nAdded > 0) {
        if (null != _aliasSearcherCache) {
          try {
            _aliasSearcherCache.getIndexReader().close();
          }
          catch (Exception e) {}         
        }
        _aliasSearcherCache = new CrossVersionIndexSearcher(idx);
        if (null != _aliasSearcherCache) {
          _searcherCacheLastCreated = aliasTable.getLastModified();
        }
      }
      else {
        _aliasSearcherCache = null;
        _searcherCacheLastCreated = aliasTable.getLastModified();        
      }
    }//TESTED
    catch (Exception e) {
      //Probably should never happen once set up correctly
      e.printStackTrace();
    }
  }//TESTED

  private ArrayList<EntityFeaturePojo> checkAliasMasters(AliasLookupTable aliasTable, String term) {
    createAliasSearchCache(aliasTable); // (only does anything if needed)
    ArrayList<EntityFeaturePojo> retVal = null;

    if (null != _aliasSearcherCache) {
      try {
        if (term.startsWith("*")) { // match all
          retVal = new ArrayList<EntityFeaturePojo>(indexToSearchCacheIndexes.length);
          for (EntityFeaturePojo ent: indexToSearchCacheIndexes) {
            if (null != ent) {
              retVal.add(ent);
            }
            else {
              break;
            }
          }
        }//TESTED (end special case, "*" wildcard)
        else {
          Query query = new CrossVersionQueryParser(Version.LUCENE_30, "name", new StandardAnalyzer(Version.LUCENE_30)).parse(term);
          TopDocs results = _aliasSearcherCache.search(query, aliasTable.masters().size());
          ScoreDoc[] hits = results.scoreDocs;
          if (hits.length > 0) {
            retVal = new ArrayList<EntityFeaturePojo>(hits.length);
            for (ScoreDoc hit: hits) {
              retVal.add(indexToSearchCacheIndexes[hit.doc]);
            }
          }
        }//TESTED (normal case, Lucene lookup)
      }
      catch (Exception e) {
        //Probably should never happen once set up correctly
        e.printStackTrace();
      }     
    }
    return retVal;
  }//TESTED

  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

  // Some Lucene utlities:

  public static String luceneEncode(String rawQuery)
  {
    // + - && || ! ( ) { } [ ] ^ " ~ * ? : \ /
    /// add quotes to make it exact
    return '"' + rawQuery.replaceAll("([\"+~*?:/|&(){}\\[\\]\\^\\!\\-\\\\])", "\\\\$1") + '"';
  }
  public static String luceneEncodeTerm(String rawQueryTerm)
  {
    // + - && || ! ( ) { } [ ] ^ " ~ * ? : \
    /// (no quotes)
    return rawQueryTerm.replaceAll("([\"+~*?:/|&(){}\\[\\]\\^\\!\\-\\\\])", "\\\\$1");
  }

}
TOP

Related Classes of com.ikanow.infinit.e.api.knowledge.SearchHandler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.