Package edu.gslis.ttg.searchers

Source Code of edu.gslis.ttg.searchers.SimpleSearcher

package edu.gslis.ttg.searchers;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import cc.twittertools.search.api.TrecSearchThriftClient;
import cc.twittertools.thrift.gen.TResult;
import edu.gslis.queries.GQuery;
import edu.gslis.textrepresentation.FeatureVector;

public class SimpleSearcher {
 
  private TrecSearchThriftClient client;
  private int maxResults;
 
  public SimpleSearcher(TrecSearchThriftClient client, int maxResults) {
    this.client = client;
    this.maxResults = maxResults;
  }
 
 
  public Map<Long, TResult> search(GQuery query) {
    // clean up query
    String queryText = query.getText();
    queryText = queryText.replaceAll("[,'\\.\\?]", " ");
    queryText = queryText.replaceAll("  ", " ").trim();
   
    // need to lowercase the query vector
    FeatureVector temp = new FeatureVector(null);
    Iterator<String> qTerms = query.getFeatureVector().iterator();
    while(qTerms.hasNext()) {
      String term = qTerms.next();
      temp.addTerm(term.toLowerCase(), query.getFeatureVector().getFeatureWeight(term));
    }
    temp.normalize();;
    query.setFeatureVector(temp);
   
    System.err.println(query.getTitle()+": "+queryText);
   
    // perform search
    List<TResult> results = null;
    try {
      results = client.search(queryText, Long.parseLong(query.getMetadata("querytweettime")), maxResults);
    } catch (Exception e) {
      System.err.println("Error searching.");
      System.exit(-1);
    }
   
    // set cutoff score heuristically
    double topScore = results.get(0).getRsv();
    double cutOffScore = topScore / 2;
   
    // record hits, removing duplicates
    int i = 1;
    Map<Long, TResult> seenMap = new HashMap<Long, TResult>();
    Iterator<TResult> hitIterator = results.iterator();
    while(hitIterator.hasNext()) {
      TResult hit = hitIterator.next();
      if (hit.getRsv() < cutOffScore) {
        break;
      }
     
      long docId = hit.id;
      if (seenMap.containsKey(docId))
        continue;
      seenMap.put(docId, hit);

      if(i++ >= maxResults)
        break;
    }
   
    return seenMap;
  }

}
TOP

Related Classes of edu.gslis.ttg.searchers.SimpleSearcher

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.