Source Code of ivory.sqe.querygenerator.SCFGQueryGenerator

package ivory.sqe.querygenerator;


import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.core.tokenize.BigramChineseTokenizer;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.sqe.retrieval.Constants;
import ivory.sqe.retrieval.StructuredQuery;


import java.io.IOException;
import java.util.Map;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;


import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonPrimitive;


import edu.umd.cloud9.io.map.HMapSFW;


/**
 * In addition to PSQ implemented in CLWordQueryGenerator, phrase translations are learned from a provided Synchronous context-free grammar (SCFG) (Hiero, 2006) and added to the representation.
 * 
 * @author ferhanture
 *
 */
public class SCFGQueryGenerator implements QueryGenerator {
  private static final Logger LOG = Logger.getLogger(SCFGQueryGenerator.class);
  private Tokenizer queryLangTokenizer, docLangTokenizer, bigramTokenizer;
  private Map<String, HMapSFW > probMap;
  private int length, numTransPerToken;
  private boolean bigramSegment;
  private RetrievalEnvironment env;
  private String queryLang, docLang;


  public SCFGQueryGenerator() throws IOException {
    super();
  }


  @Override
  public void init(FileSystem fs, Configuration conf) throws IOException {
    LOG.info(conf.get(Constants.DocLanguage));
    LOG.info(conf.get(Constants.DocTokenizerData));
    LOG.info(conf.get(Constants.MinWindow));
    LOG.info(conf.get(Constants.MaxWindow));
    LOG.info(conf.get(Constants.SCFGWeight));


    LOG.info("Stemmed stopword list file in query-language:" + conf.get(Constants.StemmedStopwordListQ));
    LOG.info("Stemmed stopword list file in doc-language:" + conf.get(Constants.StemmedStopwordListD));
    
    numTransPerToken = conf.getInt(Constants.NumTransPerToken, Integer.MAX_VALUE);


    queryLang = conf.get(Constants.QueryLanguage);
    docLang = conf.get(Constants.DocLanguage);


    float lexProbThreshold = conf.getFloat(Constants.LexicalProbThreshold, 0f);
    float cumProbThreshold = conf.getFloat(Constants.CumulativeProbThreshold, 1f);


    String bigram = conf.get(Constants.BigramSegment);
    bigramSegment = (bigram != null && bigram.equals("on")) ? true : false;
    if (bigramSegment) {
      bigramTokenizer = new BigramChineseTokenizer();
    }
    LOG.info("Bigram segmentation = " + bigramSegment);


    // initialize environment to access index
    try {
      env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
      env.initialize(true);
    } catch (ConfigurationException e) {
      e.printStackTrace();
    }


    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, conf.get(Constants.QueryLanguage), conf.get(Constants.QueryTokenizerData), false, null, null, null);
    docLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, docLang, conf.get(Constants.DocTokenizerData), true, null, conf.get(Constants.StemmedStopwordListD), null);


    // construct prob. dist. from translation grammar (SCFG)
    probMap = Utils.generateTranslationTable(fs, conf, docLangTokenizer);


    //remove low-prob entries, cut off after certain cumulative prob., and normalize
    Utils.normalize(probMap, lexProbThreshold, cumProbThreshold, 30);      
  }


  @Override
  public StructuredQuery parseQuery(String query) {
    JsonObject queryJson = new JsonObject();


    String[] tokens = query.trim().split("\\s");


    length = tokens.length;
    JsonArray tokenTranslations = new JsonArray();
    for (String token : tokens) {


      // if we do bigram segmentation on translation alternatives, we have to a have a weight
      // structure, even if k=1
      // unless the token has less than 3 characters, in which bigram segmentation will only return
      // the token itself
      if (numTransPerToken == 1 && !bigramSegment) {
        String trans = getBestTranslation(token);
        if (trans != null) {
          tokenTranslations.add(new JsonPrimitive(trans));
        }
      } else {
        JsonObject tokenTrans = new JsonObject();
        JsonArray weights = Utils.createJsonArrayFromProbabilities(getTranslations(token, null));
        if (weights != null) {
          tokenTrans.add("#weight", weights);
          tokenTranslations.add(tokenTrans);
        }
      }
    }
    queryJson.add("#combine", tokenTranslations);


    return new StructuredQuery(queryJson, length);
  }


  private String getBestTranslation(String token) {
    HMapSFW probDist = probMap.get(token);


    if(probDist == null){
      return token;
    }


    float maxProb = 0f;
    String maxProbTrans = null;
    for (edu.umd.cloud9.util.map.MapKF.Entry<String> entry : probDist.entrySet()) {
      if (entry.getValue() > maxProb) {
        maxProb = entry.getValue();
        maxProbTrans = entry.getKey();
      }
    }
    return maxProbTrans;
  }


  protected HMapSFW getTranslations(String token, Map<String, String> stemmed2Stemmed) {
    HMapSFW probDist = probMap.get(token);
    if(probDist == null){
      // heuristic: if no translation found, include itself as only translation
      probDist = new HMapSFW();
      String targetStem = stemmed2Stemmed.get(token);
      String target = (stemmed2Stemmed == null || targetStem == null) ? token : stemmed2Stemmed.get(token);
      probDist.put(target, 1);      
      return probDist;
    }


    return probDist;
  }
}
Source Code of ivory.sqe.querygenerator.SCFGQueryGenerator

Related Classes of ivory.sqe.querygenerator.SCFGQueryGenerator