// Package org.apache.lucene.search
//
// Source Code of org.apache.lucene.search.PwaScorerFeatures

package org.apache.lucene.search;

import java.io.IOException;
import java.net.ConnectException;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import org.apache.commons.codec.binary.Base64;
import org.apache.lucene.search.caches.PwaDateCache;
import org.apache.lucene.search.caches.PwaIndexStats;
import org.apache.lucene.search.features.*;
import org.apache.lucene.search.features.querydependent.*;
import org.apache.lucene.search.features.queryindependent.*;
import org.apache.lucene.search.features.temporal.*;
import org.apache.lucene.search.memcached.*;
import org.apache.lucene.document.Document;


/**
* Scores ranking features
* @author Miguel Costa
*/
public class PwaScorerFeatures {
 
  private final static String BOOST_LABEL="boost"
 
  private final static String MEMCACHED_ADDRESSES="193.136.192.57:11111"; //memcached TODO parameterize
  //private final static String MEMCACHED_ADDRESSES="127.0.0.1:11211"; //membase
  private static Memcached memcache=null;
  private static int maxVersions;
  private static int maxSpan; 
  private static long minTimestamp;
  private static long maxTimestamp;
 
   
  /**
   * Ranking model that computes score  
   * @param doc document identifier
   * @param queryTimestamp timestamp when the query was submitted
   * @param collector ranking features collector
   * @param posmanagers query term position into the document
   * @param searcher searcher
   * @param functions ranking functions
   * @return ranking score
   */
  public static PwaScores score(int doc, long queryTimestamp, PwaRawFeatureCollector collector, Vector<PwaPositionsManager> posmanagers, Searcher searcher, PwaFunctionsWritable functions) throws IOException {   
    PwaScores scores=new PwaScores();   
    int nDocs=collector.getNumDocs();
    Vector<Integer> vecTfs;
    Vector<Integer> vecIdfs;
    int fieldLength;
    double fieldAvgLength;   
    Vector<Vector<Integer>> tfPerField=new Vector<Vector<Integer>>();
    Vector<Vector<Integer>> idfPerField=new Vector<Vector<Integer>>();
    Vector<Integer> nTermsPerField=new Vector<Integer>()
    //Vector<Double> avgNTermsPerField=new Vector<Double>();
       
    Vector<Integer> sumVecTfs=null;
    Vector<Integer> sumVecIdfs=null;
    Integer sumFieldLength=new Integer(0)
    Double sumfieldAvgLength=new Double(0);   
   
    int funct=0; // function index   
    String surl=null; // URL string
       
    // query dependent features
    if (!collector.isEmpty()) {
      // term weight features
      for (int i=0;i<PwaIndexStats.FIELDS.length;i++) {       
        vecTfs=collector.getFieldTfs(PwaIndexStats.FIELDS[i]); // vector of all query terms  for tf per term
        vecIdfs=collector.getFieldIdfs(PwaIndexStats.FIELDS[i]); // vector of all query terms for idf per term   
        fieldLength=collector.getFieldLength(PwaIndexStats.FIELDS[i]);
        fieldAvgLength=collector.getFieldAvgLength(PwaIndexStats.FIELDS[i]);     
     
        if (functions.hasFunction(funct)) {
          float score=0;
          for (int j=0;j<vecTfs.size();j++) {  // for all terms         
            score+= vecTfs.get(j);
          }
          scores.addScore(funct, score); // sum of the frequency of each term
        }
        funct++;               
        if (functions.hasFunction(funct)) {
          float score=0;
          for (int j=0;j<vecIdfs.size();j++) {  // for all terms         
            score+= vecIdfs.get(j);
          }
          scores.addScore(funct, score); // sum of the inverse document frequency of each term
        }
        funct++;             
        if (functions.hasFunction(funct)) {
          scores.addScore(funct, fieldLength); // field length     
        }
        funct++;       
        if (functions.hasFunction(funct)) {
          scores.addScore(funct, (float)fieldAvgLength); // field average length     
        }
        funct++;               
        if (functions.hasFunction(funct)) {
          scores.addScore(funct, (new PwaTFxIDF(vecTfs,vecIdfs,fieldLength,nDocs)).score()); // "TFxIDF-"+PwaIndexStats.FIELDS[i]          
        }
        funct++;
        if (functions.hasFunction(funct)) {
          scores.addScore(funct, (new PwaBM25(vecTfs,vecIdfs,fieldLength,fieldAvgLength,nDocs)).score()); // "BM25-"+PwaIndexStats.FIELDS[i]       
        }
        funct++;
                           
        // add values to vectors for term weighting functions using all fields
        tfPerField.add(vecTfs);
        idfPerField.add(vecIdfs);
        nTermsPerField.add(fieldLength);         
        //avgNTermsPerField.add(fieldAvgLength);
       
        if (sumVecTfs==null) { // i==0
          sumVecTfs=(Vector<Integer>)vecTfs.clone();
          sumVecIdfs=(Vector<Integer>)vecIdfs.clone();
        }
        else {
          for (int j=0;j<vecTfs.size();j++) {
            sumVecTfs.set(j,sumVecTfs.get(j)+vecTfs.get(j));
            sumVecIdfs.set(j,sumVecIdfs.get(j)+vecIdfs.get(j));
          }
        }
        sumFieldLength+=fieldLength;
        sumfieldAvgLength+=fieldAvgLength;             
     
      // term weight features using all fields at once
      if (functions.hasFunction(funct)) {
        scores.addScore(funct, (new PwaTFxIDF(sumVecTfs,sumVecIdfs,sumFieldLength,nDocs)).score()); // "TFxIDF-" + all fields          
      }
      funct++;
      if (functions.hasFunction(funct)) {
        scores.addScore(funct, (new PwaBM25(sumVecTfs,sumVecIdfs,sumFieldLength,sumfieldAvgLength,nDocs)).score()); // "BM25-" + all fields       
      }
      funct++;       
      if (functions.hasFunction(funct)) {       
        scores.addScore(funct, (new PwaLuceneSimilarity(tfPerField,idfPerField,nTermsPerField,nDocs)).score()); // Lucene + all fields
      }
      funct++;
      if (functions.hasFunction(funct)) {       
        scores.addScore(funct, (new PwaLuceneSimilarityNormalized(tfPerField,idfPerField,nTermsPerField,nDocs)).score()); // Lucene normalized + all fields
      }
      funct++;
      if (functions.hasFunction(funct)) {       
        scores.addScore(funct, (new PwaNutchSimilarity(tfPerField,idfPerField,nTermsPerField,nDocs)).score()); // Nutch + all fields
      }
      funct++;
      if (functions.hasFunction(funct)) {       
        scores.addScore(funct, (new PwaNutchSimilarityNormalized(tfPerField,idfPerField,nTermsPerField,nDocs)).score()); // Nutch normalized + all fields
      }
      funct++;

      // term distance features
      for (int i=0;i<PwaIndexStats.FIELDS.length;i++) { // or for (i=0;i<posmanagers.length;i++) {  // per field
        if (functions.hasFunction(funct) || functions.hasFunction(funct+1) || functions.hasFunction(funct+2)) {         
          if (posmanagers.size()>0) {          
            posmanagers.get(i).computeDistances(doc);
            if (functions.hasFunction(funct)) {           
              scores.addScore(funct, (new PwaMinSpan(posmanagers.get(i).getMinSpanCovOrdered())).score()); // "MinSpanCovOrd-"+PwaIndexStats.FIELDS[i]
            }
            funct++;
            if (functions.hasFunction(funct)) {           
              scores.addScore(funct, (new PwaMinSpan(posmanagers.get(i).getMinSpanCovUnordered())).score()); // "MinSpanCovUnord-"+PwaIndexStats.FIELDS[i]
            }
            funct++;
            if (functions.hasFunction(funct)) {           
              scores.addScore(funct, (new PwaMinSpan(posmanagers.get(i).getMinPairDist())).score()); // "MinPairDist-"+PwaIndexStats.FIELDS[i]
            }
            funct++;
          }
          else {
            scores.addScore(funct,0);           
            scores.addScore(funct+1,0);           
            scores.addScore(funct+2,0);
            funct+=3;
          }
        }
        else {
          funct+=3;
        }
      }     
    }
    else {
      funct+=PwaIndexStats.FIELDS.length*6 + 6 + PwaIndexStats.FIELDS.length*3;
    }
               
        // query independent features
    if (functions.hasFunction(funct) || functions.hasFunction(funct+1) || functions.hasFunction(funct+2) || functions.hasFunction(funct+3) || functions.hasFunction(funct+4)) {                   
      Document docMeta=searcher.doc(doc);       
      if (functions.hasFunction(funct)) {
        surl=docMeta.get("url");       
        scores.addScore(funct, (new PwaUrlDepth(surl)).score()); // "UrlDepth"       
      }
      funct++;       
      if (functions.hasFunction(funct)) {
        surl=docMeta.get("url");                 
        scores.addScore(funct, (new PwaUrlSlashes(surl)).score()); // "PwaUrlSlashes"       
      }
      funct++;     
      if (functions.hasFunction(funct)) {
        surl=docMeta.get("url");                 
        scores.addScore(funct, surl.length()); // "URLLength"       
      }
      funct++;               
      if (functions.hasFunction(funct)) {
        String sinlinks=docMeta.get("inlinks");
        scores.addScore(funct, Integer.parseInt(sinlinks)); // "Inlinks"       
      }
      funct++;
      if (functions.hasFunction(funct)) {
        String sinlinks=docMeta.get("inlinks");
        scores.addScore(funct, (new PwaLinInlinks(Integer.parseInt(sinlinks))).score()); // "LinInlinks"       
      }
      funct++;
      /* do not work properly
      if (functions.hasFunction(funct)) {
        String spagerank=docMeta.get("pagerank");
        boost=functions.getBoost(funct);
        score+= Float.parseFloat(spagerank) * boost; // "Pagerank"     
      }
      funct++;     
      if (functions.hasFunction(funct)) {
        String spagerank=docMeta.get("pagerank");
        boost=functions.getBoost(funct);
        score+= (new PwaLinPagerank(Float.parseFloat(spagerank))).score() * boost; // "LinPagerank"     
      }
      funct++;
      if (functions.hasFunction(funct)) {
        String sboost=docMeta.get("boost");
        boost=functions.getBoost(funct);
        score+= Float.parseFloat(sboost) * boost; // OPIC
      }
      funct++;
      */
    }
    else {
      funct+=5;
    }               
   
    // temporal features - local timestamps
    if (functions.hasFunction(funct) || functions.hasFunction(funct+1) || functions.hasFunction(funct+2)) {
      PwaDateCache cache=new PwaDateCache(null); // already initialized
      long timestamp=cache.getTimestamp(doc);
      //long minTimestamp=cache.getMinTimestamp();
      //long maxTimestamp=cache.getMaxTimestamp();                     
     
      if (functions.hasFunction(funct)) {   
        scores.addScore(funct, ((float)queryTimestamp) / PwaIRankingFunction.DAY_MILLISEC); // Query issue time in days
      }
      funct++;     
      if (functions.hasFunction(funct)) {
        scores.addScore(funct, (new PwaAge(timestamp,queryTimestamp)).score()); // Age in days from query issue time
      }
      funct++;       
      if (functions.hasFunction(funct)) {   
        scores.addScore(funct, ((float)timestamp) / PwaIRankingFunction.DAY_MILLISEC); // Version's timestamp in days
      }
      funct++;     
    }
    else {
      funct+=3;
    }
   
    // temporal features - global timestamps
    boolean temporalGlobalUsed=false;
    for (int i=funct; !temporalGlobalUsed && i<funct+9; i++) {
      if (functions.hasFunction(i)) {
        temporalGlobalUsed=true;
      }
    }   
    if (temporalGlobalUsed) {                 
      PwaDateCache cache=new PwaDateCache(null); // already initialized
      long timestamp=cache.getTimestamp(doc);
     
      int nVersionsURL;       
      long minTimestampURL;
      long maxTimestampURL;           
      UrlRow row=null;
     
      try {
        if (memcache==null) {
          memcache=new Memcached(MEMCACHED_ADDRESSES); // [address1=127.0.0.1:8091] [address2] ... [addressn]
          maxVersions=(Integer)memcache.get(MemcachedTransactions.MAX_VERSIONS);
          maxSpan=(Integer)memcache.get(MemcachedTransactions.MAX_SPAN);
          int minDate=(Integer)memcache.get(MemcachedTransactions.MIN_DATE);
          minTimestamp=MemcachedTransactions.intToLongdate(minDate);
          int maxDate=(Integer)memcache.get(MemcachedTransactions.MAX_DATE);
          maxTimestamp=MemcachedTransactions.intToLongdate(maxDate);         
        }
       
        if (surl==null) {
          Document docMeta=searcher.doc(doc);
          surl=docMeta.get("url");                       
        }
                             
        String key=MemcachedTransactions.getUrlKey(surl);         
        row=memcache.getRow(key);
      }
      catch (ConnectException e) { // error communicating with memcached. It will try to reconnect.
        // ignore
      }
      catch (IOException e) {
        // ignore
        System.err.println("Memcache Exception: "+e.getMessage());
      }       
       
      if (row==null) { // for urls discarded such as dynamics (there are not space to store everything)
        nVersionsURL=1;       
        minTimestampURL=timestamp;
        maxTimestampURL=timestamp;         
      }
      else {
        nVersionsURL=row.getNVersions();       
        minTimestampURL=MemcachedTransactions.intToLongdate(row.getMin());
        maxTimestampURL=MemcachedTransactions.intToLongdate(row.getMax());
      }                               
     
      if (functions.hasFunction(funct)) {
        scores.addScore(funct, ((float)minTimestampURL) / PwaIRankingFunction.DAY_MILLISEC); // Oldest version's timestamp in days
      }
      funct++;       
      if (functions.hasFunction(funct)) {           
        scores.addScore(funct, ((float)maxTimestampURL) / PwaIRankingFunction.DAY_MILLISEC ); // Newest version's timestamp in days
      }
      funct++;
      if (functions.hasFunction(funct)) {           
        scores.addScore(funct, ((float)maxTimestampURL-minTimestampURL) / PwaIRankingFunction.DAY_MILLISEC ); // Days between oldest and newest versions           
      }
      funct++;                 
      if (functions.hasFunction(funct)) {           
        scores.addScore(funct, (new PwaSpanVersions(maxTimestampURL,minTimestampURL,maxSpan)).score()); // Days between oldest and newest versions normalized
      }     
      funct++;             
      if (functions.hasFunction(funct)) {             
        scores.addScore(funct, nVersionsURL); // NumberVersions
      }
      funct++;         
      if (functions.hasFunction(funct)) {             
        scores.addScore(funct, (new PwaNumberVersions(nVersionsURL,maxVersions)).score()); // NumberVersions normalized
      }
      funct++;                  
      if (functions.hasFunction(funct)) {       
        scores.addScore(funct, (new PwaBoostNewer(timestamp,maxTimestamp,minTimestamp)).score()); // BoostNewer       
      }
      funct++;
      if (functions.hasFunction(funct)) {
        scores.addScore(funct, (new PwaBoostOlder(timestamp,maxTimestamp,minTimestamp)).score()); // BoostOlder
      }
      funct++;
      if (functions.hasFunction(funct)) {
        scores.addScore(funct, (new PwaBoostNewerAndOlder(timestamp,maxTimestamp,minTimestamp)).score()); // BoostNewerAndOlder
      }
      funct++;
           
      //cache.close();
    }
    else {
      funct+=9;
    }     
   
    return scores;
  }
   
 
  /**
   * Display all features 
   * @param doc document identifier
   * @param queryTimestamp timestamp when the query was submitted
   * @param collector ranking features collector
   * @param posmanagers query term position into the document
   * @param searcher searcher
   * @param functions ranking functions
   * @return explanation
   */
  public static Explanation explain(int doc, long queryTimestamp, PwaRawFeatureCollector collector, Vector<PwaPositionsManager> posmanagers, Searcher searcher, PwaFunctionsWritable functions) throws IOException {         
    int key;   
    StringBuffer bufValue=new StringBuffer("Feature values of document "+doc+": <span class=\"features\">"); // feature values
    StringBuffer bufBoost=new StringBuffer("Feature boosts of document "+doc+": "); // feature boosts
    StringBuffer bufFinal=new StringBuffer("Feature values*boosts of document "+doc+": "); // feature final scores
    PwaScores scores=score(doc, queryTimestamp, collector, posmanagers, searcher, functions);
   
    Vector<Integer> vecKeys = new Vector<Integer>(functions.keySet());
    Collections.sort(vecKeys);
    for(int i=0;i<vecKeys.size();i++) {     
        key=vecKeys.get(i);            
        bufValue.append(" "+key+":"+scores.getScore(key));   
        bufBoost.append(" "+key+":"+functions.getBoost(key));
        bufFinal.append(" "+key+":"+scores.getScore(key)*functions.getBoost(key));
        }
    bufValue.append("</span>");
    Explanation allExpl = new Explanation(0,bufValue.toString());        
    Explanation expAux = new Explanation(0,bufBoost.toString());
    allExpl.addDetail(expAux);
    expAux = new Explanation(0,bufFinal.toString());
    allExpl.addDetail(expAux);
    return allExpl;     
  }   
 
  /**
   * Get part of explanation
   * @param expAux part of explanation
   * @param functions ranking functions
   * @param index index of functions array
   * @return
   */
  private static Explanation getExplainPart(Explanation expAux, PwaFunctionsWritable functions, int index) {
    float boost=functions.getBoost(index);
    expAux.addDetail(new Explanation(boost,BOOST_LABEL));
    return expAux;
  }
}
// TOP
//
// Related Classes of org.apache.lucene.search.PwaScorerFeatures
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.