Package org.archive.nutch.trec

Source Code of org.archive.nutch.trec.TRECBean

package org.archive.nutch.trec;

import java.util.*;
import java.io.*;

import org.apache.nutch.searcher.*;
import org.apache.lucene.search.ArquivoWebFunctionsWritable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.parse.ParseData;
import org.apache.hadoop.io.LongWritable;


public class TRECBean extends NutchBean {
 
  private final static String BOOST_LABEL="BOOST";
  //private static ArquivoWebWritable boostParams=null;
  private static ArquivoWebFunctionsWritable functions=null;
 
  /**
   * Constructor
   * @param conf
   * @param path search-servers.txt dir
   * @param boostsFile function/boost entries file
   * @throws IOException
   */
  public TRECBean(Configuration conf, Path path, File boostsFile) throws IOException {
    super(conf,path);     
     
      Properties props=new Properties();
      props.load(new FileInputStream(boostsFile));
          
      functions=new ArquivoWebFunctionsWritable();
      String key=null;
      int index;
      float boost;
      for (Enumeration e = props.keys(); e.hasMoreElements();) {
        key=(String)e.nextElement();
        index=Integer.parseInt(key.substring(BOOST_LABEL.length()));
        boost=Float.parseFloat(props.getProperty(key));
        functions.addFunction(index, boost);
        }
     
      /* TODO remove
      boostParams=new ArquivoWebWritable();
    boostParams.setUrlBoost(Float.parseFloat(props.getProperty("url")));
    boostParams.setAnchorBoost(Float.parseFloat(props.getProperty("anchor")));
    boostParams.setContentBoost(Float.parseFloat(props.getProperty("content")));
    boostParams.setTitleBoost(Float.parseFloat(props.getProperty("title")));
    boostParams.setHostBoost(Float.parseFloat(props.getProperty("host")));
    boostParams.setPhraseBoost(Float.parseFloat(props.getProperty("phrase")));
    boostParams.setSlop(Integer.parseInt(props.getProperty("slop")));
    */
     
  }
 
    public static void main(String[] args) throws Exception {
        String usage = "org.archive.nutch.trec.TRECBean <inputfile (trecTopics.txt)> <runid> <topDocsReturned (1000)> <maxMatches (-1=all|-2=default)> <maxDups> <search-servers.txt dir> <ranking file> <debug (true|false)>";
           
        if (args.length < 8) {
          System.err.println(usage);
          System.exit(-1);
        }
        int topDocsReturned = 20;
        int maxMatches = 1000;
        int maxDups = 2;
        try {
          topDocsReturned = Integer.parseInt(args[2]);
          maxMatches = Integer.parseInt(args[3]);
          maxDups = Integer.parseInt(args[4]);
        } catch (ArrayIndexOutOfBoundsException e) {
          throw e;
        }
       
        boolean debug=Boolean.parseBoolean(args[7]);
         
        Configuration conf = NutchConfiguration.create();
        if (debug) {
          System.out.println("Conf: "+conf.toString());
          System.out.println("args: "+args[0]+" "+args[1]+" "+args[2]+" "+args[3]+" "+args[4]+" "+args[5]+" "+args[6]+" "+args[7]);
        }
        //NutchBean bean = new NutchBean(conf);
       
        try {
          TRECBean bean = new TRECBean(conf,new Path(args[5]),new File(args[6]));
         
            BufferedReader in = new BufferedReader(new FileReader(args[0]));
            String str;
            String qid=null;
            String query_str=null;
            while ((str = in.readLine()) != null) {
             
              if (str.startsWith("<num> Number:")) {
                qid = str.substring("<num> Number: WT04-:".length()-1);
              }
              else if (str.startsWith("<title>")) {             
                query_str = str.substring("<title>".length()+1);               
               
              Query query = Query.parse(query_str, conf);   
              int maxHitsPerVersion=100;
             
              //Hits hits = bean.search(query, topDocsReturned);
              //Hits hits = bean.search(query, topDocsReturned, maxMatches, maxDups, "site", null, false, null);      
              Hits hits = bean.search(query, topDocsReturned, maxMatches, maxDups, "site", null, false, functions, maxHitsPerVersion);
              int length = (int)Math.min(hits.getLength(), topDocsReturned);                
              Hit[] show = hits.getHits(0, length);
              HitDetails[] details = bean.getDetails(show);
             
              if (debug) {
                  System.out.println();
                  System.out.println(qid+":"+query_str);
                  System.out.println("topDocsReturned:"+topDocsReturned+" hits:"+length);
                }
             
              for (int i = 0; i < length; i++) {
                //String docno = bean.getParseData(details[i]).getMeta("DOCNO");
                String docno = details[i].getValue("DOCNO");
               
                float sim = Float.parseFloat(show[i].getSortValue().toString());
                System.out.println(qid+"\tQ0\t"+ docno+ "\t" + i + "\t" +(sim*1000)+
                    "\t"+ args[1]);
              }
             
              qid=null;
              query_str=null;
              }
            }
            in.close();
        }
        catch (IOException e) {
          System.err.println("Problem reading query file: "+e.getMessage());
          e.printStackTrace();
          System.err.println(usage);
        }
        catch (Exception e) {
          System.err.println("Problem: "+e.getMessage());
          e.printStackTrace();
          System.err.println(usage);
        }
    }
}
TOP

Related Classes of org.archive.nutch.trec.TRECBean

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.