
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements.  See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.searcher;

import java.io.*;
import java.util.*;
import javax.servlet.ServletContext;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.Closeable;
import org.apache.hadoop.conf.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.indexer.*;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;

import org.apache.lucene.search.PwaFunctionsWritable;

import org.apache.nutch.global.Global;


/**
* One stop shopping for search-related functionality.
* @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
*/  
public class NutchBean
  implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
             DistributedSearch.Protocol, Closeable {

  public static final Log LOG = LogFactory.getLog(NutchBean.class);
 
  public static final int MATCHED_DOCS_CONST_IGNORE = -2;

//  static {
//    LogFormatter.setShowThreadIDs(true);
//  }

  private String[] segmentNames;

  private Searcher searcher;
  private HitDetailer detailer;
  private HitSummarizer summarizer;
  private HitContent content;
  private HitInlinks linkDb;


  /** BooleanQuery won't permit more than 32 required/prohibited clauses.  We
   * don't want to use too many of those. */
  private static final int MAX_PROHIBITED_TERMS = 20;
 
  private Configuration conf;
  private FileSystem fs;
 
  private int maxFulltextMatchesReturned;
  private int maxFulltextMatchesRanked; 
  private int maxQueryTerms;
  private int maxQueryExtraTerms;
 

  /** Cache in servlet context. */
  public static NutchBean get(ServletContext app, Configuration conf) throws IOException {
    NutchBean bean = (NutchBean)app.getAttribute("nutchBean");
    if (bean == null) {
      LOG.info("creating new bean");
      bean = new NutchBean(conf);
      app.setAttribute("nutchBean", bean);
    }
    return bean;
  }
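
  /* Usage sketch (illustrative; not part of the original source): a JSP or
   * servlet would typically obtain the shared bean as
   *
   *   Configuration conf = NutchConfiguration.get(application);
   *   NutchBean bean = NutchBean.get(application, conf);
   *   Hits hits = bean.search(Query.parse("nutch", conf), 10);
   *
   * where "application" is the JSP-implicit ServletContext; the
   * NutchConfiguration.get(ServletContext) lookup is assumed to be available.
   */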


  /**
   * Construct in the default directory taken from the
   * <code>searcher.dir</code> property.
   * @param conf the Nutch configuration
   * @throws IOException
   */
  public NutchBean(Configuration conf) throws IOException {
    this(conf, null, null);
  }
 
  /**
   * Construct in a named directory.
   * @param conf the Nutch configuration
   * @param dir directory containing the index, segments and linkdb
   * @param blacklistFile file with the document blacklist, or null
   * @throws IOException
   */
  public NutchBean(Configuration conf, Path dir, File blacklistFile) throws IOException {
    this.conf = conf;
    this.fs = FileSystem.get(this.conf);
    if (dir == null) {
      dir = new Path(this.conf.get("searcher.dir", "crawl"));
    }
    Path servers = new Path(dir, "search-servers.txt");
    if (fs.exists(servers)) {
      LOG.info("searching servers in " + servers);
      init(new DistributedSearch.Client(servers, conf));
    }
    else {
      init(new Path(dir, "index"), new Path(dir, "indexes"),
           new Path(dir, "segments"), new Path(dir, "linkdb"), blacklistFile);
    }

    this.maxFulltextMatchesReturned = conf.getInt(Global.MAX_FULLTEXT_MATCHES_RETURNED, -1);
    this.maxFulltextMatchesRanked = conf.getInt(Global.MAX_FULLTEXT_MATCHES_RANKED, -1);
    this.maxQueryTerms = conf.getInt(Global.MAX_QUERY_TERMS, -1);
    this.maxQueryExtraTerms = conf.getInt(Global.MAX_QUERY_EXTRA_TERMS, -1);
  }
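
  /* Layout sketch (illustrative; not part of the original source): with the
   * default "searcher.dir" of "crawl", the constructor dispatches on
   *
   *   crawl/search-servers.txt              -> DistributedSearch.Client
   *   crawl/index (or crawl/indexes),
   *   crawl/segments, crawl/linkdb          -> local IndexSearcher,
   *                                            FetchedSegments, LinkDbInlinks
   */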

  private void init(Path indexDir, Path indexesDir, Path segmentsDir, Path linkDb, File blacklistFile)
    throws IOException {
   
    IndexSearcher indexSearcher;
    if (this.fs.exists(indexDir)) {
        LOG.info("opening merged index in " + indexDir);
        indexSearcher = new IndexSearcher(indexDir, this.conf, blacklistFile);
    }
    else {
        LOG.info("opening indexes in " + indexesDir);
     
        Vector vDirs = new Vector();
        Path[] directories = fs.listPaths(indexesDir);
        for (int i = 0; i < directories.length; i++) {
          Path indexdone = new Path(directories[i], Indexer.DONE_NAME);
          if (fs.isFile(indexdone)) {
            vDirs.add(directories[i]);
          }
        }

        directories = (Path[]) vDirs.toArray(new Path[vDirs.size()]);
     
        indexSearcher = new IndexSearcher(directories, this.conf, blacklistFile);
    }

    LOG.info("opening segments in " + segmentsDir);   
    FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.conf);
   
    this.segmentNames = segments.getSegmentNames();

    this.searcher = indexSearcher;
    this.detailer = indexSearcher;
    this.summarizer = segments;
    this.content = segments;

    LOG.info("opening linkdb in " + linkDb);    
    this.linkDb = new LinkDbInlinks(fs, linkDb, this.conf);
  }

  private void init(DistributedSearch.Client client) {
    this.segmentNames = client.getSegmentNames();
    this.searcher = client;
    this.detailer = client;
    this.summarizer = client;
    this.content = client;
    this.linkDb = client;
  }


  public String[] getSegmentNames() {
    return segmentNames;
  }

  public Hits search(Query query, int numHits) throws IOException {
    return search(query, numHits, null, null, false);
  }
 
  public Hits search(Query query, int numHits,
                     String dedupField, String sortField, boolean reverse)
    throws IOException {

    return searcher.search(query, numHits, dedupField, sortField, reverse);
  }
 
  /** Collects the hits that share one dedup value; <code>maxSizeExceeded</code>
   * records that later hits with this value were excluded from the results. */
  private class DupHits extends ArrayList {
    private boolean maxSizeExceeded;
  }

  /** Search for pages matching a query, eliminating excessive hits from the
   * same site.  Hits after the first <code>maxHitsPerDup</code> from the same
   * site are removed from results.  The remaining hits have {@link
   * Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero then all
   * hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(Query query, int numHits, int maxHitsPerDup) throws IOException {
    return search(query, numHits, maxHitsPerDup, "site", null, false, false);
  }
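
  /* Usage sketch (illustrative; not part of the original source):
   *
   *   // at most 2 hits per site among the top 20 results
   *   Hits hits = bean.search(Query.parse("apache", conf), 20, 2);
   *
   * Hits beyond the second from a given site are dropped, and the surviving
   * hits from that site answer true to moreFromDupExcluded().
   */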

  /** Search for pages matching a query, eliminating excessive hits with
   * matching values for a named field.  Hits after the first
   * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
   * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
   * then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField) throws IOException {
    return search(query, numHits, maxHitsPerDup, dedupField, null, false, false);
  }
 
  /** Search for pages matching a query, eliminating excessive hits with
   * matching values for a named field.  Hits after the first
   * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
   * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
   * then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param searcherMaxHits number of matched documents for ranking, or MATCHED_DOCS_CONST_IGNORE to ignore  
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @param sortField Field to sort on (or null if no sorting).
   * @param reverse True if we are to reverse sort by <code>sortField</code>.
   * @param functions extra ranking functions to apply, or null
   * @param maxHitsPerVersion maximum hits returned with the same url and different version
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(Query query, int numHits, int searcherMaxHits, int maxHitsPerDup, String dedupField,
                     String sortField, boolean reverse, PwaFunctionsWritable functions, int maxHitsPerVersion) throws IOException {   
    return search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion, false);
  }
 
 
  /** Search for pages matching a query, eliminating excessive hits with
   * matching values for a named field.  Hits after the first
   * <code>maxHitsPerDup</code> are removed from results.  The remaining hits
   * have {@link Hit#moreFromDupExcluded()} set.  <p> If maxHitsPerDup is zero
   * then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param searcherMaxHits number of matched documents for ranking, or MATCHED_DOCS_CONST_IGNORE to ignore  
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @param sortField Field to sort on (or null if no sorting).
   * @param reverse True if we are to reverse sort by <code>sortField</code>.
   * @param functions extra ranking functions to apply, or null
   * @param maxHitsPerVersion maximum hits returned with the same url and different version
   * @param waybackQuery if true it is a query from wayback; otherwise it is from nutchwax
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(Query query, int numHits, int searcherMaxHits, int maxHitsPerDup, String dedupField,
                     String sortField, boolean reverse, PwaFunctionsWritable functions, int maxHitsPerVersion, boolean waybackQuery) throws IOException {   
                
    Hits hits = null;
    if (waybackQuery) {
      hits = searcher.search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion);
      hits.setTotalIsExact(true);
      return hits;
    }

    // cap numHits and searcherMaxHits at the configured maxima
    if (numHits > maxFulltextMatchesReturned) {
      numHits = maxFulltextMatchesReturned;
    }
    if (searcherMaxHits > maxFulltextMatchesRanked) {
      searcherMaxHits = maxFulltextMatchesRanked;
    }

    // limit query terms for full-text queries
    query = limitTerms(query);

    int numHitsRaw;
    float rawHitsFactor;
    if (maxHitsPerDup <= 0) {
      if (searcherMaxHits == MATCHED_DOCS_CONST_IGNORE && functions == null) {
        return searcher.search(query, numHits, dedupField, sortField, reverse);
      }
      else {
        return searcher.search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion);
      }
    }
    else {
      rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
      numHitsRaw = (int)(numHits * rawHitsFactor);

      LOG.debug("searching for " + numHitsRaw + " raw hits");
      hits = searcher.search(query, numHitsRaw, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion); // the same method for all values of searcherMaxHits
    }

    boolean lastRequest = false;
    if (numHitsRaw > hits.getTotal()) { // BUG 200608 - do not keep re-requesting up to numHits when the query matches fewer raw hits
      lastRequest = true;
    }
   
    // remove duplicates block
    long total = hits.getTotal();
    Map dupToHits = new HashMap();
    List resultList = new ArrayList();
    Set seen = new HashSet();
    List excludedValues = new ArrayList();
    boolean totalIsExact = true;
    for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
      // get the next raw hit
      if (rawHitNum >= hits.getLength()) {

        if (lastRequest) { // BUG 200608
          break;
        }

        // optimize query by prohibiting more matches on some excluded values
        Query optQuery = (Query)query.clone();
        for (int i = 0; i < excludedValues.size(); i++) {
          if (i == MAX_PROHIBITED_TERMS)
            break;
          optQuery.addProhibitedTerm(((String)excludedValues.get(i)), dedupField);
        }
        numHitsRaw = (int)(numHitsRaw * rawHitsFactor);
        LOG.debug("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery);
        // hits = searchAux(optQuery, numHitsRaw, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse);  // for TREC
        hits = searcher.search(optQuery, numHitsRaw, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion);
        if (numHitsRaw > hits.getTotal()) { // BUG 200608
          lastRequest = true;
        }

        LOG.debug("found " + hits.getTotal() + " raw hits");
        rawHitNum = -1;
        continue;
      }

      Hit hit = hits.getHit(rawHitNum);
      if (seen.contains(hit)) // processed in the previous query
        continue;
      seen.add(hit);
     
      // get dup hits for its value
      String value = hit.getDedupValue();     
      DupHits dupHits = (DupHits)dupToHits.get(value);      
      if (dupHits == null) {      
        dupToHits.put(value, dupHits = new DupHits());
      }
                
      // does this hit exceed maxHitsPerDup?
      if (dupHits.size() == maxHitsPerDup) {      // yes -- then ignore the hit
        if (!dupHits.maxSizeExceeded) {

          // mark prior hits with moreFromDupExcluded
          for (int i = 0; i < dupHits.size(); i++) {
            ((Hit)dupHits.get(i)).setMoreFromDupExcluded(true);
          }
          dupHits.maxSizeExceeded = true;

          excludedValues.add(value);              // exclude dup
        }
        totalIsExact = false;
      }   
      else {                                    // no -- then collect the hit
        resultList.add(hit);
        dupHits.add(hit);       

        // are we done?
        // we need to find one more than asked for, so that we can tell if
        // there are more hits to be shown
        if (resultList.size() > numHits)
          break;
      }
    }

    Hits results = new Hits(total, (Hit[])resultList.toArray(new Hit[resultList.size()]));
    results.setTotalIsExact(totalIsExact);
    return results;
  }
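
  /* Worked example (illustrative; not part of the original source): with
   * numHits = 10 and the default "searcher.hostgrouping.rawhits.factor" of
   * 2.0, the raw request grows as numHitsRaw = 20, 40, 80, ... on each
   * re-search; every re-search also prohibits up to MAX_PROHIBITED_TERMS (20)
   * already excluded dedup values, until numHits + 1 unique results are
   * collected or the index runs out of matches (lastRequest).
   */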
 
 
  /**
   * Limit the number of query terms and extra query terms.
   * @param input the original query
   * @return a copy of the query with at most <code>maxQueryTerms</code>
   *         default-field terms and <code>maxQueryExtraTerms</code> extra terms
   */
  public Query limitTerms(Query input) {
    Query output=new Query(input.getConf());
    Clause[] clauses = input.getClauses();
    int termsCounter=0;
    int termsExtraCounter=0;
       
    for (int i=0; i<clauses.length; i++) {
      Clause c = clauses[i];

        if (c.getField().equals(Clause.DEFAULT_FIELD) && !c.isProhibited() && termsCounter>=maxQueryTerms) { // is it is a term and reached the limiti
          continue;
        }     
        if ((!c.getField().equals(Clause.DEFAULT_FIELD) || c.isProhibited()) && termsExtraCounter>=maxQueryExtraTerms) // it is an exstra term or a not
          continue;                                

        if (c.isPhrase()) {                        
          Term[] terms = c.getPhrase().getTerms();
         
          int newLength=terms.length;
          if (c.getField().equals(Clause.DEFAULT_FIELD) && !c.isProhibited()) {
            if (terms.length+termsCounter>maxQueryTerms) {
                newLength=maxQueryTerms-termsCounter;
                termsCounter+=newLength;
            }
            else {
              termsCounter+=terms.length;
            }
          }
          else {
            if (terms.length+termsExtraCounter>maxQueryExtraTerms) {
                newLength=maxQueryExtraTerms-termsExtraCounter;
                termsExtraCounter+=newLength;
            }
            else {
              termsExtraCounter+=terms.length;
            }
          }
                   
            if (newLength!=terms.length) {             
              if (newLength==1) {
                output.addClause(new Clause(terms[0], c.isRequired(), c.isProhibited(), c.getConf()));
                }
              else {
                Term[] newTerms=new Term[newLength];
                System.arraycopy(terms, 0, newTerms, 0, newLength);
                    output.addClause(new Clause(new Phrase(newTerms), c.isRequired(), c.isProhibited(), c.getConf()));
                }
            }
            else {
              output.addClause(c);               
            }            
        }
        else {
          output.addClause(c);       
          if (c.getField().equals(Clause.DEFAULT_FIELD) && !c.isProhibited()) {
            termsCounter++; 
          }
          else {
            termsExtraCounter++;
          }                                  
        }
    }
   
    return output;
  }
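
  /* Worked example (illustrative; not part of the original source): with
   * maxQueryTerms = 3, a query of four default-field clauses
   *
   *   a b "c d e"
   *
   * keeps the terms "a" and "b" (termsCounter = 2) and truncates the phrase
   * to the single remaining term "c".
   */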
     
  /** Search with deduplication, ignoring <code>searcherMaxHits</code> and
   * extra ranking functions.
   * @param waybackQuery if true it is a query from wayback; otherwise it is from nutchwax
   */
  public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField, boolean reverse, boolean waybackQuery) throws IOException {
    return search(query, numHits, MATCHED_DOCS_CONST_IGNORE, maxHitsPerDup, dedupField, sortField, reverse, null, Integer.MAX_VALUE, waybackQuery);
  }
 
  public String getExplanation(Query query, Hit hit, PwaFunctionsWritable functions) throws IOException {
    return searcher.getExplanation(query, hit, functions);
  }

  public String getExplanation(Query query, Hit hit) throws IOException {
    return searcher.getExplanation(query, hit, null);
  }

  public HitDetails getDetails(Hit hit) throws IOException {
    return detailer.getDetails(hit);
  }
 
  public HitDetails[] getDetails(Hit[] hits) throws IOException {
   return detailer.getDetails(hits);
  }
 
  /* BUG wayback 0000155 */
  public HitDetails[] getDetails(PwaRequestDetailsWritable details) throws IOException {
    return detailer.getDetails(details);
  }
 
  public Summary getSummary(HitDetails hit, Query query) throws IOException {
    return summarizer.getSummary(hit, query);
  }

  public Summary[] getSummary(HitDetails[] hits, Query query) throws IOException {
    return summarizer.getSummary(hits, query);
  }
 
  /* BUG nutchwax 0000616 */
  public Summary[] getSummary(PwaRequestSummaryWritable summaries) throws IOException {
    return summarizer.getSummary(summaries);
  }

  public byte[] getContent(HitDetails hit) throws IOException {
    return content.getContent(hit);
  }

  public ParseData getParseData(HitDetails hit) throws IOException {
    return content.getParseData(hit);
  }

  public ParseText getParseText(HitDetails hit) throws IOException {
    return content.getParseText(hit);
  }

  public String[] getAnchors(HitDetails hit) throws IOException {
    return linkDb.getAnchors(hit);
  }

  public Inlinks getInlinks(HitDetails hit) throws IOException {
    return linkDb.getInlinks(hit);
  }

  public long getFetchDate(HitDetails hit) throws IOException {
    return content.getFetchDate(hit);
  }

  public void close() throws IOException {
    if (content != null) { content.close(); }
    if (searcher != null) { searcher.close(); }
    if (linkDb != null) { linkDb.close(); }
    if (fs != null) { fs.close(); }
  }
 
  /** For debugging. */
  public static void main(String[] args) throws Exception {
    String usage = "NutchBean query";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    Configuration conf = NutchConfiguration.create();
    NutchBean bean = new NutchBean(conf);
    Query query = Query.parse(args[0], conf);
    Hits hits = bean.search(query, 10);
    System.out.println("Total hits: " + hits.getTotal());
    int length = (int)Math.min(hits.getTotal(), 10);
    Hit[] show = hits.getHits(0, length);
    HitDetails[] details = bean.getDetails(show);
    Summary[] summaries = bean.getSummary(details, query);

    for (int i = 0; i < hits.getLength(); i++) {
      System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]);
    }
  }

  public long getProtocolVersion(String className, long arg1) throws IOException {
    if(DistributedSearch.Protocol.class.getName().equals(className)){
      return 1;
    } else {
      throw new IOException("Unknown Protocol classname:" + className);
    }
  }



}