/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.searcher;
import java.io.*;
import java.util.*;
import javax.servlet.ServletContext;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.Closeable;
import org.apache.hadoop.conf.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.indexer.*;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;
import org.apache.lucene.search.PwaFunctionsWritable;
import org.apache.nutch.global.Global;
/**
* One stop shopping for search-related functionality.
* @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
*/
/**
 * One stop shopping for search-related functionality: searching, hit
 * details, summaries, cached content and inlinks. Depending on what is
 * found under the configured <code>searcher.dir</code>, requests are served
 * either by local Lucene indexes/segments or delegated to a set of
 * distributed search servers.
 *
 * @version $Id: NutchBean.java,v 1.19 2005/02/07 19:10:08 cutting Exp $
 */
public class NutchBean
    implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
               DistributedSearch.Protocol, Closeable {

  public static final Log LOG = LogFactory.getLog(NutchBean.class);

  /** Sentinel for <code>searcherMaxHits</code> meaning "ignore the ranking limit". */
  public static final int MATCHED_DOCS_CONST_IGNORE = -2;

  private String[] segmentNames;

  // The concrete backend (local IndexSearcher/FetchedSegments or a
  // DistributedSearch.Client) exposed through its role interfaces.
  private Searcher searcher;
  private HitDetailer detailer;
  private HitSummarizer summarizer;
  private HitContent content;
  private HitInlinks linkDb;

  /** BooleanQuery won't permit more than 32 required/prohibited clauses. We
   * don't want to use too many of those. */
  private static final int MAX_PROHIBITED_TERMS = 20;

  private Configuration conf;
  private FileSystem fs;

  // Limits read from configuration. conf.getInt(..., -1) yields -1 when the
  // key is unset, so a non-positive value is treated as "no limit" below.
  private int maxFulltextMatchesReturned;
  private int maxFulltextMatchesRanked;
  private int maxQueryTerms;
  private int maxQueryExtraTerms;

  /**
   * Returns the bean cached in the servlet context, creating and caching it
   * on first use.
   *
   * @param app servlet context used as the cache
   * @param conf configuration for a newly created bean
   * @throws IOException if the bean cannot be initialized
   */
  public static NutchBean get(ServletContext app, Configuration conf) throws IOException {
    NutchBean bean = (NutchBean) app.getAttribute("nutchBean");
    if (bean == null) {
      LOG.info("creating new bean");
      bean = new NutchBean(conf);
      app.setAttribute("nutchBean", bean);
    }
    return bean;
  }

  /**
   * Constructs a bean rooted at the configured <code>searcher.dir</code>
   * (default "crawl") with no blacklist.
   *
   * @param conf configuration
   * @throws IOException if the index/segments cannot be opened
   */
  public NutchBean(Configuration conf) throws IOException {
    this(conf, null, null);
  }

  /**
   * Constructs a bean in a named directory. If
   * <code>dir/search-servers.txt</code> exists, searching is delegated to
   * those distributed servers; otherwise local indexes, segments and the
   * linkdb under <code>dir</code> are opened.
   *
   * @param conf configuration
   * @param dir root directory, or null to use <code>searcher.dir</code>
   * @param blacklistFile blacklist passed to the local index searcher, may be null
   * @throws IOException if the backend cannot be opened
   */
  public NutchBean(Configuration conf, Path dir, File blacklistFile) throws IOException {
    this.conf = conf;
    this.fs = FileSystem.get(this.conf);
    if (dir == null) {
      dir = new Path(this.conf.get("searcher.dir", "crawl"));
    }
    Path servers = new Path(dir, "search-servers.txt");
    if (fs.exists(servers)) {
      LOG.info("searching servers in " + servers);
      init(new DistributedSearch.Client(servers, conf));
    } else {
      init(new Path(dir, "index"), new Path(dir, "indexes"),
           new Path(dir, "segments"), new Path(dir, "linkdb"), blacklistFile);
    }
    // -1 (key unset) is kept as-is and interpreted as "unlimited" wherever
    // these limits are applied.
    this.maxFulltextMatchesReturned = conf.getInt(Global.MAX_FULLTEXT_MATCHES_RETURNED, -1);
    this.maxFulltextMatchesRanked = conf.getInt(Global.MAX_FULLTEXT_MATCHES_RANKED, -1);
    this.maxQueryTerms = conf.getInt(Global.MAX_QUERY_TERMS, -1);
    this.maxQueryExtraTerms = conf.getInt(Global.MAX_QUERY_EXTRA_TERMS, -1);
  }

  /**
   * Opens the local backend: a merged index if present, otherwise every
   * completed index under <code>indexesDir</code>, plus segments and linkdb.
   */
  private void init(Path indexDir, Path indexesDir, Path segmentsDir,
                    Path linkDb, File blacklistFile) throws IOException {
    IndexSearcher indexSearcher;
    if (this.fs.exists(indexDir)) {
      LOG.info("opening merged index in " + indexDir);
      indexSearcher = new IndexSearcher(indexDir, this.conf, blacklistFile);
    } else {
      LOG.info("opening indexes in " + indexesDir);
      // List the directory once (the original re-issued fs.listPaths() in the
      // loop condition on every iteration) and keep only indexes that were
      // marked complete by the indexer.
      Path[] listed = fs.listPaths(indexesDir);
      Vector vDirs = new Vector();
      for (int i = 0; i < listed.length; i++) {
        Path indexdone = new Path(listed[i], Indexer.DONE_NAME);
        if (fs.isFile(indexdone)) {
          vDirs.add(listed[i]);
        }
      }
      Path[] directories = (Path[]) vDirs.toArray(new Path[vDirs.size()]);
      indexSearcher = new IndexSearcher(directories, this.conf, blacklistFile);
    }
    LOG.info("opening segments in " + segmentsDir);
    FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(), this.conf);
    this.segmentNames = segments.getSegmentNames();

    this.searcher = indexSearcher;
    this.detailer = indexSearcher;
    this.summarizer = segments;
    this.content = segments;

    LOG.info("opening linkdb in " + linkDb);
    this.linkDb = new LinkDbInlinks(fs, linkDb, this.conf);
  }

  /** Wires every role interface to the distributed search client. */
  private void init(DistributedSearch.Client client) {
    this.segmentNames = client.getSegmentNames();
    this.searcher = client;
    this.detailer = client;
    this.summarizer = client;
    this.content = client;
    this.linkDb = client;
  }

  /** Returns the names of the segments currently searchable. */
  public String[] getSegmentNames() {
    return segmentNames;
  }

  /** Simple search: no deduplication, no sorting. */
  public Hits search(Query query, int numHits) throws IOException {
    return search(query, numHits, null, null, false);
  }

  /** Search with optional dedup field and sort, delegated to the backend. */
  public Hits search(Query query, int numHits,
                     String dedupField, String sortField, boolean reverse)
      throws IOException {
    return searcher.search(query, numHits, dedupField, sortField, reverse);
  }

  /** Hits collected for a single dedup value, with a flag recording whether
   * the per-value quota (maxHitsPerDup) was ever exceeded. */
  private class DupHits extends ArrayList {
    private boolean maxSizeExceeded;
  }

  /** Search for pages matching a query, eliminating excessive hits from the
   * same site. Hits after the first <code>maxHitsPerDup</code> from the same
   * site are removed from results. The remaining hits have {@link
   * Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero then all
   * hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(Query query, int numHits, int maxHitsPerDup) throws IOException {
    return search(query, numHits, maxHitsPerDup, "site", null, false, false);
  }

  /** Search for pages matching a query, eliminating excessive hits with
   * matching values for a named field. Hits after the first
   * <code>maxHitsPerDup</code> are removed from results. The remaining hits
   * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero
   * then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField) throws IOException {
    return search(query, numHits, maxHitsPerDup, dedupField, null, false, false);
  }

  /** Search for pages matching a query, eliminating excessive hits with
   * matching values for a named field. Hits after the first
   * <code>maxHitsPerDup</code> are removed from results. The remaining hits
   * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero
   * then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param searcherMaxHits number of matched documents for ranking, or MATCHED_DOCS_CONST_IGNORE to ignore
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @param sortField Field to sort on (or null if no sorting).
   * @param reverse True if we are to reverse sort by <code>sortField</code>.
   * @param functions Extra parameters
   * @param maxHitsPerVersion maximum hits returned with the same url and different version
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(Query query, int numHits, int searcherMaxHits, int maxHitsPerDup, String dedupField,
      String sortField, boolean reverse, PwaFunctionsWritable functions, int maxHitsPerVersion) throws IOException {
    return search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion, false);
  }

  /** Search for pages matching a query, eliminating excessive hits with
   * matching values for a named field. Hits after the first
   * <code>maxHitsPerDup</code> are removed from results. The remaining hits
   * have {@link Hit#moreFromDupExcluded()} set. <p> If maxHitsPerDup is zero
   * then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param searcherMaxHits number of matched documents for ranking, or MATCHED_DOCS_CONST_IGNORE to ignore
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @param sortField Field to sort on (or null if no sorting).
   * @param reverse True if we are to reverse sort by <code>sortField</code>.
   * @param functions Extra parameters
   * @param maxHitsPerVersion maximum hits returned with the same url and different version
   * @param waybackQuery if true it is a query from wayback; otherwise it is from nutchwax
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(Query query, int numHits, int searcherMaxHits, int maxHitsPerDup, String dedupField,
      String sortField, boolean reverse, PwaFunctionsWritable functions, int maxHitsPerVersion, boolean waybackQuery) throws IOException {
    Hits hits = null;
    if (waybackQuery) {
      // Wayback queries bypass the configured limits, term trimming and the
      // dedup post-processing below.
      hits = searcher.search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion);
      hits.setTotalIsExact(true);
      return hits;
    }

    // Clamp requested sizes to the configured maxima. A non-positive maximum
    // (-1 when the key is unset) means "unlimited"; the original compared
    // unconditionally, which clamped numHits to -1 when the key was missing.
    if (maxFulltextMatchesReturned > 0 && numHits > maxFulltextMatchesReturned) {
      numHits = maxFulltextMatchesReturned;
    }
    if (maxFulltextMatchesRanked > 0 && searcherMaxHits > maxFulltextMatchesRanked) {
      searcherMaxHits = maxFulltextMatchesRanked;
    }

    // limit query terms for full-text queries
    query = limitTerms(query);

    if (maxHitsPerDup <= 0) { // no deduplication requested
      if (searcherMaxHits == MATCHED_DOCS_CONST_IGNORE && functions == null) {
        return searcher.search(query, numHits, dedupField, sortField, reverse);
      }
      return searcher.search(query, numHits, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion);
    }

    // Over-fetch so that, after dropping duplicates, numHits survivors are
    // likely still available.
    float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
    int numHitsRaw = (int) (numHits * rawHitsFactor);
    LOG.debug("searching for " + numHitsRaw + " raw hits");
    hits = searcher.search(query, numHitsRaw, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion); // the same method for all values of searcherMaxHits

    boolean lastRequest = false;
    // BUG 200608: once the index returned fewer raw hits than requested there
    // is nothing more to fetch -- never re-issue the query.
    if (numHitsRaw > hits.getTotal()) {
      lastRequest = true;
    }

    // Deduplication: keep at most maxHitsPerDup hits per dedup value,
    // widening the raw request when the fetched batch runs out too early.
    long total = hits.getTotal();
    Map dupToHits = new HashMap();         // dedup value -> DupHits collected
    List resultList = new ArrayList();     // survivors, in rank order
    Set seen = new HashSet();              // hits already examined (across re-queries)
    List excludedValues = new ArrayList(); // dedup values whose quota is filled
    boolean totalIsExact = true;
    for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
      if (rawHitNum >= hits.getLength()) {
        // Batch exhausted before numHits survivors were found.
        if (lastRequest) { // BUG 200608: the index itself is exhausted
          break;
        }
        // Re-issue the query, prohibiting values whose quota is already
        // filled so the next batch is spent on new values. Capped to respect
        // BooleanQuery's clause limit (see MAX_PROHIBITED_TERMS).
        Query optQuery = (Query) query.clone();
        for (int i = 0; i < excludedValues.size(); i++) {
          if (i == MAX_PROHIBITED_TERMS)
            break;
          optQuery.addProhibitedTerm(((String) excludedValues.get(i)), dedupField);
        }
        // Grow the raw request. Math.max guarantees progress even when the
        // factor is (mis)configured at or below 1.0, which previously could
        // re-search forever with the same size.
        numHitsRaw = Math.max(numHitsRaw + 1, (int) (numHitsRaw * rawHitsFactor));
        LOG.debug("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery);
        hits = searcher.search(optQuery, numHitsRaw, searcherMaxHits, maxHitsPerDup, dedupField, sortField, reverse, functions, maxHitsPerVersion);
        if (numHitsRaw > hits.getTotal()) { // BUG 200608
          lastRequest = true;
        }
        LOG.debug("found " + hits.getTotal() + " raw hits");
        rawHitNum = -1; // restart the scan over the fresh result set
        continue;
      }

      Hit hit = hits.getHit(rawHitNum);
      if (seen.contains(hit)) // already processed in a previous pass
        continue;
      seen.add(hit);

      // Group the hit with the others sharing its dedup value.
      String value = hit.getDedupValue();
      DupHits dupHits = (DupHits) dupToHits.get(value);
      if (dupHits == null) {
        dupToHits.put(value, dupHits = new DupHits());
      }

      if (dupHits.size() == maxHitsPerDup) { // quota filled -- drop the hit
        if (!dupHits.maxSizeExceeded) {
          // Mark the kept hits so the UI can offer "more from this value".
          for (int i = 0; i < dupHits.size(); i++) {
            ((Hit) dupHits.get(i)).setMoreFromDupExcluded(true);
          }
          dupHits.maxSizeExceeded = true;
          excludedValues.add(value); // exclude this value in later re-queries
        }
        totalIsExact = false; // dropped hits make the reported total approximate
      } else { // quota not filled -- collect the hit
        resultList.add(hit);
        dupHits.add(hit);
        // Collect one hit more than asked for, so callers can tell whether
        // there are more hits to be shown.
        if (resultList.size() > numHits)
          break;
      }
    }

    Hits results = new Hits(total, (Hit[]) resultList.toArray(new Hit[resultList.size()]));
    results.setTotalIsExact(totalIsExact);
    return results;
  }

  /**
   * Limits the number of plain query terms and extra query terms in a query.
   * Plain terms are unprohibited clauses on the default field and count
   * against <code>maxQueryTerms</code>; everything else (field-qualified or
   * prohibited clauses) counts against <code>maxQueryExtraTerms</code>.
   * Phrases are truncated rather than dropped when they straddle a limit.
   * A non-positive limit (-1 when unset) means unlimited.
   *
   * @param input the query to trim
   * @return a new query containing at most the configured number of terms
   */
  public Query limitTerms(Query input) {
    Query output = new Query(input.getConf());
    Clause[] clauses = input.getClauses();
    int termsCounter = 0;      // plain default-field terms accepted so far
    int termsExtraCounter = 0; // extra terms accepted so far
    for (int i = 0; i < clauses.length; i++) {
      Clause c = clauses[i];
      boolean plain = c.getField().equals(Clause.DEFAULT_FIELD) && !c.isProhibited();
      // Guarding with "limit > 0" treats an unset limit (-1) as unlimited;
      // the original compared against -1 directly, which made these
      // conditions true immediately and discarded every clause.
      if (plain && maxQueryTerms > 0 && termsCounter >= maxQueryTerms) {
        continue; // plain-term budget exhausted
      }
      if (!plain && maxQueryExtraTerms > 0 && termsExtraCounter >= maxQueryExtraTerms) {
        continue; // extra-term budget exhausted
      }
      if (c.isPhrase()) {
        Term[] terms = c.getPhrase().getTerms();
        int newLength = terms.length;
        if (plain) {
          if (maxQueryTerms > 0 && terms.length + termsCounter > maxQueryTerms) {
            newLength = maxQueryTerms - termsCounter; // truncate to fit
            termsCounter += newLength;
          } else {
            termsCounter += terms.length;
          }
        } else {
          if (maxQueryExtraTerms > 0 && terms.length + termsExtraCounter > maxQueryExtraTerms) {
            newLength = maxQueryExtraTerms - termsExtraCounter; // truncate to fit
            termsExtraCounter += newLength;
          } else {
            termsExtraCounter += terms.length;
          }
        }
        if (newLength != terms.length) {
          // Rebuild the truncated phrase; a single surviving term becomes a
          // plain term clause.
          if (newLength == 1) {
            output.addClause(new Clause(terms[0], c.isRequired(), c.isProhibited(), c.getConf()));
          } else {
            Term[] newTerms = new Term[newLength];
            System.arraycopy(terms, 0, newTerms, 0, newLength);
            output.addClause(new Clause(new Phrase(newTerms), c.isRequired(), c.isProhibited(), c.getConf()));
          }
        } else {
          output.addClause(c);
        }
      } else {
        output.addClause(c);
        if (plain) {
          termsCounter++;
        } else {
          termsExtraCounter++;
        }
      }
    }
    return output;
  }

  /**
   * Dedup search without a ranking limit (searcherMaxHits ignored) and
   * without functions or a version cap.
   */
  public Hits search(Query query, int numHits, int maxHitsPerDup, String dedupField, String sortField, boolean reverse, boolean waybackQuery) throws IOException {
    return search(query, numHits, MATCHED_DOCS_CONST_IGNORE, maxHitsPerDup, dedupField, sortField, reverse, null, Integer.MAX_VALUE, waybackQuery);
  }

  /** Returns the scoring explanation for a hit, with extra functions. */
  public String getExplanation(Query query, Hit hit, PwaFunctionsWritable functions) throws IOException {
    return searcher.getExplanation(query, hit, functions);
  }

  /** Returns the scoring explanation for a hit. */
  public String getExplanation(Query query, Hit hit) throws IOException {
    return searcher.getExplanation(query, hit, null);
  }

  public HitDetails getDetails(Hit hit) throws IOException {
    return detailer.getDetails(hit);
  }

  public HitDetails[] getDetails(Hit[] hits) throws IOException {
    return detailer.getDetails(hits);
  }

  /* BUG wayback 0000155 */
  public HitDetails[] getDetails(PwaRequestDetailsWritable details) throws IOException {
    return detailer.getDetails(details);
  }

  public Summary getSummary(HitDetails hit, Query query) throws IOException {
    return summarizer.getSummary(hit, query);
  }

  public Summary[] getSummary(HitDetails[] hits, Query query) throws IOException {
    return summarizer.getSummary(hits, query);
  }

  /* BUG nutchwax 0000616 */
  public Summary[] getSummary(PwaRequestSummaryWritable summaries) throws IOException {
    return summarizer.getSummary(summaries);
  }

  public byte[] getContent(HitDetails hit) throws IOException {
    return content.getContent(hit);
  }

  public ParseData getParseData(HitDetails hit) throws IOException {
    return content.getParseData(hit);
  }

  public ParseText getParseText(HitDetails hit) throws IOException {
    return content.getParseText(hit);
  }

  public String[] getAnchors(HitDetails hit) throws IOException {
    return linkDb.getAnchors(hit);
  }

  public Inlinks getInlinks(HitDetails hit) throws IOException {
    return linkDb.getInlinks(hit);
  }

  public long getFetchDate(HitDetails hit) throws IOException {
    return content.getFetchDate(hit);
  }

  /**
   * Closes all underlying resources. Best-effort: every resource is closed
   * even if an earlier close fails (the original aborted on the first
   * IOException, leaking the rest), and the first failure is rethrown.
   */
  public void close() throws IOException {
    IOException firstFailure = null;
    if (content != null) {
      try { content.close(); } catch (IOException e) { firstFailure = e; }
    }
    if (searcher != null) {
      try { searcher.close(); } catch (IOException e) { if (firstFailure == null) firstFailure = e; }
    }
    if (linkDb != null) {
      try { linkDb.close(); } catch (IOException e) { if (firstFailure == null) firstFailure = e; }
    }
    if (fs != null) {
      try { fs.close(); } catch (IOException e) { if (firstFailure == null) firstFailure = e; }
    }
    if (firstFailure != null) {
      throw firstFailure;
    }
  }

  /** For debugging: searches for the query given as the first argument and
   * prints the top hits with their summaries. */
  public static void main(String[] args) throws Exception {
    String usage = "NutchBean query";
    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }
    Configuration conf = NutchConfiguration.create();
    NutchBean bean = new NutchBean(conf);
    Query query = Query.parse(args[0], conf);
    Hits hits = bean.search(query, 10);
    System.out.println("Total hits: " + hits.getTotal());
    int length = (int) Math.min(hits.getTotal(), 10);
    Hit[] show = hits.getHits(0, length);
    HitDetails[] details = bean.getDetails(show);
    Summary[] summaries = bean.getSummary(details, query);
    for (int i = 0; i < hits.getLength(); i++) {
      System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]);
    }
  }

  /** Hadoop RPC versioning: only the distributed-search protocol is served. */
  public long getProtocolVersion(String className, long arg1) throws IOException {
    if (DistributedSearch.Protocol.class.getName().equals(className)) {
      return 1;
    } else {
      throw new IOException("Unknown Protocol classname:" + className);
    }
  }
}