/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.keymatch;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.util.DomUtil;
import org.apache.xerces.dom.DocumentImpl;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
* <p>SimpleKeyMatcher is responsible for targeting predefined links at defined
* keywords, for example to promote some URLs that are not yet part of the
* production index.</p>
* <p>SimpleKeyMatcher is not a text-ad targeting system.</p>
* <p>KeyMatcher is configured with xml configuration file:
* <br><pre>
* &lt;?xml version="1.0"?&gt;
* &lt;keymatches&gt;
* &lt;keymatch type="keyword|phrase|exact"&gt;
* &lt;term&gt;search engine&lt;/term&gt;
* &lt;url&gt;http://lucene.apache.org/nutch&lt;/url&gt;
* &lt;title&gt;Your favourite search engine!&lt;/title&gt;
* &lt;/keymatch&gt;
* &lt;/keymatches&gt;</pre>
* By default KeyMatcher expects the file to be named keymatches.xml
* </p>
* <p>Match type can be one of the following: keyword, phrase, exact match.
* Terms of a query are produced by the Query object and none of the
* matches is case sensitive</p>
* <b>keyword</b><br>
* Matches on keyword level, for example query "search engine" would match both
* keywords search and engine<br>
* <br>
* <b>phrase</b><br>
* Matches phrase, for example: queries "open source search engine" and "search engine watch"
* would match "search engine", but query "search from engine" would not.<br>
* <br>
* <b>exact</b><br>
* Query "open source search engine" would match "open source search engine", but not
* "search engine" nor "best open source engine"<br>
*
*/
public class SimpleKeyMatcher extends Configured {

  /** Map-key prefix used for keymatches of the default (keyword) type. */
  static final char PREFIX_KEYWORD='k';
  /** Map-key prefix used for keymatches of type {@code KeyMatch.TYPE_PHRASE}. */
  static final char PREFIX_PHRASE='p';
  /** Map-key prefix used for keymatches of type {@code KeyMatch.TYPE_EXACT}. */
  static final char PREFIX_EXACT='e';

  /**
   * Counts how many phrase keymatches were registered per phrase length
   * (number of tokens). Used by {@link SimpleKeyMatcher#getMatches} to skip
   * phrase lengths for which no keymatch was ever registered.
   */
  class KeyMatcherStats {

    /** terms[n] is the number of registered phrases that have n tokens. */
    int terms[];

    /**
     * @param size number of phrase lengths to track (lengths 0..size-1)
     */
    public KeyMatcherStats(int size) {
      // Java zero-initializes int arrays, no explicit fill is needed.
      terms = new int[size];
    }

    /**
     * Record one phrase with the given number of tokens. Lengths outside
     * the tracked range are ignored.
     *
     * @param numTerms number of tokens in the phrase
     */
    void addStats(int numTerms) {
      // strict upper bound: terms[terms.length] does not exist
      if (numTerms >= 0 && numTerms < terms.length) {
        terms[numTerms]++;
      }
    }

    /**
     * Tell whether a phrase with the given number of tokens may have been
     * registered.
     *
     * @param numTerms number of tokens in the candidate phrase
     * @return false only when the length is tracked and its counter is zero;
     *         untracked lengths are never counted by {@link #addStats(int)},
     *         so for them the answer is always true
     */
    boolean mayContain(int numTerms) {
      if (numTerms < 0 || numTerms >= terms.length) {
        return true;
      }
      return terms[numTerms] > 0;
    }
  }

  public static final Log LOG = LogFactory.getLog(SimpleKeyMatcher.class);

  public static final String TAG_KEYMATCH = "keymatch";

  public static final String TAG_KEYMATCHES = "keymatches";

  static final String DEFAULT_CONFIG_FILE = "keymatches.xml";

  /** Size of the per-length phrase statistics table. */
  static final int MAX_TERMS = 5;

  KeyMatcherStats stats;

  KeyMatchFilter currentFilter;

  /**
   * Maps prefix + lower-cased term to either a single KeyMatch or, when
   * several keymatches share the same key, an ArrayList of KeyMatch.
   */
  HashMap matches = new HashMap();

  private String configName;

  /**
   * Construct new SimpleKeyMatcher reading the default configuration file
   * ({@link #DEFAULT_CONFIG_FILE}).
   *
   * @param conf configuration used to locate the resource
   */
  public SimpleKeyMatcher(Configuration conf) {
    this(DEFAULT_CONFIG_FILE,conf);
  }

  /**
   * Sets currentFilter
   * @param filter the filter to set
   */
  public void setFilter(KeyMatchFilter filter) {
    this.currentFilter=filter;
  }

  /**
   * Construct new SimpleKeyMatcher with provided filename and configuration
   * @param resourceName name of the keymatch configuration resource
   * @param conf configuration used to locate the resource
   */
  public SimpleKeyMatcher(String resourceName, Configuration conf) {
    super(conf);
    configName=resourceName;
    stats = new KeyMatcherStats(MAX_TERMS);
    currentFilter=new ViewCountSorter();
    init();
  }

  /**
   * Initialize keyword matcher: load every keymatch element of the
   * configuration resource into a fresh map and, on success, replace the
   * current matches with it. When the resource cannot be found the current
   * matches are left untouched.
   */
  protected void init() {
    final InputStream input = getConf().getConfResourceAsInputStream(
        configName);
    if (input == null) {
      return;
    }

    Element root = null;
    try {
      root = DomUtil.getDom(input);
    } finally {
      // close the stream even when parsing fails
      try {
        input.close();
      } catch (IOException e) {
        LOG.warn("Error closing keymatch configuration '" + configName + "'", e);
      }
    }

    // NOTE(review): DomUtil.getDom presumably returns null when the document
    // cannot be parsed - confirm; guard so a broken file does not cause an NPE.
    if (root == null) {
      LOG.warn("Could not parse keymatch configuration '" + configName + "'");
      return;
    }

    final NodeList nodeList = root.getElementsByTagName(TAG_KEYMATCH);
    LOG.debug("Configuration file has " + nodeList.getLength()
        + " KeyMatch entries.");

    final HashMap tempMap = new HashMap();
    for (int i = 0; i < nodeList.getLength(); i++) {
      final Element element = (Element) nodeList.item(i);
      final KeyMatch keyMatch = new KeyMatch();
      keyMatch.initialize(element);
      addKeyMatch(tempMap, keyMatch);
    }
    matches=tempMap;
  }

  /**
   * Get keymatches for query. Looks up, in this order, every single term
   * (keyword matches), every run of two or more consecutive terms (phrase
   * matches) and the string form of the whole query (exact matches), then
   * passes the collected candidates through the current filter.
   *
   * @param query parsed query
   * @param context evaluation context
   * @return array of keymatches
   */
  public KeyMatch[] getMatches(final Query query, Map context) {
    final ArrayList currentMatches=new ArrayList();
    final String terms[]=query.getTerms();

    //"keyword"
    for(int i=0;i<terms.length;i++){
      if(LOG.isDebugEnabled()){
        LOG.debug("keyword: '" + terms[i] + "'");
      }
      addMatches(currentMatches, matches.get(PREFIX_KEYWORD + terms[i]));
    }

    //"phrase"
    for(int length=2;length<=terms.length;length++){
      // skip lengths for which no phrase was registered; the check is
      // bounds-safe, so queries longer than the stats table do not fail
      if(!stats.mayContain(length)) {
        continue;
      }
      for(int start=0;start+length<=terms.length;start++){
        final StringBuffer phrase=new StringBuffer();
        for(int i=start;i<start+length;i++){
          if(i>start) {
            phrase.append(' ');
          }
          phrase.append(terms[i]);
        }
        final String phraseKey=phrase.toString();
        if(LOG.isDebugEnabled()){
          LOG.debug("phrase key: '" + phraseKey + "'");
        }
        addMatches(currentMatches, matches.get(PREFIX_PHRASE + phraseKey));
      }
    }

    //"exact"
    final String exactKey=query.toString();
    if(LOG.isDebugEnabled()){
      LOG.debug("exact key: '" + exactKey + "'");
    }
    addMatches(currentMatches, matches.get(PREFIX_EXACT + exactKey));

    return currentFilter.filter(currentMatches, context);
  }

  /**
   * Append a map value to the list of collected matches.
   *
   * @param currentMatches list collecting the matches
   * @param match map value: null (ignored), a single match, or an ArrayList
   *        whose elements are all added
   */
  void addMatches(ArrayList currentMatches, Object match){
    if(match==null) {
      return;
    }
    if(match instanceof ArrayList) {
      currentMatches.addAll((ArrayList)match);
    } else {
      currentMatches.add(match);
    }
  }

  /** Get tokens of a string with nutch Query parser
   *
   * @param string text to tokenize
   * @return terms of the parsed query, or an empty array when parsing fails
   */
  private String[] getTokens(final String string){
    try {
      return org.apache.nutch.searcher.Query.parse(string, getConf()).getTerms();
    } catch (IOException e) {
      LOG.info("Error getting terms from query:" + e);
    }
    return new String[0];
  }

  /**
   * Add new keymatch into the given map. The term of the keymatch is
   * lower-cased as a side effect. When the key is already present the value
   * becomes an ArrayList holding all keymatches registered for that key.
   *
   * @param map map to add the keymatch to
   * @param keymatch keymatch to add
   */
  protected void addKeyMatch(Map map, final KeyMatch keymatch) {
    String key="";
    LOG.info("Adding keymatch: MATCHTYPE=" + KeyMatch.TYPES[keymatch.type] + ", TERM='" + keymatch.term + "', TITLE='"
        + keymatch.title + "' ,URL='" + keymatch.url + "'");

    keymatch.term=keymatch.term.toLowerCase();
    switch (keymatch.type) {
      case KeyMatch.TYPE_EXACT: key+=PREFIX_EXACT;break;
      case KeyMatch.TYPE_PHRASE: key+=PREFIX_PHRASE;break;
      default: key+=PREFIX_KEYWORD;break;
    }

    //add info about keyword count for optimization
    if(keymatch.type==KeyMatch.TYPE_PHRASE) {
      stats.addStats(getTokens(keymatch.term).length);
    }

    key+=keymatch.term;

    // Look the existing entry up in the map being filled (not in the
    // 'matches' field): init() and setKeyMatches() pass a temporary map.
    final Object existing = map.get(key);
    if(existing==null) {
      map.put(key, keymatch);
    } else if(existing instanceof ArrayList) {
      ((ArrayList) existing).add(keymatch);
    } else {
      // second keymatch for this key: promote the single value to a list
      final ArrayList list=new ArrayList();
      list.add(existing);
      list.add(keymatch);
      map.put(key, list);
    }
  }

  /**
   * Add Keymatch to the current set of matches.
   *
   * @param match keymatch to add
   */
  public void addKeyMatch(KeyMatch match){
    addKeyMatch(matches, match);
  }

  /**
   * Saves keymatch configuration into file.
   *
   * @throws IOException when the configuration resource cannot be located
   *         or the file cannot be written
   */
  public void save() throws IOException {
    final URL url = getConf().getResource(configName);
    if (url == null) {
      throw new IOException("Resource not found: " + configName);
    }

    // Build the document before opening the file so that a failure here
    // does not leave a truncated configuration file behind.
    final DocumentImpl doc = new DocumentImpl();
    final Element keymatches = doc.createElement(TAG_KEYMATCHES);
    for (final Iterator values = matches.values().iterator(); values.hasNext();) {
      final Object value = values.next();
      if (value instanceof ArrayList) {
        // several keymatches share this key
        for (final Iterator items = ((ArrayList) value).iterator(); items.hasNext();) {
          appendKeyMatch(doc, keymatches, (KeyMatch) items.next());
        }
      } else {
        appendKeyMatch(doc, keymatches, (KeyMatch) value);
      }
    }

    try {
      final FileOutputStream fos = new FileOutputStream(new File(url.getFile()));
      try {
        DomUtil.saveDom(fos, keymatches);
        fos.flush();
      } finally {
        fos.close();
      }
    } catch (FileNotFoundException e) {
      final IOException wrapped = new IOException(e.toString());
      wrapped.initCause(e);
      throw wrapped;
    }
  }

  /** Serialize one keymatch as a new child element of parent. */
  private void appendKeyMatch(DocumentImpl doc, Element parent, KeyMatch keyMatch) {
    final Element element = doc.createElement(TAG_KEYMATCH);
    keyMatch.populateElement(element);
    parent.appendChild(element);
  }

  /**
   * Clear keymatches from this SimpleKeyMatcher instance. The phrase-length
   * statistics are not reset.
   */
  public void clear(){
    matches=new HashMap();
  }

  /**
   * Replace the current keymatches with the given ones.
   *
   * @param keymatches list of KeyMatch objects
   */
  public void setKeyMatches(List keymatches){
    final HashMap replacement=new HashMap();
    for(final Iterator i=keymatches.iterator();i.hasNext();) {
      addKeyMatch(replacement,(KeyMatch)i.next());
    }
    matches=replacement;
  }
}