/* eXist Open Source Native XML Database
* Copyright (C) 2001/02, Wolfgang M. Meier (meier@ifs.tu-darmstadt.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* $Id$
*/
package org.exist.storage;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.Observable;
import java.util.TreeSet;
import org.apache.log4j.Logger;
import org.exist.collections.Collection;
import org.exist.dom.*;
import org.exist.fulltext.ElementContent;
import org.exist.security.PermissionDeniedException;
import org.exist.storage.analysis.SimpleTokenizer;
import org.exist.storage.analysis.Tokenizer;
import org.exist.storage.btree.DBException;
import org.exist.storage.serializers.Serializer;
import org.exist.util.Configuration;
import org.exist.util.Occurrences;
import org.exist.util.PorterStemmer;
import org.exist.xquery.TerminatedException;
import org.exist.xquery.XQueryContext;
/**
 * This is the base class for all classes providing access to the fulltext index.
 *
 * The class has methods to add text and attribute nodes to the fulltext index,
 * or to search for nodes matching selected search terms.
 *
 * The constructor reads the indexer settings (number indexing, stemming,
 * term frequency tracking, match tagging, tokenizer class and stopword file)
 * from the supplied {@link Configuration}.
 *
 * @author wolf
 */
public abstract class TextSearchEngine extends Observable {

    protected final static Logger LOG =
        Logger.getLogger(TextSearchEngine.class);

    /** Stopwords; filled by the constructor from the configured stopword file, if any. */
    protected TreeSet<String> stoplist = new TreeSet<String>();

    /** The broker passed to the constructor. */
    protected DBBroker broker = null;

    /** Tokenizer in use: the configured class, or a {@link SimpleTokenizer} as fallback. */
    protected Tokenizer tokenizer;

    /** The configuration passed to the constructor. */
    protected Configuration config;

    /** Value of {@link #PROPERTY_INDEX_NUMBERS}; defaults to false. */
    protected boolean indexNumbers = false ;

    /** Value of {@link #PROPERTY_STEM}; defaults to false. */
    protected boolean stem = false ;

    /** Value of {@link #PROPERTY_STORE_TERM_FREQUENCY}; defaults to true. */
    protected boolean termFreq = true;

    /** Only created when {@link #stem} is true. */
    protected PorterStemmer stemmer = null;

    /** Bit flags built from the Serializer.TAG_* constants. */
    protected int trackMatches = Serializer.TAG_ELEMENT_MATCHES;

    // NOTE(review): the *_ATTRIBUTE / *_ELEMENT_NAME constants are not used in this
    // class; presumably they name attributes/elements of the configuration file
    // and are read by the configuration parser - confirm against callers.
    public final static String INDEX_NUMBERS_ATTRIBUTE = "parseNumbers";
    public final static String STEM_ATTRIBUTE = "stemming";
    public final static String STORE_TERM_FREQUENCY_ATTRIBUTE = "track-term-freq";
    public final static String TOKENIZER_ATTRIBUTE = "tokenizer";
    public static final String CONFIGURATION_STOPWORDS_ELEMENT_NAME = "stopwords";
    public final static String STOPWORD_FILE_ATTRIBUTE = "file";

    // Keys looked up in the Configuration by the constructor.
    public final static String PROPERTY_INDEX_NUMBERS = "indexer.indexNumbers";
    public final static String PROPERTY_STEM = "indexer.stem";
    public final static String PROPERTY_STORE_TERM_FREQUENCY = "indexer.store-term-freq";
    public final static String PROPERTY_TOKENIZER = "indexer.tokenizer";
    public final static String PROPERTY_STOPWORD_FILE = "stopwords";

    /**
     * Construct a new instance and configure it.
     *
     * Settings missing from the configuration keep their field defaults. If the
     * configured tokenizer class cannot be loaded or instantiated, a
     * {@link SimpleTokenizer} is used instead. If the configured stopword file
     * cannot be read, the problem is logged and the stopword list keeps whatever
     * was read up to that point.
     *
     * @param broker the broker this engine is attached to
     * @param conf the configuration to read the indexer settings from
     */
    public TextSearchEngine(DBBroker broker, Configuration conf) {
        this.broker = broker;
        this.config = conf;

        final Boolean num = (Boolean) config.getProperty(PROPERTY_INDEX_NUMBERS);
        if (num != null) {
            indexNumbers = num.booleanValue();
        }
        final Boolean stemming = (Boolean) config.getProperty(PROPERTY_STEM);
        if (stemming != null) {
            stem = stemming.booleanValue();
        }
        final Boolean termFrequencies =
            (Boolean) config.getProperty(PROPERTY_STORE_TERM_FREQUENCY);
        if (termFrequencies != null) {
            termFreq = termFrequencies.booleanValue();
        }

        // Element match tagging replaces the default; attribute match tagging
        // is OR-ed on top of whatever the element setting produced.
        String track = (String) config.getProperty(Serializer.PROPERTY_TAG_MATCHING_ELEMENTS);
        if (track != null) {
            trackMatches = track.equalsIgnoreCase("yes")
                ? Serializer.TAG_ELEMENT_MATCHES
                : Serializer.TAG_NONE;
        }
        track = (String) config.getProperty(Serializer.PROPERTY_TAG_MATCHING_ATTRIBUTES);
        if (track != null && track.equalsIgnoreCase("yes")) {
            trackMatches = trackMatches | Serializer.TAG_ATTRIBUTE_MATCHES;
        }

        final String tokenizerClass = (String) config.getProperty(PROPERTY_TOKENIZER);
        if (tokenizerClass != null) {
            tokenizer = createTokenizer(tokenizerClass);
        }
        if (tokenizer == null) {
            LOG.debug("using simple tokenizer");
            tokenizer = new SimpleTokenizer();
        }

        if (stem) {
            stemmer = new PorterStemmer();
        }
        tokenizer.setStemming(stem);

        final String stopword = (String) config.getProperty(PROPERTY_STOPWORD_FILE);
        if (stopword != null) {
            loadStopwords(stopword);
        }
    }

    /**
     * Instantiates the tokenizer class with the given name through its no-arg
     * constructor.
     *
     * @param className fully qualified name of the tokenizer class
     * @return the new tokenizer, or null if the class could not be loaded or
     *         instantiated (the failure is logged)
     */
    private static Tokenizer createTokenizer(String className) {
        try {
            final Class<?> tokClass = Class.forName(className);
            final Tokenizer created = (Tokenizer) tokClass.newInstance();
            LOG.debug("using tokenizer: " + className);
            return created;
        } catch (final ClassNotFoundException e) {
            LOG.warn("tokenizer class not found: " + className, e);
        } catch (final InstantiationException e) {
            LOG.warn("cannot instantiate tokenizer: " + className, e);
        } catch (final IllegalAccessException e) {
            LOG.warn("cannot access tokenizer: " + className, e);
        }
        return null;
    }

    /**
     * Reads the given file with a {@link StreamTokenizer} and adds every word
     * token to the stopword list. Non-word tokens (numbers, quoted strings,
     * punctuation) are skipped. The file is read with the JVM's default
     * character encoding and is always closed again.
     *
     * @param path path of the stopword file
     */
    private void loadStopwords(String path) {
        FileReader in = null;
        try {
            in = new FileReader(path);
            final StreamTokenizer tok = new StreamTokenizer(in);
            int next;
            // Advance on every iteration: skipping a non-word token without
            // reading the next one would loop forever on the same token.
            while ((next = tok.nextToken()) != StreamTokenizer.TT_EOF) {
                if (next == StreamTokenizer.TT_WORD) {
                    stoplist.add(tok.sval);
                }
            }
        } catch (final FileNotFoundException e) {
            LOG.warn("stopword file not found: " + path, e);
        } catch (final IOException e) {
            LOG.warn("error reading stopword file: " + path, e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (final IOException e) {
                    // Nothing more to do; the stopwords have already been read.
                    LOG.debug(e);
                }
            }
        }
    }

    /**
     * Returns the Tokenizer used for tokenizing strings into
     * words.
     *
     * @return tokenizer
     */
    public Tokenizer getTokenizer() {
        return tokenizer;
    }

    /**
     * Tokenize and index the given text node.
     *
     * @param node the character data node to process
     * @param indexingHint implementation-defined hint (semantics not visible here)
     * @param indexSpec the fulltext index specification to apply
     * @param remove presumably true to remove the node's entries instead of
     *        adding them - TODO confirm against implementations
     */
    public abstract void storeText(CharacterDataImpl node, int indexingHint, FulltextIndexSpec indexSpec, boolean remove);

    /**
     * Variant of {@link #storeText(CharacterDataImpl, int, FulltextIndexSpec, boolean)}
     * taking the text as an {@link ElementContent} together with its parent node.
     *
     * @param parent the node the text belongs to
     * @param text the text content to process
     * @param indexingHint implementation-defined hint (semantics not visible here)
     * @param indexSpec the fulltext index specification to apply
     * @param remove presumably true to remove entries instead of adding them -
     *        TODO confirm against implementations
     */
    public abstract void storeText(StoredNode parent, ElementContent text, int indexingHint, FulltextIndexSpec indexSpec, boolean remove);

    /** Flush operation; the exact semantics are defined by the implementation. */
    public abstract void flush();

    /**
     * Close operation; the exact semantics and the meaning of the result are
     * defined by the implementation.
     *
     * @throws DBException declared for implementations to report failures
     */
    public abstract boolean close() throws DBException;

    /**
     * Returns the current match tracking flags.
     *
     * @return a combination of the Serializer.TAG_* flags
     */
    public int getTrackMatches() {
        return trackMatches;
    }

    /**
     * Replaces the current match tracking flags.
     *
     * @param flags the new flags
     */
    public void setTrackMatches(int flags) {
        trackMatches = flags;
    }

    /**
     * Convenience overload: calls
     * {@link #getNodesContaining(XQueryContext, DocumentSet, NodeSet, int, QName, String, int, boolean)}
     * with matchAll set to true.
     */
    public NodeSet getNodesContaining(XQueryContext context, DocumentSet docs,
            NodeSet contextSet, int axis, QName qname, String expr, int type) throws TerminatedException {
        return getNodesContaining(context, docs, contextSet, axis, qname, expr, type, true);
    }

    /**
     * For each of the given search terms and each of the documents in the
     * document set, return a node-set of matching nodes.
     *
     * The type-argument indicates if search terms should be compared using
     * a regular expression. Valid values are DBBroker.MATCH_EXACT or
     * DBBroker.MATCH_REGEXP.
     */
    public abstract NodeSet getNodesContaining(XQueryContext context, DocumentSet docs,
            NodeSet contextSet, int axis, QName qname, String expr, int type, boolean matchAll) throws TerminatedException;

    /**
     * Returns the nodes selected by the given {@link TermMatcher}. How
     * startTerm is used is up to the implementation (presumably a starting
     * point for the index scan - TODO confirm).
     */
    public abstract NodeSet getNodes(XQueryContext context, DocumentSet docs, NodeSet contextSet, int axis, QName qname,
            TermMatcher matcher, CharSequence startTerm) throws TerminatedException;

    /**
     * Queries the fulltext index to retrieve information on indexed words contained
     * in the index for the current collection. Returns a list of {@link Occurrences} for all
     * words contained in the index. If param end is null, all words starting with
     * the string sequence param start are returned. Otherwise, the method
     * returns all words that come after start and before end in lexical order.
     */
    public abstract Occurrences[] scanIndexTerms(
            DocumentSet docs,
            NodeSet contextSet,
            String start,
            String end) throws PermissionDeniedException;

    /**
     * Variant of {@link #scanIndexTerms(DocumentSet, NodeSet, String, String)}
     * that additionally takes an array of QNames (presumably restricting the
     * scan to those names - TODO confirm against implementations).
     */
    public abstract Occurrences[] scanIndexTerms(
            DocumentSet docs,
            NodeSet contextSet,
            QName[] qnames,
            String start,
            String end) throws PermissionDeniedException;

    /**
     * Returns index terms for the given documents as selected by the given
     * {@link TermMatcher}.
     */
    public abstract String[] getIndexTerms(DocumentSet docs, TermMatcher matcher);

    /**
     * Remove index entries for an entire collection.
     *
     * @param collection the collection whose index entries are removed
     */
    public abstract void dropIndex(Collection collection);

    /**
     * Remove all index entries for the given document.
     *
     * @param doc the document whose index entries are removed
     */
    public abstract void dropIndex(DocumentImpl doc);
}