Source Code of org.exist.storage.TextSearchEngine


/* eXist Open Source Native XML Database
 * Copyright (C) 2001/02,  Wolfgang M. Meier (meier@ifs.tu-darmstadt.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 * 
 * $Id$
 */
package org.exist.storage;


import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.Observable;
import java.util.TreeSet;


import org.apache.log4j.Logger;
import org.exist.collections.Collection;
import org.exist.dom.*;
import org.exist.fulltext.ElementContent;
import org.exist.security.PermissionDeniedException;
import org.exist.storage.analysis.SimpleTokenizer;
import org.exist.storage.analysis.Tokenizer;
import org.exist.storage.btree.DBException;
import org.exist.storage.serializers.Serializer;
import org.exist.util.Configuration;
import org.exist.util.Occurrences;
import org.exist.util.PorterStemmer;
import org.exist.xquery.TerminatedException;
import org.exist.xquery.XQueryContext;


/**
 * This is the base class for all classes providing access to the fulltext index.
 * 
 * The class has methods to add text and attribute nodes to the fulltext index,
 * or to search for nodes matching selected search terms.
 *  
 * @author wolf
 */
public abstract class TextSearchEngine extends Observable {


  protected final static Logger LOG =
    Logger.getLogger(TextSearchEngine.class);
    
    protected TreeSet<String> stoplist = new TreeSet<String>();
  protected DBBroker broker = null;
  protected Tokenizer tokenizer;
  protected Configuration config;
  protected boolean indexNumbers = false ;
  protected boolean stem = false ;
  protected boolean termFreq = true;
  protected PorterStemmer stemmer = null;
  protected int trackMatches = Serializer.TAG_ELEMENT_MATCHES;
  
  public final static String INDEX_NUMBERS_ATTRIBUTE = "parseNumbers";
  public final static String STEM_ATTRIBUTE = "stemming";
  public final static String STORE_TERM_FREQUENCY_ATTRIBUTE = "track-term-freq";
  public final static String TOKENIZER_ATTRIBUTE = "tokenizer";
  public static final String CONFIGURATION_STOPWORDS_ELEMENT_NAME = "stopwords";
  public final static String STOPWORD_FILE_ATTRIBUTE = "file";
  
  public final static String PROPERTY_INDEX_NUMBERS = "indexer.indexNumbers";
  public final static String PROPERTY_STEM = "indexer.stem";
  public final static String PROPERTY_STORE_TERM_FREQUENCY = "indexer.store-term-freq";
  public final static String PROPERTY_TOKENIZER = "indexer.tokenizer";
  public final static String PROPERTY_STOPWORD_FILE = "stopwords";
  
  /**
   * Construct a new instance and configure it.
   * 
   * @param broker
   * @param conf
   */
  public TextSearchEngine(DBBroker broker, Configuration conf) {
    this.broker = broker;
    this.config = conf;
    String stopword, tokenizerClass;
    Boolean num, stemming, termFrequencies;
    if ((num = (Boolean) config.getProperty(PROPERTY_INDEX_NUMBERS)) != null)
      {indexNumbers = num.booleanValue();}
    if ((stemming = (Boolean) config.getProperty(PROPERTY_STEM)) != null)
      {stem = stemming.booleanValue();}
    if((termFrequencies = (Boolean) config.getProperty(PROPERTY_STORE_TERM_FREQUENCY)) != null)
      {termFreq = termFrequencies.booleanValue();}
    String track = (String) config.getProperty(Serializer.PROPERTY_TAG_MATCHING_ELEMENTS);
    if (track != null)
      {trackMatches = track.equalsIgnoreCase("yes")
      ? Serializer.TAG_ELEMENT_MATCHES
          : Serializer.TAG_NONE;}
    track = (String) config.getProperty(Serializer.PROPERTY_TAG_MATCHING_ATTRIBUTES);
    if (track != null && track.equalsIgnoreCase("yes"))
      {trackMatches = trackMatches | Serializer.TAG_ATTRIBUTE_MATCHES;}
    
    if ((tokenizerClass = (String) config.getProperty(PROPERTY_TOKENIZER)) != null) {
      try {
        final Class<?> tokClass = Class.forName(tokenizerClass);
        tokenizer = (Tokenizer) tokClass.newInstance();
        LOG.debug("using tokenizer: " + tokenizerClass);
      } catch (final ClassNotFoundException e) {
        LOG.debug(e);
      } catch (final InstantiationException e) {
        LOG.debug(e);
      } catch (final IllegalAccessException e) {
        LOG.debug(e);
      }
    }
    if (tokenizer == null) {
      LOG.debug("using simple tokenizer");
      tokenizer = new SimpleTokenizer();
    }


    if (stem)
      {stemmer = new PorterStemmer();}
    tokenizer.setStemming(stem);
    if ((stopword = (String) config.getProperty(PROPERTY_STOPWORD_FILE)) != null) {
      try {
        final FileReader in = new FileReader(stopword);
        final StreamTokenizer tok = new StreamTokenizer(in);
        int next = tok.nextToken();
        while (next != StreamTokenizer.TT_EOF) {
          if (next != StreamTokenizer.TT_WORD)
            {continue;}
          stoplist.add(tok.sval);
          next = tok.nextToken();
        }
      } catch (final FileNotFoundException e) {
        LOG.debug(e);
      } catch (final IOException e) {
        LOG.debug(e);
      }
    }
  }


    /**
   * Returns the Tokenizer used for tokenizing strings into
   * words.
   * 
   * @return tokenizer
   */
  public Tokenizer getTokenizer() {
    return tokenizer;
  }
    
    /**
   * Tokenize and index the given text node.
   * 
   * @param indexSpec
   * @param node
   */
  public abstract void storeText(CharacterDataImpl node, int indexingHint, FulltextIndexSpec indexSpec, boolean remove);
    public abstract void storeText(StoredNode parent, ElementContent text, int indexingHint, FulltextIndexSpec indexSpec, boolean remove);


  public abstract void flush();
  public abstract boolean close() throws DBException;


  public int getTrackMatches() {
    return trackMatches;
  }
  
  public void setTrackMatches(int flags) {
    trackMatches = flags;
  }


    public NodeSet getNodesContaining(XQueryContext context, DocumentSet docs,
          NodeSet contextSet, int axis, QName qname, String expr, int type) throws TerminatedException {
        return getNodesContaining(context, docs, contextSet, axis, qname, expr, type, true);
    }


    /**
   * For each of the given search terms and each of the documents in the
   * document set, return a node-set of matching nodes.
   *
   * The type-argument indicates if search terms should be compared using
   * a regular expression. Valid values are DBBroker.MATCH_EXACT or
   * DBBroker.MATCH_REGEXP.
   */
  public abstract NodeSet getNodesContaining(XQueryContext context, DocumentSet docs,
          NodeSet contextSet, int axis, QName qname, String expr, int type, boolean matchAll) throws TerminatedException;
  
  public abstract NodeSet getNodes(XQueryContext context, DocumentSet docs, NodeSet contextSet, int axis, QName qname,
          TermMatcher matcher, CharSequence startTerm) throws TerminatedException;
  
  /**
   * Queries the fulltext index to retrieve information on indexed words contained
   * in the index for the current collection. Returns a list of {@link Occurrences} for all 
   * words contained in the index. If param end is null, all words starting with 
   * the string sequence param start are returned. Otherwise, the method 
   * returns all words that come after start and before end in lexical order.
   */
  public abstract Occurrences[] scanIndexTerms(
    DocumentSet docs,
    NodeSet contextSet,
    String start,
    String end) throws PermissionDeniedException;


    public abstract Occurrences[] scanIndexTerms(
    DocumentSet docs,
    NodeSet contextSet,
        QName[] qnames,
        String start,
    String end) throws PermissionDeniedException;
    
    public abstract String[] getIndexTerms(DocumentSet docs, TermMatcher matcher);
  
  /**
   * Remove index entries for an entire collection.
   * 
   * @param collection
   */
  public abstract void dropIndex(Collection collection);
  
  /**
   * Remove all index entries for the given document.
   * 
   * @param doc
   */
  public abstract void dropIndex(DocumentImpl doc);
  
}
Source Code of org.exist.storage.TextSearchEngine

Related Classes of org.exist.storage.TextSearchEngine