package com.dbxml.db.common.fulltext;
/*
* dbXML - Native XML Database
* Copyright (c) 1999-2006 The dbXML Group, L.L.C.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* $Id: FullTextIndexer.java,v 1.7 2006/02/02 18:53:52 bradford Exp $
*/
import java.util.*;
import com.dbxml.db.common.btree.BTree;
import com.dbxml.db.common.btree.BTreeCallback;
import com.dbxml.db.common.btree.BTreeCorruptException;
import com.dbxml.db.core.ClassResolver;
import com.dbxml.db.core.Collection;
import com.dbxml.db.core.DBException;
import com.dbxml.db.core.data.Key;
import com.dbxml.db.core.data.Value;
import com.dbxml.db.core.indexer.IndexMatch;
import com.dbxml.db.core.indexer.IndexPattern;
import com.dbxml.db.core.indexer.IndexQuery;
import com.dbxml.db.core.indexer.Indexer;
import com.dbxml.db.core.transaction.Transaction;
import com.dbxml.util.ByteArray;
import com.dbxml.util.Configuration;
import com.dbxml.xml.SymbolTable;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
/**
 * FullTextIndexer is a full text search implementation of the
 * Indexer interface.
 * <p>
 * Every string handed to {@link #add} or {@link #remove} is split on
 * whitespace. Each token is case-normalized and stemmed (when a stemmer
 * is configured) and dropped if it appears in the stop word list. The
 * surviving distinct words are then stored in the underlying BTree, one
 * entry per word, packed together with the document key and the
 * position/element/attribute identifiers of the occurrence.
 */
public final class FullTextIndexer extends BTree implements Indexer {
   private static final IndexMatch[] EmptyMatches = new IndexMatch[0];
   private static final String[] EmptyStrings = new String[0];

   private static final Class DEFAULT_STEMMER_CLASS = PorterStemmer.class;
   private static final String DEFAULT_STEMMER = DEFAULT_STEMMER_CLASS.getName();

   static {
      // Make the default stemmer resolvable by name through the ClassResolver
      ClassResolver.register(DEFAULT_STEMMER, DEFAULT_STEMMER_CLASS);
   }

   private static final String DEFAULT_STOPWORDS = "config/stopwords.txt";

   /** Value passed as the last argument of addValue for every index entry. */
   private static final long MATCH_INFO = -1000;

   // Configuration attribute names
   private static final String NAME = "name";
   private static final String PATTERN = "pattern";
   private static final String PAGESIZE = "pagesize";
   private static final String STEMMER = "stemmer";
   private static final String STOPWORDS = "stopwords";
   private static final String ROLLCASE = "rollcase";

   /**
    * Number of bytes that follow the key in the data handed to
    * getIndexMatch: one separator byte plus three 4-byte integers.
    */
   private static final int MATCH_TRAILER_SIZE = 13;

   private Configuration config;
   private Collection collection;
   private SymbolTable symbols;

   private String name;
   private String pattern;
   private WordStemmer stemmer;
   private Set stopWords;
   private boolean rollCase = true;
   private boolean wildcard;   // true when the configured pattern contains '*'

   private FileHeader fileHeader;

   /**
    * Creates the indexer, turns on transaction support and keeps a
    * reference to the file header so the page size can be set later by
    * {@link #setConfig}.
    */
   public FullTextIndexer() {
      super(true);
      setTransactionSupported(true);
      fileHeader = getFileHeader();
   }

   /**
    * Applies the indexer configuration: name, pattern, case rolling,
    * stemmer class, stop word file and page size. Finally the index file
    * location is derived from the configured name.
    * <p>
    * Failures are reported on System.err and never propagated. A stemmer
    * that cannot be instantiated leaves the current stemmer untouched; a
    * stop word file that cannot be read leaves whatever words were read
    * before the failure (possibly none).
    *
    * @param config The configuration to read the attributes from
    */
   public void setConfig(Configuration config) {
      this.config = config;
      try {
         name = config.getAttribute(NAME);
         pattern = config.getAttribute(PATTERN);
         wildcard = pattern.indexOf('*') != -1;
         rollCase = config.getBooleanAttribute(ROLLCASE, rollCase);

         String stemClass = config.getAttribute(STEMMER, DEFAULT_STEMMER);
         if ( stemClass != null && stemClass.length() > 0 ) {
            try {
               stemmer = (WordStemmer)ClassResolver.get(stemClass).newInstance();
            }
            catch ( Exception e ) {
               System.err.println("Stemmer '" + stemClass + "' Not Found");
            }
         }

         String stopName = config.getAttribute(STOPWORDS, DEFAULT_STOPWORDS);
         if ( stopName != null && stopName.length() > 0 )
            stopWords = loadStopWords(stopName);

         fileHeader.setPageSize(config.getIntAttribute(PAGESIZE, fileHeader.getPageSize()));
         setLocation(name);
      }
      catch ( Exception e ) {
         e.printStackTrace(System.err);
      }
   }

   /**
    * Reads a stop word list, one word per line, from a UTF-8 encoded
    * file. Surrounding whitespace is stripped from every line and blank
    * lines are skipped. The file is always closed, even when reading
    * fails part way through.
    *
    * @param stopName Path of the stop word file
    * @return The set of words read; never null, possibly empty
    */
   private static Set loadStopWords(String stopName) {
      Set words = new HashSet();
      FileInputStream fis = null;
      try {
         fis = new FileInputStream(new File(stopName));
         BufferedInputStream bis = new BufferedInputStream(fis, 4096);
         BufferedReader br = new BufferedReader(new InputStreamReader(bis, "UTF8"));
         String line;
         while ( (line = br.readLine()) != null ) {
            // Tokens compared against this set never carry whitespace,
            // so the stored word must not carry any either.
            String word = line.trim();
            if ( word.length() > 0 )
               words.add(word);
         }
      }
      catch ( Exception e ) {
         System.err.println("Stop Word list '" + stopName + "' Not Found");
      }
      finally {
         if ( fis != null ) {
            try {
               fis.close();
            }
            catch ( IOException ignored ) {
               // Nothing useful can be done if closing a read-only stream fails
            }
         }
      }
      return words;
   }

   /**
    * @return The configuration last passed to {@link #setConfig}
    */
   public Configuration getConfig() {
      return config;
   }

   /**
    * @return The configured index name
    */
   public String getName() {
      return name;
   }

   /**
    * Points the index at the file <code>location + ".idx"</code> inside
    * the owning collection's root directory. The collection must have
    * been set before this is called.
    *
    * @param location The base name of the index file
    */
   public void setLocation(String location) {
      setFile(new File(collection.getCollectionRoot(), location + ".idx"));
   }

   /**
    * Sets the owning collection and picks up its symbol table. Failures
    * are printed to System.err and not propagated.
    *
    * @param collection The collection this index belongs to
    */
   public void setCollection(Collection collection) {
      try {
         this.collection = collection;
         symbols = collection.getSymbols();
      }
      catch ( Exception e ) {
         e.printStackTrace(System.err);
      }
   }

   /**
    * @return STYLE_FULLTEXT
    */
   public String getIndexStyle() {
      return STYLE_FULLTEXT;
   }

   /**
    * @return The configured stemmer, or null if none could be created
    */
   public WordStemmer getWordStemmer() {
      return stemmer;
   }

   /**
    * @return The stop word set, or null if no stop word list was configured
    */
   public Set getStopWords() {
      return stopWords;
   }

   /**
    * @return The configured index pattern
    */
   public String getPattern() {
      return pattern;
   }

   /**
    * Packs a word and the location of one of its occurrences into a
    * single Value with the layout:
    * <pre>
    * [word bytes][0][key bytes][0][pos:int][elemID:int][attrID:int][word length:short]
    * </pre>
    * The two single zero bytes are separators left over from the
    * zero-initialized buffer.
    *
    * @param value The (reduced) word
    * @param key The key of the document containing the word
    * @param pos The position recorded for the occurrence
    * @param elemID The element symbol ID
    * @param attrID The attribute symbol ID
    * @return The combined Value, or null if it could not be built
    */
   private Value getCombinedValue(Value value, Key key, int pos, int elemID, int attrID) {
      try {
         int valSize = value.getLength();
         int keySize = key.getLength();
         byte[] b = new byte[valSize + keySize + 16];

         System.arraycopy(value.getRawData(), value.getOffset(), b, 0, valSize);
         // Skip one byte so a zero separates the word from the key
         System.arraycopy(key.getRawData(), key.getOffset(), b, valSize + 1, keySize);

         // Skip another byte so a zero separates the key from the integers
         int l = valSize + keySize + 2;
         ByteArray.writeInt(b, l, pos);
         ByteArray.writeInt(b, l + 4, elemID);
         ByteArray.writeInt(b, l + 8, attrID);
         ByteArray.writeShort(b, l + 12, (short)valSize);
         return new Value(b);
      }
      catch ( Exception e ) {
         return null; // This will never happen
      }
   }

   /**
    * Runs a single word through the indexing pipeline: case
    * normalization, stop word filtering, then stemming. Normalization
    * and stemming are skipped when no stemmer is configured; filtering
    * is skipped when no stop word set is configured.
    *
    * @param word The raw word
    * @return The reduced word, or null if the word is a stop word
    */
   private String reduceWord(String word) {
      String s = word;
      if ( stemmer != null )
         s = stemmer.normalizeCase(s);
      if ( stopWords != null && stopWords.contains(s) )
         return null;
      if ( stemmer != null )
         s = stemmer.stemWord(s);
      return s;
   }

   /**
    * Splits a string on whitespace and reduces it to the sorted set of
    * distinct words that should be indexed for it.
    *
    * @param value The text to tokenize
    * @return The distinct reduced words in natural String order
    */
   private String[] getReducedSet(String value) {
      Set set = new TreeSet();
      StringTokenizer st = new StringTokenizer(value);
      while ( st.hasMoreTokens() ) {
         String s = reduceWord(st.nextToken());
         if ( s != null )
            set.add(s); // a Set ignores duplicates on its own
      }
      return (String[])set.toArray(EmptyStrings);
   }

   /**
    * Decodes the data delivered to the query callback into an
    * IndexMatch. The data is read as:
    * <pre>
    * [key bytes][0][pos:int][elemID:int][attrID:int]
    * </pre>
    * NOTE(review): this looks like the part of a combined value that
    * follows the word and its separator - confirm against BTree.
    *
    * @param v The data to decode
    * @return The decoded match
    */
   private IndexMatch getIndexMatch(Value v) {
      byte[] b = v.getData();
      int keyLen = b.length - MATCH_TRAILER_SIZE;
      Key key = new Key(b, 0, keyLen);
      // The integers start one byte past the key (the separator byte)
      int pos = ByteArray.readInt(b, keyLen + 1);
      int elemID = ByteArray.readInt(b, keyLen + 5);
      int attrID = ByteArray.readInt(b, keyLen + 9);
      return new IndexMatch(key, pos, elemID, attrID);
   }

   /**
    * Removes the index entries for every distinct reduced word of the
    * given text at the given location.
    *
    * @param tx The transaction to operate under
    * @param value The text whose words are to be removed
    * @param key The key of the document containing the text
    * @param pos The position recorded for the occurrence
    * @param elemID The element symbol ID
    * @param attrID The attribute symbol ID
    * @throws DBException If the BTree reports an error; an IOException
    *         is reported as a BTreeCorruptException, as it is on add
    */
   public void remove(Transaction tx, String value, Key key, int pos, int elemID, int attrID) throws DBException {
      String[] set = getReducedSet(value);
      for ( int i = 0; i < set.length; i++ ) {
         try {
            Value cv = getCombinedValue(new Value(set[i]), key, pos, elemID, attrID);
            removeValue(tx, cv);
         }
         catch ( DBException d ) {
            throw d;
         }
         catch ( Exception e ) {
            // Keep remove in line with add and queryMatches: an I/O
            // failure must not be swallowed silently.
            if ( e instanceof IOException )
               throw new BTreeCorruptException("Corruption detected on remove", (IOException)e);
            e.printStackTrace(System.err);
         }
      }
   }

   /**
    * Adds an index entry for every distinct reduced word of the given
    * text at the given location.
    *
    * @param tx The transaction to operate under
    * @param value The text whose words are to be indexed
    * @param key The key of the document containing the text
    * @param pos The position recorded for the occurrence
    * @param elemID The element symbol ID
    * @param attrID The attribute symbol ID
    * @throws DBException If the BTree reports an error; an IOException
    *         is reported as a BTreeCorruptException
    */
   public void add(Transaction tx, String value, Key key, int pos, int elemID, int attrID) throws DBException {
      String[] set = getReducedSet(value);
      for ( int i = 0; i < set.length; i++ ) {
         try {
            Value cv = getCombinedValue(new Value(set[i]), key, pos, elemID, attrID);
            addValue(tx, cv, MATCH_INFO);
         }
         catch ( DBException d ) {
            throw d;
         }
         catch ( IOException e ) {
            throw new BTreeCorruptException("Corruption detected on add", e);
         }
         catch ( Exception e ) {
            e.printStackTrace(System.err);
         }
      }
   }

   /**
    * Runs a query against the index. The query's values are first
    * rewritten in place with their reduced forms so they compare
    * against what was indexed; values that are stop words are left
    * unchanged. When the configured pattern contains a wildcard, each
    * hit is additionally checked against the query's pattern.
    * <p>
    * Exceptions other than DBException and IOException are printed to
    * System.err and the matches collected so far are returned.
    *
    * @param tx The transaction to operate under
    * @param query The query to run; its value array is modified
    * @return The matches found, never null
    * @throws DBException If the BTree reports an error; an IOException
    *         is reported as a BTreeCorruptException
    */
   public IndexMatch[] queryMatches(Transaction tx, final IndexQuery query) throws DBException {
      // Pre-process the value-set for stop words and stemming
      Value[] vals = query.getValues();
      // NOTE(review): a query without values presumably returns null
      // here - confirm against IndexQuery. Guarded so it cannot NPE.
      if ( vals != null ) {
         for ( int i = 0; i < vals.length; i++ ) {
            String reduced = reduceWord(vals[i].toString());
            if ( reduced != null )
               vals[i] = new Value(reduced);
         }
      }

      // Now issue the query
      final List results = new ArrayList(128);
      try {
         query(tx, query, new BTreeCallback() {
            public void indexInfo(Value value, Value extra) {
               try {
                  IndexMatch match = getIndexMatch(extra);
                  if ( !wildcard )
                     results.add(match);
                  else {
                     // A wildcard index holds entries for many patterns;
                     // keep only those matching the one queried for.
                     IndexPattern pt = new IndexPattern(symbols, match.getElement(), match.getAttribute());
                     if ( pt.getMatchLevel(query.getPattern()) > 0 )
                        results.add(match);
                  }
               }
               catch ( Exception e ) {
                  e.printStackTrace(System.err);
               }
            }
         });
      }
      catch ( IOException e ) {
         throw new BTreeCorruptException("Corruption detected on query", e);
      }
      catch ( Exception e ) {
         e.printStackTrace(System.err);
      }
      return (IndexMatch[])results.toArray(EmptyMatches);
   }
}