Package com.dbxml.db.common.fulltext

Source Code of com.dbxml.db.common.fulltext.FullTextIndexer

package com.dbxml.db.common.fulltext;

/*
* dbXML - Native XML Database
* Copyright (c) 1999-2006 The dbXML Group, L.L.C.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* $Id: FullTextIndexer.java,v 1.7 2006/02/02 18:53:52 bradford Exp $
*/

import java.util.*;

import com.dbxml.db.common.btree.BTree;
import com.dbxml.db.common.btree.BTreeCallback;
import com.dbxml.db.common.btree.BTreeCorruptException;
import com.dbxml.db.core.ClassResolver;
import com.dbxml.db.core.Collection;
import com.dbxml.db.core.DBException;
import com.dbxml.db.core.data.Key;
import com.dbxml.db.core.data.Value;
import com.dbxml.db.core.indexer.IndexMatch;
import com.dbxml.db.core.indexer.IndexPattern;
import com.dbxml.db.core.indexer.IndexQuery;
import com.dbxml.db.core.indexer.Indexer;
import com.dbxml.db.core.transaction.Transaction;
import com.dbxml.util.ByteArray;
import com.dbxml.util.Configuration;
import com.dbxml.xml.SymbolTable;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;

/**
* FullTextIndexer is a full text search implementation of the
* Indexer interface.
*/

public final class FullTextIndexer extends BTree implements Indexer {
   // Zero-length typed arrays handed to Collection.toArray() so that the
   // results of queryMatches and getReducedSet come back with the right
   // runtime array type.
   private static final IndexMatch[] EmptyMatches = new IndexMatch[0];
   private static final String[] EmptyStrings = new String[0];

   // Stemmer used when the configuration does not name one.  DEFAULT_STEMMER
   // is derived from DEFAULT_STEMMER_CLASS, so the declaration order of these
   // two constants and the static block below must be kept.
   private static final Class DEFAULT_STEMMER_CLASS = PorterStemmer.class;
   private static final String DEFAULT_STEMMER = DEFAULT_STEMMER_CLASS.getName();

   // Registers the default stemmer with ClassResolver under its class name;
   // setConfig looks the configured stemmer class up via ClassResolver.get.
   static {
      ClassResolver.register(DEFAULT_STEMMER, DEFAULT_STEMMER_CLASS);
   }

   // Stop-word file used when the configuration has no "stopwords" attribute.
   // It is opened with new File(String), so this relative path is resolved
   // against the JVM's current working directory.
   private static final String DEFAULT_STOPWORDS = "config/stopwords.txt";

   // Passed as the last argument to addValue for every entry written by add().
   // NOTE(review): the meaning of -1000 is defined by BTree, not here - confirm.
   private static final long MATCH_INFO = -1000;

   // Names of the configuration attributes read in setConfig.
   private static final String NAME = "name";
   private static final String PATTERN = "pattern";
   private static final String PAGESIZE = "pagesize";
   private static final String STEMMER = "stemmer";
   private static final String STOPWORDS = "stopwords";
   private static final String ROLLCASE = "rollcase";

   // Set by setConfig / setCollection.  symbols is taken from the collection
   // and is used to build an IndexPattern when filtering wildcard queries.
   private Configuration config;
   private Collection collection;
   private SymbolTable symbols;

   private String name;        // "name" attribute; also the index file's base name
   private String pattern;     // "pattern" attribute
   private WordStemmer stemmer; // null if none configured or instantiation failed
   private Set stopWords;      // words skipped by getReducedSet; null until loaded
   // Read from the "rollcase" attribute (default true); no other code in this
   // class consults it.
   private boolean rollCase = true;
   // True when pattern contains '*'; queryMatches then filters every match
   // against the query's pattern.
   private boolean wildcard;

   // Header obtained from getFileHeader() in the constructor; setConfig uses
   // it to read and set the page size.
   private FileHeader fileHeader;

   public FullTextIndexer() {
      super(true);
      setTransactionSupported(true);
      fileHeader = getFileHeader();
   }

   public void setConfig(Configuration config) {
      this.config = config;
      try {
         name = config.getAttribute(NAME);

         pattern = config.getAttribute(PATTERN);
         wildcard = pattern.indexOf('*') != -1;

         rollCase = config.getBooleanAttribute(ROLLCASE, rollCase);

         String stemClass = config.getAttribute(STEMMER, DEFAULT_STEMMER);
         if ( stemClass != null && stemClass.length() > 0 ) {
            try {
               stemmer = (WordStemmer)ClassResolver.get(stemClass).newInstance();
            }
            catch ( Exception e ) {
               System.err.println("Stemmer '" + stemClass + "' Not Found");
            }
         }

         String stopName = config.getAttribute(STOPWORDS, DEFAULT_STOPWORDS);
         if ( stopName != null && stopName.length() > 0 ) {
            try {
               stopWords = new HashSet();
               File stopFile = new File(stopName);
               FileInputStream fis = new FileInputStream(stopFile);
               BufferedInputStream bis = new BufferedInputStream(fis, 4096);
               InputStreamReader isr = new InputStreamReader(bis, "UTF8");
               BufferedReader br = new BufferedReader(isr);
               String word = null;
               do {
                  word = br.readLine();
                  if ( word != null && word.trim().length() > 0 )
                     stopWords.add(word);
               }
               while ( word != null );
               br.close();
            }
            catch ( Exception e ) {
               System.err.println("Stop Word list '" + stopName + "' Not Found");
            }
         }

         fileHeader.setPageSize(config.getIntAttribute(PAGESIZE, fileHeader.getPageSize()));

         setLocation(name);
      }
      catch ( Exception e ) {
         e.printStackTrace(System.err);
      }
   }

   public Configuration getConfig() {
      return config;
   }

   public String getName() {
      return name;
   }

   public void setLocation(String location) {
      setFile(new File(collection.getCollectionRoot(), location + ".idx"));
   }

   public void setCollection(Collection collection) {
      try {
         this.collection = collection;
         symbols = collection.getSymbols();
      }
      catch ( Exception e ) {
         e.printStackTrace(System.err);
      }
   }

   public String getIndexStyle() {
      return STYLE_FULLTEXT;
   }

   public WordStemmer getWordStemmer() {
      return stemmer;
   }

   public Set getStopWords() {
      return stopWords;
   }

   public String getPattern() {
      return pattern;
   }

   private Value getCombinedValue(Value value, Key key, int pos, int elemID, int attrID) {
      Value result;
      try {
         int valSize = value.getLength();
         int keySize = key.getLength();
         int totalSize = valSize+keySize+16;

         byte[] b = new byte[totalSize];

         System.arraycopy(value.getRawData(), value.getOffset(), b, 0, valSize);
         System.arraycopy(key.getRawData(), key.getOffset(), b, valSize+1, keySize);

         int l = valSize+keySize+2;

         ByteArray.writeInt(b, l, pos);    // Write the pos
         ByteArray.writeInt(b, l+4, elemID); // Write the elemID
         ByteArray.writeInt(b, l+8, attrID); // Write the attrID
         ByteArray.writeShort(b, l+12, (short)valSize);

         result = new Value(b);
      }
      catch ( Exception e ) {
         result = null; // This will never happen
      }
      return result;
   }

   private String[] getReducedSet(String value) {
      Set set = new TreeSet();
      StringTokenizer st = new StringTokenizer(value);
      while ( st.hasMoreTokens() ) {
         String s = st.nextToken();

         if ( stemmer != null )
            s = stemmer.normalizeCase(s);

         if ( stopWords != null && stopWords.contains(s) )
            continue;

         if ( stemmer != null )
            s = stemmer.stemWord(s);

         if ( !set.contains(s) )
            set.add(s);
      }
      return (String[])set.toArray(EmptyStrings);
   }

   private IndexMatch getIndexMatch(Value v) {
      byte[] b = v.getData();
      int l = b.length - 13;
      Key key = new Key(b, 0, b.length - 13);

      int pos = ByteArray.readInt(b, l+1);
      int elemID = ByteArray.readInt(b, l+5);
      int attrID = ByteArray.readInt(b, l+9);

      return new IndexMatch(key, pos, elemID, attrID);
   }

   public void remove(Transaction tx, String value, Key key, int pos, int elemID, int attrID) throws DBException {
      String[] set = getReducedSet(value);
      for ( int i = 0; i < set.length; i++ ) {
         try {
            Value cv = getCombinedValue(new Value(set[i]), key, pos, elemID, attrID);
            removeValue(tx, cv);
         }
         catch ( DBException d ) {
            throw d;
         }
         catch ( Exception e ) {
            e.printStackTrace(System.err);
         }
      }
   }

   public void add(Transaction tx, String value, Key key, int pos, int elemID, int attrID) throws DBException {
      String[] set = getReducedSet(value);
      for ( int i = 0; i < set.length; i++ ) {
         try {
            Value cv = getCombinedValue(new Value(set[i]), key, pos, elemID, attrID);
            addValue(tx, cv, MATCH_INFO);
         }
         catch ( DBException d ) {
            throw d;
         }
         catch ( IOException e ) {
            throw new BTreeCorruptException("Corruption detected on add", e);
         }
         catch ( Exception e ) {
            e.printStackTrace(System.err);
         }
      }
   }

   public IndexMatch[] queryMatches(Transaction tx, final IndexQuery query) throws DBException {
      // Pre-process the value-set for stop words and stemming
      Value[] vals = query.getValues();
      for ( int i = 0; i < vals.length; i++ ) {
         String s = vals[i].toString();

         if ( stemmer != null )
            s = stemmer.normalizeCase(s);

         if ( stopWords != null && stopWords.contains(s) )
            continue;

         if ( stemmer != null )
            s = stemmer.stemWord(s);

         vals[i] = new Value(s);
      }

      // Now issue the query
      final List results = new ArrayList(128);

      try {
         query(tx, query, new BTreeCallback() {
            public void indexInfo(Value value, Value extra) {
               try {
                  IndexMatch match = getIndexMatch(extra);
                  if ( !wildcard )
                     results.add(match);
                  else {
                     IndexPattern pt = new IndexPattern(symbols, match.getElement(), match.getAttribute());
                     if ( pt.getMatchLevel(query.getPattern()) > 0 )
                        results.add(match);
                  }
               }
               catch ( Exception e ) {
                  e.printStackTrace(System.err);
               }
            }
         });
      }
      catch ( IOException e ) {
         throw new BTreeCorruptException("Corruption detected on query", e);
      }
      catch ( Exception e ) {
         e.printStackTrace(System.err);
      }

      return (IndexMatch[])results.toArray(EmptyMatches);
   }
}

TOP

Related Classes of com.dbxml.db.common.fulltext.FullTextIndexer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.