// Package org.sf.mustru.utils
//
// Source code of org.sf.mustru.utils.BgramFilter

package org.sf.mustru.utils;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

* A bigram analyzer that uses an inner bgram filter class to create bigrams that start or end
* with stop words. This is somewhat similar to the Nutch analyzer. The bigram includes a "_' separator
* character. So, the string - <br> <br>
* Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do
* <br><br>
* will generate the following token list
* <br>
* 1:    [alice: ALPHANUM] 0:5       [alice_was: BIGRAM] 0:9 <br>
* 2:    [was_beginning: BIGRAM] 6:19    [beginning: ALPHANUM] 10:19  [beginning_to: BIGRAM] 10:22 <br>
* 3:    [to_get: BIGRAM] 20:26       [get: ALPHANUM] 23:26 <br>
* 4:    [very: ALPHANUM] 27:31 <br>
* 5:    [tired: ALPHANUM] 32:37     [tired_of: BIGRAM] 32:40 <br>
* 6:    [of_sitting: BIGRAM] 38:48     [sitting: ALPHANUM] 41:48  [sitting_by: BIGRAM] 41:51 <br>
* 7:    [by_her: BIGRAM] 49:55       [her: ALPHANUM] 52:55 <br>
* 8:    [sister: ALPHANUM] 56:62     [sister_on: BIGRAM] 56:65 <br>
* 9:    [on_the: BIGRAM] 63:69 <br>
* 10:    [the_bank: BIGRAM] 66:74     [bank: ALPHANUM] 70:74     [bank_and: BIGRAM] 70:79 <br>
* 11:    [and_of: BIGRAM] 76:82 <br>
* 12:    [of_having: BIGRAM] 80:89     [having: ALPHANUM] 83:89 <br>
* 13:    [nothing: ALPHANUM] 90:97     [nothing_to: BIGRAM] 90:100 <br>
* 14:    [to_do: BIGRAM] 98:103      [do: ALPHANUM] 101:103 <br>
public class StandardBgramAnalyzer extends Analyzer
private boolean extractEntities = true;
public StandardBgramAnalyzer() { }
public TokenStream tokenStream (String fieldName, Reader reader)
   TokenStream ts = (extractEntities) ?
     new EntFilter ( new BgramFilter( new LowerCaseFilter( new StandardFilter( new StandardTokenizer(reader) ) ) ) ):
                     new BgramFilter( new LowerCaseFilter( new StandardFilter( new StandardTokenizer(reader) ) ) );
   return (ts);
public boolean isExtractEntities()
{ return extractEntities; }
public void setExtractEntities(boolean extractEntities)
{ this.extractEntities = extractEntities; }

  // Optional inner class to find possible entities
final class EntFilter extends TokenFilter
  private Token stoken = null;       //*-- saved token
  private Token ptoken = null;      //*-- previous token
  private Pattern number = Pattern.compile("^[0-9,.]+$");
  private HashMap<String, String> ehash = Constants.getTokenEntXref();
  public EntFilter(TokenStream in)
  { super(in); }
  public final Token next() throws IOException
   //*-- check for a saved token
   if (stoken != null)
    { Token tempToken = stoken; stoken = null;
      tempToken.setPositionIncrement(0); return(tempToken); }
   Token ctoken = (ctoken == null) return (null);
   String ctext = ctoken.termText();
   //*-- check for an unigram entity
   String etype = ehash.get(ctext);
   if (etype != null) stoken = new Token(etype, ctoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
   //*-- check for a bigram entity
   if (ptoken != null)   
   { String bigram = ptoken.termText() + " " + ctext;
     etype = ehash.get(bigram);
     if (etype != null)
      stoken = new Token(etype, ptoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
   if (number.matcher(ctext).matches())
     stoken = new Token("enumber", ctoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
   ptoken = ctoken;
   return (ctoken);

  // Inner class to generate bigrams that start/end with stopwords using a stop word list
final class BgramFilter extends TokenFilter
   private Set stopWords;      //*-- set of stopwords copied from Constants
   private Token ptoken = null;    //*-- previous token
   private Token ctoken = null;      //*-- current token
   private Token stoken = null;      //*-- saved token
   private boolean pstop = false;    //*-- stop word flag for previous token
   private boolean cstop = false;    //*-- stop word flag for current token
   private static Pattern wordPattern = Pattern.compile("^.*?[a-zA-Z0-9].*?$");    //*-- pattern to check for alphanumeric chars
    * Load the stop word list
   public BgramFilter(TokenStream in)
   { super(in);
     this.stopWords = StopFilter.makeStopSet(Constants.getSTOP_WORDS());

     * Get the next token. Use bigrams for non-stop words adjacent to stop words
     * No.  Previous  Current    Output
     * 1.  stop    !stop    Word + bigram
     * 2.   !stop    !stop    Word
     * 3.   stop    stop    Bigram
     * 4.   !stop    stop    Bigram
     * 5.   null    stop    skip
     * 6.   null     !stop    Word
     * 7.   stop    null    null
     * 8.  !stop    null    null
    public final Token next() throws IOException
     //*-- if a token was saved, return it
     if (stoken != null) //*-- no. 1
     { Token tempToken = stoken; stoken = null;
       return(tempToken); }
     //*-- get the next token and check if it is a stop word
     ctoken =
     if (ctoken == null) return (null);    //*-- nos. 7 and 8
     String ctokenText = ctoken.termText();
     //*-- set stopword status: True if in stopwords list, or does NOT contain an alphabetic character
     cstop = ( stopWords.contains(ctokenText) ) ? true: (wordPattern.matcher(ctokenText).matches()) ? false: true;
     //*-- if there was a previous token
     if (ptoken == null)
     { //*-- continue if the current token is a stop word
       if (cstop)   //*-- no. 5
       { ptoken = ctoken; pstop = cstop;
       ctoken =; cstop = (ctoken != null) ? stopWords.contains(ctoken.termText()): false;
       } //*-- otherwise, return the current token
       else     //*-- no. 6
       { ptoken = ctoken; pstop = cstop; return(ctoken); }
     //*-- if the current token is a stop word, return a
     //*-- bigram from the previous and current tokens
     if (cstop)
      { Token tempToken = createBigram(ptoken, ctoken);
        if (!stopWords.contains(ptoken.termText()) ) tempToken.setPositionIncrement(0); //*-- no. 3
        ptoken = ctoken; pstop = cstop; return(tempToken); }    //*-- nos. 3 and 4
     //*-- if the previous token was a stop word, first save
     //*-- the current token and return a bigram from previous
     //*-- and current tokens. In next call, return the saved token
     if (pstop)    //*-- no. 1
     { if (ctoken != null)
      { stoken = ctoken; stoken.setPositionIncrement(0);
        Token tempToken = createBigram(ptoken, ctoken);
        ptoken = ctoken; pstop = cstop; return(tempToken);
     ptoken = ctoken; pstop = cstop;
     return(ctoken);    //*-- no. 2
    //*-- return a bigram token from the passed tokens
   protected Token createBigram(Token aToken, Token bToken)
   { String bigram = aToken.termText() + "_" + bToken.termText();
     return ( new Token(bigram, aToken.startOffset(), bToken.endOffset(), "<BIGRAM>" ) );

