package org.sf.mustru.utils;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
* A bigram analyzer that uses a bigram filter class to create bigrams that start or end
* with stop words. This is somewhat similar to the Nutch analyzer. Each bigram joins its two
* words with a "_" separator character. So, the string - <br> <br>
*
* Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do
* <br><br>
* will generate the following token list
* <br>
* 1: [alice: ALPHANUM] 0:5 [alice_was: BIGRAM] 0:9 <br>
* 2: [was_beginning: BIGRAM] 6:19 [beginning: ALPHANUM] 10:19 [beginning_to: BIGRAM] 10:22 <br>
* 3: [to_get: BIGRAM] 20:26 [get: ALPHANUM] 23:26 <br>
* 4: [very: ALPHANUM] 27:31 <br>
* 5: [tired: ALPHANUM] 32:37 [tired_of: BIGRAM] 32:40 <br>
* 6: [of_sitting: BIGRAM] 38:48 [sitting: ALPHANUM] 41:48 [sitting_by: BIGRAM] 41:51 <br>
* 7: [by_her: BIGRAM] 49:55 [her: ALPHANUM] 52:55 <br>
* 8: [sister: ALPHANUM] 56:62 [sister_on: BIGRAM] 56:65 <br>
* 9: [on_the: BIGRAM] 63:69 <br>
* 10: [the_bank: BIGRAM] 66:74 [bank: ALPHANUM] 70:74 [bank_and: BIGRAM] 70:79 <br>
* 11: [and_of: BIGRAM] 76:82 <br>
* 12: [of_having: BIGRAM] 80:89 [having: ALPHANUM] 83:89 <br>
* 13: [nothing: ALPHANUM] 90:97 [nothing_to: BIGRAM] 90:100 <br>
* 14: [to_do: BIGRAM] 98:103 [do: ALPHANUM] 101:103 <br>
*
*/
public class StandardBgramAnalyzer extends Analyzer
{
 /** When true, the bigram stream is additionally wrapped in an EntFilter. Defaults to true. */
 private boolean extractEntities = true;

 /** Creates an analyzer with entity extraction switched on. */
 public StandardBgramAnalyzer() { }

 /**
  * Builds the token stream for a field: StandardTokenizer, then StandardFilter,
  * LowerCaseFilter and BgramFilter, optionally followed by EntFilter.
  *
  * @param fieldName name of the field being analyzed (not used by this analyzer)
  * @param reader source of the text to tokenize
  * @return the assembled filter chain
  */
 public TokenStream tokenStream (String fieldName, Reader reader)
 {
  //*-- the tokenizer / lowercase / bigram chain is common to both modes
  TokenStream bigrams = new BgramFilter( new LowerCaseFilter( new StandardFilter( new StandardTokenizer(reader) ) ) );
  if (extractEntities)
  { return new EntFilter(bigrams); }
  return bigrams;
 }

 /** @return true if entity tokens are added to the stream */
 public boolean isExtractEntities()
 {
  return extractEntities;
 }

 /** @param extractEntities true to wrap the bigram stream in an EntFilter */
 public void setExtractEntities(boolean extractEntities)
 {
  this.extractEntities = extractEntities;
 }
}
/**
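* Optional token filter (a package-private top-level class, not an inner class) that
* follows a token with an entity token when the token, the previous-plus-current token
* pair, or a numeric token is recognised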
*/
final class EntFilter extends TokenFilter
{
 //*-- tokens made up only of digits, commas and periods; compiled once for all instances.
 //*-- NOTE(review): this also matches tokens with no digit at all (e.g. "," or "..") -
 //*-- confirm whether the upstream tokenizer can ever emit such tokens
 private static final Pattern NUMBER = Pattern.compile("^[0-9,.]+$");

 private Token stoken = null; //*-- saved entity token, returned by the following call
 private Token ptoken = null; //*-- previous token read from the input
 private final HashMap<String, String> ehash = Constants.getTokenEntXref(); //*-- token text -> entity type

 public EntFilter(TokenStream in)
 { super(in); }

 /**
  * Returns the next token. Every input token is passed through unchanged; when the
  * token is recognised as an entity, an extra token of type "&lt;ENTITY&gt;" is saved
  * and returned by the following call with a position increment of 0.
  *
  * Only one entity token can be saved per input token. The checks run in the order
  * unigram, bigram, number and each match overwrites the previous one, so a number
  * match wins over a bigram match, which wins over a unigram match.
  *
  * @return the next token, or null when the input is exhausted
  * @throws IOException if the underlying stream fails
  */
 public final Token next() throws IOException
 {
  //*-- check for a saved token
  if (stoken != null)
  { Token saved = stoken; stoken = null;
    saved.setPositionIncrement(0);
    return saved; }

  Token ctoken = input.next();
  if (ctoken == null) return null;
  String ctext = ctoken.termText();

  //*-- check for a unigram entity
  String etype = ehash.get(ctext);
  if (etype != null)
   stoken = new Token(etype, ctoken.startOffset(), ctoken.endOffset(), "<ENTITY>");

  //*-- check for a bigram entity: previous and current text joined by a space,
  //*-- spanning from the start of the previous token to the end of the current one
  if (ptoken != null)
  { etype = ehash.get(ptoken.termText() + " " + ctext);
    if (etype != null)
     stoken = new Token(etype, ptoken.startOffset(), ctoken.endOffset(), "<ENTITY>");
  }

  //*-- check for a number
  if (NUMBER.matcher(ctext).matches())
   stoken = new Token("enumber", ctoken.startOffset(), ctoken.endOffset(), "<ENTITY>");

  ptoken = ctoken;
  return ctoken;
 }
}
/**
* Token filter (a package-private top-level class, not an inner class) that generates
* bigrams that start/end with stopwords using a stop word list
*/
final class BgramFilter extends TokenFilter
{
 //*-- pattern to check for at least one alphanumeric character
 private static final Pattern wordPattern = Pattern.compile("^.*?[a-zA-Z0-9].*?$");

 private final Set<?> stopWords;  //*-- set of stopwords built from Constants
 private Token ptoken = null;     //*-- previous token
 private Token ctoken = null;     //*-- current token
 private Token stoken = null;     //*-- saved token
 private boolean pstop = false;   //*-- stop word flag for previous token
 private boolean cstop = false;   //*-- stop word flag for current token

 /**
  * Load the stop word list
  *
  * @param in the token stream to read from
  */
 public BgramFilter(TokenStream in)
 { super(in);
   this.stopWords = StopFilter.makeStopSet(Constants.getSTOP_WORDS());
 }

 /**
  * Get the next token. Use bigrams for non-stop words adjacent to stop words
  *
  * No. Previous Current Output
  * 1. stop !stop Word + bigram
  * 2. !stop !stop Word
  * 3. stop stop Bigram
  * 4. !stop stop Bigram
  * 5. null stop skip
  * 6. null !stop Word
  * 7. stop null null
  * 8. !stop null null
  *
  * A token counts as a stop word when it is in the stop word list or when it contains
  * no alphanumeric character; the same rule is applied to every token read.
  *
  * @return the next word or bigram token, or null when the input is exhausted
  * @throws IOException if the underlying stream fails
  */
 public final Token next() throws IOException
 {
  //*-- if a token was saved, return it
  if (stoken != null) //*-- no. 1
  { Token saved = stoken; stoken = null;
    return saved; }

  //*-- get the next token and check if it is a stop word
  ctoken = input.next();
  if (ctoken == null) return null; //*-- nos. 7 and 8
  cstop = isStop(ctoken.termText());

  //*-- if there was no previous token
  if (ptoken == null)
  {
   if (!cstop) //*-- no. 6
   { ptoken = ctoken; pstop = cstop; return ctoken; }

   //*-- no. 5: a leading stop word is not returned on its own; it becomes the
   //*-- previous token and the following token is read immediately
   ptoken = ctoken; pstop = cstop;
   ctoken = input.next();
   //*-- classify with the same rule as above (list membership OR no alphanumeric char)
   cstop = (ctoken != null) && isStop(ctoken.termText());
  }

  //*-- if the current token is a stop word, return a
  //*-- bigram from the previous and current tokens
  if (cstop) //*-- nos. 3 and 4
  { Token bigram = createBigram(ptoken, ctoken);
    //*-- no. 4: the previous word was already returned by itself, so the bigram
    //*-- gets a position increment of 0
    if (!pstop) bigram.setPositionIncrement(0);
    ptoken = ctoken; pstop = cstop;
    return bigram; }

  //*-- if the previous token was a stop word, first save
  //*-- the current token and return a bigram from previous
  //*-- and current tokens. In next call, return the saved token
  if (pstop && ctoken != null) //*-- no. 1
  { stoken = ctoken; stoken.setPositionIncrement(0);
    Token bigram = createBigram(ptoken, ctoken);
    ptoken = ctoken; pstop = cstop;
    return bigram; }

  //*-- no. 2, or null when the input ended right after a leading stop word
  ptoken = ctoken; pstop = cstop;
  return ctoken;
 }

 //*-- true if the text is in the stop word list or has no alphanumeric character
 private boolean isStop(String text)
 { return stopWords.contains(text) || !wordPattern.matcher(text).matches(); }

 //*-- return a bigram token from the passed tokens: texts joined by "_", spanning
 //*-- from the start offset of the first to the end offset of the second
 protected Token createBigram(Token aToken, Token bToken)
 { String bigram = aToken.termText() + "_" + bToken.termText();
   return new Token(bigram, aToken.startOffset(), bToken.endOffset(), "<BIGRAM>");
 }
}