package org.sf.mustru.utils;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.regex.Matcher;
import org.apache.log4j.Logger;
import org.sf.mustru.search.SearchTools;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.sentences.IndoEuropeanSentenceModel;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
/**
* Tools from LingPipe <br>
*
* 1. Entity extraction: return a list of named entities from the passed text <br>
* 2. POS tagging: return a list of parts of speech for a text string <br>
* 3. Sentence extraction: return a list of sentences from a text chunk <br>
*/
public class LingpipeTools extends EntityTools
{
private static final TokenizerFactory TOKENIZER_FACTORY = new IndoEuropeanTokenizerFactory(); //*-- tokenizer to extract tokens and whitespace
private static final SentenceModel SENTENCE_MODEL = new IndoEuropeanSentenceModel(); //*-- sentence model for text
private HmmDecoder decoder = null;
private boolean qEntities = true; //*-- flag to mark question words with <Qword> tags in the annotated output
private static HashMap<String, String> phash = null;
static Logger logger = Logger.getLogger(LingpipeTools.class.getName() );
/**
* Create a LingpipeTools instance to run the LingPipe tagger and named entity extractor
*/
public LingpipeTools() { }
/**
* Setup LingPipe for POS tagging: read the HMM tagger model file specified in Constants and return true if the model was loaded successfully
*/
public boolean setforPOS()
{ return (setforPOS(null)); }
public boolean setforPOS(String[] additionalEntities)
{
logger.info("Reading POS tagger model from " + Constants.POS_TAGGER_MODEL);
ObjectInputStream oi = null; boolean loaded = false;
try
{ oi = new ObjectInputStream( new FileInputStream(Constants.POS_TAGGER_MODEL) );
HiddenMarkovModel hmm = (HiddenMarkovModel) oi.readObject();
decoder = new HmmDecoder(hmm);
setTagPosXref(); loaded = true;
}
catch (IOException ie ) { logger.error("setforPOS IO Error: could not read " + Constants.POS_TAGGER_MODEL + " : " + ie.getMessage() ); }
catch (ClassNotFoundException ce) { logger.error("setforPOS Class Error: " + ce.getMessage() ); }
finally { if (oi != null) { try { oi.close(); } catch (IOException ie) { /* ignore close failure */ } } }
if ( (additionalEntities != null) && (additionalEntities.length > 0) ) qEntities = true;
return loaded;
}
/**
* Annotate a sentence with tags for a select group of parts of speech (adjectives, nouns, and verbs)
* @param sentence text to be annotated with parts of speech
* @return annotated sentence
*/
public String getPOS(String sentence)
{ return getPOS(sentence, false); }
public String getPOS(String sentence, boolean allTags)
{
StringBuffer xmlOutput = new StringBuffer();
char[] cs = sentence.toCharArray();
Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(cs, 0, cs.length);
String[] tokens = tokenizer.tokenize();
String[] tags = decoder.firstBest(tokens); int len = tokens.length;
for (int i = 0; i < len; i++)
{
//*-- set the adjective tags (cardinal "cd" and ordinal "od" number tags are treated as adjectives here)
if (tags[i].startsWith("j") || tags[i].equals("cd") || tags[i].endsWith("od") )
{ xmlOutput.append(" <Adjective> "); xmlOutput.append(tokens[i]); xmlOutput.append(" </Adjective>"); }
//*-- next, the noun tags
else if ( tags[i].startsWith("n") )
{ xmlOutput.append(" <Noun> "); xmlOutput.append(tokens[i]); xmlOutput.append(" </Noun>"); }
//*-- finally, the verb tags; auxiliary verbs (be/do/have) carry Brown tags that do not start with "v" and are skipped
else if ( tags[i].startsWith("v") )
{ xmlOutput.append(" <Verb> "); xmlOutput.append(tokens[i]); xmlOutput.append(" </Verb>"); }
//*-- all other tags: emit the cross-referenced part of speech name when allTags is set
else if (allTags)
{ String tag = phash.get(tags[i]); if (tag == null) tag = tags[i];
xmlOutput.append("<" + tag + "> "); xmlOutput.append(tokens[i]); xmlOutput.append("</" + tag + "> "); }
//*-- otherwise pass the token through untagged
else
{ xmlOutput.append(" "); xmlOutput.append(tokens[i]); }
}
String out = xmlOutput.toString();
if (!qEntities) return out;
Matcher matcher = SearchTools.qwordPattern.matcher(out);
if (matcher.matches())
{ out = matcher.replaceFirst(matcher.group(1) + "<Qword>" + matcher.group(2) + "</Qword>" + matcher.group(3) ); }
return(out);
}
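/*
* Usage sketch (illustrative only; assumes Constants.POS_TAGGER_MODEL points at a valid
* serialized LingPipe HiddenMarkovModel file):
*
* LingpipeTools lt = new LingpipeTools();
* if (lt.setforPOS())
* { String tagged = lt.getPOS("The quick brown fox jumped over the lazy dog.");
* //*-- tagged would typically contain markup such as <Adjective> quick </Adjective>,
* //*-- <Noun> fox </Noun>, and <Verb> jumped </Verb> around the matching tokens
* }
*/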
/**
* Build the lists of tokens, whitespaces, and sentence boundary indices for the passed paragraph
* @param in paragraph of text to split into sentences
*/
public void buildSentences(String in)
{
//*-- extract the sentence boundaries
if (in.length() > Constants.DOC_LENGTH_MAXLIMIT) in = in.substring(0, Constants.DOC_LENGTH_MAXLIMIT - 1);
ArrayList<String> tokenList = new ArrayList<String>(); ArrayList<String> whiteList = new ArrayList<String>();
Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(in.toCharArray(), 0, in.length() );
tokenizer.tokenize(tokenList, whiteList);
tokens = new String[tokenList.size()]; tokenList.toArray(tokens);
whites = new String[whiteList.size()]; whiteList.toArray(whites);
sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens, whites);
int numPossibleSentences = sentenceBoundaries.length;
//*-- set a default sentence boundary if no sentence boundaries were found
if (numPossibleSentences < 1) { sentenceBoundaries = new int[1]; sentenceBoundaries[0] = tokens.length - 1; }
currentSentenceBoundary = 0; firstTime = true;
}
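/*
* Usage sketch (illustrative only): buildSentences() fills the token, whitespace, and
* sentence boundary arrays that are assumed to be inherited from EntityTools. Per the
* LingPipe SentenceModel contract, each boundary index marks the final token of a sentence.
*
* LingpipeTools lt = new LingpipeTools();
* lt.buildSentences("Dr. Smith arrived. He left at noon.");
* //*-- sentenceBoundaries[0] marks the last token of the first sentence and
* //*-- sentenceBoundaries[1] the last token of the second
*/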
/**
* Tokenize the passed text with the StandardBgramTokenizerFactory and return the array of tokens
*/
public String[] tokenizer(String in)
{
if (in.length() > Constants.DOC_LENGTH_MAXLIMIT) in = in.substring(0, Constants.DOC_LENGTH_MAXLIMIT - 1);
ArrayList<String> tokenList = new ArrayList<String>(); ArrayList<String> whiteList = new ArrayList<String>();
Tokenizer tokenizer = new StandardBgramTokenizerFactory().tokenizer(in.toCharArray(), 0, in.length() );
tokenizer.tokenize(tokenList, whiteList);
String[] tokens = new String[tokenList.size()]; tokenList.toArray(tokens);
return(tokens);
}
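/*
* Usage sketch (illustrative only; assumes StandardBgramTokenizerFactory is available in
* this package):
*
* LingpipeTools lt = new LingpipeTools();
* String[] toks = lt.tokenizer("information retrieval with open source tools");
* //*-- toks holds the token strings produced by the bigram tokenizer factory
*/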
/**
* Build the static hash map that cross references Brown corpus tags to broad part of speech names
*/
public static void setTagPosXref()
{
phash = new HashMap<String, String>();
phash.put ( "abl", "determiner/pronoun" );
phash.put ( "abn", "determiner/pronoun" );
phash.put ( "abx", "determiner/pronoun" );
phash.put ( "ap", "determiner/pronoun" );
phash.put ( "ap$", "determiner/pronoun" );
phash.put ( "be", "verb" );
phash.put ( "bed", "verb" );
phash.put ( "bedz", "verb" );
phash.put ( "beg", "verb" );
phash.put ( "bem", "verb" );
phash.put ( "ben", "verb" );
phash.put ( "ber", "verb" );
phash.put ( "bez", "verb" );
phash.put ( "cc", "conjunction" );
phash.put ( "cd", "numeral" );
phash.put ( "cd$", "numeral" );
phash.put ( "cs", "conjunction" );
phash.put ( "do", "verb" );
phash.put ( "dod", "verb" );
phash.put ( "doz", "verb" );
phash.put ( "dt", "determiner/pronoun" );
phash.put ( "dt$", "determiner/pronoun" );
phash.put ( "dt+bez", "determiner/pronoun" );
phash.put ( "dt+md", "determiner/pronoun" );
phash.put ( "dti", "determiner/pronoun" );
phash.put ( "dts", "determiner/pronoun" );
phash.put ( "dtx", "determiner" );
phash.put ( "ex", "existential there" );
phash.put ( "hv", "verb" );
phash.put ( "hv+to", "verb" );
phash.put ( "hvd", "verb" );
phash.put ( "hvg", "verb" );
phash.put ( "hvn", "verb" );
phash.put ( "hvz", "verb" );
phash.put ( "in", "preposition" );
phash.put ( "jj", "adjective" );
phash.put ( "jj$", "adjective" );
phash.put ( "jjr", "adjective" );
phash.put ( "jjs", "adjective" );
phash.put ( "jjt", "adjective" );
phash.put ( "md", "modal auxiliary" );
phash.put ( "nn", "noun" );
phash.put ( "nn$", "noun" );
phash.put ( "nn+bez", "noun" );
phash.put ( "nn+hvz", "noun" );
phash.put ( "nns", "noun" );
phash.put ( "nns$", "noun" );
phash.put ( "np", "noun" );
phash.put ( "np$", "noun" );
phash.put ( "np+bez", "noun" );
phash.put ( "nps", "noun" );
phash.put ( "nps$", "noun" );
phash.put ( "nr", "noun" );
phash.put ( "nr$", "noun" );
phash.put ( "nrs", "noun" );
phash.put ( "od", "numeral" );
phash.put ( "pn", "pronoun" );
phash.put ( "pn$", "pronoun" );
phash.put ( "pp$", "determiner" );
phash.put ( "pp$$", "pronoun" );
phash.put ( "ppl", "pronoun" );
phash.put ( "ppls", "pronoun" );
phash.put ( "ppo", "pronoun" );
phash.put ( "pps", "pronoun" );
phash.put ( "pps+bez", "pronoun" );
phash.put ( "pps+hvd", "pronoun" );
phash.put ( "pps+hvz", "pronoun" );
phash.put ( "pps+md", "pronoun" );
phash.put ( "ppss", "pronoun" );
phash.put ( "ppss+bem", "pronoun" );
phash.put ( "ppss+ber", "pronoun" );
phash.put ( "ppss+hv", "pronoun" );
phash.put ( "ppss+hvd", "pronoun" );
phash.put ( "ppss+md", "pronoun" );
phash.put ( "ql", "qualifier" );
phash.put ( "qlp", "qualifier" );
phash.put ( "rb", "adverb" );
phash.put ( "rb+bez", "adverb+verb" );
phash.put ( "rbr", "adverb" );
phash.put ( "rbt", "adverb" );
phash.put ( "rn", "adverb" );
phash.put ( "rp", "adverb" );
phash.put ( "to", "infinitival to" );
phash.put ( "uh", "interjection" );
phash.put ( "vb", "verb" );
phash.put ( "vbd", "verb" );
phash.put ( "vbg", "verb" );
phash.put ( "vbn", "verb" );
phash.put ( "vbz", "verb" );
phash.put ( "wdt", "determiner" );
phash.put ( "wdt+bez", "determiner+verb" );
phash.put ( "wp$", "pronoun" );
phash.put ( "wpo", "pronoun" );
phash.put ( "wps", "pronoun" );
phash.put ( "wps+bez", "pronoun" );
phash.put ( "wps+hvd", "pronoun" );
phash.put ( "wps+hvz", "pronoun" );
phash.put ( "wps+md", "pronoun" );
phash.put ( "wql", "qualifier" );
phash.put ( "wrb", "adverb" );
phash.put ( "wrb+ber", "adverb + verb" );
}
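/*
* Sketch of how the cross reference is used in getPOS() when allTags is set (the map is
* private, so these lookups are conceptual):
*
* setTagPosXref();
* //*-- phash.get("jj") -> "adjective"
* //*-- phash.get("md") -> "modal auxiliary"
* //*-- phash.get("ppo") -> "pronoun"
* //*-- unknown tags fall back to the raw Brown tag name
*/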
public void releaseResources()
{ decoder = null; }
}
/**
* Package-private helper class to compare two strings by length in descending order
*/
class StringLenComparator implements Comparator<String>
{
public int compare (String o1, String o2)
{ if (o1 == null || o2 == null) return(0);
//*-- longer strings sort first; the difference of two non-negative lengths cannot overflow
return (o2.length() - o1.length());
}
}
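/*
* Usage sketch for StringLenComparator (illustrative only): sort a list of strings so that
* longer strings come first.
*
* java.util.List<String> words = java.util.Arrays.asList("a", "abcd", "ab");
* java.util.Collections.sort(words, new StringLenComparator());
* //*-- words is now ordered [ "abcd", "ab", "a" ]
*/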