// the sentence detector and tokenizer are built from their pre-trained
// models, which are loaded from the model files
SentenceDetectorME sdetector = new SentenceDetectorME(
new SentenceModel(new FileInputStream(
"models/en-sent.bin")));
Tokenizer tokenizer = new TokenizerME(new TokenizerModel(
new FileInputStream("models/en-token.bin")));
// the parser is built from its model plus a couple of
// beam-search options
/*
 * the older OpenNLP parser API did this instead:
 * boolean useTagDict = true;
 * boolean useCaseInsensitiveTagDict = false;
 * int beamSize = opennlp.tools.parser.chunking.Parser.defaultBeamSize;
 * double advancePercentage =
 *     opennlp.tools.parser.chunking.Parser.defaultAdvancePercentage;
 * opennlp.tools.parser.Parser parser = TreebankParser.getParser(
 *     "models/parser", useTagDict, useCaseInsensitiveTagDict, beamSize,
 *     advancePercentage);
 */
Parser parser = ParserFactory.create(new ParserModel(
new FileInputStream("models/en-parser-chunking.bin")),
AbstractBottomUpParser.defaultBeamSize,
AbstractBottomUpParser.defaultAdvancePercentage);
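// the beam size and advance percentage control how aggressively the
// bottom-up parser prunes its search; the AbstractBottomUpParser defaults
// used here are a reasonable starting point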
// break a paragraph into sentences
String[] sents = sdetector.sentDetect(paragraph.toString());
// TODO handle paragraph (multiple sentences)
String sent = sents[0];
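// one possible way to address the TODO above: run the tokenize/parse
// steps below once per detected sentence, e.g.
//   for (String s : sents) { /* tokenize and parse s */ }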
// separate brackets and parentheses from the surrounding text by putting a
// space on either side, so they are not confused with the bracketed
// tree notation in the parser's output
sent = untokenizedParenPattern1.matcher(sent).replaceAll("$1 $2");
sent = untokenizedParenPattern2.matcher(sent).replaceAll("$1 $2");
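// untokenizedParenPattern1 and untokenizedParenPattern2 are assumed to be
// precompiled java.util.regex.Pattern fields defined elsewhere in the class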
// get the tokenizer to break apart the sentence
String[] tokens = tokenizer.tokenize(sent);
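// tokens now holds the sentence split into individual word and
// punctuation tokens, in order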
// build a string to parse as well as a list of tokens
StringBuffer sb = new StringBuffer();
List<String> tokenList = new ArrayList<String>();
for (int j = 0; j < tokens.length; j++) {