package cc.util;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.util.Function;
/**
 * Splits plain text into tokenized sentences. Based heavily on
 * edu.stanford.nlp.process.DocumentPreprocessor.
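 * <p>Typical usage (with the default PTB tokenizer; each returned sentence is
 * a string of space-joined tokens, so punctuation is separated from the
 * preceding word):
 * <pre>{@code
 * SentenceTokeniser tokeniser = new SentenceTokeniser();
 * for (String sentence : tokeniser.extractSentences("One sentence. Another!")) {
 *     System.out.println(sentence); // prints "One sentence ." then "Another !"
 * }
 * }</pre>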
*/
public class SentenceTokeniser implements Iterable<List<HasWord>> {
private Reader inputReader = null;
//Configurable options
private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.factory();
private String[] sentenceFinalPuncWords = {".", "?", "!"};
private Function<List<HasWord>,List<HasWord>> escaper = null;
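// when non-null, this exact token is used as the sole sentence boundary,
// overriding sentenceFinalPuncWords (see PlainTextIterator's constructor)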
private String sentenceDelimiter = null;
//From PTB conventions
private final String[] sentenceFinalFollowers = {")", "]", "\"", "\'", "''", "-RRB-", "-RSB-", "-RCB-"};
public static void main(String[] args) {
SentenceTokeniser sentenceTokeniser = new SentenceTokeniser();
for (String sentence : sentenceTokeniser.extractSentences("This is a sentence. And (do you know) So is this!")) {
System.out.println("next sentence [" + sentence + "]");
}
for (String sentence : sentenceTokeniser.extractSentences("Oh, and by the way.... guess what! so is this!")) {
System.out.println("next sentence [" + sentence + "]");
}
}
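/**
 * Tokenises {@code text} into sentences, returning each sentence as a single
 * string of space-joined tokens. Each call replaces the internal reader, so
 * instances are not safe for concurrent use.
 */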
public List<String> extractSentences(String text) {
this.inputReader = new StringReader(text);
List<String> sentences = new ArrayList<String>();
for (List<HasWord> sentence : this) {
StringBuilder sentenceBuffer = new StringBuilder();
for (HasWord word : sentence) {
if (sentenceBuffer.length() > 0) {
sentenceBuffer.append(' ');
}
sentenceBuffer.append(word.word());
}
sentences.add(sentenceBuffer.toString());
}
return sentences;
}
/**
 * Returns an iterator over the sentences of the current document. The
 * underlying reader is closed automatically when the end of the document is
 * reached; if iteration stops early, the caller must close the stream.
 */
public Iterator<List<HasWord>> iterator() {
return new PlainTextIterator();
}
private class PlainTextIterator implements Iterator<List<HasWord>> {
private Tokenizer<? extends HasWord> tokenizer;
private Set<String> sentDelims;
private Set<String> delimFollowers = new HashSet<String>(Arrays.asList(sentenceFinalFollowers));
private Function<String, String[]> splitTag;
private List<HasWord> nextSent = null;
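// first token of the following sentence, captured while detecting a boundary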
private List<HasWord> nextSentCarryover = new ArrayList<HasWord>();
public PlainTextIterator() {
// Establish how to find sentence boundaries
sentDelims = new HashSet<String>();
boolean eolIsSignificant = false;
if (sentenceDelimiter == null) {
if (sentenceFinalPuncWords != null) {
sentDelims = new HashSet<String>(Arrays.asList(sentenceFinalPuncWords));
}
} else {
sentDelims.add(sentenceDelimiter);
delimFollowers = new HashSet<String>();
eolIsSignificant = sentenceDelimiter.matches("\\s+");
}
// Setup the tokenizer
if (tokenizerFactory == null) {
tokenizer = WhitespaceTokenizer.newWordWhitespaceTokenizer(inputReader, eolIsSignificant);
} else {
if (eolIsSignificant) {
tokenizerFactory.setOptions("tokenizeNLs"); //wsg2010: this key is currently used across all tokenizers
}
tokenizer = tokenizerFactory.getTokenizer(inputReader);
}
}
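/**
 * Reads tokens until a sentence boundary is found, leaving the completed
 * sentence in nextSent. A token read past the boundary is stashed in
 * nextSentCarryover so that it opens the next sentence. Sets nextSent to
 * null and closes the reader once the input is exhausted.
 */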
private void primeNext() {
nextSent = new ArrayList<HasWord>(nextSentCarryover);
nextSentCarryover.clear();
boolean seenBoundary = false;
while (tokenizer.hasNext()) {
HasWord token = tokenizer.next();
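// splitTag is never assigned in this class (retained from
// DocumentPreprocessor's tagged-text support), so this branch is inert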
if (splitTag != null) {
String[] toks = splitTag.apply(token.word());
token.setWord(toks[0]);
if(toks.length == 2 && token instanceof HasTag) {
//wsg2011: Some of the underlying tokenizers return old
//JavaNLP labels. We could convert to CoreLabel here, but
//we choose a conservative implementation....
((HasTag) token).setTag(toks[1]);
}
}
if (sentDelims.contains(token.word())) {
seenBoundary = true;
} else if (seenBoundary && !delimFollowers.contains(token.word())) {
nextSentCarryover.add(token);
break;
}
// Whitespace-only tokens may mark a boundary but are never added to the
// sentence. (DocumentPreprocessor additionally filters
// PTBLexer.NEWLINE_TOKEN here; that check was disabled in this version.)
if (!token.word().matches("\\s+")) {
nextSent.add(token);
}
// If there are no words that can follow a sentence delimiter,
// then there are two cases. In the first case we already have a
// sentence, in which case there is no reason to look at the
// next token, since that just causes buffering without any
// chance of the current sentence being extended, since
// delimFollowers = {}. In the other case, we have an empty
// sentence, which at this point means the sentence delimiter
// was a whitespace token such as \n. We might as well keep
// going as if we had never seen anything.
if (seenBoundary && delimFollowers.size() == 0) {
if (nextSent.size() > 0) {
break;
} else {
seenBoundary = false;
}
}
}
if (nextSent.size() == 0 && nextSentCarryover.size() == 0) {
IOUtils.closeIgnoringExceptions(inputReader);
inputReader = null;
nextSent = null;
} else if (escaper != null) {
nextSent = escaper.apply(nextSent);
}
}
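// hasNext() and next() both lazily prime the next sentence, so either may
// be called first; priming happens at most once per sentence.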
public boolean hasNext() {
if (nextSent == null) {
primeNext();
}
return nextSent != null;
}
public List<HasWord> next() {
if (nextSent == null) {
primeNext();
}
if (nextSent == null) {
throw new NoSuchElementException();
}
List<HasWord> thisIteration = nextSent;
nextSent = null;
return thisIteration;
}
public void remove() { throw new UnsupportedOperationException(); }
}
}