package edu.stanford.nlp.process;
import java.io.Reader;
import java.io.Serializable;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
/** A tokenizer that works by calling a WordSegmenter.
* This is used for Chinese and Arabic.
*
* @author Galen Andrew
* @author Spence Green
*/
public class WordSegmentingTokenizer extends AbstractTokenizer<HasWord> {

  /** Iterator over the words of the most recently segmented token; null before first use. */
  private Iterator<HasWord> wordIter;
  /** Underlying tokenizer that supplies raw (whitespace-delimited) tokens. */
  private final Tokenizer<CoreLabel> tok;
  /** Segmenter applied to each raw token to split it into words. */
  private final WordSegmenter wordSegmenter;

  /**
   * Constructs a tokenizer that whitespace-tokenizes the Reader and then
   * runs the given segmenter over each whitespace-delimited token.
   *
   * @param segmenter Segmenter used to split each raw token into words
   * @param r Source of the text to tokenize
   */
  public WordSegmentingTokenizer(WordSegmenter segmenter, Reader r) {
    this(segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r));
  }

  /**
   * Constructs a tokenizer that draws raw tokens from the given tokenizer and
   * runs the given segmenter over each of them.
   *
   * @param segmenter Segmenter used to split each raw token into words
   * @param tokenizer Source of raw tokens to be segmented
   */
  public WordSegmentingTokenizer(WordSegmenter segmenter, Tokenizer<CoreLabel> tokenizer) {
    this.wordSegmenter = segmenter;
    this.tok = tokenizer;
  }

  /**
   * Returns the next segmented word, refilling the word iterator from the
   * underlying tokenizer as needed, or null when the input is exhausted.
   */
  @Override
  protected HasWord getNext() {
    // Pull raw tokens until the segmenter yields at least one word
    // (a segmenter may legitimately return an empty list for a token).
    while (wordIter == null || ! wordIter.hasNext()) {
      if ( ! tok.hasNext()) {
        return null;
      }
      CoreLabel token = tok.next();
      String s = token.word();
      if (s == null) {
        return null;
      }
      if (s.equals(WhitespaceLexer.NEWLINE)) {
        // if newlines were significant, we should make sure to return
        // them when we see them; pass the newline token through unsegmented
        List<HasWord> se = Collections.<HasWord>singletonList(token);
        wordIter = se.iterator();
      } else {
        List<HasWord> se = wordSegmenter.segment(s);
        wordIter = se.iterator();
      }
    }
    return wordIter.next();
  }

  /**
   * Returns a factory that produces WordSegmentingTokenizers using the given segmenter.
   *
   * @param wordSegmenter Segmenter shared by all tokenizers the factory creates
   * @return A serializable tokenizer factory
   */
  public static TokenizerFactory<HasWord> factory(WordSegmenter wordSegmenter) {
    return new WordSegmentingTokenizerFactory(wordSegmenter);
  }

  /** Factory for WordSegmentingTokenizers. Serializable so it can be stored with a model. */
  private static class WordSegmentingTokenizerFactory implements TokenizerFactory<HasWord>, Serializable {

    private static final long serialVersionUID = -4697961121607489828L;

    // Mutable option (see setOptions); whether newlines are kept as tokens.
    private boolean tokenizeNLs = false;
    private final WordSegmenter segmenter;

    public WordSegmentingTokenizerFactory(WordSegmenter wordSegmenter) {
      segmenter = wordSegmenter;
    }

    @Override
    public Iterator<HasWord> getIterator(Reader r) {
      return getTokenizer(r);
    }

    @Override
    public Tokenizer<HasWord> getTokenizer(Reader r) {
      return getTokenizer(r, null);
    }

    /**
     * Returns a tokenizer over the Reader, optionally overriding factory options.
     *
     * @param r Source of the text to tokenize
     * @param extraOptions Comma-separated options (currently only "tokenizeNLs");
     *                     null means use the factory's current settings
     */
    @Override
    public Tokenizer<HasWord> getTokenizer(Reader r, String extraOptions) {
      boolean tokenizeNewlines = this.tokenizeNLs;
      if (extraOptions != null) {
        Properties prop = StringUtils.stringToProperties(extraOptions);
        tokenizeNewlines = PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs);
      }
      return new WordSegmentingTokenizer(segmenter, WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(r, tokenizeNewlines));
    }

    @Override
    public void setOptions(String options) {
      Properties prop = StringUtils.stringToProperties(options);
      tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", tokenizeNLs);
    }

  } // end static class WordSegmentingTokenizerFactory

}