}
// NOTE(review): this span is the interior of a larger tagging routine — the
// enclosing method signature and the matching finally body are outside this
// view (the finally presumably closes `writer`; confirm). Comments only below;
// code is unchanged.
BufferedWriter writer = null;
try {
// Load the trained tagger model named in the config.
// NOTE(review): `tagger` is not referenced again in this span; tagging below
// goes through static MaxentTagger.tagSentence(...), so construction likely
// initializes shared/static model state — confirm against MaxentTagger.
MaxentTagger tagger = new MaxentTagger(config.getModel(), config);
Timing t = new Timing();
String sentenceDelimiter = null;
final TokenizerFactory<? extends HasWord> tokenizerFactory; // initialized immediately below
// Choose the tokenizer: (1) a user-supplied factory class loaded reflectively,
// (2) the default PTB word tokenizer, or (3) whitespace tokenization with one
// sentence per input line (hence sentenceDelimiter = "\n").
if (config.getTokenize() && config.getTokenizerFactory().trim().length() != 0) {
// NOTE(review): unchecked cast; a configured factory class lacking a static
// newTokenizerFactory() method surfaces here as a reflection exception,
// caught by the broad catch at the bottom.
Class<TokenizerFactory<? extends HasWord>> clazz = (Class<TokenizerFactory<? extends HasWord>>) Class.forName(config.getTokenizerFactory().trim());
Method factoryMethod = clazz.getMethod("newTokenizerFactory");
tokenizerFactory = (TokenizerFactory<? extends HasWord>) factoryMethod.invoke(null);
} else if (config.getTokenize()){
tokenizerFactory = PTBTokenizerFactory.newWordTokenizerFactory(config.getTokenizerOptions());
} else {
tokenizerFactory = WhitespaceTokenizer.factory();
sentenceDelimiter = "\n";
}
final DocumentPreprocessor docProcessor = new DocumentPreprocessor(tokenizerFactory);
docProcessor.setEncoding(config.getEncoding());
//Counts
int numWords = 0;
int numSentences = 0;
// Output goes to the configured file when one is named, otherwise to stdout;
// both paths honor the configured character encoding.
String outFile = config.getOutputFile();
if (outFile.length() > 0) {
writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile), config.getEncoding()));
} else {
writer = new BufferedWriter(new OutputStreamWriter(System.out, config.getEncoding()));
}
// XML input is delegated to a separate code path; a single-element array
// containing the literal string "null" means "no XML input".
String[] xmlInput = config.getXMLInput();
if (xmlInput.length > 0) {
if(xmlInput.length > 1 || !xmlInput[0].equals("null")) {
tagFromXML(config);
// NOTE(review): early return relies on the (unseen) finally block to
// close `writer` — confirm it does.
return;
}
}
boolean stdin = config.getFile().trim().equalsIgnoreCase("stdin");
// In stdin mode this loop tags one line of typed input per iteration; in file
// mode it runs exactly once (see the `if (!stdin) break;` at the bottom).
while (true) {
//Now determine if we're tagging from stdin or from a file
BufferedReader br;
if (!stdin) {
br = IOUtils.readReaderFromString(config.getFile(), config.getEncoding());
} else {
System.err.println("Type some text to tag, then EOF.");
System.err.println(" (For EOF, use Return, Ctrl-D on Unix; Enter, Ctrl-Z, Enter on Windows.)");
// NOTE(review): no charset is passed here, so stdin is decoded with the
// platform default rather than config.getEncoding() — inconsistent with
// the file/output paths above; confirm whether intentional.
br = new BufferedReader(new InputStreamReader(System.in));
}
int outputStyle = PlainTextDocumentReaderAndWriter.asIntOutputFormat(config.getOutputFormat());
if (config.getSGML()) {
// this uses NER codebase technology to read/write SGML-ish files
PlainTextDocumentReaderAndWriter readerAndWriter = new PlainTextDocumentReaderAndWriter();
ObjectBank<List<CoreLabel>> ob = new ObjectBank<List<CoreLabel>>(new ReaderIteratorFactory(br), readerAndWriter);
PrintWriter pw = new PrintWriter(writer);
for (List<CoreLabel> sentence : ob) {
Sentence<CoreLabel> s = new Sentence<CoreLabel>(sentence);
numWords += s.length();
Sentence<TaggedWord> taggedSentence = MaxentTagger.tagSentence(s);
// Copy each predicted tag back onto the original CoreLabel tokens
// (positional pairing via parallel iteration) so printAnswers can emit
// them with the surrounding SGML markup intact.
Iterator<CoreLabel> origIter = sentence.iterator();
for (TaggedWord tw : taggedSentence) {
CoreLabel cl = origIter.next();
cl.set(CoreAnnotations.AnswerAnnotation.class, tw.tag());
}
readerAndWriter.printAnswers(sentence, pw, outputStyle, true);
}
} else {
//Now we do everything through the doc preprocessor
List<List<? extends HasWord>> document;
if ((config.getTagInside() != null && !config.getTagInside().equals(""))) {
// Only tag text inside the configured XML element(s).
document = docProcessor.getSentencesFromXML(br, config.getTagInside(), null, false);
} else if (stdin) {
// NOTE(review): br.readLine() returns null at EOF, which would make
// `new StringReader(null)` throw NPE — presumably caught by the broad
// catch below rather than exiting the loop cleanly; verify.
document = docProcessor.getSentencesFromText(new StringReader(br.readLine()));
} else {
document = docProcessor.getSentencesFromText(br, sentenceDelimiter);
}
for (List<? extends HasWord> sentence : document) {
numWords += sentence.size();
Sentence<TaggedWord> taggedSentence = MaxentTagger.tagSentence(sentence);
// Emit in the configured style: TSV, XML, or slash-tags (default).
if (outputStyle == PlainTextDocumentReaderAndWriter.OUTPUT_STYLE_TSV) {
writer.write(getTsvWords(taggedSentence));
} else if (outputStyle == PlainTextDocumentReaderAndWriter.OUTPUT_STYLE_XML) {
// numSentences is incremented after this call, so XML sentence ids
// are 0-based.
writeXMLSentence(writer, taggedSentence, numSentences);
} else { // if (outputStyle == PlainTextDocumentReaderAndWriter.OUTPUT_STYLE_SLASH_TAGS) {
writer.write(taggedSentence.toString(false));
writer.newLine();
}
// Interactive mode: blank line between results and an eager flush so the
// user sees output immediately after each typed line.
if (stdin) {
writer.newLine();
writer.flush();
}
numSentences++;
}
}
if (!stdin) break;
}
// Report throughput (words/sec) on stderr.
long millis = t.stop();
printErrWordsPerSec(millis, numWords);
} catch (Exception e) {
// Broad catch at the routine boundary: report and fall through to finally.
System.err.println("An error occurred while tagging.");
e.printStackTrace();
} finally {