/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.core.ae;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.uimafit.component.JCasAnnotator_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.util.JCasUtil;
/**
 * UIMA annotator that tokenizes document text based on Penn Treebank rules.
 * Tokenization is performed one sentence at a time, so Sentence annotations
 * must already exist in the CAS.
 *
 * @author Mayo Clinic
 */
public class TokenizerAnnotatorPTB extends JCasAnnotator_ImplBase
{
// LOG4J logger based on class name
private Logger logger = Logger.getLogger(getClass().getName());
/**
 * Value is "SegmentsToSkip". This parameter specifies which segments to skip
 * during tokenization. It is an optional, multi-valued String parameter.
 */
public static final String PARAM_SEGMENTS_TO_SKIP = "SegmentsToSkip";
@ConfigurationParameter(
name = PARAM_SEGMENTS_TO_SKIP,
mandatory = false,
description = "Set of segments that can be skipped"
)
private String[] skipSegmentsArray;
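// Example (a sketch; the segment id "history" is hypothetical): the parameter
// can be supplied through uimaFIT when building the engine description, e.g.
//   AnalysisEngineDescription desc = AnalysisEngineFactory.createPrimitiveDescription(
//           TokenizerAnnotatorPTB.class,
//           TokenizerAnnotatorPTB.PARAM_SEGMENTS_TO_SKIP, new String[] { "history" });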
private Set<String> skipSegmentsSet;
private TokenizerPTB tokenizer;
private int tokenCount = 0;
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
logger.info("Initializing " + this.getClass().getName());
tokenizer = new TokenizerPTB();
skipSegmentsSet = new HashSet<>();
if(skipSegmentsArray != null){
Collections.addAll(skipSegmentsSet, skipSegmentsArray);
}
}
/**
* Entry point for processing.
*/
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
logger.info("process(JCas) in " + this.getClass().getName());
tokenCount = 0;
Collection<Segment> segments = JCasUtil.select(jcas, Segment.class);
for(Segment sa : segments){
String segmentID = sa.getId();
if (!skipSegmentsSet.contains(segmentID)) {
annotateRange(jcas, sa.getBegin(), sa.getEnd());
}
}
}
static final char CR = '\r';
static final char LF = '\n';
/**
 * Tokenizes a range of text, adding the tokens to the CAS index.
 * Tokenization is done one sentence at a time, so only text covered by a
 * Sentence annotation is tokenized; Sentence annotations must have been
 * created beforehand for this method to produce any tokens. Newlines and
 * carriage returns, which fall outside sentences, are annotated separately
 * as NewlineTokens.
 * @throws AnalysisEngineProcessException if the tokenizer returns an object that cannot be cast to BaseToken
 */
protected void annotateRange(JCas jcas, int rangeBegin, int rangeEnd) throws AnalysisEngineProcessException {
// tokenCount is a field rather than a local variable because this method can be
// called once per segment and token numbers must continue across calls
// First look for all newlines and carriage returns (which are not contained within sentences)
String docText = jcas.getDocumentText();
for (int i = rangeBegin; i<rangeEnd; i++) {
if (docText.charAt(i)==CR) {
NewlineToken nta;
if (i+1<rangeEnd && docText.charAt(i+1)==LF) {
// single NewlineToken for the 2 characters
nta = new NewlineToken(jcas, i, i+2);
i++; // skip past the LF
} else {
nta = new NewlineToken(jcas, i, i+1);
}
nta.addToIndexes();
} else if (docText.charAt(i)==LF) {
NewlineToken nta = new NewlineToken(jcas, i, i+1);
nta.addToIndexes();
}
}
// Now process each sentence
Collection<Sentence> sentences = JCasUtil.select(jcas, Sentence.class);
// Tokenize each sentence, adding the tokens to the cas index
for(Sentence sentence : sentences){
if (sentence.getBegin() < rangeBegin || sentence.getEnd() > rangeEnd) {
continue;
}
List<?> tokens = tokenizer.tokenizeTextSegment(jcas, sentence.getCoveredText(), sentence.getBegin(), true);
for (Object bta : tokens) {
if (bta == null) {
logger.error("bta==null tokenCount=" + tokenCount + " tokens.size()==" + tokens.size());
} else if (bta instanceof BaseToken) {
// add the BaseToken to the CAS index; tokenNumber is assigned below, in offset order
((BaseToken) bta).addToIndexes();
} else {
throw new AnalysisEngineProcessException("Token returned cannot be cast as BaseToken", new Object[]{bta});
}
}
}
// Now assign tokenNumber to each token in this range, in offset order
// (JCasUtil.select iterates the annotation index, which is sorted by offset)
Collection<BaseToken> tokens = JCasUtil.select(jcas, BaseToken.class);
for(BaseToken bta : tokens){
if (bta.getBegin()>=rangeBegin && bta.getBegin()<rangeEnd) {
bta.setTokenNumber(tokenCount);
tokenCount++;
}
}
}
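/**
 * Creates a default analysis engine description for this annotator.
 * A minimal usage sketch, assuming a JCas that already contains Segment and
 * Sentence annotations (SimplePipeline is org.uimafit.pipeline.SimplePipeline):
 * <pre>
 * AnalysisEngineDescription tokenizer = TokenizerAnnotatorPTB.createAnnotatorDescription();
 * SimplePipeline.runPipeline(jcas, tokenizer);
 * </pre>
 */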
public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
return AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class);
}
}
// Pseudocode for PTB-style tokenization (see TokenizerPTB for the implementation):
// ** find the first non-whitespace character or the first newline; this starts the first token
// if EOF is reached before the next newline or non-whitespace character, done.
// if there is nothing but whitespace before the newline, create a NewlineToken
// (the first NewlineToken gets tokenNumber = 0)
// repeat until a non-whitespace character is found; this starts the first BaseToken that is
// not a NewlineToken, and it will have tokenNumber = (#NewlineTokens + 0)
// once the start of a token (other than a NewlineToken) is found, find its end as follows:
// if char 2 is whitespace or there is no char 2 (EOF), then token length = 1; go back to looking for non-whitespace (**)
// if the 1st char of the token is "." (period), it could be a number or an ellipsis: CALL startsWithPeriod
// if the 1st char of the token is "'" (apostrophe), it could be the start of a name: CALL startsWithApostrophe
// if the 1st char of the token is a dash/minus sign, it could be a number: CALL startsWithMinusSign
// if the 1st char of the token is any other punctuation, it is a one-char token; go back to looking for non-whitespace (**)
// if the 1st char of the token is alphanumeric, then follow these rules:
// stop when hitting whitespace or EOF
// include any alphanumeric characters
// if a hyphen/dash/minus is hit:
// -- if it is part of the hyphen list,
// then don't stop ****
// else stop: the end of the token (and the start of the next) has been found.
// It can't be a minus sign, since it wasn't the first character,
// and if it's a dash, it should be a separate token,
// so if it wasn't part of the hyphen list, stop.
// include a comma only as part of a number (if all other characters are digits, commas, or a single period)
// include a period only if it is
// --- part of an abbreviation
// --- part of a number (if all other characters are digits, commas, or a single period)
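// A minimal sketch of the comma/period-in-number rule above (illustrative only;
// not the actual TokenizerPTB implementation):
//   static boolean isNumberLike(String s) {
//       int periods = 0;
//       for (char c : s.toCharArray()) {
//           if (c == '.') { periods++; }                         // at most one period allowed
//           else if (c != ',' && !Character.isDigit(c)) { return false; }
//       }
//       return periods <= 1;                                     // "1,234.56" passes, "1.2.3" fails
//   }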
// From http://www.seas.upenn.edu/~jmott/2009_addendum.pdf
// modified by Task1.4.4_adoptedConventions[AL]_Feb28_2011.doc
// All strings separated by white space are treated as separate tokens.
// Also, no token can contain white space.
// Most hyphenated words are split into multiple tokens.
// Hyphenated interjections and affixes in the list below are not split into multiple tokens.
// All other punctuation not described above triggers a break in tokenization,
// with the exceptions outlined below.
// Note that for present purposes, all non-alphanumeric characters
// are considered 'punctuation'.
// The tokenization of punctuation in webtext is determined by whitespace boundaries.
// The following is the list of punctuation exceptions that do not cause end of token:
// Periods marking abbreviations.
// Punctuation in web addresses.
// - URLs or email addresses
// Ellipses, when encoded as a string of periods.
// Complex numerals.
// Telephone numbers and postal codes.
// Single quotation marks as parts of names.
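// Illustration of the exceptions above (expected behavior under these rules,
// not verified tokenizer output): in
//   "Dr. O'Brien dialed 507-555-0100; see http://example.com ..."
// "Dr." (abbreviation), "O'Brien" (quotation mark in a name), "507-555-0100"
// (telephone number), "http://example.com" (web address), and "..." (ellipsis)
// would each stay a single token, while ";" would split off as its own token.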
// List of hyphenated interjections and affixes that are not split into multiple tokens.
// For example, uh-oh and e-mail are each a single token (see the sketch after this list).
// e-
// a-
// u-
// x-
// agro-
// ante-
// anti-
// arch-
// be-
// bi-
// bio-
// co-
// counter-
// cross-
// cyber-
// de-
// eco-
// -esque
// -ette
// ex-
// extra-
// -fest
// -fold
// -gate
// inter-
// intra-
// -itis
// -less
// macro-
// mega-
// micro-
// mid-
// mini-
// mm-hm
// mm-mm
// -most
// multi-
// neo-
// non-
// o-kay
// -o-torium
// over-
// pan-
// para-
// peri-
// post-
// pre-
// pro-
// pseudo-
// quasi-
// -rama
// re-
// semi-
// sub-
// super-
// tri-
// uh-huh
// uh-oh
// ultra-
// un-
// uni-
// vice-
// -wise
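// Sketch of how the list above might be consulted mid-token (an assumed data
// structure, not necessarily what TokenizerPTB actually does): hold the affixes
// in a Set and, on reaching a hyphen inside a candidate token, keep scanning
// only when the prefix (with its trailing hyphen) or the whole word is listed:
//   static final Set<String> HYPHEN_EXCEPTIONS = new HashSet<String>(Arrays.asList(
//           "e-", "anti-", "co-", "pre-", "-esque", "-fold", "uh-oh", "mm-hm" /* etc. */));
//   // e.g. HYPHEN_EXCEPTIONS.contains(prefixSoFar + "-") decides whether to keep scanning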