/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.chunker.ae.adjuster;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.ctakes.typesystem.type.syntax.Chunk;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.uimafit.component.JCasAnnotator_ImplBase;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.util.JCasUtil;
/**
* UIMA annotator that uses a pattern and a rule about that pattern to adjust
* certain annotations.
*
* The original reason for this annotator is to extend NP annotations to include
* prepositional phrases so that for the pattern NP PP NP, named entities that
* includes a word(s) from each of those NPs is found.
*
* Searches for the pattern within each Sentence. The end offset of the first
* chunk in the pattern is extended to match the end offset of the last chunk in
* the pattern.
*
* Note the pattern is applied repeatedly so that a sentence of NP PP NP PP NP
* results in only the first NP being extended all the way to the last NP in
* that sentence. This prevents NP annotations from only partially overlapping
* other NP annotations.
*
* This annotator is written to be able to handle more general cases than NP PP
* NP.
*
*
*/
public class ChunkAdjuster extends JCasAnnotator_ImplBase {
/**
* The pattern of chunks that trigger an adjustment.
*
*/
public static final String PARAM_CHUNK_PATTERN = "ChunkPattern";
@ConfigurationParameter(
name = PARAM_CHUNK_PATTERN,
mandatory = true,
description = "The pattern of chunks that trigger an adjustment"
)
private String[] chunksTypesInPattern;
/**
* The index of the token (within the pattern) to extend the end offset to
* include. E.g. is 2 to extend the first NP to include the last NP in NP PP
* NP.
*/
public static final String PARAM_EXTEND_TO_INCLUDE_TOKEN = "IndexOfTokenToInclude";
@ConfigurationParameter(
name = PARAM_EXTEND_TO_INCLUDE_TOKEN,
mandatory = true,
description = "The index of the token in the pattern to extend to the end offset"
)
private int indexOfTokenToInclude;
// TODO Consider adding a parameter for the type of annotation to look for
// pattern within, instead of always Sentence
// LOG4J logger based on class name
private Logger logger = Logger.getLogger(getClass().getName());
/**
* Performs initialization logic. This implementation just reads values for
* the configuration parameters. This method is not invoked for every
* document processed.
*
* @see org.apache.uima.analysis_engine.annotator.BaseAnnotator#initialize(AnnotatorContext)
*/
@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException {
super.initialize(aContext);
configInit();
}
/**
* Sets configuration parameters with values from the descriptor.
*/
private void configInit() throws ResourceInitializationException {
// TODO Consider validating values in pattern to type system
if (indexOfTokenToInclude < 0
|| indexOfTokenToInclude >= chunksTypesInPattern.length) {
// "The value "{0}" is not valid for the {1} parameter."
String msgArgs[] = { Integer.toString(indexOfTokenToInclude),
PARAM_EXTEND_TO_INCLUDE_TOKEN };
throw new ResourceInitializationException(
AnnotatorConfigurationException.PARAMETER_NOT_VALID,
msgArgs);
}
}
/**
* Invokes this annotator's analysis logic. Invoked for each document
* processed. For each Sentence, look for the pattern, and adjust a chunk if
* the pattern is found.
*/
@Override
public void process(JCas jcas)
throws AnalysisEngineProcessException {
logger.info(" process(JCas)");
try {
Collection<Sentence> sentences = JCasUtil.select(jcas, Sentence.class);
for(Sentence sentence : sentences){
annotateSentence(jcas, sentence);
}
} catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
}
protected void annotateSentence(JCas jcas, Sentence sent) throws AnalysisEngineProcessException{
List<Chunk> chunkList = new ArrayList<>(JCasUtil.selectCovered(jcas, Chunk.class, sent));
// For each chunk in the Sentence, see if the chunk is the start of a
// matching pattern
// If so, extend the end offset of the <code>i</code> +
// <code>indexOfTokenToInclude</code>
for (int i = 0; i < chunkList.size(); i++) {
boolean matches = true;
Chunk chunk = chunkList.get(i);
while (matches == true) {
matches = compareToPattern(chunkList, i);
if (matches) {
extendChunk(chunk, chunkList.get(i + indexOfTokenToInclude)
.getEnd());
removeEnvelopedChunks(chunkList, i); // to check again on next
// iteration of while loop
}
}
}
}
/**
* A utility method that annotates a given range.
*/
protected void annotateRange(JCas jcas, int rangeBegin,
int rangeEnd)
throws AnalysisEngineProcessException {
// logger.info("Adjuster: from " + rangeBegin + " to " + rangeEnd);
// Find the Chunks in this Sentence
// For each Chunk, there is a corresponding more specific such as NP,
// PP, etc
List<Chunk> chunkList = new ArrayList<>(JCasUtil.selectCovered(jcas, Chunk.class, rangeBegin, rangeEnd));
// For each chunk in the Sentence, see if the chunk is the start of a
// matching pattern
// If so, extend the end offset of the <code>i</code> +
// <code>indexOfTokenToInclude</code>
for (int i = 0; i < chunkList.size(); i++) {
boolean matches = true;
Chunk chunk = chunkList.get(i);
while (matches == true) {
matches = compareToPattern(chunkList, i);
if (matches) {
extendChunk(chunk, chunkList.get(i + indexOfTokenToInclude)
.getEnd());
removeEnvelopedChunks(chunkList, i); // to check again on next
// iteration of while loop
}
}
}
}
/**
* Remove from our local list of chunks the chunks that have been enveloped.
* This allows the rule to be applied again.
*
*/
private void removeEnvelopedChunks(List<Chunk> list, int i) {
for (int j = 0; j < indexOfTokenToInclude; j++) {
list.remove(i + 1);
// if (false)
// logger.info("removed '" + chunk.getCoveredText() + "'");
}
}
/**
* Compares the chunks at index i to the 1st element on the pattern, i+1 to
* the 2nd element, etc and returns true if the chunks starting at i fit the
* pattern
*
* @param list
* the list of chunks
* @param i
* the position within the list to compare to the pattern
* @return true if the pattern is matched by the chunks starting with
* element <code>i</code> in the list. Note if there aren't enough
* chunks in the list starting at i to match the pattern, returns
* false.
* @throws AnnotatorProcessException
*/
private boolean compareToPattern(List<Chunk> list, int i)
{
boolean match = true;
int len = list.size();
for (int j = 0; j < chunksTypesInPattern.length; j++) {
if (i + j >= len
|| !list.get(i + j).getChunkType()
.equals(chunksTypesInPattern[j])) {
match = false; // some part of pattern doesn't match chunks
// starting at i
break;
}
}
return match;
}
/**
* Update the end value for the chunk to have the new value
*
* @param chunk
* The chunk to update
* @param newEnd
* The new end value for the chunk.
* @return The updated Chunk
* @throws AnnotatorProcessException
*/
private static Chunk extendChunk(Chunk chunk, int newEnd)
throws AnalysisEngineProcessException {
if (newEnd < chunk.getBegin()) {
Exception e;
e = new Exception("New end offset (" + newEnd
+ ") < begin offset (" + chunk.getBegin() + ").");
throw new AnalysisEngineProcessException(e);
}
// logger.info("Extending chunk end from " +chunk.getEnd()+ " to " +
// newEnd + ".");
// logger.info("For chunk " + chunk.getChunkType());
// logger.info(" text = '" + chunk.getCoveredText() + "'.");
chunk.setEnd(newEnd);
// logger.info(" new text = '" + chunk.getCoveredText() + "'.");
return chunk;
}
public static AnalysisEngineDescription createAnnotatorDescription(String[] chunkPattern, int patternIndex) throws ResourceInitializationException{
return AnalysisEngineFactory.createPrimitiveDescription(ChunkAdjuster.class,
ChunkAdjuster.PARAM_CHUNK_PATTERN,
chunkPattern,
ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
patternIndex);
}
}