/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.coreference.ae;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.ctakes.coreference.eval.helpers.Span;
import org.apache.ctakes.coreference.util.FSIteratorToList;
import org.apache.ctakes.coreference.util.MarkableTreeUtils;
import org.apache.ctakes.typesystem.type.syntax.Chunk;
import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
import org.apache.ctakes.coreference.type.DemMarkable;
import org.apache.ctakes.coreference.type.Markable;
import org.apache.ctakes.coreference.type.NEMarkable;
/**
 * Annotator that post-processes the {@link Markable} annotations already present in the CAS.
 * <p>
 * In order, {@link #process(JCas)}:
 * <ol>
 * <li>removes {@link NEMarkable}s whose covered text is exactly "history of" (case-insensitive);</li>
 * <li>moves the begin of each markable to the begin of its enclosing NP/NML node
 * (skipping a leading possessive pronoun);</li>
 * <li>extends markables that exactly cover an NP to the parent NP of an {@code NP -> NP PP} construction;</li>
 * <li>widens NE markables sitting on a JJ node to the first non-JJ ancestor;</li>
 * <li>removes markables that duplicate the span of another markable.</li>
 * </ol>
 * <p>
 * Begin/end offsets are the sort keys of the UIMA annotation index, so every offset change
 * goes through {@link #moveSpan(Annotation, int, int)}, which takes the annotation out of the
 * indexes while it is being modified. For the same reason the indexes are copied into a list
 * before any annotation obtained from them is changed.
 */
public class MipacqMarkableExpander extends JCasAnnotator_ImplBase {

    /**
     * Runs the clean-up and expansion steps listed in the class comment on every markable of the CAS.
     *
     * @param aJCas the CAS view to process
     * @throws AnalysisEngineProcessException declared by the framework contract; not thrown directly here
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        // removeDoctors(aJCas);
        removeHistoryOf(aJCas);
        FSIterator<Annotation> iter = aJCas.getJFSIndexRepository().getAnnotationIndex(Markable.type).iterator();
        expandToNP(aJCas, FSIteratorToList.convert(iter));
        mergeNP(aJCas);
        elevateAdjectives(aJCas);
        iter = aJCas.getJFSIndexRepository().getAnnotationIndex(Markable.type).iterator();
        rmDup(aJCas, FSIteratorToList.convert(iter));
    }

    /**
     * Removes every {@link NEMarkable} whose covered text is "dr" (case-insensitive).
     * Currently not called: the invocation in {@link #process(JCas)} is commented out.
     */
    private void removeDoctors(JCas jCas) {
        removeNEMarkablesWithText(jCas, "dr");
    }

    /**
     * Removes every {@link NEMarkable} whose covered text is "history of" (case-insensitive).
     */
    private void removeHistoryOf(JCas jCas) {
        removeNEMarkablesWithText(jCas, "history of");
    }

    /**
     * Removes from the indexes all {@link NEMarkable}s whose covered text equals {@code text},
     * ignoring case.
     *
     * @param jCas the CAS view to clean
     * @param text the exact covered text that marks an annotation for removal
     */
    private void removeNEMarkablesWithText(JCas jCas, String text) {
        FSIterator<Annotation> iter = jCas.getAnnotationIndex(NEMarkable.type).iterator();
        // Collect first, remove afterwards: the index must not change while it is being iterated.
        ArrayList<Annotation> rm = new ArrayList<Annotation>();
        while (iter.hasNext()) {
            Annotation m = iter.next();
            if (text.equalsIgnoreCase(m.getCoveredText())) {
                rm.add(m);
            }
        }
        for (Annotation a : rm) {
            a.removeFromIndexes();
        }
    }

    /**
     * Changes the offsets of an annotation. The annotation is removed from the indexes before
     * the change and added back afterwards, because begin and end are keys of the annotation
     * index and must not be modified while the annotation is indexed.
     *
     * @param annotation the annotation to modify
     * @param begin      the new begin offset
     * @param end        the new end offset
     */
    private static void moveSpan(Annotation annotation, int begin, int end) {
        if (annotation.getBegin() == begin && annotation.getEnd() == end) {
            return; // nothing to do, avoid needless index churn
        }
        annotation.removeFromIndexes();
        annotation.setBegin(begin);
        annotation.setEnd(end);
        annotation.addToIndexes();
    }

    /**
     * Locale-independent, case-insensitive prefix test.
     */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Moves the begin of each markable to the begin of the closest enclosing NP or NML node,
     * provided the markable ends where that node ends. A leading "his ", "her ", "its " or
     * "their " in the node text is left outside the markable.
     *
     * @param aJCas     the CAS view holding the parse tree
     * @param markables the markables to expand; must be a copy of the index, not a live iterator,
     *                  because the annotations are re-indexed while this list is traversed
     */
    private void expandToNP (JCas aJCas, LinkedList<Annotation> markables) {
        for (Annotation m : markables) {
            TreebankNode node = MarkableTreeUtils.markableNode(aJCas, m.getBegin(), m.getEnd());
            // climb until an NP/NML node is found or the root has been passed
            while (node != null && !("NP".equals(node.getNodeType()) || "NML".equals(node.getNodeType()))) {
                node = node.getParent();
            }
            if (node == null) {
                continue;
            }
            // do not expand into a coordination of the form "X CC Y"
            if (node.getChildren().size() == 3 && "CC".equals(node.getChildren(1).getNodeType())) {
                continue;
            }
            // only expand if markable is at end of node.
            // the following expression is more complicated however due to the way that
            // NP sentences are annotated (and thus parsed). The NP will technically include the period so we will allow the node an
            // extra character. This should allow a period but nothing else.
            if (node.getEnd() - 1 > m.getEnd()) {
                continue;
            }
            String text = node.getCoveredText();
            int newBegin;
            if (startsWithIgnoreCase(text, "his ")
                    || startsWithIgnoreCase(text, "her ")
                    || startsWithIgnoreCase(text, "its ")) {
                newBegin = node.getBegin() + 4;
            } else if (startsWithIgnoreCase(text, "their ")) {
                newBegin = node.getBegin() + 6;
            } else if (!text.matches("^\\s*$")) { // some lookupwindows start w/ multiple linefeeds in the pitt data
                newBegin = node.getBegin();
            } else {
                continue;
            }
            // never produce an annotation whose begin lies behind its end
            if (newBegin > m.getEnd()) {
                continue;
            }
            moveSpan(m, newBegin, m.getEnd());
        }
    }

    /**
     * Returns the first annotation of {@code l} that lies completely inside chunk {@code c}.
     * The scan stops at the first annotation that neither lies inside the chunk nor begins
     * before the chunk's end.
     *
     * @param c the chunk to test against
     * @param l candidate annotations; NOTE(review): the early exit presumably relies on the list
     *          being ordered by begin offset - confirm with callers
     * @return the first contained annotation, or {@code null} if there is none
     */
    private Annotation containsAny (Chunk c, LinkedList<Annotation> l) {
        int a = c.getBegin();
        int b = c.getEnd();
        for (Annotation ne : l) {
            if (a <= ne.getBegin() && b >= ne.getEnd()) {
                return ne;
            } else if (ne.getBegin() >= b) {
                return null;
            }
        }
        return null;
    }

    /**
     * Merges {@code NP# -> NP' PP}, where NP' is marked as a Markable, by making NP# the markable:
     * the end of the markable is moved to the end of the parent NP.
     *
     * @param jcas the CAS view holding the parse tree and the markables
     */
    private void mergeNP (JCas jcas) {
        Map<Integer,TreebankNode> innerMap = null;
        // mark the boundaries of every NP: begin offset -> (end offset -> node)
        FSIterator<Annotation> nodeIter = jcas.getAnnotationIndex(TreebankNode.type).iterator();
        HashMap<Integer,Map<Integer,TreebankNode>> npMap = new HashMap<Integer,Map<Integer,TreebankNode>>();
        while (nodeIter.hasNext()) {
            TreebankNode node = (TreebankNode) nodeIter.next();
            if ("NP".equals(node.getNodeType())) {
                innerMap = npMap.get(node.getBegin());
                if (innerMap == null) {
                    innerMap = new HashMap<Integer,TreebankNode>();
                    npMap.put(node.getBegin(), innerMap);
                }
                innerMap.put(node.getEnd(), node);
            }
        }
        // now check if any of the markables share the exact same boundaries.
        // Work on a copy of the index because matching markables are re-indexed below.
        LinkedList<Annotation> markables =
                FSIteratorToList.convert(jcas.getAnnotationIndex(Markable.type).iterator());
        for (Annotation nem : markables) {
            innerMap = npMap.get(nem.getBegin());
            if (innerMap == null) {
                continue;
            }
            TreebankNode node = innerMap.get(nem.getEnd());
            if (node == null) {
                continue;
            }
            // found one! Check if it has an NP parent and a PP sibling:
            TreebankNode parent = node.getParent();
            if (parent == null) {
                continue; // the NP has no parent, so there is nothing to merge into
            }
            if (parent.getChildren().size() == 2
                    && parent.getChildren(0) == node
                    && "NP".equals(parent.getNodeType())
                    && "PP".equals(parent.getChildren(1).getNodeType())) {
                // Jackpot! Expand the original markable to be the whole NP -> NP PP construction.
                moveSpan(nem, nem.getBegin(), parent.getEnd());
            }
        }
    }

    /*
     * JJ modifiers are often tagged as NEs in addition to the NP phrase which they are a part of when
     * it's difficult to conceive of the modifier being referred to independently of the whole phrase:
     * surgical procedures
     */
    /**
     * Widens every {@link NEMarkable} whose tree node is a JJ to the span of the first ancestor
     * that is not a JJ. Markables without a tree node, or without a non-JJ ancestor, are left
     * untouched.
     *
     * @param jcas the CAS view holding the parse tree and the markables
     */
    private void elevateAdjectives(JCas jcas){
        // Work on a copy of the index because matching markables are re-indexed below.
        LinkedList<Annotation> markables =
                FSIteratorToList.convert(jcas.getAnnotationIndex(NEMarkable.type).iterator());
        for (Annotation mark : markables) {
            TreebankNode node = MarkableTreeUtils.markableNode(jcas, mark.getBegin(), mark.getEnd());
            if (node == null || !"JJ".equals(node.getNodeType())) {
                continue;
            }
            while (node != null && "JJ".equals(node.getNodeType())) {
                node = node.getParent();
            }
            if (node != null) {
                moveSpan(mark, node.getBegin(), node.getEnd());
            }
        }
    }

    /**
     * Removes markables that cover exactly the same span as another markable. Of two
     * duplicates the one seen first is kept, except that an {@link NEMarkable} always
     * replaces a previously kept {@link DemMarkable}.
     *
     * @param aJCas     unused; kept so the method signature stays unchanged
     * @param markables the markables to de-duplicate, in the order they should be considered
     */
    private void rmDup(JCas aJCas, LinkedList<Annotation> markables) {
        HashSet<Annotation> rm = new HashSet<Annotation>();
        HashMap<String,Annotation> keep = new HashMap<String,Annotation>();
        // for-each instead of get(i): indexed access on a LinkedList is linear per call
        for (Annotation current : markables) {
            String key = current.getBegin() + "-" + current.getEnd();
            Annotation kept = keep.get(key);
            if (kept == null) {
                keep.put(key, current);
            } else if (kept instanceof DemMarkable && current instanceof NEMarkable) {
                // prefer the named entity over the demonstrative
                rm.add(kept);
                keep.put(key, current);
            } else {
                // either the kept one is the NE, or it doesn't matter (they're probably both NE's)
                rm.add(current);
            }
        }
        for (Annotation a : rm) {
            a.removeFromIndexes();
        }
    }
}