Source Code of org.apache.ctakes.coreference.util.AnnotationSelector

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.coreference.util;


import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
//import java.util.Iterator;


import org.apache.log4j.Logger;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;


import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.Chunk;
import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
import org.apache.ctakes.typesystem.type.syntax.WordToken;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.typesystem.type.textspan.Sentence;


// TODO: This class hard-codes all the criteria,
// which should be replaced by a parser of
// the inclusionCondition resource (use parameter?) MarkableCreator
public class AnnotationSelector {


  public static ArrayList<Annotation> selectNE (JCas jcas) {
    ArrayList<Annotation> ret = new ArrayList<Annotation>();
    FSIterator<Annotation> iter = jcas.getJFSIndexRepository().getAnnotationIndex(IdentifiedAnnotation.type).iterator();
    while (iter.hasNext()) {
      IdentifiedAnnotation a = (IdentifiedAnnotation) iter.next();
      if (a instanceof EntityMention || a instanceof EventMention) {
      if(a.getOntologyConceptArr() != null) {
//      int tid = a.getTypeID();
//      if (tid == TypeSystemConst.NE_TYPE_ID_ANATOMICAL_SITE ||
//        tid == TypeSystemConst.NE_TYPE_ID_DISORDER ||
//        tid == TypeSystemConst.NE_TYPE_ID_PROCEDURE ||
//        tid == TypeSystemConst.NE_TYPE_ID_FINDING)
        ret.add(a);
      }
      }
    }
    java.util.Collections.sort(ret, new AnnotOffsetComparator());
    return ret;
  }


  public static ArrayList<BaseToken> selectBaseToken (JCas jcas) {
    ArrayList<BaseToken> ret = new ArrayList<BaseToken>();
    FSIterator<?> iter = jcas.getJFSIndexRepository().getAnnotationIndex(BaseToken.type).iterator();
    while (iter.hasNext())
      ret.add((BaseToken)iter.next());
    java.util.Collections.sort(ret, new AnnotOffsetComparator());
    return ret;
  }


  public static ArrayList<Sentence> selectSentence (JCas jcas) {
    ArrayList<Sentence> ret = new ArrayList<Sentence>();
    FSIterator<Annotation> iter = jcas.getJFSIndexRepository().getAnnotationIndex(Sentence.type).iterator();
    while (iter.hasNext())
      ret.add((Sentence)iter.next());
    java.util.Collections.sort(ret, new AnnotOffsetComparator());
    return ret;
  }


  public static ArrayList<WordToken> selectPronoun (JCas jcas,
      HashSet<String> modalAdj, HashSet<String> cogved, HashSet<String> othervb,
      Logger logger) {
    Hashtable<String, WordToken> offset2token = new Hashtable<String, WordToken>();
    ArrayList<WordToken> ret = new ArrayList<WordToken>();
    FSIterator<Annotation> iter = jcas.getJFSIndexRepository().getAnnotationIndex(WordToken.type).iterator();
    while (iter.hasNext()) {
      WordToken t = (WordToken)iter.next();
      String s = t.getCoveredText();
      if (//s.equalsIgnoreCase("it") ||
        s.equalsIgnoreCase("its") ||
        s.equalsIgnoreCase("they") ||
        s.equalsIgnoreCase("their") ||
        s.equalsIgnoreCase("them") ||
        s.equalsIgnoreCase("theirs"))
        ret.add(t);
      if (s.equalsIgnoreCase("it"))
        offset2token.put(t.getBegin()+"-"+t.getEnd(), t);
    }


    iter = jcas.getJFSIndexRepository().getAnnotationIndex(TerminalTreebankNode.type).iterator();
    while (iter.hasNext()) {
      TerminalTreebankNode ttn = (TerminalTreebankNode) iter.next();
      if (ttn.getCoveredText().equalsIgnoreCase("it"))
        if (isPleonastic(ttn, modalAdj, cogved, othervb))
          logger.info("Pleonastic \"it\" at position "+
              ttn.getIndex()+" of \""+
              MarkableTreeUtils.getRoot(ttn).getCoveredText()+"\"");
        else
          ret.add(offset2token.get(ttn.getBegin()+"-"+ttn.getEnd()));
    }
    java.util.Collections.sort(ret, new AnnotOffsetComparator());
    return ret;
  }


  private static boolean isPleonastic (TerminalTreebankNode ttn,
      HashSet<String> modalAdj, HashSet<String> cogved, HashSet<String> othervb) {
    if (!ttn.getCoveredText().equalsIgnoreCase("it")) return false;


    if (ttn.getNodeType().equals("PRP")) {
      TreebankNode tn = ttn.getParent();
      while (tn.getNodeType().startsWith("NP"))
        tn = tn.getParent();
      if (tn.getNodeType().equals("S")) {
        TreebankNode par = tn;
        TreebankNode vp = findP(tn, "VP", 0);
        while (vp!=null) vp = findP(par = vp, "VP", 0);
        vp = par;
        par = vp.getParent();


        FSArray c = vp.getChildren();
        TreebankNode firstChild = (TreebankNode) c.get(0);
        if (isBe(firstChild)) {
          TreebankNode adjP = findP(vp, "ADJP", 1);
          if (adjP!=null && modalAdj.contains(adjP.getCoveredText()) &&
              (findP(vp, "SBAR", 1)!=null ||
              findP(vp, "S", 1)!=null ||
              findP(adjP, "SBAR", 1)!=null ||
              findP(adjP, "S", 1)!=null))
            return true;
        } else if (isBe(par.getChildren(0))) {
          firstChild = vp.getChildren(0);
          if (firstChild!=null && cogved.contains(firstChild.getCoveredText()) &&
              (findP(vp, "SBAR", 1)!=null ||
              findP(vp, "S", 1)!=null))
            return true;
        } else {
          // hacky way to get base form of the verbs in otherVerb list,
          // should use lvg to get the base form of the word
          String word = firstChild.getCoveredText().replaceAll("s$", "").replaceAll("ed$", "").replaceAll("t$", "");
          if (othervb.contains(word) &&
              (findP(vp, "SBAR", 1)!=null ||
              findP(vp, "S", 1)!=null))
            return true;
        }
      }
    }
    return false;
  }


  private static TreebankNode findP (TreebankNode n, String phraseTag, int startingChild) {
    FSArray c = n.getChildren();
    int i = startingChild;
    while (i < c.size()) {
      TreebankNode tn = (TreebankNode) c.get(i++);
      if (tn.getNodeType().equals(phraseTag) ||
          tn.getNodeType().startsWith(phraseTag+"-"))
        return tn;
    }
    return null;
  }


  private static boolean isBe (TreebankNode n) {
    String phCat = n.getNodeType();
    String txt = n.getCoveredText();
    if ((phCat.equals("VB") ||
        phCat.equals("VBZ") ||
        phCat.equals("VBD") ||
        phCat.equals("VBN")) &&
        (txt.equalsIgnoreCase("is") ||
            txt.equalsIgnoreCase("was") ||
            txt.equalsIgnoreCase("been") ||
            txt.equalsIgnoreCase("be")))
      return true;
    
    return false;
  }


  public static ArrayList<Chunk> selectDemonAndRelative (JCas jcas) {
    ArrayList<Chunk> ret = new ArrayList<Chunk>();
    FSIterator<Annotation> iter = jcas.getJFSIndexRepository().getAnnotationIndex(Chunk.type).iterator();
    while (iter.hasNext()) {
      Chunk c = (Chunk)iter.next();
      if (c.getChunkType().equals("NP")) {
        String s = c.getCoveredText().toLowerCase();
        if (s.startsWith("these") ||
          s.startsWith("those") ||
          s.startsWith("this") ||
          s.startsWith("that") ||
          s.startsWith("which"))
        ret.add(c);
      }
    }
    java.util.Collections.sort(ret, new AnnotOffsetComparator());
    return ret;
  }
}
Source Code of org.apache.ctakes.coreference.util.AnnotationSelector

Related Classes of org.apache.ctakes.coreference.util.AnnotationSelector