Source Code of org.apache.uima.ruta.textruler.core.TextRulerExampleDocument

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


package org.apache.uima.ruta.textruler.core;


import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;


import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.textruler.core.TextRulerTarget.MLTargetType;
import org.apache.uima.util.CasCopier;


/**
 * 
 * TextRulerExampleDocument stands for one document usually loaded from an XMI file. It uses the
 * given CasCache for storing its CAS with the XMI filename as the key.
 * 
 * It holds ArrayLists for positive and negative MLExamples which can be filled on demand for a
 * given learning target. E.g. single slot algorithms learn rules for each slot separately, so the
 * work-flow is to clear the current examples and create new for the next slot target. The same is
 * with single slot boundary algorithms like LP2: It first creates all left boundary examples,
 * learns from them, clears the examples and creates the right boundary examples and so on.
 * 
 * This class also provides the functionality to extract and create MLExamples of a given document
 * or test CAS for a given TextRulerTarget.
 * 
 * Especially for boundary algorithms you can call createBoundaryAnnotationsForCas to get boundary
 * annotations at the beginnings and endings of an example slot.
 * 
 * Caution (this is rather inconvenient at the moment): if a CAS gets loaded from the casCache,
 * you have to call createBoundaryAnnotationsForCas again, so your casLoader must be aware of that
 * (see BasicLP2 for an example).
 * 
 * hint: this could be renamed to MLDocument instead of TextRulerExampleDocument ?
 */
public class TextRulerExampleDocument {


  /** File name of this document's CAS; passed to {@link #casCache} as the lookup key. */
  protected String casFileName;


  /** Cache that {@link #getCAS()} asks for the CAS belonging to {@link #casFileName}. */
  protected CasCache casCache;


  /**
   * Positive examples for the current learning target. Replaced by
   * {@link #createPositiveExamplesForTarget(TextRulerTarget)} and emptied by
   * {@link #clearCurrentExamples()}.
   */
  protected List<TextRulerExample> positiveExamples = new ArrayList<TextRulerExample>();


  /**
   * Negative examples for the current learning target. This class itself never fills the list
   * (see {@link #createNegativeExamplesForTarget(TextRulerTarget)}); it is emptied by
   * {@link #clearCurrentExamples()}.
   */
  protected List<TextRulerExample> negativeExamples = new ArrayList<TextRulerExample>();


  public TextRulerExampleDocument(String casFileName, CasCache casCache) {
    this.casCache = casCache;
    this.casFileName = casFileName;
  }


  public CAS getCAS() {
    // ask CACHE
    return casCache.getCAS(casFileName);
  }


  public List<TextRulerExample> getPositiveExamples() {
    return positiveExamples;
  }


  public List<TextRulerExample> getNegativeExamples() {
    return negativeExamples;
  }


  protected void createPositiveExamplesForTarget(TextRulerTarget target) {
    positiveExamples = createSlotInstancesForCAS(getCAS(), target, true);
  }


  public List<TextRulerExample> createSlotInstancesForCAS(CAS aCas, TextRulerTarget target,
          boolean createFromRawTypeName) {
    List<TextRulerExample> result = new ArrayList<TextRulerExample>();


    if (target.isMultiSlot()) {
      TypeSystem ts = aCas.getTypeSystem();
      int currentSlotIndex = 0;
      TextRulerAnnotation[] currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
      List<Type> slotTypes = new ArrayList<Type>();
      for (String s : target.slotNames)
        slotTypes.add(ts.getType(s));


      for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex().iterator(true); it.isValid(); it
              .moveToNext()) {
        AnnotationFS fs = (AnnotationFS) it.get();
        Type theType = fs.getType();
        if (slotTypes.contains(theType)) {
          int idx = slotTypes.indexOf(theType);
          if (idx < currentSlotIndex) // the previous example was not
          // complete, so we have to write
          // it down:
          {
            result.add(new TextRulerExample(this, currentAnnotations, true, target));
            currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
          }
          currentAnnotations[idx] = new TextRulerAnnotation(fs, this);
          if (idx >= target.slotNames.length - 1) {
            result.add(new TextRulerExample(this, currentAnnotations, true, target));
            currentAnnotations = new TextRulerAnnotation[target.slotNames.length];
            currentSlotIndex = 0;
          } else
            currentSlotIndex = idx + 1;
        }
      }
      if (currentSlotIndex > 0) {
        result.add(new TextRulerExample(this, currentAnnotations, true, target));
      }


    } else if (target.isLeftCorrection() || target.isRightCorrection()) {
      // TODO
      TextRulerBasicLearner learner = target.getLearner();
      Set<String> filterSet = learner.getFilterSet();
      CAS testCAS = learner.getTestCAS();
      TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
      resetAndFillTestCAS(testCAS, target);
      CAS docCAS = getCAS();
      TypeSystem ts = docCAS.getTypeSystem();
      Type tokensRootType = ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME);
      AnalysisEngine analysisEngine = learner.getAnalysisEngine();
      try {
        analysisEngine.process(testCAS);
      } catch (AnalysisEngineProcessException e) {
        // TODO add log here
      }
      TextRulerTarget newTarget = new TextRulerTarget(target.slotNames, target.getLearner());
      if (target.isLeftCorrection()) {
        newTarget.type = TextRulerTarget.MLTargetType.SINGLE_LEFT_BOUNDARY;
      } else {
        newTarget.type = TextRulerTarget.MLTargetType.SINGLE_RIGHT_BOUNDARY;
      }
      createExamplesForTarget(newTarget);
      learner.compareOriginalDocumentWithTestCAS(this, testCAS, newTarget, c, true);
      List<TextRulerExample> correctTags = getPositiveExamples();
      List<TextRulerExample> wrongTags = new ArrayList<TextRulerExample>(
              c.getCoveredNegativeExamples());
      for (TextRulerExample wrongTag : wrongTags) {
        // test, if there's a corresponding positive example
        // somewhere around (within maxDistance)
        List<AnnotationFS> left = TextRulerToolkit.getAnnotationsBeforePosition(docCAS, wrongTag
                .getAnnotation().getBegin(), target.getMaxShiftDistance(), TextRulerToolkit
                .getFilterSetWithSlotNames(target.slotNames, filterSet), tokensRootType);
        List<AnnotationFS> right = TextRulerToolkit.getAnnotationsAfterPosition(docCAS, wrongTag
                .getAnnotation().getEnd(), target.getMaxShiftDistance() + 1, TextRulerToolkit
                .getFilterSetWithSlotNames(target.slotNames, filterSet), tokensRootType);


        right.remove(0);


        // TODO stop after the first found match or create one bad
        // example for each found occurence ??!!
        // for now: stop after one ! so create only ONE bad
        // example...
        int leftDistance = 0;
        TextRulerExample leftCorrectTag = null;
        for (int i = left.size() - 1; i >= 0; i--) {
          leftDistance++;
          TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(left.get(i),
                  this, target, docCAS.getTypeSystem());
          // Only checks the beginning of needle
          leftCorrectTag = TextRulerExampleDocument.exampleListContainsAnnotation(correctTags,
                  needle);
          if (leftCorrectTag != null)
            break;
        }


        int rightDistance = 0;
        TextRulerExample rightCorrectTag = null;
        for (AnnotationFS fs : right) {
          rightDistance++;
          TextRulerAnnotation needle = TextRulerToolkit.convertToTargetAnnotation(fs, this, target,
                  docCAS.getTypeSystem());
          // Only checks the beginning of needle
          rightCorrectTag = TextRulerExampleDocument.exampleListContainsAnnotation(correctTags,
                  needle);
          if (rightCorrectTag != null)
            break;
        }


        TextRulerExample theCorrectTag = null;
        if (rightDistance < leftDistance && rightCorrectTag != null)
          theCorrectTag = rightCorrectTag;
        else if (rightDistance > leftDistance && leftCorrectTag != null)
          theCorrectTag = leftCorrectTag;
        else // use the one that would lie in the slot filler:
        {
          if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY && rightCorrectTag != null)
            theCorrectTag = rightCorrectTag;
          else
            theCorrectTag = leftCorrectTag;
        }


        if (theCorrectTag != null) {
          TextRulerToolkit.log("FOUND BAD EXAMPLE FOR SHIFTING !!");
          TextRulerShiftExample shiftExample = new TextRulerShiftExample(this,
                  wrongTag.getAnnotation(), theCorrectTag.getAnnotation(), true, target);
          result.add(shiftExample);
        }
      }
      // GlobalCASSource.releaseCAS(testCAS);
    } else {
      List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(
              aCas,
              createFromRawTypeName ? target.getSingleSlotRawTypeName() : target
                      .getSingleSlotTypeName()); // do not use
      // boundary type
      // here since we
      // seek for the
      // orignial slot
      // !
      for (AnnotationFS a : slots) {
        result.add(new TextRulerExample(this, TextRulerToolkit.convertToTargetAnnotation(a, this,
                target, aCas.getTypeSystem()), true, target));
      }
    }
    return result;
  }


  protected void createNegativeExamplesForTarget(TextRulerTarget target) {
    // the default implementation does not support negative examples,
    // subclasses can overwrite
    // this if needed... or we could pass this as an argument to the
    // constructor....
  }


  public void createExamplesForTarget(TextRulerTarget target) {
    createPositiveExamplesForTarget(target);
    createNegativeExamplesForTarget(target);
  }


  public void clearCurrentExamples() {
    positiveExamples.clear();
    negativeExamples.clear();
  }


  // pass your test CAS object and the corresponding learning target to get a
  // filled
  // test-CAS for testing e.g. rule or rule set..
  // caution: testCas gets reset fist!
  public void resetAndFillTestCAS(CAS testCas, TextRulerTarget target) {
    testCas.reset();
    CAS docCas = getCAS();


    CasCopier cc = new CasCopier(docCas, testCas);
    testCas.setDocumentText(docCas.getDocumentText());


    // copy all annotations except the target-annotations:
    TypeSystem ts = docCas.getTypeSystem();


    List<Type> slotTypes = new ArrayList<Type>();


    for (String s : target.getSlotTypeNames())
      slotTypes.add(ts.getType(s));


    if (target.isBoundary()) {
      // add the base types (without START and END markers) also !
      for (String s : target.slotNames)
        slotTypes.add(ts.getType(s));
    }


    for (AnnotationFS fs : docCas.getAnnotationIndex()) {
      if (!slotTypes.contains(fs.getType())
              && !fs.getType().equals(docCas.getDocumentAnnotation().getType())) {
        FeatureStructure copyFs = cc.copyFs(fs);
        testCas.addFsToIndexes(copyFs);
      }
    }
  }


  public String getCasFileName() {
    return casFileName;
  }


  public static void createBoundaryAnnotationsForCas(CAS aCas, String slotName,
          Set<String> tokenFilterSet) {
    List<AnnotationFS> slots = TextRulerToolkit.extractAnnotationsForSlotName(aCas, slotName);
    TypeSystem ts = aCas.getTypeSystem();
    for (AnnotationFS a : slots) {


      List<AnnotationFS> slotTokens = TextRulerToolkit.getAnnotationsWithinBounds(aCas,
              a.getBegin(), a.getEnd(),
              TextRulerToolkit.getFilterSetWithSlotName(slotName, tokenFilterSet),
              ts.getType(TextRulerToolkit.RUTA_ANY_TYPE_NAME));
      if (!slotTokens.isEmpty()) {
        AnnotationFS first = slotTokens.get(0);
        AnnotationFS last = slotTokens.get(slotTokens.size() - 1);
        Type typeLB = ts.getType(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
        aCas.addFsToIndexes(aCas.createAnnotation(typeLB, first.getBegin(), first.getEnd()));
        Type typeRB = ts.getType(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
        aCas.addFsToIndexes(aCas.createAnnotation(typeRB, last.getBegin(), last.getEnd()));
      }
    }
  }


  public static void removeBoundaryAnnotationsFromCas(CAS aCas, String slotName) {
    // this method is not tested yet!
    TypeSystem ts = aCas.getTypeSystem();
    Type startType = ts.getType(slotName + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
    Type endType = ts.getType(slotName + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
    List<AnnotationFS> removeList = new ArrayList<AnnotationFS>();
    for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(startType).iterator(true); it
            .isValid(); it.moveToNext()) {
      AnnotationFS fs = it.get();
      removeList.add(fs);
    }
    for (FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(endType).iterator(true); it
            .isValid(); it.moveToNext()) {
      AnnotationFS fs = it.get();
      removeList.add(fs);
    }
    for (AnnotationFS fs : removeList)
      aCas.removeFsFromIndexes(fs);
  }


  public static synchronized TextRulerExample exampleListContainsAnnotation(
          List<TextRulerExample> list, TextRulerAnnotation ann) {
    TextRulerExample needle = new TextRulerExample(null, ann, true, null);


    int index = Collections.binarySearch(list, needle, new Comparator<TextRulerExample>() {
      public int compare(TextRulerExample o1, TextRulerExample o2) {
        TextRulerAnnotation afs1 = o1.getAnnotation();
        TextRulerAnnotation afs2 = o2.getAnnotation();
        if (afs1.getBegin() < afs2.getBegin())
          return -1;
        else if (afs1.getBegin() > afs2.getBegin())
          return 1;
        else
          return 0;
      }
    });
    if (index >= 0)
      return list.get(index);
    else
      return null;
  }


}
Source Code of org.apache.uima.ruta.textruler.core.TextRulerExampleDocument

Related Classes of org.apache.uima.ruta.textruler.core.TextRulerExampleDocument