Package org.apache.uima.ruta.textruler.learner.wien

Source Code of org.apache.uima.ruta.textruler.learner.wien.Wien

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied.  See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.uima.ruta.textruler.learner.wien;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
import org.apache.uima.ruta.textruler.core.TextRulerAnnotation;
import org.apache.uima.ruta.textruler.core.TextRulerBasicLearner;
import org.apache.uima.ruta.textruler.core.TextRulerExample;
import org.apache.uima.ruta.textruler.core.TextRulerExampleDocument;
import org.apache.uima.ruta.textruler.core.TextRulerRuleItem;
import org.apache.uima.ruta.textruler.core.TextRulerRulePattern;
import org.apache.uima.ruta.textruler.core.TextRulerSlotPattern;
import org.apache.uima.ruta.textruler.core.TextRulerTarget;
import org.apache.uima.ruta.textruler.core.TextRulerToolkit;
import org.apache.uima.ruta.textruler.extension.TextRulerLearnerDelegate;

public class Wien extends TextRulerBasicLearner {

  // Head delimiter pattern (h) selected by findHeadTailAndL1Patterns(); emitted by
  // getResultString() to mark the start of the content area.
  TextRulerRulePattern hPattern;

  // Tail delimiter pattern (t) selected by findHeadTailAndL1Patterns(); emitted by
  // getResultString() as the wien_tail marker rule.
  TextRulerRulePattern tPattern;

  // Per-document cache filled by getPageHeadAndTailPortion(), keyed by the CAS file name:
  // PatternPair.l holds the tokens before the first example, PatternPair.r the tokens after the
  // last example.
  Map<String, PatternPair> headTailCache = new HashMap<String, PatternPair>();

  // Per-document cache filled by getInterTupleSepatators(), keyed by the CAS file name: the
  // non-empty token patterns between consecutive examples.
  Map<String, List<TextRulerRulePattern>> interTupelSeparatorsCache = new HashMap<String, List<TextRulerRulePattern>>();

  /**
   * Simple holder for a pair of patterns. In {@code patternPairs} the pair describes the left
   * ({@code l}) and right ({@code r}) delimiter of one slot; in {@code headTailCache} it holds a
   * document's head ({@code l}) and tail ({@code r}) token patterns.
   */
  public static class PatternPair {
    /** Left pattern; starts out empty. */
    public TextRulerRulePattern l = new TextRulerRulePattern();

    /** Right pattern; starts out empty. */
    public TextRulerRulePattern r = new TextRulerRulePattern();
  }

  // One entry per slot name (added in doRun()): the learned left/right delimiter patterns of
  // that slot.
  ArrayList<PatternPair> patternPairs = new ArrayList<PatternPair>();

  // The multi-slot rule assembled by doRun(); stays null until a run succeeds, which is what
  // getResultString() checks for.
  WienRule theRule;

  /**
   * Outcome of {@code testConstraint3} for one (h, t, l1) candidate triple. The value tells the
   * candidate search in {@code findHeadTailAndL1Patterns} which candidates it may skip.
   * <ul>
   * <li>{@code C3_SUCCESS}: all checks passed.</li>
   * <li>{@code C3_L1CandidateSuffixError}: l1 is not a suffix of the head portion following h.</li>
   * <li>{@code C3_TailCandidateH_L1Error}: t occurs between h and l1 in the head.</li>
   * <li>{@code C3_TailCandidateRK_PrefixError}: t starts with the right delimiter of the last
   * slot.</li>
   * <li>{@code C3_TailCandidateNotFoundError}: t does not occur in the page tail.</li>
   * <li>{@code C3_TailCandidateSucceedsL1InTailError}: l1 occurs in the page tail before t.</li>
   * <li>{@code C3_L1CandidateInterTupleSeparatorSuffixError}: l1 is not a suffix of some
   * inter-tuple separator.</li>
   * <li>{@code C3_TailCandidatePrecedesL1InterTupleSeparatorError}: t occurs before l1 in some
   * inter-tuple separator.</li>
   * </ul>
   */
  public enum constraint3ReturnType {
    C3_SUCCESS, C3_L1CandidateSuffixError, C3_TailCandidateH_L1Error, C3_TailCandidateRK_PrefixError, C3_TailCandidateNotFoundError, C3_TailCandidateSucceedsL1InTailError, C3_L1CandidateInterTupleSeparatorSuffixError, C3_TailCandidatePrecedesL1InterTupleSeparatorError
  };

  /**
   * Creates the learner. All arguments are passed unchanged to the
   * {@link TextRulerBasicLearner} constructor; this class adds no construction logic of its own.
   */
  public Wien(String inputDir, String prePropTmFile, String tmpDir, String[] slotNames,
          Set<String> filterSet, boolean skip, TextRulerLearnerDelegate delegate) {
    super(inputDir, prePropTmFile, tmpDir, slotNames, filterSet, skip, delegate);
  }

  /**
   * Always returns {@code false} for this learner.
   * NOTE(review): presumably tells the base class not to collect negatively covered instances
   * while testing rules -- confirm against TextRulerBasicLearner.
   */
  @Override
  public boolean collectNegativeCoveredInstancesWhenTesting() {
    return false;
  }

  @Override
  protected void doRun() {
    TextRulerToolkit.log("-- WIEN START");

    headTailCache.clear();
    interTupelSeparatorsCache.clear();

    for (int i = 0; i < slotNames.length; i++)
      patternPairs.add(new PatternPair());

    TextRulerTarget target = new TextRulerTarget(slotNames, this);

    exampleDocuments.createExamplesForTarget(target); // new multislot
    // target examples

    for (TextRulerExample e : exampleDocuments.getAllPositiveExamples()) {
      TextRulerToolkit.log("Example found: " + e);
    }

    try {
      boolean allOk = true;
      sendStatusUpdateToDelegate("Searching for right patterns...",
              TextRulerLearnerState.ML_RUNNING, false);
      if (!findRightPatterns())
        allOk = false;
      sendStatusUpdateToDelegate("Searching for left patterns...",
              TextRulerLearnerState.ML_RUNNING, false);
      if (!findLeftPatterns())
        allOk = false;
      sendStatusUpdateToDelegate("Searching for head, tail and left1 patterns...",
              TextRulerLearnerState.ML_RUNNING, false);
      if (!findHeadTailAndL1Patterns())
        allOk = false;
      // {
      // String s = "";
      // for (TextRulerRuleItem i : hPattern)
      // s += " "+i;
      // s += " ||||";
      // for (TextRulerRuleItem i : patternPairs.get(0).l)
      // s += " "+i;
      // s += " ||||";
      // for (TextRulerRuleItem i : tPattern)
      // s += " "+i;
      // TextRulerToolkit.log(s);
      // }

      if (allOk) {
        sendStatusUpdateToDelegate("Building multi-slot rule.", TextRulerLearnerState.ML_RUNNING,
                false);
        theRule = new WienRule(this, target);
        List<TextRulerSlotPattern> rPatterns = theRule.getPatterns();
        int totalItemCount = 0;
        for (int k = 0; k < slotNames.length; k++) {
          WienRuleItem slotItem = new WienRuleItem((TextRulerAnnotation) null);
          TextRulerSlotPattern rP = new TextRulerSlotPattern();
          rPatterns.add(rP);
          PatternPair p = patternPairs.get(k);
          for (int i = 0; i < p.l.size(); i++) {
            WienRuleItem item = (WienRuleItem) p.l.get(i);
            if (k == 0 && i == 0) // the very first rule item:
            {
              item = item.copy();
              // old version:
              // item.addCondition("-NEAR,wien_tail,10000000,false");
              item.addCondition("-AFTER(wien_tail)");
              item.addCondition("-PARTOF(wien_rulemark)");
            }
            rP.preFillerPattern.add(item);
            totalItemCount++;
          }
          rP.fillerPattern.add(slotItem.copy());
          totalItemCount++;
          for (int i = 0; i < p.r.size(); i++) {
            WienRuleItem item = (WienRuleItem) p.r.get(i);
            totalItemCount++;
            if (k == slotNames.length - 1 && i == p.r.size() - 1) // the
            // very
            // last
            // item
            {
              item = item.copy();
              item.addAction("MARK(wien_rulemark, 1, " + totalItemCount + ")");
            }
            rP.postFillerPattern.add(item);
          }
          totalItemCount++; // the inter-slot ALL*? item has to be
          // counted as well!
        }
        sendStatusUpdateToDelegate("Done", TextRulerLearnerState.ML_DONE, true);
      } else
        sendStatusUpdateToDelegate("Done - Not all patterns could be learned!",
                TextRulerLearnerState.ML_DONE, true);
    } catch (Exception e) {
      TextRulerPlugin.error(e);
      sendStatusUpdateToDelegate("Aborted due to Exception!", TextRulerLearnerState.ML_ERROR, true);
    }
    headTailCache.clear();
    interTupelSeparatorsCache.clear();
    TextRulerToolkit.log("-- WIEN END");
  }

  protected boolean findRightPatterns() {
    TextRulerExampleDocument doc = exampleDocuments.getDocuments().get(0);
    boolean allFound = true;
    for (int k = 0; k < slotNames.length; k++) {
      List<TextRulerRulePattern> rightContexts = getRightContextForSlot(doc, k);
      System.out.println(rightContexts.get(0));
      int shortest = Integer.MAX_VALUE;
      for (TextRulerRulePattern p : rightContexts)
        shortest = p.size() < shortest ? p.size() : shortest;
      boolean found = false;
      for (int len = 1; len <= shortest; len++) {
        TextRulerRulePattern subPattern = rightContexts.get(0).subPattern(0, len);
        if (testConstraint1(subPattern, k)) {
          // for (TextRulerRuleItem i : subPattern)
          // ((WienRuleItem)i).getWordConstraint().setGeneralizeLinkMarkUp(true);
          patternPairs.get(k).r = subPattern;
          TextRulerToolkit.log("right " + k + ": " + subPattern);
          found = true;
          break;
        }
      }
      if (!found)
        allFound = false;
    }
    return allFound;
  }

  protected boolean findLeftPatterns() {
    TextRulerExampleDocument doc = exampleDocuments.getDocuments().get(0);
    // skip l 0 !
    if (slotNames.length < 2)
      return true;
    boolean allFound = true;
    for (int k = 1; k < slotNames.length; k++) {
      List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
      int shortest = Integer.MAX_VALUE;
      for (TextRulerRulePattern p : leftContexts)
        shortest = p.size() < shortest ? p.size() : shortest;
      TextRulerRulePattern sourcePattern = leftContexts.get(0);
      boolean found = false;
      for (int len = 1; len <= shortest; len++) {
        // get suffix:
        TextRulerRulePattern subPattern = sourcePattern.subPattern(sourcePattern.size() - len, len);
        if (testConstraint2(subPattern, k)) {
          patternPairs.get(k).l = subPattern;
          for (TextRulerRuleItem i : subPattern)
            ((WienRuleItem) i).getWordConstraint().setGeneralizeLinkMarkUp(true);
          TextRulerToolkit.log("left " + k + ": " + subPattern);
          found = true;
          break;
        }
      }
      if (!found)
        allFound = false;
    }
    return allFound;
  }

  /**
   * Searches the head delimiter (h), the tail delimiter (t) and the left delimiter of the first
   * slot (l1). Candidates are derived from the head and tail token sequences of the first
   * document: the head is split into a prefix (source of h candidates) and a suffix (l1
   * candidate), and every contiguous sub-sequence of the tail is a t candidate. The first triple
   * for which {@code testConstraint3(h, t, l1)} returns {@code C3_SUCCESS} is stored in
   * {@code hPattern}, {@code tPattern} and {@code patternPairs.get(0).l}.
   * <p>
   * The failure code of {@code testConstraint3} is used to skip candidates that cannot succeed;
   * {@code current}/{@code total} only drive the percentage shown in the status updates.
   *
   * @return {@code true} if a valid triple was found; {@code false} if none exists or
   *         {@code shouldAbort()} requested a stop
   */
  protected boolean findHeadTailAndL1Patterns() {
    List<TextRulerExampleDocument> docs = exampleDocuments.getDocuments();
    TextRulerExampleDocument doc0 = docs.get(0);
    TextRulerRulePattern head = new TextRulerRulePattern();
    TextRulerRulePattern tail = new TextRulerRulePattern();
    getPageHeadAndTailPortion(doc0, head, tail);

    // One way of splitting the head tokens: "head" keeps the leading part (from which the h
    // candidates are taken), "l1" the trailing part (the l1 candidate).
    final class HLCandidate {
      public TextRulerRulePattern head = new TextRulerRulePattern();

      public TextRulerRulePattern l1 = new TextRulerRulePattern();
    }

    // a small optimization:
    // find out the maximum possible length for l1 in doc0 since l1 is much
    // smaller than the possible head length!
    List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc0);
    int shortestL1 = head.size() - 1;
    for (TextRulerRulePattern its : interTupleSeparators)
      shortestL1 = its.size() < shortestL1 ? its.size() : shortestL1;

    List<HLCandidate> hlCandidates = new ArrayList<HLCandidate>();
    // create candidates for each separation of the head and tail patterns:
    // (the separator moves from the end of the head towards its start, so l1 grows by one item
    // per candidate until it reaches the shortestL1 bound)
    for (int separator = head.size() - 1; separator > 0; separator--) {
      HLCandidate c = new HLCandidate();
      for (int i = 0; i < head.size(); i++) {
        if (i < separator)
          c.head.add(head.get(i));
        else {
          // l1 items are copies with generalized link markup; the head items stay untouched
          WienRuleItem it = (WienRuleItem) head.get(i).copy();
          it.getWordConstraint().setGeneralizeLinkMarkUp(true);
          c.l1.add(it);
        }
      }
      hlCandidates.add(c);
      TextRulerToolkit.log(c.head.size() + " vs. " + c.l1.size());
      if (c.l1.size() >= shortestL1)
        break;
    }

    long total = 0;

    // get total h l1 t combination count:
    long tCand = (tail.size() * (tail.size() + 1)) / 2;
    for (HLCandidate c : hlCandidates) {
      total += ((c.head.size() - 1) * (c.head.size())) / 2;
    }
    total *= tCand;

    // progress bookkeeping for the status messages only
    long current = 0;
    int oldPercent = -1;

    for (HLCandidate c : hlCandidates) {
      // for each "candidate" which represents a l1 suffix pattern of the
      // head tokens and a rest pattern for the h pattern,
      // we have to create every sub pattern of the remaining h pattern as
      // a h candidate:
      TextRulerRulePattern l1 = c.l1;
      TextRulerRulePattern h = null;

      // set as soon as testConstraint3 blames l1 itself; the remaining h/t candidates of this
      // l1 are then skipped
      boolean l1Sucks = false;

      for (int endI = c.head.size() - 1; endI > 0; endI--) {
        for (int startI = endI; startI > 0; startI--) {
          h = new TextRulerRulePattern();
          for (int i = startI; i <= endI; i++)
            h.add(c.head.get(i));

          // now for each h candidate we have to create each t
          // candidate:
          TextRulerRulePattern t = null;
          for (int tstartI = 0; tstartI < tail.size(); tstartI++) {
            for (int tendI = tstartI; tendI < tail.size(); tendI++) {
              int percent = Math.round(((float) current * 100 / total));
              if (percent != oldPercent) {
                oldPercent = percent;
                if (percent > 100)
                  percent = 100;
                // TextRulerToolkit.log(current+" / "+total);
                sendStatusUpdateToDelegate("Testing C3, " + percent + "%",
                        TextRulerLearnerState.ML_RUNNING, false);
              }
              if (shouldAbort())
                return false;
              current++;

              t = new TextRulerRulePattern();
              for (int i = tstartI; i <= tendI; i++)
                t.add(tail.get(i));

              // now we have a possible candidate triple: h, t and
              // l1:

              constraint3ReturnType c3Result = testConstraint3(h, t, l1);

              if (c3Result == constraint3ReturnType.C3_SUCCESS) {
                hPattern = h;
                tPattern = t;
                patternPairs.get(0).l = l1;
                return true;
              } else if (c3Result == constraint3ReturnType.C3_L1CandidateSuffixError
                      || c3Result == constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError) {
                // l1 itself is unusable: give up on this l1 candidate and account for the
                // skipped t candidates in the progress counter
                l1Sucks = true;
                current += tail.size() - tendI - 1;
                break;
              } else if (c3Result == constraint3ReturnType.C3_TailCandidateH_L1Error
                      || c3Result == constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError) {
                // no special pruning options here... we simply
                // have to test the next t-candidate
              } else if (c3Result == constraint3ReturnType.C3_TailCandidateRK_PrefixError
                      || c3Result == constraint3ReturnType.C3_TailCandidateNotFoundError) {
                // all candidates with the same start item are
                // bad, so leave this inner loop:
                current += tail.size() - tendI - 1;
                break;
              } else if (c3Result == constraint3ReturnType.C3_TailCandidatePrecedesL1InterTupleSeparatorError) {
                // this is a problematic case... the cause could
                // be L1 or the current Tail pattern,
                // so we can't do anything about it! just try the
                // next t-candidate
              }
            }
            if (l1Sucks) {
              current += (tail.size() - tstartI - 1) * (tail.size() - tstartI) / 2;
              break;
            }
          }
          if (l1Sucks) {
            if (startI > 0)
              current += (startI - 1) * tCand;
            break;
          }
        }
        if (l1Sucks) {
          current += (endI * (endI + 1) / 2) * tCand;
          break;
        }
      }
    }
    return false;
  }

  protected void getPageHeadAndTailPortion(TextRulerExampleDocument doc, TextRulerRulePattern head,
          TextRulerRulePattern tail) {
    String key = doc.getCasFileName();
    if (headTailCache.containsKey(key)) {
      PatternPair p = headTailCache.get(key);
      head.addAll(p.l);
      tail.addAll(p.r);
    } else {
      CAS cas = doc.getCAS();
      TextRulerExample firstExample = doc.getPositiveExamples().get(0);
      TextRulerExample lastExample = doc.getPositiveExamples().get(
              doc.getPositiveExamples().size() - 1);
      TypeSystem ts = cas.getTypeSystem();
      Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
      List<AnnotationFS> headTokens = TextRulerToolkit.getAnnotationsBeforePosition(cas,
              firstExample.getAnnotations()[0].getBegin(), 0, TextRulerToolkit
                      .getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
      TextRulerAnnotation[] lastExampleAnnotations = lastExample.getAnnotations();
      List<AnnotationFS> tailTokens = TextRulerToolkit.getAnnotationsAfterPosition(cas,
              lastExampleAnnotations[lastExampleAnnotations.length - 1].getEnd(), 0,
              TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
      for (AnnotationFS afs : headTokens)
        head.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
      for (AnnotationFS afs : tailTokens)
        tail.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
      PatternPair p = new PatternPair();
      p.l.addAll(head);
      p.r.addAll(tail);
      headTailCache.put(key, p);
    }
  }

  protected List<TextRulerRulePattern> getInterTupleSepatators(TextRulerExampleDocument doc) {
    String key = doc.getCasFileName();
    if (interTupelSeparatorsCache.containsKey(key)) {
      return interTupelSeparatorsCache.get(key);
    } else {
      List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
      CAS cas = doc.getCAS();
      TypeSystem ts = cas.getTypeSystem();
      Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
      List<TextRulerExample> examples = doc.getPositiveExamples();
      for (int i = 0; i < examples.size() - 1; i++) {
        // get separator between i'th and (i+1)'th example:
        TextRulerAnnotation[] exampleAnnotations1 = examples.get(i).getAnnotations();
        TextRulerAnnotation[] exampleAnnotations2 = examples.get(i + 1).getAnnotations();
        TextRulerAnnotation lastOf1 = exampleAnnotations1[exampleAnnotations1.length - 1];
        TextRulerAnnotation firstOf2 = exampleAnnotations2[0];
        List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, lastOf1
                .getEnd(), firstOf2.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
                slotNames, filterSet), tokenType);
        TextRulerRulePattern thePattern = new TextRulerRulePattern();
        for (AnnotationFS afs : theTokens)
          thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
        if (thePattern.size() > 0)
          result.add(thePattern);

      }
      interTupelSeparatorsCache.put(key, result);
      return result;
    }
  }

  protected List<TextRulerRulePattern> getRightContextForSlot(TextRulerExampleDocument doc,
          int slotIndex) {
    List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
    CAS cas = doc.getCAS();
    TypeSystem ts = cas.getTypeSystem();
    Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
    List<TextRulerExample> examples = doc.getPositiveExamples();
    boolean isLastSlot = slotIndex >= slotNames.length - 1;
    for (int ei = 0; ei < examples.size(); ei++) {
      boolean isLastExample = ei == examples.size() - 1;
      TextRulerExample e = examples.get(ei);
      // get stuff between slot slotIndex and slotIndex+1
      TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
      TextRulerAnnotation nextSlotAnnotation;

      if (!isLastSlot)
        nextSlotAnnotation = e.getAnnotations()[slotIndex + 1];
      else {
        if (!isLastExample) // the next slot annotation is the first
          // example annotation of the next template:
          nextSlotAnnotation = examples.get(ei + 1).getAnnotations()[0];
        else
          nextSlotAnnotation = null;
      }

      List<AnnotationFS> theTokens;
      if (nextSlotAnnotation == null)
        theTokens = TextRulerToolkit.getAnnotationsAfterPosition(cas, slotAnnotation.getEnd(), 0,
                TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
      else
        theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, slotAnnotation.getEnd(),
                nextSlotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
                        slotNames, filterSet), tokenType);
      TextRulerRulePattern thePattern = new TextRulerRulePattern();
      for (AnnotationFS afs : theTokens)
        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
      if (thePattern.size() > 0)
        result.add(thePattern);
    }
    return result;
  }

  protected List<TextRulerRulePattern> getLeftContextForSlot(TextRulerExampleDocument doc,
          int slotIndex) {
    if (slotIndex == 0)
      return null;
    List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
    CAS cas = doc.getCAS();
    TypeSystem ts = cas.getTypeSystem();
    Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
    List<TextRulerExample> examples = doc.getPositiveExamples();

    boolean isFirstSlot = slotIndex == 0;
    for (int ei = 0; ei < examples.size(); ei++) {
      boolean isFirstExample = ei == 0;
      TextRulerExample e = examples.get(ei);
      // get stuff between slot slotIndex and slotIndex+1
      TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
      TextRulerAnnotation prevSlotAnnotation;

      if (!isFirstSlot)
        prevSlotAnnotation = e.getAnnotations()[slotIndex - 1];
      else {
        if (!isFirstExample)
          prevSlotAnnotation = examples.get(ei - 1).getAnnotations()[slotNames.length - 1];
        else
          prevSlotAnnotation = null;
      }

      List<AnnotationFS> theTokens;
      if (prevSlotAnnotation == null)
        theTokens = TextRulerToolkit.getAnnotationsBeforePosition(cas, slotAnnotation.getBegin(),
                0, TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
      else
        theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, prevSlotAnnotation.getEnd(),
                slotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(slotNames,
                        filterSet), tokenType);
      TextRulerRulePattern thePattern = new TextRulerRulePattern();
      for (AnnotationFS afs : theTokens)
        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc), true));
      if (thePattern.size() > 0)
        result.add(thePattern);
    }
    return result;
  }

  protected List<TextRulerRulePattern> getSlotFillerPatterns(TextRulerExampleDocument doc,
          int slotIndex) {
    List<TextRulerRulePattern> result = new ArrayList<TextRulerRulePattern>();
    CAS cas = doc.getCAS();
    TypeSystem ts = cas.getTypeSystem();
    Type tokenType = ts.getType(TextRulerToolkit.RUTA_ALL_TYPE_NAME);
    List<TextRulerExample> examples = doc.getPositiveExamples();
    for (TextRulerExample e : examples) {
      TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
      List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas,
              slotAnnotation.getBegin(), slotAnnotation.getEnd(), TextRulerToolkit
                      .getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
      TextRulerRulePattern thePattern = new TextRulerRulePattern();
      for (AnnotationFS afs : theTokens)
        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
      if (thePattern.size() > 0)
        result.add(thePattern);
    }
    return result;
  }

  protected constraint3ReturnType testConstraint3(TextRulerRulePattern h, TextRulerRulePattern t,
          TextRulerRulePattern l1) {
    for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
      constraint3ReturnType r = testConstraint3(doc, h, t, l1);
      if (r != constraint3ReturnType.C3_SUCCESS)
        return r;
    }
    return constraint3ReturnType.C3_SUCCESS;
  }

  protected boolean testConstraint1(TextRulerExampleDocument doc, TextRulerRulePattern rk, int k) {
    List<TextRulerRulePattern> rightContexts = getRightContextForSlot(doc, k);
    for (TextRulerRulePattern rx : rightContexts) {
      if (rx.find(rk) != 0)
        return false;
    }
    List<TextRulerRulePattern> contents = getSlotFillerPatterns(doc, k);
    for (TextRulerRulePattern c : contents) {
      if (c.find(rk) >= 0)
        return false;
    }

    return true;
  }

  protected boolean testConstraint1(TextRulerRulePattern rk, int k) {
    for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
      if (!testConstraint1(doc, rk, k))
        return false;
    }
    return true;
  }

  protected boolean testConstraint2(TextRulerExampleDocument doc, TextRulerRulePattern lk, int k) {
    List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
    for (TextRulerRulePattern lx : leftContexts) {
      if (lx.size() < lk.size())
        return false;
      int pos = lx.find(lk);
      if (pos < 0 || pos != lx.size() - lk.size())
        return false;
    }
    return true;
  }

  protected boolean testConstraint2(TextRulerRulePattern lk, int k) {
    for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
      if (!testConstraint2(doc, lk, k))
        return false;
    }
    return true;
  }

  /**
   * Checks the candidate triple head delimiter {@code h}, tail delimiter {@code t} and first
   * left delimiter {@code l1} against a single document. The checks run in a fixed order and the
   * first violated one determines the returned error code:
   * <ol>
   * <li>l1 must be a suffix of the page head portion that follows h;</li>
   * <li>t must not occur between h and l1;</li>
   * <li>t must not start with the right delimiter of the last slot;</li>
   * <li>t must occur in the page tail;</li>
   * <li>l1 must not occur in the page tail before t;</li>
   * <li>l1 must be a suffix of every inter-tuple separator;</li>
   * <li>t must not occur before l1 in any inter-tuple separator.</li>
   * </ol>
   *
   * @return {@code C3_SUCCESS} if all checks pass, otherwise the code of the first failed check
   */
  protected constraint3ReturnType testConstraint3(TextRulerExampleDocument doc,
          TextRulerRulePattern h, TextRulerRulePattern t, TextRulerRulePattern l1) {
    // switch for the diagnostic logIf() calls below
    final boolean logReasons = false;

    TextRulerRulePattern head = new TextRulerRulePattern();
    TextRulerRulePattern tail = new TextRulerRulePattern();

    getPageHeadAndTailPortion(doc, head, tail);

    // 1: l1 must be a proper suffix of the portion between the end of h and
    // the first slot filler:
    // (head / h) / l1 = l1

    // NOTE(review): the result of find() is not checked here; if h does not occur in this
    // document's head, hPos is presumably negative and the sub-pattern below starts at an
    // unintended offset -- confirm what TextRulerRulePattern.find/subPattern do in that case.
    int hPos = head.find(h);

    // TODO precalculate this outside this method ?
    // work on a copy with generalized link markup so the cached head items stay unchanged
    TextRulerRulePattern restForL1 = head.subPattern(hPos + h.size(), -1).copy();
    for (TextRulerRuleItem it : restForL1)
      ((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
    int l1Pos = restForL1.find(l1);
    if (l1Pos < 0 || l1Pos != restForL1.size() - l1.size()) {
      TextRulerToolkit.logIf(logReasons, "REASON 1\n\tl1         \t" + l1 + "\n\trestforl1\t"
              + restForL1);
      return constraint3ReturnType.C3_L1CandidateSuffixError;
    }

    // 2: t must not occur in the subpattern after h and before l1
    if (l1Pos > 0) {
      TextRulerRulePattern patternBetweenHandL1 = restForL1.subPattern(0, l1Pos);
      if (patternBetweenHandL1.size() >= t.size()) {
        if (patternBetweenHandL1.find(t) >= 0) {
          TextRulerToolkit.logIf(logReasons, "REASON 2");
          return constraint3ReturnType.C3_TailCandidateH_L1Error;
        }
      }
    }

    // 2a: addons, not specified in WIEN paper !!
    TextRulerRulePattern lastSlotRightPattern = patternPairs.get(slotNames.length - 1).r;
    if (t.find(lastSlotRightPattern) == 0) // the right boundary of the last
    // slot may not be part of the
    // tail pattern!
    {
      TextRulerToolkit.logIf(logReasons, "REASON 3: " + lastSlotRightPattern + "\tTail: " + t);
      return constraint3ReturnType.C3_TailCandidateRK_PrefixError;
    }

    int tPos = tail.find(t);
    if (tPos < 0) {
      TextRulerToolkit.logIf(logReasons, "REASON 4");
      return constraint3ReturnType.C3_TailCandidateNotFoundError;
    } // this is an own constraint definition: if a document does not have
    // the tail in it,
    // what should we do then ? is this an error or is this okay since the
    // document may not have any tail after the data ?

    // 3: l1 must not precede t in the page's tail:
    int l1tPos = tail.find(l1);
    if (l1tPos >= 0) // l1 occurs in the page's tail:
    {
      if (l1tPos < tPos) {
        TextRulerToolkit.logIf(logReasons, "REASON 5");
        return constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError;
      }
    }

    List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc);

    for (TextRulerRulePattern itSep : interTupleSeparators) {
      // 4: l1 must be a proper suffix of each of the inter-tuple
      // separators:
      // (again compared on a copy with generalized link markup; the cached separator is not
      // modified)
      TextRulerRulePattern itSepCopy = itSep.copy();
      for (TextRulerRuleItem it : itSepCopy)
        ((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
      int l1itsPos = itSepCopy.find(l1);
      if (l1itsPos < 0 || l1itsPos != itSepCopy.size() - l1.size()) {
        TextRulerToolkit.logIf(logReasons, "REASON 6: \n\tl1\t" + l1 + "\n\titSep\t" + itSep);
        return constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError;
      }

      // 5: t must never precede l1 in any inter-tuple separator:
      int itstPos = itSep.find(t);
      if (itstPos >= 0 && itstPos < l1itsPos) {
        TextRulerToolkit.logIf(logReasons, "REASON 7");
        return constraint3ReturnType.C3_TailCandidatePrecedesL1InterTupleSeparatorError;
      }

    }
    return constraint3ReturnType.C3_SUCCESS;
  }

  public String getResultString() {
    if (theRule == null)
      return "<no results yet>";
    String result = getFileHeaderString(true) + "DECLARE wien_tail;\n" + "DECLARE wien_rulemark;\n"
            + "DECLARE wien_content;\n" + "BOOLEAN wien_redo;\n\n"
            + "// tail/head/content area stuff:\n";

    TextRulerRulePattern hCopy = hPattern.copy();

    ((WienRuleItem) hCopy.get(0)).addCondition("-PARTOF(wien_content)");
    result += hCopy + " ALL*?{->MARK(wien_content)};\n";

    TextRulerRulePattern tCopy = tPattern.copy();
    ((WienRuleItem) tCopy.get(0)).addCondition("PARTOF(wien_content)");

    result += tCopy + "{->MARK(wien_tail";
    if (tPattern.size() > 1)
      result += ", 1, " + tPattern.size();
    result += ")};\n\n";

    result += "BLOCK(findData) wien_content {\n"
            + "\t// find out if tail is before the next occurence of l1\n"
            + "\t"
            + theRule.getRuleString()
            + "\n"
            + "\tDocument{->ASSIGN(wien_redo, false)};\n"
            + "\twien_tail{PARTOF(wien_rulemark)->UNMARK(wien_tail), ASSIGN(wien_redo, true)}; // remove tail marks that are no longer relevant for us after the last rule !\n"
            + "\tDocument{IF(wien_redo)->CALL(filename.findData)};\n" + "}\n";

    result += "\n// cleaning up:\n" + "wien_tail{->UNMARK(wien_tail)};\n"
            + "wien_rulemark{->UNMARK(wien_rulemark)};\n"
            + "wien_content{->UNMARK(wien_content)};\n";
    return result;
  }

  /**
   * Intentionally empty: the given parameter map is ignored by this learner.
   */
  public void setParameters(Map<String, Object> params) {
  }

}
TOP

Related Classes of org.apache.uima.ruta.textruler.learner.wien.Wien

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.