Examples of org.apache.uima.ruta.textruler.core.TextRulerRulePattern

Package org.apache.uima.ruta.textruler.core

Examples of org.apache.uima.ruta.textruler.core.TextRulerRulePattern

org.apache.uima.ruta.textruler.core.TextRulerRulePattern
TextRulerRulePattern is an ordered list of rule items that provides some special functionality for dealing with rule patterns, such as finding sub-patterns. Hint: this is a very basic implementation and could surely be optimized ;-)

      int shortest = Integer.MAX_VALUE;
      for (TextRulerRulePattern p : rightContexts)
        shortest = p.size() < shortest ? p.size() : shortest;
      boolean found = false;
      for (int len = 1; len <= shortest; len++) {
        TextRulerRulePattern subPattern = rightContexts.get(0).subPattern(0, len);
        if (testConstraint1(subPattern, k)) {
          // for (TextRulerRuleItem i : subPattern)
          // ((WienRuleItem)i).getWordConstraint().setGeneralizeLinkMarkUp(true);
          patternPairs.get(k).r = subPattern;
          TextRulerToolkit.log("right " + k + ": " + subPattern);

View Full Code Here

    for (int k = 1; k < slotNames.length; k++) {
      List<TextRulerRulePattern> leftContexts = getLeftContextForSlot(doc, k);
      int shortest = Integer.MAX_VALUE;
      for (TextRulerRulePattern p : leftContexts)
        shortest = p.size() < shortest ? p.size() : shortest;
      TextRulerRulePattern sourcePattern = leftContexts.get(0);
      boolean found = false;
      for (int len = 1; len <= shortest; len++) {
        // get suffix:
        TextRulerRulePattern subPattern = sourcePattern.subPattern(sourcePattern.size() - len, len);
        if (testConstraint2(subPattern, k)) {
          patternPairs.get(k).l = subPattern;
          for (TextRulerRuleItem i : subPattern)
            ((WienRuleItem) i).getWordConstraint().setGeneralizeLinkMarkUp(true);
          TextRulerToolkit.log("left " + k + ": " + subPattern);

View Full Code Here

  }


  protected boolean findHeadTailAndL1Patterns() {
    List<TextRulerExampleDocument> docs = exampleDocuments.getDocuments();
    TextRulerExampleDocument doc0 = docs.get(0);
    TextRulerRulePattern head = new TextRulerRulePattern();
    TextRulerRulePattern tail = new TextRulerRulePattern();
    getPageHeadAndTailPortion(doc0, head, tail);


    final class HLCandidate {
      public TextRulerRulePattern head = new TextRulerRulePattern();


      public TextRulerRulePattern l1 = new TextRulerRulePattern();
    }


    // a small optimization:
    // find out the maximum possible length for l1 in doc0 since l1 is much
    // smaller than the possible head length!
    List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc0);
    int shortestL1 = head.size() - 1;
    for (TextRulerRulePattern its : interTupleSeparators)
      shortestL1 = its.size() < shortestL1 ? its.size() : shortestL1;


    List<HLCandidate> hlCandidates = new ArrayList<HLCandidate>();
    // create candidates for each separation of the head and tail patterns:
    for (int separator = head.size() - 1; separator > 0; separator--) {
      HLCandidate c = new HLCandidate();
      for (int i = 0; i < head.size(); i++) {
        if (i < separator)
          c.head.add(head.get(i));
        else {
          WienRuleItem it = (WienRuleItem) head.get(i).copy();
          it.getWordConstraint().setGeneralizeLinkMarkUp(true);
          c.l1.add(it);
        }
      }
      hlCandidates.add(c);
      TextRulerToolkit.log(c.head.size() + " vs. " + c.l1.size());
      if (c.l1.size() >= shortestL1)
        break;
    }


    long total = 0;


    // get total h l1 t combination count:
    long tCand = (tail.size() * (tail.size() + 1)) / 2;
    for (HLCandidate c : hlCandidates) {
      total += ((c.head.size() - 1) * (c.head.size())) / 2;
    }
    total *= tCand;


    long current = 0;
    int oldPercent = -1;


    for (HLCandidate c : hlCandidates) {
      // for each "candidate" which represents a l1 suffix pattern of the
      // head tokens and a rest pattern for the h pattern,
      // we have to create every sub pattern of the remaining h pattern as
      // a h candidate:
      TextRulerRulePattern l1 = c.l1;
      TextRulerRulePattern h = null;


      boolean l1Sucks = false;


      for (int endI = c.head.size() - 1; endI > 0; endI--) {
        for (int startI = endI; startI > 0; startI--) {
          h = new TextRulerRulePattern();
          for (int i = startI; i <= endI; i++)
            h.add(c.head.get(i));


          // now for each h candidate we have to create each t
          // candidate:
          TextRulerRulePattern t = null;
          for (int tstartI = 0; tstartI < tail.size(); tstartI++) {
            for (int tendI = tstartI; tendI < tail.size(); tendI++) {
              int percent = Math.round(((float) current * 100 / total));
              if (percent != oldPercent) {
                oldPercent = percent;
                if (percent > 100)
                  percent = 100;
                // TextRulerToolkit.log(current+" / "+total);
                sendStatusUpdateToDelegate("Testing C3, " + percent + "%",
                        TextRulerLearnerState.ML_RUNNING, false);
              }
              if (shouldAbort())
                return false;
              current++;


              t = new TextRulerRulePattern();
              for (int i = tstartI; i <= tendI; i++)
                t.add(tail.get(i));


              // now we have a possible candidate triple: h, t and
              // l1:


              constraint3ReturnType c3Result = testConstraint3(h, t, l1);

View Full Code Here

        TextRulerAnnotation lastOf1 = exampleAnnotations1[exampleAnnotations1.length - 1];
        TextRulerAnnotation firstOf2 = exampleAnnotations2[0];
        List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, lastOf1
                .getEnd(), firstOf2.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
                slotNames, filterSet), tokenType);
        TextRulerRulePattern thePattern = new TextRulerRulePattern();
        for (AnnotationFS afs : theTokens)
          thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
        if (thePattern.size() > 0)
          result.add(thePattern);


      }
      interTupelSeparatorsCache.put(key, result);
      return result;

View Full Code Here

                TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
      else
        theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, slotAnnotation.getEnd(),
                nextSlotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(
                        slotNames, filterSet), tokenType);
      TextRulerRulePattern thePattern = new TextRulerRulePattern();
      for (AnnotationFS afs : theTokens)
        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
      if (thePattern.size() > 0)
        result.add(thePattern);
    }
    return result;
  }

View Full Code Here

  public List<RapierRule> specializePreFiller(RapierRule curRule, int n) {
    RapierRule baseRule1 = curRule.getParent1();
    RapierRule baseRule2 = curRule.getParent2();
    int n1 = curRule.getParent1PreFiller_n();
    int n2 = curRule.getParent2PreFiller_n();
    TextRulerRulePattern preFiller1 = baseRule1.getPreFillerPattern();
    TextRulerRulePattern preFiller2 = baseRule2.getPreFillerPattern();
    int preFiller1MaxIndex = preFiller1.size() - n1 - 1;
    int preFiller2MaxIndex = preFiller2.size() - n2 - 1;


    // generate 3 different possible sets for generalizations:


    // 1. n vs. n-1 (n elements of baserule1, n-1 of baserule2)
    TextRulerRulePattern consideredPreFiller1 = new TextRulerRulePattern();
    TextRulerRulePattern consideredPreFiller2 = new TextRulerRulePattern();
    for (int i = preFiller1.size() - n; i >= 0 && i <= preFiller1MaxIndex; i++)
      consideredPreFiller1.add(preFiller1.get(i));
    for (int i = preFiller2.size() - n + 1; i >= 0 && i <= preFiller2MaxIndex; i++)
      consideredPreFiller2.add(preFiller2.get(i));
    List<TextRulerRulePattern> genList1 = null;
    if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0)
      genList1 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
              consideredPreFiller1, consideredPreFiller2);


    List<TextRulerRulePattern> genList2 = null;
    List<TextRulerRulePattern> genList3 = null;


    if (useAllGenSetsAtSpecialization) // due to performance reasons the
    // user can switch this off
    {
      // 2. n-1 vs. n (n-1 elements of baserule1, n of baserule2)
      consideredPreFiller1.clear();
      consideredPreFiller2.clear();
      for (int i = preFiller1.size() - n + 1; i >= 0 && i <= preFiller1MaxIndex; i++)
        consideredPreFiller1.add(preFiller1.get(i));
      for (int i = preFiller2.size() - n; i >= 0 && i <= preFiller2MaxIndex; i++)
        consideredPreFiller2.add(preFiller2.get(i));


      if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0)
        genList2 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
                consideredPreFiller1, consideredPreFiller2);


      // 3. n vs. n (n elements of baserule1, n of baserule2)
      consideredPreFiller1.clear();
      consideredPreFiller2.clear();
      for (int i = preFiller1.size() - n; i >= 0 && i <= preFiller1MaxIndex; i++)
        consideredPreFiller1.add(preFiller1.get(i));
      for (int i = preFiller2.size() - n; i >= 0 && i <= preFiller2MaxIndex; i++)
        consideredPreFiller2.add(preFiller2.get(i));
      if (consideredPreFiller1.size() + consideredPreFiller2.size() > 0)
        genList3 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
                consideredPreFiller1, consideredPreFiller2);
    }


    // TODO optimize and don't store all 3 genLists ! but for debugging

View Full Code Here

                0, TextRulerToolkit.getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
      else
        theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas, prevSlotAnnotation.getEnd(),
                slotAnnotation.getBegin(), TextRulerToolkit.getFilterSetWithSlotNames(slotNames,
                        filterSet), tokenType);
      TextRulerRulePattern thePattern = new TextRulerRulePattern();
      for (AnnotationFS afs : theTokens)
        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc), true));
      if (thePattern.size() > 0)
        result.add(thePattern);
    }
    return result;
  }

View Full Code Here

    for (TextRulerExample e : examples) {
      TextRulerAnnotation slotAnnotation = e.getAnnotations()[slotIndex];
      List<AnnotationFS> theTokens = TextRulerToolkit.getAnnotationsWithinBounds(cas,
              slotAnnotation.getBegin(), slotAnnotation.getEnd(), TextRulerToolkit
                      .getFilterSetWithSlotNames(slotNames, filterSet), tokenType);
      TextRulerRulePattern thePattern = new TextRulerRulePattern();
      for (AnnotationFS afs : theTokens)
        thePattern.add(new WienRuleItem(new TextRulerAnnotation(afs, doc)));
      if (thePattern.size() > 0)
        result.add(thePattern);
    }
    return result;
  }

View Full Code Here


  protected constraint3ReturnType testConstraint3(TextRulerExampleDocument doc,
          TextRulerRulePattern h, TextRulerRulePattern t, TextRulerRulePattern l1) {
    final boolean logReasons = false;


    TextRulerRulePattern head = new TextRulerRulePattern();
    TextRulerRulePattern tail = new TextRulerRulePattern();


    getPageHeadAndTailPortion(doc, head, tail);


    // 1: l1 must be a proper suffix of the portion between the end of h and
    // the first slot filler:
    // (head / h) / l1 = l1


    int hPos = head.find(h);


    // TODO: precalculate this outside this method?
    TextRulerRulePattern restForL1 = head.subPattern(hPos + h.size(), -1).copy();
    for (TextRulerRuleItem it : restForL1)
      ((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
    int l1Pos = restForL1.find(l1);
    if (l1Pos < 0 || l1Pos != restForL1.size() - l1.size()) {
      TextRulerToolkit.logIf(logReasons, "REASON 1\n\tl1         \t" + l1 + "\n\trestforl1\t"
              + restForL1);
      return constraint3ReturnType.C3_L1CandidateSuffixError;
    }


    // 2: t must not occur in the subpattern after h and before l1
    if (l1Pos > 0) {
      TextRulerRulePattern patternBetweenHandL1 = restForL1.subPattern(0, l1Pos);
      if (patternBetweenHandL1.size() >= t.size()) {
        if (patternBetweenHandL1.find(t) >= 0) {
          TextRulerToolkit.logIf(logReasons, "REASON 2");
          return constraint3ReturnType.C3_TailCandidateH_L1Error;
        }
      }
    }


    // 2a: addons, not specified in WIEN paper !!
    TextRulerRulePattern lastSlotRightPattern = patternPairs.get(slotNames.length - 1).r;
    if (t.find(lastSlotRightPattern) == 0) // the right boundary of the last
    // slot may not be part of the
    // tail pattern!
    {
      TextRulerToolkit.logIf(logReasons, "REASON 3: " + lastSlotRightPattern + "\tTail: " + t);
      return constraint3ReturnType.C3_TailCandidateRK_PrefixError;
    }


    int tPos = tail.find(t);
    if (tPos < 0) {
      TextRulerToolkit.logIf(logReasons, "REASON 4");
      return constraint3ReturnType.C3_TailCandidateNotFoundError;
    } // this is an own constraint definition: if a document does not have
    // the tail in it,
    // what should we do then ? is this a n error or is this okay since the
    // document may not have any tail after the data ?


    // 3: l1 must not precede t in the page's tail:
    int l1tPos = tail.find(l1);
    if (l1tPos >= 0) // l1 occurs in the page's tail:
    {
      if (l1tPos < tPos) {
        TextRulerToolkit.logIf(logReasons, "REASON 5");
        return constraint3ReturnType.C3_TailCandidateSucceedsL1InTailError;
      }
    }


    List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc);


    for (TextRulerRulePattern itSep : interTupleSeparators) {
      // 4: l1 must be a proper suffix of each of the inter-tuple
      // separators:
      TextRulerRulePattern itSepCopy = itSep.copy();
      for (TextRulerRuleItem it : itSepCopy)
        ((WienRuleItem) it).getWordConstraint().setGeneralizeLinkMarkUp(true);
      int l1itsPos = itSepCopy.find(l1);
      if (l1itsPos < 0 || l1itsPos != itSepCopy.size() - l1.size()) {
        TextRulerToolkit.logIf(logReasons, "REASON 6: \n\tl1\t" + l1 + "\n\titSep\t" + itSep);
        return constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError;
      }


      // 5: t must never precede l1 in any inter-tuple separator:

View Full Code Here

    }
    RapierRule baseRule1 = curRule.getParent1();
    RapierRule baseRule2 = curRule.getParent2();
    int n1 = curRule.getParent1PostFiller_n();
    int n2 = curRule.getParent2PostFiller_n();
    TextRulerRulePattern postFiller1 = baseRule1.getPostFillerPattern();
    TextRulerRulePattern postFiller2 = baseRule2.getPostFillerPattern();
    int postFiller1MinIndex = n1;
    int postFiller2MinIndex = n2;


    // generate 3 different possible sets for generalizations:


    // 1. n vs. n-1 (n elements of baserule1, n-1 of baserule2)
    TextRulerRulePattern consideredPostFiller1 = new TextRulerRulePattern();
    TextRulerRulePattern consideredPostFiller2 = new TextRulerRulePattern();
    for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n; i++)
      consideredPostFiller1.add(postFiller1.get(i));
    for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n - 1; i++)
      consideredPostFiller2.add(postFiller2.get(i));
    List<TextRulerRulePattern> genList1 = null;
    if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0)
      genList1 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
              consideredPostFiller1, consideredPostFiller2);


    // 2. n-1 vs. n (n-1 elements of baserule1, n of baserule2)
    consideredPostFiller1.clear();
    consideredPostFiller2.clear();
    for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n - 1; i++)
      consideredPostFiller1.add(postFiller1.get(i));
    for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n; i++)
      consideredPostFiller2.add(postFiller2.get(i));
    List<TextRulerRulePattern> genList2 = null;
    if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0)
      genList2 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
              consideredPostFiller1, consideredPostFiller2);


    // 3. n vs. n (n elements of baserule1, n of baserule2)
    consideredPostFiller1.clear();
    consideredPostFiller2.clear();
    for (int i = postFiller1MinIndex; i < postFiller1.size() && i < n; i++)
      consideredPostFiller1.add(postFiller1.get(i));
    for (int i = postFiller2MinIndex; i < postFiller2.size() && i < n; i++)
      consideredPostFiller2.add(postFiller2.get(i));
    List<TextRulerRulePattern> genList3 = null;
    if (consideredPostFiller1.size() + consideredPostFiller2.size() > 0)
      genList3 = RapierGeneralizationHelper.getGeneralizationsForRuleItemPatterns(
              consideredPostFiller1, consideredPostFiller2);


    // TODO optimize and don't store all 3 genLists ! but for debugging
    // purposes we keep them for now !

View Full Code Here

0 1 2 3 4

TOP

Related Classes of org.apache.uima.ruta.textruler.core.TextRulerRulePattern

org.apache.uima.ruta.textruler.learner.kep.KEPLearner

org.apache.uima.ruta.textruler.learner.kep.KEPRule

org.apache.uima.ruta.textruler.learner.lp2.NaiveLP2

org.apache.uima.ruta.textruler.learner.rapier.Rapier

org.apache.uima.ruta.textruler.learner.rapier.RapierGeneralizationHelper

org.apache.uima.ruta.textruler.learner.rapier.RapierRule

org.apache.uima.ruta.textruler.learner.whisk.generic.Whisk

org.apache.uima.ruta.textruler.learner.whisk.generic.WhiskRule

org.apache.uima.ruta.textruler.learner.whisk.token.Whisk

org.apache.uima.ruta.textruler.learner.whisk.token.WhiskRule

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.