Examples of org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector

Package org.apache.uima.ruta.textruler.core

Examples of org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector

org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector
TextRulerStatisticsCollector collects data while, for example, testing a rule. It holds a HashMap for the covered positive and negative MLExamples (e.g. of the training documents) and counts the true positives (coveredPositives), false positives (coveredNegatives), and so on. Currently, false negatives (missedPositives) are not counted (see TextRulerExampleDocument.compareOriginalDocumentWithTestCAS), but this functionality can easily be added, since it already exists as commented-out code.

              + RutaEngine.SCRIPT_FILE_EXTENSION));
      FileUtils.saveString2File(allRulesContent, new File(getTempRulesFileName()));


      CAS testCAS = getTestCAS();
      for (TextRulerExampleDocument doc : sortedDocs) {
        TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
        doc.resetAndFillTestCAS(testCAS, target);
        CAS docCAS = doc.getCAS();
        ae.process(testCAS);
        compareOriginalDocumentWithTestCAS(doc, testCAS, target, c, true); // test whole ruleset and
        // collect negative
        // examples


        // now we have some covered positive examples that are good, and
        // maybe some negative examples
        // for that we might create Correction Rules... in order to do
        // that we have to create
        // ShiftExamples and map negative examples (incorrect inserted
        // boundaries) with a specific
        // distance to an original positive example...


        // TODO should that be done in both directions ? left and right
        // ?! what happes if we
        // find two potential examples, one left, one right ? --> for
        // now: use the nearer one. if
        // exactly the same distance, use the one where the wrong tag
        // would be IN the slot filler!
        List<TextRulerExample> correctTags = doc.getPositiveExamples();
        List<TextRulerExample> wrongTags = new ArrayList<TextRulerExample>(
                c.getCoveredNegativeExamples());
        List<TextRulerShiftExample> newExamples = new ArrayList<TextRulerShiftExample>();
        for (TextRulerExample wrongTag : wrongTags) {
          // test, if there's a corresponding positive example
          // somewhere around (within maxDistance)
          List<AnnotationFS> left = TextRulerToolkit.getAnnotationsBeforePosition(docCAS, wrongTag

View Full Code Here


      // TextRulerToolkit.log("RULE: "+rule.getRuleString());
      // testRuleOnTrainingsSet(rule, exampleDocuments.getDocuments());


      // this rule has to at least cover its seed example!!
      TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
      c.addCoveredPositive(example);
      rule.setCoveringStatistics(c);
      slotRules.add(rule);
    }
    initialRuleBaseSize = slotRules.size();
  }

View Full Code Here

      result.add(bestRule);
      rules.remove(0);
      Set<TextRulerExample> coveredExamples = bestRule.getCoveringStatistics()
              .getCoveredPositiveExamples();
      for (TrabalRule rule : rules) {
        TextRulerStatisticsCollector collector = rule.getCoveringStatistics();
        Iterator<TextRulerExample> iterator = coveredExamples.iterator();
        while (iterator.hasNext()) {
          TextRulerExample example = iterator.next();
          if (collector.getCoveredPositiveExamples().contains(example)) {
            collector.getCoveredPositiveExamples().remove(example);
            // if (rule.getErrorType() == ErrorType.ANNOTATION
            // || rule.getErrorType() == ErrorType.DELETING) {
            // collector.incCoveredPositives(-1);
            // } else {
            collector.incCoveredPositives(-1);
            // }
          }
          if (collector.getCoveredPositivesCount() <= 0) {
            break;
          }
        }
        if (rule.getErrorRate() > maxErrorRate
                || rule.getCoveringStatistics().getCoveredPositivesCount() <= 0) {

View Full Code Here

    if (rules.isEmpty())
      return rules;


    List<TextRulerStatisticsCollector> sums = new ArrayList<TextRulerStatisticsCollector>();
    for (TrabalRule each : rules) {
      sums.add(new TextRulerStatisticsCollector());
    }
    List<TextRulerExampleDocument> goldDocs;
    List<TextRulerExampleDocument> additionalDocs;
    goldDocs = documents.getDocuments();
    additionalDocs = additionalDocuments.getDocuments();
    CAS theTestCAS = getTestCAS();
    int counter = 0;
    for (TrabalRule rule : rules) {
      counter++;
      String ruleString = rule.getRuleString();
      String ruleInfo = getRuleInfo(rule);
      System.out.println("testing: " + ruleString);
      if (inducedRules.containsKey(ruleString)) {
        rule.setCoveringStatistics(inducedRules.get(ruleString));
        System.out.println("skipped with " + inducedRules.get(ruleString));
      } else {
        for (int i = 0; i < goldDocs.size(); i++) {
          TextRulerExampleDocument goldDoc = goldDocs.get(i);
          TextRulerExampleDocument additionalDoc = additionalDocs.get(i);
          sendStatusUpdateToDelegate("Testing " + ruleSet + ruleInfo + " on document " + (i + 1)
                  + " of " + goldDocs.size() + " : rule " + counter + " of " + rules.size(),
                  TextRulerLearnerState.ML_RUNNING, false);
          TextRulerStatisticsCollector sumC = new TextRulerStatisticsCollector();
          prepareTestCas(theTestCAS, goldDoc, additionalDoc);
          testRuleOnDocument(rule, goldDoc, additionalDoc, sumC, theTestCAS);
          sums.get(counter - 1).add(sumC);
          int n = sumC.getCoveredNegativesCount();
          int p = sumC.getCoveredPositivesCount();
          int pnorm = p;
          if (pnorm == 0) {
            pnorm = 1;
          }
          if (n / pnorm > maxErrorRate) {
            System.out.println("stopped:" + sumC);
            break;
          }


          if (shouldAbort())
            return rules;
        }
        TextRulerStatisticsCollector c = sums.get(counter - 1);
        rule.setCoveringStatistics(sums.get(counter - 1));
        inducedRules.put(ruleString, c);
      }
    }
    for (int ruleIndex = 0; ruleIndex < rules.size(); ruleIndex++) {

View Full Code Here

    if (shouldAbort())
      return;
    String rStr = getAnnotationRulesString(target.getSingleSlotTypeName());
    for (TextRulerExampleDocument doc : exampleDocuments.getDocuments()) {
      CAS processedCAS = applyScriptOnDocument(rStr, doc, target);
      TextRulerStatisticsCollector scriptStatistics = new TextRulerStatisticsCollector();
      compareOriginalDocumentWithTestCAS(doc, processedCAS, target, scriptStatistics,
              collectNegativeCoveredInstancesWhenTesting());
      for (KEPRule cRule : correctionRules.get(target.getSingleSlotTypeName())) {
        if (shouldAbort())
          break;
        if (cRule.getCoveringStatistics() == null) {
          cRule.setCoveringStatistics(new TextRulerStatisticsCollector());
        }
        processedCAS = applyScriptOnDocument(rStr, doc, target);
        TextRulerStatisticsCollector correctedStats = new TextRulerStatisticsCollector();
        testRuleOnDocument(cRule, doc, correctedStats, processedCAS);
        for (TextRulerExample ex : scriptStatistics.getCoveredNegativeExamples()) {
          if (!correctedStats.getCoveredNegativeExamples().contains(ex)) {
            cRule.getCoveringStatistics().addCoveredNegative(ex);
          }
        }
        for (TextRulerExample ex : scriptStatistics.getCoveredPositiveExamples()) {
          if (!correctedStats.getCoveredPositiveExamples().contains(ex)) {
            cRule.getCoveringStatistics().addCoveredPositive(ex);
          }
        }
        cRule.getCoveringStatistics().reflectCountsFromCoveredExamples();
      }

View Full Code Here

          if (containsWordConstraint)
            break;
        }
      if (!containsWordConstraint) {
        // and calculate intersection of coverings:
        TextRulerStatisticsCollector newCovering;
        if (currentCovering != null)
          newCovering = getCoveringIntersection(currentCovering,
                  candidateRule.getCoveringStatistics());
        else
          newCovering = candidateRule.getCoveringStatistics();


        // prune all rules that go below our minCoveredPositives
        // threshold!
        if (newCovering.getCoveredPositivesCount() >= minCoveredPositives) {
          // add rule to configuration tuple
          currentRuleTuple.add(candidateRule);


          if (!recursiveCreateAllRuleCombinations(startRules, ctxStartRules, index + 1,
                  currentRuleTuple, newCovering, debugRuleCollector))

View Full Code Here

  }


  protected TextRulerStatisticsCollector getCoveringIntersection(
          final TextRulerStatisticsCollector c1, final TextRulerStatisticsCollector c2) {
    // calculate intersections of coverings:
    TextRulerStatisticsCollector resultC = new TextRulerStatisticsCollector(c1);


    resultC.getCoveredPositiveExamples().retainAll(c2.getCoveredPositiveExamples());
    resultC.getCoveredNegativeExamples().retainAll(c2.getCoveredNegativeExamples());
    resultC.reflectCountsFromCoveredExamples();


    return resultC;
  }

View Full Code Here


      // TextRulerToolkit.log("RULE: "+rule.getRuleString());
      // testRuleOnTrainingsSet(rule, exampleDocuments.getDocuments());


      // this rule has to at least cover its seed example!!
      TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
      c.addCoveredPositive(example);
      rule.setCoveringStatistics(c);
      slotRules.add(rule);
    }
    initialRuleBaseSize = slotRules.size();
  }

View Full Code Here

              + RutaEngine.SCRIPT_FILE_EXTENSION));
      FileUtils.saveString2File(allRulesContent, new File(getTempRulesFileName()));


      CAS testCAS = getTestCAS();
      for (TextRulerExampleDocument doc : sortedDocs) {
        TextRulerStatisticsCollector c = new TextRulerStatisticsCollector();
        doc.resetAndFillTestCAS(testCAS, target);
        CAS docCAS = doc.getCAS();
        ae.process(testCAS);
        compareOriginalDocumentWithTestCAS(doc, testCAS, target, c, true); // test whole ruleset and
        // collect negative
        // examples


        // now we have some covered positive examples that are good, and
        // maybe some negative examples
        // for that we might create Correction Rules... in order to do
        // that we have to create
        // ShiftExamples and map negative examples (incorrect inserted
        // boundaries) with a specific
        // distance to an original positive example...


        // TODO should that be done in both directions ? left and right
        // ?! what happes if we
        // find two potential examples, one left, one right ? --> for
        // now: use the nearer one. if
        // exactly the same distance, use the one where the wrong tag
        // would be IN the slot filler!
        List<TextRulerExample> correctTags = doc.getPositiveExamples();
        List<TextRulerExample> wrongTags = new ArrayList<TextRulerExample>(
                c.getCoveredNegativeExamples());
        List<TextRulerShiftExample> newExamples = new ArrayList<TextRulerShiftExample>();
        for (TextRulerExample wrongTag : wrongTags) {
          // test, if there's a corresponding positive example
          // somewhere around (within maxDistance)
          List<AnnotationFS> left = TextRulerToolkit.getAnnotationsBeforePosition(docCAS, wrongTag

View Full Code Here

          if (containsWordConstraint)
            break;
        }
      if (!containsWordConstraint) {
        // and calculate intersection of coverings:
        TextRulerStatisticsCollector newCovering;
        if (currentCovering != null)
          newCovering = getCoveringIntersection(currentCovering,
                  candidateRule.getCoveringStatistics());
        else
          newCovering = candidateRule.getCoveringStatistics();


        // prune all rules that go below our minCoveredPositives
        // threshold!
        if (newCovering.getCoveredPositivesCount() >= minCoveredPositives) {
          // add rule to configuration tuple
          currentRuleTuple.add(candidateRule);


          if (!recursiveCreateAllRuleCombinations(startRules, ctxStartRules, index + 1,
                  currentRuleTuple, newCovering, debugRuleCollector))

View Full Code Here

0 1

TOP

Related Classes of org.apache.uima.ruta.textruler.core.TextRulerStatisticsCollector

org.apache.uima.ruta.textruler.learner.kep.KEPLearner

org.apache.uima.ruta.textruler.learner.lp2.BasicLP2

org.apache.uima.ruta.textruler.learner.lp2.OptimizedLP2

org.apache.uima.ruta.textruler.learner.rapier.Rapier

org.apache.uima.ruta.textruler.learner.trabal.TrabalLearner

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact: coftware#gmail.com.