Package edu.stanford.nlp.ling

Examples of edu.stanford.nlp.ling.CoreLabel


    return featuresCpCp2C;
  }


  protected Collection<String> featuresCpCp2Cp3C(PaddedList<IN> cInfo, int loc) {
    CoreLabel c = cInfo.get(loc);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);

    Collection<String> featuresCpCp2Cp3C = new ArrayList<String>();

    if (flags.useTaggySequences) {
      if (flags.useTags) {
        if (flags.maxLeft >= 3 && !flags.dontExtendTaggy) {
          featuresCpCp2Cp3C.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS");
          if (flags.useTaggySequencesShapeInteraction) {
            featuresCpCp2Cp3C.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTTS-CS");
          }
        }
      }
      if (flags.useDistSim) {
        if (flags.maxLeft >= 3 && !flags.dontExtendTaggy) {
          featuresCpCp2Cp3C.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1");
          if (flags.useTaggySequencesShapeInteraction) {
            featuresCpCp2Cp3C.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-DISTSIM_TTTS1-CS");
          }
        }
      }
    }
View Full Code Here


      int ngTag = classIndex.indexOf("G");
      //int bgTag = classIndex.indexOf(BACKGROUND);
      int bgTag = classIndex.indexOf(flags.backgroundSymbol);

      for (int i = 0, dSize = document.size(); i < dSize; i++) {
        CoreLabel wordInfo =document.get(i);

        if ("NEWGENE".equals(wordInfo.get(CoreAnnotations.GazAnnotation.class))) {
          int start = i;
          int j;
          for (j = i; j < document.size(); j++) {
            wordInfo = document.get(j);
            if (!"NEWGENE".equals(wordInfo.get(CoreAnnotations.GazAnnotation.class))) {
              break;
            }
          }
          int end = j;
          //int end = i + 1;

          int winStart = Math.max(0, start - 4);
          int winEnd = Math.min(tags.length, end + 4);
          // clear a window around the sequences
          for (j = winStart; j < winEnd; j++) {
            copy[j] = bgTag;
          }

          // score as nongene
          double bgScore = 0.0;
          for (j = start; j < end; j++) {
            double[] scores = ts.scoresOf(copy, j);
            scores = Scorer.recenter(scores);
            bgScore += scores[bgTag];
          }

          // first pass, compute all of the scores
          ClassicCounter<Pair<Integer,Integer>> prevScores = new ClassicCounter<Pair<Integer,Integer>>();
          for (j = start; j < end; j++) {
            // clear the sequence
            for (int k = start; k < end; k++) {
              copy[k] = bgTag;
            }

            // grow the sequence from j until the end
            for (int k = j; k < end; k++) {
              copy[k] = ngTag;
              // score the sequence
              double ngScore = 0.0;
              for (int m = start; m < end; m++) {
                double[] scores = ts.scoresOf(copy, m);
                scores = Scorer.recenter(scores);
                ngScore += scores[tags[m]];
              }
              prevScores.incrementCount(new Pair<Integer,Integer>(Integer.valueOf(j), Integer.valueOf(k)), ngScore - bgScore);
            }
          }
          for (j = start; j < end; j++) {
            // grow the sequence from j until the end
            for (int k = j; k < end; k++) {
              double score = prevScores.getCount(new Pair<Integer,Integer>(Integer.valueOf(j), Integer.valueOf(k)));
              Pair<Integer, Integer> al = new Pair<Integer,Integer>(Integer.valueOf(j - 1), Integer.valueOf(k)); // adding a word to the left
              Pair<Integer, Integer> ar = new Pair<Integer,Integer>(Integer.valueOf(j), Integer.valueOf(k + 1)); // adding a word to the right
              Pair<Integer, Integer> sl = new Pair<Integer,Integer>(Integer.valueOf(j + 1), Integer.valueOf(k)); // subtracting word from left
              Pair<Integer, Integer> sr = new Pair<Integer,Integer>(Integer.valueOf(j), Integer.valueOf(k - 1)); // subtracting word from right

              // make sure the score is greater than all its neighbors (one add or subtract)
              if (score >= flags.newgeneThreshold && (!prevScores.containsKey(al) || score > prevScores.getCount(al)) && (!prevScores.containsKey(ar) || score > prevScores.getCount(ar)) && (!prevScores.containsKey(sl) || score > prevScores.getCount(sl)) && (!prevScores.containsKey(sr) || score > prevScores.getCount(sr))) {
                StringBuilder sb = new StringBuilder();
                wordInfo = document.get(j);
                String docId = wordInfo.get(CoreAnnotations.IDAnnotation.class);
                String startIndex = wordInfo.get(CoreAnnotations.PositionAnnotation.class);
                wordInfo = document.get(k);
                String endIndex = wordInfo.get(CoreAnnotations.PositionAnnotation.class);
                for (int m = j; m <= k; m++) {
                  wordInfo = document.get(m);
                  sb.append(wordInfo.word());
                  sb.append(' ');
                }
                /*System.err.println(sb.toString()+"score:"+score+
                  " al:"+prevScores.getCount(al)+
                  " ar:"+prevScores.getCount(ar)+
                  "  sl:"+prevScores.getCount(sl)+" sr:"+ prevScores.getCount(sr));*/
                System.out.println(docId + '|' + startIndex + ' ' + endIndex + '|' + sb.toString().trim());
              }
            }
          }

          // restore the original tags
          for (j = winStart; j < winEnd; j++) {
            copy[j] = tags[j];
          }
          i = end;
        }
      }
    }

    for (int i = 0, docSize = document.size(); i < docSize; i++) {
      CoreLabel lineInfo = document.get(i);
      String answer = classIndex.get(tags[i]);
      lineInfo.set(CoreAnnotations.AnswerAnnotation.class, answer);
    }

    if (flags.justify && classifier instanceof LinearClassifier) {
      LinearClassifier<String, String> lc = (LinearClassifier<String, String>) classifier;
      if (flags.dump) {
        lc.dump();
      }
      for (int i = 0, docSize = document.size(); i < docSize; i++) {
        CoreLabel lineInfo = document.get(i);
        System.err.print("@@ Position is: " + i + ": ");
        System.err.println(lineInfo.word() + ' ' + lineInfo.get(CoreAnnotations.AnswerAnnotation.class));
        lc.justificationOf(makeDatum(document, i, featureFactories));
      }
    }

//    document.remove(0);
View Full Code Here

  }

  protected Collection<String> featuresCpCp2Cp3Cp4C(PaddedList<IN> cInfo, int loc) {
    Collection<String> featuresCpCp2Cp3Cp4C = new ArrayList<String>();

    CoreLabel p = cInfo.get(loc - 1);

    if (flags.maxLeft >= 4) {
      if (flags.useLongSequences) {
        featuresCpCp2Cp3Cp4C.add("PPPPSEQ");
      }
View Full Code Here

    return featuresCpCp2Cp3Cp4C;
  }


  protected Collection<String> featuresCnC(PaddedList<IN> cInfo, int loc) {
    CoreLabel c = cInfo.get(loc);

    Collection<String> featuresCnC = new ArrayList<String>();

    if (flags.useNext) {
      if (flags.useSequences && flags.useNextSequences) {
View Full Code Here

    return featuresCnC;
  }


  protected Collection<String> featuresCpCnC(PaddedList<IN> cInfo, int loc) {
    CoreLabel c = cInfo.get(loc);

    Collection<String> featuresCpCnC = new ArrayList<String>();

    if (flags.useNext && flags.usePrev) {
      if (flags.useSequences && flags.usePrevSequences && flags.useNextSequences) {
View Full Code Here

  private Collection<String> occurrencePatterns(PaddedList<IN> cInfo, int loc) {
    // features on last Cap
    String word = getWord(cInfo.get(loc));
    String nWord = getWord(cInfo.get(loc + reverse(1)));
    CoreLabel p = cInfo.get(loc - reverse(1));
    String pWord = getWord(p);
    // System.err.println(word+" "+nWord);
    if (!(isNameCase(word) && noUpperCase(nWord) && hasLetter(nWord) && hasLetter(pWord) && p != cInfo.getPad())) {
      return Collections.singletonList("NO-OCCURRENCE-PATTERN");
    }
View Full Code Here

          }
          answerArrays.add(Arrays.asList(seq));
        }
      }
      for (int i = 0; i < leng; i++) {
        CoreLabel wordInfo = doc.get(i);
        classIndex.add(wordInfo.get(CoreAnnotations.AnswerAnnotation.class));
      }

      if (flags.useReverse) {
        Collections.reverse(doc);
      }
View Full Code Here

        features.addAll(feats);
      }
    }

    printFeatures(pInfo.get(loc), features);
    CoreLabel c = info.get(loc);
    return new BasicDatum<String, String>(features, c.get(CoreAnnotations.AnswerAnnotation.class));
  }
View Full Code Here

    Matcher m = null;
    if (parseInsidePattern != null) {
      m = parseInsidePattern.matcher(""); // create once as performance hack
    }
    for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext(); ) {
      CoreLabel obj = tokenizer.next();
      // String origStr = obj.get(CoreAnnotations.TextAnnotation.class).replaceFirst("\n+$", ""); // DanC added this to fix a lexer bug, hopefully now corrected
      String origStr = obj.get(CoreAnnotations.TextAnnotation.class);
      String str;
      if (lowerCase) {
        str = origStr.toLowerCase(Locale.ENGLISH);
        obj.set(CoreAnnotations.TextAnnotation.class, str);
      } else {
        str = origStr;
      }
      if (m != null && m.reset(origStr).matches()) {
        printing = m.group(1).isEmpty(); // turn on printing if no end element slash, turn it off it there is
      } else if (printing) {
        if (dump) {
          // after having checked for tags, change str to be exhaustive
          str = obj.toString();
        }
        if (preserveLines) {
          if (PTBLexer.NEWLINE_TOKEN.equals(origStr)) {
            beginLine = true;
            writer.newLine();
View Full Code Here

      int possibleNumStart = -1;
      Number possibleNumEndUnit = null;
      Number lastUnit = null;
      // Check if we need to split matched chunk up more
      for (int i = matcher.start(); i < matcher.end(); i++) {
        CoreLabel token = tokens.get(i);
        CoreLabel prev = (i > matcher.start())? tokens.get(i - 1): null;
        Number num = token.get(CoreAnnotations.NumericValueAnnotation.class);
        Number prevNum = (prev != null)? prev.get(CoreAnnotations.NumericValueAnnotation.class):null;
        String w = token.word();
        w = w.trim().toLowerCase();
        switch (w) {
          case ",":
            if (lastUnit != null && lastUnitPos == i - 1) {
              // OKAY, this may be one big number
              possibleNumEnd = i;
              possibleNumEndUnit = lastUnit;
            } else {
              // Not one big number
              if (numStart < i) {
                numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                numStart = i + 1;
                possibleNumEnd = -1;
                possibleNumEndUnit = null;
                lastUnit = null;
                lastUnitPos = -1;
              }
            }
            if (numStart == i) {
              numStart = i + 1;
            }
            break;
          case "and":
            // Check if number before and was unit
            String prevWord = prev.word();
            if (lastUnitPos == i - 1 || (lastUnitPos == i - 2 && ",".equals(prevWord))) {
              // Okay
            } else {
              // Two separate numbers
              if (numStart < possibleNumEnd) {
                numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                if (possibleNumStart >= possibleNumEnd) {
                  numStart = possibleNumStart;
                } else {
                  numStart = i + 1;
                }
              } else if (numStart < i) {
                numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                numStart = i + 1;
              }
              if (lastUnitPos < numStart) {
                lastUnit = null;
                lastUnitPos = -1;
              }
              possibleNumEnd = -1;
              possibleNumEndUnit = null;
            }
            break;
          default:
            // NUMBER or ORDINAL
            String numType = token.get(CoreAnnotations.NumericTypeAnnotation.class);
            if ("UNIT".equals(numType)) {
              // Compare this unit with previous
              if (lastUnit == null || lastUnit.longValue() > num.longValue()) {
                // lastUnit larger than this unit
                // maybe four thousand two hundred?
                // OKAY, probably one big number
              } else {
                if (numStart < possibleNumEnd) {
                  // Units are increasing - check if this unit is >= unit before "," (if so, need to split into chunks)
                  // Not one big number  ( had a comma )
                  if (num.longValue() >= possibleNumEndUnit.longValue()) {
                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                    if (possibleNumStart >= possibleNumEnd) {
                      numStart = possibleNumStart;
                    } else {
                      numStart = i;
                    }
                    possibleNumEnd = -1;
                    possibleNumEndUnit = null;
                  }
                } else {
                  // unit is increasing - can be okay, maybe five hundred thousand?
                  // what about four hundred five thousand
                  // unit might also be the same, as in thousand thousand,
                  // which we convert to million
                }
              }
              lastUnit = num;
              lastUnitPos = i;
            } else {
              // Normal number
              if (num == null) {
                logger.warning("NO NUMBER: " + token.word());
                continue;
              }
              if (prevNum != null) {
                if (num.doubleValue() > 0) {
                  if (num.doubleValue() < 10) {
                    // This number is a digit
                    // Treat following as two separate numbers
                    //    \d+ [0-9]
                    //    [one to nine]  [0-9]
                    if (NumberNormalizer.numPattern.matcher(prev.word()).matches() ||
                        prevNum.longValue() < 10 || prevNum.longValue() % 10 != 0) {
                      // two separate numbers
                      if (numStart < i) {
                        numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                      }
                      numStart = i;
                      possibleNumEnd = -1;
                      possibleNumEndUnit = null;
                      lastUnit = null;
                      lastUnitPos = -1;
                    }
                  } else {
                    String prevNumType = prev.get(CoreAnnotations.NumericTypeAnnotation.class);
                    if ("UNIT".equals(prevNumType)) {
                      // OKAY
                    } else if (!ordinalUnitPattern.matcher(w).matches()) {
                      // Start of new number
                      if (numStart < i) {
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.ling.CoreLabel

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.