Package edu.stanford.nlp.ling

Examples of edu.stanford.nlp.ling.CoreLabel


  private final Pattern titlePattern = Pattern.compile("(?:Mr|Ms|Mrs|Dr|Miss|Sen|Judge|Sir)\\.?"); // todo: should make static final and add more titles


  protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
    CoreLabel p3 = cInfo.get(loc - 3);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel c = cInfo.get(loc);
    CoreLabel n = cInfo.get(loc + 1);
    CoreLabel n2 = cInfo.get(loc + 2);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String nWord = getWord(n);
    String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class);
    String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class);
    String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class);

    Collection<String> featuresC = new ArrayList<String>();

    if (flags.useDistSim) {
      distSimAnnotate(cInfo);
    }

    if (flags.useBagOfWords) {
      for (IN word : cInfo) {
        featuresC.add(getWord(word) + "-BAGOFWORDS");
      }
    }

    if (flags.useDistSim && flags.useMoreTags) {
      featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD");
    }


    if (flags.useDistSim) {
      featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM");
    }


    if (flags.useTitle) {
      Matcher m = titlePattern.matcher(cWord);
      if (m.matches()) {
        featuresC.add("IS_TITLE");
      }
    }


    if (flags.useInternal && flags.useExternal ) {

      if (flags.useWord) {
        featuresC.add(cWord + "-WORD");
      }

      if (flags.use2W) {
        featuresC.add(getWord(p2) + "-P2W");
        featuresC.add(getWord(n2) + "-N2W");
      }

      if (flags.useLC) {
        featuresC.add(cWord.toLowerCase() + "-CL");
        featuresC.add(pWord.toLowerCase() + "-PL");
        featuresC.add(nWord.toLowerCase() + "-NL");
      }

      if (flags.useUnknown) { // for true casing
        featuresC.add(c.get(CoreAnnotations.UnknownAnnotation.class)+"-UNKNOWN");
        featuresC.add(p.get(CoreAnnotations.UnknownAnnotation.class)+"-PUNKNOWN");
        featuresC.add(n.get(CoreAnnotations.UnknownAnnotation.class)+"-NUNKNOWN");
      }

      if (flags.useLemmas) {
        String lem = c.getString(CoreAnnotations.LemmaAnnotation.class);
        if (! "".equals(lem)) {
          featuresC.add(lem + "-LEM");
        }
      }
      if (flags.usePrevNextLemmas) {
        String plem = p.getString(CoreAnnotations.LemmaAnnotation.class);
        String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class);
        if (! "".equals(plem)) {
          featuresC.add(plem + "-PLEM");
        }
        if (! "".equals(nlem)) {
          featuresC.add(nlem + "-NLEM");
        }
      }

      if (flags.checkNameList) {
        try {
          if (lastNames == null) {
            lastNames = Generics.newHashSet();

            for (String line : ObjectBank.getLineIterator(flags.lastNameList)) {
              String[] cols = line.split("\\s+");
              lastNames.add(cols[0]);
            }
          }
          if (maleNames == null) {
            maleNames = Generics.newHashSet();
            for (String line : ObjectBank.getLineIterator(flags.maleNameList)) {
              String[] cols = line.split("\\s+");
              maleNames.add(cols[0]);
            }
          }
          if (femaleNames == null) {
            femaleNames = Generics.newHashSet();
            for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) {
              String[] cols = line.split("\\s+");
              femaleNames.add(cols[0]);
            }
          }

          String name = cWord.toUpperCase();
          if (lastNames.contains(name)) {
            featuresC.add("LAST_NAME");
          }

          if (maleNames.contains(name)) {
            featuresC.add("MALE_NAME");
          }

          if (femaleNames.contains(name)) {
            featuresC.add("FEMALE_NAME");
          }

        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      if (flags.binnedLengths != null) {
        int len = cWord.length();
        String featureName = null;
        for (int i = 0; i <= flags.binnedLengths.length; i++) {
          if (i == flags.binnedLengths.length) {
            featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf";
          } else if (len <= flags.binnedLengths[i]) {
            featureName = "Len-" + ((i == 0) ? 1 : flags.binnedLengths[i - 1]) + '-' + flags.binnedLengths[i];
            break;
          }
        }
        featuresC.add(featureName);
      }

      if (flags.useABGENE) {
        featuresC.add(c.get(CoreAnnotations.AbgeneAnnotation.class) + "-ABGENE");
        featuresC.add(p.get(CoreAnnotations.AbgeneAnnotation.class) + "-PABGENE");
        featuresC.add(n.get(CoreAnnotations.AbgeneAnnotation.class) + "-NABGENE");
      }

      if (flags.useABSTRFreqDict) {
        featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
      }

      if (flags.useABSTR) {
        featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT");
        featuresC.add(p.get(CoreAnnotations.AbstrAnnotation.class) + "-PABSTRACT");
        featuresC.add(n.get(CoreAnnotations.AbstrAnnotation.class) + "-NABSTRACT");
      }

      if (flags.useGENIA) {
        featuresC.add(c.get(CoreAnnotations.GeniaAnnotation.class) + "-GENIA");
        featuresC.add(p.get(CoreAnnotations.GeniaAnnotation.class) + "-PGENIA");
        featuresC.add(n.get(CoreAnnotations.GeniaAnnotation.class) + "-NGENIA");
      }
      if (flags.useWEBFreqDict) {
        featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
      }

      if (flags.useWEB) {
        featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB");
        featuresC.add(p.get(CoreAnnotations.WebAnnotation.class) + "-PWEB");
        featuresC.add(n.get(CoreAnnotations.WebAnnotation.class) + "-NWEB");
      }

      if (flags.useIsURL) {
        featuresC.add(c.get(CoreAnnotations.IsURLAnnotation.class) + "-ISURL");
      }
      if (flags.useEntityRule) {
        featuresC.add(c.get(CoreAnnotations.EntityRuleAnnotation.class)+"-ENTITYRULE");
      }
      if (flags.useEntityTypes) {
        featuresC.add(c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ENTITYTYPE");
      }
      if (flags.useIsDateRange) {
        featuresC.add(c.get(CoreAnnotations.IsDateRangeAnnotation.class) + "-ISDATERANGE");
      }

      if (flags.useABSTRFreq) {
        featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ");
      }

      if (flags.useFREQ) {
        featuresC.add(c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ");
      }

      if (flags.useMoreTags) {
        featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD");
      }

      if (flags.usePosition) {
        featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + "-POSITION");
      }
      if (flags.useBeginSent) {
        String pos = c.get(CoreAnnotations.PositionAnnotation.class);
        if ("0".equals(pos)) {
          featuresC.add("BEGIN-SENT");
          featuresC.add(cShape + "-BEGIN-SENT");
        } else if (Integer.toString(cInfo.size() - 1).equals(pos)) {
          featuresC.add("END-SENT");
          featuresC.add(cShape + "-END-SENT");
        } else {
          featuresC.add("IN-SENT");
          featuresC.add(cShape + "-IN-SENT");
        }
      }
      if (flags.useTags) {
        featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
      }

      if (flags.useOrdinal) {
        if (isOrdinal(cInfo, loc)) {
          featuresC.add("C_ORDINAL");
          if (isOrdinal(cInfo, loc-1)) {
            //System.err.print(getWord(p) + " ");
            featuresC.add("PC_ORDINAL");
          }
          //System.err.println(cWord);
        }
        if (isOrdinal(cInfo, loc-1)) {
          featuresC.add("P_ORDINAL");
        }
      }

      if (flags.usePrev) {
        featuresC.add(pWord + "-PW");
        if (flags.useTags) {
          featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PTAG");
        }
        if (flags.useDistSim) {
          featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + "-PDISTSIM");
        }
        if (flags.useIsURL) {
          featuresC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + "-PISURL");
        }
        if (flags.useEntityTypes) {
          featuresC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + "-PENTITYTYPE");
        }
      }

      if (flags.useNext) {
        featuresC.add(nWord + "-NW");
        if (flags.useTags) {
          featuresC.add(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-NTAG");
        }
        if (flags.useDistSim) {
          featuresC.add(n.get(CoreAnnotations.DistSimAnnotation.class) + "-NDISTSIM");
        }
        if (flags.useIsURL) {
          featuresC.add(n.get(CoreAnnotations.IsURLAnnotation.class) + "-NISURL");
        }
        if (flags.useEntityTypes) {
          featuresC.add(n.get(CoreAnnotations.EntityTypeAnnotation.class) + "-NENTITYTYPE");
        }
      }
      /*here, entityTypes refers to the type in the PASCAL IE challenge:
       * i.e. certain words are tagged "Date" or "Location" */


      if (flags.useEitherSideWord) {
        featuresC.add(pWord + "-EW");
        featuresC.add(nWord + "-EW");
      }

      if (flags.useWordPairs) {
        featuresC.add(cWord + '-' + pWord + "-W-PW");
        featuresC.add(cWord + '-' + nWord + "-W-NW");
      }

      if (flags.useSymTags) {
        if (flags.useTags) {
          featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCNTAGS");
          featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-CNTAGS");
          featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCTAGS");
        }
        if (flags.useDistSim) {
          featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-PCNDISTSIM");
          featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-CNDISTSIM");
          featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-PCDISTSIM");
        }

      }

      if (flags.useSymWordPairs) {
        featuresC.add(pWord + '-' + nWord + "-SWORDS");
      }

      String pGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? p.get(CoreAnnotations.GazAnnotation.class) : null;
      String nGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? n.get(CoreAnnotations.GazAnnotation.class) : null;
      String cGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? c.get(CoreAnnotations.GazAnnotation.class) : null;
      if (flags.useGazFeatures) {

        if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
          featuresC.add(cGazAnnotation + "-GAZ");
        }
        // n
        if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
          featuresC.add(nGazAnnotation + "-NGAZ");
        }
        // p
        if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
          featuresC.add(pGazAnnotation + "-PGAZ");
        }
      }

      if (flags.useMoreGazFeatures) {
        if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
          featuresC.add(cGazAnnotation + '-' + cWord + "-CG-CW-GAZ");

          // c-n
          if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
            featuresC.add(cGazAnnotation + '-' + nGazAnnotation + "-CNGAZ");
          }

          // p-c
          if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
            featuresC.add(pGazAnnotation + '-' + cGazAnnotation + "-PCGAZ");
          }
        }
      }

      if (flags.useAbbr || flags.useMinimalAbbr) {
        featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR");
      }

      if (flags.useAbbr1 || flags.useMinimalAbbr1) {
        if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
          featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR");
        }
      }

      if (flags.useAbbr) {
        featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR");
        featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR");
        featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR");
      }

      if (flags.useAbbr1) {
        if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
          featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR");
          featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR");
          featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR");
        }
      }

      if (flags.useChunks) {
        featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-PCCHUNK");
        featuresC.add(c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-CNCHUNK");
        featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK");
      }

      if (flags.useMinimalAbbr) {
        featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB");
      }

      if (flags.useMinimalAbbr1) {
        if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
          featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB");
        }
      }

      String prevVB = "", nextVB = "";
      if (flags.usePrevVB) {
        for (int j = loc - 1; ; j--) {
          CoreLabel wi = cInfo.get(j);
          if (wi == cInfo.getPad()) {
            prevVB = "X";
            featuresC.add("X-PVB");
            break;
          } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
            featuresC.add(getWord(wi) + "-PVB");
            prevVB = getWord(wi);
            break;
          }
        }
      }

      if (flags.useNextVB) {
        for (int j = loc + 1; ; j++) {
          CoreLabel wi = cInfo.get(j);
          if (wi == cInfo.getPad()) {
            featuresC.add("X-NVB");
            nextVB = "X";
            break;
          } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
            featuresC.add(getWord(wi) + "-NVB");
            nextVB = getWord(wi);
            break;
          }
        }
      }

      if (flags.useVB) {
        featuresC.add(prevVB + '-' + nextVB + "-PNVB");
      }

      if (flags.useShapeConjunctions) {
        featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + cShape + "-POS-SH");
        if (flags.useTags) {
          featuresC.add(c.tag() + cShape + "-TAG-SH");
        }
        if (flags.useDistSim) {
          featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + cShape + "-DISTSIM-SH");
        }

      }

      if (flags.useWordTag) {
        featuresC.add(cWord + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-T");
        featuresC.add(cWord + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-PT");
        featuresC.add(cWord + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-NT");
      }

      if (flags.useNPHead) {
        featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-HW");
        if (flags.useTags) {
          featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-HW-T");
        }
        if (flags.useDistSim) {
          featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.DistSimAnnotation.class) + "-HW-DISTSIM");
        }
      }

      if (flags.useNPGovernor) {
        featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + "-GW");
        if (flags.useTags) {
          featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-GW-T");
        }
        if (flags.useDistSim) {
          featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM-T1");
        }
      }

      if (flags.useHeadGov) {
        featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.GovernorAnnotation.class) + "-HW_GW");
      }

      if (flags.useClassFeature) {
        featuresC.add("###");
      }

      if (flags.useFirstWord) {
        String firstWord = getWord(cInfo.get(0));
        featuresC.add(firstWord);
      }

      if (flags.useNGrams) {
        Collection<String> subs = null;
        if (flags.cacheNGrams) {
          subs = wordToSubstrings.get(cWord);
        }
        if (subs == null) {
          subs = new ArrayList<String>();
          String word = '<' + cWord + '>';
          if (flags.lowercaseNGrams) {
            word = word.toLowerCase();
          }
          if (flags.dehyphenateNGrams) {
            word = dehyphenate(word);
          }
          if (flags.greekifyNGrams) {
            word = greekify(word);
          }
          // minimum length substring is 2 letters (hardwired)
          // hoist flags.noMidNGrams so only linear in word length for that case
          if (flags.noMidNGrams) {
            int max = flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length()) :
                                                word.length();
            for (int j = 2; j <= max; j++) {
              subs.add(intern('#' + word.substring(0, j) + '#'));
            }
            int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) :
                                                0;
            int lenM1 = word.length() - 1;
            for (int i = start; i < lenM1; i++) {
              subs.add(intern('#' + word.substring(i) + '#'));
            }
          } else {
            for (int i = 0; i < word.length(); i++) {
              for (int j = i + 2, max = Math.min(word.length(), i + flags.maxNGramLeng); j <= max; j++) {
                if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
                  continue;
                }
                subs.add(intern('#' + word.substring(i, j) + '#'));
              }
            }
          }
          if (flags.cacheNGrams) {
            wordToSubstrings.put(cWord, subs);
          }
        }
        featuresC.addAll(subs);
        if (flags.conjoinShapeNGrams) {
          for (String str : subs) {
            String feat = str + '-' + cShape + "-CNGram-CS";
            featuresC.add(feat);
          }
        }
      }

      if (flags.useGazettes) {
        if (flags.sloppyGazette) {
          Collection<String> entries = wordToGazetteEntries.get(cWord);
          if (entries != null) {
            featuresC.addAll(entries);
          }
        }
        if (flags.cleanGazette) {
          Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord);
          if (infos != null) {
            for (GazetteInfo gInfo : infos) {
              boolean ok = true;
              for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) {
                ok &= gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc)));
              }
              if (ok) {
                featuresC.add(gInfo.feature);
              }
            }
          }
        }
      }

      if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
        featuresC.add(cShape + "-TYPE");
        if (flags.useTypeSeqs) {
          featuresC.add(pShape + "-PTYPE");
          featuresC.add(nShape + "-NTYPE");
          featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
          featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
          featuresC.add(pShape + "..." + cShape + "-PCTYPE");
          featuresC.add(cShape + "..." + nShape + "-CNTYPE");
          featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
        }
      }

      if (flags.useLastRealWord) {
        if (pWord.length() <= 3) {
          // extending this to check for 2 short words doesn't seem to help....
          featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE");
        }
      }

      if (flags.useNextRealWord) {
        if (nWord.length() <= 3) {
          // extending this to check for 2 short words doesn't seem to help....
          featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE");
        }
      }

      if (flags.useOccurrencePatterns) {
        featuresC.addAll(occurrencePatterns(cInfo, loc));
      }

      if (flags.useDisjunctive) {
        for (int i = 1; i <= flags.disjunctionWidth; i++) {
          CoreLabel dn = cInfo.get(loc + i);
          CoreLabel dp = cInfo.get(loc - i);
          featuresC.add(getWord(dn) + "-DISJN");
          if (flags.useDisjunctiveShapeInteraction) {
            featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS");
          }
          featuresC.add(getWord(dp) + "-DISJP");
          if (flags.useDisjunctiveShapeInteraction) {
            featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS");
          }
        }
      }

      if (flags.useWideDisjunctive) {
        for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
          featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN");
          featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP");
        }
      }

      if (flags.useEitherSideDisjunctive) {
        for (int i = 1; i <= flags.disjunctionWidth; i++) {
          featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWE");
          featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWE");
        }
      }

      if (flags.useDisjShape) {
        for (int i = 1; i <= flags.disjunctionWidth; i++) {
          featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE");
          // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE");
          featuresC.add(cShape + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE");
          // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE");
        }
      }

      if (flags.useExtraTaggySequences) {
        if (flags.useTags) {
          featuresC.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS");
          featuresC.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS");
        }
        if (flags.useDistSim) {
          featuresC.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1");
          featuresC.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1");
        }
      }

      if (flags.useMUCFeatures) {
        featuresC.add(c.get(CoreAnnotations.SectionAnnotation.class)+"-SECTION");
        featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class)+"-WORD_POSITION");
        featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class)+"-SENT_POSITION");
        featuresC.add(c.get(CoreAnnotations.ParaPositionAnnotation.class)+"-PARA_POSITION");
        featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class)+ '-' +c.get(CoreAnnotations.ShapeAnnotation.class)+"-WORD_POSITION_SHAPE");
      }
    } else if (flags.useInternal) {

      if (flags.useWord) {
        featuresC.add(cWord + "-WORD");
      }

      if (flags.useNGrams) {
        Collection<String> subs = wordToSubstrings.get(cWord);
        if (subs == null) {
          subs = new ArrayList<String>();
          String word = '<' + cWord + '>';
          if (flags.lowercaseNGrams) {
            word = word.toLowerCase();
          }
          if (flags.dehyphenateNGrams) {
            word = dehyphenate(word);
          }
          if (flags.greekifyNGrams) {
            word = greekify(word);
          }
          for (int i = 0; i < word.length(); i++) {
            for (int j = i + 2; j <= word.length(); j++) {
              if (flags.noMidNGrams && i != 0 && j != word.length()) {
                continue;
              }
              if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
                continue;
              }
              //subs.add(intern("#" + word.substring(i, j) + "#"));
              subs.add(intern('#' + word.substring(i, j) + '#'));
            }
          }
          if (flags.cacheNGrams) {
            wordToSubstrings.put(cWord, subs);
          }
        }
        featuresC.addAll(subs);
        if (flags.conjoinShapeNGrams) {
          String shape = c.get(CoreAnnotations.ShapeAnnotation.class);
          for (String str : subs) {
            String feat = str + '-' + shape + "-CNGram-CS";
            featuresC.add(feat);
          }
        }
      }

      if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
        featuresC.add(cShape + "-TYPE");
      }

      if (flags.useOccurrencePatterns) {
        featuresC.addAll(occurrencePatterns(cInfo, loc));
      }

    } else if (flags.useExternal) {

      if (flags.usePrev) {
        featuresC.add(pWord + "-PW");
      }

      if (flags.useNext) {
        featuresC.add(nWord + "-NW");
      }

      if (flags.useWordPairs) {
        featuresC.add(cWord + '-' + pWord + "-W-PW");
        featuresC.add(cWord + '-' + nWord + "-W-NW");
      }

      if (flags.useSymWordPairs) {
        featuresC.add(pWord + '-' + nWord + "-SWORDS");
      }

      if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
        if (flags.useTypeSeqs) {
          featuresC.add(pShape + "-PTYPE");
          featuresC.add(nShape + "-NTYPE");
          featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
          featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
          if (flags.maxLeft > 0) featuresC.add(pShape + "..." + cShape + "-PCTYPE"); // this one just isn't useful, at least given c,pc,s,ps.  Might be useful 0th-order
          featuresC.add(cShape + "..." + nShape + "-CNTYPE");
          featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
        }
      }

      if (flags.useLastRealWord) {
        if (pWord.length() <= 3) {
          featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE");
        }
      }

      if (flags.useNextRealWord) {
        if (nWord.length() <= 3) {
          featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE");
        }
      }

      if (flags.useDisjunctive) {
        for (int i = 1; i <= flags.disjunctionWidth; i++) {
          CoreLabel dn = cInfo.get(loc + i);
          CoreLabel dp = cInfo.get(loc - i);
          featuresC.add(getWord(dn) + "-DISJN");
          if (flags.useDisjunctiveShapeInteraction) {
            featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS");
          }
          featuresC.add(getWord(dp) + "-DISJP");
View Full Code Here


    public Class<String> getType() {  return String.class; } }



  protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel c = cInfo.get(loc);
    CoreLabel n = cInfo.get(loc + 1);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String cDS = c.getString(CoreAnnotations.DistSimAnnotation.class);
    String pDS = p.getString(CoreAnnotations.DistSimAnnotation.class);
    String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class);
    String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class);
    Collection<String> featuresCpC = new ArrayList<String>();

    if (flags.noEdgeFeature)
      return featuresCpC;

    if (flags.transitionEdgeOnly) {
      featuresCpC.add("PSEQ");
      return featuresCpC;
    }

    if (flags.useNeighborNGrams) {
      int maxLen = pWord.length();
      if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
        maxLen = flags.maxNGramLeng;
      }
      for (int len = 1; len <= maxLen; ++len) {
        featuresCpC.add(pWord.substring(0, len) + "-PREVIOUS-PREFIX");
      }
      for (int pos = pWord.length() - maxLen; pos < pWord.length(); ++pos) {
        featuresCpC.add(pWord.substring(pos, pWord.length()) +
                        "-PREVIOUS-SUFFIX");
      }

      maxLen = cWord.length();
      if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
        maxLen = flags.maxNGramLeng;
      }
      for (int len = 1; len <= maxLen; ++len) {
        featuresCpC.add(cWord.substring(0, len) + "-CURRENT-PREFIX");
      }
      for (int pos = cWord.length() - maxLen; pos < cWord.length(); ++pos) {
        featuresCpC.add(cWord.substring(pos, cWord.length()) +
                        "-CURRENT-SUFFIX");
      }
    }

    if (flags.useInternal && flags.useExternal ) {

      if (flags.useOrdinal) {
        if (isOrdinal(cInfo, loc)) {
          featuresCpC.add("C_ORDINAL");
          if (isOrdinal(cInfo, loc-1)) {
            featuresCpC.add("PC_ORDINAL");
          }
        }
        if (isOrdinal(cInfo, loc-1)) {
          featuresCpC.add("P_ORDINAL");
        }
      }

      if (flags.useAbbr || flags.useMinimalAbbr) {
        featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS");
      }

      if (flags.useAbbr1 || flags.useMinimalAbbr1) {
        if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
          featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS");
        }
      }

      if (flags.useChunkySequences) {
        featuresCpC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK");
      }

      if (flags.usePrev) {
        if (flags.useSequences && flags.usePrevSequences) {
          featuresCpC.add("PSEQ");
          featuresCpC.add(cWord + "-PSEQW");

          if ( ! flags.strictGoodCoNLL) {
            featuresCpC.add(pWord+ '-' +cWord + "-PSEQW2")// added later after goodCoNLL
            featuresCpC.add(pWord + "-PSEQpW"); // added later after goodCoNLL
          }

          if (flags.useDistSim) {
            featuresCpC.add(pDS + "-PSEQpDS");
            featuresCpC.add(cDS + "-PSEQcDS");
            featuresCpC.add(pDS+ '-' +cDS + "-PSEQpcDS");
          }

          if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings)) {
            if ( ! flags.strictGoodCoNLL) {     // These ones were added later after goodCoNLL
              featuresCpC.add(pShape + "-PSEQpS");
              featuresCpC.add(cShape + "-PSEQcS");
            }
            if (flags.strictGoodCoNLL && ! flags.removeStrictGoodCoNLLDuplicates) {
              featuresCpC.add(pShape + '-' + cShape + "-PSEQpcS"); // Duplicate (in goodCoNLL orig, see -TYPES below)
            }
          }
        }
      }

      if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) ||
           flags.useShapeStrings)
          && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) {
        if (flags.useTypeSeqs3) {
          featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class) + "-PCNSHAPES");
        }
        if (flags.useTypeSeqs2) {
          featuresCpC.add(pShape + '-' + cShape + "-TYPES")// this duplicates PSEQpcS above
        }

        if (flags.useYetMoreCpCShapes) {
          String p2Shape = cInfo.get(loc - 2).getString(CoreAnnotations.ShapeAnnotation.class);
          featuresCpC.add(p2Shape + '-' + pShape + '-' + cShape + "-YMS");
          featuresCpC.add(pShape + '-' + cShape + "-" + n.getString(CoreAnnotations.ShapeAnnotation.class) + "-YMSPCN");
        }
      }

      if (flags.useTypeySequences) {
        featuresCpC.add(cShape + "-TPS2");
        featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1");
        // featuresCpC.add(pShape) + "-" + cShape) + "-TPS"); // duplicates -TYPES, so now omitted; you may need to slightly increase sigma to duplicate previous results, however.
      }

      if (flags.useTaggySequences) {
        if (flags.useTags) {
          featuresCpC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TS");
        }
        if (flags.useDistSim) {
          featuresCpC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TS1");
        }
      }

      if (flags.useParenMatching) {
        if (flags.useReverse) {
          if (cWord.equals("(") || cWord.equals("[") || cWord.equals("-LRB-")) {
            if (pWord.equals(")") || pWord.equals("]") || pWord.equals("-RRB-")) {
              featuresCpC.add("PAREN-MATCH");
            }
          }
        } else {
          if (cWord.equals(")") || cWord.equals("]") || cWord.equals("-RRB-")) {
            if (pWord.equals("(") || pWord.equals("[") || pWord.equals("-LRB-")) {
              featuresCpC.add("PAREN-MATCH");
            }
          }
        }
      }
      if (flags.useEntityTypeSequences) {
        featuresCpC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + '-' + c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ETSEQ");
      }
      if (flags.useURLSequences) {
        featuresCpC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + '-' + c.get(CoreAnnotations.IsURLAnnotation.class) + "-URLSEQ");
      }
    } else if (flags.useInternal) {

      if (flags.useSequences && flags.usePrevSequences) {
        featuresCpC.add("PSEQ");
        featuresCpC.add(cWord + "-PSEQW");
      }

      if (flags.useTypeySequences) {
        featuresCpC.add(cShape + "-TPS2");
      }

    } else if (flags.useExternal) {

      if( ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) ||
           flags.useShapeStrings)
          && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) {
        if (flags.useTypeSeqs3) {
          featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class) + "-PCNSHAPES");
        }
        if (flags.useTypeSeqs2) {
          featuresCpC.add(pShape + '-' + cShape + "-TYPES");
        }
      }

      if (flags.useTypeySequences) {
        featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1");
        featuresCpC.add(pShape + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TPS");
      }
    }

    return featuresCpC;
View Full Code Here

      how = 5;
    } else {
      System.err.println("entitySubclassify: unknown style: " + style);
      how = 4;
    }
    tokens = new PaddedList<CoreLabel>(tokens, new CoreLabel());
    int k = tokens.size();
    String[] newAnswers = new String[k];
    for (int i = 0; i < k; i++) {
      final CoreLabel c = tokens.get(i);
      final CoreLabel p = tokens.get(i - 1);
      final CoreLabel n = tokens.get(i + 1);
      final String cAns = c.get(CoreAnnotations.AnswerAnnotation.class);
      if (cAns.length() > 1 && cAns.charAt(1) == '-') {
        String pAns = p.get(CoreAnnotations.AnswerAnnotation.class);
        if (pAns == null) { pAns = OTHER; }
        String nAns = n.get(CoreAnnotations.AnswerAnnotation.class);
        if (nAns == null) { nAns = OTHER; }
        final String base = cAns.substring(2, cAns.length());
        String pBase = (pAns.length() > 2 ? pAns.substring(2, pAns.length()) : pAns);
        String nBase = (nAns.length() > 2 ? nAns.substring(2, nAns.length()) : nAns);
        char prefix = cAns.charAt(0);
        char pPrefix = (pAns.length() > 0) ? pAns.charAt(0) : ' ';
        char nPrefix = (nAns.length() > 0) ? nAns.charAt(0) : ' ';
        boolean isStartAdjacentSame = base.equals(pBase) &&
          (prefix == 'B' || prefix == 'S' || pPrefix == 'E' || pPrefix == 'S');
        boolean isEndAdjacentSame = base.equals(nBase) &&
          (prefix == 'E' || prefix == 'S' || nPrefix == 'B' || pPrefix == 'S');
        boolean isFirst = (!base.equals(pBase)) || cAns.charAt(0) == 'B';
        boolean isLast = (!base.equals(nBase)) || nAns.charAt(0) == 'B';
        switch (how) {
        case 0:
          if (isStartAdjacentSame) {
            newAnswers[i] = intern("B-" + base);
          } else {
            newAnswers[i] = intern("I-" + base);
          }
          break;
        case 1:
          if (isFirst) {
            newAnswers[i] = intern("B-" + base);
          } else {
            newAnswers[i] = intern("I-" + base);
          }
          break;
        case 2:
          if (isEndAdjacentSame) {
            newAnswers[i] = intern("E-" + base);
          } else {
            newAnswers[i] = intern("I-" + base);
          }
          break;
        case 3:
          if (isLast) {
            newAnswers[i] = intern("E-" + base);
          } else {
            newAnswers[i] = intern("I-" + base);
          }
          break;
        case 4:
          newAnswers[i] = intern("I-" + base);
          break;
        case 5:
          if (isFirst && isLast) {
            newAnswers[i] = intern("S-" + base);
          } else if ((!isFirst) && isLast) {
            newAnswers[i] = intern("E-" + base);
          } else if (isFirst && (!isLast)) {
            newAnswers[i] = intern("B-" + base);
          } else {
            newAnswers[i] = intern("I-" + base);
          }
        }
      } else {
        newAnswers[i] = cAns;
      }
    }
    for (int i = 0; i < k; i++) {
      CoreLabel c = tokens.get(i);
      c.set(CoreAnnotations.AnswerAnnotation.class, newAnswers[i]);
    }
  }
View Full Code Here

   *
   *  @param line A line of CoNLL input
   *  @return The constructed token
   */
  private CoreLabel makeCoreLabel(String line) {
    CoreLabel wi = new CoreLabel();
    // wi.line = line;
    String[] bits = line.split("\\s+");
    switch (bits.length) {
    case 0:
    case 1:
      wi.setWord(BOUNDARY);
      wi.set(CoreAnnotations.AnswerAnnotation.class, OTHER);
      break;
    case 2:
      wi.setWord(bits[0]);
      wi.set(CoreAnnotations.AnswerAnnotation.class, bits[1]);
      break;
    case 3:
      wi.setWord(bits[0]);
      wi.setTag(bits[1]);
      wi.set(CoreAnnotations.AnswerAnnotation.class, bits[2]);
      break;
    case 4:
      wi.setWord(bits[0]);
      wi.setTag(bits[1]);
      wi.set(CoreAnnotations.ChunkAnnotation.class, bits[2]);
      wi.set(CoreAnnotations.AnswerAnnotation.class, bits[3]);
      break;
    case 5:
      if (flags.useLemmaAsWord) {
        wi.setWord(bits[1]);
      } else {
        wi.setWord(bits[0]);
        }
      wi.set(CoreAnnotations.LemmaAnnotation.class, bits[1]);
      wi.setTag(bits[2]);
      wi.set(CoreAnnotations.ChunkAnnotation.class, bits[3]);
      wi.set(CoreAnnotations.AnswerAnnotation.class, bits[4]);
      break;
    default:
      throw new RuntimeIOException("Unexpected input (many fields): " + line);
    }
    wi.set(CoreAnnotations.OriginalAnswerAnnotation.class, wi.get(CoreAnnotations.AnswerAnnotation.class));
    return wi;
  }
View Full Code Here

   */
  private void deEndify(List<CoreLabel> tokens) {
    if (flags.retainEntitySubclassification) {
      return;
    }
    tokens = new PaddedList<CoreLabel>(tokens, new CoreLabel());
    int k = tokens.size();
    String[] newAnswers = new String[k];
    for (int i = 0; i < k; i++) {
      CoreLabel c = tokens.get(i);
      CoreLabel p = tokens.get(i - 1);
      if (c.get(CoreAnnotations.AnswerAnnotation.class).length() > 1 && c.get(CoreAnnotations.AnswerAnnotation.class).charAt(1) == '-') {
        String base = c.get(CoreAnnotations.AnswerAnnotation.class).substring(2);
        String pBase = (p.get(CoreAnnotations.AnswerAnnotation.class).length() <= 2 ? p.get(CoreAnnotations.AnswerAnnotation.class) : p.get(CoreAnnotations.AnswerAnnotation.class).substring(2));
        boolean isSecond = (base.equals(pBase));
        boolean isStart = (c.get(CoreAnnotations.AnswerAnnotation.class).charAt(0) == 'B' || c.get(CoreAnnotations.AnswerAnnotation.class).charAt(0) == 'S');
        if (isSecond && isStart) {
          newAnswers[i] = intern("B-" + base);
        } else {
          newAnswers[i] = intern("I-" + base);
        }
      } else {
        newAnswers[i] = c.get(CoreAnnotations.AnswerAnnotation.class);
      }
    }
    for (int i = 0; i < k; i++) {
      CoreLabel c = tokens.get(i);
      c.set(CoreAnnotations.AnswerAnnotation.class, newAnswers[i]);
    }
  }
View Full Code Here

          if (donotuse == false) {
            String phStr = StringUtils.join(ph, " ");
            Redwood.log(ConstantsAndVariables.extremedebug,"Labeling because of phrase " + phStr);
            for (int j = 0; j < ph.length; j++) {
              int index = idx + j;
              CoreLabel l = sentEn.getValue().get(index);
              if (constVars.usePatternResultAsLabel) {
                sentenceChanged = true;
                l.set(constVars.getAnswerClass().get(label), label);


                CollectionValuedMap<String, String> matched = new CollectionValuedMap<String, String>();
                matched.add(label, phStr);
                if(!l.containsKey(PatternsAnnotations.MatchedPhrases.class))
                  l.set(PatternsAnnotations.MatchedPhrases.class, matched);
                else
                  l.get(PatternsAnnotations.MatchedPhrases.class).addAll(matched);

                for (int k = Math.max(0, index - PatternFactory.numWordsCompound); k < sentEn.getValue().size()
                    && k <= index + PatternFactory.numWordsCompound + 1; k++) {
                  contextWordsRecalculatePats.add(k);
                }
View Full Code Here

      for (Entry<String, List<CoreLabel>> docEn : testSentences.entrySet()) {
        List<CoreLabel> doc = docEn.getValue();
        List<CoreLabel> doceval = new ArrayList<CoreLabel>();
        for (CoreLabel l : doc) {
          CoreLabel l2 = new CoreLabel();
          l2.setWord(l.word());

          if (l.get(anscl.getValue()).equals(label)) {
            l2.set(CoreAnnotations.AnswerAnnotation.class, label);
          } else
            l2.set(CoreAnnotations.AnswerAnnotation.class, constVars.backgroundSymbol);

          // If the gold label is not the label we are calculating the scores
          // for, set it to the background symbol
          if (!l.get(CoreAnnotations.GoldAnswerAnnotation.class).equals(label)) {
            l2.set(CoreAnnotations.GoldAnswerAnnotation.class, constVars.backgroundSymbol);
          } else
            l2.set(CoreAnnotations.GoldAnswerAnnotation.class, label);
          doceval.add(l2);
        }

        countResults(doceval, entityTP, entityFP, entityFN, constVars.backgroundSymbol, wordTP, wordTN, wordFP, wordFN,
            CoreAnnotations.AnswerAnnotation.class, evalPerEntity); //
View Full Code Here

      PTBTokenizer ptb = PTBTokenizer.newPTBTokenizer(new BufferedReader(new StringReader(doc)), false, true);
      List<CoreLabel> words = ptb.tokenize();

      List<CoreLabel> result = new ArrayList<CoreLabel>();

      CoreLabel prev = null;
      String prevString = "";
      Matcher matcher;

      for (CoreLabel word : words) {
        matcher = sgml.matcher(word.word());
        if (matcher.matches()) {
          String tag = matcher.group(1);
          if (word.word().equalsIgnoreCase("<p>")) {
            pNum++;
            sNum = 0;
            wNum = 0;

            if (prev != null) {
              String s = prev.get(CoreAnnotations.AfterAnnotation.class);
              s += word.originalText()+word.after();
              prev.set(CoreAnnotations.AfterAnnotation.class, s);
            }
            prevString += word.before() + word.originalText();

          } else if (word.word().equalsIgnoreCase("<s>")) {
            sNum++;
            wNum = 0;

            if (prev != null) {
              String s = prev.get(CoreAnnotations.AfterAnnotation.class);
              s += word.originalText()+word.after();
              prev.set(CoreAnnotations.AfterAnnotation.class, s);
            }
            prevString += word.before() + word.originalText();

          } else {
            matcher = beginEntity.matcher(word.word());
            if (matcher.matches()) {
              entityClass = matcher.group(1);
              entity = matcher.group(2);
              if (prev != null) {
                String s = prev.get(CoreAnnotations.AfterAnnotation.class);
                s += word.after();
                prev.set(CoreAnnotations.AfterAnnotation.class, s);
              }
              prevString += word.before();
            } else {
              matcher = endEntity.matcher(word.word());
              if (matcher.matches()) {
                entityClass = "";
                entity = "O";
                if (prev != null) {
                  String s = prev.get(CoreAnnotations.AfterAnnotation.class);
                  s += word.after();
                  prev.set(CoreAnnotations.AfterAnnotation.class, s);
                }
                prevString += word.before();
              } else if (word.word().equalsIgnoreCase("<doc>")) {
                prevString += word.before() + word.originalText();
              } else if (word.word().equalsIgnoreCase("</doc>")) {
                String s = prev.get(CoreAnnotations.AfterAnnotation.class);
                s += word.originalText();
                prev.set(CoreAnnotations.AfterAnnotation.class, s);
              } else {
                section = tag.toUpperCase();
                if (prev != null) {
                  String s = prev.get(CoreAnnotations.AfterAnnotation.class);
                  s += word.originalText() + word.after();
                  prev.set(CoreAnnotations.AfterAnnotation.class, s);
                }
                prevString += word.before() + word.originalText();
              }
            }
          }
        } else {
          CoreLabel wi = new CoreLabel();
          wi.setWord(word.word());
          wi.set(CoreAnnotations.OriginalTextAnnotation.class, word.originalText());
          wi.set(CoreAnnotations.BeforeAnnotation.class, prevString+word.before());
          wi.set(CoreAnnotations.AfterAnnotation.class, word.after());
          wi.set(CoreAnnotations.WordPositionAnnotation.class, ""+wNum);
          wi.set(CoreAnnotations.SentencePositionAnnotation.class, ""+sNum);
          wi.set(CoreAnnotations.ParaPositionAnnotation.class, ""+pNum);
          wi.set(CoreAnnotations.SectionAnnotation.class, section);
          wi.set(CoreAnnotations.AnswerAnnotation.class, entity);
          wi.set(CoreAnnotations.EntityClassAnnotation.class, entityClass);
          wNum++;
          prevString = "";
          result.add(wi);
          prev = wi;
        }
View Full Code Here

    for (CoreLabel w: l) {
      String entityType = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      //if we've just completed an entity and we're looking at a non-continuation,
      //we want to add that now.
      if (entityStringCollector != null && ! entityType.equals(lastEntity)) {
        CoreLabel nextWord = new CoreLabel();
        nextWord.setWord(entityStringCollector.toString());
        nextWord.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP");
        nextWord.set(CoreAnnotations.NamedEntityTagAnnotation.class, lastEntity);
        s.add(nextWord);
        if (DEBUG) {
          err.print("Quantifiable: Collapsing ");
          err.println(entityStringCollector.toString());
        }
        entityStringCollector = null;
      }
      //If its not to be collapsed, toss it onto the sentence.
      if ( ! collapseBeforeParsing.contains(entityType)) {
        s.add(w);
      } else { //If it is to be collapsed....
        //if its a continuation of the last entity, add it to the
        //current buffer.
        if (entityType.equals(lastEntity)){
          assert entityStringCollector != null;
          entityStringCollector.append('_');
          entityStringCollector.append(w.get(CoreAnnotations.TextAnnotation.class));
        } else {
          //and its NOT a continuation, make a new buffer.
          entityStringCollector = new StringBuilder();
          entityStringCollector.append(w.get(CoreAnnotations.TextAnnotation.class));
        }
      }
      lastEntity=entityType;
    }
    // if the last token was a named-entity, we add it here.
    if (entityStringCollector!=null) {
      CoreLabel nextWord = new CoreLabel();
      nextWord.setWord(entityStringCollector.toString());
      nextWord.set(CoreAnnotations.PartOfSpeechAnnotation.class, "NNP");
      nextWord.set(CoreAnnotations.NamedEntityTagAnnotation.class, lastEntity);
      s.add(nextWord);
    }
    for (CoreLabel w : s) {
      System.err.println("<<"+w.get(CoreAnnotations.TextAnnotation.class)+"::"+w.get(CoreAnnotations.PartOfSpeechAnnotation.class)+"::"+w.get(CoreAnnotations.NamedEntityTagAnnotation.class)+">>");
    }
View Full Code Here

          if (tag == null || tag.equals("")) {
            err.println("Quantifiable: error! tag is " + tag);
          }
        }
      }
      copyL.add(new CoreLabel(l.get(i)));
    }
    // run NumberSequenceClassifier
    AbstractSequenceClassifier<CoreLabel> nsc = new NumberSequenceClassifier();
    copyL = nsc.classify(copyL);
    // update entity only if it was not O
    for (int i = 0; i < sz; i++) {
      E before = l.get(i);
      CoreLabel nscAnswer = copyL.get(i);
      if (before.get(CoreAnnotations.NamedEntityTagAnnotation.class) == null && before.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals(BACKGROUND_SYMBOL) &&
          (nscAnswer.get(CoreAnnotations.AnswerAnnotation.class) != null && !nscAnswer.get(CoreAnnotations.AnswerAnnotation.class).equals(BACKGROUND_SYMBOL))) {
        System.err.println("Quantifiable: updating class for " +
            before.get(CoreAnnotations.TextAnnotation.class) + '/' +
            before.get(CoreAnnotations.NamedEntityTagAnnotation.class) + " to " + nscAnswer.get(CoreAnnotations.AnswerAnnotation.class));
        before.set(CoreAnnotations.NamedEntityTagAnnotation.class, nscAnswer.get(CoreAnnotations.AnswerAnnotation.class));
      }
    }

    addNormalizedQuantitiesToEntities(l);
    return l;
View Full Code Here

TOP

Related Classes of edu.stanford.nlp.ling.CoreLabel

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.