// Source code of edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory

package edu.stanford.nlp.wordseg;


import java.util.ArrayList;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.international.pennchinese.RadicalMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PaddedList;




/**
 * A Chinese segmenter Feature Factory for the GALE project.
 * (Modified from the feature factory for Sighan Bakeoff 2005.)
 * <p>
 * c is Chinese character ("char").  c means current, n means next and p means previous.
 * </p>
 *
 * <table>
 * <tr>
 * <th>Feature</th><th>Templates</th>
 * </tr>
 * <tr>
 * <th></th><th>Current position clique</th>
 * </tr>
 * <tr>
 * <td>useWord1</td><td>CONSTANT, cc, nc, pc, pc+cc, if (As|Msr|Pk|Hk) cc+nc, pc,nc </td>
 * </tr>
 * </table>
 *
 * @author Huihsin Tseng
 * @author Pichuan Chang
 * @author Christopher Manning
 */
public class Gale2007ChineseSegmenterFeatureFactory<IN extends CoreLabel> extends FeatureFactory<IN> {


  private static final int DEBUG = 0;


  private transient TagAffixDetector taDetector; // = null;
  private transient CorpusDictionary outDict; // = null;


  /**
   * Initializes this feature factory. This override adds no behavior of its
   * own; it only delegates to the superclass.
   *
   * @param flags The flags handed on to {@code super.init}
   */
  @Override
  public void init(SeqClassifierFlags flags) {
    super.init(flags);
  }


  private synchronized void createTADetector() {
    if (taDetector == null) {
      taDetector = new TagAffixDetector(flags);
    }
  }


  private synchronized void createOutDict() {
    if (outDict == null) {
      System.err.println("reading "+flags.outDict2+" as a seen lexicon");
      outDict = new CorpusDictionary(flags.outDict2);
    }
  }




  /**
   * Extracts all the features from the input data at a certain index.
   *
   * @param cInfo The complete data set as a List of WordInfo
   * @param loc  The index at which to extract features.
   */
  @Override
  public Collection<String> getCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique) {
    Collection<String> features = Generics.newHashSet();


    if (clique == cliqueC) {
      addAllInterningAndSuffixing(features, featuresC(cInfo, loc), "C");
    } else if (clique == cliqueCpC) {
      addAllInterningAndSuffixing(features, featuresCpC(cInfo, loc), "CpC");
      addAllInterningAndSuffixing(features, featuresCnC(cInfo, loc-1), "CnC");
    } else if (clique == cliqueCpCp2C) {
      addAllInterningAndSuffixing(features, featuresCpCp2C(cInfo, loc), "CpCp2C");


    } else if (clique == cliqueCpCp2Cp3C) {
      addAllInterningAndSuffixing(features, featuresCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C");
    }


    if (DEBUG > 0) {
      EncodingPrintWriter.err.println("For " + cInfo.get(loc) +
              ", features: " + features, "UTF-8");
    }
    return features;
  }






  private static final Pattern patE = Pattern.compile("[a-z]");
  private static final Pattern patEC = Pattern.compile("[A-Z]");


  private static String isEnglish(String chp, String chc) {
    Matcher mp = patE.matcher(chp);   // previous char is [a-z]
    Matcher mc = patE.matcher(chc);   //  current char is [a-z]
    Matcher mpC = patEC.matcher(chp); // previous char is [A-Z]
    Matcher mcC = patEC.matcher(chc); //  current char is [A-Z]
    if (mp.matches() && mcC.matches()){
      return "BND"; // [a-z][A-Z]
    } else if (mp.matches() && mc.matches()){
      return "ENG"; // [a-z][a-z]
    } else if (mpC.matches() && mcC.matches()){
      return "BCC"; // [A-Z][A-Z]
    } else if (mp.matches() && !mc.matches() && !mcC.matches()){
      return "e1";  // [a-z][^A-Za-z]
    } else if (mc.matches() && !mp.matches() && !mpC.matches()) {
      return "e2";  // [^A-Za-z][a-z]
    } else if (mpC.matches() && !mc.matches() && !mcC.matches()){
      return "e3";  // [A-Z][^A-Za-z]
    } else if (mcC.matches() && !mp.matches() && !mpC.matches()) {
      return "e4";  // [^A-Za-z][A-Z]
    } else {
      return "";
    }
  } // end isEnglish


  // the pattern used to be [\u00b7\\-\\.] which AFAICS matched only . because - wasn't escaped. CDM Nov 2007
  private static final Pattern patP = Pattern.compile("[-\u00b7.]");


  private static String isEngPU(String Ep) {
    Matcher mp = patP.matcher(Ep);
    if (mp.matches()) {
      return "1:EngPU";
    } else {
      return "";
    }
  } //is EnglishPU




  private static void dictionaryFeaturesC(Class<? extends CoreAnnotation<String>> lbeginFieldName,
                                   Class<? extends CoreAnnotation<String>> lmiddleFieldName,
                                   Class<? extends CoreAnnotation<String>> lendFieldName,
                                   String dictSuffix, Collection<String> features, CoreLabel p, CoreLabel c, CoreLabel c2) {
      String lbegin = c.getString(lbeginFieldName);
      String lmiddle = c.getString(lmiddleFieldName);
      String lend = c.getString(lendFieldName);
      features.add(lbegin+dictSuffix+"-lb");
      features.add(lmiddle+dictSuffix+"-lm");
      features.add(lend+dictSuffix+"-le");


      lbegin = p.getString(lbeginFieldName);
      lmiddle = p.getString(lmiddleFieldName);
      lend = p.getString(lendFieldName);
      features.add(lbegin+dictSuffix+"-plb");
      features.add(lmiddle+dictSuffix+"-plm");
      features.add(lend+dictSuffix+"-ple");


      lbegin = c2.getString(lbeginFieldName);
      lmiddle = c2.getString(lmiddleFieldName);
      lend = c2.getString(lendFieldName);
      features.add(lbegin+dictSuffix+"-c2lb");
      features.add(lmiddle+dictSuffix+"-c2lm");
      features.add(lend+dictSuffix+"-c2le");
  }




  protected Collection<String> featuresC(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<String>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c2 = cInfo.get(loc + 1);
    CoreLabel c3 = cInfo.get(loc + 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);
    String charc = c.getString(CoreAnnotations.CharAnnotation.class);
    String charc2 = c2.getString(CoreAnnotations.CharAnnotation.class);
    String charc3 = c3.getString(CoreAnnotations.CharAnnotation.class);
    String charp = p.getString(CoreAnnotations.CharAnnotation.class);
    String charp2 = p2.getString(CoreAnnotations.CharAnnotation.class);
    String charp3 = p3.getString(CoreAnnotations.CharAnnotation.class);
    Integer cI = c.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypec = (cI != null ? cI.toString() : "");
    Integer c2I = c2.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypec2 = (c2I != null ? c2I.toString() : "");
    Integer c3I = c3.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypec3 = (c3I != null ? c3I.toString() : "");
    Integer pI = p.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypep = (pI != null ? pI.toString() : "");
    Integer p2I = p2.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypep2 = (p2I != null ? p2I.toString() : "");


    /* N-gram features. N is upto 2. */


    if (flags.useWord1) {
      // features.add(charc +"c");
      // features.add(charc2+"c2");
      // features.add(charp +"p");
      // features.add(charp + charc  +"pc");
      // features.add(charc + charc2  +"cc2");
      // cdm: need hyphen so you can see which of charp or charc2 is null....
      // features.add(charp + "-" + charc2 + "pc2");


      features.add(charc +"::c");
      features.add(charc2+"::c2");
      features.add(charp +"::p");
      features.add(charp2 +"::p2");
      // trying to restore the features that Huishin described in SIGHAN 2005 paper
      features.add(charc +charc2  +"::cn");
      features.add(charc +charc3  +"::cn2");
      features.add(charp +charc  +"::pc");
      features.add(charp +charc2  +"::pn");
      features.add(charp2 +charp  +"::p2p");
      features.add(charp2 +charc  +"::p2c");
      features.add(charc2 +charc  +"::n2c");
    }


    if (flags.dictionary != null || flags.serializedDictionary != null) {
      dictionaryFeaturesC(CoreAnnotations.LBeginAnnotation.class, CoreAnnotations.LMiddleAnnotation.class, CoreAnnotations.LEndAnnotation.class,"",features, p, c, c2);
    }


    if (flags.dictionary2 != null) {
      dictionaryFeaturesC(CoreAnnotations.D2_LBeginAnnotation.class, CoreAnnotations.D2_LMiddleAnnotation.class, CoreAnnotations.D2_LEndAnnotation.class,"-D2-",features, p, c, c2);
    }


    if (flags.useFeaturesC4gram || flags.useFeaturesC5gram || flags.useFeaturesC6gram) {
      features.add(charp2 + charp  +"p2p");
      features.add(charp2 + "p2");
    }
    if (flags.useFeaturesC5gram || flags.useFeaturesC6gram) {
      features.add(charc3+"c3");
      features.add(charc2 + charc3 + "c2c3");
    }
    if (flags.useFeaturesC6gram) {
      features.add(charp3 + "p3");
      features.add(charp3 + charp2 + "p3p2");
    }


    if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
      features.add(uTypep + "-" + uTypec + "-" + uTypec2 + "-uType3");
    }
    if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
      features.add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType4");
    }
    if (flags.useUnicodeType5gram) {
      features.add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-" + uTypec3 + "-uType5");
    }
    if (flags.useUnicodeBlock) {
      features.add(p.getString(CoreAnnotations.UBlockAnnotation.class) + "-" + c.getString(CoreAnnotations.UBlockAnnotation.class) + "-" + c2.getString(CoreAnnotations.UBlockAnnotation.class) + "-uBlock");
    }
    if (flags.useShapeStrings) {
      if (flags.useShapeStrings1) {
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + "ps");
        features.add(c.getString(CoreAnnotations.ShapeAnnotation.class) + "cs");
        features.add(c2.getString(CoreAnnotations.ShapeAnnotation.class) + "c2s");
      }
      if (flags.useShapeStrings3) {
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + "pscsc2s");
      }
      if (flags.useShapeStrings4) {
        features.add(p2.getString(CoreAnnotations.ShapeAnnotation.class) + p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + "p2spscsc2s");
      }
      if (flags.useShapeStrings5) {
        features.add(p2.getString(CoreAnnotations.ShapeAnnotation.class) + p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + c3.getString(CoreAnnotations.ShapeAnnotation.class) + "p2spscsc2sc3s");
      }
    }


    features.add("cliqueC");


    return features;
  }




  private void dictionaryFeaturesCpC(Class<? extends CoreAnnotation<String>> lbeginFieldName,
                                     Class<? extends CoreAnnotation<String>> lmiddleFieldName,
                                     Class<? extends CoreAnnotation<String>> lendFieldName,
                                     String dictSuffix, Collection<String> features, CoreLabel p2, CoreLabel p, CoreLabel c, CoreLabel c2) {
    String lbegin = c.getString(lbeginFieldName);
    String lmiddle = c.getString(lmiddleFieldName);
    String lend = c.getString(lendFieldName);
    features.add(lbegin+dictSuffix+"-lb");
    features.add(lmiddle+dictSuffix+"-lm");
    features.add(lend+dictSuffix+"-le");


    lbegin = p.getString(lbeginFieldName);
    lmiddle = p.getString(lmiddleFieldName);
    lend = p.get(lendFieldName);
    features.add(lbegin+dictSuffix+"-plb");
    features.add(lmiddle+dictSuffix+"-plm");
    features.add(lend+dictSuffix+"-ple");


    lbegin = c2.getString(lbeginFieldName);
    lmiddle = c2.getString(lmiddleFieldName);
    lend = c2.getString(lendFieldName);
    features.add(lbegin+dictSuffix+"-c2lb");
    features.add(lmiddle+dictSuffix+"-c2lm");
    features.add(lend+dictSuffix+"-c2le");


    if (flags.useDictionaryConjunctions) {
      String p2Lend = p2.getString(lendFieldName);
      String pLend = p.getString(lendFieldName);
      String pLbegin = p.getString(lbeginFieldName);
      String cLbegin = c.getString(lbeginFieldName);
      String cLmiddle = c.getString(lmiddleFieldName);
      if (flags.useDictionaryConjunctions3) {
        features.add(pLend + cLbegin + cLmiddle + dictSuffix + "-pcLconj1");
      }
      features.add(p2Lend + pLend + cLbegin + cLmiddle + dictSuffix + "-p2pcLconj1");
      features.add(p2Lend + pLend + pLbegin + cLbegin + cLmiddle + dictSuffix + "-p2pcLconj2");
    }
  }




  protected Collection<String> featuresCpC(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<String>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c2 = cInfo.get(loc + 1);
    CoreLabel c3 = cInfo.get(loc + 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);


    String charc = c.getString(CoreAnnotations.CharAnnotation.class);
    String charc2 = c2.getString(CoreAnnotations.CharAnnotation.class);
    String charc3 = c3.getString(CoreAnnotations.CharAnnotation.class);
    String charp = p.getString(CoreAnnotations.CharAnnotation.class);
    String charp2 = p2.getString(CoreAnnotations.CharAnnotation.class);
    String charp3 = p3.getString(CoreAnnotations.CharAnnotation.class);


    Integer cI = c.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypec = (cI != null ? cI.toString() : "");
    Integer c2I = c2.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypec2 = (c2I != null ? c2I.toString() : "");
    Integer c3I = c3.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypec3 = (c3I != null ? c3I.toString() : "");
    Integer pI = p.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypep = (pI != null ? pI.toString() : "");
    Integer p2I = p2.get(CoreAnnotations.UTypeAnnotation.class);
    String uTypep2 = (p2I != null ? p2I.toString() : "");


    if (flags.dictionary != null || flags.serializedDictionary != null) {
      dictionaryFeaturesCpC(CoreAnnotations.LBeginAnnotation.class, CoreAnnotations.LMiddleAnnotation.class, CoreAnnotations.LEndAnnotation.class,"",features, p2, p, c, c2);
    }
    if (flags.dictionary2 != null) {
      dictionaryFeaturesCpC(CoreAnnotations.D2_LBeginAnnotation.class, CoreAnnotations.D2_LMiddleAnnotation.class, CoreAnnotations.D2_LEndAnnotation.class,"-D2-",features, p2, p, c, c2);
    }


    /*
     * N-gram features. N is upto 2.
     */
    if (flags.useWord2) {
      // features.add(charc +"c");
      // features.add(charc2+"c2");
      // features.add(charp +"p");
      // features.add(charp + charc  +"pc");
      // features.add(charc + charc2  +"cc2");
      // // cdm: need hyphen so you can see which of charp or charc2 is null....
      // features.add(charp + "-" + charc2 + "pc2");




      features.add(charc +"::c");
      features.add(charc2+"::c1");
      features.add(charp +"::p");
      features.add(charp2 +"::p2");
      // trying to restore the features that Huihsin described in SIGHAN 2005 paper
      features.add(charc +charc2  +"::cn"); // (*)
      features.add(charp +charc  +"::pc");
      features.add(charp +charc2  +"::pn");
      features.add(charp2 +charp  +"::p2p");
      features.add(charp2 +charc  +"::p2c");
      features.add(charc2 +charc  +"::n2c"); // todo: this is messed up: Same as one above at (*); should be cn2 = charc + charc3 + "::cn2"


    }
    if (flags.useFeaturesCpC4gram || flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram) {
      // todo: Both these features duplicate ones already in useWord2
      features.add(charp2 + charp  +"p2p");
      features.add(charp2 + "p2");
    }
    if (flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram) {
      features.add(charc3+"c3");
      features.add(charc2 + charc3 + "c2c3");
    }
    if (flags.useFeaturesCpC6gram) {
      features.add(charp3 + "p3");
      features.add(charp3 + charp2 + "p3p2");
    }
    if (flags.useGoodForNamesCpC) {
      // these 2 features should be distinctively good at biasing from
      // picking up a Chinese family name in the p2 or p3 positions:
      // familyName X X startWord AND familyName X startWord
      // But actually they seem to have negative value.
      features.add(charp2 + "p2");
      features.add(charp3 + "p3");
    }


    if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
      features.add(uTypep + "-" + uTypec + "-" + uTypec2 + "-uType3");
    }
    if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
      features.add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType4");
    }
    if (flags.useUnicodeType5gram) {
      features.add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-" + uTypec3 + "-uType5");
    }
    if (flags.useWordUTypeConjunctions2) {
      features.add(uTypep + charc + "putcc");
      features.add(charp + uTypec + "pccut");
    }
    if (flags.useWordUTypeConjunctions3) {
      features.add(uTypep2 + uTypep + charc + "p2utputcc");
      features.add(uTypep + charc + uTypec2 + "putccc2ut");
      features.add(charc + uTypec2 + uTypec3 + "ccc2utc3ut");
    }
    if (flags.useUnicodeBlock) {
      features.add(p.getString(CoreAnnotations.UBlockAnnotation.class) + "-" + c.getString(CoreAnnotations.UBlockAnnotation.class) + "-" + c2.getString(CoreAnnotations.UBlockAnnotation.class) + "-uBlock");
    }


    if (flags.useShapeStrings) {
      if (flags.useShapeStrings1) {
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + "ps");
        features.add(c.getString(CoreAnnotations.ShapeAnnotation.class) + "cs");
        features.add(c2.getString(CoreAnnotations.ShapeAnnotation.class) + "c2s");
      }
      if (flags.useShapeStrings3) {
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + "pscsc2s");
      }
      if (flags.useShapeStrings4) {
        features.add(p2.getString(CoreAnnotations.ShapeAnnotation.class) + p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + "p2spscsc2s");
      }
      if (flags.useShapeStrings5) {
        features.add(p2.getString(CoreAnnotations.ShapeAnnotation.class) + p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + c3.getString(CoreAnnotations.ShapeAnnotation.class) + "p2spscsc2sc3s");
      }
      if (flags.useWordShapeConjunctions2) {
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + charc + "pscc");
        features.add(charp + c.getString(CoreAnnotations.ShapeAnnotation.class) + "pccs");
      }
      if (flags.useWordShapeConjunctions3) {
        features.add(p2.getString(CoreAnnotations.ShapeAnnotation.class) + p.getString(CoreAnnotations.ShapeAnnotation.class) + charc + "p2spscc");
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + charc + c2.getString(CoreAnnotations.ShapeAnnotation.class) + "psccc2s");
        features.add(charc + c2.getString(CoreAnnotations.ShapeAnnotation.class) + c3.getString(CoreAnnotations.ShapeAnnotation.class) + "ccc2sc3s");
      }
    }


    /*
      Radical N-gram features. N is upto 4.
      Smoothing method of N-gram, because there are too many characters in Chinese.
      (It works better than N-gram when they are used individually. less sparse)
    */


    char rcharc, rcharc2, rcharp, rcharp2;
    if (charc.length()==0) { rcharc='n'; } else { rcharc= RadicalMap.getRadical(charc.charAt(0));}
    if (charc2.length()==0) { rcharc2='n'; } else { rcharc2=RadicalMap.getRadical(charc2.charAt(0));}
    if (charp.length()==0)  { rcharp='n';  } else { rcharp=RadicalMap.getRadical(charp.charAt(0));  }
    if (charp2.length()==0) { rcharp2='n'; } else { rcharp2=RadicalMap.getRadical(charp2.charAt(0));}


    if (flags.useRad2) {
      features.add(rcharc+"rc");
      features.add(rcharc2+"rc2");
      features.add(rcharp+"rp");
      features.add(rcharp  +  rcharc+"rprc");
      features.add(rcharc +rcharc2 +"rcrc2");
      features.add(rcharp +  rcharc  +rcharc2 +"rprcrc2");
    }
    if (flags.useRad2b) {
      features.add(rcharc+"rc");
      features.add(rcharc2+"rc2");
      features.add(rcharp+"rp");
      features.add(rcharp  +  rcharc+"rprc");
      features.add(rcharc +rcharc2 +"rcrc2");
      features.add(rcharp2 +rcharp +"rp2rp");
    }


    /* Non-word dictionary: SEEN bi-gram marked as non-word.
     * This is frickin' useful.  I hadn't realized.  CDM Oct 2007.
     */
    if (flags.useDict2) {
      NonDict2 nd = new NonDict2(flags);
      features.add(nd.checkDic(charp+charc, flags)+"nondict");
    }


    if (flags.useOutDict2) {
      if (outDict == null) {
        createOutDict();
      }
      features.add(outDict.getW(charp+charc)+"outdict");       // -1 0
      features.add(outDict.getW(charc+charc2)+"outdict");      // 0 1
      features.add(outDict.getW(charp2+charp)+"outdict");      // -2 -1
      features.add(outDict.getW(charp2+charp+charc)+"outdict");      // -2 -1 0
      features.add(outDict.getW(charp3+charp2+charp)+"outdict");      // -3 -2 -1
      features.add(outDict.getW(charp+charc+charc2)+"outdict");      // -1 0 1
      features.add(outDict.getW(charc+charc2+charc3)+"outdict");      // 0 1 2
      features.add(outDict.getW(charp+charc+charc2+charc3)+"outdict");      // -1 0 1 2
    }


    /*
      (CTB/ASBC/HK/PK/MSR) POS information of each characters.
      If a character falls into some function categories,
      it is very likely there is a boundary.
      A lot of Chinese function words belong to single characters.
      This feature is also good for numbers and punctuations.
      DE* are grouped into DE.
    */
    if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2
        || flags.usePKChar2 || flags.useMSRChar2) {
      String[] tagsets;
      // the "useChPos" now only works for CTB and PK
      if (flags.useChPos) {
        if(flags.useCTBChar2) {
          tagsets = new String[]{"AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M",  "NN",  "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
        } else if (flags.usePKChar2) {
          //tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"};
          tagsets = new String[]{"2","3","4"};
        } else {
          throw new RuntimeException("only support settings for CTB and PK now.");
        }
      } else {
        //System.err.println("Using Derived features");
        tagsets = new String[]{"2","3","4"};
      }


      if (taDetector == null) {
        createTADetector();
      }
      for (String tag : tagsets) {
  features.add(taDetector.checkDic(tag+"p", charp) + taDetector.checkDic(tag+"i", charp) + taDetector.checkDic(tag+"s", charc)+ taDetector.checkInDic(charp)+taDetector.checkInDic(charc)+ tag+ "prep-sufc" );
        //features.add("|ctbchar2");
      }
    }


    /*
      In error analysis, we found English words and numbers are often separated.
      Rule 1: isNumber feature: check if the current and previous char is a number.
      Rule 2: Disambiguation of time point and time duration.
      Rule 3: isEnglish feature: check if the current and previous character is an english letter.
      Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
      Most of PUs are a good indicator for word boundary, but - and .  is a strong indicator that there is no boundry within a previous , a follow char and it.
    */


    if (flags.useRule2) {
      /* Reduplication features */
      // previous character == current character
      if(charp.equals(charc)){ features.add("11-R2");}
      // previous character == next character
      if(charp.equals(charc2)){ features.add("22-R2");}


      // current character == next next character
      // fire only when usePk and useHk are both false.
      // Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
      if( !flags.usePk && !flags.useHk) {
        if(charc.equals(charc2)){features.add("33-R2");}
      }


      char cur1 = ' ';
      char cur2 = ' ';
      char cur =  ' ';
      char pre =  ' ';
      // actually their length must be either 0 or 1
      if (charc2.length() > 0) { cur1 = charc2.charAt(0); }
      if (charc3.length() > 0) { cur2 = charc3.charAt(0); }
      if (charc.length() > 0) { cur = charc.charAt(0); }
      if (charp.length() > 0) { pre = charp.charAt(0); }


      String prer= String.valueOf(rcharp); // the radical of previous character


      Pattern E = Pattern.compile("[a-zA-Z]");
      Pattern N = Pattern.compile("[0-9]");
      Matcher m = E.matcher(charp);
      Matcher ce = E.matcher(charc);
      Matcher pe = E.matcher(charp2);
      Matcher cn = N.matcher(charc);
      Matcher pn = N.matcher(charp2);




      // if current and previous characters are numbers...
      if (cur >= '0' && cur <= '9'&& pre >= '0' && pre <= '9'){
        if (cur == '9' && pre == '1' && cur1 == '9'&& cur2 >= '0' && cur2 <= '9'){ //199x
          features.add("YR-R2");
        }else{
          features.add("2N-R2");
        }


        // if current and previous characters are not both numbers
        // but previous char is a number
        // i.e. patterns like "1N" , "2A", etc
      } else if (pre >= '0' && pre <= '9'){
        features.add("1N-R2");


        // if previous character is an English character
      } else if(m.matches()){
        features.add("E-R2");


        // if the previous character contains no radical (and it exist)
      } else if(prer.equals(".") && charp.length() == 1){
        if(ce.matches()){
          features.add("PU+E-R2");
        }
        if(pe.matches()){
          features.add("E+PU-R2");
        }
        if(cn.matches()){
          features.add("PU+N-R2");
        }
        if(pn.matches()){
          features.add("N+PU-R2");
        }
        features.add("PU-R2");
      }


      String engType = isEnglish(charp, charc);
      String engPU = isEngPU(charp);
      if ( ! engType.equals(""))
        features.add(engType);
      if ( ! engPU.equals("") && ! engType.equals("")) {
        StringBuilder sb = new StringBuilder();
        sb.append(engPU).append(engType).append("R2");
        features.add(sb.toString());
      }
    }//end of use rule




    // features using "Character.getType" information!
    String origS = c.getString(CoreAnnotations.OriginalCharAnnotation.class);
    char origC = ' ';
    if (origS.length() > 0) { origC = origS.charAt(0); }
    int type = Character.getType(origC);
    switch (type) {
    case Character.UPPERCASE_LETTER: // A-Z and full-width A-Z
    case Character.LOWERCASE_LETTER: // a-z and full-width a-z
      features.add("CHARTYPE-LETTER");
      break;
    case Character.DECIMAL_DIGIT_NUMBER:
      features.add("CHARTYPE-DECIMAL_DIGIT_NUMBER");
      break;
    case Character.OTHER_LETTER: // mostly chinese chars
      features.add("CHARTYPE-OTHER_LETTER");
      break;
    default: // other types
      features.add("CHARTYPE-MISC");
    }


    features.add("cliqueCpC");


    return features;
  } // end featuresCpC




  /** For a CRF, this shouldn't be necessary, since the features duplicate
   *  those from CpC, but Huihsin found some valuable, presumably becuase
   *  it modified the regularization a bit.
   *
   *  @param cInfo The list of characters
   *  @param loc Position of c in list
   *  @return Collection of String features (sparse set of boolean features
   */
  protected Collection<String> featuresCnC(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<String>();
    if (flags.useWordn) {
      CoreLabel c = cInfo.get(loc);
      CoreLabel c2 = cInfo.get(loc + 1);
      CoreLabel p = cInfo.get(loc - 1);
      CoreLabel p2 = cInfo.get(loc - 2);
      String charc = c.getString(CoreAnnotations.CharAnnotation.class);
      String charc2 = c2.getString(CoreAnnotations.CharAnnotation.class);
      String charp = p.getString(CoreAnnotations.CharAnnotation.class);
      String charp2 = p2.getString(CoreAnnotations.CharAnnotation.class);


      features.add(charc +"c");
      features.add(charc2+"c2");
      features.add(charp +"p");
      features.add(charp2 + "p2");
      features.add(charp2 + charp  +"p2p");
      features.add(charp + charc  +"pc");
      features.add(charc + charc2  +"cc2");
      features.add(charp + "-" + charc2 + "pc2");
      features.add("cliqueCnC");
    }
    return features;
  } //end of CnC




  /** Second order clique features
   *
   *  @param cInfo The list of characters
   *  @param loc Position of c in list
   *  @return Collection of String features (sparse set of boolean features
   */
  protected Collection<String> featuresCpCp2C(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<String>();
    CoreLabel c = cInfo.get(loc);
    CoreLabel c2 = cInfo.get(loc + 1);
    CoreLabel c3 = cInfo.get(loc + 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p3 = cInfo.get(loc - 3);


    String charc = c.getString(CoreAnnotations.CharAnnotation.class);
    String charc2 = c2.getString(CoreAnnotations.CharAnnotation.class);
    String charc3 = c3.getString(CoreAnnotations.CharAnnotation.class);
    String charp = p.getString(CoreAnnotations.CharAnnotation.class);
    String charp2 = p2.getString(CoreAnnotations.CharAnnotation.class);
    String charp3 = p3.getString(CoreAnnotations.CharAnnotation.class);


    // N-gram features. N is up to 3
    if (flags.useWord3) {
      features.add(charc +"::c");
      features.add(charc2+"::n");
      features.add(charp +"::p");
      features.add(charp2 +"::p2");
      // trying to restore the features that Huihsin described in SIGHAN 2005 paper
      features.add(charc + charc2  +"::cn");
      features.add(charc + charc2 + charc3 + "::cnn2");
      features.add(charp + charc  +"::pc");
      features.add(charp + charc2  +"::pn");
      features.add(charp2 + charp  +"::p2p");
      features.add(charp3 + charp2 + charp + "::p3p2p");
      features.add(charp2 + charc  +"::p2c");
      features.add(charc + charc3  +"::cn2");


    }


    if (flags.useShapeStrings) {
      if (flags.useShapeStrings1) {
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + "ps");
        features.add(c.getString(CoreAnnotations.ShapeAnnotation.class) + "cs");
        features.add(c2.getString(CoreAnnotations.ShapeAnnotation.class) + "c2s");
      }
      if (flags.useShapeStrings3) {
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + "pscsc2s");
      }
      if (flags.useShapeStrings4) {
        features.add(p2.getString(CoreAnnotations.ShapeAnnotation.class) + p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + "p2spscsc2s");
      }
      if (flags.useShapeStrings5) {
        features.add(p2.getString(CoreAnnotations.ShapeAnnotation.class) + p.getString(CoreAnnotations.ShapeAnnotation.class) + c.getString(CoreAnnotations.ShapeAnnotation.class) + c2.getString(CoreAnnotations.ShapeAnnotation.class) + c3.getString(CoreAnnotations.ShapeAnnotation.class) + "p2spscsc2sc3s");
      }
      if (flags.useWordShapeConjunctions2) {
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + charc + "pscc");
        features.add(charp + c.getString(CoreAnnotations.ShapeAnnotation.class) + "pccs");
      }
      if (flags.useWordShapeConjunctions3) {
        features.add(p2.getString(CoreAnnotations.ShapeAnnotation.class) + p.getString(CoreAnnotations.ShapeAnnotation.class) + charc + "p2spscc");
        features.add(p.getString(CoreAnnotations.ShapeAnnotation.class) + charc + c2.getString(CoreAnnotations.ShapeAnnotation.class) + "psccc2s");
        features.add(charc + c2.getString(CoreAnnotations.ShapeAnnotation.class) + c3.getString(CoreAnnotations.ShapeAnnotation.class) + "ccc2sc3s");
      }
    }


    /*
      Radical N-gram features. N is upto 4.
      Smoothing method of N-gram, because there are too many characters in Chinese.
      (It works better than N-gram when they are used individually. less sparse)
    */


    char rcharc, rcharc2, rcharp, rcharp2;
    if (charc.length()==0) { rcharc='n'; } else { rcharc= RadicalMap.getRadical(charc.charAt(0));}
    if (charc2.length()==0) { rcharc2='n'; } else { rcharc2=RadicalMap.getRadical(charc2.charAt(0));}
    if (charp.length()==0)  { rcharp='n';  } else { rcharp=RadicalMap.getRadical(charp.charAt(0));  }
    if (charp2.length()==0) { rcharp2='n'; } else { rcharp2=RadicalMap.getRadical(charp2.charAt(0));}


    if (flags.useRad2) {
      features.add(rcharc+"rc");
      features.add(rcharc2+"rc2");
      features.add(rcharp+"rp");
      features.add(rcharp  +  rcharc+"rprc");
      features.add(rcharc +rcharc2 +"rcrc2");
      features.add(rcharp +  rcharc  +rcharc2 +"rprcrc2");
    }
    if (flags.useRad2b) {
      features.add(rcharc+"rc");
      features.add(rcharc2+"rc2");
      features.add(rcharp+"rp");
      features.add(rcharp  +  rcharc+"rprc");
      features.add(rcharc +rcharc2 +"rcrc2");
      features.add(rcharp2 +rcharp +"rp2rp");
    }


    features.add("cliqueCpCp2C");


    return features;
  } // end featuresCpCp2C




  protected Collection<String> featuresCpCp2Cp3C(PaddedList<? extends CoreLabel> cInfo, int loc) {
    Collection<String> features = new ArrayList<String>();
    if (flags.use4Clique && flags.maxLeft >= 3) {
      CoreLabel c = cInfo.get(loc);
      CoreLabel c2 = cInfo.get(loc + 1);
      CoreLabel p = cInfo.get(loc - 1);
      CoreLabel p2 = cInfo.get(loc - 2);
      CoreLabel p3 = cInfo.get(loc - 3);
      String charc = c.getString(CoreAnnotations.CharAnnotation.class);
      String charp = p.getString(CoreAnnotations.CharAnnotation.class);
      String charp2 = p2.getString(CoreAnnotations.CharAnnotation.class);
      String charp3 = p3.getString(CoreAnnotations.CharAnnotation.class);
      Integer cI = c.get(CoreAnnotations.UTypeAnnotation.class);
      String uTypec = (cI != null ? cI.toString() : "");
      Integer c2I = c2.get(CoreAnnotations.UTypeAnnotation.class);
      String uTypec2 = (c2I != null ? c2I.toString() : "");
      Integer pI = p.get(CoreAnnotations.UTypeAnnotation.class);
      String uTypep = (pI != null ? pI.toString() : "");
      Integer p2I = p2.get(CoreAnnotations.UTypeAnnotation.class);
      String uTypep2 = (p2I != null ? p2I.toString() : "");
      Integer p3I = p3.get(CoreAnnotations.UTypeAnnotation.class);
      String uTypep3 = (p3I != null ? p3I.toString() : "");




      if (flags.useLongSequences) {
        features.add(charp3 + charp2 + charp + charc + "p3p2pc");
      }
      if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) {
        features.add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-uType4");
      }
      if (flags.useUnicodeType5gram) {
        features.add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType5");
      }
      features.add("cliqueCpCp2Cp3C");
    }
    return features;
  }


  /** Version identifier used by Java serialization for this class. */
  private static final long serialVersionUID = 8197648719208850960L;


} // end class Gale2007ChineseSegmenterFeatureFactory
Source Code of edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory

Related Classes of edu.stanford.nlp.wordseg.Gale2007ChineseSegmenterFeatureFactory