Package org.ictclas4j.bean

Examples of org.ictclas4j.bean.Dictionary


  public static ArrayList<SegNode> finalAdjust(ArrayList<SegNode> optSegPath, DictLib dictLib) {
    SegNode wr = null;
    ArrayList<SegNode> result = null;
    if (dictLib == null)
      return null;
    Dictionary placeTagger = dictLib.getPlaceUnknownDict();
    Dictionary personTagger = dictLib.getPersonUnknownDict();

    if (optSegPath != null && optSegPath.size() > 0 && personTagger != null && placeTagger != null) {

      result = new ArrayList<SegNode>();
      for (int i = 0; i < optSegPath.size(); i++) {
View Full Code Here


 
  public static  PersonName chineseNameSplit(String word, PosTagger personTagger ) {
    PersonName result = null;

    if (word != null && personTagger!=null  ) {
      Dictionary personDict =personTagger.getUnknownDict();
      int len = word.length();
      if (len < 2 || len > 4)
        return null;
      String[] atoms = GFString.atomSplit(word);
      for (String s : atoms) {
        if (Utility.charType(s) != Utility.CT_CHINESE && Utility.charType(s) != Utility.CT_OTHER)
          return null;
      }

      String surName = null;
      int surNameLen = 2;
      if (len > 2)
        surName = word.substring(0, surNameLen);
      else if (len == 2)
        surName = word;
      if (!personDict.isExist(surName, 1)) {
        surNameLen = 1;
        if (len > 1)
          surName = word.substring(0, surNameLen);
        else if (len == 1)
          surName = word;
        if (!personDict.isExist(surName, 1)) {
          surName = null;
          surNameLen = 0;
        }
      }
      String giveName = word.substring(surNameLen);
      if (len > 3) {
        String temp = word.substring(surNameLen, surNameLen + 1);
        if (personDict.isExist(temp, 1)) {

          giveName = word.substring(surNameLen + 1);
        }
      }

      double freq = personDict.getFreq(surName, 1);
      String temp = giveName.substring(0, 1);
      double freq2 = personDict.getFreq(temp, 2);

      if (surNameLen != 2
          && ((surNameLen == 0 && len > 2) || giveName.length() > 2 || getForeignCharCount(word) >= 3
              && freq < personDict.getFreq("��", 1) / 40 && freq2 < personDict.getFreq("��", 2) / 20 || (freq < 10 && getForeignCharCount(giveName) == (len - surNameLen) / 2)))
        return null;
      if (len == 2 && personTagger.isGivenName(word))
        return null;
      result=new PersonName();
      result.setFirstName(surName);
View Full Code Here

    SegNode sn = null;
    Atom atom = null;

    if (atoms != null && atoms.size() > 0 && dictLib != null) {
      segGraph = new SegGraph();
      Dictionary dict = dictLib.getCoreDict();

      // �ȰѷǺ����ַ��Ĵ���ʶ�����
      for (int i = 0; i < atoms.size(); i++) {
        atom = atoms.get(i);
        String word = atom.getWord();
        if (atom.getPos() == Utility.CT_CHINESE)
          sn = new SegNode(i, i + 1, 0, 0, atom.getWord());
        else {
          int pos = 0;
          double value = Utility.MAX_FREQUENCE;

          switch (atom.getPos()) {
          case Utility.CT_INDEX:
          case Utility.CT_NUM:
            pos = -POSTag.NUM;// 'm'*256
            word = Utility.UNKNOWN_NUM;
            value = 0;
            break;
          case Utility.CT_DELIMITER:
            pos = POSTag.PUNC;// 'w'*256;
            break;
          case Utility.CT_LETTER:
            pos = -POSTag.NOUN_LETTER;//
            value = 0;
            word = Utility.UNKNOWN_LETTER;
            break;
          case Utility.CT_SINGLE:// 12021-2129-3121
            if (Utility.getCharCount("+-1234567890", atom.getWord()) == atom.getLen()) {
              pos = -POSTag.NUM;// 'm'*256
              word = Utility.UNKNOWN_NUM;
            } else {
              pos = -POSTag.NOUN_LETTER;//
              word = Utility.UNKNOWN_LETTER;
            }
            value = 0;
            break;
          default:
            pos = atom.getPos();// '?'*256;
            break;
          }

          int gbkID = dictLib.getGBKID(word);
          sn = new SegNode(i, i + 1, pos, value, word);
          sn.setGbkID(gbkID);
        }

        sn.setSrcWord(atom.getWord());
        segGraph.insert(sn, true);
      }

      StringBuffer words = new StringBuffer();
      for (int i = 0; i < atoms.size(); i++) {
        int j = i + 1;
        words.delete(0, words.length());
        words.append(atoms.get(i).getWord());

        // ����ǡ��·ݡ�����Ҫ�ָ�
        boolean flag = false;
        if (j < atoms.size()) {
          Atom a2 = atoms.get(j);
          if ("��".equals(words.toString()) && "��".equals(a2.getWord())) {
            segGraph.delete(i, j);
            segGraph.delete(i + 1, j + 1);
            words.append(a2.getWord());
            flag = true;
            j++;
          }
        }

        SegAtom sa = null;
        String word = words.toString();
        int gbkID = dictLib.getGBKID(word);
        int wordMaxLen = dict.getWordMaxLen(word, gbkID);
        for (; j <= atoms.size() && word.length() < wordMaxLen; j++) {
          word = words.toString();
          sa = dict.getSegAtom(word, gbkID);
          if (sa != null) {
            // 1���ڣ�1999��ĩ
            // if (word.length() == 2 && segGraph.getSize() > 0) {
            // SegNode g2 = segGraph.getLast();
            // if (Utility.isAllNum(g2.getWord()) ||
View Full Code Here

    double curFreq;
    SegGraph segGraph = null;
    final double smoothParam = 0.1;
    if (dictLib == null)
      return null;
    Dictionary dict = dictLib.getCoreDict();
    Dictionary biDict = dictLib.getBigramDict();

    if (seg != null && dict != null && biDict != null) {
      segGraph = new SegGraph();
      ArrayList<SegNode> sgs = seg.getSnList();

      for (int i = 0; sgs != null && i < sgs.size(); i++) {
        SegNode sg = sgs.get(i);
        if (sg.getPos() >= 0)
          curFreq = sg.getWeight();
        else {
          int gbkID = sg.getGbkID();// dictLib.getGBKID(sg.getWord());
          curFreq = dict.getFreq(sg.getWord(), 2, gbkID);
        }

        // �õ�������ֵ�͸���ֵ��ȵ�����Ԫ��
        ArrayList<SegNode> nextSgs = seg.getNextElements(i);
        for (SegNode graph : nextSgs) {
          String twoWords = sg.getWord();
          twoWords += Utility.WORD_SEGMENTER;
          twoWords += graph.getWord();
          int gbkID = sg.getGbkID();// dictLib.getGBKID(twoWords);

          // ��������������֮���ƽ��ֵ
          // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
          int twoFreq = biDict.getFreq(twoWords, 3, gbkID);
          double temp = (double) 1 / Utility.MAX_FREQUENCE;
          double value = smoothParam * (1 + curFreq) / (Utility.MAX_FREQUENCE + 80000);
          value += (1 - smoothParam) * ((1 - temp) * twoFreq / (1 + curFreq) + temp);
          value = -Math.log(value);

View Full Code Here

  public static PersonName chineseNameSplit( PosTagger personTagger,String word, int index) {
    PersonName result = null;

    if ( word != null && personTagger != null) {
      Dictionary personDict = personTagger.getUnknownDict();
      int len = word.length();
      if (len < 2 || len > 4)
        return null;
      String[] atoms = GFString.atomSplit(word);
      for (String s : atoms) {
        if (Utility.charType(s) != Utility.CT_CHINESE && Utility.charType(s) != Utility.CT_OTHER)
          return null;
      }

      String surName = null;
      int surNameLen = 2;
      if (len > 2)
        surName = word.substring(0, surNameLen);
      else if (len == 2)
        surName = word;
      if (!personDict.isExist( surName, 1,index)) {
        surNameLen = 1;
        if (len > 1)
          surName = word.substring(0, surNameLen);
        else if (len == 1)
          surName = word;
        if (!personDict.isExist( surName, 1,index)) {
          surName = null;
          surNameLen = 0;
        }
      }
      String giveName = word.substring(surNameLen);
      if (len > 3) {
        String temp = word.substring(surNameLen, surNameLen + 1);
        if (personDict.isExist( temp, 1,index)) {

          giveName = word.substring(surNameLen + 1);
        }
      }

      double freq = personDict.getFreq( surName, 1,index);
      String temp = giveName.substring(0, 1);
      double freq2 = personDict.getFreq( temp, 2,index);

      if (surNameLen != 2
          && ((surNameLen == 0 && len > 2) || giveName.length() > 2 || getForeignCharCount(word) >= 3
              && freq < personDict.getFreq( "��", 1,index) / 40 && freq2 < personDict.getFreq( "��", 2,index) / 20 || (freq < 10 && getForeignCharCount(giveName) == (len - surNameLen) / 2)))
        return null;
      if (len == 2 && personTagger.isGivenName(word))
        return null;
      result = new PersonName();
      result.setFirstName(surName);
View Full Code Here

  static Logger logger = Logger.getLogger(Segment.class);

  public Segment(int segPathCount) {
    this.segPathCount = segPathCount;
    logger.info("Load coreDict  ...");
    coreDict = new Dictionary("data\\coreDict.dct");

    logger.info("Load bigramDict ...");
    bigramDict = new Dictionary("data\\bigramDict.dct");

    logger.info("Load tagger dict ...");
    personTagger = new PosTagger(Utility.TAG_TYPE.TT_PERSON, "data\\nr", coreDict);
    transPersonTagger = new PosTagger(Utility.TAG_TYPE.TT_TRANS_PERSON, "data\\tr", coreDict);
    placeTagger = new PosTagger(Utility.TAG_TYPE.TT_TRANS_PERSON, "data\\ns", coreDict);
View Full Code Here

    if (fileName != null) {
      this.coreDict = coreDict;
      if (type == Utility.TAG_TYPE.TT_NORMAL)
        this.unknownDict = coreDict;
      else {
        unknownDict = new Dictionary();
        unknownDict.load(fileName + ".dct");

      }
      context = new ContextStat();
      context.load(fileName + ".ctx");
View Full Code Here

    PosContext cs = new PosContext("data\\lexical.ctx");
    System.out.println(cs);
  }
 
  public static void test3(){
    Dictionary cs = new Dictionary("E:\\document\\NLP\\corpus\\eve\\bigramDict.dct");
    System.out.println(cs);
  }
View Full Code Here

    //testRead();
    convertFormat();
  }
 
  public static  void testWrite(){
    Dictionary dict =new Dictionary();
  }
View Full Code Here

  // Convert the dictionary from the old format to the new format and compute wordMaxLen
  public static void convertFormat(){
    Dictionary0 dict=new Dictionary0(false);
    dict.load("data\\tr.dct");
    Dictionary dict2=new Dictionary(false);
    WordTable[] wts=dict2.getWts();
   
    long size=0;
    for(int i=0;i<dict.dict_count;i++){
      WordTable0 wt0=dict.wts[i];
      ArrayList<WordItem0> wis0=wt0.getWords();
      int count=0;
      int wordMaxLen=0;
      if(wis0!=null){
        System.out.println("size:"+i+","+wis0.size()+","+size);
        HashMap<String, SegAtom> wordMap = new HashMap<String, SegAtom>();
        for(int j=0;j<wis0.size();j++){
          SegAtom sa=new SegAtom();
          WordItem0 wi=wis0.get(j);
          sa.setWord(wi.getWord());
          sa.addPos(new Pos(wi.getHandle(),wi.getFreq(),false));
          count++;
          size+=8+wi.getWord().getBytes().length;
         
          //����ͬ�Ĵʣ����кϲ�
          while(j<wis0.size()-1 && wis0.get(j).getWord()!=null && wis0.get(j).getWord().equals(wis0.get(j+1).getWord())){
            wi=wis0.get(j+1);
            sa.addPos(new Pos(wi.getHandle(),wi.getFreq(),false));
            j++;
            size+=8;
          }
         
          wordMap.put(sa.getWord(), sa);
          if(sa.getWord().length()>wordMaxLen)
            wordMaxLen=sa.getWord().length();
         
        }
       
        WordTable wt=new WordTable();
        wt.setWordCount(count);
        wt.setWordMaxLen(wordMaxLen);
        wt.setWordMap(wordMap);
        wts[i]=wt;
      }
    }
   
    dict2.save("data2\\tr.dct");
  }
View Full Code Here

TOP

Related Classes of org.ictclas4j.bean.Dictionary

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle, Inc. Contact coftware#gmail.com.