// Package org.ictclas4j.segment
//
// Source code of org.ictclas4j.segment.GraphGenerate

package org.ictclas4j.segment;

import java.util.ArrayList;

import org.ictclas4j.bean.Atom;
import org.ictclas4j.bean.Dictionary;
import org.ictclas4j.bean.SegNode;
import org.ictclas4j.bean.WordItem;
import org.ictclas4j.utility.POSTag;
import org.ictclas4j.utility.Utility;


public class GraphGenerate {

  /**
   * ȫ�з�,�����з�ͼ.���ҳ����п��ܵĴ���
   *
   * @param atoms
   * @return
   */
  public static SegGraph generate(ArrayList<Atom> atoms,Dictionary dict) {
    SegGraph segGraph = null;
    SegNode sn = null;
    Atom atom = null;

    if (atoms != null && atoms.size() > 0 && dict != null) {
      segGraph = new SegGraph();
      for (int i = 0; i < atoms.size(); i++) {
        atom = atoms.get(i);
        String word = atom.getWord();
        if (atom.getPos() == Utility.CT_CHINESE)
          sn = new SegNode(i, i + 1, 0,0, atom.getWord());
        else {
          double value = Utility.MAX_FREQUENCE;
          int pos = 0;

          switch (atom.getPos()) {
          case Utility.CT_INDEX:
          case Utility.CT_NUM:
            pos = -POSTag.NUM;// 'm'*256
            word = Utility.UNKNOWN_NUM;
            value = 0;
            break;
          case Utility.CT_DELIMITER:
            pos = POSTag.PUNC;// 'w'*256;
            break;
          case Utility.CT_LETTER:
            pos = -POSTag.NOUN_LETTER;//
            value = 0;
            word = Utility.UNKNOWN_LETTER;
            break;
          case Utility.CT_SINGLE:// 12021-2129-3121
            if (Utility.getCharCount("+-1234567890", atom.getWord()) == atom.getLen()) {
              pos = -POSTag.NUM;// 'm'*256
              word = Utility.UNKNOWN_NUM;
            } else {
              pos = -POSTag.NOUN_LETTER;//
              word = Utility.UNKNOWN_LETTER;
            }
            value = 0;
            break;
          default:
            pos = atom.getPos();// '?'*256;
            break;
          }

          sn = new SegNode(i, i + 1,pos, value , word);
        }

        sn.setSrcWord(atom.getWord());
        segGraph.insert(sn, true);
      }

      String word = null;
      for (int i = 0; i < atoms.size(); i++) {
        int j = i + 1;
        word = atoms.get(i).getWord();
        // ����ǡ��·ݡ�����Ҫ�ָ�
        boolean flag = false;
        if (j < atoms.size()) {
          Atom a2 = atoms.get(j);
          if ("��".equals(word) && "��".equals(a2.getWord())) {
            segGraph.delete(i, j);
            segGraph.delete(i + 1, j + 1);
            word += a2.getWord();
            flag = true;
            j++;
          }
        }

        WordItem wi = null;
        for (; j <= atoms.size(); j++) {
          int totalFreq = 0;
          wi = dict.getMaxMatch(word);
          if (wi != null) {
            // find it
            if (word.equals(wi.getWord())) {
              ArrayList<WordItem> wis = dict.getHandle(word);
              for (WordItem w : wis)
                totalFreq += w.getFreq();

              // 1���ڣ�1999��ĩ
              if (word.length() == 2 && segGraph.getSize() > 0) {
                SegNode g2 = segGraph.getLast();
                if (Utility.isAllNum(g2.getWord()) || Utility.isAllChinese(g2.getWord())
                    && (g2.getWord().indexOf("��") == 0 || g2.getWord().indexOf("��") == 0)) {

                  if ("ĩ���е�ǰ���".indexOf(word.substring(1)) != -1)
                    break;
                }
              }
              // ֻ��һ���Դʣ�������
              SegNode sg = null;
              if (wis.size() == 1)
                sg = new SegNode(i, j,wis.get(0).getHandle(),totalFreq , word);
              else
                sg = new SegNode(i, j, 0,totalFreq , word);
           
              segGraph.insert(sg, true);

            }
            if (flag)
              i++;
            if (j < atoms.size()) {
              String word2 = atoms.get(j).getWord();
              word += word2;
            } else
              break;
          } else
            break;
        }
      }

    }
    return segGraph;
  }

  /**
   * ���ɶ���ͼ��,ÿ���ڵ��ʾ���������������Ϲ�ϵ,��:˵@��ȷ
   *
   * @param sgs
   */
  public static SegGraph biGenerate(SegGraph seg,Dictionary dict,Dictionary biDict) {
    SegGraph segGraph = null;
    final double smoothParam = 0.1;
    double curFreq;

    if (seg != null && dict != null && biDict != null) {
      segGraph = new SegGraph();
      ArrayList<SegNode> sgs = seg.getSnList();

      for (int i = 0; sgs != null && i < sgs.size(); i++) {
        SegNode sg = sgs.get(i);
        if (sg.getPos() >= 0)
          curFreq = sg.getValue();
        else
          curFreq = dict.getFreq(sg.getWord(), 2);

        // �õ�������ֵ�͸���ֵ��ȵ�����Ԫ��
        ArrayList<SegNode> nextSgs = seg.getNextElements(i);
        for (SegNode graph : nextSgs) {
          String twoWords = sg.getWord();
          twoWords += Utility.WORD_SEGMENTER;
          twoWords += graph.getWord();

          // ��������������֮���ƽ��ֵ
          // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
          int twoFreq = biDict.getFreq(twoWords, 3);
          double temp = (double) 1 / Utility.MAX_FREQUENCE;
          double value = smoothParam * (1 + curFreq) / (Utility.MAX_FREQUENCE + 80000);
          value += (1 - smoothParam) * ((1 - temp) * twoFreq / (1 + curFreq) + temp);
          value = -Math.log(value);

          if (value < 0)
            value += sg.getValue();

          SegNode sg2 = new SegNode();
          // �ָ���@ǰ�Ĵ��������е�λ��
          int wordIndex = getWordIndex(sgs, sg);
          sg2.setRow(wordIndex);

          // �ָ���@��Ĵ��������е�λ��
          wordIndex = getWordIndex(sgs, graph);
          sg2.setCol(wordIndex);
          sg2.setWord(twoWords);
          sg2.setPos(sg.getPos());
          sg2.setValue(value);
          segGraph.insert(sg2, false);
        }
      }
    }
    return segGraph;
  }

  private static int getWordIndex(ArrayList<SegNode> sgs, SegNode graph) {
    if (sgs != null && graph != null) {
      for (int i = 0; i < sgs.size(); i++) {
        if (sgs.get(i) == graph)
          return i;
      }
    }

    return -1;
  }

}
// TOP
//
// Related classes of org.ictclas4j.segment.GraphGenerate
//
// TOP
// Copyright © 2018 www.massapi.com. All rights reserved.
// All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.