Examples of org.fnlp.nlp.cn.tag.CWSTagger

org.fnlp.nlp.cn.tag.CWSTagger
中文分词器 @author xpqiu @version 1.0 @since FudanNLP 1.0

   */
  public static void loadSeg(String path) throws LoadModelException {
    if(seg==null)
    {
      String file = path+segModel;
      seg = new CWSTagger(file);
      seg.setEnFilter(isEnFilter);
    } 
  }

View Full Code Here

  /**
   * Teardown hook with an intentionally empty body: nothing is released or
   * reset here.
   *
   * NOTE(review): presumably the JUnit class-level teardown; the annotation
   * is not visible in this excerpt, so confirm against the full file.
   *
   * @throws Exception declared by the signature; the empty body never throws
   */
  public static void tearDownAfterClass() throws Exception {
  }


  @Before
  public void setUp() throws Exception {
    CWSTagger tag = new CWSTagger("../models/seg.m");  


    rlseg = new RLSeg(tag,"../tmp/FNLPDATA/all.dict");
  }

View Full Code Here

  
  public static void main(String[] args) throws Exception {
    
    
    StopWords sw= new StopWords("../models/stopwords");
    CWSTagger seg = new CWSTagger("../models/seg.m");
    AbstractExtractor key = new WordExtract(seg,sw);
    
    System.out.println(key.extract("甬温线特别重大铁路交通事故车辆经过近24小时的清理工作，26日深夜已经全部移出事故现场，之前埋下的D301次动车车头被挖出运走", 20, true));
    
    //处理已经分好词的句子

View Full Code Here

   * @throws  
   */
  public static void main(String[] args) throws Exception {


    
    CWSTagger cws = new CWSTagger("../models/seg.m");
    tag = new POSTagger(cws,"../models/pos.m");
    
    System.out.println("得到支持的词性标签集合");
    System.out.println(tag.getSupportedTags());
    System.out.println(tag.getSupportedTags().size());
    System.out.println("\n");
    
    String str = "媒体计算研究所成立了，高级数据挖掘很难。乐phone很好！";
    String s = tag.tag(str);
    System.out.println("处理未分词的句子");
    System.out.println(s);
    
    System.out.println("使用英文标签");
    tag.SetTagType("en");    
    System.out.println(tag.getSupportedTags());
    System.out.println(tag.getSupportedTags().size());
    s = tag.tag(str);
    System.out.println(s);    
    System.out.println();
    
    CWSTagger cws2 = new CWSTagger("../models/seg.m", new Dictionary("../models/dict.txt"));
    
    //bool值指定该dict是否用于cws分词（分词和词性可以使用不同的词典）
    tag = new POSTagger(cws2, "../models/pos.m"
        , new Dictionary("../models/dict.txt"), true);//true就替换了之前的dict.txt
    tag.removeDictionary(false);//不移除分词的词典
    tag.setDictionary(new Dictionary("../models/dict.txt"), false);//设置POS词典，分词使用原来设置
    
    String str2 = "媒体计算研究所成立了，高级数据挖掘很难。乐phone很好！";
    String s2 = tag.tag(str2);
    System.out.println("处理未分词的句子，使用词典");
    System.out.println(s2);
    System.out.println();
    
    Dictionary dict = new Dictionary();
    dict.add("媒体计算","mypos1","mypos2");
    dict.add("乐phone","专有名");
    tag.setDictionary(dict, true);
    String s22 = tag.tag(str2);
    System.out.println(s22);
    System.out.println();
    
    POSTagger tag1 = new POSTagger("../models/pos.m");
    String str1 = "媒体计算 研究所 成立 了 , 高级 数据挖掘 很 难";
    String[] w = str1.split(" ");
    String[] s1 = tag1.tagSeged(w);
    System.out.println("直接处理分好词的句子:++++++++++");
    for(int i=0;i<s1.length;i++){
      System.out.print(w[i]+"/"+s1[i]+" ");
    }
    System.out.println("\n");
    
    POSTagger tag3 = new POSTagger("../models/pos.m", new Dictionary("../models/dict.txt"));
    String str3 = "媒体计算 研究所 成立 了 , 高级 数据挖掘 很 难 ";
    String[] w3 = str3.split(" ");
    String[] s3 = tag3.tagSeged(w3);
    System.out.println("直接处理分好词的句子，使用词典");
    for(int i=0;i<s3.length;i++){
      System.out.print(w3[i]+"/"+s3[i]+" ");
    }
    System.out.println("\n");
    
    //????????????????????????????
    
    System.out.println("重新构造");
    cws = new CWSTagger("../models/seg.m");
    tag = new POSTagger(cws,"../models/pos.m");
    str = "媒体计算研究所成立了, 高级数据挖掘很难";
    System.out.println(tag.tag(str));
    String[][] sa = tag.tag2Array(str);
    for(int i = 0; i < sa.length; i++) {

View Full Code Here

   * @param args 
   * @throws Exception
   * @throws  
   */
  public static void main(String[] args) throws Exception {
    CWSTagger tag = new CWSTagger("../models/seg.m");
    System.out.println("不使用词典的分词：");
    String str = " 媒体计算研究所成立了, 高级数据挖掘(data mining)很难。 乐phone热卖！";
    String s = tag.tag(str);
    System.out.println(s);
    
    //设置英文预处理
    tag.setEnFilter(true);
    s = tag.tag(str);
    System.out.println(s);
//    tag.setEnFilter(false);
    
    System.out.println("\n设置临时词典：");
    ArrayList<String> al = new ArrayList<String>();
    al.add("数据挖掘");
    al.add("媒体计算研究所");
    al.add("乐phone");
    Dictionary dict = new Dictionary(false);
    dict.addSegDict(al);
    tag.setDictionary(dict);
    s = tag.tag(str);
    System.out.println(s);
    
    
    CWSTagger tag2 = new CWSTagger("../models/seg.m", new Dictionary("../models/dict.txt"));
    System.out.println("\n使用词典的分词：");
    String str2 = "媒体计算研究所成立了, 高级数据挖掘很难。 乐phone热卖！";
    String s2 = tag2.tag(str2);
    System.out.println(s2);
    
    //使用不严格的词典
    CWSTagger tag3 = new CWSTagger("../models/seg.m", new Dictionary("../models/dict_ambiguity.txt",true));
    //尽量满足词典，比如词典中有“成立”“成立了”和“了”, 会使用Viterbi决定更合理的输出
    System.out.println("\n使用不严格的词典的分词：");
    String str3 = "媒体计算研究所成立了, 高级数据挖掘很难";
    String s3 = tag3.tag(str3);
    System.out.println(s3);
    str3 = "我送给力学系的同学一个玩具 (送给给力力学力学系都在词典中)";
    s3 = tag3.tag(str3);
    System.out.println(s3);
    
    System.out.println("\n处理文件：");
    String s4 = tag.tagFile("../example-data/data-tag.txt");
    System.out.println(s4);

View Full Code Here

  /**
   * Path of the Bayes model file: {@code dataPath} directly followed by
   * {@code "modelBayes.gz"} (no separator is added, so {@code dataPath} is
   * presumably expected to end with one; verify where it is declared).
   */
  private static String bayesModelFile = dataPath+"modelBayes.gz";


  public static void main(String[] args) throws Exception {
    //分词
    Pipe removepp=new RemoveWords();
    CWSTagger tag = new CWSTagger("../models/seg.m");
    Pipe segpp=new CNPipe(tag);
    Pipe s2spp=new Strings2StringArray();
    /**
     * Bayes
     */

View Full Code Here

  /**
   * Path of the KNN model file: {@code dataPath} directly followed by
   * {@code "modelKnn.gz"} (no separator is added, so {@code dataPath} is
   * presumably expected to end with one; verify where it is declared).
   */
  private static String knnModelFile = dataPath+"modelKnn.gz";


  public static void main(String[] args) throws Exception {
    //分词
    Pipe removepp=new RemoveWords();
    CWSTagger tag = new CWSTagger("../models/seg.m");
    Pipe segpp=new CNPipe(tag);
    Pipe s2spp=new Strings2StringArray();
    
    //建立字典管理器
    AlphabetFactory af = AlphabetFactory.buildFactory();

View Full Code Here

    precision = 1.0;
    dN = 0.85;
  }
  
  public WordExtract(String segPath, String dicPath) throws Exception{
    tag = new CWSTagger(segPath);
    test = new StopWords(dicPath);
  }

View Full Code Here

  /**
   * @param args
   * @throws Exception 
   */
  public static void main(String[] args) throws Exception {
    CWSTagger seg = new CWSTagger("./models/seg.m");  
    POSTagger pos = new POSTagger(seg, "./models/pos.m");


    RLSeg rlseg = new RLSeg(seg,"./tmpdata/FNLPDATA/all.dict");
//    tag.setDictionary(rlseg.tempdict);
    String file = "./tmpdata/20120927-微博分词-5000-test-utf-8.txt";
    BufferedReader bfr = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf8"));
    BufferedWriter bout = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("./tmp/complex.txt"), "UTF-8"));
    BufferedWriter bcqa = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("./tmp/seged.txt"), "UTF-8"));
    String line = null;  
    int i=0;
    while ((line = bfr.readLine()) != null) {
      System.out.println(i++);


      if(line.length()==0)
        continue;
      String[] toks = seg.tag2Array(line);
      
      for(int j=0;j<toks.length;j++){
        bcqa.write(toks[j]);
        if(j<toks.length-1)
        bcqa.write(" ");

View Full Code Here

   * @throws Exception 
   */
  public static void main(String[] args) throws Exception {
    
    WordCount wc = new WordCount();
     wc.seg = new CWSTagger("./models/seg.m");
    
    wc.count("./tmp/filterTweets.y");
    wc.count("./tmp/filterTweets.n");    
    wc.write("./tmp/wc.txt", true);
    wc.filter(500);

View Full Code Here

0 1

TOP

Related Classes of org.fnlp.nlp.cn.tag.CWSTagger

org.apache.commons.cli.BasicParser

org.apache.commons.cli.CommandLine

org.apache.commons.cli.HelpFormatter

org.apache.commons.cli.Options

org.fnlp.app.keyword.WordExtract

org.fnlp.demo.nlp.ChineseWordSegmentation

org.fnlp.demo.nlp.KeyWordExtraction

org.fnlp.demo.nlp.PartsOfSpeechTag

org.fnlp.demo.nlp.tc.TextClassificationBasedOnBayes

org.fnlp.demo.nlp.tc.TextClassificationBasedOnKNN

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.