Package org.fnlp.nlp.corpus

Source Code of org.fnlp.nlp.corpus.StopWords

/**
*  This file is part of FNLP (formerly FudanNLP).
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/

package org.fnlp.nlp.corpus;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilenameFilter;
import java.io.InputStreamReader;
import java.lang.String;
import java.util.*;
import java.util.regex.Pattern;

/**
* 本类主要功能是过滤停用词
* @author ltian
*
*/

public class StopWords {

  TreeSet<String> sWord = new TreeSet<String>();
  String dicPath;
  HashMap<String, Long> lastModTime = new HashMap<String, Long>();

  public StopWords(){ 
  }
  public StopWords(String dicPath1,boolean b){
    this.dicPath = dicPath1;
    // 定期监视文件改动
    Timer timer = new Timer(true);
    timer.schedule(new TimerTask() {
      @Override
      public void run() {
        read(dicPath);
      }


    }, new Date(System.currentTimeMillis() + 10000), 24*60*60*1000);
  }
  /**
   * 构造函数
   * @param dicPath
   *        stopword所在地址
   */

  public StopWords(String dicPath) {   
    this.dicPath = dicPath;
    read(dicPath);   
  }

  /**
   * 读取stopword
   * @param dicPath
   *       stopword所在地址
   * @throws FileNotFoundException
   */

  public void read(String dicPath) {

    File path = new File(dicPath);
    if(path.isDirectory()){
      String[] subdir = path.list(new FilenameFilter() {

        @Override
        public boolean accept(File dir, String name) {
          if(name.toLowerCase().endsWith("txt"))
            return true;
          else
            return false;
        }
      });
      for(int i=0;i<subdir.length;i++){
        read(path+"/"+subdir[i]);
      }
      return;
    }
    Long newTime = path.lastModified();
    Long lastTime = lastModTime.get(dicPath);
    if(lastTime ==null || !lastTime.equals(newTime)){
      //路径是文件
      try{
        InputStreamReader read = new InputStreamReader(new FileInputStream(path), "UTF-8");
        BufferedReader in = new BufferedReader(read);
        String s;
        while ((s = in.readLine()) != null){
          s = s.trim();
          if (!s.matches("^$"))
            sWord.add(s);
        }
        in.close();
      }catch (Exception e) {
        System.err.println("停用词文件路径错误");
      }
    }
  }

  /**
   * 删除stopword
   * 将string字符串转换为List类型,并返回
   * @param words
   *       要进行处理的字符串
   * @return
   *       删除stopword后的List类型
   */

  public List<String> phraseDel(String[] words){
    List<String> list = new ArrayList<String>();
    String s;
    int length= words.length;
    for(int i = 0; i < length; i++){
      s = words[i];
      if(!isStopWord(s))
        list.add(s);
    }
    return list;
  }

  Pattern noise = Pattern.compile(".*["+CharSets.allRegexPunc+"\\d]+.*");
 
  /**
   * 判断是否为停用词
   * @param word
   * @param minLen 最小长度
   * @param maxLen 最大长度
   * @return
   */
  public boolean isStopWord(String word,int minLen, int maxLen) {
    if (word.length() < minLen || word.length()>maxLen)
      return true;

    if (noise.matcher(word).matches())
      return true;

    if (sWord.contains(word))
      return true;

    return false;
  }
 
  /**
   * 判断是否为停用词
   * @param word
   * @return
   */
  public boolean isStopWord(String word) {   
   
    if (sWord.contains(word.toLowerCase()))
      return true;

    return false;
  }
}
TOP

Related Classes of org.fnlp.nlp.corpus.StopWords

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.