Package ivory.core.tokenize

Source Code of ivory.core.tokenize.StanfordChineseTokenizer

package ivory.core.tokenize;

import ivory.core.Constants;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;

public class StanfordChineseTokenizer extends Tokenizer {
  private static final Logger sLogger = Logger.getLogger(StanfordChineseTokenizer.class);
  static{
    sLogger.setLevel(Level.INFO);
  }
  @SuppressWarnings("unchecked")
  CRFClassifier classifier;
  @SuppressWarnings("unchecked")
  DocumentReaderAndWriter readerWriter;

  public StanfordChineseTokenizer(){
    super();
  }

  @Override
  public void configure(Configuration conf) {
    FileSystem fs;
    try {
      fs = FileSystem.get(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }  
    configure(conf, fs);
  }

  @SuppressWarnings("unchecked")
  @Override
  public void configure(Configuration conf, FileSystem fs) {
    Properties props = new Properties();
    props.setProperty("sighanCorporaDict", conf.get(Constants.TokenizerData));    //data
    props.setProperty("serDictionary",conf.get(Constants.TokenizerData)+"/dict-chris6.ser");//"data/dict-chris6.ser.gz");
    props.setProperty("inputEncoding", "UTF-8");
    props.setProperty("sighanPostProcessing", "true");

    try {
      classifier = new CRFClassifier(props);
      FSDataInputStream in = fs.open(new Path(conf.get(Constants.TokenizerData)+"/pku"));
      FSDataInputStream inDict = fs.open(new Path(conf.get(Constants.TokenizerData)+"/dict-chris6.ser"));
      classifier.loadClassifier(in, props);      //data/pku.gz
      classifier.flags.setConf(conf);
      readerWriter = classifier.makeReaderAndWriter(inDict);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  @Override
  public String[] processContent(String text) {
    String[] tokens = null;
    try {
      text = text.toLowerCase();      // for non-Chinese characters
      tokens = classifier.classifyStringAndReturnAnswers(text, readerWriter);
    } catch (IOException e) {
      sLogger.info("Problem in tokenizing Chinese");
      e.printStackTrace();
    }
    if (vocab == null) {
      return tokens;
    } else {
      String tokenized = "";
      for (String token : tokens) {
        if ( vocab.get(token) <= 0) { continue; }
        tokenized += ( token + " " );
      }
      return tokenized.trim().split("\\s+");
    }
  }

  @Override
  public int getNumberTokens(String text){
    return processContent(text).length;
  }

  @Override
  public String removeBorderStopWords(String tokenizedText) {
    return tokenizedText;
  }
}
TOP

Related Classes of ivory.core.tokenize.StanfordChineseTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.