Package ivory.core.tokenize

Source Code of ivory.core.tokenize.StanfordChineseTokenizer

package ivory.core.tokenize;

import ivory.core.Constants;
import java.io.IOException;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;

public class StanfordChineseTokenizer extends Tokenizer {
  private static final Logger LOG = Logger.getLogger(StanfordChineseTokenizer.class);

  CRFClassifier classifier;
  DocumentReaderAndWriter readerWriter;

  public StanfordChineseTokenizer() {
    super();
  }

  @Override
  public void configure(Configuration conf) {
    FileSystem fs;
    try {
      fs = FileSystem.get(conf);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    configure(conf, fs);
  }

  @Override
  public void configure(Configuration conf, FileSystem fs) {
    Properties props = new Properties();
    props.setProperty("sighanCorporaDict", conf.get(Constants.TokenizerData));
    props.setProperty("serDictionary", conf.get(Constants.TokenizerData) + "/dict-chris6.ser");
    props.setProperty("inputEncoding", "UTF-8");
    props.setProperty("sighanPostProcessing", "true");

    try {
      classifier = new CRFClassifier(props);
      FSDataInputStream in = fs.open(new Path(conf.get(Constants.TokenizerData) + "/pku"));
      FSDataInputStream inDict = fs.open(new Path(conf.get(Constants.TokenizerData) + "/dict-chris6.ser"));
      classifier.loadClassifier(in, props);
      classifier.flags.setConf(conf);
      readerWriter = classifier.makeReaderAndWriter(inDict);
    } catch (Exception e) {
      e.printStackTrace();
      throw new RuntimeException("Tokenizer not configured properly!");
    }
  }

  @Override
  public String[] processContent(String text) {
    String[] tokens = null;
    try {
      text = postNormalize(preNormalize(text).toLowerCase());      // normalization for non-Chinese characters
      tokens = classifier.classifyStringAndReturnAnswers(text, readerWriter);
    } catch (IOException e) {
      LOG.info("Problem in tokenizing Chinese");
      e.printStackTrace();
    }
    if (vocab == null) {
      return tokens;
    } else {
      StringBuilder finalTokenized = new StringBuilder();
      for (String token : tokens) {
        if ( vocab.get(token) <= 0) { continue; }
        finalTokenized.append( token + " " );
      }
      return finalTokenized.toString().trim().split("\\s+");
    }
  }

  @Override
  public String removeBorderStopWords(String tokenizedText) {
    return tokenizedText;
  }

  @Override
  public boolean isStopWord(String token) {
    return false;
  }
}
TOP

Related Classes of ivory.core.tokenize.StanfordChineseTokenizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.