Source Code of edu.stanford.nlp.wordseg.MaxMatchSegmenter

package edu.stanford.nlp.wordseg;

import edu.stanford.nlp.fsm.DFSA;
import edu.stanford.nlp.fsm.DFSAState;
import edu.stanford.nlp.fsm.DFSATransition;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;

import java.util.*;
import java.io.*;
import java.util.regex.Pattern;

/**
* Lexicon-based segmenter. Uses dynamic programming to find a word
* segmentation that satisfies the following two preferences:
* (1) minimize the number of out-of-vocabulary (OOV) words;
* (2) if there are multiple segmentations with the same number
* of OOV words, then select the one that minimizes the number
* of segments. Note that {@link edu.stanford.nlp.parser.lexparser.MaxMatchSegmenter}
* contains a greedy version of this algorithm.
*
* <p>Note that the output segmentation may need postprocessing to correct the
* segmentation of non-Chinese characters (e.g., punctuation, foreign names).
*
* @author Michel Galley
*/
public class MaxMatchSegmenter implements WordSegmenter {

  private static final boolean DEBUG = false;

  private Set<String> words = Generics.newHashSet();
  private int len = -1;
  private int edgesNb = 0;
  private static final int maxLength = 10;
  private List<DFSAState<Word, Integer>> states;
  private DFSA<Word, Integer> lattice = null;
  public enum MatchHeuristic { MINWORDS, MAXWORDS, MAXLEN }

  private static final Pattern chineseStartChars = Pattern.compile("^[\u4E00-\u9FFF]");
  private static final Pattern chineseEndChars = Pattern.compile("[\u4E00-\u9FFF]$");
  private static final Pattern chineseChars = Pattern.compile("[\u4E00-\u9FFF]");

  private static final Pattern excludeChars = Pattern.compile("[0-9\uff10-\uff19" +
        "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4E5D\u5341" +
        "\u96F6\u3007\u767E\u5343\u4E07\u4ebf\u5169\u25cb\u25ef\u3021-\u3029\u3038-\u303A" +
        "-#$%&'*+/@_\uff0d\uff03\uff04\uff05\uff06\uff07\uff0a\uff0b\uff0f\uff20\uff3f]");

  @Override
  public void initializeTraining(double numTrees) {}

  @Override
  public void train(Collection<Tree> trees) {
    for (Tree tree : trees) {
      train(tree);
    }
  }

  @Override
  public void train(Tree tree) {
    train(tree.taggedYield());
  }

  @Override
  public void train(List<TaggedWord> sentence) {
    for (TaggedWord word : sentence) {
      if (word.word().length() <= maxLength) {
        addStringToLexicon(word.word());
      }
    }
  }
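
  // Note (added comment): "training" here simply collects word types of up to
  // maxLength characters from the gold segmentation; no counts or weights are
  // kept, since the segmenter is purely lexicon-driven.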

  @Override
  public void finishTraining() {}

  public void loadSegmenter(String filename) {
    addLexicon(filename);
  }

  public List<HasWord> segment(String s) {
    buildSegmentationLattice(s);
    ArrayList<Word> sent = maxMatchSegmentation();
    printlnErr("raw output: "+Sentence.listToString(sent));
    ArrayList<Word> postProcessedSent = postProcessSentence(sent);
    printlnErr("processed output: "+Sentence.listToString(postProcessedSent));
    String postSentString = ChineseStringUtils.postProcessingAnswerCTB(Sentence.listToString(postProcessedSent), false, false);
    printlnErr("Sighan2005 output: "+postSentString);
    String[] postSentArray = postSentString.split("\\s+");
    ArrayList<Word> postSent = new ArrayList<Word>();
    for(String w : postSentArray) {
      postSent.add(new Word(w));
    }
    return new ArrayList<HasWord>(postSent);
  }
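
  // Illustrative sketch (added; not part of the original class): minimal
  // end-to-end use of segment(). The lexicon path below is a placeholder.
  @SuppressWarnings("unused")
  private static void exampleUsage() {
    MaxMatchSegmenter seg = new MaxMatchSegmenter();
    seg.loadSegmenter("lexicon.txt");  // hypothetical one-word-per-line UTF-8 file
    List<HasWord> segmented = seg.segment("我喜欢北京");
    System.out.println(Sentence.listToString(segmented));
  }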

  /**
   * Add a word to the lexicon, unless it contains some non-Chinese character.
   */
  public void addStringToLexicon(String str) {
    if(str.equals("")) {
      System.err.println("WARNING: blank line in lexicon");
    } else if(str.contains(" ")) {
      System.err.println("WARNING: word with space in lexicon");
    } else {
      if(excludeChar(str)) {
        printlnErr("skipping word: "+str);
        return;
      }
      // printlnErr("adding word: "+str);
      words.add(str);
    }
  }

  /**
   * Read lexicon from a one-column text file.
   */
  public void addLexicon(String filename) {
    try {
      BufferedReader lexiconReader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
      String lexiconLine;
      while ((lexiconLine = lexiconReader.readLine()) != null) {
        addStringToLexicon(lexiconLine);
      }
    } catch (FileNotFoundException e) {
      System.err.println("Lexicon not found: "+ filename);
      System.exit(-1);
    } catch (IOException e) {
      System.err.println("IO error while reading: "+ filename);
      throw new RuntimeException(e);
    }
  }
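
  // Format note (added comment): the lexicon file is read as UTF-8, one word
  // per line, e.g.
  //   中国
  //   北京大学
  // Blank lines, lines containing spaces, and words matching excludeChars
  // (digits, Chinese and Suzhou numerals, and assorted ASCII/full-width
  // symbols) are all skipped by addStringToLexicon above.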

  /**
   * Builds a lattice of all possible segmentations using only words
   * present in the lexicon. This function must be run prior to
   * running maxMatchSegmentation.
   */
  public void buildSegmentationLattice(String s) {
    edgesNb = 0;
    len = s.length();
    // Initialize word lattice:
    states = new ArrayList<DFSAState<Word, Integer>>();
    lattice = new DFSA<Word, Integer>("wordLattice");
    for (int i=0; i<=s.length(); ++i)
      states.add(new DFSAState<Word, Integer>(i, lattice));
    // Set start and accepting state:
    lattice.setInitialState(states.get(0));
    states.get(len).setAccepting(true);
    // Find all instances of lexicon words in input string:
    for (int start=0; start<len; ++start) {
      for (int end=len; end>start; --end) {
        String str = s.substring(start, end);
        assert(str.length() > 0);
        boolean isOneChar = (start+1 == end);
        boolean isInDict = words.contains(str);
        if (isInDict || isOneChar) {
          double cost = isInDict ? 1 : 100;
          DFSATransition<Word, Integer> trans =
            new DFSATransition<Word, Integer>(null, states.get(start), states.get(end), new Word(str), null, cost);
          //System.err.println("start="+start+" end="+end+" word="+str);
          states.get(start).addTransition(trans);
          ++edgesNb;
        }
      }
    }
  }
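
  // Debugging sketch (added; not in the original class): dump every edge of
  // the current lattice, using only the DFSA accessors that segmentWords
  // below already relies on.
  @SuppressWarnings("unused")
  private void dumpLattice() {
    if (lattice == null) return;
    for (int start = 0; start < len; ++start) {
      for (DFSATransition<Word, Integer> tr : states.get(start).transitions()) {
        printlnErr("edge " + start + "->" + tr.getTarget().stateID()
            + " word=" + tr.getInput().word() + " cost=" + tr.score());
      }
    }
  }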

  /**
   * Returns the lexicon-based segmentation that minimizes the number of words.
   * @return Segmented sentence.
   */
  public ArrayList<Word> maxMatchSegmentation() {
    return segmentWords(MatchHeuristic.MINWORDS);
  }

  /**
   * Returns the lexicon-based segmentation following heuristic h.
   * Note that buildSegmentationLattice must be run first.
   * Two heuristics are currently available -- MINWORDS and MAXWORDS --
   * which respectively minimize and maximize the number of segments
   * (where each segment is a lexicon word, if possible).
   *
   * @param h Heuristic to use for segmentation.
   * @return Segmented sentence.
   * @throws UnsupportedOperationException
   * @see #buildSegmentationLattice
   */
  public ArrayList<Word> segmentWords(MatchHeuristic h) throws UnsupportedOperationException {
    if (lattice == null || len < 0)
      throw new UnsupportedOperationException("buildSegmentationLattice must be run first");
    List<Word> segmentedWords = new ArrayList<Word>();
    // Init dynamic programming:
    double[] costs = new double[len+1];
    List<DFSATransition<Word, Integer>> bptrs = new ArrayList<DFSATransition<Word, Integer>>();
    for (int i = 0; i < len + 1; ++i) {
      bptrs.add(null);
    }
    costs[0]=0.0;
    for (int i=1; i<=len; ++i)
      costs[i] = Double.MAX_VALUE;
    // DP:
    for (int start=0; start<len; ++start) {
      DFSAState<Word, Integer> fromState = states.get(start);
      Collection<DFSATransition<Word, Integer>> trs = fromState.transitions();
      for (DFSATransition<Word, Integer> tr : trs) {
        DFSAState<Word, Integer> toState = tr.getTarget();
        double lcost = tr.score();
        int end = toState.stateID();
        //System.err.println("start="+start+" end="+end+" word="+tr.getInput());
        if (h == MatchHeuristic.MINWORDS) {
          // Minimize the number of words: standard min-cost relaxation (the
          // comparison uses the same edge cost as the update).
          if (costs[start]+lcost < costs[end]) {
            costs[end] = costs[start]+lcost;
            bptrs.set(end, tr);
            //System.err.println("start="+start+" end="+end+" word="+tr.getInput());
          }
        } else if (h == MatchHeuristic.MAXWORDS) {
          // Maximize the number of words: negate edge costs so that the
          // min-cost path uses as many edges as possible.
          if (costs[start]-lcost < costs[end]) {
            costs[end] = costs[start]-lcost;
            bptrs.set(end, tr);
          }
        } else {
          throw new UnsupportedOperationException("unimplemented heuristic");
        }
      }
    }
    // Extract min-cost path:
    int i=len;
    while (i>0) {
      DFSATransition<Word, Integer> tr = bptrs.get(i);
      DFSAState<Word, Integer> fromState = tr.getSource();
      Word word = tr.getInput();
      if (!word.word().equals(" "))
        segmentedWords.add(0, word);
      i = fromState.stateID();
    }
    if(DEBUG) {
      // Print lattice density (always >= 1): a density of exactly 1 means
      // there is only one segmentation using words of the lexicon.
      double density = edgesNb*1.0/segmentedWords.size();
      System.err.println("latticeDensity: "+density+" cost: "+costs[len]);
    }
    return new ArrayList<Word>(segmentedWords);
  }
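
  // Worked illustration (added comment): with lexicon {AB, BC, ABC} and input
  // "ABC", the in-dictionary edges are 0->2 ("AB"), 1->3 ("BC") and 0->3
  // ("ABC") at cost 1 each, plus single-character fallback edges at cost 100.
  // Under MINWORDS the relaxation keeps costs[3] = 1 via the edge 0->3, so the
  // backpointer walk recovers the single word "ABC" rather than "AB"+"C".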

  /**
   * Returns a lexicon-based segmentation. At each position x in the input string,
   * it attempts to find the largest y such that [x,y] is a word of the lexicon.
   * It then tries to match more input starting at position y+1. This greedy
   * algorithm (taken from edu.stanford.nlp.parser.lexparser.MaxMatchSegmenter)
   * offers no optimality guarantee, so it is generally wiser to use
   * segmentWords instead.
   *
   * @param s Input (unsegmented) string.
   * @return Segmented sentence.
   */
  public ArrayList<Word> greedilySegmentWords(String s) {
    List<Word> segmentedWords = new ArrayList<Word>();
    int length = s.length();
    int start = 0;
    while (start < length) {
      int end = Math.min(length, start + maxLength);
      while (end > start + 1) {
        String nextWord = s.substring(start, end);
        if (words.contains(nextWord)) {
          segmentedWords.add(new Word(nextWord));
          break;
        }
        end--;
      }
      if (end == start + 1) {
        // character does not start any word in our dictionary
        segmentedWords.add(new Word(String.valueOf(s.charAt(start))));
        start++;
      } else {
        start = end;
      }
    }
    return new ArrayList<Word>(segmentedWords);
  }
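
  // Illustration (added comment): greedy maximum match can differ from the DP
  // above. With lexicon {A, AB, BC} and input "ABC", greedy commits to "AB"
  // and is left with the out-of-vocabulary character "C", whereas MINWORDS
  // selects "A"+"BC", covering the input with two in-vocabulary words.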

  public static void main(String[] args) {
    Properties props = StringUtils.argsToProperties(args);
    // System.err.println(props.toString());
    SeqClassifierFlags flags = new SeqClassifierFlags(props);
    MaxMatchSegmenter seg = new MaxMatchSegmenter();
    String lexiconFile = props.getProperty("lexicon");
    if(lexiconFile != null) {
      seg.addLexicon(lexiconFile);
    } else {
      System.err.println("Error: no lexicon file!");
      System.exit(1);
    }

    Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();
    sighanRW.init(flags);

    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    PrintWriter stdoutW = new PrintWriter(System.out);
    int lineNb = 0;
    for ( ; ; ) {
      ++lineNb;
      System.err.println("line: "+lineNb);
      try {
        String line = br.readLine();
        if(line == null)
          break;
        String outputLine = null;
        if(props.getProperty("greedy") != null) {
          ArrayList<Word> sentence = seg.greedilySegmentWords(line);
          outputLine = Sentence.listToString(sentence);
        } else if(props.getProperty("maxwords") != null) {
          seg.buildSegmentationLattice(line);
          outputLine = Sentence.listToString(seg.segmentWords(MatchHeuristic.MAXWORDS));
        } else {
          seg.buildSegmentationLattice(line);
          outputLine = Sentence.listToString(seg.maxMatchSegmentation());
        }
        StringReader strR = new StringReader(outputLine);
        Iterator<List<CoreLabel>> itr = sighanRW.getIterator(strR);
        while(itr.hasNext()) {
          sighanRW.printAnswers(itr.next(), stdoutW);
        }
        // System.out.println(outputLine);
      }
      catch (IOException e) {
        break;
      }
    }
    stdoutW.flush();
  }
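
  // Usage note (added comment): a typical invocation reads unsegmented text
  // from stdin, e.g.
  //   java edu.stanford.nlp.wordseg.MaxMatchSegmenter -lexicon words.txt < input.txt
  // The optional -greedy and -maxwords properties select the alternative
  // heuristics handled in main above.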

  private static void printlnErr(String s) {
    EncodingPrintWriter.err.println(s, "UTF-8");
  }

  private static ArrayList<Word> postProcessSentence(ArrayList<Word> sent) {
    ArrayList<Word> newSent = new ArrayList<Word>();
    for(Word word : sent) {
      if(newSent.size() > 0) {
        String prevWord = newSent.get(newSent.size()-1).toString();
        String curWord = word.toString();
        String prevChar = prevWord.substring(prevWord.length()-1);
        String curChar = curWord.substring(0,1);
        if(!isChinese(prevChar) && !isChinese(curChar)) {
          Word mergedWord = new Word(prevWord+curWord);
          newSent.set(newSent.size()-1, mergedWord);
          //printlnErr("merged: "+mergedWord);
          //printlnErr("merged: "+mergedWord+" from: "+prevWord+" and: "+curWord);
          continue;
        }
      }
      newSent.add(word);
    }
    return new ArrayList<Word>(newSent);
  }
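
  // Illustration (added comment): postProcessSentence merges adjacent tokens
  // whenever the characters on both sides of the boundary are non-Chinese, so
  // an ASCII token split by the lattice, e.g. "IB" + "M", is rejoined as
  // "IBM", while a boundary such as "中国" + "IBM" is left untouched.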

  // These helpers use find() (substring match) rather than matches(): the
  // ^ and $ anchors in chineseStartChars/chineseEndChars presuppose partial
  // matching, and excludeChar should fire if the word contains any excluded
  // character.
  private static boolean startsWithChinese(String str) { return chineseStartChars.matcher(str).find(); }
  private static boolean endsWithChinese(String str) { return chineseEndChars.matcher(str).find(); }
  private static boolean isChinese(String str) { return chineseChars.matcher(str).find(); }
  private static boolean excludeChar(String str) { return excludeChars.matcher(str).find(); }

  private static final long serialVersionUID = 8263734344886904724L;
}