Package joshua.decoder.ff.tm.hiero

Source Code of joshua.decoder.ff.tm.hiero.SamtFormatReader

package joshua.decoder.ff.tm.hiero;

import java.util.Arrays;
import java.util.logging.Logger;

import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.tm.BilingualRule;
import joshua.decoder.ff.tm.GrammarReader;

public class SamtFormatReader extends GrammarReader<BilingualRule> {

  private static final Logger logger = Logger
      .getLogger(SamtFormatReader.class.getName());
 
  private static final String samtNonTerminalMarkup;
 
  private int[] nonTerminalCache;
 
  static {
    fieldDelimiter = "#";
    nonTerminalRegEx = "^@[^\\s]+";
    nonTerminalCleanRegEx = "[\\,0-9\\s]+";
   
    samtNonTerminalMarkup = "@";
   
    description = "Original SAMT format";
  }
 
  public SamtFormatReader(String grammarFile, SymbolTable vocabulary) {
    super(grammarFile, vocabulary);
   
    // TODO: should be limited to maxNTs + 1 if defined in config.
    // position 0 will never be used
    nonTerminalCache = new int[30];
  }

  // Format example:
  // @VZ-HD @APPR-DA+ART-DA minutes#@2 protokoll @1#@PP-MO+VZ-HD#0 1 1 -0 0.5 -0
 
  @Override
  protected BilingualRule parseLine(String line) {
    String[] fields = line.split(fieldDelimiter);
    if (fields.length != 4) {
      logger.severe("Rule line does not have four fields: " + line);
      logger.severe("Skipped.");
      return null;
    }

    int lhs = symbolTable.addNonterminal(adaptNonTerminalMarkup(fields[2]));

    int arity = 0;
   
    // foreign side
    String[] foreignWords = fields[0].split("\\s+");
    int[] french = new int[foreignWords.length];
    for (int i = 0; i < foreignWords.length; i++) {
      if (isNonTerminal(foreignWords[i])) {
        arity++;
        french[i] = symbolTable.addNonterminal(adaptNonTerminalMarkup(foreignWords[i], arity));
        nonTerminalCache[arity] = french[i];
      } else {
        french[i] = symbolTable.addTerminal(foreignWords[i]);
      }
    }

    // HACK: avoid source-side loop rules
    // TODO: global lookup for goal symbol id would really help here
    if ((french.length == 1) && (arity == 1
        && !adaptNonTerminalMarkup(fields[2]).equals(JoshuaConfiguration.goal_symbol))
    {
      return null;
    }
   
    // english side
    String[] englishWords = fields[1].split("\\s+");
    int[] english = new int[englishWords.length];
    for (int i = 0; i < englishWords.length; i++) {
      if (isNonTerminal(englishWords[i])) {
        english[i] = nonTerminalCache[Integer.
            parseInt(cleanSamtNonTerminal(englishWords[i]))];
      } else {
        english[i] = symbolTable.addTerminal(englishWords[i]);
      }
    }

    // feature scores
    String[] scores = fields[3].split("\\s+");
    float[] feature_scores = new float[scores.length];
   
    int i = 0;
    for (String score : scores) {
      feature_scores[i++] = Float.parseFloat(score);
    }

    return new BilingualRule(lhs, french, english, feature_scores, arity);
  }

  protected String cleanSamtNonTerminal(String word) {
    // changes SAMT markup to Hiero-style
    return word.replaceAll(samtNonTerminalMarkup, "");
  }
 
  protected String adaptNonTerminalMarkup(String word) {
    // changes SAMT markup to Hiero-style
    return "[" + word.replaceAll(",", "_COMMA_")
      .replaceAll("\\$", "_DOLLAR_")
      .replaceAll(samtNonTerminalMarkup, "") + "]";
  }
 
  protected String adaptNonTerminalMarkup(String word, int ntIndex) {
    // changes SAMT markup to Hiero-style
    return "[" + word.replaceAll(",", "_COMMA_")
    .replaceAll("\\$", "_DOLLAR_")
    .replaceAll(samtNonTerminalMarkup, "") + "," + ntIndex + "]";
  }
 
  @Override
  public String toTokenIds(BilingualRule rule) {
    StringBuffer sb = new StringBuffer();
    sb.append(rule.getLHS());
    sb.append(" ||| ");
    sb.append(Arrays.toString(rule.getFrench()));
    sb.append(" ||| ");
    sb.append(Arrays.toString(rule.getEnglish()));
    sb.append(" |||");

    float[] feature_scores = rule.getFeatureScores();
    for (int i = 0; i < feature_scores.length; i++) {
      sb.append(String.format(" %.4f", feature_scores[i]));
    }
    return sb.toString();
  }

  @Override
  public String toTokenIdsWithoutFeatureScores(BilingualRule rule) {
    StringBuffer sb = new StringBuffer();
    sb.append(rule.getLHS());
    sb.append(" ||| ");
    sb.append(Arrays.toString(rule.getFrench()));
    sb.append(" ||| ");
    sb.append(Arrays.toString(rule.getEnglish()));
    return sb.toString();
  }

  @Override
  public String toWords(BilingualRule rule) {
    StringBuffer sb = new StringBuffer();
    sb.append(symbolTable.getWord(rule.getLHS()));
    sb.append(" ||| ");
    sb.append(symbolTable.getWords(rule.getFrench()));
    sb.append(" ||| ");
    sb.append(symbolTable.getWords(rule.getEnglish()));
    sb.append(" |||");

    float[] feature_scores = rule.getFeatureScores();
    for (int i = 0; i < feature_scores.length; i++) {
      sb.append(String.format(" %.4f", feature_scores[i]));
    }
    return sb.toString();
  }

  @Override
  public String toWordsWithoutFeatureScores(BilingualRule rule) {
    StringBuffer sb = new StringBuffer();
    sb.append(symbolTable.getWord(rule.getLHS()));
    sb.append(" ||| ");
    sb.append(symbolTable.getWords(rule.getFrench()));
    sb.append(" ||| ");
    sb.append(symbolTable.getWords(rule.getEnglish()));
    sb.append(" |||");
 
    return sb.toString();
  }
}
TOP

Related Classes of joshua.decoder.ff.tm.hiero.SamtFormatReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.