Package joshua.prefix_tree

Source Code of joshua.prefix_tree.HierarchicalRuleExtractor

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.prefix_tree;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.corpus.Corpus;
import joshua.corpus.LabeledSpan;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.RuleExtractor;
import joshua.corpus.Span;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.suffix_array.HierarchicalPhrase;
import joshua.corpus.suffix_array.Pattern;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.tm.BasicRuleCollection;
import joshua.decoder.ff.tm.BilingualRule;
import joshua.decoder.ff.tm.MonolingualRule;
import joshua.decoder.ff.tm.Rule;
import joshua.util.Cache;

/**
* Rule extractor for Hiero-style hierarchical phrase-based
* translation.
*
* @author Lane Schwartz
* @version $LastChangedDate: 2010-01-28 12:28:43 -0600 (Thu, 28 Jan 2010) $
*/
public class HierarchicalRuleExtractor implements RuleExtractor {

  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(HierarchicalRuleExtractor.class.getName());

  /** Lexical translation probabilities. */
  protected final LexicalProbabilities lexProbs;
 
  /**
   * Max span in the source corpus of any extracted hierarchical
   * phrase
   */
  protected final int maxPhraseSpan;
 
 
  /**
   * Maximum number of terminals plus nonterminals allowed
   * in any extracted hierarchical phrase.
   */
  protected final int maxPhraseLength;
 
  /**
   * Minimum span in the source corpus of any nonterminal in
   * an extracted hierarchical phrase.
   */
  protected final int minNonterminalSpan;
 
  /**
   * Maximum span in the source corpus of any nonterminal in
   * an extracted hierarchical phrase.
   */
  protected final int maxNonterminalSpan;
 
  /** Suffix array representing the source language corpus. */
  protected final Suffixes sourceSuffixArray;
 
  /** Suffix array representing the target language corpus. */
  protected final Suffixes targetSuffixArray;
 
  /** Corpus array representing the target language corpus. */
  protected final Corpus targetCorpus;
 
  /**
   * Represents alignments between words in the source corpus
   * and the target corpus.
   */
  protected final Alignments alignments;
 
  protected final ArrayList<FeatureFunction> models;
 
  /**
   * Specifies the maximum number of rules
   * that will be extracted for any source pattern
   */
  protected final int sampleSize;
 
  /**
   * Integer identifiers for the indexed nonterminals.
   */
  protected final int[] nonterminalIDs;
 
  /**
     * Constructs a rule extractor for
     * Hiero-style hierarchical phrase-based translation.
   *
   * @param sourceSuffixArray        Suffix array representing the
   *                           source language corpus
   * @param targetSuffixArray  Suffix array representing the
   *                           target language corpus
   * @param alignments         Represents alignments between words in the
   *                           source corpus and the target corpus
   * @param lexProbs           Lexical translation probability table
   * @param sampleSize         Specifies the maximum number of rules
   *                           that will be extracted for any source pattern
   * @param maxPhraseSpan      Max span in the source corpus of any
   *                           extracted hierarchical phrase
   * @param maxPhraseLength    Maximum number of terminals plus nonterminals
   *                           allowed in any extracted hierarchical phrase
   * @param minNonterminalSpan Minimum span in the source corpus of any
   *                           nonterminal in an extracted hierarchical
   *                           phrase
   * @param maxNonterminalSpan Maximum span in the source corpus of any
   *                           nonterminal in an extracted hierarchical
   *                           phrase
   */
  public HierarchicalRuleExtractor(
      Suffixes sourceSuffixArray,
      Suffixes targetSuffixArray,
      Alignments alignments,
      LexicalProbabilities lexProbs,
      ArrayList<FeatureFunction> models,
      int sampleSize,
      int maxPhraseSpan,
      int maxPhraseLength,
      int minNonterminalSpan,
      int maxNonterminalSpan) {
    this.lexProbs = lexProbs;
    this.maxPhraseSpan = maxPhraseSpan;
    this.maxPhraseLength = maxPhraseLength;
    this.minNonterminalSpan = minNonterminalSpan;
    this.maxNonterminalSpan = maxNonterminalSpan;
    this.targetSuffixArray = targetSuffixArray;
    this.targetCorpus = targetSuffixArray.getCorpus();
    this.alignments = alignments;
    this.sourceSuffixArray = sourceSuffixArray;
    this.sampleSize = sampleSize;
    this.models = models;
   
    SymbolTable vocab = sourceSuffixArray.getVocabulary();
    this.nonterminalIDs = new int[]{vocab.addNonterminal(SymbolTable.X1_STRING), vocab.addNonterminal(SymbolTable.X2_STRING)};
  }

  /* See Javadoc for RuleExtractor class. */
  public List<Rule> extractRules(MatchedHierarchicalPhrases sourceHierarchicalPhrases) {

    Pattern sourcePattern = sourceHierarchicalPhrases.getPattern();
   
    if (logger.isLoggable(Level.FINE)) logger.fine("Extracting rules for source pattern: " + sourcePattern);
     
    Cache<Pattern,List<Rule>> cache = sourceSuffixArray.getCachedRules();
   
    if (cache.containsKey(sourcePattern)) {
      return cache.get(sourcePattern);
    } else {
     
      ArrayList<HierarchicalPhrase> translations = getTranslations(sourceHierarchicalPhrases);
     
      Map<Pattern,Integer> counts = new HashMap<Pattern,Integer>();
      for (Pattern translation : translations) {
        if (translation != null) {
          Integer count = counts.get(translation);
          if (null == count) {
            count = 1;
          } else {
            count++;
          }
          counts.put(translation, count);
        }
      }

      if (logger.isLoggable(Level.FINER)) { logger.finer(
          translations.size() + " actual translations of " +
          sourcePattern + " being stored.");
      }


      float p_e_given_f_denominator = translations.size();

      // We don't want to produce duplicate rules
      HashSet<HierarchicalPhrase> uniqueTranslations = new HashSet<HierarchicalPhrase>(translations);
     
      List<Rule> results = new ArrayList<Rule>(sourceHierarchicalPhrases.size());
     
      int sourcePatternCount = sourceHierarchicalPhrases.size();
      for (HierarchicalPhrase translation : uniqueTranslations) {
        float[] featureScores =
          calculateFeatureValues(
              sourcePattern,
              sourcePatternCount,
              translation,
              counts, p_e_given_f_denominator);

        Rule rule = new BilingualRule(
            SymbolTable.X,
            sourcePattern.getWordIDs(),
            translation.getWordIDs(),
            featureScores,
            translation.arity(),
            sourceSuffixArray.getVocabulary().addTerminal(JoshuaConfiguration.phrase_owner),
            0.0f,
            MonolingualRule.DUMMY_RULE_ID);

        results.add(rule);
      }
     
      if (models != null) {
        BasicRuleCollection.sortRules(results, models);
      }
     
      cache.put(sourcePattern, results);
     
      return results;
    }
   
  }

  /**
   * Placeholder for the probability of a source pattern given a
   * target pattern. No computation is implemented: both arguments
   * are ignored and a constant is returned.
   *
   * @param sourcePattern Source language pattern (currently unused)
   * @param targetPattern Target language pattern (currently unused)
   * @return always <code>0.0f</code>
   */
  protected float calculateProbSourceGivenTarget(Pattern sourcePattern, Pattern targetPattern) {
   
    // Not implemented; callers always receive zero.
   
    return 0.0f;
  }
 
  /**
   * Calculate feature values for given source-target pair.
   *
   * @param sourcePattern Source language pattern
   * @param sourcePatternCount TODO
   * @param translation Target language pattern
   * @param counts Map from target pattern to the number of times
   *               that pattern was returned as the translation of
   *               the source pattern.
   * @param totalTranslationCount Total number of translations
   *                              of the given source pattern.
   *                              If a translation was returned
   *                              multiple times, it should be
   *                              counted multiple times in this total.
   * @return Feature value array
   */
  protected float[] calculateFeatureValues(Pattern sourcePattern, int sourcePatternCount, HierarchicalPhrase translation, Map<Pattern,Integer> counts, float totalTranslationCount) {
     
    // Get translation probability
    float p_e_given_f =
      counts.get(translation) / totalTranslationCount;
    float logp_e_given_f = -1.0f * (float) Math.log10(p_e_given_f);
    if (Float.isInfinite(logp_e_given_f)) {
      p_e_given_f = PrefixTree.VERY_UNLIKELY;
    }
    if (logger.isLoggable(Level.FINER)) {
      logger.finer(
          "   prob( "+ translation.toString() + " | " +
          sourcePattern.toString() + " ) =  -log10(" +
          counts.get(translation)+ " / " +totalTranslationCount
          + ") = " + p_e_given_f);
    }

    // Get lexical translation probability
    float lex_p_e_given_f =
      lexProbs.lexProbTargetGivenSource(translation, sourcePattern);
    float lex_logp_e_given_f =
      -1.0f * (float) Math.log10(lex_p_e_given_f);     
    if (Float.isInfinite(lex_logp_e_given_f)) {
      lex_p_e_given_f = PrefixTree.VERY_UNLIKELY;
    }
    if (logger.isLoggable(Level.FINER)) {
      logger.finer(
          "lexprob( " + translation.toString() + " | " +
          sourcePattern.toString() + " ) =  -log10(" +
          lex_p_e_given_f + ") = " + lex_logp_e_given_f);
    }

    // Get reveres lexical translation probability
    float lex_p_f_given_e =
      lexProbs.lexProbSourceGivenTarget(sourcePattern, translation);
    float lex_logp_f_given_e =
      -1.0f * (float) Math.log10(lex_p_f_given_e);
    if (Float.isInfinite(lex_logp_f_given_e)) {
      lex_p_f_given_e = PrefixTree.VERY_UNLIKELY;
    }
    if (logger.isLoggable(Level.FINER)) {
      logger.finer(
          "lexprob( " + sourcePattern.toString() + " | " +
          translation.toString()+ " ) =  -log10(" +
          lex_p_f_given_e + ") = " + lex_logp_f_given_e);
    }

//    int tenOrMore = (sourcePatternCount >= 10) ? 1 : 0;
//    int hundredOrMore = (sourcePatternCount >= 100) ? 1 : 0;
//    int thousandOrMore = (sourcePatternCount >= 1000) ? 1 : 0;
   
    float[] featureScores = {
        logp_e_given_f
        ,lex_logp_f_given_e 
        ,lex_logp_e_given_f
//        ,tenOrMore
//        ,hundredOrMore
//        ,thousandOrMore
    };
   
    return featureScores;
  }

  /**
   * Builds a hierarchical phrase in the target language
   * substituting the terminal sequences in the target side
   * with nonterminal symbols corresponding to the source
   * nonterminals.
   * <p>
   * This assumes that the source and target spans are
   * consistent.
   * <p>
   * Each source nonterminal gap is mapped to a target span via
   * <code>alignments.getConsistentTargetSpan</code>. Nonterminal
   * labels are assigned in source order and the labeled target
   * spans are then sorted, so the labels may appear reordered on
   * the target side. Only two nonterminal IDs are available
   * (<code>nonterminalIDs</code>), so this method assumes the
   * source phrase has at most two nonterminals.
   * <p>
   * No translation is constructed (null is returned) when:
   * <ul>
   * <li>the source span is larger than <code>maxPhraseSpan</code>;</li>
   * <li>the source phrase has no nonterminals and the source span
   *     is larger than <code>maxPhraseLength</code>;</li>
   * <li>the source span of any nonterminal is smaller than
   *     <code>minNonterminalSpan</code>;</li>
   * <li>any nonterminal has no consistent target span, or its
   *     target span equals the whole target span;</li>
   * <li>the target side begins with a nonterminal and the
   *     nonterminal spans together are at least as large as the
   *     target span;</li>
   * <li><code>alignments.hasAlignedTerminal</code> is false for every
   *     target terminal (only checked when nonterminals are present).</li>
   * </ul>
   *
   * @param sourcePhrases Source language phrases to be translated.
   * @param sourcePhraseIndex Index, within sourcePhrases, of the
   *            particular source phrase to translate.
   * @param sourceSpan Span in the corpus of the source phrase;
   *            this is needed because the accurate span will
   *            not be in the sourcePhrase if it starts or
   *            ends with a nonterminal
   * @param targetSpan Span in the target corpus of the target
   *            phrase.
   * @param sourceStartsWithNT Indicates whether or not the
   *            source phrase starts with a nonterminal.
   * @param sourceEndsWithNT Indicates whether or not the
   *            source phrase ends with a nonterminal.
   *
   * @return null if no translation can be constructed
   */
  protected HierarchicalPhrase constructTranslation(
      MatchedHierarchicalPhrases sourcePhrases, int sourcePhraseIndex,
      Span sourceSpan, Span targetSpan, boolean sourceStartsWithNT, boolean sourceEndsWithNT) {   
   
   
    if (logger.isLoggable(Level.FINE)) logger.fine("Constructing translation for source span " + sourceSpan + ", target span " + targetSpan);
       
    // Reject source spans that exceed the configured maximum
    if (sourceSpan.size() > this.maxPhraseSpan)
      return null;
   
    // Construct a pattern for the trivial case where there are no nonterminals
    if (sourcePhrases.arity() == 0) {

      if (sourceSpan.size() > this.maxPhraseLength) {
       
        return null;
       
      } else {
       
        // Copy the target words verbatim; there is nothing to substitute
        int[] words = new int[targetSpan.size()];

        for (int i=targetSpan.start; i<targetSpan.end; i++) {
          words[i-targetSpan.start] = targetCorpus.getWordID(i);
        }
       
        return new HierarchicalPhrase(
            words,
            targetSpan,
            Collections.<LabeledSpan>emptyList(),
            targetCorpus);
      }
    }

   
    // Handle the more complex cases...
    List<LabeledSpan> targetNTSpans = new ArrayList<LabeledSpan>();
    // Running length of the target pattern: starts as the full target span
    // and shrinks as each nonterminal span collapses to a single symbol
    int patternSize = targetSpan.size();
   
    // Index into nonterminalIDs of the next label to assign (source order)
    int ntIndex = 0;
   
    // For each non terminal in the source, find their corresponding positions in the target span...
   
    // If the source phrase starts with a nonterminal, we have to handle that NT as a special case
    if (sourceStartsWithNT) {
     
      int firstTerminalIndex = sourcePhrases.getFirstTerminalIndex(sourcePhraseIndex);
     
      if (firstTerminalIndex - sourceSpan.start < minNonterminalSpan) {
       
        return null;
       
      } else {
        // If the source phrase starts with NT, then we need to calculate the span of the first NT
        Span nonterminalSourceSpan = new Span(sourceSpan.start, firstTerminalIndex);
        Span nonterminalTargetSpan = alignments.getConsistentTargetSpan(nonterminalSourceSpan);

        // A nonterminal must translate to something, but not to the entire target phrase
        if (nonterminalTargetSpan==null || nonterminalTargetSpan.equals(targetSpan)) return null;

        targetNTSpans.add(new LabeledSpan(nonterminalIDs[ntIndex],nonterminalTargetSpan));
        ntIndex++;
        // the pattern length will be reduced by the length of the non-terminal, and increased by 1 for the NT itself.
        patternSize = patternSize - nonterminalTargetSpan.size() +1;
      }
    }
   
    // Process all internal nonterminals
    // (one gap between each pair of consecutive terminal sequences)
    for (int i=0, n=sourcePhrases.getNumberOfTerminalSequences()-1; i<n; i++) {
     
      int nextStartIndex =
        sourcePhrases.getTerminalSequenceStartIndex(sourcePhraseIndex, i+1);
     
      int currentEndIndex =
        sourcePhrases.getTerminalSequenceEndIndex(sourcePhraseIndex, i);
     
      if (nextStartIndex - currentEndIndex < minNonterminalSpan) {
       
        return null;
       
      } else {
       
        Span nonterminalSourceSpan = new Span(currentEndIndex, nextStartIndex);

        Span nonterminalTargetSpan = alignments.getConsistentTargetSpan(nonterminalSourceSpan);

        if (nonterminalTargetSpan==null || nonterminalTargetSpan.equals(targetSpan)) return null;

        targetNTSpans.add(new LabeledSpan(nonterminalIDs[ntIndex],nonterminalTargetSpan));
        ntIndex++;
        patternSize = patternSize - nonterminalTargetSpan.size() + 1;
       
      }
    }
     
    // If the source phrase ends with a nonterminal, we have to handle that NT as a special case
    if (sourceEndsWithNT) {
     
      int lastTerminalIndex = sourcePhrases.getLastTerminalIndex(sourcePhraseIndex);
     
      if (sourceSpan.end - lastTerminalIndex < minNonterminalSpan) {
       
        return null;
       
      } else {

        // If the source phrase ends with NT, then we need to calculate the span of the last NT
        Span nonterminalSourceSpan = new Span(lastTerminalIndex, sourceSpan.end);

        Span nonterminalTargetSpan = alignments.getConsistentTargetSpan(nonterminalSourceSpan);
        if (logger.isLoggable(Level.FINEST)) logger.finest("Consistent target span " + nonterminalTargetSpan + " for NT source span " + nonterminalSourceSpan);


        if (nonterminalTargetSpan==null || nonterminalTargetSpan.equals(targetSpan)) return null;

        targetNTSpans.add(new LabeledSpan(nonterminalIDs[ntIndex],nonterminalTargetSpan));
        ntIndex++;
        patternSize = patternSize - nonterminalTargetSpan.size() + 1;

      }
    }
   
    // Becomes true once any copied target terminal is reported as aligned
    boolean foundAlignedTerminal = false;
   
    // Create the pattern...
    int[] words = new int[patternSize];
    // Next free position in words
    int patterCounter = 0;
   
    // Put the nonterminal spans into LabeledSpan's natural order so that they
    // can be interleaved with the target terminals from left to right
    Collections.sort(targetNTSpans);
   
    if (targetNTSpans.get(0).getSpan().start == targetSpan.start) {
     
      // The target side begins with a nonterminal: make sure the
      // nonterminals do not account for the entire target span
      int ntCumulativeSpan = 0;
     
      for (LabeledSpan span : targetNTSpans) {
        ntCumulativeSpan += span.size();
      }
     
      if (ntCumulativeSpan >= targetSpan.size()) {
        return null;
      }
     
    } else {
      // if we don't start with a non-terminal, then write out all the words
      // until we get to the first non-terminal
      for (int i = targetSpan.start; i < targetNTSpans.get(0).getSpan().start; i++) {
        if (!foundAlignedTerminal) {
          foundAlignedTerminal = alignments.hasAlignedTerminal(i, sourcePhrases, sourcePhraseIndex);
        }
        words[patterCounter] = targetCorpus.getWordID(i);
        patterCounter++;
      }
    }

    // add the first non-terminal
    words[patterCounter] = targetNTSpans.get(0).getLabel();
    patterCounter++;
   
    // add everything until the final non-terminal
    for(int i = 1; i < targetNTSpans.size(); i++) {
      LabeledSpan NT1 = targetNTSpans.get(i-1);
      LabeledSpan NT2 = targetNTSpans.get(i);
     
      // terminals lying between the previous nonterminal and this one
      for(int j = NT1.getSpan().end; j < NT2.getSpan().start; j++) {
        if (!foundAlignedTerminal) {
          foundAlignedTerminal = alignments.hasAlignedTerminal(j, sourcePhrases, sourcePhraseIndex);
        }
        words[patterCounter] = targetCorpus.getWordID(j);
        patterCounter++;
      }
      words[patterCounter] = NT2.getLabel();
      patterCounter++;
    }
   
    // if we don't end with a non-terminal, then write out all remaining words
    if(targetNTSpans.get(targetNTSpans.size()-1).getSpan().end != targetSpan.end) {
      // copy the trailing terminals that follow the last non-terminal
      for(int i = targetNTSpans.get(targetNTSpans.size()-1).getSpan().end; i < targetSpan.end; i++) {
        if (!foundAlignedTerminal) {
          foundAlignedTerminal = alignments.hasAlignedTerminal(i, sourcePhrases, sourcePhraseIndex);
        }
        words[patterCounter] = targetCorpus.getWordID(i);
        patterCounter++;
      }
    }
   
    // Only accept translations containing at least one aligned terminal
    if (foundAlignedTerminal) {
      return new HierarchicalPhrase(
          words,
          targetSpan,
          targetNTSpans,
          targetCorpus);
    } else {
      if (logger.isLoggable(Level.FINEST)) logger.finest("Potential translation contained no aligned terminals");
      return null;
    }
   
  }
 
  protected ArrayList<HierarchicalPhrase> getTranslations(MatchedHierarchicalPhrases sourceHierarchicalPhrases) {
   
    int listSize = sourceHierarchicalPhrases.size();
    int stepSize; {
      if (listSize <= sampleSize) {
        stepSize = 1;
      } else {
        stepSize = listSize / sampleSize;
      }
    }
   
    ArrayList<HierarchicalPhrase> translations = new ArrayList<HierarchicalPhrase>();
   
    // For each sample HierarchicalPhrase
    for (int i=0, n=sourceHierarchicalPhrases.size(); i<n; i+=stepSize) {

      HierarchicalPhrase translation = getTranslation(sourceHierarchicalPhrases, i);
      if (translation != null) {
        translations.add(translation);
      }
    }
   
    return translations;
  }
 
  /**
   * Gets the target side translation pattern for a particular
   * source phrase.
   * <p>
   * This is a fairly involved method - the complications
   * arise because we must handle 4 cases:
   * <ul>
   * <li>The source phrase neither starts nor ends with a
   *     nonterminal</li>
   * <li>The source phrase starts but doesn't end with a
   *     nonterminal</li>
   * <li>The source phrase ends but doesn't start with a
   *     nonterminal</li>
   * <li>The source phrase both starts and ends with a
   *     nonterminal</li>
   * </ul>
   * <p>
   * When a hierarchical phrase begins (or ends) with a
   * nonterminal its start (or end) point is <em>not</em>
   * explicitly stored. This is by design to allow a hierarchical
   * phrase to describe a set of possibly matching points in
   * the corpus, but it complicates this method.
   * <p>
   * In cases 2 through 4 the extent of the boundary nonterminal(s)
   * is therefore unknown, so candidate source spans are tried in
   * order of increasing size, staying within the sentence and within
   * <code>maxNonterminalSpan</code> and <code>maxPhraseSpan</code>.
   * The first candidate that yields a translation wins.
   *
   * @param sourcePhrase Matched source phrases; the phrase to
   *            translate is selected by sourcePhraseIndex
   * @param sourcePhraseIndex Index, within sourcePhrase, of the
   *            particular source phrase to translate
   * @return the target side translation pattern for a particular source phrase,
   *         or null if no translation could be constructed
   */
  protected HierarchicalPhrase getTranslation(MatchedHierarchicalPhrases sourcePhrase, int sourcePhraseIndex) {
   
    // Case 1:  If sample !startsWithNT && !endsWithNT
    if (!sourcePhrase.startsWithNonterminal() && !sourcePhrase.endsWithNonterminal()) {
     
      if (logger.isLoggable(Level.FINER)) logger.finer("Case 1: Source phrase !startsWithNT && !endsWithNT");
     
      // Get target span
      // (the source span is fully determined here, so there is exactly one candidate)
      Span sourceSpan = sourcePhrase.getSpan(sourcePhraseIndex);

      Span targetSpan = alignments.getConsistentTargetSpan(sourceSpan);
     
      // If target span and source span are consistent
      // (the target needs room for every nonterminal plus at least one more word,
      //  and must not exceed the maximum phrase span)
      if (targetSpan!=null && targetSpan.size()>=sourcePhrase.arity()+1 && targetSpan.size()<=maxPhraseSpan) {
       
        // Construct a translation
        HierarchicalPhrase translation = constructTranslation(sourcePhrase, sourcePhraseIndex, sourceSpan, targetSpan, false, false);
       
        if (translation != null) {
          if (logger.isLoggable(Level.FINEST)) logger.finest("\tCase 1: Adding translation: '" + translation + "' for target span " + targetSpan + " from source span " + sourceSpan);

          return translation;
        } else if (logger.isLoggable(Level.FINER)) {
          logger.finer("No valid translation returned from attempt to construct translation for source span " + sourceSpan + ", target span " + targetSpan);
        }
       
      }
     
    }
   
    // Case 2: If sourcePhrase startsWithNT && !endsWithNT
    else if (sourcePhrase.startsWithNonterminal() && !sourcePhrase.endsWithNonterminal()) {
     
      if (logger.isLoggable(Level.FINER)) logger.finer("Case 2: Source phrase startsWithNT && !endsWithNT");
     
      int sentenceNumber = sourcePhrase.getSentenceNumber(sourcePhraseIndex);
      int startOfSentence = sourceSuffixArray.getCorpus().getSentencePosition(sentenceNumber);
      int startOfTerminalSequence = sourcePhrase.getFirstTerminalIndex(sourcePhraseIndex);
      int endOfTerminalSequence = sourcePhrase.getLastTerminalIndex(sourcePhraseIndex);
     
      // Start by assuming the initial source nonterminal starts one word before the first source terminal
      Span possibleSourceSpan = new Span(startOfTerminalSequence-1, endOfTerminalSequence);
     
      // Loop over all legal source spans
      //      (this is variable because we don't know the length of the NT span)
      //      looking for a source span with a consistent translation
      while (possibleSourceSpan.start >= startOfSentence &&
          startOfTerminalSequence-possibleSourceSpan.start<=maxNonterminalSpan &&
          endOfTerminalSequence-possibleSourceSpan.start<=maxPhraseSpan) {
       
        // Get target span
        Span targetSpan = alignments.getConsistentTargetSpan(possibleSourceSpan);

        // If target span and source span are consistent
        if (targetSpan!=null && targetSpan.size()>=sourcePhrase.arity()+1 && targetSpan.size()<=maxPhraseSpan) {

          // Construct a translation
          HierarchicalPhrase translation = constructTranslation(sourcePhrase, sourcePhraseIndex, possibleSourceSpan, targetSpan, true, false);

          if (translation != null) {
            if (logger.isLoggable(Level.FINEST)) logger.finest("\tCase 2: Adding translation: '" + translation + "' for target span " + targetSpan + " from source span " + possibleSourceSpan);

            return translation;
          }

        }
       
        // Widen the leading nonterminal by one word to the left (span is mutated in place)
        possibleSourceSpan.start--;
       
      }
     
    }
   
    // Case 3: If sourcePhrase !startsWithNT && endsWithNT
    else if (!sourcePhrase.startsWithNonterminal() && sourcePhrase.endsWithNonterminal()) {
     
      if (logger.isLoggable(Level.FINER)) logger.finer("Case 3: Source phrase !startsWithNT && endsWithNT");
     
      int endOfSentence = sourceSuffixArray.getCorpus().getSentenceEndPosition(sourcePhrase.getSentenceNumber(sourcePhraseIndex));
      int startOfTerminalSequence = sourcePhrase.getFirstTerminalIndex(sourcePhraseIndex);
      int endOfTerminalSequence = sourcePhrase.getLastTerminalIndex(sourcePhraseIndex);
     
      // Start by assuming the final source nonterminal ends one word after the last source terminal
      Span possibleSourceSpan =
        new Span(startOfTerminalSequence, endOfTerminalSequence+1);
       
      // Loop over all legal source spans
      //      (this is variable because we don't know the length of the NT span)
      //      looking for a source span with a consistent translation
      while (possibleSourceSpan.end <= endOfSentence &&
          possibleSourceSpan.end - endOfTerminalSequence <= maxNonterminalSpan &&
          possibleSourceSpan.size()<=maxPhraseSpan) {
         
        // Get target span
        Span targetSpan = alignments.getConsistentTargetSpan(possibleSourceSpan);

        // If target span and source span are consistent
        if (targetSpan!=null && targetSpan.size()>=sourcePhrase.arity()+1 && targetSpan.size()<=maxPhraseSpan) {

          // Construct a translation
          HierarchicalPhrase translation = constructTranslation(sourcePhrase, sourcePhraseIndex, possibleSourceSpan, targetSpan, false, true);

          if (translation != null) {
            if (logger.isLoggable(Level.FINEST)) logger.finest("\tCase 3: Adding translation: '" + translation + "' for target span " + targetSpan + " from source span " + possibleSourceSpan);

            return translation;
          }

        }
       
        // Widen the trailing nonterminal by one word to the right (span is mutated in place)
        possibleSourceSpan.end++;
       
      }
     
    }
   
    // Case 4: If sourcePhrase startsWithNT && endsWithNT
    else if (sourcePhrase.startsWithNonterminal() && sourcePhrase.endsWithNonterminal()) {
     
      if (logger.isLoggable(Level.FINER)) logger.finer("Case 4: Source phrase startsWithNT && endsWithNT");
     
      int sentenceNumber = sourcePhrase.getSentenceNumber(sourcePhraseIndex);
      int startOfSentence = sourceSuffixArray.getCorpus().getSentencePosition(sentenceNumber);
      int endOfSentence = sourceSuffixArray.getCorpus().getSentenceEndPosition(sentenceNumber);
      int startOfTerminalSequence = sourcePhrase.getFirstTerminalIndex(sourcePhraseIndex);
      int endOfTerminalSequence = sourcePhrase.getLastTerminalIndex(sourcePhraseIndex);
     
      // Start by assuming the initial source nonterminal
      //   starts one word before the first source terminal and
      //   ends one word after the last source terminal
      Span possibleSourceSpan =
        new Span(startOfTerminalSequence-1, endOfTerminalSequence+1);
       
      // Loop over all legal source spans
      //      (this is variable because we don't know the length of the NT span)
      //      looking for a source span with a consistent translation
      while (possibleSourceSpan.start >= startOfSentence &&
          possibleSourceSpan.end <= endOfSentence &&
          startOfTerminalSequence-possibleSourceSpan.start<=maxNonterminalSpan &&
          possibleSourceSpan.end-endOfTerminalSequence<=maxNonterminalSpan &&
          possibleSourceSpan.size()<=maxPhraseSpan) {
   
        // Get target span
        Span targetSpan = alignments.getConsistentTargetSpan(possibleSourceSpan);

        // If target span and source span are consistent
        if (targetSpan!=null && targetSpan.size()>=sourcePhrase.arity()+1 && targetSpan.size()<=maxPhraseSpan) {

          // Construct a translation
          HierarchicalPhrase translation = constructTranslation(sourcePhrase, sourcePhraseIndex, possibleSourceSpan, targetSpan, true, true);

          if (translation != null) {
            if (logger.isLoggable(Level.FINEST)) logger.finest("\tCase 4: Adding translation: '" + translation + "' for target span " + targetSpan + " from source span " + possibleSourceSpan);

            return translation;
          }

        }
       
        // Enumerate (start, end) pairs: for the current start, grow the end one
        // word at a time while that stays legal; once it cannot grow any further,
        // reset the end to one word past the last terminal and move the start
        // one word to the left.
        if (possibleSourceSpan.end < endOfSentence && possibleSourceSpan.end-endOfTerminalSequence+1<=maxNonterminalSpan && possibleSourceSpan.size()+1<=maxPhraseSpan) {
          possibleSourceSpan.end++;
        } else {
          possibleSourceSpan.end = endOfTerminalSequence+1;
          possibleSourceSpan.start--;
        }
                   
      }
     
    }
   
    // No candidate span produced a translation.
    // Is this the right thing to do, or should we throw an Error?
    return null;
  }


}
TOP

Related Classes of joshua.prefix_tree.HierarchicalRuleExtractor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.