/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.prefix_tree;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.corpus.Corpus;
import joshua.corpus.LabeledSpan;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.RuleExtractor;
import joshua.corpus.Span;
import joshua.corpus.alignment.Alignments;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.suffix_array.HierarchicalPhrase;
import joshua.corpus.suffix_array.Pattern;
import joshua.corpus.suffix_array.Suffixes;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.tm.BasicRuleCollection;
import joshua.decoder.ff.tm.BilingualRule;
import joshua.decoder.ff.tm.MonolingualRule;
import joshua.decoder.ff.tm.Rule;
import joshua.util.Cache;
/**
* Rule extractor for Hiero-style hierarchical phrase-based
* translation.
*
* @author Lane Schwartz
* @version $LastChangedDate: 2010-01-28 12:28:43 -0600 (Thu, 28 Jan 2010) $
*/
public class HierarchicalRuleExtractor implements RuleExtractor {
/** Logger for this class. */
private static final Logger logger =
Logger.getLogger(HierarchicalRuleExtractor.class.getName());
/** Lexical translation probabilities. */
protected final LexicalProbabilities lexProbs;
/**
* Max span in the source corpus of any extracted hierarchical
* phrase
*/
protected final int maxPhraseSpan;
/**
* Maximum number of terminals plus nonterminals allowed
* in any extracted hierarchical phrase.
*/
protected final int maxPhraseLength;
/**
* Minimum span in the source corpus of any nonterminal in
* an extracted hierarchical phrase.
*/
protected final int minNonterminalSpan;
/**
* Maximum span in the source corpus of any nonterminal in
* an extracted hierarchical phrase.
*/
protected final int maxNonterminalSpan;
/** Suffix array representing the source language corpus. */
protected final Suffixes sourceSuffixArray;
/** Suffix array representing the target language corpus. */
protected final Suffixes targetSuffixArray;
/**
* Target language corpus, obtained from the target suffix array
* when this extractor is constructed.
*/
protected final Corpus targetCorpus;
/**
* Represents alignments between words in the source corpus
* and the target corpus.
*/
protected final Alignments alignments;
/**
* Feature functions handed to the rule sorter after extraction.
* May be null, in which case extracted rules are left unsorted.
*/
protected final ArrayList<FeatureFunction> models;
/**
* Specifies the maximum number of rules
* that will be extracted for any source pattern
*/
protected final int sampleSize;
/**
* Integer identifiers for the indexed nonterminals.
* Element 0 is the ID registered for SymbolTable.X1_STRING and
* element 1 is the ID registered for SymbolTable.X2_STRING.
*/
protected final int[] nonterminalIDs;
/**
 * Creates an extractor of Hiero-style hierarchical phrase-based
 * translation rules.
 * <p>
 * Besides storing its arguments, the constructor registers the two
 * indexed nonterminals (X1 and X2) in the source vocabulary and
 * remembers their integer identifiers.
 *
 * @param sourceSuffixArray  Suffix array representing the
 *                           source language corpus
 * @param targetSuffixArray  Suffix array representing the
 *                           target language corpus
 * @param alignments         Alignments between words in the
 *                           source corpus and the target corpus
 * @param lexProbs           Lexical translation probability table
 * @param models             Feature functions used to sort the
 *                           extracted rules; may be null, in which
 *                           case rules are not sorted
 * @param sampleSize         Maximum number of rules that will be
 *                           extracted for any source pattern
 * @param maxPhraseSpan      Max span in the source corpus of any
 *                           extracted hierarchical phrase
 * @param maxPhraseLength    Maximum number of terminals plus
 *                           nonterminals allowed in any extracted
 *                           hierarchical phrase
 * @param minNonterminalSpan Minimum span in the source corpus of
 *                           any nonterminal in an extracted
 *                           hierarchical phrase
 * @param maxNonterminalSpan Maximum span in the source corpus of
 *                           any nonterminal in an extracted
 *                           hierarchical phrase
 */
public HierarchicalRuleExtractor(
        Suffixes sourceSuffixArray,
        Suffixes targetSuffixArray,
        Alignments alignments,
        LexicalProbabilities lexProbs,
        ArrayList<FeatureFunction> models,
        int sampleSize,
        int maxPhraseSpan,
        int maxPhraseLength,
        int minNonterminalSpan,
        int maxNonterminalSpan) {

    // Corpus-level resources
    this.sourceSuffixArray = sourceSuffixArray;
    this.targetSuffixArray = targetSuffixArray;
    this.targetCorpus = targetSuffixArray.getCorpus();
    this.alignments = alignments;
    this.lexProbs = lexProbs;
    this.models = models;

    // Extraction limits
    this.sampleSize = sampleSize;
    this.maxPhraseSpan = maxPhraseSpan;
    this.maxPhraseLength = maxPhraseLength;
    this.minNonterminalSpan = minNonterminalSpan;
    this.maxNonterminalSpan = maxNonterminalSpan;

    // Register X1 first, then X2, in the source vocabulary
    final SymbolTable symbols = sourceSuffixArray.getVocabulary();
    final int firstNonterminal = symbols.addNonterminal(SymbolTable.X1_STRING);
    final int secondNonterminal = symbols.addNonterminal(SymbolTable.X2_STRING);
    this.nonterminalIDs = new int[] { firstNonterminal, secondNonterminal };
}
/**
 * Extracts the translation rules for the pattern matched by the
 * given source phrases. See also the Javadoc for the RuleExtractor
 * interface.
 * <p>
 * Results are memoized in the rule cache of the source suffix
 * array, keyed by source pattern: a cache hit is returned directly,
 * and freshly extracted rules are stored before being returned, so
 * the returned list is the same instance held by the cache.
 * <p>
 * One rule is produced per distinct translation returned by
 * {@link #getTranslations(MatchedHierarchicalPhrases)}. If
 * {@link #models} is non-null the rules are sorted with
 * BasicRuleCollection.sortRules before being cached.
 *
 * @param sourceHierarchicalPhrases Matched occurrences of a single
 *                                  source pattern
 * @return the rules extracted (or previously cached) for the
 *         source pattern
 */
public List<Rule> extractRules(MatchedHierarchicalPhrases sourceHierarchicalPhrases) {

    Pattern sourcePattern = sourceHierarchicalPhrases.getPattern();

    if (logger.isLoggable(Level.FINE)) logger.fine("Extracting rules for source pattern: " + sourcePattern);

    Cache<Pattern,List<Rule>> cache = sourceSuffixArray.getCachedRules();

    if (cache.containsKey(sourcePattern)) {
        return cache.get(sourcePattern);
    }

    ArrayList<HierarchicalPhrase> translations = getTranslations(sourceHierarchicalPhrases);

    // Count how many times each distinct translation was sampled
    Map<Pattern,Integer> counts = new HashMap<Pattern,Integer>();
    for (Pattern translation : translations) {
        if (translation != null) {
            Integer count = counts.get(translation);
            counts.put(translation, (null == count) ? 1 : count + 1);
        }
    }

    if (logger.isLoggable(Level.FINER)) {
        logger.finer(
                translations.size() + " actual translations of " +
                sourcePattern + " being stored.");
    }

    float p_e_given_f_denominator = translations.size();

    // We don't want to produce duplicate rules
    HashSet<HierarchicalPhrase> uniqueTranslations = new HashSet<HierarchicalPhrase>(translations);

    // Exactly one rule is created per unique translation, so size the
    // list accordingly. Sizing it by the number of source occurrences
    // would pin a needlessly large backing array in the rule cache.
    List<Rule> results = new ArrayList<Rule>(uniqueTranslations.size());

    int sourcePatternCount = sourceHierarchicalPhrases.size();

    for (HierarchicalPhrase translation : uniqueTranslations) {

        float[] featureScores =
            calculateFeatureValues(
                    sourcePattern,
                    sourcePatternCount,
                    translation,
                    counts, p_e_given_f_denominator);

        Rule rule = new BilingualRule(
                SymbolTable.X,
                sourcePattern.getWordIDs(),
                translation.getWordIDs(),
                featureScores,
                translation.arity(),
                sourceSuffixArray.getVocabulary().addTerminal(JoshuaConfiguration.phrase_owner),
                0.0f,
                MonolingualRule.DUMMY_RULE_ID);

        results.add(rule);
    }

    if (models != null) {
        BasicRuleCollection.sortRules(results, models);
    }

    cache.put(sourcePattern, results);

    return results;
}
/**
 * Placeholder for the source-given-target translation probability.
 * <p>
 * The current implementation does not inspect either argument and
 * always yields zero.
 *
 * @param sourcePattern Source language pattern (ignored)
 * @param targetPattern Target language pattern (ignored)
 * @return always 0.0f
 */
protected float calculateProbSourceGivenTarget(Pattern sourcePattern, Pattern targetPattern) {
    final float notYetComputed = 0.0f;
    return notYetComputed;
}
/**
 * Calculates feature values for a given source-target pair.
 * <p>
 * Each feature is a cost of the form -log10(probability). The
 * returned array holds, in this order:
 * <ol>
 * <li>the phrase translation cost, from
 *     counts.get(translation) / totalTranslationCount</li>
 * <li>the lexical cost of the source given the target</li>
 * <li>the lexical cost of the target given the source</li>
 * </ol>
 * A probability of zero would give an infinite cost; in that case
 * the cost is replaced by PrefixTree.VERY_UNLIKELY, so the returned
 * scores are never infinite.
 * NOTE(review): this assumes PrefixTree.VERY_UNLIKELY is itself a
 * cost on the same scale - confirm against PrefixTree.
 *
 * @param sourcePattern Source language pattern
 * @param sourcePatternCount Number of occurrences of the source
 *                      pattern; currently unused (it was only
 *                      consumed by count-threshold features that
 *                      are disabled)
 * @param translation   Target language pattern
 * @param counts        Map from target pattern to the number of times
 *                      that pattern was returned as the translation of
 *                      the source pattern; must contain an entry
 *                      for <code>translation</code>
 * @param totalTranslationCount Total number of translations
 *                      of the given source pattern.
 *                      If a translation was returned
 *                      multiple times, it should be
 *                      counted multiple times in this total.
 * @return Feature value array
 */
protected float[] calculateFeatureValues(Pattern sourcePattern, int sourcePatternCount, HierarchicalPhrase translation, Map<Pattern,Integer> counts, float totalTranslationCount) {

    final boolean logFiner = logger.isLoggable(Level.FINER);

    // Get translation probability
    float p_e_given_f =
        counts.get(translation) / totalTranslationCount;
    float logp_e_given_f = negativeLog10Cost(p_e_given_f);

    if (logFiner) {
        logger.finer(
                " prob( "+ translation.toString() + " | " +
                sourcePattern.toString() + " ) = -log10(" +
                counts.get(translation)+ " / " +totalTranslationCount
                + ") = " + logp_e_given_f);
    }

    // Get lexical translation probability
    float lex_p_e_given_f =
        lexProbs.lexProbTargetGivenSource(translation, sourcePattern);
    float lex_logp_e_given_f = negativeLog10Cost(lex_p_e_given_f);

    if (logFiner) {
        logger.finer(
                "lexprob( " + translation.toString() + " | " +
                sourcePattern.toString() + " ) = -log10(" +
                lex_p_e_given_f + ") = " + lex_logp_e_given_f);
    }

    // Get reverse lexical translation probability
    float lex_p_f_given_e =
        lexProbs.lexProbSourceGivenTarget(sourcePattern, translation);
    float lex_logp_f_given_e = negativeLog10Cost(lex_p_f_given_e);

    if (logFiner) {
        logger.finer(
                "lexprob( " + sourcePattern.toString() + " | " +
                translation.toString()+ " ) = -log10(" +
                lex_p_f_given_e + ") = " + lex_logp_f_given_e);
    }

    // Count-threshold features based on sourcePatternCount
    // (>= 10, >= 100, >= 1000) are intentionally not emitted.
    float[] featureScores = {
            logp_e_given_f,
            lex_logp_f_given_e,
            lex_logp_e_given_f
    };

    return featureScores;
}

/**
 * Converts a probability to a -log10 cost, substituting
 * PrefixTree.VERY_UNLIKELY when the cost would be infinite.
 *
 * @param probability Probability to convert
 * @return -log10(probability), or PrefixTree.VERY_UNLIKELY if that
 *         value is infinite
 */
private static float negativeLog10Cost(float probability) {
    float cost = -1.0f * (float) Math.log10(probability);
    // The clamp must be applied to the cost: it is the cost, not the
    // probability, that ends up in the feature vector.
    return Float.isInfinite(cost) ? PrefixTree.VERY_UNLIKELY : cost;
}
/**
* Builds a hierarchical phrase in the target language
* substituting the terminal sequences in the target side
* with nonterminal symbols corresponding to the source
* nonterminals.
* <p>
* This assumes that the source and target spans are
* consistent.
* <p>
* Outline: for every nonterminal gap on the source side (leading,
* internal, trailing) the consistent target span of that gap is
* looked up in the alignments and labeled with the next entry of
* nonterminalIDs, in source order. The target pattern is then
* written left to right over the target span, emitting one label
* per nonterminal span and the corpus word IDs for everything in
* between. The result is rejected unless at least one emitted
* target terminal is reported as aligned by the alignments.
*
* @param sourcePhrases Source language phrases to be translated.
* @param sourcePhraseIndex Index, within sourcePhrases, of the
* particular matched phrase to translate
* @param sourceSpan Span in the corpus of the source phrase;
* this is needed because the accurate span will
* not be in the sourcePhrase if it starts or
* ends with a nonterminal
* @param targetSpan Span in the target corpus of the target
* phrase.
* @param sourceStartsWithNT Indicates whether or not the
* source phrase starts with a nonterminal.
* @param sourceEndsWithNT Indicates whether or not the
* source phrase ends with a nonterminal.
*
* @return the target side hierarchical phrase, or
* null if no translation can be constructed
*/
protected HierarchicalPhrase constructTranslation(
MatchedHierarchicalPhrases sourcePhrases, int sourcePhraseIndex,
Span sourceSpan, Span targetSpan, boolean sourceStartsWithNT, boolean sourceEndsWithNT) {
if (logger.isLoggable(Level.FINE)) logger.fine("Constructing translation for source span " + sourceSpan + ", target span " + targetSpan);
// Reject source spans wider than the configured maximum
if (sourceSpan.size() > this.maxPhraseSpan)
return null;
// Construct a pattern for the trivial case where there are no nonterminals
if (sourcePhrases.arity() == 0) {
if (sourceSpan.size() > this.maxPhraseLength) {
return null;
} else {
// The translation is simply the words covered by the target span
int[] words = new int[targetSpan.size()];
for (int i=targetSpan.start; i<targetSpan.end; i++) {
words[i-targetSpan.start] = targetCorpus.getWordID(i);
}
return new HierarchicalPhrase(
words,
targetSpan,
Collections.<LabeledSpan>emptyList(),
targetCorpus);
}
}
// Handle the more complex cases...
List<LabeledSpan> targetNTSpans = new ArrayList<LabeledSpan>();
// Number of symbols in the target pattern; starts as the number of
// target words and is adjusted as nonterminal spans are collapsed
int patternSize = targetSpan.size();
// Index into nonterminalIDs of the next label to hand out
int ntIndex = 0;
// For each non terminal in the source, find their corresponding positions in the target span...
// If the source phrase starts with a nonterminal, we have to handle that NT as a special case
if (sourceStartsWithNT) {
int firstTerminalIndex = sourcePhrases.getFirstTerminalIndex(sourcePhraseIndex);
if (firstTerminalIndex - sourceSpan.start < minNonterminalSpan) {
return null;
} else {
// If the source phrase starts with NT, then we need to calculate the span of the first NT
Span nonterminalSourceSpan = new Span(sourceSpan.start, firstTerminalIndex);
Span nonterminalTargetSpan = alignments.getConsistentTargetSpan(nonterminalSourceSpan);
// No consistent target span, or the NT would cover the entire target phrase
if (nonterminalTargetSpan==null || nonterminalTargetSpan.equals(targetSpan)) return null;
targetNTSpans.add(new LabeledSpan(nonterminalIDs[ntIndex],nonterminalTargetSpan));
ntIndex++;
// the pattern length will be reduced by the length of the non-terminal, and increased by 1 for the NT itself.
patternSize = patternSize - nonterminalTargetSpan.size() +1;
}
}
// Process all internal nonterminals
// (one lies between each pair of adjacent source terminal sequences)
for (int i=0, n=sourcePhrases.getNumberOfTerminalSequences()-1; i<n; i++) {
int nextStartIndex =
sourcePhrases.getTerminalSequenceStartIndex(sourcePhraseIndex, i+1);
int currentEndIndex =
sourcePhrases.getTerminalSequenceEndIndex(sourcePhraseIndex, i);
if (nextStartIndex - currentEndIndex < minNonterminalSpan) {
return null;
} else {
Span nonterminalSourceSpan = new Span(currentEndIndex, nextStartIndex);
Span nonterminalTargetSpan = alignments.getConsistentTargetSpan(nonterminalSourceSpan);
if (nonterminalTargetSpan==null || nonterminalTargetSpan.equals(targetSpan)) return null;
targetNTSpans.add(new LabeledSpan(nonterminalIDs[ntIndex],nonterminalTargetSpan));
ntIndex++;
patternSize = patternSize - nonterminalTargetSpan.size() + 1;
}
}
// If the source phrase ends with a nonterminal, we have to handle that NT as a special case
if (sourceEndsWithNT) {
int lastTerminalIndex = sourcePhrases.getLastTerminalIndex(sourcePhraseIndex);
if (sourceSpan.end - lastTerminalIndex < minNonterminalSpan) {
return null;
} else {
// If the source phrase ends with NT, then we need to calculate the span of the last NT
Span nonterminalSourceSpan = new Span(lastTerminalIndex, sourceSpan.end);
Span nonterminalTargetSpan = alignments.getConsistentTargetSpan(nonterminalSourceSpan);
if (logger.isLoggable(Level.FINEST)) logger.finest("Consistent target span " + nonterminalTargetSpan + " for NT source span " + nonterminalSourceSpan);
if (nonterminalTargetSpan==null || nonterminalTargetSpan.equals(targetSpan)) return null;
targetNTSpans.add(new LabeledSpan(nonterminalIDs[ntIndex],nonterminalTargetSpan));
ntIndex++;
patternSize = patternSize - nonterminalTargetSpan.size() + 1;
}
}
// Becomes true once any emitted target terminal is reported as aligned
boolean foundAlignedTerminal = false;
// Create the pattern...
int[] words = new int[patternSize];
// Next free position in words
int patterCounter = 0;
// Labels were assigned in source order; sorting puts the spans into
// LabeledSpan's natural order. The code below relies on that being
// left-to-right target order - NOTE(review): confirm LabeledSpan.compareTo
Collections.sort(targetNTSpans);
if (targetNTSpans.get(0).getSpan().start == targetSpan.start) {
// The target pattern starts with a non-terminal.
// Reject it if the nonterminal spans together are at least as
// large as the whole target span
int ntCumulativeSpan = 0;
for (LabeledSpan span : targetNTSpans) {
ntCumulativeSpan += span.size();
}
if (ntCumulativeSpan >= targetSpan.size()) {
return null;
}
} else {
// if we don't start with a non-terminal, then write out all the words
// until we get to the first non-terminal
for (int i = targetSpan.start; i < targetNTSpans.get(0).getSpan().start; i++) {
if (!foundAlignedTerminal) {
foundAlignedTerminal = alignments.hasAlignedTerminal(i, sourcePhrases, sourcePhraseIndex);
}
words[patterCounter] = targetCorpus.getWordID(i);
patterCounter++;
}
}
// add the first non-terminal
words[patterCounter] = targetNTSpans.get(0).getLabel();
patterCounter++;
// add everything until the final non-terminal
for(int i = 1; i < targetNTSpans.size(); i++) {
LabeledSpan NT1 = targetNTSpans.get(i-1);
LabeledSpan NT2 = targetNTSpans.get(i);
// Terminals lying between two consecutive nonterminal spans
for(int j = NT1.getSpan().end; j < NT2.getSpan().start; j++) {
if (!foundAlignedTerminal) {
foundAlignedTerminal = alignments.hasAlignedTerminal(j, sourcePhrases, sourcePhraseIndex);
}
words[patterCounter] = targetCorpus.getWordID(j);
patterCounter++;
}
words[patterCounter] = NT2.getLabel();
patterCounter++;
}
// if we don't end with a non-terminal, then write out all remaining words
if(targetNTSpans.get(targetNTSpans.size()-1).getSpan().end != targetSpan.end) {
// copy the target words that follow the last non-terminal
for(int i = targetNTSpans.get(targetNTSpans.size()-1).getSpan().end; i < targetSpan.end; i++) {
if (!foundAlignedTerminal) {
foundAlignedTerminal = alignments.hasAlignedTerminal(i, sourcePhrases, sourcePhraseIndex);
}
words[patterCounter] = targetCorpus.getWordID(i);
patterCounter++;
}
}
// Only accept the translation if some target terminal was aligned
if (foundAlignedTerminal) {
return new HierarchicalPhrase(
words,
targetSpan,
targetNTSpans,
targetCorpus);
} else {
if (logger.isLoggable(Level.FINEST)) logger.finest("Potential translation contained no aligned terminals");
return null;
}
}
/**
 * Collects target side translations for a sample of the matched
 * source phrases.
 * <p>
 * Occurrences are visited at a fixed stride starting at index 0.
 * When there are more occurrences than {@link #sampleSize}, the
 * stride is the ceiling of (occurrences / sampleSize), so that no
 * more than sampleSize occurrences are examined. Rounding the
 * stride down instead would allow up to almost twice that many.
 * Occurrences for which no translation can be built are skipped,
 * so the returned list never contains null.
 * <p>
 * sampleSize is assumed to be positive.
 *
 * @param sourceHierarchicalPhrases Matched occurrences of a single
 *                                  source pattern
 * @return the translations found for the sampled occurrences,
 *         with one entry per successfully translated occurrence
 *         (duplicates are retained)
 */
protected ArrayList<HierarchicalPhrase> getTranslations(MatchedHierarchicalPhrases sourceHierarchicalPhrases) {

    final int listSize = sourceHierarchicalPhrases.size();

    final int stepSize;
    if (listSize <= sampleSize) {
        stepSize = 1;
    } else {
        // Ceiling division, written without (a + b - 1) to avoid overflow
        final int quotient = listSize / sampleSize;
        stepSize = (listSize % sampleSize == 0) ? quotient : quotient + 1;
    }

    ArrayList<HierarchicalPhrase> translations = new ArrayList<HierarchicalPhrase>();

    // For each sample HierarchicalPhrase
    for (int i = 0; i < listSize; i += stepSize) {
        HierarchicalPhrase translation = getTranslation(sourceHierarchicalPhrases, i);
        if (translation != null) {
            translations.add(translation);
        }
    }

    return translations;
}
/**
 * Gets the target side translation pattern for one particular
 * occurrence of a source phrase.
 * <p>
 * Four situations have to be distinguished, each handled by its
 * own helper:
 * <ul>
 * <li>the source phrase neither starts nor ends with a
 *     nonterminal</li>
 * <li>the source phrase starts, but does not end, with a
 *     nonterminal</li>
 * <li>the source phrase ends, but does not start, with a
 *     nonterminal</li>
 * <li>the source phrase both starts and ends with a
 *     nonterminal</li>
 * </ul>
 * <p>
 * When a hierarchical phrase begins (or ends) with a nonterminal,
 * its start (or end) point is <em>not</em> explicitly stored, so
 * that one phrase can describe a set of possibly matching points
 * in the corpus. In those cases candidate source spans of
 * increasing width are tried until one yields a translation.
 *
 * @param sourcePhrase      Matched source phrases
 * @param sourcePhraseIndex Index of the occurrence within
 *                          sourcePhrase to translate
 * @return the target side translation pattern for the given
 *         occurrence, or null if none could be constructed
 */
protected HierarchicalPhrase getTranslation(MatchedHierarchicalPhrases sourcePhrase, int sourcePhraseIndex) {

    final boolean startsWithNT = sourcePhrase.startsWithNonterminal();
    final boolean endsWithNT = sourcePhrase.endsWithNonterminal();

    if (startsWithNT) {
        return endsWithNT
                ? translateWithOpenBothEnds(sourcePhrase, sourcePhraseIndex)
                : translateWithOpenStart(sourcePhrase, sourcePhraseIndex);
    }

    if (endsWithNT) {
        return translateWithOpenEnd(sourcePhrase, sourcePhraseIndex);
    }

    return translateWithFixedSpan(sourcePhrase, sourcePhraseIndex);
}

/**
 * Tells whether a target span returned by the alignments can be
 * used: it must exist, be large enough to hold one more symbol
 * than the source phrase's arity, and not exceed maxPhraseSpan.
 */
private boolean isUsableTargetSpan(Span targetSpan, MatchedHierarchicalPhrases sourcePhrase) {
    if (targetSpan == null) {
        return false;
    }
    return targetSpan.size() >= sourcePhrase.arity() + 1
            && targetSpan.size() <= maxPhraseSpan;
}

/** Case 1: the source phrase has terminals at both ends, so its span is known exactly. */
private HierarchicalPhrase translateWithFixedSpan(MatchedHierarchicalPhrases sourcePhrase, int sourcePhraseIndex) {

    if (logger.isLoggable(Level.FINER)) logger.finer("Case 1: Source phrase !startsWithNT && !endsWithNT");

    Span sourceSpan = sourcePhrase.getSpan(sourcePhraseIndex);
    Span targetSpan = alignments.getConsistentTargetSpan(sourceSpan);

    if (!isUsableTargetSpan(targetSpan, sourcePhrase)) {
        return null;
    }

    HierarchicalPhrase translation = constructTranslation(sourcePhrase, sourcePhraseIndex, sourceSpan, targetSpan, false, false);

    if (translation == null) {
        if (logger.isLoggable(Level.FINER)) {
            logger.finer("No valid translation returned from attempt to construct translation for source span " + sourceSpan + ", target span " + targetSpan);
        }
        return null;
    }

    if (logger.isLoggable(Level.FINEST)) logger.finest("\tCase 1: Adding translation: '" + translation + "' for target span " + targetSpan + " from source span " + sourceSpan);
    return translation;
}

/** Case 2: leading nonterminal only; the candidate span grows leftwards one word at a time. */
private HierarchicalPhrase translateWithOpenStart(MatchedHierarchicalPhrases sourcePhrase, int sourcePhraseIndex) {

    if (logger.isLoggable(Level.FINER)) logger.finer("Case 2: Source phrase startsWithNT && !endsWithNT");

    final int sentence = sourcePhrase.getSentenceNumber(sourcePhraseIndex);
    final int sentenceStart = sourceSuffixArray.getCorpus().getSentencePosition(sentence);
    final int terminalsStart = sourcePhrase.getFirstTerminalIndex(sourcePhraseIndex);
    final int terminalsEnd = sourcePhrase.getLastTerminalIndex(sourcePhraseIndex);

    // Begin with the leading nonterminal covering exactly one word,
    // then widen it while the sentence boundary, the nonterminal
    // limit and the phrase limit all still hold.
    for (Span candidate = new Span(terminalsStart - 1, terminalsEnd);
            candidate.start >= sentenceStart
                && terminalsStart - candidate.start <= maxNonterminalSpan
                && terminalsEnd - candidate.start <= maxPhraseSpan;
            candidate.start--) {

        Span targetSpan = alignments.getConsistentTargetSpan(candidate);

        if (isUsableTargetSpan(targetSpan, sourcePhrase)) {
            HierarchicalPhrase translation = constructTranslation(sourcePhrase, sourcePhraseIndex, candidate, targetSpan, true, false);
            if (translation != null) {
                if (logger.isLoggable(Level.FINEST)) logger.finest("\tCase 2: Adding translation: '" + translation + "' for target span " + targetSpan + " from source span " + candidate);
                return translation;
            }
        }
    }

    return null;
}

/** Case 3: trailing nonterminal only; the candidate span grows rightwards one word at a time. */
private HierarchicalPhrase translateWithOpenEnd(MatchedHierarchicalPhrases sourcePhrase, int sourcePhraseIndex) {

    if (logger.isLoggable(Level.FINER)) logger.finer("Case 3: Source phrase !startsWithNT && endsWithNT");

    final int sentenceEnd = sourceSuffixArray.getCorpus().getSentenceEndPosition(sourcePhrase.getSentenceNumber(sourcePhraseIndex));
    final int terminalsStart = sourcePhrase.getFirstTerminalIndex(sourcePhraseIndex);
    final int terminalsEnd = sourcePhrase.getLastTerminalIndex(sourcePhraseIndex);

    // Begin with the trailing nonterminal covering exactly one word
    for (Span candidate = new Span(terminalsStart, terminalsEnd + 1);
            candidate.end <= sentenceEnd
                && candidate.end - terminalsEnd <= maxNonterminalSpan
                && candidate.size() <= maxPhraseSpan;
            candidate.end++) {

        Span targetSpan = alignments.getConsistentTargetSpan(candidate);

        if (isUsableTargetSpan(targetSpan, sourcePhrase)) {
            HierarchicalPhrase translation = constructTranslation(sourcePhrase, sourcePhraseIndex, candidate, targetSpan, false, true);
            if (translation != null) {
                if (logger.isLoggable(Level.FINEST)) logger.finest("\tCase 3: Adding translation: '" + translation + "' for target span " + targetSpan + " from source span " + candidate);
                return translation;
            }
        }
    }

    return null;
}

/**
 * Case 4: nonterminals at both ends. For each width of the leading
 * nonterminal, every permitted width of the trailing nonterminal
 * is tried before the leading one is widened by another word.
 */
private HierarchicalPhrase translateWithOpenBothEnds(MatchedHierarchicalPhrases sourcePhrase, int sourcePhraseIndex) {

    if (logger.isLoggable(Level.FINER)) logger.finer("Case 4: Source phrase startsWithNT && endsWithNT");

    final int sentence = sourcePhrase.getSentenceNumber(sourcePhraseIndex);
    final int sentenceStart = sourceSuffixArray.getCorpus().getSentencePosition(sentence);
    final int sentenceEnd = sourceSuffixArray.getCorpus().getSentenceEndPosition(sentence);
    final int terminalsStart = sourcePhrase.getFirstTerminalIndex(sourcePhraseIndex);
    final int terminalsEnd = sourcePhrase.getLastTerminalIndex(sourcePhraseIndex);

    // Begin with both outer nonterminals covering exactly one word each
    Span candidate = new Span(terminalsStart - 1, terminalsEnd + 1);

    while (candidate.start >= sentenceStart
            && candidate.end <= sentenceEnd
            && terminalsStart - candidate.start <= maxNonterminalSpan
            && candidate.end - terminalsEnd <= maxNonterminalSpan
            && candidate.size() <= maxPhraseSpan) {

        Span targetSpan = alignments.getConsistentTargetSpan(candidate);

        if (isUsableTargetSpan(targetSpan, sourcePhrase)) {
            HierarchicalPhrase translation = constructTranslation(sourcePhrase, sourcePhraseIndex, candidate, targetSpan, true, true);
            if (translation != null) {
                if (logger.isLoggable(Level.FINEST)) logger.finest("\tCase 4: Adding translation: '" + translation + "' for target span " + targetSpan + " from source span " + candidate);
                return translation;
            }
        }

        // Prefer widening the trailing nonterminal; once that is no
        // longer allowed, reset it to one word and widen the leading one.
        final boolean trailingCanGrow =
                candidate.end < sentenceEnd
                && candidate.end - terminalsEnd + 1 <= maxNonterminalSpan
                && candidate.size() + 1 <= maxPhraseSpan;

        if (trailingCanGrow) {
            candidate.end++;
        } else {
            candidate.end = terminalsEnd + 1;
            candidate.start--;
        }
    }

    return null;
}
}