/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.corpus.Corpus;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.Phrase;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.ff.tm.Rule;
import joshua.util.Cache;
/**
* This class provides a mostly-complete implementation of the
* <code>Suffixes</code> interface, designed to minimize the effort
* required to build a concrete implementation of a suffix array
* data structure.
* <p>
* To implement a concrete suffix array, the programmer need only
* implement the <code>getCorpusIndex(int suffixIndex)</code> and
* <code>size()</code> methods.
*
* @author Lane Schwartz
* @author Chris Callison-Burch
*/
public abstract class AbstractSuffixArray implements Suffixes {

    /** Logger for this class. */
    private static final Logger logger =
        Logger.getLogger(AbstractSuffixArray.class.getName());

    /**
     * Maps from patterns to lists of hierarchical phrases that
     * match the corresponding pattern in the corpus.
     * <p>
     * This cache is a most-recently accessed map, so commonly
     * accessed patterns will remain in the cache, while rare
     * patterns will eventually drop out of the cache.
     */
    protected final Cache<Pattern,MatchedHierarchicalPhrases> hierarchicalPhraseCache;

    /**
     * Maps from patterns to the lists of rules stored for the
     * corresponding pattern.
     * <p>
     * This class only stores and exposes this cache (see
     * {@link #getCachedRules()}); it never reads from or writes
     * to it itself.
     */
    protected final Cache<Pattern,List<Rule>> ruleCache;

    /**
     * Corpus upon which this suffix array is based.
     */
    protected final Corpus corpus;

    /**
     * Constructs an abstract suffix array based on the provided
     * corpus.
     * <p>
     * The specified caches are stored as-is (no copies are made).
     * The hierarchical phrase cache will be used to store matched
     * hierarchical phrases for frequently accessed patterns.
     *
     * @param corpus Corpus upon which this suffix array is based.
     * @param hierarchicalPhraseCache Cache to store matched
     *               hierarchical phrases for frequently accessed
     *               patterns
     * @param ruleCache Cache mapping patterns to lists of rules;
     *               exposed to callers via {@link #getCachedRules()}
     */
    public AbstractSuffixArray(
            Corpus corpus,
            Cache<Pattern,MatchedHierarchicalPhrases> hierarchicalPhraseCache,
            Cache<Pattern,List<Rule>> ruleCache) {

        this.hierarchicalPhraseCache = hierarchicalPhraseCache;
        this.ruleCache = ruleCache;
        this.corpus = corpus;
    }

    /**
     * Gets the cache of matched hierarchical phrases used by this
     * suffix array. See also the Javadoc for the Suffixes interface.
     *
     * @return the cache instance supplied at construction time
     */
    public Cache<Pattern,MatchedHierarchicalPhrases> getCachedHierarchicalPhrases() {
        return hierarchicalPhraseCache;
    }

    /**
     * Gets the rule cache associated with this suffix array.
     * See also the Javadoc for the Suffixes interface.
     *
     * @return the rule cache instance supplied at construction time
     */
    public Cache<Pattern,List<Rule>> getCachedRules() {
        return this.ruleCache;
    }

    /**
     * Gets the matches for a pattern, consulting the hierarchical
     * phrase cache first and computing (and caching) the matches
     * when the pattern is not yet cached.
     * See also the Javadoc for the Suffixes interface.
     * <p>
     * Uncached patterns are handled by case:
     * <ul>
     * <li>no nonterminals: looked up directly in the suffix array;</li>
     * <li>only nonterminals: built from an empty list of positions;</li>
     * <li>exactly one nonterminal, at the start or at the end:
     *     derived from the matches of the remaining tokens;</li>
     * <li>anything else: the matches of the pattern's prefix and
     *     suffix (each one token shorter) are computed recursively
     *     and combined by <code>HierarchicalPhrases.queryIntersect</code>.</li>
     * </ul>
     *
     * @param pattern Pattern to look up
     * @param minNonterminalSpan passed through to recursive calls
     *                           and to the query intersection
     * @param maxPhraseSpan passed through to recursive calls
     *                      and to the query intersection
     * @return the matches for the pattern
     */
    public MatchedHierarchicalPhrases createHierarchicalPhrases(Pattern pattern, int minNonterminalSpan, int maxPhraseSpan) {

        if (hierarchicalPhraseCache.containsKey(pattern)) {
            return hierarchicalPhraseCache.get(pattern);
        }

        int arity = pattern.arity();
        int size = pattern.size();
        int[] patternTokens = pattern.getWordIDs();
        SymbolTable vocab = corpus.getVocabulary();

        if (arity == 0) {
            // Contiguous phrase: locate it directly in the suffix array.
            // The helper called below takes care of caching the result.
            int[] bounds = this.findPhrase(pattern, 0, size, 0, this.size()-1);
            int[] startPositions = this.getAllPositions(bounds);
            return this.createTriviallyHierarchicalPhrases(startPositions, pattern, vocab);
        }

        if (arity == size) {
            // Every token is a nonterminal: there are no start positions to record.
            // The helper called below takes care of caching the result.
            return this.createTriviallyHierarchicalPhrases(new int[0], pattern, vocab);
        }

        if (arity == 1 && pattern.startsWithNonterminal()) {
            // Drop the leading nonterminal and match the remaining tokens.
            int[] terminals = new int[size-1];
            System.arraycopy(patternTokens, 1, terminals, 0, size-1);

            Pattern terminalsPattern = new Pattern(vocab, terminals);
            MatchedHierarchicalPhrases terminalsMatch =
                this.createHierarchicalPhrases(terminalsPattern, minNonterminalSpan, maxPhraseSpan);

            MatchedHierarchicalPhrases result = terminalsMatch.copyWithInitialX();
            hierarchicalPhraseCache.put(pattern, result);
            return result;
        }

        if (arity == 1 && pattern.endsWithNonterminal()) {
            // Drop the trailing nonterminal and match the remaining tokens.
            int[] terminals = new int[size-1];
            System.arraycopy(patternTokens, 0, terminals, 0, size-1);

            Pattern terminalsPattern = new Pattern(vocab, terminals);
            MatchedHierarchicalPhrases terminalsMatch =
                this.createHierarchicalPhrases(terminalsPattern, minNonterminalSpan, maxPhraseSpan);

            MatchedHierarchicalPhrases result = terminalsMatch.copyWithFinalX();
            hierarchicalPhraseCache.put(pattern, result);
            return result;
        }

        // General case: intersect the matches of the prefix (all but the
        // last token) with the matches of the suffix (all but the first token).
        int shorterLength = patternTokens.length - 1;

        int[] prefixTokens = new int[shorterLength];
        System.arraycopy(patternTokens, 0, prefixTokens, 0, shorterLength);

        int[] suffixTokens = new int[shorterLength];
        System.arraycopy(patternTokens, 1, suffixTokens, 0, shorterLength);

        Pattern prefix = new Pattern(vocab, prefixTokens);
        Pattern suffix = new Pattern(vocab, suffixTokens);

        MatchedHierarchicalPhrases prefixMatches = createHierarchicalPhrases(prefix, minNonterminalSpan, maxPhraseSpan);
        MatchedHierarchicalPhrases suffixMatches = createHierarchicalPhrases(suffix, minNonterminalSpan, maxPhraseSpan);

        MatchedHierarchicalPhrases result =
            HierarchicalPhrases.queryIntersect(
                    pattern, prefixMatches, suffixMatches,
                    minNonterminalSpan, maxPhraseSpan, this);

        hierarchicalPhraseCache.put(pattern, result);
        return result;
    }

    /**
     * Gets the matches for a pattern from the cache, or builds them
     * from the given start positions and caches them if the pattern
     * is not yet cached.
     * See also the Javadoc for the Suffixes interface.
     * <p>
     * Note: when the pattern is not cached, <code>startPositions</code>
     * is sorted in place.
     *
     * @param startPositions corpus positions at which the pattern starts
     * @param pattern Pattern the positions belong to
     * @param vocab Symbol table (not used by this implementation)
     * @return the cached matches for the pattern if present,
     *         otherwise the newly constructed (and now cached) matches
     */
    public MatchedHierarchicalPhrases createTriviallyHierarchicalPhrases(int[] startPositions,
            Pattern pattern, SymbolTable vocab) {

        if (hierarchicalPhraseCache.containsKey(pattern)) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.finest("Cache has " + hierarchicalPhraseCache.size() + " entries, and did contain pattern: " + pattern.toString());
            }
            return hierarchicalPhraseCache.get(pattern);
        }

        if (logger.isLoggable(Level.FINEST)) {
            logger.finest("Cache has " + hierarchicalPhraseCache.size() + " entries, but did not contain pattern: " + pattern.toString());
        }

        // In the case of contiguous phrases,
        // the hpCache is essentially acting as Adam's Inverted Index,
        // because it stores the corpus-sorted indexes of each of the phrases.
        // It differs because it creates a HierarchicalPhrases object rather than just int[].
        Arrays.sort(startPositions);
        HierarchicalPhrases hierarchicalPhrases =
            new HierarchicalPhrases(pattern, startPositions, getCorpus().getSentenceIndices(startPositions));
        hierarchicalPhraseCache.put(pattern, hierarchicalPhrases);
        return hierarchicalPhrases;
    }

    /**
     * Finds an entire phrase in the whole suffix array.
     * See also the Javadoc for the Suffixes interface.
     *
     * @param phrase the phrase to search for
     * @return a two-element array holding the (inclusive) first and
     *         (inclusive) last suffix array indices that match the
     *         phrase, or <code>null</code> if the phrase is not found
     */
    public int[] findPhrase(Phrase phrase) {
        return findPhrase(phrase, 0, phrase.size());
    }

    /**
     * Finds a subphrase of a sentence within a bounded region of
     * the suffix array.
     * See also the Javadoc for the Suffixes interface.
     *
     * @param sentence the sentence/superphrase to draw the
     *                 search phrase from
     * @param phraseStart the start of the phrase in the sentence
     *                    (inclusive)
     * @param phraseEnd the end of the phrase in the sentence
     *                  (exclusive)
     * @param lowerBound first suffix array index to search (inclusive)
     * @param upperBound last suffix array index to search (inclusive)
     * @return a two-element array holding the (inclusive) first and
     *         (inclusive) last matching suffix array indices,
     *         or <code>null</code> if the phrase is not found
     */
    public int[] findPhrase(Phrase sentence, int phraseStart, int phraseEnd,
            int lowerBound, int upperBound) {

        int firstMatch = findPhraseBound(sentence, phraseStart, phraseEnd, lowerBound, upperBound, true);
        if (firstMatch < 0) {
            return null;
        }

        // The last match cannot lie before the first one,
        // so the second search starts at the first match.
        int lastMatch = findPhraseBound(sentence, phraseStart, phraseEnd, firstMatch, upperBound, false);

        return new int[] { firstMatch, lastMatch };
    }

    /**
     * Gets the corpus positions for an inclusive range of suffix
     * array indices, sorted in ascending order.
     * See also the Javadoc for the Suffixes interface.
     *
     * @param bounds two-element array holding the (inclusive) first
     *               and (inclusive) last suffix array indices,
     *               or <code>null</code>
     * @return sorted corpus positions for the given range, or an
     *         empty array if <code>bounds</code> is <code>null</code>
     */
    public int[] getAllPositions(int[] bounds) {

        if (bounds == null) {
            return new int[0];
        }

        int startInSuffixArray = bounds[0];
        int endInSuffixArray = bounds[1];
        int length = endInSuffixArray - startInSuffixArray + 1;

        int[] positions = new int[length];
        for (int i = 0; i < length; i++) {
            positions[i] = getCorpusIndex(startInSuffixArray + i);
        }
        Arrays.sort(positions);
        return positions;
    }

    /**
     * Gets the corpus upon which this suffix array is based.
     * See also the Javadoc for the Suffixes interface.
     *
     * @return the corpus supplied at construction time
     */
    public Corpus getCorpus() {
        return corpus;
    }

    /* See Javadoc for Suffixes interface.*/
    public abstract int getCorpusIndex(int suffixIndex);

    /**
     * Looks up a pattern in the hierarchical phrase cache.
     * No matches are computed by this method.
     * See also the Javadoc for the Suffixes interface.
     *
     * @param pattern Pattern to look up
     * @return whatever the cache returns for the pattern
     */
    public MatchedHierarchicalPhrases getMatchingPhrases(Pattern pattern) {
        return hierarchicalPhraseCache.get(pattern);
    }

    /**
     * Delegates to the corpus' <code>getSentenceIndex</code> method.
     * See also the Javadoc for the Suffixes interface.
     *
     * @param corpusIndex position in the corpus
     * @return the value reported by the corpus
     */
    public int getSentenceIndex(int corpusIndex) {
        return corpus.getSentenceIndex(corpusIndex);
    }

    /**
     * Delegates to the corpus' <code>getSentencePosition</code> method.
     * See also the Javadoc for the Suffixes interface.
     *
     * @param sentenceIndex index of a sentence in the corpus
     * @return the value reported by the corpus
     */
    public int getSentencePosition(int sentenceIndex) {
        return corpus.getSentencePosition(sentenceIndex);
    }

    /**
     * Gets the vocabulary of the underlying corpus.
     * See also the Javadoc for the Suffixes interface.
     *
     * @return the symbol table reported by the corpus
     */
    public SymbolTable getVocabulary() {
        return corpus.getVocabulary();
    }

    /**
     * Stores the given matches in the hierarchical phrase cache,
     * keyed by the pattern the matches report.
     * See also the Javadoc for the Suffixes interface.
     *
     * @param matchings matches to cache
     */
    public void cacheMatchingPhrases(MatchedHierarchicalPhrases matchings) {
        hierarchicalPhraseCache.put(matchings.getPattern(), matchings);
    }

    /* See Javadoc for Suffixes interface.*/
    public abstract int size();

    /**
     * Finds a phrase in the suffix array. The phrase is extracted
     * from the sentence given the start and end points.
     *
     * @param sentence the sentence/superphrase to draw the
     *                 search phrase from
     * @param phraseStart the start of the phrase in the sentence
     *                    (inclusive)
     * @param phraseEnd the end of the phrase in the sentence
     *                  (exclusive)
     * @return a tuple containing the (inclusive) start and the
     *         (inclusive) end bounds in the suffix array for
     *         the phrase, or <code>null</code> if the phrase
     *         is not found
     */
    protected int[] findPhrase(Phrase sentence, int phraseStart, int phraseEnd) {
        return findPhrase(sentence, phraseStart, phraseEnd, 0, size()-1);
    }

    /**
     * Finds the first or last occurrence of a phrase in the
     * suffix array, within a subset of the suffix array that
     * is bounded by suffixArrayStart and suffixArrayEnd. For
     * efficiency of looking up all subphrases in a sentence
     * we do not require that multiple int[]s be created for
     * each subphrase. Instead this method will look for the
     * subphrase within the sentence between phraseStart and
     * phraseEnd.
     *
     * @param sentence the sentence/superphrase in int
     *                 representation to draw the search
     *                 phrase from
     * @param phraseStart the start of the phrase in the sentence
     *                    (inclusive)
     * @param phraseEnd the end of the phrase in the sentence
     *                  (exclusive)
     * @param suffixArrayStart the point at which to start the
     *                         search in the suffix array
     *                         (inclusive)
     * @param suffixArrayEnd the end point in the suffix array
     *                       beyond which the search doesn't need
     *                       to take place (inclusive)
     * @param findFirst a flag that indicates whether we
     *                  should find the first or last occurrence
     *                  of the phrase
     * @return the suffix array index of the first (or last)
     *         occurrence of the phrase within the given bounds,
     *         or -1 if the phrase does not occur there
     */
    private int findPhraseBound(
            Phrase sentence,
            int phraseStart,
            int phraseEnd,
            int suffixArrayStart,
            int suffixArrayEnd,
            boolean findFirst
    ) {
        int low = suffixArrayStart;
        int high = suffixArrayEnd;

        // Do a binary search between the low and high points
        while (low <= high) {
            int mid = (low + high) >>> 1;
            int start = getCorpusIndex(mid);
            int diff = corpus.comparePhrase(start, sentence, phraseStart, phraseEnd);

            if (diff == 0) {
                // The phrase at mid matches. However, there might be multiple
                // matches in the corpus, so check the neighbor that lies in the
                // direction of the edge we are looking for.
                int neighbor = findFirst ? mid - 1 : mid + 1;

                if (neighbor >= suffixArrayStart && neighbor <= suffixArrayEnd) {
                    int nextDiff = corpus.comparePhrase(getCorpusIndex(neighbor), sentence, phraseStart, phraseEnd);
                    if (nextDiff == 0) {
                        // There's another equivalent phrase, so keep searching:
                        // lower when looking for the first occurrence,
                        // higher when looking for the last one.
                        diff = findFirst ? 1 : -1;
                    }
                }
            }

            if (diff < 0) {
                low = mid + 1;
            } else if (diff > 0) {
                high = mid - 1;
            } else {
                return mid; // this is the edge
            }
        }

        return -1; // key not found.
    }
}