/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.SortedSet;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;
import joshua.corpus.ContiguousPhrase;
import joshua.corpus.Corpus;
import joshua.corpus.Phrase;
import joshua.corpus.mm.MemoryMappedCorpusArray;
import joshua.corpus.suffix_array.mm.MemoryMappedSuffixArray;
import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.Cache;
import joshua.util.Counted;
import joshua.util.io.BinaryIn;
/**
* Represents the most frequent phrases in a corpus.
*
* @author Chris Callison-Burch
* @author Lane Schwartz
*/
public class FrequentPhrases {
/** Logger for this class. */
private static final Logger logger =
Logger.getLogger(FrequentPhrases.class.getName());
/** Suffix array in which frequent phrases are located. */
final Suffixes suffixes;
/**
* Stores the number of times a phrase occurred in the
* corpus.
* <p>
* The iteration order of this map should start with the
* most frequent phrase and end with the least frequent
* phrase stored in the map.
* <p>
* The key set for this map should be identical to the key
* set in the <code>ranks</code> map.
*/
LinkedHashMap<Phrase,Integer> frequentPhrases;
/** Maximum number of phrases of which this object is aware. */
short maxPhrases;
/** Maximum phrase length to consider. */
int maxPhraseLength;
/**
* Minimum number of words in a corpus
* which a nonterminal may represent.
*/
int minNonterminalSpan;
/**
* Maximum span (from first token to last token) in a corpus
* which a phrase may represent.
*/
int maxPhraseSpan;
/** Stores sorted lists of corpus locations for most frequent phrases. */
Map<Phrase,InvertedIndex> invertedIndices;
/**
* Stores the corpus locations of collocations
* of frequent phrases with other frequent phrases.
*/
List<HierarchicalPhrases> frequentCollocations;
/**
* Constructs data regarding the frequencies of the <em>n</em>
* most frequent phrases found in the corpus backed by the
* provided suffix array.
*
* @param suffixes Suffix array corresponding to a corpus.
* @param minFrequency The minimum frequency required to
* for a phrase to be considered frequent.
* @param maxPhrases The number of phrases to consider.
* @param maxPhraseLength Maximum phrase length to consider.
* @param maxContiguousPhraseLength Maximum phrase length to consider for a contiguous phrase
* @param maxPhraseSpan Maximum span (from first token to last token) in a corpus
* which a phrase may represent.
* @param minNonterminalSpan Minimum number of words in a corpus
* which a nonterminal may represent.
*/
public FrequentPhrases(
		Suffixes suffixes,
		int minFrequency,
		short maxPhrases,
		int maxPhraseLength,
		int maxContiguousPhraseLength,
		int maxPhraseSpan, int minNonterminalSpan) {
	// Record the configuration before computing any derived data,
	// because the computations below read these fields.
	this.suffixes = suffixes;
	this.maxPhrases = maxPhrases;
	this.maxPhraseLength = maxPhraseLength;
	this.maxPhraseSpan = maxPhraseSpan;
	this.minNonterminalSpan = minNonterminalSpan;
	// Derived data — order matters: both calculateInvertedIndices()
	// and countCollocations() consult the frequentPhrases map.
	this.frequentPhrases = getMostFrequentPhrases(suffixes, minFrequency, maxPhrases, maxContiguousPhraseLength);
	this.invertedIndices = calculateInvertedIndices();
	this.frequentCollocations = countCollocations();
}
/**
 * Constructs frequent phrase data by deserializing a previously
 * written binary file.
 *
 * @param suffixes Suffix array corresponding to a corpus.
 * @param binaryFilename Name of the binary file to read.
 * @throws IOException if the file cannot be read
 * @throws ClassNotFoundException if a serialized class cannot be resolved
 */
public FrequentPhrases(Suffixes suffixes, String binaryFilename) throws IOException, ClassNotFoundException {
	this.suffixes = suffixes;
	// Delegate all field population to readExternal.
	BinaryIn<InvertedIndex> input =
		new BinaryIn<InvertedIndex>(binaryFilename, InvertedIndex.class);
	readExternal(input);
}
//
// public short getMaxPhrases() {
// return this.maxPhrases;
// }
/**
 * Gets the suffix array in which the frequent phrases are located.
 *
 * @return the suffix array backing this object
 */
Suffixes getSuffixes() {
	return suffixes;
}
// /**
// * This method performs a one-pass computation of the
// * collocation of two frequent subphrases. It is used for
// * the precalculation of the translations of hierarchical
// * phrases which are problematic to calculate on the fly.
// * This procedure is described in "Hierarchical Phrase-Based
// * Translation with Suffix Arrays" by Adam Lopez.
// *
// * @param maxPhraseLength the maximum length of any phrase
// * in the phrases
// * @param windowSize the maximum allowable space between
// * phrases for them to still be considered
// * collocated
// * @param minNonterminalSpan Minimum span allowed for a nonterminal
// */
// public FrequentMatches getCollocations(
// int maxPhraseLength,
// int windowSize,
// short minNonterminalSpan
// ) {
//
//// FrequentMatches collocations = new FrequentMatches(this, maxPhraseLength, windowSize, minNonterminalSpan);
////
//// countCollocations(maxPhraseLength, windowSize, minNonterminalSpan);
////
//// collocations.histogramSort();
////
//// return collocations;
//
// throw new RuntimeException("Not currently supported");
//
// }
/**
* Gets the number of times any frequent phrase co-occurred
* with any frequent phrase within the given window.
* <p>
* This method performs a one-pass computation of the
* collocation of two frequent sub-phrases. It is used for
* the precalculation of the translations of hierarchical
* phrases which are problematic to calculate on the fly.
*
* This procedure is described in "Hierarchical Phrase-Based
* Translation with Suffix Arrays" by Adam Lopez.
*
* @param maxPhraseLength the maximum length of any phrase
* in the phrases
* @param windowSize the maximum allowable space between
* phrases for them to still be considered
* collocated
*
* @return The number of times any frequent phrase co-occurred
* with any frequent phrase within the given window.
*/
// int countCollocations(int maxPhraseLength, int windowSize, short minNonterminalSpan) {
// return countCollocations(maxPhraseLength, windowSize, minNonterminalSpan);
// }
/**
 * Gets the precomputed corpus locations of collocations of
 * frequent phrases with other frequent phrases.
 *
 * @return list of hierarchical phrases representing the collocations
 */
protected List<HierarchicalPhrases> getFrequentCollocations() {
	return frequentCollocations;
}
/**
* Gets the hierarchical phrases that represent
* the collocations of one frequent phrase with
* another frequent phrase.
* <p>
* This method performs a one-pass computation of the
* collocation of two frequent sub-phrases. It is used for
* the precalculation of the translations of hierarchical
* phrases which are problematic to calculate on the fly.
*
* This procedure is described in "Hierarchical Phrase-Based
* Translation with Suffix Arrays" by Adam Lopez.
*
* @return The number of times any frequent phrase co-occurred
* with any frequent phrase within the given window.
*/
private List<HierarchicalPhrases> countCollocations() {
// Accumulates (phrase1, phrase2, position1, position2) records as they are found.
PhrasePairCollocations collocations = new PhrasePairCollocations(suffixes.getCorpus());
// Sliding window of frequent phrases seen recently, paired element-for-element
// with their corpus start positions in the second list.
LinkedList<Phrase> phrasesInWindow = new LinkedList<Phrase>();
LinkedList<Integer> positions = new LinkedList<Integer>();
int sentenceNumber = 1;
// NOTE(review): initialized WITHOUT the -1 that the end-of-sentence update
// below applies (see "endOfSentence = ... -1" further down) — confirm which
// form is intended; the two look inconsistent.
int endOfSentence = suffixes.getSentencePosition(sentenceNumber);
if (logger.isLoggable(Level.FINEST)) logger.finest("END OF SENT: " + endOfSentence);
Corpus corpus = suffixes.getCorpus();
int endOfCorpus = corpus.size();
// Start at the beginning of the corpus...
for (int currentPosition : corpus.corpusPositions()) {
// Start with a phrase length of 1, at the current position...
for (int i = 1, endOfPhrase = currentPosition + i;
// ...ensure the phrase length isn't too long...
i <= maxPhraseLength &&
// ...and that the phrase doesn't extend past the end of the sentence...
endOfPhrase <= endOfSentence &&
// ...or past the end of the corpus
endOfPhrase <= endOfCorpus;
// ...then increment the phrase length and end of phrase marker.
i++, endOfPhrase = currentPosition + i) {
// Get the current phrase
Phrase phrase = new ContiguousPhrase(currentPosition, endOfPhrase, corpus);
if (logger.isLoggable(Level.FINEST)) logger.finest("Found phrase (" +currentPosition + ","+endOfPhrase+") " + phrase);
// If the phrase is one we care about (i.e. it is frequent)...
if (frequentPhrases.containsKey(phrase)) {
if (logger.isLoggable(Level.FINER)) logger.finer("\"" + phrase + "\" found at currentPosition " + currentPosition);
// Remember the phrase...
phrasesInWindow.add(phrase);
// ...and its starting position
positions.add(currentPosition);
}
} // end iterating over various phrase lengths
// check whether we're at the end of the sentence and dequeue...
if (currentPosition == endOfSentence) {
if (logger.isLoggable(Level.FINEST)) {
logger.finest("REACHED END OF SENT: " + currentPosition);
logger.finest("PHRASES: " + phrasesInWindow);
logger.finest("POSITIONS: " + positions);
}
// empty the whole queue: phrases may not collocate across a
// sentence boundary, so every pending pair is resolved now.
// for (int i = 0, n=phrasesInWindow.size(); i < n; i++) {
while (! phrasesInWindow.isEmpty()) {
processPhraseWindow(collocations, phrasesInWindow, positions);
}
// clear the queues (processPhraseWindow already drained them;
// this is a defensive no-op at this point)
phrasesInWindow.clear();
positions.clear();
// update the end of sentence marker
sentenceNumber++;
endOfSentence = suffixes.getSentencePosition(sentenceNumber)-1;
if (logger.isLoggable(Level.FINER)) logger.finer("END OF SENT: " + sentenceNumber + " at position " + endOfSentence);
} // Done processing end of sentence.
// check whether the initial elements are
// outside the window size...
if (! phrasesInWindow.isEmpty()) {
int position1 = positions.get(0);
// dequeue each leading element that has fallen out of the
// maxPhraseSpan window (or everything, at end of corpus) and
// record its collocations with the phrases still in the window...
while (! phrasesInWindow.isEmpty() &&
((currentPosition+1==endOfCorpus) ||
(currentPosition-position1 >= maxPhraseSpan))) {
processPhraseWindow(collocations, phrasesInWindow, positions);
// if (logger.isLoggable(Level.FINEST)) logger.finest("OUTSIDE OF WINDOW: " + position1 + " " + currentPosition + " " + maxPhraseSpan);
//
// Phrase phrase1 = phrasesInWindow.removeFirst();
// positions.removeFirst();
//
// Iterator<Phrase> phraseIterator = phrasesInWindow.iterator();
// Iterator<Integer> positionIterator = positions.iterator();
//
// int end1 = position1 + phrase1.size();
//
// for (int j = 0, n=phrasesInWindow.size(); j < n; j++) {
//
// Phrase phrase2 = phraseIterator.next();
// int position2 = positionIterator.next();
//
// if (position2-end1 >= minNonterminalSpan) {
// if (logger.isLoggable(Level.FINEST)) logger.finest("CASE2: " + phrase1 + "\t" + phrase2 + "\t" + position1 + "\t" + position2);
// collocations.record(phrase1, phrase2, position1, position2);
// } else if (logger.isLoggable(Level.FINEST)) {
// logger.finest("Not recording collocation: " + phrase1 + "\t" + phrase2 + "\t" + position1 + "\t" + position2);
// }
// }
// Re-read the head position for the next window check.
if (phrasesInWindow.size() > 0) {
position1 = positions.getFirst();
} else {
position1 = currentPosition;
}
}
}
} // end iterating over positions in the corpus
return collocations.getHierarchicalPhrases();
// return count;
}
/**
 * Dequeues the first phrase (and its start position) from the
 * window, and records its collocation with every remaining phrase
 * in the window that satisfies the gap and span constraints:
 * the gap between the first phrase's end and the second phrase's
 * start must be at least <code>minNonterminalSpan</code>, and the
 * total span from the first phrase's start to the second phrase's
 * end must not exceed <code>maxPhraseSpan</code>.
 *
 * @param collocations accumulator to which qualifying phrase pairs are recorded
 * @param phrasesInWindow queue of frequent phrases currently in the window;
 *                        its first element is removed by this method
 * @param positions queue of corpus start positions, parallel to
 *                  <code>phrasesInWindow</code>; its first element is removed
 */
private void processPhraseWindow(PhrasePairCollocations collocations,
		LinkedList<Phrase> phrasesInWindow,
		LinkedList<Integer> positions) {
	// Pop the head of both parallel queues.
	final Phrase firstPhrase = phrasesInWindow.removeFirst();
	final int firstStart = positions.removeFirst();
	final int firstEnd = firstStart + firstPhrase.size();
	// Walk the two parallel queues in lock step.
	Iterator<Integer> positionIterator = positions.iterator();
	for (Phrase secondPhrase : phrasesInWindow) {
		if (! positionIterator.hasNext()) {
			break;
		}
		final int secondStart = positionIterator.next();
		final int secondEnd = secondStart + secondPhrase.size();
		// The gap must fit a nonterminal, and the total span must fit a phrase.
		boolean gapOk = secondStart - firstEnd >= minNonterminalSpan;
		boolean spanOk = secondEnd - firstStart <= maxPhraseSpan;
		if (gapOk && spanOk) {
			if (logger.isLoggable(Level.FINEST)) logger.finest(" Recording collocation: " + firstPhrase + "\t" + secondPhrase + "\t" + firstStart + "\t" + secondStart);
			collocations.record(firstPhrase, secondPhrase, firstStart, secondStart);
		} else if (logger.isLoggable(Level.FINEST)) {
			logger.finest("Not recording collocation: "+ firstPhrase + "\t" + secondPhrase + "\t" + firstStart + "\t" + secondStart);
		}
	}
}
// /**
// * Returns an integer identifier for the collocation of
// * <code>phrase1</code> with <code>phrase2</code>.
// * <p>
// * If <code>rank1</code> is the rank of <code>phrase1</code>
// * and <code>rank2</code> is the rank of <code>phrase2</code>,
// * the identifier returned by this method is defined to be
// * <code>rank1*maxPhrases + rank2</code>.
// * <p>
// * As such, the range of possible values returned by this
// * method will be </code>0</code> through
// * <code>maxPhrases*maxPhrases-1</code>.
// *
// * @param phrase1 First phrase in a collocation.
// * @param phrase2 Second phrase in a collocation.
// * @return a unique integer identifier for the collocation.
// */
// private int getKey(LinkedHashMap<Phrase,Short> ranks, Phrase phrase1, Phrase phrase2) {
//
// short rank1 = ranks.get(phrase1);
// short rank2 = ranks.get(phrase2);
//
// int rank = rank1*maxPhrases + rank2;
//
// return rank;
// }
// /**
// * Builds a HashMap of all the occurrences of the phrase,
// * keying them based on the index of the sentence that they
// * occur in. Since we iterate over all occurrences of the
// * phrase, this method is linear with respect to the number
// * of occurrences, and should not be used for very frequent
// * phrases. This is part of the baseline method described
// * in Section 4.1 of Adam Lopez's EMNLP paper.
// */
// public HashMap<Integer,HashSet<Integer>> keyPositionsWithSentenceNumber(Phrase phrase) {
// // keys are the sentence numbers of partial matches
// HashMap<Integer,HashSet<Integer>> positionsKeyedWithSentenceNumber = new HashMap<Integer,HashSet<Integer>>(suffixes.size());
// int[] bounds = suffixes.findPhrase(phrase);
// if (bounds == null) return positionsKeyedWithSentenceNumber;
//
// int[] positions = suffixes.getAllPositions(bounds);
// for (int i = 0; i < positions.length; i++) {
// int sentenceNumber = suffixes.getSentenceIndex(positions[i]);
// HashSet<Integer> positionsInSentence = positionsKeyedWithSentenceNumber.get(sentenceNumber);
// if (positionsInSentence == null) {
// positionsInSentence = new HashSet<Integer>();
// }
// positionsInSentence.add(positions[i]);
// positionsKeyedWithSentenceNumber.put(sentenceNumber, positionsInSentence);
// }
// return positionsKeyedWithSentenceNumber;
// }
//===============================================================
// Protected
//===============================================================
//===============================================================
// Methods
//===============================================================
/**
* Calculates the frequency ranks of the provided phrases.
* <p>
* The iteration order of the <code>frequentPhrases</code>
* parameter is used by this method to determine the
* rank of each phrase. Specifically, the first phrase
* returned by the map's iterator is taken to be the most
* frequent phrase; the last phrase returned by the map's
* iterator is taken to be the least frequent phrase.
*
* @param frequentPhrases Map from phrase to frequency of
* that phrase in a corpus.
* @return the frequency ranks of the provided phrases
*/
protected LinkedHashMap<Phrase,Short> getRanks() {
	logger.fine("Calculating ranks of frequent phrases");
	// The iteration order of frequentPhrases is most-frequent-first,
	// so ranks are assigned in ascending order starting at 0.
	LinkedHashMap<Phrase,Short> ranks = new LinkedHashMap<Phrase,Short>(frequentPhrases.size());
	short nextRank = 0;
	for (Phrase phrase : frequentPhrases.keySet()) {
		ranks.put(phrase, nextRank);
		nextRank++;
	}
	logger.fine("Done calculating ranks");
	return ranks;
}
/**
* Calculates the most frequent phrases in the corpus.
* <p>
* Allows a threshold to be set for the minimum frequency
* to remember, as well as the maximum number of phrases.
* <p>
* This method implements the
* <code>print_LDIs_stack</code> function defined in
* section 2.5 of Yamamoto and Church.
*
* @param suffixes a suffix array for the corpus
* @param minFrequency the minimum frequency required to
* retain phrases
* @param maxPhrases the maximum number of phrases to
* return
* @param maxPhraseLength the maximum phrase length to
* consider
*
* @return A map from phrase to the number of times
* that phrase occurred in the corpus.
* The iteration order of the map will start
* with the most frequent phrase, and
* end with the least frequent calculated phrase.
*
* @see "Yamamoto and Church (2001), section 2.5"
*/
@SuppressWarnings("unchecked")
protected static LinkedHashMap<Phrase,Integer> getMostFrequentPhrases(
		Suffixes suffixes,
		int minFrequency,
		int maxPhrases,
		int maxPhraseLength
) {
	// Min-heap ordered by frequency: the head is the least frequent
	// phrase retained so far, and is evicted first when the heap
	// exceeds maxPhrases.
	PriorityQueue<Counted<Phrase>> frequentPhrases = new PriorityQueue<Counted<Phrase>>();
	// Frequencies whose phrases have already been evicted; any later
	// frequency class with such a frequency is skipped outright.
	Set<Integer> prunedFrequencies = new HashSet<Integer>();
	Corpus corpus = suffixes.getCorpus();
	FrequencyClasses frequencyClasses = getFrequencyClasses(suffixes);
	for (FrequencyClass frequencyClass : frequencyClasses.withMinimumFrequency(minFrequency)) {
		int frequency = frequencyClass.getFrequency();
		if (! prunedFrequencies.contains(frequency)) {
			int i = frequencyClass.getIntervalStart();
			int startOfPhrase = suffixes.getCorpusIndex(i);
			int sentenceNumber = suffixes.getSentenceIndex(startOfPhrase);
			int endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
			// A phrase may not cross a sentence boundary.
			int max = Math.min(maxPhraseLength, endOfSentence-startOfPhrase);
			if (logger.isLoggable(Level.FINER)) logger.finer("Max phrase length is " + max + " for " + frequencyClass.toString());
			for (int phraseLength : frequencyClass.validPhraseLengths(max)) {
				int endOfPhrase = startOfPhrase + phraseLength;
				Phrase phrase = new ContiguousPhrase(
						startOfPhrase,
						endOfPhrase,
						corpus);
				frequentPhrases.add(new Counted<Phrase>(phrase, frequency));
				if (frequentPhrases.size() > maxPhrases) {
					// Over capacity: evict the least frequent phrase and
					// remember its frequency so ties are pruned consistently.
					Counted<Phrase> pruned = frequentPhrases.poll();
					int prunedFrequency = pruned.getCount();
					prunedFrequencies.add(prunedFrequency);
					// BUG FIX: this previously logged via logger.info() inside
					// an isLoggable(Level.FINER) guard; log at the level
					// actually being checked.
					if (logger.isLoggable(Level.FINER)) logger.finer("Pruned " + pruned.getElement() + " with frequency " + prunedFrequency);
					break;
				}
			}
		} else if (logger.isLoggable(Level.FINER)) {
			logger.finer("Skipping pruned frequency " + frequency);
		}
	}
	// Evict any surviving phrases whose frequency matches a pruned
	// frequency, so that no frequency class is partially retained.
	while (! frequentPhrases.isEmpty() && prunedFrequencies.contains(frequentPhrases.peek().getCount())) {
		Counted<Phrase> pruned = frequentPhrases.poll();
		if (logger.isLoggable(Level.FINER)) logger.finer("Pruned " + pruned.getElement() + " " + pruned.getCount());
	}
	// Drain the min-heap into an array back-to-front, so that the
	// result map iterates most-frequent-first.
	Counted<Phrase>[] reverse = new Counted[frequentPhrases.size()];
	{
		int i = frequentPhrases.size()-1;
		while (! frequentPhrases.isEmpty()) {
			reverse[i] = frequentPhrases.poll();
			i -= 1;
		}
	}
	LinkedHashMap<Phrase,Integer> results = new LinkedHashMap<Phrase,Integer>(reverse.length);
	for (Counted<Phrase> countedPhrase : reverse) {
		results.put(countedPhrase.getElement(), countedPhrase.getCount());
	}
	return results;
}
/**
* Calculates the frequencies for
* all phrase frequency classes in the corpus.
* <p>
* This method is implements the
* <code>print_LDIs_stack</code> function defined in
* section 2.5 of Yamamoto and Church.
*
* @param suffixes a suffix array for the corpus
* @return A list of term frequency classes
*
* @see "Yamamoto and Church (2001), section 2.5"
*/
protected static FrequencyClasses getFrequencyClasses(Suffixes suffixes) {
// calculate the longest common prefix delimited intervals...
int[] longestCommonPrefixes = calculateLongestCommonPrefixes(suffixes);
// Construct an initially empty object to hold class frequency information
FrequencyClasses frequencyClasses = new FrequencyClasses(longestCommonPrefixes);
// The two parallel stacks below mirror the stack_i / stack_k arrays
// of the Yamamoto & Church print_LDIs_stack pseudocode; pushing and
// popping both together replaces the explicit stack pointer sp.
// stack_i <-- an integer array for the stack of left edges, i
Stack<Integer> startIndices = new Stack<Integer>();
// stack_k <-- an integer array for the stack of representatives, k
Stack<Integer> shortestInteriorLCPIndices = new Stack<Integer>();
// stack_i[0] <-- 0
startIndices.push(0);
// stack_k[0] <-- 0
shortestInteriorLCPIndices.push(0);
// sp <-- 1 (a stack pointer)
// for j <-- 0,1,2, ..., N-1
for (int j = 0, size=suffixes.size(); j < size; j++) {
// Output an lcp-delimited interval <j,j> with tf=1
// (trivial interval i==j, frequency=1)
if (logger.isLoggable(Level.FINEST)) logger.finest("Output trivial interval <"+j+","+j+"> with tf=1");
frequencyClasses.record(j);
//frequencyClasses.record(j, j, Integer.MAX_VALUE, 1);
// While lcp[j+1] < lcp[stack_k[sp-1]] do
// (pop and emit every interval that cannot extend past j)
while (longestCommonPrefixes[j+1] < longestCommonPrefixes[shortestInteriorLCPIndices.peek()]) {
int i = startIndices.pop();
int k = shortestInteriorLCPIndices.pop();
// LBL: longest bounding lcp; SIL: shortest interior lcp (see paper, sec. 2.5)
int longestBoundingLCP = Math.max(longestCommonPrefixes[i], longestCommonPrefixes[j+1]);
int shortestInteriorLCP = longestCommonPrefixes[k];
// Output an interval <i,j> with tf=j-i+1, if it is lcp-delimited
// (non-trivial interval)
// sp <-- sp - 1
if (longestBoundingLCP < shortestInteriorLCP) {
int frequency = j-i+1;
if (logger.isLoggable(Level.FINEST)) logger.finest("Output interval <"+i+","+j+"> with k="+k+" and tf="+j+"-"+i+"+1="+(j-i+1));
frequencyClasses.record(i, j, k, frequency);
}
}
// stack_i[sp] <-- stack_k[sp-1]
startIndices.push(shortestInteriorLCPIndices.peek());
// stack_k[sp] <-- j+1
shortestInteriorLCPIndices.push(j+1);
// sp <-- sp + 1
}
return frequencyClasses;
}
/**
 * Caches, in the suffix array, the corpus locations of the frequent
 * collocations and of each frequent contiguous phrase's inverted index.
 * <p>
 * When FINE logging is enabled, also logs a tab-separated dump of
 * every cached corpus location per phrase pattern.
 */
public void cacheInvertedIndices() {
	// Cache the locations of each frequent phrase collocation.
	for (HierarchicalPhrases phrases : frequentCollocations) {
		suffixes.cacheMatchingPhrases(phrases);
	}
	// Cache the sorted corpus locations of each frequent contiguous phrase.
	for (Map.Entry<Phrase, InvertedIndex> entry : invertedIndices.entrySet()) {
		Pattern pattern = new Pattern(entry.getKey());
		InvertedIndex list = entry.getValue();
		HierarchicalPhrases phraseLocations =
				new HierarchicalPhrases(pattern, list.corpusLocations, list.sentenceNumbers);
		suffixes.cacheMatchingPhrases(phraseLocations);
		// NOTE: a leftover debugging block that special-cased the
		// pattern "[.]" was removed here; it only emitted a log line.
		if (logger.isLoggable(Level.FINE)) {
			logger.fine("Cached sorted locations for " + pattern);
			// Dump every cached location for this pattern, one per line.
			StringBuilder s = new StringBuilder();
			String patternString = pattern.toString();
			for (Integer i : list.corpusLocations) {
				s.append(patternString);
				s.append('\t');
				s.append(i);
				s.append('\n');
			}
			logger.fine(s.toString());
		}
	}
}
/**
* Constructs an auxiliary array that stores longest common
* prefixes. The length of the array is the corpus size+1.
* Each elements lcp[i] indicates the length of the common
* prefix between two positions s[i-1] and s[i] in the
* suffix array.
*
* @param suffixes Suffix array
* @return Longest common prefix array
*/
/**
 * Constructs an auxiliary array that stores longest common
 * prefixes. The length of the array is the corpus size+1.
 * Each element lcp[i] indicates the length of the common
 * prefix between two positions s[i-1] and s[i] in the
 * suffix array.
 *
 * @param suffixes Suffix array
 * @return Longest common prefix array
 */
protected static int[] calculateLongestCommonPrefixes(Suffixes suffixes) {
	final int size = suffixes.size();
	final Corpus corpus = suffixes.getCorpus();
	final int[] lcp = new int[size + 1];
	// Compare each suffix with its predecessor in suffix-array order.
	for (int i = 1; i < size; i++) {
		final int current = suffixes.getCorpusIndex(i);
		final int previous = suffixes.getCorpusIndex(i - 1);
		// Count how many leading words the two suffixes share,
		// stopping at the end of the corpus or at the comparison cap.
		int matched = 0;
		while (current + matched < size
				&& previous + matched < size
				&& corpus.getWordID(current + matched) == corpus.getWordID(previous + matched)
				&& matched <= Suffixes.MAX_COMPARISON_LENGTH) {
			matched++;
		}
		lcp[i] = matched;
	}
	// By definition, the first and last elements of lcp are 0.
	lcp[0] = 0;
	lcp[size] = 0;
	return lcp;
}
// /**
// * This method extracts phrases which reach the specified
// * minimum frequency. It uses the equivalency classes for
// * substrings in the interval i-j in the suffix array, as
// * defined in section 2.3 of the the Yamamoto and Church
// * CL article. This is a helper function for the
// * getMostFrequentPhrases method.
// *
// * @param suffixes Suffix array
// * @param longestCommonPrefixes Longest common prefix array
// * @param i Index specifying a starting range in the suffix array
// * @param j Index specifying an ending range in the suffix array
// * @param k Index specifying a representative value of the range,
// * such that i < k <= j, and such that longestCommonPrefixes[k]
// * is the shortest interior longest common prefix of the range
// * (see section 2.5 of Yamamoto and Church)
// * @param phrases
// * @param frequencies
// * @param minFrequency
// * @param maxPhrases
// * @param maxPhraseLength
// * @param comparator
// */
// protected static void recordPhraseFrequencies(
// Suffixes suffixes,
// int[] longestCommonPrefixes,
// int i,
// int j,
// int k,
// List<Phrase> phrases,
// List<Integer> frequencies,
// int minFrequency,
// int maxPhrases,
// int maxPhraseLength,
// Comparator<Integer> comparator
// ) {
//
// if (i==j) {
// logger.info("Output trivial interval <"+j+","+j+"> with k="+k+" and tf=1");
// } else {
//
// int LBL = Math.max(longestCommonPrefixes[i], longestCommonPrefixes[j+1]);
// int SIL = longestCommonPrefixes[k];
//
// if (LBL < SIL) {
// logger.info("Output interval <"+i+","+j+"> with k="+k+" and tf="+j+"-"+i+"+1="+(j-i+1));
// } else {
// logger.info("Interval <"+i+","+j+"> is NOT lcp-delimited, because " + LBL + " not < " +SIL);
// }
// }
// }
/**
 * Builds an inverted index (sorted corpus positions plus sentence
 * numbers) for every frequent phrase, by scanning each corpus
 * position and enumerating every phrase of length 1 through
 * <code>maxPhraseLength</code> that starts there and stays within
 * its sentence.
 *
 * @return map from each frequent phrase found in the corpus to its
 *         inverted index of corpus locations
 */
private Map<Phrase,InvertedIndex> calculateInvertedIndices() {
	Map<Phrase,InvertedIndex> invertedIndices = new HashMap<Phrase,InvertedIndex>(frequentPhrases.keySet().size());
	Corpus corpus = suffixes.getCorpus();
	int endOfCorpus = corpus.size();
	if (logger.isLoggable(Level.FINE)) logger.fine("Corpus has size " + endOfCorpus);
	int sentenceNumber = 0;
	int endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
	// NOTE: leftover debugging machinery (a trackMe flag toggled by an
	// easter-egg phrase check) was removed here; it only affected logging.
	// Start at the beginning of the corpus...
	for (int currentPosition : corpus.corpusPositions()) {
		// Start with a phrase length of 1, at the current position...
		for (int i = 1, endOfPhrase = currentPosition + i;
				// ...ensure the phrase length isn't too long...
				i <= maxPhraseLength &&
				// ...and that the phrase doesn't extend past the end of the sentence...
				endOfPhrase <= endOfSentence &&
				// ...or past the end of the corpus
				endOfPhrase <= endOfCorpus;
				// ...then increment the phrase length and end of phrase marker.
				i++, endOfPhrase = currentPosition + i) {
			// Get the current phrase
			Phrase phrase = new ContiguousPhrase(currentPosition, endOfPhrase, corpus);
			if (logger.isLoggable(Level.FINE)) logger.fine("In sentence " + sentenceNumber + " found phrase (" +currentPosition + ","+endOfPhrase+") " + phrase);
			// If the phrase is one we care about...
			if (frequentPhrases.containsKey(phrase)) {
				if (logger.isLoggable(Level.FINER)) logger.finer("\"" + phrase + "\" found at currentPosition " + currentPosition);
				// Single lookup instead of containsKey+get+put on the index map.
				InvertedIndex invertedIndex = invertedIndices.get(phrase);
				if (invertedIndex == null) {
					invertedIndex = new InvertedIndex();
					invertedIndices.put(phrase, invertedIndex);
				}
				if (logger.isLoggable(Level.FINE)) logger.fine("Recording position " + currentPosition + " in sentence " + sentenceNumber + " for phrase " + phrase);
				invertedIndex.record(currentPosition, sentenceNumber);
			}
		} // end iterating over various phrase lengths
		// Advance the sentence boundary once the current sentence ends.
		if (currentPosition+1 == endOfSentence) {
			sentenceNumber += 1;
			endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
		}
	}
	return invertedIndices;
}
/* See Javadoc for java.io.Externalizable interface. */
/**
 * Reads this object's state from the provided stream, in exactly
 * the field order produced by {@link #writeExternal(ObjectOutput)}:
 * maxPhrases, maxPhraseLength, the frequent-phrase map, the
 * inverted indices, and finally the frequent collocations.
 * <p>
 * See Javadoc for the java.io.Externalizable interface.
 *
 * @param in stream to read from
 * @throws IOException if the stream cannot be read
 * @throws ClassNotFoundException if a serialized class cannot be resolved
 */
public void readExternal(ObjectInput in) throws IOException,
		ClassNotFoundException {
	boolean loggingFiner = logger.isLoggable(Level.FINER);
	SymbolTable vocab = suffixes.getVocabulary();
	// Read in the maximum number of phrases of which this object is aware.
	this.maxPhrases = in.readShort();
	if (loggingFiner) logger.finer(" Read: maxPhrases="+maxPhrases);
	// Read in the maximum phrase length to consider.
	this.maxPhraseLength = in.readInt();
	if (loggingFiner) logger.finer(" Read: maxPhraseLength="+maxPhraseLength);
	// Read in the count of frequent phrase types
	int frequentPhrasesSize = in.readInt();
	if (loggingFiner) logger.finer(" Read: frequentPhrases.size()="+frequentPhrasesSize);
	// Read in the frequentPhrases map
	this.frequentPhrases = new LinkedHashMap<Phrase,Integer>();
	for (int i=0; i<frequentPhrasesSize; i++) {
		// Read in number of times the phrase is found in the corpus
		// (comment previously said "Write out", which was wrong here)
		int count = in.readInt();
		if (loggingFiner) logger.finer(" Read: phraseCount="+count);
		// Read in the number of tokens in the phrase
		int tokenCount = in.readInt();
		if (loggingFiner) logger.finer(" Read: wordIDs.length="+tokenCount);
		int[] wordIDs = new int[tokenCount];
		for (int j=0; j<tokenCount; j++) {
			int wordID = in.readInt();
			if (loggingFiner) logger.finer(" Read: wordIDs["+j+"]="+wordID);
			wordIDs[j] = wordID;
		}
		BasicPhrase phrase = new BasicPhrase(wordIDs, vocab);
		this.frequentPhrases.put(phrase, count);
	}
	// Read in number of inverted indices
	int invertedIndicesCount = in.readInt();
	if (loggingFiner) logger.finer(" Read: invertedIndices.size()="+invertedIndicesCount);
	// Read in inverted indices
	this.invertedIndices = new HashMap<Phrase,InvertedIndex>(frequentPhrases.keySet().size());
	for (int i=0; i<invertedIndicesCount; i++) {
		// Read in the number of tokens in the phrase
		int tokenCount = in.readInt();
		if (loggingFiner) logger.finer(" Read: wordIDs.length="+tokenCount);
		int[] wordIDs = new int[tokenCount];
		for (int j=0; j<tokenCount; j++) {
			wordIDs[j] = in.readInt();
			if (loggingFiner) logger.finer(" Read: wordID["+j+"]="+wordIDs[j]);
		}
		// Reconstruct phrase
		BasicPhrase phrase = new BasicPhrase(wordIDs, vocab);
		// Read in inverted index
		// NOTE: a leftover debugging block that special-cased the phrase
		// "it" was removed here; it only emitted a log line.
		InvertedIndex invertedIndex = new InvertedIndex();
		if (loggingFiner) logger.finer(" Read: about to InvertedIndex");
		invertedIndex.readExternal(in);
		this.invertedIndices.put(phrase, invertedIndex);
	}
	// Read collocations
	int frequentCollocationsSize = in.readInt();
	this.frequentCollocations = new ArrayList<HierarchicalPhrases>(frequentCollocationsSize);
	for (int i=0; i<frequentCollocationsSize; i++) {
		// Read the pattern
		int wordsLength = in.readInt();
		int[] words = new int[wordsLength];
		for (int j=0; j<wordsLength; j++) {
			words[j]=in.readInt();
		}
		Pattern pattern = new Pattern(vocab, words);
		// Next, read the sentence numbers;
		// there should be one per corpus match
		int[] sentenceNumber = new int[in.readInt()];
		for (int j=0, n=sentenceNumber.length; j<n; j++) {
			sentenceNumber[j] = in.readInt();
		}
		// Next, read the start index of each corpus match;
		// there should be one per corpus match
		int[] terminalSequenceStartIndices = new int[in.readInt()];
		for (int j=0, n=terminalSequenceStartIndices.length; j<n; j++) {
			terminalSequenceStartIndices[j] = in.readInt();
		}
		HierarchicalPhrases phrases = new HierarchicalPhrases(pattern, terminalSequenceStartIndices, sentenceNumber);
		this.frequentCollocations.add(phrases);
	}
}
/**
 * Serializes this object's data to the given stream.
 * <p>
 * Values are written in the order expected by {@code readExternal}:
 * header values ({@code maxPhrases}, {@code maxPhraseLength}), the
 * frequent-phrase map, the inverted indices, and finally the frequent
 * collocations.
 *
 * @param out stream to which this object's data is written
 * @throws IOException if an error occurs while writing to the stream
 */
public void writeExternal(ObjectOutput out) throws IOException {

	boolean loggingFiner = logger.isLoggable(Level.FINER);

	// Write out maximum number of phrases of which this object is aware.
	out.writeShort(maxPhrases);
	// Log at FINER to match the guard above. (Previously these three header
	// values were logged at FINEST, which the FINER guard would suppress
	// whenever the logger level was exactly FINER.)
	if (loggingFiner) logger.finer("Wrote: maxPhrases="+maxPhrases);

	// Write out maximum phrase length to consider.
	out.writeInt(maxPhraseLength);
	if (loggingFiner) logger.finer("Wrote: maxPhraseLength="+maxPhraseLength);

	// Write out count of frequent phrase types.
	out.writeInt(frequentPhrases.size());
	if (loggingFiner) logger.finer("Wrote: frequentPhrases.size()="+frequentPhrases.size());

	// Write out frequentPhrases map: for each phrase, its corpus count,
	// then its token count, then each of its word IDs.
	for (Map.Entry<Phrase, Integer> entry : frequentPhrases.entrySet()) {
		Phrase phrase = entry.getKey();
		int phraseCount = entry.getValue();
		int[] wordIDs = phrase.getWordIDs();

		// Write out number of times the phrase is found in the corpus.
		out.writeInt(phraseCount);
		if (loggingFiner) logger.finer("Wrote: phraseCount="+phraseCount);

		// Write out the number of tokens in the phrase.
		out.writeInt(wordIDs.length);
		if (loggingFiner) logger.finer("Wrote: wordIDs.length="+wordIDs.length);

		// Write out each token in the phrase.
		int index = 0;
		for (int wordID : wordIDs) {
			out.writeInt(wordID);
			if (loggingFiner) logger.finer("Wrote: wordIDs["+index+"]="+wordID);
			index+=1;
		}
	}

	// Write out number of inverted indices.
	out.writeInt(invertedIndices.size());
	if (loggingFiner) logger.finer("Wrote: invertedIndices.size()="+invertedIndices.size());

	// Write out inverted indices: for each entry, the key's token count and
	// word IDs, then the inverted index itself.
	for (Map.Entry<Phrase, InvertedIndex> entry : invertedIndices.entrySet()) {
		Pattern pattern = new Pattern(entry.getKey());
		int[] wordIDs = pattern.getWordIDs();

		// Write out number of tokens in the pattern.
		out.writeInt(wordIDs.length);
		if (loggingFiner) logger.finer("Wrote: wordIDs.length="+wordIDs.length);

		// Write out each token in the phrase.
		int index = 0;
		for (int wordID : wordIDs) {
			out.writeInt(wordID);
			if (loggingFiner) logger.finer("Wrote: wordID["+index+"]="+wordID);
			index+=1;
		}

		// Write out inverted index for this phrase.
		InvertedIndex list = entry.getValue();
		if (loggingFiner) logger.finer("Wrote: about to InvertedIndex");
		// NOTE(review): this writes the index via writeObject, while the read
		// path calls InvertedIndex.readExternal directly. That is only
		// symmetric if the ObjectOutput implementation delegates writeObject
		// to writeExternal without any stream framing -- confirm against the
		// concrete ObjectOutput used for serialization.
		out.writeObject(list);
	}

	// Write collocations: for each HierarchicalPhrases, its pattern's word
	// IDs, its sentence numbers, and its terminal-sequence start indices,
	// each as a length-prefixed sequence of ints.
	out.writeInt(frequentCollocations.size());
	for (HierarchicalPhrases phrases : frequentCollocations) {

		// Write the pattern.
		int[] words = phrases.pattern.getWordIDs();
		out.writeInt(words.length);
		for (int token : words) {
			out.writeInt(token);
		}

		// Write the sentence number of each corpus match.
		out.writeInt(phrases.sentenceNumber.length);
		for (int n : phrases.sentenceNumber) {
			out.writeInt(n);
		}

		// Write the start index of each corpus match.
		out.writeInt(phrases.terminalSequenceStartIndices.length);
		for (int startIndex : phrases.terminalSequenceStartIndices) {
			out.writeInt(startIndex);
		}
	}
}
/**
 * Returns a listing of the frequent phrases, one per line, where each
 * line holds the phrase's frequency count, a tab, and the phrase itself.
 * The count column is padded to the width of the first count encountered.
 */
public String toString() {

	StringBuilder builder = new StringBuilder();
	String countFormat = null;

	for (Map.Entry<Phrase, Integer> entry : frequentPhrases.entrySet()) {

		Integer count = entry.getValue();

		// Derive the numeric column width from the first count seen.
		if (countFormat == null) {
			countFormat = "%1$" + count.toString().length() + "d";
		}

		builder.append(String.format(countFormat, count))
		       .append('\t')
		       .append(entry.getKey().toString())
		       .append('\n');
	}

	return builder.toString();
}
/**
 * Private helper method for performing fast sorted-list intersection.
 * <p>
 * Recursively binary-searches the median query in the sorted data list,
 * adds it to the result on a hit, and then intersects the left and right
 * halves of both lists.
 *
 * @param <E> element type; must be mutually comparable
 * @param sortedData    data list, sorted in natural order
 * @param sortedQueries query list, sorted in natural order
 * @param result        set into which every query found in the data is placed
 */
private static <E extends Comparable<E>> void fastIntersect(List<E> sortedData, List<E> sortedQueries, SortedSet<E> result) {

	// Guard: nothing to intersect (also protects the get() below).
	if (sortedData.isEmpty() || sortedQueries.isEmpty()) {
		return;
	}

	int medianQueryIndex = sortedQueries.size() / 2;
	E medianQuery = sortedQueries.get(medianQueryIndex);

	int index = Collections.binarySearch(sortedData, medianQuery);

	int leftDataEnd;    // exclusive end of the left data sublist
	int rightDataStart; // inclusive start of the right data sublist

	if (index >= 0) {
		result.add(medianQuery);
		// The matched element need not be revisited by either recursion.
		leftDataEnd = index;
		rightDataStart = index + 1;
	} else {
		// binarySearch returns -(insertionPoint) - 1 on a miss; recover the
		// insertion point. (The previous conversion, (-1 * index) + 1,
		// evaluated to insertionPoint + 2, corrupting the recursion bounds.)
		int insertionPoint = -(index + 1);
		// No element was matched, so the element at the insertion point must
		// remain visible to the right-hand recursion.
		leftDataEnd = insertionPoint;
		rightDataStart = insertionPoint;
	}

	// Intersect queries below the median with data below the split point.
	if (leftDataEnd > 0 && medianQueryIndex > 0) {
		fastIntersect(sortedData.subList(0, leftDataEnd), sortedQueries.subList(0, medianQueryIndex), result);
	}

	// Intersect queries above the median with data at/above the split point.
	if (rightDataStart < sortedData.size() && medianQueryIndex + 1 < sortedQueries.size()) {
		fastIntersect(sortedData.subList(rightDataStart, sortedData.size()), sortedQueries.subList(medianQueryIndex + 1, sortedQueries.size()), result);
	}
}
//===============================================================
// Static
//===============================================================
//===============================================================
// Inner classes
//===============================================================
//===============================================================
// Main method
//===============================================================
/**
 * Command-line entry point: builds (or memory-maps) a vocabulary, corpus
 * array, and suffix array, computes the most frequent phrases, caches
 * their inverted indices, and prints frequent collocations sorted by
 * descending match count.
 * <p>
 * Usage: one argument (a plain-text corpus file) or three arguments
 * (binary vocabulary, binary corpus, binary suffix array).
 *
 * @param args command-line arguments; see usage above
 * @throws IOException if a file cannot be read
 * @throws ClassNotFoundException if binary deserialization fails
 */
public static void main(String[] args) throws IOException, ClassNotFoundException {

	Vocabulary symbolTable;
	Corpus corpusArray;
	Suffixes suffixArray;
	FrequentPhrases frequentPhrases;

	if (args.length == 1) {

		// Build everything in memory from a plain-text corpus file.
		String corpusFileName = args[0];

		logger.info("Constructing vocabulary from file " + corpusFileName);
		symbolTable = new Vocabulary();
		int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);

		logger.info("Constructing corpus array from file " + corpusFileName);
		corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);

		logger.info("Constructing suffix array from file " + corpusFileName);
		suffixArray = new SuffixArray(corpusArray, Cache.DEFAULT_CAPACITY);

	} else if (args.length == 3) {

		// Load previously serialized binary representations.
		String binarySourceVocabFileName = args[0];
		String binaryCorpusFileName = args[1];
		String binarySuffixArrayFileName = args[2];

		if (logger.isLoggable(Level.INFO)) logger.info("Constructing source language vocabulary from binary file " + binarySourceVocabFileName);
		ObjectInput in = BinaryIn.vocabulary(binarySourceVocabFileName);
		symbolTable = new Vocabulary();
		symbolTable.readExternal(in);

		logger.info("Constructing corpus array from file " + binaryCorpusFileName);
		if (logger.isLoggable(Level.INFO)) logger.info("Constructing memory mapped source language corpus array.");
		corpusArray = new MemoryMappedCorpusArray(symbolTable, binaryCorpusFileName);

		logger.info("Constructing suffix array from file " + binarySuffixArrayFileName);
		suffixArray = new MemoryMappedSuffixArray(binarySuffixArrayFileName, corpusArray, Cache.DEFAULT_CAPACITY);

	} else {

		System.err.println("Usage: java " + SuffixArray.class.getName() + " source.vocab source.corpus source.suffixes");
		// Exit with a non-zero status: reaching this branch is a usage error.
		// (Previously this exited with status 0, signaling success.)
		System.exit(1);

		// Unreachable at runtime, but required for definite assignment:
		// the compiler does not know that System.exit never returns.
		symbolTable = null;
		corpusArray = null;
		suffixArray = null;
	}

	int minFrequency = 0;
	short maxPhrases = 100;
	int maxPhraseLength = 10;
	int maxPhraseSpan = 10;
	short minNonterminalSpan = 2;

	logger.info("Calculating " + maxPhrases + " most frequent phrases");
	// NOTE(review): maxPhraseLength is passed twice; one of the two
	// occurrences may be intended as a different parameter -- confirm
	// against the FrequentPhrases constructor signature.
	frequentPhrases = new FrequentPhrases(suffixArray, minFrequency, maxPhrases, maxPhraseLength, maxPhraseLength, maxPhraseSpan, minNonterminalSpan);

	logger.info("Frequent phrases: \n" + frequentPhrases.toString());

	logger.info("Caching inverted indices");
	frequentPhrases.cacheInvertedIndices();

	logger.info("Calculating collocations for most frequent phrases");
	List<HierarchicalPhrases> collocations = frequentPhrases.getFrequentCollocations();

	// Order collocations by descending number of corpus matches.
	Comparator<HierarchicalPhrases> compare = new Comparator<HierarchicalPhrases>() {
		public int compare(HierarchicalPhrases o1, HierarchicalPhrases o2) {
			// Use the size() accessor for both operands. (The original mixed
			// the field o1.size with the accessor o2.size(); this assumes the
			// two agree -- TODO confirm.)
			Integer size1 = o1.size();
			Integer size2 = o2.size();
			// Reversed operand order yields a descending sort.
			return size2.compareTo(size1);
		}
	};
	Collections.sort(collocations, compare);

	for (HierarchicalPhrases locations : collocations) {
		logger.info(locations.toString());
	}
}
}