// Package joshua.corpus.suffix_array
//
// Source Code of joshua.corpus.suffix_array.FrequentPhrases

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;

import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.SortedSet;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.corpus.ContiguousPhrase;
import joshua.corpus.Corpus;
import joshua.corpus.Phrase;
import joshua.corpus.mm.MemoryMappedCorpusArray;
import joshua.corpus.suffix_array.mm.MemoryMappedSuffixArray;
import joshua.corpus.vocab.SymbolTable;
import joshua.corpus.vocab.Vocabulary;
import joshua.util.Cache;
import joshua.util.Counted;
import joshua.util.io.BinaryIn;

/**
* Represents the most frequent phrases in a corpus.
*
* @author Chris Callison-Burch
* @author Lane Schwartz
*/
public class FrequentPhrases {

  /** Logger for this class. */
  private static final Logger logger =
    Logger.getLogger(FrequentPhrases.class.getName());
 
  /** Suffix array in which frequent phrases are located. */
  final Suffixes suffixes;
 
  /**
   * Stores the number of times a phrase occurred in the
   * corpus.
   * <p>
   * The iteration order of this map should start with the
   * most frequent phrase and end with the least frequent
   * phrase stored in the map.
   * <p>
   * The key set for this map should be identical to the key
   * set in the <code>ranks</code> map.
   * <p>
   * Populated by {@link #getMostFrequentPhrases} in the constructor.
   */
  LinkedHashMap<Phrase,Integer> frequentPhrases;
 
  /** Maximum number of phrases of which this object is aware. */
  short maxPhrases;
 
  /** Maximum phrase length to consider. */
  int maxPhraseLength;
 
  /**
   * Minimum number of words in a corpus
   * which a nonterminal may represent.
   */
  int minNonterminalSpan;
 
  /**
   * Maximum span (from first token to last token) in a corpus
   * which a phrase may represent.
   */
  int maxPhraseSpan;
 
  /**
   * Stores sorted lists of corpus locations for most frequent phrases.
   * Built by calculateInvertedIndices() in the constructor.
   */
  Map<Phrase,InvertedIndex> invertedIndices;
 
  /**
   * Stores the corpus locations of collocations
   * of frequent phrases with other frequent phrases.
   * Built by countCollocations() in the constructor.
   */
  List<HierarchicalPhrases> frequentCollocations;
 
  /**
   * Constructs data regarding the frequencies of the <em>n</em>
   * most frequent phrases found in the corpus backed by the
   * provided suffix array.
   *
   * @param suffixes   Suffix array corresponding to a corpus.
   * @param minFrequency The minimum frequency required to
   *                   for a phrase to be considered frequent.
   * @param maxPhrases The number of phrases to consider.
   * @param maxPhraseLength Maximum phrase length to consider.
   * @param maxContiguousPhraseLength Maximum phrase length to consider for a contiguous phrase
   * @param maxPhraseSpan Maximum span (from first token to last token) in a corpus
   *                      which a phrase may represent.
   * @param minNonterminalSpan Minimum number of words in a corpus
   *                           which a nonterminal may represent.
   */
  public FrequentPhrases(
      Suffixes suffixes,
      int minFrequency,
      short maxPhrases,
      int maxPhraseLength,
      int maxContiguousPhraseLength,
      int maxPhraseSpan, int minNonterminalSpan) {
   
    this.maxPhrases = maxPhrases;
    this.maxPhraseLength = maxPhraseLength;
    this.minNonterminalSpan = minNonterminalSpan;
    this.maxPhraseSpan = maxPhraseSpan;
   
    this.suffixes = suffixes;
    this.frequentPhrases = getMostFrequentPhrases(suffixes, minFrequency, maxPhrases, maxContiguousPhraseLength);
    this.invertedIndices = calculateInvertedIndices();
    this.frequentCollocations = countCollocations();
  }
 
  public FrequentPhrases(Suffixes suffixes, String binaryFilename) throws IOException, ClassNotFoundException {
    this.suffixes = suffixes;
    BinaryIn<InvertedIndex> in = new BinaryIn<InvertedIndex>(binaryFilename, InvertedIndex.class);
    this.readExternal(in);
  }
//
//  public short getMaxPhrases() {
//    return this.maxPhrases;
//  }
 
  Suffixes getSuffixes() {
    return this.suffixes;
  }
 
//  /**
//   * This method performs a one-pass computation of the
//   * collocation of two frequent subphrases. It is used for
//   * the precalculation of the translations of hierarchical
//   * phrases which are problematic to calculate on the fly.
//   * This procedure is described in "Hierarchical Phrase-Based
//   * Translation with Suffix Arrays" by Adam Lopez.
//   *
//   * @param maxPhraseLength the maximum length of any phrase
//   *                   in the phrases
//   * @param windowSize the maximum allowable space between
//   *                   phrases for them to still be considered
//   *                   collocated
//   * @param minNonterminalSpan Minimum span allowed for a nonterminal
//   */
//  public FrequentMatches getCollocations(
//      int maxPhraseLength,
//      int windowSize,
//      short minNonterminalSpan
//  ) {
// 
////    FrequentMatches collocations = new FrequentMatches(this, maxPhraseLength, windowSize, minNonterminalSpan);
////   
////    countCollocations(maxPhraseLength, windowSize, minNonterminalSpan);
////   
////    collocations.histogramSort();
////   
////    return collocations;
//   
//    throw new RuntimeException("Not currently supported");
//   
//  }


  /**
   * Gets the number of times any frequent phrase co-occurred
   * with any frequent phrase within the given window.
   * <p>       
   * This method performs a one-pass computation of the
   * collocation of two frequent sub-phrases. It is used for
   * the precalculation of the translations of hierarchical
   * phrases which are problematic to calculate on the fly.
   *
   * This procedure is described in "Hierarchical Phrase-Based
   * Translation with Suffix Arrays" by Adam Lopez.
   *
   * @param maxPhraseLength the maximum length of any phrase
   *                   in the phrases
   * @param windowSize the maximum allowable space between
   *                   phrases for them to still be considered
   *                   collocated
   *                  
   * @return The number of times any frequent phrase co-occurred
   *         with any frequent phrase within the given window.
   */
//  int countCollocations(int maxPhraseLength, int windowSize, short minNonterminalSpan) {
//    return countCollocations(maxPhraseLength, windowSize, minNonterminalSpan);
//  }
 
 
  protected List<HierarchicalPhrases> getFrequentCollocations() {
    return this.frequentCollocations;
  }
 
 
  /**
   * Gets the hierarchical phrases that represent
   * the collocations of one frequent phrase with
   * another frequent phrase.
   * <p>       
   * This method performs a one-pass computation of the
   * collocation of two frequent sub-phrases. It is used for
   * the precalculation of the translations of hierarchical
   * phrases which are problematic to calculate on the fly.
   *
   * This procedure is described in "Hierarchical Phrase-Based
   * Translation with Suffix Arrays" by Adam Lopez.
   *
   * @return Hierarchical phrases representing the corpus locations
   *         of each collocation of one frequent phrase with
   *         another frequent phrase within the configured window.
   */
  private List<HierarchicalPhrases> countCollocations() {
   
    PhrasePairCollocations collocations = new PhrasePairCollocations(suffixes.getCorpus());

    // Sliding window of frequent phrases recently seen, paired
    // position-by-position with their corpus start positions.
    LinkedList<Phrase> phrasesInWindow = new LinkedList<Phrase>();
    LinkedList<Integer> positions = new LinkedList<Integer>();
    int sentenceNumber = 1;
    int endOfSentence = suffixes.getSentencePosition(sentenceNumber);

    if (logger.isLoggable(Level.FINEST)) logger.finest("END OF SENT: " + endOfSentence);

    Corpus corpus = suffixes.getCorpus();
    int endOfCorpus = corpus.size();
   
    // Start at the beginning of the corpus...
    for (int currentPosition : corpus.corpusPositions()) {
         
      // Start with a phrase length of 1, at the current position...
      for (int i = 1, endOfPhrase = currentPosition + i;
          // ...ensure the phrase length isn't too long...
          i <= maxPhraseLength  && 
          // ...and that the phrase doesn't extend past the end of the sentence...
          endOfPhrase <= endOfSentence  && 
          // ...or past the end of the corpus
          endOfPhrase <= endOfCorpus;
          // ...then increment the phrase length and end of phrase marker.
          i++, endOfPhrase = currentPosition + i) {

       
        // Get the current phrase
        Phrase phrase = new ContiguousPhrase(currentPosition, endOfPhrase, corpus);

        if (logger.isLoggable(Level.FINEST)) logger.finest("Found phrase (" +currentPosition + ","+endOfPhrase+") "  + phrase);

        // If the phrase is one we care about...
        if (frequentPhrases.containsKey(phrase)) {

          if (logger.isLoggable(Level.FINER)) logger.finer("\"" + phrase + "\" found at currentPosition " + currentPosition);

          // Remember the phrase...
          phrasesInWindow.add(phrase);

          // ...and its starting position
          positions.add(currentPosition);
        }

      } // end iterating over various phrase lengths


      // check whether we're at the end of the sentence and dequeue...
      // NOTE(review): end of sentence is detected with '==' here, whereas
      // calculateInvertedIndices uses 'currentPosition+1 == endOfSentence';
      // confirm which convention suffixes.getSentencePosition implies.
      if (currentPosition == endOfSentence) {

        if (logger.isLoggable(Level.FINEST)) {
          logger.finest("REACHED END OF SENT: " + currentPosition);
          logger.finest("PHRASES:   " + phrasesInWindow);
          logger.finest("POSITIONS: " + positions);
        }

        // empty the whole queue...
//        for (int i = 0, n=phrasesInWindow.size(); i < n; i++) {
        while (! phrasesInWindow.isEmpty()) {

          processPhraseWindow(collocations, phrasesInWindow, positions);

        }
        // clear the queues
        phrasesInWindow.clear();
        positions.clear();

        // update the end of sentence marker
        sentenceNumber++;
        // NOTE(review): the initial endOfSentence above is
        // getSentencePosition(sentenceNumber) WITHOUT the -1 applied here;
        // confirm this off-by-one difference is intentional.
        endOfSentence = suffixes.getSentencePosition(sentenceNumber)-1;

        if (logger.isLoggable(Level.FINER)) logger.finer("END OF SENT: " + sentenceNumber + " at position " + endOfSentence);

      } // Done processing end of sentence.


      // check whether the initial elements are
      // outside the window size...
      if (! phrasesInWindow.isEmpty()) {
        int position1 = positions.get(0);
        // dequeue the first element and
        // calculate its collocations...
        while (! phrasesInWindow.isEmpty() &&
            ((currentPosition+1==endOfCorpus) ||
                (currentPosition-position1 >= maxPhraseSpan))) {

          processPhraseWindow(collocations, phrasesInWindow, positions);
         
//          if (logger.isLoggable(Level.FINEST)) logger.finest("OUTSIDE OF WINDOW: " + position1 + " " +  currentPosition + " " + maxPhraseSpan);
//         
//          Phrase phrase1 = phrasesInWindow.removeFirst();
//          positions.removeFirst();
//         
//          Iterator<Phrase> phraseIterator = phrasesInWindow.iterator();
//          Iterator<Integer> positionIterator = positions.iterator();
//
//          int end1 = position1 + phrase1.size();
//         
//          for (int j = 0, n=phrasesInWindow.size(); j < n; j++) {
//
//            Phrase phrase2 = phraseIterator.next();
//            int position2 = positionIterator.next();
//
//            if (position2-end1 >= minNonterminalSpan) {
//              if (logger.isLoggable(Level.FINEST)) logger.finest("CASE2: " + phrase1 + "\t" + phrase2 + "\t" + position1 + "\t" + position2);
//              collocations.record(phrase1, phrase2, position1, position2);
//            } else if (logger.isLoggable(Level.FINEST)) {
//              logger.finest("Not recording collocation: " + phrase1 + "\t" + phrase2 + "\t" + position1 + "\t" + position2);
//            }
//          }
          // Refresh position1 for the loop condition after the dequeue.
          if (phrasesInWindow.size() > 0) {
            position1 = positions.getFirst();
          } else {
            position1 = currentPosition;
          }
        }
      }

    } // end iterating over positions in the corpus

    return collocations.getHierarchicalPhrases();
//    return count;
  }

  /**
   * @param collocations
   * @param phrasesInWindow
   * @param positions
   * @param i
   * @param n
   */
  private void processPhraseWindow(PhrasePairCollocations collocations,
      LinkedList<Phrase> phrasesInWindow,
      LinkedList<Integer> positions) {
   
    Phrase phrase1 = phrasesInWindow.removeFirst();
    int position1 = positions.removeFirst();

    Iterator<Phrase> phraseIterator = phrasesInWindow.iterator();
    Iterator<Integer> positionIterator = positions.iterator();

    int end1 = position1 + phrase1.size();
   
    while (phraseIterator.hasNext() && positionIterator.hasNext()) {
   
      Phrase phrase2 = phraseIterator.next();
      int position2 = positionIterator.next();

      int end2 = position2 + phrase2.size();
     
      if (position2-end1 >= minNonterminalSpan  &&  end2-position1 <= maxPhraseSpan) {
        if (logger.isLoggable(Level.FINEST)) logger.finest("    Recording collocation: " + phrase1 + "\t" + phrase2 + "\t" + position1 + "\t" + position2);
        collocations.record(phrase1, phrase2, position1, position2);
      } else if (logger.isLoggable(Level.FINEST)) {
        logger.finest("Not recording collocation: "+ phrase1 + "\t" + phrase2 + "\t" + position1 + "\t" + position2);
      }

    }
  }


//  /**
//   * Returns an integer identifier for the collocation of
//   * <code>phrase1</code> with <code>phrase2</code>.
//   * <p>
//   * If <code>rank1</code> is the rank of <code>phrase1</code>
//   * and <code>rank2</code> is the rank of <code>phrase2</code>,
//   * the identifier returned by this method is defined to be
//   * <code>rank1*maxPhrases + rank2</code>.
//   * <p>
//   * As such, the range of possible values returned by this
//   * method will be </code>0</code> through
//   * <code>maxPhrases*maxPhrases-1</code>.
//   *
//   * @param phrase1 First phrase in a collocation.
//   * @param phrase2 Second phrase in a collocation.
//   * @return a unique integer identifier for the collocation.
//   */
//  private int getKey(LinkedHashMap<Phrase,Short> ranks, Phrase phrase1, Phrase phrase2) {
//
//    short rank1 = ranks.get(phrase1);
//    short rank2 = ranks.get(phrase2);
//
//    int rank = rank1*maxPhrases + rank2;
//
//    return rank;
//  }
 

  //  /**
  //   * Builds a HashMap of all the occurrences of the phrase,
  //   * keying them based on the index of the sentence that they
  //   * occur in. Since we iterate over all occurrences of the
  //   * phrase, this method is linear with respect to the number
  //   * of occurrences, and should not be used for very frequent
  //   * phrases. This is part of the baseline method described
  //   * in Section 4.1 of Adam Lopez's EMNLP paper.
  //   */
  //  public HashMap<Integer,HashSet<Integer>> keyPositionsWithSentenceNumber(Phrase phrase) {
  //    // keys are the sentence numbers of partial matches
  //    HashMap<Integer,HashSet<Integer>> positionsKeyedWithSentenceNumber = new HashMap<Integer,HashSet<Integer>>(suffixes.size());
  //    int[] bounds = suffixes.findPhrase(phrase);
  //    if (bounds == null) return positionsKeyedWithSentenceNumber;
  //   
  //    int[] positions = suffixes.getAllPositions(bounds);
  //    for (int i = 0; i < positions.length; i++) {
  //      int sentenceNumber = suffixes.getSentenceIndex(positions[i]);
  //      HashSet<Integer> positionsInSentence = positionsKeyedWithSentenceNumber.get(sentenceNumber);
  //      if (positionsInSentence == null) {
  //        positionsInSentence = new HashSet<Integer>();
  //      }
  //      positionsInSentence.add(positions[i]);
  //      positionsKeyedWithSentenceNumber.put(sentenceNumber, positionsInSentence);
  //    }
  //    return positionsKeyedWithSentenceNumber;
  //  }

  //===============================================================
  // Protected
  //===============================================================

  //===============================================================
  // Methods
  //===============================================================

  /**
   * Calculates the frequency ranks of the provided phrases.
   * <p>
   * The iteration order of the <code>frequentPhrases</code>
   * parameter is used by this method to determine the
   * rank of each phrase. Specifically, the first phrase
   * returned by the map's iterator is taken to be the most
   * frequent phrase; the last phrase returned by the map's
   * iterator is taken to be the least frequent phrase.
   *
   * @param frequentPhrases Map from phrase to frequency of
   *                        that phrase in a corpus.
   * @return the frequency ranks of the provided phrases
   */
  protected LinkedHashMap<Phrase,Short> getRanks() {
   
    logger.fine("Calculating ranks of frequent phrases");
   
    LinkedHashMap<Phrase,Short> ranks = new LinkedHashMap<Phrase,Short>(frequentPhrases.size());

    short i=0;
    for (Phrase phrase : frequentPhrases.keySet()) {
      ranks.put(phrase, i++);
    }
   
    logger.fine("Done calculating ranks");
   
    return ranks;
  }
 

  /**
   * Calculates the most frequent phrases in the corpus.
   * <p>
   * Allows a threshold to be set for the minimum frequency
   * to remember, as well as the maximum number of phrases.
   * <p>
   * This method implements the
   * <code>print_LDIs_stack</code> function defined in
   * section 2.5 of Yamamoto and Church.
   *
   * @param suffixes     a suffix array for the corpus
   * @param minFrequency the minimum frequency required to
   *                     retain phrases
   * @param maxPhrases   the maximum number of phrases to
   *                     return
   * @param maxPhraseLength the maximum phrase length to
   *                     consider
   *
   * @return A map from phrase to the number of times
   *         that phrase occurred in the corpus.
   *         The iteration order of the map will start
   *         with the most frequent phrase, and
   *         end with the least frequent calculated phrase.
   *        
   * @see "Yamamoto and Church (2001), section 2.5"
   */
  @SuppressWarnings("unchecked")
  protected static LinkedHashMap<Phrase,Integer> getMostFrequentPhrases(
      Suffixes suffixes,
      int minFrequency,
      int maxPhrases,
      int maxPhraseLength
  ) {
   
    PriorityQueue<Counted<Phrase>> frequentPhrases = new PriorityQueue<Counted<Phrase>>();
    Set<Integer> prunedFrequencies = new HashSet<Integer>();
   
    Corpus corpus = suffixes.getCorpus();
   
    FrequencyClasses frequencyClasses = getFrequencyClasses(suffixes);
   
    for (FrequencyClass frequencyClass : frequencyClasses.withMinimumFrequency(minFrequency)) {
     
      int frequency = frequencyClass.getFrequency();
     
      if (! prunedFrequencies.contains(frequency)) {
       
        int i = frequencyClass.getIntervalStart();
        int startOfPhrase = suffixes.getCorpusIndex(i);
        int sentenceNumber = suffixes.getSentenceIndex(startOfPhrase);
        int endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
       
        int max = Math.min(maxPhraseLength, endOfSentence-startOfPhrase);
        if (logger.isLoggable(Level.FINER)) logger.finer("Max phrase length is " + max + " for " + frequencyClass.toString());
       
        for (int phraseLength : frequencyClass.validPhraseLengths(max)) {
         
          int endOfPhrase = startOfPhrase + phraseLength;
         
          Phrase phrase = new ContiguousPhrase(
              startOfPhrase,
              endOfPhrase,
              corpus);
         
          frequentPhrases.add(new Counted<Phrase>(phrase, frequency));
          if (frequentPhrases.size() > maxPhrases) {
            Counted<Phrase> pruned = frequentPhrases.poll();
            int prunedFrequency = pruned.getCount();
            prunedFrequencies.add(prunedFrequency);
            if (logger.isLoggable(Level.FINER)) logger.info("Pruned " + pruned.getElement() + " with frequency " + prunedFrequency);
            break;
          }
         
        }
      } else if (logger.isLoggable(Level.FINER)) {
        logger.finer("Skipping pruned frequency " + frequency);
      }
    }

    while (! frequentPhrases.isEmpty() && prunedFrequencies.contains(frequentPhrases.peek().getCount())) {
      Counted<Phrase> pruned = frequentPhrases.poll();
      if (logger.isLoggable(Level.FINER)) logger.finer("Pruned " + pruned.getElement() + " " + pruned.getCount());
    }
   
    Counted<Phrase>[] reverse = new Counted[frequentPhrases.size()];
    {
      int i=frequentPhrases.size()-1;
      while (! frequentPhrases.isEmpty()) {
        reverse[i] = frequentPhrases.poll();
        i -= 1;
      }
    }
   
    LinkedHashMap<Phrase,Integer> results = new LinkedHashMap<Phrase,Integer>();
    for (Counted<Phrase> countedPhrase : reverse) {
      Phrase phrase = countedPhrase.getElement();
      Integer count = countedPhrase.getCount();
      results.put(phrase, count);
    }
//   
//    while (! frequentPhrases.isEmpty()) {
//      Counted<Phrase> countedPhrase = frequentPhrases.poll();
//      Phrase phrase = countedPhrase.getElement();
//      Integer count = countedPhrase.getCount();
//      results.put(phrase, count);
//    }
//   
    return results;
   
  }
 
  /**
   * Calculates the frequencies for
   * all phrase frequency classes in the corpus.
   * <p>
   * This method is implements the
   * <code>print_LDIs_stack</code> function defined in
   * section 2.5 of Yamamoto and Church.
   *
   * @param suffixes a suffix array for the corpus
   * @return A list of term frequency classes
   *        
   * @see "Yamamoto and Church (2001), section 2.5"
   */
  protected static FrequencyClasses getFrequencyClasses(Suffixes suffixes) {
   
    // calculate the longest common prefix delimited intervals...
    int[] longestCommonPrefixes = calculateLongestCommonPrefixes(suffixes);

    // Construct an initially empty object to hold class frequency information
    FrequencyClasses frequencyClasses = new FrequencyClasses(longestCommonPrefixes);
   
    // stack_i <-- an integer array for the stack of left edges, i
    Stack<Integer> startIndices = new Stack<Integer>();
   
    // stack_k <-- an integer array for the stack of representatives, k
    Stack<Integer> shortestInteriorLCPIndices = new Stack<Integer>();
   
    // stack_i[0] <-- 0
    startIndices.push(0);

    // stack_k[0] <-- 0
    shortestInteriorLCPIndices.push(0);
   
    // sp <-- 1 (a stack pointer)
   
    // for j <-- 0,1,2, ..., N-1
    for (int j = 0, size=suffixes.size(); j < size; j++) { 
     
      // Output an lcp-delimited interval <j,j> with tf=1
      //        (trivial interval i==j, frequency=1)
      if (logger.isLoggable(Level.FINEST)) logger.finest("Output trivial interval <"+j+","+j+"> with tf=1");
      frequencyClasses.record(j);
      //frequencyClasses.record(j, j, Integer.MAX_VALUE, 1);

      // While lcp[j+1] < lcp[stack_k[sp-1]] do
      while (longestCommonPrefixes[j+1] < longestCommonPrefixes[shortestInteriorLCPIndices.peek()]) {
             
        int i = startIndices.pop();
        int k = shortestInteriorLCPIndices.pop();
       
        int longestBoundingLCP = Math.max(longestCommonPrefixes[i], longestCommonPrefixes[j+1]);
        int shortestInteriorLCP = longestCommonPrefixes[k];

        // Output an interval <i,j> with tf=j-i+1, if it is lcp-delimited
        //                    (non-trivial interval)
        // sp <-- sp - 1
        if (longestBoundingLCP < shortestInteriorLCP) {
 
          int frequency = j-i+1;
          if (logger.isLoggable(Level.FINEST)) logger.finest("Output interval <"+i+","+j+"> with k="+k+" and tf="+j+"-"+i+"+1="+(j-i+1));
          frequencyClasses.record(i, j, k, frequency)
        }
       
      }
     
      // stack_i[sp] <-- stack_k[sp-1]
      startIndices.push(shortestInteriorLCPIndices.peek());

      // stack_k[sp] <-- j+1
      shortestInteriorLCPIndices.push(j+1);

      // sp <-- sp + 1

    }
   
    return frequencyClasses;
  }
     


  public void cacheInvertedIndices() {
 
    for (HierarchicalPhrases phrases : frequentCollocations) {
      suffixes.cacheMatchingPhrases(phrases);
    }
   
    for (Map.Entry<Phrase, InvertedIndex> entry : invertedIndices.entrySet()) {
     
      Pattern pattern = new Pattern(entry.getKey());
      InvertedIndex list = entry.getValue();
     
      HierarchicalPhrases phraseLocations = new HierarchicalPhrases(pattern,list.corpusLocations, list.sentenceNumbers);
      suffixes.cacheMatchingPhrases(phraseLocations);
      if (logger.isLoggable(Level.FINE)) logger.fine("Cached sorted locations for " + pattern);
     
      if (pattern.toString().equals("[.]")) {
        logger.fine("Found .");
      }
     
      if (logger.isLoggable(Level.FINE)) {
        StringBuilder s = new StringBuilder();
        String patternString = pattern.toString();
        for (Integer i : list.corpusLocations) {
          s.append(patternString);
          s.append('\t');
          s.append(i);
          s.append('\n');
        }
        logger.fine(s.toString());
      }
     
    }
   
  }

  /**
   * Constructs an auxiliary array that stores longest common
   * prefixes. The length of the array is the corpus size+1.
   * Each elements lcp[i] indicates the length of the common
   * prefix between two positions s[i-1] and s[i] in the
   * suffix array.
   *
   * @param suffixes Suffix array
   * @return Longest common prefix array
   */
  protected static int[] calculateLongestCommonPrefixes(Suffixes suffixes) {

    // NOTE(review): 'length' is the suffix array size, but it is also used
    // below as the bound for corpus positions; this assumes the suffix array
    // and the corpus have the same size -- confirm against Suffixes/Corpus.
    int length = suffixes.size();
    Corpus corpus = suffixes.getCorpus();

    int[] longestCommonPrefixes = new int[length +1];
   
    // For each element in the suffix array
    for (int i = 1; i < length; i++) {
      int corpusIndex = suffixes.getCorpusIndex(i);
      int prevCorpusIndex = suffixes.getCorpusIndex(i-1);

      // Start by assuming that the two positions
      //    don't have anything in common
      int commonPrefixSize = 0;
     
      // While the 1st position is not at the end of the corpus...
      while(corpusIndex+commonPrefixSize < length &&
          // ... and the 2nd position is not at the end of the corpus...
          prevCorpusIndex + commonPrefixSize < length &&
          // ... and the nth word at the 1st position ...
          (corpus.getWordID(corpusIndex  + commonPrefixSize) ==
            // ... is the same as the nth word at the 2nd position ...
            corpus.getWordID(prevCorpusIndex + commonPrefixSize) &&
            // ... and the length to consider isn't too long
            // NOTE(review): because this check uses <= and sits inside the
            // loop condition, the recorded prefix length can reach
            // MAX_COMPARISON_LENGTH+1 -- confirm whether that is intended.
            commonPrefixSize <= Suffixes.MAX_COMPARISON_LENGTH)) {
       
        // The two positions match for their respective nth words!
        // Increment commonPrefixSize to reflect this fact
        commonPrefixSize++;
      }
     
      // Record how long the common prefix is between
      //    suffix array element s[i] and s[i-1]
      longestCommonPrefixes[i] = commonPrefixSize;
    }
   
    // By definition, the 0th element of lcp is 0
    longestCommonPrefixes[0] = 0;
   
    // By definition, the final element of lcp is 0
    longestCommonPrefixes[length] = 0;
   
    return longestCommonPrefixes;

  }
 
//  /**
//   * This method extracts phrases which reach the specified
//   * minimum frequency. It uses the equivalency classes for
//   * substrings in the interval i-j in the suffix array, as
//   * defined in section 2.3 of the the Yamamoto and Church
//   * CL article. This is a helper function for the
//   * getMostFrequentPhrases method.
//   *
//   * @param suffixes Suffix array
//   * @param longestCommonPrefixes Longest common prefix array
//   * @param i Index specifying a starting range in the suffix array
//   * @param j Index specifying an ending range in the suffix array
//   * @param k Index specifying a representative value of the range,
//   *          such that i < k <= j, and such that longestCommonPrefixes[k]
//   *          is the shortest interior longest common prefix of the range
//   *          (see section 2.5 of Yamamoto and Church)
//   * @param phrases
//   * @param frequencies
//   * @param minFrequency
//   * @param maxPhrases
//   * @param maxPhraseLength
//   * @param comparator
//   */
//  protected static void recordPhraseFrequencies(
//      Suffixes            suffixes,
//      int[]               longestCommonPrefixes,
//      int                 i,
//      int                 j,
//      int                 k,
//      List<Phrase>        phrases,
//      List<Integer>       frequencies,
//      int                 minFrequency,
//      int                 maxPhrases,
//      int                 maxPhraseLength,
//      Comparator<Integer> comparator
//  ) {
//   
//    if (i==j) {
//      logger.info("Output trivial interval <"+j+","+j+"> with k="+k+" and tf=1");
//    } else {
//
//      int LBL = Math.max(longestCommonPrefixes[i], longestCommonPrefixes[j+1]);
//      int SIL = longestCommonPrefixes[k];
//
//      if (LBL < SIL) {
//        logger.info("Output interval <"+i+","+j+"> with k="+k+" and tf="+j+"-"+i+"+1="+(j-i+1));       
//      } else {
//        logger.info("Interval <"+i+","+j+"> is NOT lcp-delimited, because " + LBL + " not < " +SIL);
//      }
//    }
//  }
 
 
  private Map<Phrase,InvertedIndex> calculateInvertedIndices() {
    Map<Phrase,InvertedIndex> invertedIndices = new HashMap<Phrase,InvertedIndex>(frequentPhrases.keySet().size());
   
    Corpus corpus = suffixes.getCorpus();
    int endOfCorpus = corpus.size();
    logger.fine("Corpus has size " + endOfCorpus);
   
    int sentenceNumber = 0;
    int endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
    boolean trackMe = false;
    // Start at the beginning of the corpus...
    for (int currentPosition : corpus.corpusPositions()) {
//         
      if (trackMe)
        {
        logger.fine("At corpus position " + currentPosition);
        }
//     
//      if (currentPosition==0 || currentPosition==1) {
//        logger.fine("Here!");
//      }
     
      // Start with a phrase length of 1, at the current position...
      for (int i = 1, endOfPhrase = currentPosition + i;
          // ...ensure the phrase length isn't too long...
          i <= maxPhraseLength  && 
          // ...and that the phrase doesn't extend past the end of the sentence...
          endOfPhrase <= endOfSentence  && 
          // ...or past the end of the corpus
          endOfPhrase <= endOfCorpus;
          // ...then increment the phrase length and end of phrase marker.
          i++, endOfPhrase = currentPosition + i) {

        if (trackMe) logger.fine("endOfPhrase=="+endOfPhrase);
        // Get the current phrase
        Phrase phrase = new ContiguousPhrase(currentPosition, endOfPhrase, corpus);

        if (phrase.toString().equals(".")) {
          logger.fine("Huzzah, £20 for the King!");
          trackMe = true;
        }
       
        if (logger.isLoggable(Level.FINE)) logger.fine("In sentence " + sentenceNumber + " found phrase (" +currentPosition + ","+endOfPhrase+") "  + phrase);

        // If the phrase is one we care about...
        if (frequentPhrases.containsKey(phrase)) {

          if (logger.isLoggable(Level.FINER)) logger.finer("\"" + phrase + "\" found at currentPosition " + currentPosition);

          if (! invertedIndices.containsKey(phrase)) {
            invertedIndices.put(phrase, new InvertedIndex());
          }
         
          InvertedIndex invertedIndex = invertedIndices.get(phrase);
         
          logger.fine("Recording position " + currentPosition + " in sentence " + sentenceNumber + " for phrase " + phrase);
          invertedIndex.record(currentPosition, sentenceNumber);

        }
       
      } // end iterating over various phrase lengths

      if (currentPosition+1 == endOfSentence) {
        sentenceNumber += 1;
        endOfSentence = suffixes.getSentencePosition(sentenceNumber+1);
      }
    }
   
    return invertedIndices;
  }
 
  /* See Javadoc for java.io.Externalizable interface. */
  public void readExternal(ObjectInput in) throws IOException,
      ClassNotFoundException {
   
    boolean loggingFiner = logger.isLoggable(Level.FINER);
   
    SymbolTable vocab = suffixes.getVocabulary();
   
    // Read in the maximum number of phrases of which this object is aware.
    this.maxPhrases = in.readShort();
    if (loggingFiner) logger.finer(" Read: maxPhrases="+maxPhrases);
   
    // Read in the maximum phrase length to consider.
    this.maxPhraseLength = in.readInt();
    if (loggingFiner) logger.finer(" Read: maxPhraseLength="+maxPhraseLength);
   
    // Read in the count of frequent phrase types
    int frequentPhrasesSize = in.readInt();
    if (loggingFiner) logger.finer(" Read: frequentPhrases.size()="+frequentPhrasesSize);
   
    // Read in the frequentPhrases map
    this.frequentPhrases = new LinkedHashMap<Phrase,Integer>();
    for (int i=0; i<frequentPhrasesSize; i++) {
     
      // Write out number of times the phrase is found in the corpus
      int count = in.readInt();
      if (loggingFiner) logger.finer(" Read: phraseCount="+count);
     
      // Read in the number of tokens in the phrase
      int tokenCount = in.readInt();
      if (loggingFiner) logger.finer(" Read: wordIDs.length="+tokenCount);
     
      int[] wordIDs = new int[tokenCount];
      for (int j=0; j<tokenCount; j++) {
        int wordID = in.readInt();
        if (loggingFiner) logger.finer(" Read: wordIDs["+j+"]="+wordID);
        wordIDs[j] = wordID;
      }
     
     
      BasicPhrase phrase = new BasicPhrase(wordIDs, vocab);
//      if (loggingFinest) logger.finer("Read: phrase="+Arrays.toString(wordIDs)+ " " + phrase);
      this.frequentPhrases.put(phrase, count);
     
    }
   
    // Read in number of inverted indices
    int invertedIndicesCount = in.readInt();
    if (loggingFiner) logger.finer(" Read: invertedIndices.size()="+invertedIndicesCount);
   
    // Read in inverted indices
    this.invertedIndices = new HashMap<Phrase,InvertedIndex>(frequentPhrases.keySet().size());
    for (int i=0; i<invertedIndicesCount; i++) {
     
      // Read in the number of tokens in the phrase
      int tokenCount = in.readInt();
      if (loggingFiner) logger.finer(" Read: wordIDs.length="+tokenCount);
     
      int[] wordIDs = new int[tokenCount];
      for (int j=0; j<tokenCount; j++) {
        wordIDs[j] = in.readInt();
        if (loggingFiner) logger.finer(" Read: wordID["+j+"]="+wordIDs[j]);
      }
     
      // Reconstruct phrase
      BasicPhrase phrase = new BasicPhrase(wordIDs, vocab);
     
      // Read in inverted index
      InvertedIndex invertedIndex = new InvertedIndex();
      if (loggingFiner) logger.finer(" Read: about to InvertedIndex");
      if (phrase.toString().equals("it")) {
        logger.fine("Found it!");
      }
      invertedIndex.readExternal(in);
     
      this.invertedIndices.put(phrase, invertedIndex);
    }
   
    // Read collocations
    int frequentCollocationsSize = in.readInt();
    this.frequentCollocations = new ArrayList<HierarchicalPhrases>(frequentCollocationsSize);
    for (int i=0; i<frequentCollocationsSize; i++) {
     
      // Read the pattern
      int wordsLength = in.readInt();
      int[] words = new int[wordsLength];
      for (int j=0; j<wordsLength; j++) {
        words[j]=in.readInt();
      }
      Pattern pattern = new Pattern(vocab, words);
     
//      int terminalSequenceLengthsLength = in.readInt();
//      int[] terminalSequenceLengths = new int[terminalSequenceLengthsLength];
//      for (int j=0; j<terminalSequenceLengthsLength; j++) {
//        terminalSequenceLengths[j]=in.readInt();
//      }
     
      // Read the number of corpus matches
//      int phrasesSize = in.readInt();
     
      // Next, read the sentence numbers
      // There should be size of these
      int[] sentenceNumber = new int[in.readInt()];
      for (int j=0, n=sentenceNumber.length; j<n; j++) {
        sentenceNumber[j] = in.readInt();
      }
     
      // Next, read the start index of each corpus match
      // There should be size of these
      int[] terminalSequenceStartIndices = new int[in.readInt()];
      for (int j=0, n=terminalSequenceStartIndices.length; j<n; j++) {
        terminalSequenceStartIndices[j] = in.readInt();
      }

      HierarchicalPhrases phrases = new HierarchicalPhrases(pattern, terminalSequenceStartIndices, sentenceNumber);
      this.frequentCollocations.add(phrases);
     
    }
  }

  public void writeExternal(ObjectOutput out) throws IOException {
   
    boolean loggingFiner = logger.isLoggable(Level.FINER);
   
    // Write out maximum number of phrases of which this object is aware.
    out.writeShort(maxPhrases);
    if (loggingFiner) logger.finest("Wrote: maxPhrases="+maxPhrases);
   
    // Write out maximum phrase length to consider.
    out.writeInt(maxPhraseLength);
    if (loggingFiner) logger.finest("Wrote: maxPhraseLength="+maxPhraseLength);
   
    // Write out count of frequent phrase types
    out.writeInt(frequentPhrases.size());
    if (loggingFiner) logger.finest("Wrote: frequentPhrases.size()="+frequentPhrases.size());
   
    // Write out frequentPhrases map
    for (Map.Entry<Phrase, Integer> entry : frequentPhrases.entrySet()) {
      Phrase phrase = entry.getKey();
      int phraseCount = entry.getValue();
      int[] wordIDs = phrase.getWordIDs();
     
      // Write out number of times the phrase is found in the corpus
      out.writeInt(phraseCount);
      if (loggingFiner) logger.finer("Wrote: phraseCount="+phraseCount);
     
      // Write out the number of tokens in the phrase
      out.writeInt(wordIDs.length);
      if (loggingFiner) logger.finer("Wrote: wordIDs.length="+wordIDs.length);
     
      // Write out each token in the phrase
      int index = 0;
      for (int wordID : wordIDs) {
        out.writeInt(wordID);
        if (loggingFiner) logger.finer("Wrote: wordIDs["+index+"]="+wordID);
        index+=1;
      }
//      if (loggingFinest) logger.finest("Wrote: wordIDs="+Arrays.toString(wordIDs));
    }
   
    // Write out number of inverted indices
    out.writeInt(invertedIndices.size());
    if (loggingFiner) logger.finer("Wrote: invertedIndices.size()="+invertedIndices.size());
   
    // Write out inverted indices
    for (Map.Entry<Phrase, InvertedIndex> entry : invertedIndices.entrySet()) {
     
      Pattern pattern = new Pattern(entry.getKey());
      int[] wordIDs = pattern.getWordIDs();
     
      // Write out number of tokens in the pattern
      out.writeInt(wordIDs.length);
      if (loggingFiner) logger.finer("Wrote: wordIDs.length="+wordIDs.length);
     
      // Write out each token in the phrase
      int index = 0;
      for (int wordID : wordIDs) {
        out.writeInt(wordID);
        if (loggingFiner) logger.finer("Wrote: wordID["+index+"]="+wordID);
        index+=1;
      }
     
      // Write out inverted index for this phrase
      InvertedIndex list = entry.getValue();
      if (loggingFiner) logger.finer("Wrote: about to InvertedIndex");
//      if (pattern.toString().contains("[it]")) {
//        logger.fine("Found it!");
//      }
      out.writeObject(list);
    }
   
    /////////////
   
    // Write collocations
    out.writeInt(frequentCollocations.size());
    for (HierarchicalPhrases phrases : frequentCollocations) {
     
      // Write the pattern
      int[] words = phrases.pattern.getWordIDs();
      out.writeInt(words.length);
      for (int token : phrases.pattern.getWordIDs()) {
        out.writeInt(token);
      }
//      out.writeInt(phrases.pattern.arity());
//     
//      out.writeInt(phrases.terminalSequenceLengths.length);
//      for (int l : phrases.terminalSequenceLengths) {
//        out.writeInt(l);
//      }
     
      // Write the number of corpus matches
//      out.writeInt(phrases.size);
     
      // Next, write the sentence numbers
      // There should be size of these
      out.writeInt(phrases.sentenceNumber.length);
      for (int n : phrases.sentenceNumber) {
        out.writeInt(n);
      }
     
      // Next, write the start index of each corpus match
      // There should be size of these
      out.writeInt(phrases.terminalSequenceStartIndices.length);
      for (int startIndex : phrases.terminalSequenceStartIndices) {
        out.writeInt(startIndex);
      }
     
    }
   

  }
 

  public String toString() {

    String format = null;

    StringBuilder s = new StringBuilder();

    for (Map.Entry<Phrase, Integer> entry : frequentPhrases.entrySet()) {

      Phrase phrase = entry.getKey();
      Integer frequency = entry.getValue();

      if (format==null) {
        int length = frequency.toString().length();
        format = "%1$" + length + "d";
      }

      s.append(String.format(format, frequency));
      s.append('\t');
      s.append(phrase.toString());
      s.append('\n');

    }

    return s.toString();
  }


  /**
   * Private helper method for performing fast intersection.
   *
   * @param <E>
   * @param sortedData
   * @param sortedQueries
   * @param result
   */
  private static <E extends Comparable<E>> void fastIntersect(List<E> sortedData, List<E> sortedQueries, SortedSet<E> result) {

    int medianQueryIndex = sortedQueries.size() / 2;
    E medianQuery = sortedQueries.get(medianQueryIndex);

    int index = Collections.binarySearch(sortedData, medianQuery);

    if (index >= 0) {
      result.add(medianQuery);
    } else {
      index = (-1 * index) + 1;
    }

    if (index-1 >= 0 && medianQueryIndex-1 >=0) {
      fastIntersect(sortedData.subList(0, index), sortedQueries.subList(0, medianQueryIndex), result);
    }

    if (index+1 < sortedData.size()  &&  medianQueryIndex+1 < sortedQueries.size()) {
      fastIntersect(sortedData.subList(index+1, sortedData.size()), sortedQueries.subList(medianQueryIndex+1, sortedQueries.size()), result);
    }
  } 


  //===============================================================
  // Static
  //===============================================================



  //===============================================================
  // Inner classes
  //===============================================================

 

  //===============================================================
  // Main method
  //===============================================================
 
  public static void main(String[] args) throws IOException, ClassNotFoundException {


    Vocabulary symbolTable;
    Corpus corpusArray;
    Suffixes suffixArray;
    FrequentPhrases frequentPhrases;

    if (args.length == 1) {

      String corpusFileName = args[0];

      logger.info("Constructing vocabulary from file " + corpusFileName);
      symbolTable = new Vocabulary();
      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, symbolTable, true);

      logger.info("Constructing corpus array from file " + corpusFileName);
      corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, symbolTable, lengths[0], lengths[1]);

      logger.info("Constructing suffix array from file " + corpusFileName);
      suffixArray = new SuffixArray(corpusArray, Cache.DEFAULT_CAPACITY);

    } else if (args.length == 3) {

      String binarySourceVocabFileName = args[0];
      String binaryCorpusFileName = args[1];
      String binarySuffixArrayFileName = args[2];

      if (logger.isLoggable(Level.INFO)) logger.info("Constructing source language vocabulary from binary file " + binarySourceVocabFileName);
      ObjectInput in = BinaryIn.vocabulary(binarySourceVocabFileName);
      symbolTable = new Vocabulary();
      symbolTable.readExternal(in);

      logger.info("Constructing corpus array from file " + binaryCorpusFileName);
      if (logger.isLoggable(Level.INFO)) logger.info("Constructing memory mapped source language corpus array.");
      corpusArray = new MemoryMappedCorpusArray(symbolTable, binaryCorpusFileName);

      logger.info("Constructing suffix array from file " + binarySuffixArrayFileName);
      suffixArray = new MemoryMappedSuffixArray(binarySuffixArrayFileName, corpusArray, Cache.DEFAULT_CAPACITY);


    } else {

      System.err.println("Usage: java " + SuffixArray.class.getName() + " source.vocab source.corpus source.suffixes");
      System.exit(0);

      symbolTable = null;
      corpusArray = null;
      suffixArray = null;

    }

    int minFrequency = 0;
    short maxPhrases = 100;
    int maxPhraseLength = 10;
    int maxPhraseSpan = 10;
    short minNonterminalSpan = 2;

    logger.info("Calculating " + maxPhrases + " most frequent phrases");
    frequentPhrases = new FrequentPhrases(suffixArray, minFrequency, maxPhrases, maxPhraseLength, maxPhraseLength, maxPhraseSpan, minNonterminalSpan);

    logger.info("Frequent phrases: \n" + frequentPhrases.toString());

    logger.info("Caching inverted indices");
    frequentPhrases.cacheInvertedIndices();
   
    logger.info("Calculating collocations for most frequent phrases");
    List<HierarchicalPhrases> collocations = frequentPhrases.getFrequentCollocations();//frequentPhrases.countCollocations(maxPhraseLength, maxPhraseSpan, minNonterminalSpan);
   
    Comparator<HierarchicalPhrases> compare = new Comparator<HierarchicalPhrases>() {
      public int compare(HierarchicalPhrases o1, HierarchicalPhrases o2) {
        Integer i1 = o1.size;
        Integer i2 = o2.size();
        return i2.compareTo(i1);
      }
     
    };
    Collections.sort(collocations,compare);
    for (HierarchicalPhrases locations : collocations) {
      logger.info(locations.toString());
    }
//    FrequentMatches matches = frequentPhrases.getCollocations(maxPhraseLength, windowSize, minNonterminalSpan);
//
//   
//
//   
//    logger.info("Printing collocations for most frequent phrases");   
//    logger.info("Total collocations: " + matches.counter);
//   
//    logger.info(matches.toString());
   
//        for (int i=0, n=matches.counter; i<n; i+=3) {
//         
//          int key = matches..get(i);
//          short rank2 = (short) key;
//          short rank1 = (short) (key >> 8);
//          Phrase phrase1 = frequentPhrases.phraseList.get(rank1);
//          Phrase phrase2 = frequentPhrases.phraseList.get(rank2);
//         
//          String pattern = phrase1.toString() + " X " + phrase2.toString();
//         
//          int position1 = collocations.get(i+1);
//          int position2 = collocations.get(i+2);
//         
//          System.out.println(pattern + " " + position1 + "," + position2);
//        }



    //    for (Map.Entry<Integer, ArrayList<int[]>> entry : collocations.entrySet()) {
    //     
    //      int key = entry.getKey();
    //      ArrayList<int[]> values = entry.getValue();
    //     
    //      short rank2 = (short) key;
    //      short rank1 = (short) (key >> 8);
    //     
    //      Phrase phrase1 = frequentPhrases.phraseList.get(rank1);
    //      Phrase phrase2 = frequentPhrases.phraseList.get(rank2);
    //     
    //      String pattern = phrase1.toString() + " X " + phrase2.toString();
    //     
    //      for (int[] value : values) {
    //        System.out.println(value + "\t" + pattern);
    //      }
    //    }


  }
 
}
TOP

Related Classes of joshua.corpus.suffix_array.FrequentPhrases

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.