Package joshua.corpus.suffix_array

Source Code of joshua.corpus.suffix_array.AbstractSuffixArray

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
package joshua.corpus.suffix_array;

import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.corpus.Corpus;
import joshua.corpus.MatchedHierarchicalPhrases;
import joshua.corpus.Phrase;
import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.ff.tm.Rule;
import joshua.util.Cache;

/**
* This class provides a mostly-complete implementation of the
* <code>Suffixes</code> interface, designed to minimize the effort
* required to build a concrete implementation of a suffix array
* data structure.
* <p>
* To implement a concrete suffix array, the programmer need only
* implement the <code>getCorpusIndex(int suffixIndex)</code> and
* <code>size()</code> methods.
*
* @author Lane Schwartz
* @author Chris Callison-Burch
*/
public abstract class AbstractSuffixArray implements Suffixes {
 
  /** Logger for this class. */
  private static Logger logger =
    Logger.getLogger(AbstractSuffixArray.class.getName());
 
  /**
   * Maps from patterns to lists of hierarchical phrases that
   * match the corresponding pattern in the corpus.
   * <p>
   * This cache is a most-recently accessed map, so commonly
   * accessed patterns will remain in the cache, while rare
   * patterns will eventually drop out of the cache.
   */
  protected final Cache<Pattern,MatchedHierarchicalPhrases> hierarchicalPhraseCache;
 
  /**
   * Maps from patterns to lists of hierarchical phrases that
   * match the corresponding pattern in the corpus.
   * <p>
   * This cache is a most-recently accessed map, so commonly
   * accessed patterns will remain in the cache, while rare
   * patterns will eventually drop out of the cache.
   */
  protected final Cache<Pattern,List<Rule>> ruleCache;
 
  /**
   * Integer array representation of the corpus for this
   * suffix array.
   */
  protected final Corpus corpus;
 
  /**
   * Constructs an abstract suffix array based on the provided
   * corpus.
   *
   * The specified cache will be used to store matched
   * hierarchical phrases for frequently accessed patterns.
   *
   * @param corpus Corpus upon which this suffix array is based.
   * @param hierarchicalPhraseCache Cache to store matched
   *               hierarchical phrases for frequently accessed
   *               patterns
   */
  public AbstractSuffixArray(
      Corpus corpus,
      Cache<Pattern,MatchedHierarchicalPhrases> hierarchicalPhraseCache,
      Cache<Pattern,List<Rule>> ruleCache) {
   
    this.hierarchicalPhraseCache = hierarchicalPhraseCache;
    this.ruleCache = ruleCache;
    this.corpus = corpus;
  }
 
  /* See Javadoc for Suffixes interface.*/
  public Cache<Pattern,MatchedHierarchicalPhrases> getCachedHierarchicalPhrases() {
    return hierarchicalPhraseCache;
  }

  /* See Javadoc for Suffixes interface.*/
  public Cache<Pattern,List<Rule>> getCachedRules() {
    return this.ruleCache;
  }
 
  /* See Javadoc for Suffixes interface.*/
  public MatchedHierarchicalPhrases createHierarchicalPhrases(Pattern pattern, int minNonterminalSpan, int maxPhraseSpan) {
   
    if (hierarchicalPhraseCache.containsKey(pattern)) {
      return hierarchicalPhraseCache.get(pattern);
    } else {

      int arity = pattern.arity();
      int size = pattern.size();
      int[] patternTokens = pattern.getWordIDs();
     
      SymbolTable vocab = corpus.getVocabulary();
     
      if (arity==0) {
        int[] bounds = this.findPhrase(pattern, 0, pattern.size(), 0, this.size()-1);
        int[] startPositions = this.getAllPositions(bounds);
        MatchedHierarchicalPhrases result = this.createTriviallyHierarchicalPhrases(startPositions, pattern, vocab);
        return result;
      } else if (arity==size) {
        int[] startPositions = new int[]{};
        MatchedHierarchicalPhrases result = this.createTriviallyHierarchicalPhrases(startPositions, pattern, vocab);
        return result;
      } else if (arity==1 && pattern.startsWithNonterminal()) {
        int[] terminals = new int[size-1];
        for (int i=1; i<size; i++) {
          terminals[i-1] = patternTokens[i];
        }
        Pattern terminalsPattern = new Pattern(vocab, terminals);
        MatchedHierarchicalPhrases terminalsMatch = this.createHierarchicalPhrases(terminalsPattern, minNonterminalSpan, maxPhraseSpan);
        MatchedHierarchicalPhrases result = terminalsMatch.copyWithInitialX();
        hierarchicalPhraseCache.put(pattern, result);
        return result;
      } else if (arity==1 && pattern.endsWithNonterminal()) {
        int[] terminals = new int[size-1];
        for (int i=0, n=size-1; i<n; i++) {
          terminals[i] = patternTokens[i];
        }
        Pattern terminalsPattern = new Pattern(vocab, terminals);
        MatchedHierarchicalPhrases terminalsMatch = this.createHierarchicalPhrases(terminalsPattern, minNonterminalSpan, maxPhraseSpan);
        MatchedHierarchicalPhrases result = terminalsMatch.copyWithFinalX();
        hierarchicalPhraseCache.put(pattern, result);
        return result;
//        int[] bounds = this.findPhrase(pattern, 0, size, 0, this.size());
//        int[] startPositions = this.getAllPositions(bounds);
////        Pattern patternX = new Pattern(pattern, PrefixTree.X);
//        MatchedHierarchicalPhrases result = this.createHierarchicalPhrases(startPositions, pattern, vocab);
//        return result;
      else {
       
        int[] prefixTokens = new int[patternTokens.length - 1];
        for (int i=0, n=patternTokens.length-1; i<n; i++) {
          prefixTokens[i] = patternTokens[i];
        }
       
        int[] suffixTokens = new int[patternTokens.length - 1];
        for (int i=1, n=patternTokens.length; i<n; i++) {
          suffixTokens[i-1] = patternTokens[i];
        }
       
        Pattern prefix = new Pattern(vocab, prefixTokens);
        Pattern suffix = new Pattern(vocab, suffixTokens);
       
        MatchedHierarchicalPhrases prefixMatches = createHierarchicalPhrases(prefix, minNonterminalSpan, maxPhraseSpan);
        MatchedHierarchicalPhrases suffixMatches = createHierarchicalPhrases(suffix, minNonterminalSpan, maxPhraseSpan);
       
        MatchedHierarchicalPhrases result =
          HierarchicalPhrases.queryIntersect(
              pattern, prefixMatches, suffixMatches,
              minNonterminalSpan, maxPhraseSpan, this);
     
        hierarchicalPhraseCache.put(pattern, result);
        return result;
      }
    }
   
  }
 
  /* See Javadoc for Suffixes interface.*/
  public MatchedHierarchicalPhrases createTriviallyHierarchicalPhrases(int[] startPositions,
      Pattern pattern, SymbolTable vocab) {

      if (hierarchicalPhraseCache.containsKey(pattern)) {
        if (logger.isLoggable(Level.FINEST)) logger.finest("Cache has " + hierarchicalPhraseCache.size() + " entries, and did contain pattern:      " + pattern.toString());
        return hierarchicalPhraseCache.get(pattern);
      } else {
        if (logger.isLoggable(Level.FINEST)) logger.finest("Cache has " + hierarchicalPhraseCache.size() + " entries, but did not contain pattern:  " + pattern.toString());
        // In the case of contiguous phrases,
        //   the hpCache is essentially acting as Adam's Inverted Index,
        //   because it stores the corpus-sorted indexes of each of the phrases. 
        // It differs because it creates a HierarchicalPhrases object rather than just int[].
        Arrays.sort(startPositions);
        HierarchicalPhrases hierarchicalPhrases = new HierarchicalPhrases(pattern, startPositions, getCorpus().getSentenceIndices(startPositions))
        hierarchicalPhraseCache.put(pattern, hierarchicalPhrases);
        return hierarchicalPhrases;
      }
     
  }

  /* See Javadoc for Suffixes interface.*/
  public int[] findPhrase(Phrase phrase) {
    return findPhrase(phrase, 0, phrase.size());
  }

  /* See Javadoc for Suffixes interface.*/
  public int[] findPhrase(Phrase sentence, int phraseStart, int phraseEnd,
      int lowerBound, int upperBound) {

    int[] bounds = new int[2];
    lowerBound = findPhraseBound(sentence, phraseStart, phraseEnd, lowerBound, upperBound, true);
    if (lowerBound < 0) return null;
    upperBound = findPhraseBound(sentence, phraseStart, phraseEnd, lowerBound, upperBound, false);
    bounds[0]=lowerBound;
    bounds[1]=upperBound;
    return bounds; 
   
  }

  /* See Javadoc for Suffixes interface.*/
  public int[] getAllPositions(int[] bounds) {
    if (bounds != null) {
      int startInSuffixArray = bounds[0];
      int endInSuffixArray = bounds[1];
      int length = endInSuffixArray - startInSuffixArray + 1;
      int[] positions = new int[length];
      for (int i = 0; i < length; i++) {
        positions[i] = getCorpusIndex(i+startInSuffixArray);
      }
      Arrays.sort(positions);
      return positions;
    } else {
      return new int[0];
    }
  }

  /* See Javadoc for Suffixes interface.*/
  public Corpus getCorpus() {
    return corpus;
  }

  /* See Javadoc for Suffixes interface.*/
  public abstract int getCorpusIndex(int suffixIndex);

  /* See Javadoc for Suffixes interface.*/
  public MatchedHierarchicalPhrases getMatchingPhrases(Pattern pattern) {
    return hierarchicalPhraseCache.get(pattern);
  }

  /* See Javadoc for Suffixes interface.*/
  public int getSentenceIndex(int corpusIndex) {
    return corpus.getSentenceIndex(corpusIndex);
  }

  /* See Javadoc for Suffixes interface.*/
  public int getSentencePosition(int sentenceIndex) {
    return corpus.getSentencePosition(sentenceIndex);
  }

  /* See Javadoc for Suffixes interface.*/
  public SymbolTable getVocabulary() {
    return corpus.getVocabulary();
  }

  /* See Javadoc for Suffixes interface.*/
  public void cacheMatchingPhrases(MatchedHierarchicalPhrases matchings) { 
    hierarchicalPhraseCache.put(matchings.getPattern(), matchings)
  }

  /* See Javadoc for Suffixes interface.*/
  public abstract int size();

 
 
  /**
   * Finds a phrase in the suffix array. The phrase is extracted
   * from the sentence given the start and end points.
   *
   * @param sentence    the sentence/superphrase to draw the
   *                    search phrase from
   * @param phraseStart the start of the phrase in the sentence
   *                    (inclusive)
   * @param phraseEnd   the end of the phrase in the sentence
   *                    (exclusive)
   * @return a tuple containing the (inclusive) start and the
   *         (inclusive) end bounds in the suffix array for
   *         the phrase
   */
  protected int[] findPhrase(Phrase sentence, int phraseStart, int phraseEnd) {
    return findPhrase(sentence, phraseStart, phraseEnd, 0, size()-1);
  }
 
  /**
   * Finds the first or last occurrence of a phrase in the
   * suffix array, within a subset of the suffix array that
   * is bounded by suffixArrayStart and suffixArrayEnd. For
   * efficiency of looking up all subphrases in a sentence
   * we do not require that multplie int[]s be created for
   * each subphrase. Instead this method will look for the
   * subphrase within the sentence between phraseStart and
   * phraseEnd.
   *
   * @param sentence    the sentence/superphrase in int
   *                    representation to draw the search
   *                    phrase from
   * @param phraseStart the start of the phrase in the sentence
   *                    (inclusive)
   * @param phraseEnd   the end of the phrase in the sentence
   *                    (exclusive)
   * @param suffixArrayStart the point at which to start the
   *                    search in the suffix array
   * @param suffixArrayEnd the end point in the suffix array
   *                    beyond which the search doesn't need
   *                    to take place
   * @param findFirst   a flag that indicates whether we
   *                    should find the first or last occurrence
   *                    of the phrase
   */
  private int findPhraseBound(
    Phrase  sentence,
    int     phraseStart,
    int     phraseEnd,
    int     suffixArrayStart,
    int     suffixArrayEnd,
    boolean findFirst
  ) {
    int low = suffixArrayStart;
    int high = suffixArrayEnd;
   
    // Do a binary search between the low and high points
    while (low <= high) {
      int mid = (low + high) >>> 1;
      int start = getCorpusIndex(mid);
      int diff = corpus.comparePhrase(start, sentence, phraseStart, phraseEnd);
      if (diff == 0) {
        // If the difference between the search phrase and the phrase in the corpus
        // is 0, then we have found it.  However, there might be multiple matches in
        // the corpus, so we need to continue searching until we find the end point
        int neighbor = mid;
        if (findFirst) {
          neighbor--;
        } else {
          neighbor++;
        }
        if (neighbor >= suffixArrayStart && neighbor <= suffixArrayEnd) {
          int nextDiff = corpus.comparePhrase(getCorpusIndex(neighbor), sentence, phraseStart, phraseEnd);
          if (nextDiff == 0) {
            // There's another equivalent phrase, so we need to specify
            // in which direction to continue searching
            if (findFirst) {
              diff = 1; //search lower
            } else {
              diff = -1; //search higher
            }
          }
        }
      }
      if (diff < 0) {
        low = mid + 1;
      } else if (diff > 0) {
        high = mid - 1;
      } else {
        return mid; //this is the edge
      }
    }
    return -1; // key not found.
  }
}
TOP

Related Classes of joshua.corpus.suffix_array.AbstractSuffixArray

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.