Package edu.umd.hooka

Source Code of edu.umd.hooka.PhrasePair$SubPhraseCoordinates

package edu.umd.hooka;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.io.WritableComparable;

import edu.umd.hooka.alignment.aer.ReferenceAlignment;

/**
* Represents a pair of phrases, one from the target language and one from the
* source language, together with an optional word alignment between them.
*
* @author chris
*/
public class PhrasePair implements WritableComparable, Cloneable {

  /** F-side phrase (the "foreign" side in method names such as splitForeignWords). */
  private Phrase f;
  /** E-side phrase (the "English" side in method names such as mergeEnglishWords). */
  private Phrase e;
  /** Word alignment between f and e; null when this pair carries no alignment. */
  private Alignment a;
  /** Alignment posterior grid; null when this pair carries none. */
  private AlignmentPosteriorGrid g;
 
  public Object clone() {
    Phrase nf = (Phrase)f.clone();
    Phrase ne = (Phrase)e.clone();
    Alignment na = (Alignment)a.clone();
    return new PhrasePair(nf, ne, na);
  }
  public int compareTo(Object o) {
    PhrasePair that = (PhrasePair)o;
    int c = that.f.compareTo(this.f);
    if (c != 0) { return c; }
    c = that.e.compareTo(this.e);
    return c;
  }
 
  public int hashCode() {
    return f.hashCode() * 31 + f.size();
  }
 
  public PhrasePair getTranspose() {
    PhrasePair res = new PhrasePair(e, f, a);
    return res;
  }
 
  public PhrasePair() {
    e = new Phrase();
    f = new Phrase();
    a = null;
  }
 
  public PhrasePair(Phrase f, Phrase e) {
    this.f = f;
    this.e = e;
    this.a = null;
  }

  public PhrasePair(Phrase f, Phrase e, Alignment a) {
    this.f = f;
    this.e = e;
    this.a = a;
  }
 
  public Alignment getAlignment() {
    return a;
  }
 
  public boolean equals(Object o) {
    if (!(o instanceof PhrasePair)) {
      return false;
    }
    PhrasePair that = (PhrasePair)o;
    if (this.a != null) {
      if (that == null ||  !that.a.equals(this.a))
        return false;
    } else {
      if (that.a != null) return false;
    }
    return (e.equals(that.e) && f.equals(that.f));
  }
 
  public PhrasePair(String f, Vocab vocF, String e, Vocab vocE, String a)
  {
    this.f = Phrase.fromString(1, f, vocF);
    this.e = Phrase.fromString(0, e, vocE);
    if (a != null || !a.equals("")) {
      this.a = new Alignment(this.f.size(), this.e.size(), a);
    }
  }
 
  public float ratioFtoE() {
    return ((float)this.f.size()) / ((float)this.e.size());
  }
 
  public String toString() {
    StringBuffer sb = new StringBuffer();
    sb.append("{F:").append(f).append(" ||| E:").append(e);
    if (a != null) { sb.append(" ||| A: ").append(a); }
    sb.append("}");
    return sb.toString();
  }
  public void mergeEnglishWords(int i, int j, int newE) {
    int elen = e.size();
    if (j >= elen)
      throw new IllegalArgumentException("mergeEnglishWords argument out of range j=" + j);
    if (i >= elen)
      throw new IllegalArgumentException("mergeEnglishWords argument out of range i=" + i);
    if (i == j)
      throw new IllegalArgumentException("i cannot equal j");
    int[] nep = new int[elen - 1];
    int[] ep = e.getWords();
    int d=0;
    for (int k = 0; k < elen-1; k++) {
      if ((k+d) == i) {
        nep[k] = newE;
        continue;
      }
      if (k == j)
        d++;
      nep[k] = ep[k+d];
    }
    e = new Phrase(nep,e.getLanguage());
    if (a != null)
      a = a.mergeEnglishWords(i, j);
  }
  public void splitEnglishWords(int i, int newE1, int newE2) {
    int elen = e.size();
    if (i >= elen)
      throw new IllegalArgumentException("splitEnglishWords argument out of range: " + i);
    int[] nep = new int[elen + 1];
    int[] ep = e.getWords();
    for (int k = 0; k < elen; k++) {
      if (k == i) {
        nep[k]   = newE1;
        nep[k+1] = newE2;
      } else if (k < i)
        nep[k] = ep[k];
      else if (k > i)
        nep[k+1] = ep[k];
    }
    e = new Phrase(nep,e.getLanguage());
    if (a != null)
      a = a.splitEnglishWords(i);
  }
  public void splitForeignWords(int j, int newF1, int newF2) {
    int flen = f.size();
    if (j >= flen)
      throw new IllegalArgumentException("splitForeignWords argument out of range: " + j);
    int[] nfp = new int[flen + 1];
    int[] fp = f.getWords();
    for (int k = 0; k < flen; k++) {
      if (k == j) {
        nfp[k]   = newF1;
        nfp[k+1] = newF2;
      } else if (k < j)
        nfp[k] = fp[k];
      else if (k > j)
        nfp[k+1] = fp[k];
    }
    f = new Phrase(nfp,f.getLanguage());
    if (a != null)
      a = a.splitForeignWords(j);
  }
  public String toString(Vocab vocF, Vocab vocE) {
    StringBuffer sb = new StringBuffer();
    sb.append(f.toString(vocF)).append(" ||| ").append(e.toString(vocE));
    if (hasAlignment()) {
      sb.append(" ||| ").append(a.toString());
    }
    return sb.toString();
  }
 
  public Phrase getE() {
    return e;
  }
 
  public Phrase getF() {
    return f;
  }
 
  public void setE(Phrase e) {
    this.e = e;
  }
 
  public void setF(Phrase f) {
    this.f = f;
  }

  public boolean hasAlignment() {
    return a != null;
  }
 
  public void setAlignment(Alignment a) {
    if (a == null) { this.a = null; return; }
    if (a.getELength() != e.size() ||
        a.getFLength() != f.size())
      throw new IllegalArgumentException("Mismatch p.e="+ e.size() + "a.e=" + a.getELength() + "  p.f=" + f.size() + " a.f=" + a.getFLength() );
    this.a = a;
  }
 
  public void readFields(DataInput in) throws IOException {
    f.readFields(in);
    e.readFields(in);
    byte at = in.readByte();
    a=null;
    if (at != 0) {
      //System.out.println("Reading " +f.size() + "--" + e.size());
      if (at == 1)
        a = new Alignment(f.size(), e.size());
      else if (at == 2)
        a = new ReferenceAlignment(f.size(), e.size());
      else
        throw new IOException("bad format! at="+at);
      a.readFields(in);
      assert(a.getELength() == e.getWords().length);
      assert(a.getFLength() == f.getWords().length);
    }
    boolean hasg = in.readBoolean();
    if (hasg) {
      g = new AlignmentPosteriorGrid(this);
      g.readFiles(in);
    }
  }

  public void write(DataOutput out) throws IOException {
    f.write(out);
    e.write(out);
    if (hasAlignment()) {
      out.writeByte(a.getType());
      a.write(out);
    } else {
      out.writeByte(0);
    }
    if (hasAlignmentPosteriors()) {
      out.writeBoolean(true);
      g.write(out);
    } else {
      out.writeBoolean(false);
    }
  }
 
  public boolean hasAlignmentPosteriors() {
    return (g != null);
  }
 
  public AlignmentPosteriorGrid getAlignmentPosteriorGrid() {
    return g;
  }
 
  public void setAlignmentPosteriorGrid(AlignmentPosteriorGrid g) {
    this.g = g;
  }
   
  public static final class SubPhraseCoordinates {
    public int e_start;
    public int e_end;
    public int f_start;
    public int f_end;
    public SubPhraseCoordinates() {}
    public SubPhraseCoordinates(int es,int ee, int fs, int fe) {
      e_start = es;
      e_end = ee;
      f_start = fs;
      f_end = fe;
    }
   
    public String toString() {
      return "<(" + f_start + "," + f_end + ")-(" + e_start + "," + e_end +")>";
    }
  }

  public PhrasePair extractSubPhrasePair(SubPhraseCoordinates c) {
    return extractSubPhrasePair(c.f_start, c.f_end, c.e_start, c.e_end);
  }

  public PhrasePair extractSubPhrasePair(int startF, int endF, int startE, int endE)
  {
    PhrasePair n = new PhrasePair();
    n.e = this.e.getSubPhrase(startE,endE);
    n.f = this.f.getSubPhrase(startF,endF);
    n.a = new Alignment(endF - startF + 1, endE - startE + 1);
    for (int fi = startF; fi <= endF; fi++)
      for (int ei = startE; ei <= endE; ei++)
        if (this.a.aligned(fi, ei))
          n.a.align(fi - startF, ei - startE);
    return n;
  }

  /**
   * Returns the smallest consistent phrase pair that contains [e_start,e_end]
   * This is not efficient- don't use it where speed counts!
   */
  public SubPhraseCoordinates getMinimalConsistentSubPhraseCoordsContainingESpan(int e_start, int e_end) {
    int elen = e.size();
    int flen = f.size();
    int ne_s = e_start;
    int ne_e = e_end;
    while (ne_s > 0    && !a.isEAligned(ne_s)) { ne_s--; // is start aligned? if not, keep moving left
    while (ne_e < elen && !a.isEAligned(ne_e)) { ne_e++; // is end aligned? if not, keep moving right
    if (ne_s <  0   ) { ne_s = 0;       } // make sure left edge isn't less than 0
    if (ne_e >= elen) { ne_e = elen-1} // make sure right edge isn't > len
    // at this point, e_start and e_end are aligned to f words, so find the f range
    boolean isConsistent = false;
    int maxF = -1;
    int minF = 9999999;
    while(!isConsistent) {
      isConsistent = true;
      maxF = -1;
      minF = 9999999;
      //System.err.println("ne_s:" + ne_s +"\tne_e:" + ne_e + "\telen:"+elen);
      for (int e = ne_s; e <= ne_e; e++) {   
        for (int f = 0; f<flen; f++) {
          if (a.aligned(f, e)) {
            if (f > maxF) maxF = f;
            if (f < minF) minF = f;
          }
        }
        if (maxF == -1) { maxF = flen - 1; }
        if (minF == 9999999) { minF = 0; }
      }
      for (int f = minF; f <= maxF; f++) {   
        for (int e = 0; e<elen; e++) {
          if (a.aligned(f, e)) {
            if (e > ne_e) {ne_e = e; isConsistent = false; }
            if (e < ne_s) {ne_s = e; isConsistent = false; }
          }
        }
      }
    }
    return new SubPhraseCoordinates(ne_s, ne_e, minF, maxF);
  }

  public PhrasePair extractMinimalConsistentPhrasePairContainingESpan(int eStart, int eEnd) {
    SubPhraseCoordinates spc = getMinimalConsistentSubPhraseCoordsContainingESpan(eStart,eEnd);
    return extractSubPhrasePair(spc.f_start, spc.f_end, spc.e_start, spc.e_end);
  }
     
  /**
   * Enumerates the coordinates of sub-phrase pairs that are consistent with
   * the alignment, where neither side is longer than maxPhraseLength words.
   *
   * For every E span [startE, endE] the method finds the smallest F range
   * covering all F words linked to the span. The span is kept only if it has
   * at least one link and no F word inside that range is also linked to an E
   * word outside the span. For each kept span, one result is emitted for the
   * tight F range and for each way of extending it left and/or right over
   * F words that have no links at all, within the length limit.
   *
   * @param maxPhraseLength maximum number of words on either side
   * @return coordinates in the order found (by startE, endE, then startF
   *         descending, endF ascending)
   * @throws RuntimeException if this pair has no alignment
   */
  public ArrayList<SubPhraseCoordinates> extractConsistentSubPhraseCoordinates(int maxPhraseLength)
  {
    ArrayList<SubPhraseCoordinates> res = new ArrayList<SubPhraseCoordinates>();
    int _elen = e.size();
    int _flen = f.size();
    if (!this.hasAlignment())
      throw new RuntimeException("Missing alignment");
   
    // alignedCountF[f] = number of links touching F word f.
    int[] alignedCountF = new int[_flen];
    // alignedToE.get(e) = F indices linked to E word e, in iteration order.
    ArrayList<ArrayList<Integer> > alignedToE = new ArrayList<ArrayList<Integer> >();
    for (int i=0; i<_elen; i++)
    {
      alignedToE.add(new ArrayList<Integer>());
    }
    java.util.Iterator<Alignment.IntPair> ai = a.iterator();
    while (ai.hasNext())
    {
      Alignment.IntPair pair = ai.next();
      // These locals shadow the phrase fields f and e for the rest of this loop body.
      int f = pair.f;
      int e = pair.e;
      alignedToE.get(e).add(f);
      alignedCountF[f]++;
    }
    // Scratch copy of alignedCountF, refreshed for every E span below.
    int[] usedF = new int[alignedCountF.length];
    //for (int cc=0; cc<_flen; cc++) {
    //  System.out.println(" " + cc + ": " + alignedCountF[cc]);
    //}
   
    for (int startE=0; startE<_elen; startE++) {
      for (int endE=startE; (endE<_elen && endE<startE+maxPhraseLength); endE++)
      {
        // -1 / 9999999 are "no link seen yet" sentinels for the F range.
        int maxF = -1;
        int minF = 9999999;
        System.arraycopy(alignedCountF, 0, usedF, 0, usedF.length);
        for (int ei=startE; ei<=endE; ei++) {
          ArrayList<Integer> alignedToEi = alignedToE.get(ei);
          int naei = alignedToEi.size();
          for (int i=0; i<naei; i++) {
            int fi = alignedToEi.get(i).intValue();
            if (fi < minF) { minF = fi; }
            if (fi > maxF) { maxF = fi; }
            // After this loop usedF[fi] counts only the links of fi that go
            // to E words OUTSIDE [startE, endE].
            usedF[fi]--;
          }
        }
       
        // Require at least one link in the span and an F range within the length limit.
        if (maxF >= 0 &&
            maxF - minF < maxPhraseLength)
        {
          // oob = some F word inside [minF, maxF] is linked outside the E span.
          boolean oob = false;
          for (int fi=minF;fi<=maxF && !oob;fi++) {
            if (usedF[fi] > 0) { oob = true; }
          }
          if (!oob) {
            // Grow the F range leftwards, one position at a time, only across
            // F words with no links, while staying within maxPhraseLength of maxF.
            for (int startF = minF;
                     (startF>=0 &&
                  startF>maxF - maxPhraseLength &&
                  (startF==minF || alignedCountF[startF]==0)); startF--) {
              // Likewise grow rightwards across link-free F words, within
              // maxPhraseLength of the current startF.
              for (int endF = maxF;
                  (endF < _flen && endF < startF + maxPhraseLength &&
                   (endF == maxF || alignedCountF[endF] == 0)); endF++) {
                res.add(new SubPhraseCoordinates(startE,endE, startF, endF));
              }           
            }
          }
        }
      }
    }
    return res;
  }
 
  public ArrayList<PhrasePair> extractConsistentPhrasePairs(int maxPhraseLength)
  {
    ArrayList<SubPhraseCoordinates> pcl = extractConsistentSubPhraseCoordinates(maxPhraseLength);
    ArrayList<PhrasePair> res = new ArrayList<PhrasePair>(pcl.size());
    for (SubPhraseCoordinates spc : pcl)
      res.add(this.extractSubPhrasePair(spc));
    return res;
  }
}
TOP

Related Classes of edu.umd.hooka.PhrasePair$SubPhraseCoordinates

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.