package edu.umd.hooka;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.io.WritableComparable;
import edu.umd.hooka.alignment.aer.ReferenceAlignment;
/**
*
* @author chris
*
* Notes:
* This class represents a pair of phrases, one from target language, one from source language, and an alignment
*
*/
public class PhrasePair implements WritableComparable, Cloneable {
private Phrase f;
private Phrase e;
private Alignment a;
private AlignmentPosteriorGrid g;
public Object clone() {
Phrase nf = (Phrase)f.clone();
Phrase ne = (Phrase)e.clone();
Alignment na = (Alignment)a.clone();
return new PhrasePair(nf, ne, na);
}
public int compareTo(Object o) {
PhrasePair that = (PhrasePair)o;
int c = that.f.compareTo(this.f);
if (c != 0) { return c; }
c = that.e.compareTo(this.e);
return c;
}
public int hashCode() {
return f.hashCode() * 31 + f.size();
}
public PhrasePair getTranspose() {
PhrasePair res = new PhrasePair(e, f, a);
return res;
}
public PhrasePair() {
e = new Phrase();
f = new Phrase();
a = null;
}
public PhrasePair(Phrase f, Phrase e) {
this.f = f;
this.e = e;
this.a = null;
}
public PhrasePair(Phrase f, Phrase e, Alignment a) {
this.f = f;
this.e = e;
this.a = a;
}
public Alignment getAlignment() {
return a;
}
public boolean equals(Object o) {
if (!(o instanceof PhrasePair)) {
return false;
}
PhrasePair that = (PhrasePair)o;
if (this.a != null) {
if (that == null || !that.a.equals(this.a))
return false;
} else {
if (that.a != null) return false;
}
return (e.equals(that.e) && f.equals(that.f));
}
public PhrasePair(String f, Vocab vocF, String e, Vocab vocE, String a)
{
this.f = Phrase.fromString(1, f, vocF);
this.e = Phrase.fromString(0, e, vocE);
if (a != null || !a.equals("")) {
this.a = new Alignment(this.f.size(), this.e.size(), a);
}
}
public float ratioFtoE() {
return ((float)this.f.size()) / ((float)this.e.size());
}
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append("{F:").append(f).append(" ||| E:").append(e);
if (a != null) { sb.append(" ||| A: ").append(a); }
sb.append("}");
return sb.toString();
}
public void mergeEnglishWords(int i, int j, int newE) {
int elen = e.size();
if (j >= elen)
throw new IllegalArgumentException("mergeEnglishWords argument out of range j=" + j);
if (i >= elen)
throw new IllegalArgumentException("mergeEnglishWords argument out of range i=" + i);
if (i == j)
throw new IllegalArgumentException("i cannot equal j");
int[] nep = new int[elen - 1];
int[] ep = e.getWords();
int d=0;
for (int k = 0; k < elen-1; k++) {
if ((k+d) == i) {
nep[k] = newE;
continue;
}
if (k == j)
d++;
nep[k] = ep[k+d];
}
e = new Phrase(nep,e.getLanguage());
if (a != null)
a = a.mergeEnglishWords(i, j);
}
public void splitEnglishWords(int i, int newE1, int newE2) {
int elen = e.size();
if (i >= elen)
throw new IllegalArgumentException("splitEnglishWords argument out of range: " + i);
int[] nep = new int[elen + 1];
int[] ep = e.getWords();
for (int k = 0; k < elen; k++) {
if (k == i) {
nep[k] = newE1;
nep[k+1] = newE2;
} else if (k < i)
nep[k] = ep[k];
else if (k > i)
nep[k+1] = ep[k];
}
e = new Phrase(nep,e.getLanguage());
if (a != null)
a = a.splitEnglishWords(i);
}
public void splitForeignWords(int j, int newF1, int newF2) {
int flen = f.size();
if (j >= flen)
throw new IllegalArgumentException("splitForeignWords argument out of range: " + j);
int[] nfp = new int[flen + 1];
int[] fp = f.getWords();
for (int k = 0; k < flen; k++) {
if (k == j) {
nfp[k] = newF1;
nfp[k+1] = newF2;
} else if (k < j)
nfp[k] = fp[k];
else if (k > j)
nfp[k+1] = fp[k];
}
f = new Phrase(nfp,f.getLanguage());
if (a != null)
a = a.splitForeignWords(j);
}
public String toString(Vocab vocF, Vocab vocE) {
StringBuffer sb = new StringBuffer();
sb.append(f.toString(vocF)).append(" ||| ").append(e.toString(vocE));
if (hasAlignment()) {
sb.append(" ||| ").append(a.toString());
}
return sb.toString();
}
public Phrase getE() {
return e;
}
public Phrase getF() {
return f;
}
public void setE(Phrase e) {
this.e = e;
}
public void setF(Phrase f) {
this.f = f;
}
public boolean hasAlignment() {
return a != null;
}
public void setAlignment(Alignment a) {
if (a == null) { this.a = null; return; }
if (a.getELength() != e.size() ||
a.getFLength() != f.size())
throw new IllegalArgumentException("Mismatch p.e="+ e.size() + "a.e=" + a.getELength() + " p.f=" + f.size() + " a.f=" + a.getFLength() );
this.a = a;
}
public void readFields(DataInput in) throws IOException {
f.readFields(in);
e.readFields(in);
byte at = in.readByte();
a=null;
if (at != 0) {
//System.out.println("Reading " +f.size() + "--" + e.size());
if (at == 1)
a = new Alignment(f.size(), e.size());
else if (at == 2)
a = new ReferenceAlignment(f.size(), e.size());
else
throw new IOException("bad format! at="+at);
a.readFields(in);
assert(a.getELength() == e.getWords().length);
assert(a.getFLength() == f.getWords().length);
}
boolean hasg = in.readBoolean();
if (hasg) {
g = new AlignmentPosteriorGrid(this);
g.readFiles(in);
}
}
public void write(DataOutput out) throws IOException {
f.write(out);
e.write(out);
if (hasAlignment()) {
out.writeByte(a.getType());
a.write(out);
} else {
out.writeByte(0);
}
if (hasAlignmentPosteriors()) {
out.writeBoolean(true);
g.write(out);
} else {
out.writeBoolean(false);
}
}
public boolean hasAlignmentPosteriors() {
return (g != null);
}
public AlignmentPosteriorGrid getAlignmentPosteriorGrid() {
return g;
}
public void setAlignmentPosteriorGrid(AlignmentPosteriorGrid g) {
this.g = g;
}
public static final class SubPhraseCoordinates {
public int e_start;
public int e_end;
public int f_start;
public int f_end;
public SubPhraseCoordinates() {}
public SubPhraseCoordinates(int es,int ee, int fs, int fe) {
e_start = es;
e_end = ee;
f_start = fs;
f_end = fe;
}
public String toString() {
return "<(" + f_start + "," + f_end + ")-(" + e_start + "," + e_end +")>";
}
}
public PhrasePair extractSubPhrasePair(SubPhraseCoordinates c) {
return extractSubPhrasePair(c.f_start, c.f_end, c.e_start, c.e_end);
}
public PhrasePair extractSubPhrasePair(int startF, int endF, int startE, int endE)
{
PhrasePair n = new PhrasePair();
n.e = this.e.getSubPhrase(startE,endE);
n.f = this.f.getSubPhrase(startF,endF);
n.a = new Alignment(endF - startF + 1, endE - startE + 1);
for (int fi = startF; fi <= endF; fi++)
for (int ei = startE; ei <= endE; ei++)
if (this.a.aligned(fi, ei))
n.a.align(fi - startF, ei - startE);
return n;
}
/**
* Returns the smallest consistent phrase pair that contains [e_start,e_end]
* This is not efficient- don't use it where speed counts!
*/
public SubPhraseCoordinates getMinimalConsistentSubPhraseCoordsContainingESpan(int e_start, int e_end) {
int elen = e.size();
int flen = f.size();
int ne_s = e_start;
int ne_e = e_end;
while (ne_s > 0 && !a.isEAligned(ne_s)) { ne_s--; } // is start aligned? if not, keep moving left
while (ne_e < elen && !a.isEAligned(ne_e)) { ne_e++; } // is end aligned? if not, keep moving right
if (ne_s < 0 ) { ne_s = 0; } // make sure left edge isn't less than 0
if (ne_e >= elen) { ne_e = elen-1; } // make sure right edge isn't > len
// at this point, e_start and e_end are aligned to f words, so find the f range
boolean isConsistent = false;
int maxF = -1;
int minF = 9999999;
while(!isConsistent) {
isConsistent = true;
maxF = -1;
minF = 9999999;
//System.err.println("ne_s:" + ne_s +"\tne_e:" + ne_e + "\telen:"+elen);
for (int e = ne_s; e <= ne_e; e++) {
for (int f = 0; f<flen; f++) {
if (a.aligned(f, e)) {
if (f > maxF) maxF = f;
if (f < minF) minF = f;
}
}
if (maxF == -1) { maxF = flen - 1; }
if (minF == 9999999) { minF = 0; }
}
for (int f = minF; f <= maxF; f++) {
for (int e = 0; e<elen; e++) {
if (a.aligned(f, e)) {
if (e > ne_e) {ne_e = e; isConsistent = false; }
if (e < ne_s) {ne_s = e; isConsistent = false; }
}
}
}
}
return new SubPhraseCoordinates(ne_s, ne_e, minF, maxF);
}
public PhrasePair extractMinimalConsistentPhrasePairContainingESpan(int eStart, int eEnd) {
SubPhraseCoordinates spc = getMinimalConsistentSubPhraseCoordsContainingESpan(eStart,eEnd);
return extractSubPhrasePair(spc.f_start, spc.f_end, spc.e_start, spc.e_end);
}
public ArrayList<SubPhraseCoordinates> extractConsistentSubPhraseCoordinates(int maxPhraseLength)
{
ArrayList<SubPhraseCoordinates> res = new ArrayList<SubPhraseCoordinates>();
int _elen = e.size();
int _flen = f.size();
if (!this.hasAlignment())
throw new RuntimeException("Missing alignment");
int[] alignedCountF = new int[_flen];
ArrayList<ArrayList<Integer> > alignedToE = new ArrayList<ArrayList<Integer> >();
for (int i=0; i<_elen; i++)
{
alignedToE.add(new ArrayList<Integer>());
}
java.util.Iterator<Alignment.IntPair> ai = a.iterator();
while (ai.hasNext())
{
Alignment.IntPair pair = ai.next();
int f = pair.f;
int e = pair.e;
alignedToE.get(e).add(f);
alignedCountF[f]++;
}
int[] usedF = new int[alignedCountF.length];
//for (int cc=0; cc<_flen; cc++) {
// System.out.println(" " + cc + ": " + alignedCountF[cc]);
//}
for (int startE=0; startE<_elen; startE++) {
for (int endE=startE; (endE<_elen && endE<startE+maxPhraseLength); endE++)
{
int maxF = -1;
int minF = 9999999;
System.arraycopy(alignedCountF, 0, usedF, 0, usedF.length);
for (int ei=startE; ei<=endE; ei++) {
ArrayList<Integer> alignedToEi = alignedToE.get(ei);
int naei = alignedToEi.size();
for (int i=0; i<naei; i++) {
int fi = alignedToEi.get(i).intValue();
if (fi < minF) { minF = fi; }
if (fi > maxF) { maxF = fi; }
usedF[fi]--;
}
}
if (maxF >= 0 &&
maxF - minF < maxPhraseLength)
{
boolean oob = false;
for (int fi=minF;fi<=maxF && !oob;fi++) {
if (usedF[fi] > 0) { oob = true; }
}
if (!oob) {
for (int startF = minF;
(startF>=0 &&
startF>maxF - maxPhraseLength &&
(startF==minF || alignedCountF[startF]==0)); startF--) {
for (int endF = maxF;
(endF < _flen && endF < startF + maxPhraseLength &&
(endF == maxF || alignedCountF[endF] == 0)); endF++) {
res.add(new SubPhraseCoordinates(startE,endE, startF, endF));
}
}
}
}
}
}
return res;
}
public ArrayList<PhrasePair> extractConsistentPhrasePairs(int maxPhraseLength)
{
ArrayList<SubPhraseCoordinates> pcl = extractConsistentSubPhraseCoordinates(maxPhraseLength);
ArrayList<PhrasePair> res = new ArrayList<PhrasePair>(pcl.size());
for (SubPhraseCoordinates spc : pcl)
res.add(this.extractSubPhrasePair(spc));
return res;
}
}