/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package joshua.aligner;
import java.io.*;
import java.util.*;
import joshua.corpus.suffix_array.*;
import joshua.corpus.vocab.Vocabulary;
import joshua.corpus.Corpus;
import joshua.corpus.alignment.*;
import joshua.corpus.alignment.mm.MemoryMappedAlignmentGrids;
/**
 * Command-line tool that aligns source sentences with decoder candidate translations.
 * For every candidate it writes the phrasal (many-to-many) alignment read off the
 * candidate's derivation tree, and a word-level alignment in which each many-to-many
 * link is resolved, where possible, against the word alignments of a training corpus.
 * All state is static and is initialised by {@code main}.
 */
public class AlignCandidates {
// Vocabularies built in main from the source and target sides of the training corpus.
private static Vocabulary srcVocab, tgtVocab;
// Training corpus arrays created in main from the same two files.
private static Corpus srcCorpusArray, tgtCorpusArray;
// Suffix arrays over the two corpus arrays; used to look up phrase occurrences.
private static Suffixes srcSA, tgtSA;
// Word alignments of the training corpus; queried in both directions while resolving links.
private static Alignments alignments;
// Cache of resolved links, loaded from and saved to the alignment-cache file by main.
// Key: the phrase-pair string built in resolve(); value: the source-side corpus
// positions of the training occurrence that was accepted for that phrase pair.
private static HashMap<String,TreeSet<Integer>> alreadyResolved_srcSet;
// Same keys as alreadyResolved_srcSet; value: the target-side corpus positions.
private static HashMap<String,TreeSet<Integer>> alreadyResolved_tgtSet;
/**
 * Aligns every decoder candidate to its source sentence and writes two alignment files:
 * the phrasal (many-to-many) alignment taken from the candidate's derivation tree, and
 * a word-level alignment in which every many-to-many link has been passed through
 * {@code resolve}.
 *
 * <p>The single command-line argument names a parameter file. Only the first
 * whitespace-delimited token of each of its first eight lines is used, in this order:
 * <ol>
 *   <li>candidates file (input, read as utf8)</li>
 *   <li>phrasal source-candidate alignment file (output)</li>
 *   <li>word-level source-candidate alignment file (output)</li>
 *   <li>source sentences file (input, read as utf8)</li>
 *   <li>source side of the training corpus</li>
 *   <li>target side of the training corpus</li>
 *   <li>source-target alignments of the training corpus</li>
 *   <li>alignment cache file (loaded if it exists, always rewritten at the end)</li>
 * </ol>
 *
 * <p>Each candidate line must have the form
 * {@code <sentence index> ||| (ROOT{x-y} <derivation tree>) ||| ...}.
 *
 * @param args {@code args[0]} is the name of the parameter file
 * @throws IOException if an input file cannot be read, the parameter file is too short,
 *         a candidate line is malformed, or an output file cannot be opened
 */
public static void main(String[] args) throws IOException {
    if (args.length < 1) {
        System.err.println("Usage: java joshua.aligner.AlignCandidates <paramFile>");
        System.exit(1);
    }
    String paramFileName = args[0];

    // ---- parameter file ----
    String cands_fileName;
    String alignSrcCand_phrasal_fileName;
    String alignSrcCand_word_fileName;
    String source_fileName;
    String trainSrc_fileName;
    String trainTgt_fileName;
    String trainAlign_fileName;
    String alignCache_fileName;
    BufferedReader inFile_params = new BufferedReader(new FileReader(paramFileName));
    try {
        cands_fileName = readParam(inFile_params, "candidates file");
        alignSrcCand_phrasal_fileName = readParam(inFile_params, "phrasal alignment output file");
        alignSrcCand_word_fileName = readParam(inFile_params, "word alignment output file");
        source_fileName = readParam(inFile_params, "source sentences file");
        trainSrc_fileName = readParam(inFile_params, "training corpus source side");
        trainTgt_fileName = readParam(inFile_params, "training corpus target side");
        trainAlign_fileName = readParam(inFile_params, "training corpus alignments");
        alignCache_fileName = readParam(inFile_params, "alignment cache file");
    } finally {
        inFile_params.close();
    }
    // Not configurable at present; kept as locals so the dispatch below stays in place.
    String alignmentsType = "AlignmentGrids";
    int maxCacheSize = 1000;

    // ---- source sentences ----
    int numSentences = countLines(source_fileName);
    String[] srcSentences = new String[numSentences];
    BufferedReader srcFile = new BufferedReader(
            new InputStreamReader(new FileInputStream(new File(source_fileName)), "utf8"));
    try {
        for (int s = 0; s < numSentences; ++s) {
            srcSentences[s] = srcFile.readLine();
        }
    } finally {
        srcFile.close();
    }

    // ---- training data: vocabulary, corpus array and suffix array for each side ----
    println("Creating src vocabulary @ " + (new Date()));
    srcVocab = new Vocabulary();
    int[] sourceWordsSentences = Vocabulary.initializeVocabulary(trainSrc_fileName, srcVocab, true);
    int numSourceWords = sourceWordsSentences[0];
    int numSourceSentences = sourceWordsSentences[1];
    println("Reading src corpus @ " + (new Date()));
    srcCorpusArray = SuffixArrayFactory.createCorpusArray(trainSrc_fileName, srcVocab, numSourceWords, numSourceSentences);
    println("Creating src SA @ " + (new Date()));
    srcSA = SuffixArrayFactory.createSuffixArray(srcCorpusArray, maxCacheSize);

    println("Creating tgt vocabulary @ " + (new Date()));
    tgtVocab = new Vocabulary();
    int[] targetWordsSentences = Vocabulary.initializeVocabulary(trainTgt_fileName, tgtVocab, true);
    int numTargetWords = targetWordsSentences[0];
    int numTargetSentences = targetWordsSentences[1];
    println("Reading tgt corpus @ " + (new Date()));
    tgtCorpusArray = SuffixArrayFactory.createCorpusArray(trainTgt_fileName, tgtVocab, numTargetWords, numTargetSentences);
    println("Creating tgt SA @ " + (new Date()));
    tgtSA = SuffixArrayFactory.createSuffixArray(tgtCorpusArray, maxCacheSize);

    int trainingSize = srcCorpusArray.getNumSentences();
    if (trainingSize != tgtCorpusArray.getNumSentences()) {
        throw new RuntimeException("Source and target corpora have different number of sentences. This is bad.");
    }

    // ---- training alignments ----
    println("Reading alignment data @ " + (new Date()));
    alignments = null;
    if ("AlignmentArray".equals(alignmentsType)) {
        alignments = SuffixArrayFactory.createAlignments(trainAlign_fileName, srcSA, tgtSA);
    } else if ("AlignmentGrids".equals(alignmentsType) || "AlignmentsGrid".equals(alignmentsType)) {
        // NOTE(review): the Scanner is handed to AlignmentGrids and never closed here;
        // whether the constructor consumes it fully is not visible from this file.
        alignments = new AlignmentGrids(new Scanner(new File(trainAlign_fileName)), srcCorpusArray, tgtCorpusArray, trainingSize, true);
    } else if ("MemoryMappedAlignmentGrids".equals(alignmentsType)) {
        alignments = new MemoryMappedAlignmentGrids(trainAlign_fileName, srcCorpusArray, tgtCorpusArray);
    } else {
        // Fail here rather than with a NullPointerException deep inside resolve().
        throw new RuntimeException("Unknown alignments type: " + alignmentsType);
    }

    // ---- cache of previously resolved links ----
    if (!fileExists(alignCache_fileName)) {
        alreadyResolved_srcSet = new HashMap<String,TreeSet<Integer>>();
        alreadyResolved_tgtSet = new HashMap<String,TreeSet<Integer>>();
    } else {
        // NOTE(review): native Java deserialization; only safe while the cache file is
        // one this program wrote itself.
        ObjectInputStream in = null;
        try {
            in = new ObjectInputStream(new FileInputStream(alignCache_fileName));
            // The file holds exactly the two maps written at the end of this method.
            @SuppressWarnings("unchecked")
            HashMap<String,TreeSet<Integer>> cachedSrc = (HashMap<String,TreeSet<Integer>>)in.readObject();
            @SuppressWarnings("unchecked")
            HashMap<String,TreeSet<Integer>> cachedTgt = (HashMap<String,TreeSet<Integer>>)in.readObject();
            alreadyResolved_srcSet = cachedSrc;
            alreadyResolved_tgtSet = cachedTgt;
        } catch (FileNotFoundException e) {
            System.err.println("FileNotFoundException in AlignCandidates.main(String[]): " + e.getMessage());
            System.exit(99901);
        } catch (IOException e) {
            System.err.println("IOException in AlignCandidates.main(String[]): " + e.getMessage());
            System.exit(99902);
        } catch (ClassNotFoundException e) {
            System.err.println("ClassNotFoundException in AlignCandidates.main(String[]): " + e.getMessage());
            System.exit(99904);
        } finally {
            if (in != null) in.close();
        }
    }

    // ---- candidates ----
    println("Processing candidates @ " + (new Date()));
    PrintWriter outFile_alignSrcCand_phrasal = new PrintWriter(alignSrcCand_phrasal_fileName);
    PrintWriter outFile_alignSrcCand_word = null;
    BufferedReader candsFile = null;
    try {
        outFile_alignSrcCand_word = new PrintWriter(alignSrcCand_word_fileName);
        candsFile = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(cands_fileName)), "utf8"));

        int prev_i = -1;
        String[] srcWords = null;
        int candsRead = 0;
        int C50count = 0; // candidates seen for the current sentence since the last progress message

        for (String line = candsFile.readLine(); line != null; line = candsFile.readLine()) {
            ++candsRead;
            println("Read candidate on line #" + candsRead);

            int firstSep = line.indexOf("|||");
            if (firstSep < 0) {
                throw new IOException("Malformed candidate on line #" + candsRead + " of " + cands_fileName + ": no \"|||\" separator");
            }
            int i = toInt(line.substring(0, firstSep).trim());
            if (i < 0 || i >= numSentences) {
                throw new IOException("Candidate on line #" + candsRead + " of " + cands_fileName
                        + " refers to sentence " + i + ", but " + source_fileName + " has " + numSentences + " sentences");
            }
            if (i != prev_i) {
                srcWords = srcSentences[i].split("\\s+");
                prev_i = i;
                println("New value for i: " + i + " seen @ " + (new Date()));
                C50count = 0;
            } else {
                ++C50count;
            }

            String rest = line.substring(firstSep + 3).trim(); // drop the sentence index field
            int secondSep = rest.indexOf("|||");
            if (secondSep < 0) {
                throw new IOException("Malformed candidate on line #" + candsRead + " of " + cands_fileName + ": derivation field is not terminated by \"|||\"");
            }
            String cand = rest.substring(0, secondSep).trim();
            cand = cand.substring(cand.indexOf(" ") + 1, cand.length() - 1); // trim "(ROOT{x-y} " and ")"

            JoshuaDerivationTree DT = new JoshuaDerivationTree(cand, 0);
            String[] candWords = DT.toSentence().split("\\s+");

            // phrasal alignment: many-to-many links are written as they come
            String alignSrcCand = DT.alignments();
            outFile_alignSrcCand_phrasal.println(alignSrcCand);
            println(" i = " + i + ", alignSrcCand: " + alignSrcCand);

            // word alignment: many-to-many links are resolved against the training data
            String alignSrcCand_res = resolveLinks(alignSrcCand, srcWords, candWords);
            println(" i = " + i + ", alignSrcCand_res: " + alignSrcCand_res);
            outFile_alignSrcCand_word.println(alignSrcCand_res);

            if (C50count == 50) {
                println("50C @ " + (new Date()));
                C50count = 0;
            }
        }
    } finally {
        outFile_alignSrcCand_phrasal.close();
        if (outFile_alignSrcCand_word != null) outFile_alignSrcCand_word.close();
        if (candsFile != null) candsFile.close();
    }
    println("Finished processing candidates @ " + (new Date()));

    // ---- persist the cache, including links resolved during this run ----
    ObjectOutputStream out = null;
    try {
        out = new ObjectOutputStream(new FileOutputStream(alignCache_fileName));
        out.writeObject(alreadyResolved_srcSet);
        out.writeObject(alreadyResolved_tgtSet);
        out.flush();
    } catch (IOException e) {
        System.err.println("IOException in AlignCandidates.main(String[]): " + e.getMessage());
        System.exit(99902);
    } finally {
        if (out != null) out.close();
    }
} // main

/**
 * Reads the next line of the parameter file and returns its first whitespace-delimited token.
 *
 * @param params open reader on the parameter file
 * @param description what the line is supposed to name; used in the error message only
 * @return the first token of the line
 * @throws IOException if the file has no more lines or the line is blank
 */
private static String readParam(BufferedReader params, String description) throws IOException {
    String line = params.readLine();
    if (line == null) {
        throw new IOException("Parameter file ended before the line naming the " + description);
    }
    String token = line.trim().split("\\s+")[0];
    if (token.length() == 0) {
        throw new IOException("Parameter file has a blank line where the " + description + " was expected");
    }
    return token;
}

/**
 * Turns a phrasal alignment string into a word-level one: links without a comma are
 * already one-to-one and only have "--" shortened to "-"; all others go through resolve().
 *
 * @param alignSrcCand whitespace-separated links as returned by the derivation tree
 * @param srcWords words of the source sentence
 * @param candWords words of the candidate sentence
 * @return the converted links, separated by single spaces
 */
private static String resolveLinks(String alignSrcCand, String[] srcWords, String[] candWords) {
    StringBuilder resolved = new StringBuilder();
    String[] links = alignSrcCand.split("\\s+");
    for (int k = 0; k < links.length; ++k) {
        String link = links[k];
        resolved.append(' ');
        if (link.indexOf(',') == -1) {
            resolved.append(link.replaceFirst("--", "-"));
        } else {
            resolved.append(resolve(link, srcWords, candWords));
        }
    }
    return resolved.toString().trim();
}
/**
 * Counts the lines of a text file.
 * The file is decoded as utf8, the same way main() decodes the sentences it later
 * reads from it, so both passes agree on where lines end.
 * On any I/O failure an error is printed to stderr and the JVM exits with status 99902.
 *
 * @param fileName path of the file to scan
 * @return the number of lines; 0 for an empty file
 */
static private int countLines(String fileName)
{
    int count = 0;
    try {
        BufferedReader inFile = new BufferedReader(
                new InputStreamReader(new FileInputStream(fileName), "utf8"));
        try {
            while (inFile.readLine() != null) {
                ++count;
            }
        } finally {
            // also runs when readLine() throws, so the descriptor is never leaked
            inFile.close();
        }
    } catch (IOException e) {
        System.err.println("IOException in AlignCandidates.countLines(String): " + e.getMessage());
        System.exit(99902);
    }
    return count;
}
/**
 * Debugging aid: parses a derivation-tree string and prints the sentence it yields,
 * its target word count, its string form (with a round-trip check against the input)
 * and its alignments.
 *
 * @param PTS the derivation tree in its textual form
 */
static public void testJoshuaDerivationTree(String PTS)
{
    JoshuaDerivationTree tree = new JoshuaDerivationTree(PTS, 0);

    println("T.toSentence() is:");
    println(" " + tree.toSentence());

    println("root.numTgtWords: " + tree.numTgtWords);

    println("T.toString() is:");
    println(" " + tree);

    // the tree should print back to exactly the string it was parsed from
    println(PTS.equals(tree.toString()) ? "toString is A-OK" : "PROBLEM in toString!");

    println("Alignments:");
    println(tree.alignments());
    println("");
}
/**
 * Resolves one many-to-many phrasal link into word-level links.
 *
 * <p>The link has the form {@code s1,s2,...--c1,c2,...}, listing source and candidate
 * word indices. Consecutive indices on each side are grouped into phrases. The training
 * corpus is then searched for a sentence pair that contains all source phrases and all
 * target phrases, in order and without overlap, and in which none of the phrase words
 * is aligned to a word outside the phrases of the other side. The first such occurrence
 * is cached under the phrase-pair key and its word alignments are mapped back onto the
 * original indices by {@code finalResolve}.
 *
 * @param link the many-to-many link to resolve
 * @param srcWords words of the source sentence
 * @param tgtWords words of the candidate sentence
 * @return the word-level links, or {@code link} unchanged if no suitable training
 *         occurrence exists
 */
static private String resolve(String link, String[] srcWords, String[] tgtWords)
{
    println(" Resolving " + link);
    int sep = link.indexOf("--");
    String SrcSide = link.substring(0, sep);
    String CandSide = link.substring(sep + 2);
    String[] srcPhrases_str = indicesToPhrases(SrcSide, srcWords);
    String[] tgtPhrases_str = indicesToPhrases(CandSide, tgtWords);
    int[] origSrcIndices = toInt(SrcSide.split(","));
    int[] origCandIndices = toInt(CandSide.split(","));

    // Key format: "<src phrases, space separated>__<tgt phrases, space separated>".
    // NOTE(review): phrase boundaries are not encoded, so ["a b","c"] and ["a","b c"]
    // share a key. The format is kept because cache files already on disk use it.
    StringBuilder key = new StringBuilder();
    for (int w = 0; w < srcPhrases_str.length; ++w) key.append(' ').append(srcPhrases_str[w]);
    key.append("__");
    for (int w = 0; w < tgtPhrases_str.length; ++w) key.append(tgtPhrases_str[w]).append(' ');
    String cacheKey = key.toString().trim();

    BasicPhrase[] srcPhrases = strToPhrase(srcPhrases_str, srcVocab);
    BasicPhrase[] tgtPhrases = strToPhrase(tgtPhrases_str, tgtVocab);
    int[] srcPhrases_len = phraseLenghts(srcPhrases);
    int[] tgtPhrases_len = phraseLenghts(tgtPhrases);
    int srcPhCount = srcPhrases.length;
    int tgtPhCount = tgtPhrases.length;
    println(" srcPhCount: " + srcPhCount + ", tgtPhCount: " + tgtPhCount);

    if (alreadyResolved_srcSet.containsKey(cacheKey)) {
        println(" Using cached result (for " + cacheKey + ")");
        return finalResolve(alreadyResolved_srcSet.get(cacheKey), alreadyResolved_tgtSet.get(cacheKey),
                origSrcIndices, origCandIndices);
    }

    print(" Extracting xxxPhPos...");
    // srcPhPos[p] maps a sentence index to the start positions, within that sentence,
    // of the matches of the p'th source phrase; likewise for tgtPhPos
    TreeMap<Integer,Vector<Integer>>[] srcPhPos = getPosMaps(srcPhrases, srcSA);
    TreeMap<Integer,Vector<Integer>>[] tgtPhPos = getPosMaps(tgtPhrases, tgtSA);
    println("done");

    print(" Intersecting sentence indices...");
    TreeSet<Integer> senIndices = new TreeSet<Integer>(srcPhPos[0].keySet());
    for (int i = 1; i < srcPhCount; ++i) { senIndices = setIntersect(senIndices, new TreeSet<Integer>(srcPhPos[i].keySet())); }
    for (int i = 0; i < tgtPhCount; ++i) { senIndices = setIntersect(senIndices, new TreeSet<Integer>(tgtPhPos[i].keySet())); }
    // every remaining sentence pair contains all the phrases, on both sides
    println("done; intersection has " + senIndices.size() + " indices.");

    for (Integer sen_i : senIndices) {
        @SuppressWarnings("unchecked")
        Vector<Integer>[] srcVecs = new Vector[srcPhCount];
        int[] srcVecs_size = new int[srcPhCount];
        for (int ph = 0; ph < srcPhCount; ++ph) {
            srcVecs[ph] = srcPhPos[ph].get(sen_i);
            srcVecs_size[ph] = srcVecs[ph].size();
        }
        @SuppressWarnings("unchecked")
        Vector<Integer>[] tgtVecs = new Vector[tgtPhCount];
        int[] tgtVecs_size = new int[tgtPhCount];
        for (int ph = 0; ph < tgtPhCount; ++ph) {
            tgtVecs[ph] = tgtPhPos[ph].get(sen_i);
            tgtVecs_size[ph] = tgtVecs[ph].size();
        }
        // one cursor per phrase into its match list; Java zero-initialises both arrays
        int[] srcVecs_i = new int[srcPhCount];
        int[] tgtVecs_i = new int[tgtPhCount];

        // try every combination of one match per phrase
        do {
            if (inOrder(srcVecs, srcVecs_i, srcPhrases_len) && inOrder(tgtVecs, tgtVecs_i, tgtPhrases_len)) {
                // corpus positions the words of the other side are allowed to align to
                TreeSet<Integer> srcIndices_allowed = coveredIndices(srcVecs, srcVecs_i, srcPhrases_len);
                TreeSet<Integer> tgtIndices_allowed = coveredIndices(tgtVecs, tgtVecs_i, tgtPhrases_len);
                if (alignmentsContained(srcIndices_allowed, tgtIndices_allowed)) {
                    alreadyResolved_srcSet.put(cacheKey, srcIndices_allowed);
                    alreadyResolved_tgtSet.put(cacheKey, tgtIndices_allowed);
                    return finalResolve(srcIndices_allowed, tgtIndices_allowed, origSrcIndices, origCandIndices);
                }
            }
            // moves to the next combination; sets srcVecs_i[0] to -1 once all are exhausted
            advance(srcVecs_i, tgtVecs_i, srcVecs_size, tgtVecs_size);
        } while (srcVecs_i[0] != -1);
    }
    return link;
}

/**
 * Tells whether the selected phrase matches appear left to right without overlap,
 * i.e. each match ends strictly before the next one starts.
 */
private static boolean inOrder(Vector<Integer>[] vecs, int[] cursor, int[] lens)
{
    for (int ph = 0; ph < vecs.length - 1; ++ph) {
        int end_curr = vecs[ph].elementAt(cursor[ph]) + lens[ph] - 1;
        int start_next = vecs[ph+1].elementAt(cursor[ph+1]);
        if (end_curr >= start_next) return false;
    }
    return true;
}

/** Collects every corpus position covered by the selected match of each phrase. */
private static TreeSet<Integer> coveredIndices(Vector<Integer>[] vecs, int[] cursor, int[] lens)
{
    TreeSet<Integer> covered = new TreeSet<Integer>();
    for (int ph = 0; ph < vecs.length; ++ph) {
        int start_i = vecs[ph].elementAt(cursor[ph]);
        int final_i = start_i + lens[ph] - 1;
        for (int i = start_i; i <= final_i; ++i) covered.add(i);
    }
    return covered;
}

/**
 * Tells whether the training alignments stay inside the two position sets: no allowed
 * source position aligns to a target position outside tgtIndices_allowed, and vice versa.
 */
private static boolean alignmentsContained(TreeSet<Integer> srcIndices_allowed, TreeSet<Integer> tgtIndices_allowed)
{
    for (Integer i : srcIndices_allowed) {
        int[] tgtIndices = alignments.getAlignedTargetIndices(i);
        if (tgtIndices != null) {
            for (int j = 0; j < tgtIndices.length; ++j) {
                if (!tgtIndices_allowed.contains(tgtIndices[j])) return false;
            }
        }
    }
    for (Integer i : tgtIndices_allowed) {
        int[] srcIndices = alignments.getAlignedSourceIndices(i);
        if (srcIndices != null) {
            for (int j = 0; j < srcIndices.length; ++j) {
                if (!srcIndices_allowed.contains(srcIndices[j])) return false;
            }
        }
    }
    return true;
}
/**
 * Maps the training-corpus word alignments between the two position sets back onto
 * the indices of the original link.
 * Positions are matched to original indices by rank: the k'th smallest position in
 * srcIndices_allowed stands for origSrcIndices[k], and the k'th smallest position in
 * tgtIndices_allowed stands for origCandIndices[k].
 *
 * @param srcIndices_allowed source-side corpus positions of the matched occurrence
 * @param tgtIndices_allowed target-side corpus positions of the matched occurrence
 * @param origSrcIndices source word indices of the link being resolved
 * @param origCandIndices candidate word indices of the link being resolved
 * @return space-separated "src-tgt" links; empty if no source position is aligned
 */
static private String finalResolve(TreeSet<Integer> srcIndices_allowed, TreeSet<Integer> tgtIndices_allowed, int[] origSrcIndices, int[] origCandIndices)
{
    println("In finalResolve. Sizes: sI_a: " + srcIndices_allowed.size() + ", tI_a: " + tgtIndices_allowed.size() + ", oSI: " + origSrcIndices.length + ", oCI: " + origCandIndices.length);

    // target corpus position -> candidate index of the same rank
    TreeMap<Integer,Integer> toOrigTgt = new TreeMap<Integer,Integer>();
    Iterator<Integer> tgtPositions = tgtIndices_allowed.iterator();
    for (int rank = 0; tgtPositions.hasNext(); ++rank) {
        toOrigTgt.put(tgtPositions.next(), origCandIndices[rank]);
    }

    StringBuilder links = new StringBuilder();
    int srcRank = 0;
    for (Integer srcPos : srcIndices_allowed) {
        int[] aligned = alignments.getAlignedTargetIndices(srcPos);
        if (aligned != null) {
            for (int tgtPos : aligned) {
                links.append(' ').append(origSrcIndices[srcRank]).append('-').append(toOrigTgt.get(tgtPos));
            }
        }
        ++srcRank;
    }
    return links.toString().trim();
}
/**
 * Returns the size of every phrase, in the same order as the input array.
 *
 * @param phrases the phrases to measure
 * @return array whose k'th entry is {@code phrases[k].size()}
 */
static private int[] phraseLenghts(BasicPhrase[] phrases)
{
    final int n = phrases.length;
    int[] sizes = new int[n];
    int k = 0;
    while (k < n) {
        sizes[k] = phrases[k].size();
        ++k;
    }
    return sizes;
}
/**
 * Steps a pair of mixed-radix counters to the next combination.
 * B is the fast counter: it is incremented first, rightmost digit first. Only when B
 * wraps around to all zeros is A incremented in the same way. When A wraps around as
 * well, every combination has been visited and A_i[0] is set to -1 as the end marker
 * (all other digits of both counters are 0 at that point).
 *
 * @param A_i digits of the slow counter, updated in place
 * @param B_i digits of the fast counter, updated in place
 * @param A_size exclusive upper bound of each digit of A_i
 * @param B_size exclusive upper bound of each digit of B_i
 */
static private void advance(int[] A_i, int[] B_i, int[] A_size, int[] B_size)
{
    if (bump(B_i, B_size)) {
        return;
    }
    if (!bump(A_i, A_size)) {
        A_i[0] = -1;
    }
}

/**
 * Increments one counter by one, carrying from the last digit towards the first.
 *
 * @return true if the counter moved to a new value, false if it wrapped to all zeros
 */
private static boolean bump(int[] digits, int[] limits)
{
    int pos = digits.length - 1;
    do {
        if (++digits[pos] != limits[pos]) {
            return true;
        }
        digits[pos] = 0; // overflow: reset this digit and carry into the one to its left
    } while (--pos >= 0);
    return false;
}
/**
 * Computes the intersection of two sets without modifying either of them.
 *
 * @param A first set; its elements are the ones tested
 * @param B second set; consulted for membership
 * @return a new set holding the elements present in both A and B
 */
static private TreeSet<Integer> setIntersect(TreeSet<Integer> A, TreeSet<Integer> B)
{
    TreeSet<Integer> common = new TreeSet<Integer>();
    Iterator<Integer> it = A.iterator();
    while (it.hasNext()) {
        Integer candidate = it.next();
        if (!B.contains(candidate)) {
            continue;
        }
        common.add(candidate);
    }
    return common;
}
/**
 * Locates every occurrence of each phrase in a suffix array, grouped by sentence.
 * Occurrences whose first and last word fall in different sentences are dropped,
 * because a suffix-array match may run across a sentence boundary.
 *
 * @param phrases the phrases to look up
 * @param SA the suffix array to search
 * @return one map per phrase, from sentence index to the corpus start positions of the
 *         phrase's occurrences in that sentence, in the order the suffix array reports them
 */
static private TreeMap<Integer,Vector<Integer>>[] getPosMaps(BasicPhrase[] phrases, Suffixes SA)
{
    @SuppressWarnings("unchecked")
    TreeMap<Integer,Vector<Integer>>[] maps = new TreeMap[phrases.length];
    for (int n = 0; n < maps.length; ++n) {
        BasicPhrase phrase = phrases[n];
        TreeMap<Integer,Vector<Integer>> bySentence = new TreeMap<Integer,Vector<Integer>>();
        maps[n] = bySentence;

        int lastWordOffset = phrase.size() - 1;
        int[] starts = SA.getAllPositions(SA.findPhrase(phrase));
        for (int start : starts) {
            int sentence = SA.getSentenceIndex(start);
            if (SA.getSentenceIndex(start + lastWordOffset) != sentence) {
                continue; // match straddles two sentences
            }
            Vector<Integer> inSentence = bySentence.get(sentence);
            if (inSentence == null) {
                inSentence = new Vector<Integer>();
                bySentence.put(sentence, inSentence);
            }
            inSentence.add(start);
        }
    }
    return maps;
}
/**
 * Groups a comma-separated list of word indices into phrases.
 * A run of indices that each exceed their predecessor by exactly one forms one phrase;
 * any other step starts a new phrase. Each phrase is the space-joined words at its indices.
 *
 * @param indices comma-separated word indices, e.g. "3,4,7"
 * @param words the sentence the indices point into
 * @return the phrases, in the order their indices appear
 */
static private String[] indicesToPhrases(String indices, String[] words)
{
    int[] idx = toInt(indices.split(","));
    int expected = gapCount(idx) + 1;
    String[] result = new String[expected];

    int slot = 0;
    String current = words[idx[0]];
    for (int k = 1; k < idx.length; ++k) {
        String word = words[idx[k]];
        boolean adjacent = (idx[k] == idx[k-1] + 1);
        if (adjacent) {
            current = current + " " + word;
        } else {
            // gap: close the running phrase and open a new one
            result[slot++] = current;
            current = word;
        }
    }
    result[slot] = current;

    // sanity check: the last slot filled should be the last slot allocated
    if (slot != expected - 1) {
        println("MISMATCH: ph_i = " + slot + "; phraseCount - 1 = " + (expected-1));
    }
    return result;
}
/**
 * Counts the places where an index is not its predecessor plus one.
 *
 * @param indices the index sequence; may be null
 * @return the number of such breaks; 0 for null or for fewer than two indices
 */
static private int gapCount(int[] indices)
{
    if (indices == null || indices.length < 2) {
        return 0;
    }
    int gaps = 0;
    for (int k = 1; k < indices.length; ++k) {
        boolean contiguous = (indices[k] == indices[k-1] + 1);
        if (!contiguous) {
            gaps += 1;
        }
    }
    return gaps;
}
/**
 * Builds one BasicPhrase per phrase string, all against the same vocabulary.
 *
 * @param phrases_str the phrase strings
 * @param vocab the vocabulary handed to each BasicPhrase constructor
 * @return the phrases, in input order
 */
static private BasicPhrase[] strToPhrase(String[] phrases_str, Vocabulary vocab)
{
    BasicPhrase[] built = new BasicPhrase[phrases_str.length];
    int k = 0;
    for (String text : phrases_str) {
        built[k] = new BasicPhrase(text, vocab);
        ++k;
    }
    return built;
}
/** Writes the object's string form to standard output, followed by a line break. */
static private void println(Object obj)
{
    final PrintStream stdout = System.out;
    stdout.println(obj);
}
/** Writes the object's string form to standard output, without a line break. */
static private void print(Object obj)
{
    final PrintStream stdout = System.out;
    stdout.print(obj);
}
/**
 * Parses a decimal integer.
 *
 * @throws NumberFormatException if the string is not a valid integer
 */
static private int toInt(String str)
{
    return Integer.parseInt(str);
}

/**
 * Parses every element of a string array as a decimal integer.
 *
 * @param strA the strings to parse
 * @return the parsed values, in input order
 * @throws NumberFormatException if any element is not a valid integer
 */
static private int[] toInt(String[] strA)
{
    int[] parsed = new int[strA.length];
    int k = 0;
    for (String s : strA) {
        parsed[k++] = toInt(s);
    }
    return parsed;
}
/**
 * Tells whether a path names something that exists on disk.
 *
 * @param fileName the path to check; may be null
 * @return false for a null name, otherwise whatever {@code File.exists()} reports
 */
static private boolean fileExists(String fileName)
{
    return fileName != null && new File(fileName).exists();
}
}