// Source code of joshua.aligner.AlignCandidates

/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or 
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but 
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this library; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */


package joshua.aligner;


import java.io.*;
import java.util.*;


import joshua.corpus.suffix_array.*;
import joshua.corpus.vocab.Vocabulary;
import joshua.corpus.Corpus;
import joshua.corpus.alignment.*;
import joshua.corpus.alignment.mm.MemoryMappedAlignmentGrids;


public class AlignCandidates {


  // Vocabularies of the source and target sides of the training corpus; built in main().
  private static Vocabulary srcVocab, tgtVocab;
  // Corpus arrays created in main() from the training files and the vocabularies above.
  private static Corpus srcCorpusArray, tgtCorpusArray;
  // Suffix arrays over the two corpus arrays; resolve() searches them for phrase occurrences.
  private static Suffixes srcSA, tgtSA;
  // Word alignments of the training corpus; queried for the positions aligned to a given position.
  private static Alignments alignments;


  // Caches of successfully resolved many-to-many links, both keyed by the cache key that
  // resolve() builds from the source and candidate phrases. The values are the sets of
  // training-corpus positions (source side / target side) of the occurrence that resolved
  // the link. main() loads both maps from the alignment cache file when it exists and
  // writes them back to that file after all candidates have been processed.
  private static HashMap<String,TreeSet<Integer>> alreadyResolved_srcSet;
  private static HashMap<String,TreeSet<Integer>> alreadyResolved_tgtSet;


  
  public static void main(String[] args) throws IOException {


/*
    testJoshuaDerivationTree("(S{0-12} (S{0-11} (S{0-8} (X{0-8} (X{0-3} official (X{1-2} forecasts) are) based on (X{4-7} (X{4-5} only) 3 per cent))) (X{8-11} reported (X{8-9} ,) (X{10-11} bloomberg))) (X{11-12} .))");
    testJoshuaDerivationTree("(S{0-5} (S{0-3} (S{0-1} (X{0-1} food)) (X{1-3} is to blame for)) (X{3-5} european (X{4-5} inflation)))");
*/


    String paramFileName = args[0];


    BufferedReader inFile_params = new BufferedReader(new FileReader(paramFileName));


    String cands_fileName = (inFile_params.readLine().split("\\s+"))[0];
//    String alignSrcRef_fileName = (inFile_params.readLine().split("\\s+"))[0];
//    String parseSrc_fileName = (inFile_params.readLine().split("\\s+"))[0];
    String alignSrcCand_phrasal_fileName = (inFile_params.readLine().split("\\s+"))[0]; // output file
    String alignSrcCand_word_fileName = (inFile_params.readLine().split("\\s+"))[0]; // output file
//    String alignCandRef_sen_fileName = (inFile_params.readLine().split("\\s+"))[0]; // output file
//    String alignCandRef_parse_fileName = (inFile_params.readLine().split("\\s+"))[0]; // output file
    String source_fileName = (inFile_params.readLine().split("\\s+"))[0];
//    String ref_fileName = (inFile_params.readLine().split("\\s+"))[0];
    String trainSrc_fileName = (inFile_params.readLine().split("\\s+"))[0]; // src side of training corpus
    String trainTgt_fileName = (inFile_params.readLine().split("\\s+"))[0]; // tgt side of training corpus
    String trainAlign_fileName = (inFile_params.readLine().split("\\s+"))[0]; // src-tgt of training corpus
    String alignCache_fileName = (inFile_params.readLine().split("\\s+"))[0];


    String alignmentsType = "AlignmentGrids"; // if (args.length >= 4) alignmentsType = args[3];
    int maxCacheSize = 1000; // if (args.length >= 5) maxCacheSize = Integer.parseInt(args[4]);


    inFile_params.close();


    int numSentences = countLines(source_fileName);


    InputStream inStream_src = new FileInputStream(new File(source_fileName));
    BufferedReader srcFile = new BufferedReader(new InputStreamReader(inStream_src, "utf8"));
//    InputStream inStream_ref = new FileInputStream(new File(ref_fileName));
//    BufferedReader refFile = new BufferedReader(new InputStreamReader(inStream_ref, "utf8"));


    String[] srcSentences = new String[numSentences];
//    String[] refSentences = new String[numSentences];


    for (int i = 0; i < numSentences; ++i) {
      srcSentences[i] = srcFile.readLine();
//      refSentences[i] = refFile.readLine();
    }


    srcFile.close();
//    refFile.close();




    // Source language vocabulary
    println("Creating src vocabulary @ " + (new Date()));
    srcVocab = new Vocabulary();
    int[] sourceWordsSentences = Vocabulary.initializeVocabulary(trainSrc_fileName, srcVocab, true);


    int numSourceWords = sourceWordsSentences[0];
    int numSourceSentences = sourceWordsSentences[1];


    // Source language corpus array
    println("Reading src corpus @ " + (new Date()));
    srcCorpusArray = SuffixArrayFactory.createCorpusArray(trainSrc_fileName, srcVocab, numSourceWords, numSourceSentences);


    // Source language suffix array
    println("Creating src SA @ " + (new Date()));
    srcSA = SuffixArrayFactory.createSuffixArray(srcCorpusArray, maxCacheSize);




    // Target language vocabulary
    println("Creating tgt vocabulary @ " + (new Date()));
    tgtVocab = new Vocabulary();
    int[] targetWordsSentences = Vocabulary.initializeVocabulary(trainTgt_fileName, tgtVocab, true);


    int numTargetWords = targetWordsSentences[0];
    int numTargetSentences = targetWordsSentences[1];


    // Target language corpus array
    println("Reading tgt corpus @ " + (new Date()));
    tgtCorpusArray = SuffixArrayFactory.createCorpusArray(trainTgt_fileName, tgtVocab, numTargetWords, numTargetSentences);


    // Target language suffix array
    println("Creating tgt SA @ " + (new Date()));
    tgtSA = SuffixArrayFactory.createSuffixArray(tgtCorpusArray, maxCacheSize);




    int trainingSize = srcCorpusArray.getNumSentences();
    if (trainingSize != tgtCorpusArray.getNumSentences()) {
      throw new RuntimeException("Source and target corpora have different number of sentences. This is bad.");
    }




    // Alignment data
    println("Reading alignment data @ " + (new Date()));
    alignments = null;
    if ("AlignmentArray".equals(alignmentsType)) {
      alignments = SuffixArrayFactory.createAlignments(trainAlign_fileName, srcSA, tgtSA);
    } else if ("AlignmentGrids".equals(alignmentsType) || "AlignmentsGrid".equals(alignmentsType)) {
      alignments = new AlignmentGrids(new Scanner(new File(trainAlign_fileName)), srcCorpusArray, tgtCorpusArray, trainingSize, true);
    } else if ("MemoryMappedAlignmentGrids".equals(alignmentsType)) {
      alignments = new MemoryMappedAlignmentGrids(trainAlign_fileName, srcCorpusArray, tgtCorpusArray);
    }






    if (!fileExists(alignCache_fileName)) {
      alreadyResolved_srcSet = new HashMap<String,TreeSet<Integer>>();
      alreadyResolved_tgtSet = new HashMap<String,TreeSet<Integer>>();
    } else {
      try {
        ObjectInputStream in = new ObjectInputStream(new FileInputStream(alignCache_fileName));
        alreadyResolved_srcSet = (HashMap<String,TreeSet<Integer>>)in.readObject();
        alreadyResolved_tgtSet = (HashMap<String,TreeSet<Integer>>)in.readObject();
        in.close();
      } catch (FileNotFoundException e) {
        System.err.println("FileNotFoundException in AlignCandidates.main(String[]): " + e.getMessage());
        System.exit(99901);
      } catch (IOException e) {
        System.err.println("IOException in AlignCandidates.main(String[]): " + e.getMessage());
        System.exit(99902);
      } catch (ClassNotFoundException e) {
        System.err.println("ClassNotFoundException in AlignCandidates.main(String[]): " + e.getMessage());
        System.exit(99904);
      }
    }








    println("Processing candidates @ " + (new Date()));


    PrintWriter outFile_alignSrcCand_phrasal = new PrintWriter(alignSrcCand_phrasal_fileName);
    PrintWriter outFile_alignSrcCand_word = new PrintWriter(alignSrcCand_word_fileName);
//    PrintWriter outFile_alignCandRef_sen = new PrintWriter(alignCandRef_sen_fileName);


    InputStream inStream_cands = new FileInputStream(new File(cands_fileName));
    BufferedReader candsFile = new BufferedReader(new InputStreamReader(inStream_cands, "utf8"));


    String line = "";


    String cand = "";
    line = candsFile.readLine();


    int countSatisfied = 0;
    int countAll = 0;
    int countSatisfied_sizeOne = 0;
    int countAll_sizeOne = 0;
    int prev_i = -1;
    String srcSent = "";
    String[] srcWords = null;
    int candsRead = 0;
    int C50count = 0;


    while (line != null) {
      ++candsRead;
      println("Read candidate on line #" + candsRead);
      int i = toInt((line.substring(0,line.indexOf("|||"))).trim());


      if (i != prev_i) {
        srcSent = srcSentences[i];
        srcWords = srcSent.split("\\s+");
        prev_i = i;
        println("New value for i: " + i + " seen @ " + (new Date()));
        C50count = 0;
      } else { ++C50count; }


      line = (line.substring(line.indexOf("|||")+3)).trim(); // get rid of initial text


      cand = (line.substring(0,line.indexOf("|||"))).trim();


      cand = cand.substring(cand.indexOf(" ")+1,cand.length()-1); // trim "(ROOT{x-y} " and ")"


//      testParseTree(cand);


      JoshuaDerivationTree DT = new JoshuaDerivationTree(cand,0);


      String candSent = DT.toSentence();
      String[] candWords = candSent.split("\\s+");




      ///////////////////////////////
      // align source to candidate //
      ///////////////////////////////


      String alignSrcCand = DT.alignments();




      // allow many-to-many
      outFile_alignSrcCand_phrasal.println(alignSrcCand);


println("  i = " + i + ", alignSrcCand: " + alignSrcCand);


      // resolve many-to-many


      String alignSrcCand_res = "";


      String[] linksSrcCand = alignSrcCand.split("\\s+");


      for (int k = 0; k < linksSrcCand.length; ++k) {
        String link = linksSrcCand[k];
        if (link.indexOf(',') == -1) { // already one-to-one
          alignSrcCand_res += " " + link.replaceFirst("--","-");
        } else {
          alignSrcCand_res += " " + resolve(link, srcWords, candWords);
        }
      }


      alignSrcCand_res = alignSrcCand_res.trim();
println("  i = " + i + ", alignSrcCand_res: " + alignSrcCand_res);


      outFile_alignSrcCand_word.println(alignSrcCand_res);


      if (C50count == 50) { println("50C @ " + (new Date())); C50count = 0; }


      line = candsFile.readLine();
    }


    outFile_alignSrcCand_phrasal.close();
    outFile_alignSrcCand_word.close();
//    outFile_alignCandRef_sen.close();
    candsFile.close();


    println("Finished processing candidates @ " + (new Date()));
/*
    println("Satisfied: " + countSatisfied + "/" + countAll);
    println("Satisfied_sizeOne: " + countSatisfied_sizeOne + "/" + countAll_sizeOne);
*/






    try {
      ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(alignCache_fileName));
      out.writeObject(alreadyResolved_srcSet);
      out.writeObject(alreadyResolved_tgtSet);
      out.flush();
      out.close();
    } catch (IOException e) {
      System.err.println("IOException in AlignCandidates.main(String[]): " + e.getMessage());
      System.exit(99902);
    }




// sanity checks
/*
    String ph_str = "schuld an";
    BasicPhrase ph = new BasicPhrase(ph_str,srcVocab);
    int ph_size = ph.size();


    int[] bounds = srcSA.findPhrase(ph);
    int[] pos = srcSA.getAllPositions(bounds);


    println("bounds: " + bounds[0] + " " + bounds[1]);
    println("ph_str \"" + ph_str + "\" found in " + pos.length + " positions:");
*/
/*
    for (int p = 1; p <= pos.length; ++p) {
      int start_i = pos[p-1];
      int final_i = start_i + ph_size - 1;
      int senIndex = srcSA.getSentenceIndex(start_i);
      int senPos_src = srcSA.getSentencePosition(senIndex);
      int senPos_tgt = tgtSA.getSentencePosition(senIndex);


      println("  POS #" + p + ": starting at " + start_i + ", ending at " + final_i);
      println("    i.e. sentence " + senIndex + ", src words " + (start_i-senPos_src) + "-" + (final_i-senPos_src));


      for (int i = start_i; i <= final_i; ++i) {
        int[] tgtIndices = alignments.getAlignedTargetIndices(i);
        if (tgtIndices != null) {
          print("    i=" + i + " aligned to: ");
          for (int k = 0; k < tgtIndices.length; ++k) print(tgtIndices[k] + " (" + (tgtIndices[k] - senPos_tgt) + ": " + tgtVocab.getWord(tgtCorpusArray.getWordID(tgtIndices[k])) + ") ");
          println("");
        } else {
          println("    i=" + i + " unaligned");
        }
      }
    }
*/




  } // main










  static private int countLines(String fileName)
  {
    int count = 0;


    try {
      BufferedReader inFile = new BufferedReader(new FileReader(fileName));


      String line;
      do {
        line = inFile.readLine();
        if (line != null) ++count;
      }  while (line != null);


      inFile.close();
    } catch (IOException e) {
      System.err.println("IOException in AlignCandidates.countLines(String): " + e.getMessage());
      System.exit(99902);
    }


    return count;
  }




  static public void testJoshuaDerivationTree(String PTS)
  {
    JoshuaDerivationTree T = new JoshuaDerivationTree(PTS,0);


    println("T.toSentence() is:");
    println("  " + T.toSentence());
    println("root.numTgtWords: " + T.numTgtWords);
    println("T.toString() is:");
    println("  " + T);


    if (PTS.equals(T.toString())) println("toString is A-OK");
    else println("PROBLEM in toString!");


    println("Alignments:");
    println(T.alignments());
    println("");
  }


  /**
   * Tries to turn one many-to-many alignment link into word-to-word links by looking
   * for an occurrence of the same source and candidate phrases in the training corpus
   * and copying the word alignments found there.
   *
   * <p>Outline:
   * <ol>
   *   <li>Split the link into its source and candidate index lists and group each list
   *       into phrases of consecutive indices.</li>
   *   <li>If an earlier call already resolved the same phrases, reuse the cached
   *       training-corpus positions.</li>
   *   <li>Otherwise find the training sentence pairs that contain every phrase on both
   *       sides, and enumerate the combinations of phrase occurrences in each of them
   *       until one is found in which the phrases appear in order without overlapping
   *       and no word of the phrases is aligned to a word outside the phrases.</li>
   * </ol>
   *
   * @param link     a link of the form {@code s1,s2,...--c1,c2,...}: comma-separated
   *                 indices into {@code srcWords}, then "--", then comma-separated
   *                 indices into {@code tgtWords}
   * @param srcWords the words of the source sentence
   * @param tgtWords the words of the candidate sentence
   * @return the result of finalResolve() for the first suitable occurrence (or for the
   *         cached one), or {@code link} itself, unchanged, when none is found
   */
  static private String resolve(String link, String[] srcWords, String[] tgtWords)
  {
    println("    Resolving " + link);
    String SrcSide = link.substring(0,link.indexOf("--"));
    String CandSide = link.substring(link.indexOf("--")+2);

    // group the indices of each side into phrases (maximal runs of consecutive indices)
    String[] srcPhrases_str = indicesToPhrases(SrcSide, srcWords);
    String[] tgtPhrases_str = indicesToPhrases(CandSide, tgtWords);

    int[] origSrcIndices = toInt(SrcSide.split(","));
    int[] origCandIndices = toInt(CandSide.split(","));

    // cache key: the source phrases and the candidate phrases, each joined by single
    // spaces, with "__" between the two sides.
    // NOTE(review): phrase boundaries are not part of the key, so links with the same
    // words but different gaps appear to share a cache entry - confirm this is intended.
    String cacheKey = "";
    for (int w = 0; w < srcPhrases_str.length; ++w) cacheKey += " " + srcPhrases_str[w];
    cacheKey += "__";
    for (int w = 0; w < tgtPhrases_str.length; ++w) cacheKey += tgtPhrases_str[w] + " ";
    cacheKey = cacheKey.trim();

    BasicPhrase[] srcPhrases = strToPhrase(srcPhrases_str,srcVocab);
    BasicPhrase[] tgtPhrases = strToPhrase(tgtPhrases_str,tgtVocab);

    int[] srcPhrases_len = phraseLenghts(srcPhrases);
    int[] tgtPhrases_len = phraseLenghts(tgtPhrases);

    int srcPhCount = srcPhrases.length;
    int tgtPhCount = tgtPhrases.length;

    println("      srcPhCount: " + srcPhCount + ", tgtPhCount: " + tgtPhCount);

    TreeSet<Integer> senIndices = null;
    // both cache maps are filled together (see the put() calls below), so checking one suffices
    if (alreadyResolved_srcSet.containsKey(cacheKey)) {
      println("      Using cached result (for " + cacheKey + ")");

      TreeSet<Integer> srcIndices_allowed = alreadyResolved_srcSet.get(cacheKey);
      TreeSet<Integer> tgtIndices_allowed = alreadyResolved_tgtSet.get(cacheKey);

      return finalResolve(srcIndices_allowed,tgtIndices_allowed,origSrcIndices,origCandIndices);
    }

    print("      Extracting xxxPhPos...");
    // the keySet of srcPhPos[p] are sentence indices, with key_i mapped to a Vector of the positions
    // of matches of srcPhrases's p'th phrase in the key_i'th sentence
    TreeMap<Integer,Vector<Integer>>[] srcPhPos = getPosMaps(srcPhrases,srcSA);
    TreeMap<Integer,Vector<Integer>>[] tgtPhPos = getPosMaps(tgtPhrases,tgtSA);
    println("done");

    print("      Intersecting sentence indices...");
    senIndices = new TreeSet<Integer>(srcPhPos[0].keySet());
    for (int i = 1; i < srcPhCount; ++i) { senIndices = setIntersect(senIndices,new TreeSet<Integer>(srcPhPos[i].keySet())); }
    for (int i = 0; i < tgtPhCount; ++i) { senIndices = setIntersect(senIndices,new TreeSet<Integer>(tgtPhPos[i].keySet())); }
    // now, if sen_i is in senIndices, this means that the sen_i'th sentence pair
    // contains all the relevant phrases, on both sides
    println("done; intersection has " + senIndices.size() + " indices.");

    // NOTE(review): 'found' is never read; the method returns as soon as a match is found
    boolean found = false;

    for (Integer sen_i : senIndices) {

      // srcVecs[ph] / tgtVecs[ph]: start positions of the ph'th phrase within sentence sen_i
      @SuppressWarnings("unchecked")
      Vector<Integer>[] srcVecs = new Vector[srcPhCount];
      for (int ph = 0; ph < srcPhCount; ++ph) { srcVecs[ph] = srcPhPos[ph].get(sen_i); }
      @SuppressWarnings("unchecked")
      Vector<Integer>[] tgtVecs = new Vector[tgtPhCount];
      for (int ph = 0; ph < tgtPhCount; ++ph) { tgtVecs[ph] = tgtPhPos[ph].get(sen_i); }

      int[] srcVecs_size = new int[srcPhCount];
      for (int ph = 0; ph < srcPhCount; ++ph) { srcVecs_size[ph] = srcVecs[ph].size(); }
      int[] tgtVecs_size = new int[tgtPhCount];
      for (int ph = 0; ph < tgtPhCount; ++ph) { tgtVecs_size[ph] = tgtVecs[ph].size(); }

      // srcVecs_i[ph] / tgtVecs_i[ph]: which occurrence of the ph'th phrase is currently
      // being tried; together they enumerate every combination of occurrences
      int[] srcVecs_i = new int[srcPhCount];
      for (int ph = 0; ph < srcPhCount; ++ph) { srcVecs_i[ph] = 0; }
      int[] tgtVecs_i = new int[tgtPhCount];
      for (int ph = 0; ph < tgtPhCount; ++ph) { tgtVecs_i[ph] = 0; }

      boolean done = false;

      while (!done) {

        // check to see if elements at srcVecs_i[] and tgtVecs_i[] are ordered properly

        boolean ordered = true;

        // each source phrase must end before the next source phrase starts
        for (int ph = 0; ph < srcPhCount-1; ++ph) {
          int end_curr = srcVecs[ph].elementAt(srcVecs_i[ph]) + srcPhrases_len[ph] - 1;
          int start_next = srcVecs[ph+1].elementAt(srcVecs_i[ph+1]);
          if (end_curr >= start_next) {
            ordered = false;
            break;
          }
        }

        if (ordered) { // still ordered; now check tgt side for order
          for (int ph = 0; ph < tgtPhCount-1; ++ph) {
            int end_curr = tgtVecs[ph].elementAt(tgtVecs_i[ph]) + tgtPhrases_len[ph] - 1;
            int start_next = tgtVecs[ph+1].elementAt(tgtVecs_i[ph+1]);
            if (end_curr >= start_next) {
              ordered = false;
              break;
            }
          }

          if (ordered) { // still ordered; now, finally, check if alignment is consistent

            // what do we have here? we know that the sen_i'th training sentence has all the
            // phrases we want (on both source and target side), that the phrases do not
            // cross sentence boundaries, and that the phrases are in the right order.

            // now we need to make sure the alignments are contained:
            //   (*) all the words in the phrases of the source side
            //       must not align to ANYthing outside the phrases
            //       of the target side.
            //   (*) all the words in the phrases of the target side
            //       must not align to ANYthing outside the phrases
            //       of the source side.

            // which indices IN THE SOURCE SIDE are the words of the target phrases ALLOWED TO align with?
            TreeSet<Integer> srcIndices_allowed = new TreeSet<Integer>();

            // which indices IN THE TARGET SIDE are the words of the source phrases ALLOWED TO align with?
            TreeSet<Integer> tgtIndices_allowed = new TreeSet<Integer>();

            // set srcIndices_allowed
            for (int ph = 0; ph < srcPhCount; ++ph) {
              int start_i = srcVecs[ph].elementAt(srcVecs_i[ph]);
              int final_i = start_i + srcPhrases_len[ph] - 1;
              for (int i = start_i; i <= final_i; ++i) srcIndices_allowed.add(i);
            } // for (ph:0..srcPhCount)

            // set tgtIndices_allowed
            for (int ph = 0; ph < tgtPhCount; ++ph) {
              int start_i = tgtVecs[ph].elementAt(tgtVecs_i[ph]);
              int final_i = start_i + tgtPhrases_len[ph] - 1;
              for (int i = start_i; i <= final_i; ++i) tgtIndices_allowed.add(i);
            } // for (ph:0..tgtPhCount)

            boolean misalign = false;

            // does any source word align to anything that is not allowed?
            for (Integer i : srcIndices_allowed) {
              int[] tgtIndices = alignments.getAlignedTargetIndices(i);
              if (tgtIndices != null) {
                for (int j = 0; j < tgtIndices.length; ++j) {
                  if (!tgtIndices_allowed.contains(tgtIndices[j])) {
                    misalign = true;
                    break; // from for (j)
                  }
                }
              }

              if (misalign) break; // from for (i)
            }

            if (!misalign) { // still aligned; now check tgt->src

              // does any target word align to anything that is not allowed?
              for (Integer i : tgtIndices_allowed) {
                int[] srcIndices = alignments.getAlignedSourceIndices(i);
                if (srcIndices != null) {
                  for (int j = 0; j < srcIndices.length; ++j) {
                    if (!srcIndices_allowed.contains(srcIndices[j])) {
                      misalign = true;
                      break; // from for (j)
                    }
                  }
                }

                if (misalign) break; // from for (i)
              }

              if (!misalign) { // still aligned; now, FINALLY, extract alignments

                // remember: src->cand

                // cache the corpus positions so that the same phrases resolve instantly next time
                alreadyResolved_srcSet.put(cacheKey,srcIndices_allowed);
                alreadyResolved_tgtSet.put(cacheKey,tgtIndices_allowed);

                return finalResolve(srcIndices_allowed,tgtIndices_allowed,origSrcIndices,origCandIndices);
              }

            }

          } // if (ordered)_2

        } // if (ordered)_1

        advance(srcVecs_i,tgtVecs_i,srcVecs_size,tgtVecs_size);
        // advances srcVecs_i and tgtVecs_i, if possible
        // if not possible, sets srcVecs_i[0] to -1 and everything else to 0

        if (srcVecs_i[0] == -1) done = true;

      }

    } // for (sen_i)

    // no occurrence in the training data satisfied all constraints: leave the link
    // unresolved (failures are not cached, so the search is repeated on the next call)
    return link;
  }


  static private String finalResolve(TreeSet<Integer> srcIndices_allowed, TreeSet<Integer> tgtIndices_allowed, int[] origSrcIndices, int[] origCandIndices)
  {
println("In finalResolve.  Sizes: sI_a: " + srcIndices_allowed.size() + ", tI_a: " + tgtIndices_allowed.size() + ", oSI: " + origSrcIndices.length + ", oCI: " + origCandIndices.length);
    String resolvedStr = "";


    TreeMap<Integer,Integer> toOrigTgt = new TreeMap<Integer,Integer>();
    int oci = 0;
    for (Integer i : tgtIndices_allowed) {
      toOrigTgt.put(i,origCandIndices[oci]);
      ++oci;
    }


    int osi = 0;
    for (Integer i : srcIndices_allowed) {
      int[] tgtIndices = alignments.getAlignedTargetIndices(i);
      if (tgtIndices != null) {
        for (int j = 0; j < tgtIndices.length; ++j) {
          resolvedStr += " " + origSrcIndices[osi] + "-" + toOrigTgt.get(tgtIndices[j]);
        }
      }
      ++osi;
    }
    return resolvedStr.trim();


  }


  static private int[] phraseLenghts(BasicPhrase[] phrases)
  {
    int[] lenghts = new int[phrases.length];
    for (int k = 0; k < phrases.length; ++k) lenghts[k] = phrases[k].size();
    return lenghts;
  }




  static private void advance(int[] A_i, int[] B_i, int[] A_size, int[] B_size)
  {
    int A_cnt = A_i.length;
    int B_cnt = B_i.length;


    boolean B_adv = false;
    int B_curr = B_cnt-1;


    while (true) {
      B_i[B_curr] += 1;
      if (B_i[B_curr] == B_size[B_curr]) {
        B_i[B_curr] = 0;
        --B_curr;
        if (B_curr < 0) break;
      } else {
        B_adv = true;
        break;
      }
    }


    if (!B_adv) {


      boolean A_adv = false;
      int A_curr = A_cnt-1;


      while (true) {
        A_i[A_curr] += 1;
        if (A_i[A_curr] == A_size[A_curr]) {
          A_i[A_curr] = 0;
          --A_curr;
          if (A_curr < 0) break;
        } else {
          A_adv = true;
          break;
        }
      }


      if (!A_adv) {
        A_i[0] = -1;
      }
    }


  }


  static private TreeSet<Integer> setIntersect(TreeSet<Integer> A, TreeSet<Integer> B)
  {
    TreeSet<Integer> retSet = new TreeSet<Integer>();


    for (Integer i : A) { if (B.contains(i)) retSet.add(i); }


    return retSet;
  }


  static private TreeMap<Integer,Vector<Integer>>[] getPosMaps(BasicPhrase[] phrases, Suffixes SA)
  {
    int phCount = phrases.length;


    @SuppressWarnings("unchecked")
    TreeMap<Integer,Vector<Integer>>[] retA = new TreeMap[phCount];


    for (int ph_i = 0; ph_i < phCount; ++ph_i) {
      retA[ph_i] = new TreeMap<Integer,Vector<Integer>>();
      int offset = phrases[ph_i].size() - 1;
      int[] bounds = SA.findPhrase(phrases[ph_i]);
      int[] pos = SA.getAllPositions(bounds);
      for (int p_i = 0; p_i < pos.length; ++p_i) {
        int start_i = pos[p_i];
        int final_i = start_i + offset;
        int senIndex = SA.getSentenceIndex(start_i);
        if (SA.getSentenceIndex(final_i) == senIndex) { // necessary because findPhrase might match across sentences
          Vector<Integer> V = retA[ph_i].get(senIndex);
          if (V == null) V = new Vector<Integer>();
          V.add(start_i);
          retA[ph_i].put(senIndex,V);
        }
      }
    }


    return retA;
  }


  static private String[] indicesToPhrases(String indices, String[] words)
  {
    int[] indices_A = toInt(indices.split(","));


    int phraseCount = gapCount(indices_A) + 1;


    String[] phrases = new String[phraseCount];
    int ph_i = 0;
    String curr_ph = words[indices_A[0]];
    int prev = indices_A[0];


    for (int i = 1; i < indices_A.length; ++i) {
      if (indices_A[i] == prev+1) { // continue phrase
        curr_ph += " " + words[indices_A[i]];
      } else { // gap; end previous phrase and start new one
        phrases[ph_i] = curr_ph;
        curr_ph = words[indices_A[i]];
        ++ph_i;
      }
      prev = indices_A[i];
    }


    phrases[ph_i] = curr_ph;


    // now ph_i+1 == phraseCount
    if (ph_i != phraseCount - 1) {
      println("MISMATCH: ph_i = " + ph_i + "; phraseCount - 1 = " + (phraseCount-1));
    }


    return phrases;
  }


  static private int gapCount(int[] indices)
  {
    if (indices == null || indices.length < 2) {
      return 0;
    } else {
      int count = 0;
      
      int prev = indices[0];
      for (int i = 1; i < indices.length; ++i) {
        if (indices[i] != prev+1) {
          ++count;
        }
        prev = indices[i];
      }


      return count;
    }
  }


  static private BasicPhrase[] strToPhrase(String[] phrases_str, Vocabulary vocab)
  {
    BasicPhrase[] retA = new BasicPhrase[phrases_str.length];
    for (int i = 0; i < phrases_str.length; ++i) { retA[i] = new BasicPhrase(phrases_str[i],vocab); }
    return retA;
  }


  static private void println(Object obj) { System.out.println(obj); }
  static private void print(Object obj) { System.out.print(obj); }
  static private int toInt(String str) { return Integer.parseInt(str); }


  static private int[] toInt(String[] strA)
  {
    int[] intA = new int[strA.length];
    for (int i = 0; i < intA.length; ++i) intA[i] = toInt(strA[i]);
    return intA;
  }


  static private boolean fileExists(String fileName)
  {
    if (fileName == null) return false;
    File checker = new File(fileName);
    return checker.exists();
  }


}
// Source code of joshua.aligner.AlignCandidates

// Related classes of joshua.aligner.AlignCandidates