Package joshua.aligner

Source Code of joshua.aligner.AlignCandidates

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

package joshua.aligner;

import java.io.*;
import java.util.*;

import joshua.corpus.suffix_array.*;
import joshua.corpus.vocab.Vocabulary;
import joshua.corpus.Corpus;
import joshua.corpus.alignment.*;
import joshua.corpus.alignment.mm.MemoryMappedAlignmentGrids;

public class AlignCandidates {

  // Vocabularies built in main() from the source and target sides of the training corpus.
  private static Vocabulary srcVocab, tgtVocab;
  // Corpus arrays for each side of the training corpus, created in main() via SuffixArrayFactory.
  private static Corpus srcCorpusArray, tgtCorpusArray;
  // Suffix arrays over the corpus arrays; resolve() passes them to getPosMaps() to locate phrases.
  private static Suffixes srcSA, tgtSA;
  // Word alignments of the training corpus; queried by resolve() and finalResolve().
  private static Alignments alignments;

  // Caches of already-resolved links, keyed by the phrase-text cache key built in resolve().
  // The values are the source-side and target-side index sets of the consistent match found.
  // main() loads both maps from the alignment cache file (if it exists) and writes them back at the end.
  private static HashMap<String,TreeSet<Integer>> alreadyResolved_srcSet;
  private static HashMap<String,TreeSet<Integer>> alreadyResolved_tgtSet;

 
  public static void main(String[] args) throws IOException {

/*
    testJoshuaDerivationTree("(S{0-12} (S{0-11} (S{0-8} (X{0-8} (X{0-3} official (X{1-2} forecasts) are) based on (X{4-7} (X{4-5} only) 3 per cent))) (X{8-11} reported (X{8-9} ,) (X{10-11} bloomberg))) (X{11-12} .))");
    testJoshuaDerivationTree("(S{0-5} (S{0-3} (S{0-1} (X{0-1} food)) (X{1-3} is to blame for)) (X{3-5} european (X{4-5} inflation)))");
*/

    String paramFileName = args[0];

    BufferedReader inFile_params = new BufferedReader(new FileReader(paramFileName));

    String cands_fileName = (inFile_params.readLine().split("\\s+"))[0];
//    String alignSrcRef_fileName = (inFile_params.readLine().split("\\s+"))[0];
//    String parseSrc_fileName = (inFile_params.readLine().split("\\s+"))[0];
    String alignSrcCand_phrasal_fileName = (inFile_params.readLine().split("\\s+"))[0]; // output file
    String alignSrcCand_word_fileName = (inFile_params.readLine().split("\\s+"))[0]; // output file
//    String alignCandRef_sen_fileName = (inFile_params.readLine().split("\\s+"))[0]; // output file
//    String alignCandRef_parse_fileName = (inFile_params.readLine().split("\\s+"))[0]; // output file
    String source_fileName = (inFile_params.readLine().split("\\s+"))[0];
//    String ref_fileName = (inFile_params.readLine().split("\\s+"))[0];
    String trainSrc_fileName = (inFile_params.readLine().split("\\s+"))[0]; // src side of training corpus
    String trainTgt_fileName = (inFile_params.readLine().split("\\s+"))[0]; // tgt side of training corpus
    String trainAlign_fileName = (inFile_params.readLine().split("\\s+"))[0]; // src-tgt of training corpus
    String alignCache_fileName = (inFile_params.readLine().split("\\s+"))[0];

    String alignmentsType = "AlignmentGrids"; // if (args.length >= 4) alignmentsType = args[3];
    int maxCacheSize = 1000; // if (args.length >= 5) maxCacheSize = Integer.parseInt(args[4]);

    inFile_params.close();

    int numSentences = countLines(source_fileName);

    InputStream inStream_src = new FileInputStream(new File(source_fileName));
    BufferedReader srcFile = new BufferedReader(new InputStreamReader(inStream_src, "utf8"));
//    InputStream inStream_ref = new FileInputStream(new File(ref_fileName));
//    BufferedReader refFile = new BufferedReader(new InputStreamReader(inStream_ref, "utf8"));

    String[] srcSentences = new String[numSentences];
//    String[] refSentences = new String[numSentences];

    for (int i = 0; i < numSentences; ++i) {
      srcSentences[i] = srcFile.readLine();
//      refSentences[i] = refFile.readLine();
    }

    srcFile.close();
//    refFile.close();


    // Source language vocabulary
    println("Creating src vocabulary @ " + (new Date()));
    srcVocab = new Vocabulary();
    int[] sourceWordsSentences = Vocabulary.initializeVocabulary(trainSrc_fileName, srcVocab, true);

    int numSourceWords = sourceWordsSentences[0];
    int numSourceSentences = sourceWordsSentences[1];

    // Source language corpus array
    println("Reading src corpus @ " + (new Date()));
    srcCorpusArray = SuffixArrayFactory.createCorpusArray(trainSrc_fileName, srcVocab, numSourceWords, numSourceSentences);

    // Source language suffix array
    println("Creating src SA @ " + (new Date()));
    srcSA = SuffixArrayFactory.createSuffixArray(srcCorpusArray, maxCacheSize);


    // Target language vocabulary
    println("Creating tgt vocabulary @ " + (new Date()));
    tgtVocab = new Vocabulary();
    int[] targetWordsSentences = Vocabulary.initializeVocabulary(trainTgt_fileName, tgtVocab, true);

    int numTargetWords = targetWordsSentences[0];
    int numTargetSentences = targetWordsSentences[1];

    // Target language corpus array
    println("Reading tgt corpus @ " + (new Date()));
    tgtCorpusArray = SuffixArrayFactory.createCorpusArray(trainTgt_fileName, tgtVocab, numTargetWords, numTargetSentences);

    // Target language suffix array
    println("Creating tgt SA @ " + (new Date()));
    tgtSA = SuffixArrayFactory.createSuffixArray(tgtCorpusArray, maxCacheSize);


    int trainingSize = srcCorpusArray.getNumSentences();
    if (trainingSize != tgtCorpusArray.getNumSentences()) {
      throw new RuntimeException("Source and target corpora have different number of sentences. This is bad.");
    }


    // Alignment data
    println("Reading alignment data @ " + (new Date()));
    alignments = null;
    if ("AlignmentArray".equals(alignmentsType)) {
      alignments = SuffixArrayFactory.createAlignments(trainAlign_fileName, srcSA, tgtSA);
    } else if ("AlignmentGrids".equals(alignmentsType) || "AlignmentsGrid".equals(alignmentsType)) {
      alignments = new AlignmentGrids(new Scanner(new File(trainAlign_fileName)), srcCorpusArray, tgtCorpusArray, trainingSize, true);
    } else if ("MemoryMappedAlignmentGrids".equals(alignmentsType)) {
      alignments = new MemoryMappedAlignmentGrids(trainAlign_fileName, srcCorpusArray, tgtCorpusArray);
    }



    if (!fileExists(alignCache_fileName)) {
      alreadyResolved_srcSet = new HashMap<String,TreeSet<Integer>>();
      alreadyResolved_tgtSet = new HashMap<String,TreeSet<Integer>>();
    } else {
      try {
        ObjectInputStream in = new ObjectInputStream(new FileInputStream(alignCache_fileName));
        alreadyResolved_srcSet = (HashMap<String,TreeSet<Integer>>)in.readObject();
        alreadyResolved_tgtSet = (HashMap<String,TreeSet<Integer>>)in.readObject();
        in.close();
      } catch (FileNotFoundException e) {
        System.err.println("FileNotFoundException in AlignCandidates.main(String[]): " + e.getMessage());
        System.exit(99901);
      } catch (IOException e) {
        System.err.println("IOException in AlignCandidates.main(String[]): " + e.getMessage());
        System.exit(99902);
      } catch (ClassNotFoundException e) {
        System.err.println("ClassNotFoundException in AlignCandidates.main(String[]): " + e.getMessage());
        System.exit(99904);
      }
    }




    println("Processing candidates @ " + (new Date()));

    PrintWriter outFile_alignSrcCand_phrasal = new PrintWriter(alignSrcCand_phrasal_fileName);
    PrintWriter outFile_alignSrcCand_word = new PrintWriter(alignSrcCand_word_fileName);
//    PrintWriter outFile_alignCandRef_sen = new PrintWriter(alignCandRef_sen_fileName);

    InputStream inStream_cands = new FileInputStream(new File(cands_fileName));
    BufferedReader candsFile = new BufferedReader(new InputStreamReader(inStream_cands, "utf8"));

    String line = "";

    String cand = "";
    line = candsFile.readLine();

    int countSatisfied = 0;
    int countAll = 0;
    int countSatisfied_sizeOne = 0;
    int countAll_sizeOne = 0;
    int prev_i = -1;
    String srcSent = "";
    String[] srcWords = null;
    int candsRead = 0;
    int C50count = 0;

    while (line != null) {
      ++candsRead;
      println("Read candidate on line #" + candsRead);
      int i = toInt((line.substring(0,line.indexOf("|||"))).trim());

      if (i != prev_i) {
        srcSent = srcSentences[i];
        srcWords = srcSent.split("\\s+");
        prev_i = i;
        println("New value for i: " + i + " seen @ " + (new Date()));
        C50count = 0;
      } else { ++C50count; }

      line = (line.substring(line.indexOf("|||")+3)).trim(); // get rid of initial text

      cand = (line.substring(0,line.indexOf("|||"))).trim();

      cand = cand.substring(cand.indexOf(" ")+1,cand.length()-1); // trim "(ROOT{x-y} " and ")"

//      testParseTree(cand);

      JoshuaDerivationTree DT = new JoshuaDerivationTree(cand,0);

      String candSent = DT.toSentence();
      String[] candWords = candSent.split("\\s+");


      ///////////////////////////////
      // align source to candidate //
      ///////////////////////////////

      String alignSrcCand = DT.alignments();


      // allow many-to-many
      outFile_alignSrcCand_phrasal.println(alignSrcCand);

println("  i = " + i + ", alignSrcCand: " + alignSrcCand);

      // resolve many-to-many

      String alignSrcCand_res = "";

      String[] linksSrcCand = alignSrcCand.split("\\s+");

      for (int k = 0; k < linksSrcCand.length; ++k) {
        String link = linksSrcCand[k];
        if (link.indexOf(',') == -1) { // already one-to-one
          alignSrcCand_res += " " + link.replaceFirst("--","-");
        } else {
          alignSrcCand_res += " " + resolve(link, srcWords, candWords);
        }
      }

      alignSrcCand_res = alignSrcCand_res.trim();
println("  i = " + i + ", alignSrcCand_res: " + alignSrcCand_res);

      outFile_alignSrcCand_word.println(alignSrcCand_res);

      if (C50count == 50) { println("50C @ " + (new Date())); C50count = 0; }

      line = candsFile.readLine();
    }

    outFile_alignSrcCand_phrasal.close();
    outFile_alignSrcCand_word.close();
//    outFile_alignCandRef_sen.close();
    candsFile.close();

    println("Finished processing candidates @ " + (new Date()));
/*
    println("Satisfied: " + countSatisfied + "/" + countAll);
    println("Satisfied_sizeOne: " + countSatisfied_sizeOne + "/" + countAll_sizeOne);
*/



    try {
      ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(alignCache_fileName));
      out.writeObject(alreadyResolved_srcSet);
      out.writeObject(alreadyResolved_tgtSet);
      out.flush();
      out.close();
    } catch (IOException e) {
      System.err.println("IOException in AlignCandidates.main(String[]): " + e.getMessage());
      System.exit(99902);
    }


// sanity checks
/*
    String ph_str = "schuld an";
    BasicPhrase ph = new BasicPhrase(ph_str,srcVocab);
    int ph_size = ph.size();

    int[] bounds = srcSA.findPhrase(ph);
    int[] pos = srcSA.getAllPositions(bounds);

    println("bounds: " + bounds[0] + " " + bounds[1]);
    println("ph_str \"" + ph_str + "\" found in " + pos.length + " positions:");
*/
/*
    for (int p = 1; p <= pos.length; ++p) {
      int start_i = pos[p-1];
      int final_i = start_i + ph_size - 1;
      int senIndex = srcSA.getSentenceIndex(start_i);
      int senPos_src = srcSA.getSentencePosition(senIndex);
      int senPos_tgt = tgtSA.getSentencePosition(senIndex);

      println("  POS #" + p + ": starting at " + start_i + ", ending at " + final_i);
      println("    i.e. sentence " + senIndex + ", src words " + (start_i-senPos_src) + "-" + (final_i-senPos_src));

      for (int i = start_i; i <= final_i; ++i) {
        int[] tgtIndices = alignments.getAlignedTargetIndices(i);
        if (tgtIndices != null) {
          print("    i=" + i + " aligned to: ");
          for (int k = 0; k < tgtIndices.length; ++k) print(tgtIndices[k] + " (" + (tgtIndices[k] - senPos_tgt) + ": " + tgtVocab.getWord(tgtCorpusArray.getWordID(tgtIndices[k])) + ") ");
          println("");
        } else {
          println("    i=" + i + " unaligned");
        }
      }
    }
*/


  } // main





  static private int countLines(String fileName)
  {
    int count = 0;

    try {
      BufferedReader inFile = new BufferedReader(new FileReader(fileName));

      String line;
      do {
        line = inFile.readLine();
        if (line != null) ++count;
      while (line != null);

      inFile.close();
    } catch (IOException e) {
      System.err.println("IOException in AlignCandidates.countLines(String): " + e.getMessage());
      System.exit(99902);
    }

    return count;
  }


  static public void testJoshuaDerivationTree(String PTS)
  {
    JoshuaDerivationTree T = new JoshuaDerivationTree(PTS,0);

    println("T.toSentence() is:");
    println("  " + T.toSentence());
    println("root.numTgtWords: " + T.numTgtWords);
    println("T.toString() is:");
    println("  " + T);

    if (PTS.equals(T.toString())) println("toString is A-OK");
    else println("PROBLEM in toString!");

    println("Alignments:");
    println(T.alignments());
    println("");
  }

  /**
   * Tries to break a many-to-many link into word-level links, using the training corpus as evidence.
   *
   * <p>The link has the form {@code s1,s2,...--t1,t2,...}: comma-separated source word indices,
   * then {@code --}, then comma-separated candidate word indices. Each side is split into
   * contiguous phrases. The method then looks for a training sentence pair in which all of
   * those phrases occur, in order and without overlap, and in which no word of the matched
   * phrases is aligned to a word outside the matched phrases on the other side. The first
   * such occurrence is handed to {@code finalResolve} and is also stored in the
   * {@code alreadyResolved_*} caches under a key built from the phrase texts.
   *
   * @param link the many-to-many link to resolve
   * @param srcWords the words of the source sentence
   * @param tgtWords the words of the candidate sentence
   * @return the value returned by {@code finalResolve} for the cached or newly found
   *         occurrence, or {@code link} itself, unchanged, if no suitable occurrence exists
   */
  static private String resolve(String link, String[] srcWords, String[] tgtWords)
  {
    println("    Resolving " + link);
    String SrcSide = link.substring(0,link.indexOf("--"));
    String CandSide = link.substring(link.indexOf("--")+2);

    // group each side's indices into maximal runs of consecutive words
    String[] srcPhrases_str = indicesToPhrases(SrcSide, srcWords);
    String[] tgtPhrases_str = indicesToPhrases(CandSide, tgtWords);

    int[] origSrcIndices = toInt(SrcSide.split(","));
    int[] origCandIndices = toInt(CandSide.split(","));

    // the cache key is "<src phrases>__<tgt phrases>", so it depends only on the phrase texts
    String cacheKey = "";
//    for (int w = 0; w < origSrcIndices.length; ++w) cacheKey += " " + srcWords[origSrcIndices[w]];
    for (int w = 0; w < srcPhrases_str.length; ++w) cacheKey += " " + srcPhrases_str[w];
    cacheKey += "__";
//    for (int w = 0; w < origCandIndices.length; ++w) cacheKey += tgtWords[origCandIndices[w]] + " ";
    for (int w = 0; w < tgtPhrases_str.length; ++w) cacheKey += tgtPhrases_str[w] + " ";
    cacheKey = cacheKey.trim();


    BasicPhrase[] srcPhrases = strToPhrase(srcPhrases_str,srcVocab);
    BasicPhrase[] tgtPhrases = strToPhrase(tgtPhrases_str,tgtVocab);

    int[] srcPhrases_len = phraseLenghts(srcPhrases);
    int[] tgtPhrases_len = phraseLenghts(tgtPhrases);

    int srcPhCount = srcPhrases.length;
    int tgtPhCount = tgtPhrases.length;

    println("      srcPhCount: " + srcPhCount + ", tgtPhCount: " + tgtPhCount);

    TreeSet<Integer> senIndices = null;
    // cache hit: reuse the index sets found earlier for the same phrase texts
    if (alreadyResolved_srcSet.containsKey(cacheKey)) {
      println("      Using cached result (for " + cacheKey + ")");

      TreeSet<Integer> srcIndices_allowed = alreadyResolved_srcSet.get(cacheKey);
      TreeSet<Integer> tgtIndices_allowed = alreadyResolved_tgtSet.get(cacheKey);

      return finalResolve(srcIndices_allowed,tgtIndices_allowed,origSrcIndices,origCandIndices);
    }

    print("      Extracting xxxPhPos...");
    // the keySet of srcPhPos[p] are sentence indices, with key_i mapped to a Vector of the positions
    // of matches of srcPhrases's p'th phrase in the key_i'th sentence
    TreeMap<Integer,Vector<Integer>>[] srcPhPos = getPosMaps(srcPhrases,srcSA);
    TreeMap<Integer,Vector<Integer>>[] tgtPhPos = getPosMaps(tgtPhrases,tgtSA);
    println("done");

    print("      Intersecting sentence indices...");
    senIndices = new TreeSet<Integer>(srcPhPos[0].keySet());
    for (int i = 1; i < srcPhCount; ++i) { senIndices = setIntersect(senIndices,new TreeSet<Integer>(srcPhPos[i].keySet())); }
    for (int i = 0; i < tgtPhCount; ++i) { senIndices = setIntersect(senIndices,new TreeSet<Integer>(tgtPhPos[i].keySet())); }
    // now, if sen_i is in senIndices, this means that the sen_i'th sentence pair
    // contains all the relevant phrases, on both sides
    println("done; intersection has " + senIndices.size() + " indices.");


    // NOTE(review): 'found' is assigned here but never read anywhere in this method
    boolean found = false;

    for (Integer sen_i : senIndices) {

      // srcVecs[ph] / tgtVecs[ph]: start positions of phrase ph within sentence pair sen_i
      @SuppressWarnings("unchecked")
      Vector<Integer>[] srcVecs = new Vector[srcPhCount];
      for (int ph = 0; ph < srcPhCount; ++ph) { srcVecs[ph] = srcPhPos[ph].get(sen_i); }
      @SuppressWarnings("unchecked")
      Vector<Integer>[] tgtVecs = new Vector[tgtPhCount];
      for (int ph = 0; ph < tgtPhCount; ++ph) { tgtVecs[ph] = tgtPhPos[ph].get(sen_i); }

      int[] srcVecs_size = new int[srcPhCount];
      for (int ph = 0; ph < srcPhCount; ++ph) { srcVecs_size[ph] = srcVecs[ph].size(); }
      int[] tgtVecs_size = new int[tgtPhCount];
      for (int ph = 0; ph < tgtPhCount; ++ph) { tgtVecs_size[ph] = tgtVecs[ph].size(); }

      // srcVecs_i[ph] / tgtVecs_i[ph]: which occurrence of phrase ph is currently being tried;
      // together they enumerate every combination of occurrences (see advance())
      int[] srcVecs_i = new int[srcPhCount];
      for (int ph = 0; ph < srcPhCount; ++ph) { srcVecs_i[ph] = 0; }
      int[] tgtVecs_i = new int[tgtPhCount];
      for (int ph = 0; ph < tgtPhCount; ++ph) { tgtVecs_i[ph] = 0; }

      boolean done = false;

      while (!done) {

        // check to see if elements at srcVecs_i[] and tgtVecs_i[] are ordered properly

        boolean ordered = true;

        // each source phrase must end strictly before the next one starts
        for (int ph = 0; ph < srcPhCount-1; ++ph) {
          int end_curr = srcVecs[ph].elementAt(srcVecs_i[ph]) + srcPhrases_len[ph] - 1;
          int start_next = srcVecs[ph+1].elementAt(srcVecs_i[ph+1]);
          if (end_curr >= start_next) {
            ordered = false;
            break;
          }
        }

        if (ordered) { // still ordered; now check tgt side for order
          for (int ph = 0; ph < tgtPhCount-1; ++ph) {
            int end_curr = tgtVecs[ph].elementAt(tgtVecs_i[ph]) + tgtPhrases_len[ph] - 1;
            int start_next = tgtVecs[ph+1].elementAt(tgtVecs_i[ph+1]);
            if (end_curr >= start_next) {
              ordered = false;
              break;
            }
          }

          if (ordered) { // still ordered; now, finally, check if alignment is consistent

            // what do we have here? we know that the sen_i'th training sentence has all the
            // phrases we want (on both source and target side) and we also know the
            // phrases do not cross sentence boundaries and we also know that the phrases
            // are in the right order.

            // now we need to make sure the alignments are contained:
            //   (*) all the words in the phrases of the source side
            //       must not align to ANYthing outside the phrases
            //       of the target side.
            //   (*) all the words in the phrases of the target side
            //       must not align to ANYthing outside the phrases
            //       of the source side.

            // which indices IN THE SOURCE SIDE are the words of the target phrases ALLOWED TO align with?
            TreeSet<Integer> srcIndices_allowed = new TreeSet<Integer>();

            // which indices IN THE TARGET SIDE are the words of the source phrases ALLOWED TO align with?
            TreeSet<Integer> tgtIndices_allowed = new TreeSet<Integer>();


            // set srcIndices_allowed
            for (int ph = 0; ph < srcPhCount; ++ph) {
              int start_i = srcVecs[ph].elementAt(srcVecs_i[ph]);
              int final_i = start_i + srcPhrases_len[ph] - 1;
              for (int i = start_i; i <= final_i; ++i) srcIndices_allowed.add(i);
            } // for (ph:0..srcPhCount)

            // set tgtIndices_allowed
            for (int ph = 0; ph < tgtPhCount; ++ph) {
              int start_i = tgtVecs[ph].elementAt(tgtVecs_i[ph]);
              int final_i = start_i + tgtPhrases_len[ph] - 1;
              for (int i = start_i; i <= final_i; ++i) tgtIndices_allowed.add(i);
            } // for (ph:0..tgtPhCount)


            boolean misalign = false;

            // does any source word align to anything that is not allowed?
            for (Integer i : srcIndices_allowed) {
              int[] tgtIndices = alignments.getAlignedTargetIndices(i);
              if (tgtIndices != null) {
                for (int j = 0; j < tgtIndices.length; ++j) {
                  if (!tgtIndices_allowed.contains(tgtIndices[j])) {
                    misalign = true;
                    break; // from for (j)
                  }
                }
              }

              if (misalign) break; // from for (i)
            }


            if (!misalign) { // still aligned; now check tgt->src

              // does any target word align to anything that is not allowed?
              for (Integer i : tgtIndices_allowed) {
                int[] srcIndices = alignments.getAlignedSourceIndices(i);
                if (srcIndices != null) {
                  for (int j = 0; j < srcIndices.length; ++j) {
                    if (!srcIndices_allowed.contains(srcIndices[j])) {
                      misalign = true;
                      break; // from for (j)
                    }
                  }
                }

                if (misalign) break; // from for (i)
              }

              if (!misalign) { // still aligned; now, FINALLY, extract alignments

                // remember: src->cand

                // cache the match so later links with the same phrase texts skip the search
                alreadyResolved_srcSet.put(cacheKey,srcIndices_allowed);
                alreadyResolved_tgtSet.put(cacheKey,tgtIndices_allowed);

                return finalResolve(srcIndices_allowed,tgtIndices_allowed,origSrcIndices,origCandIndices);
              }

            }


          } // if (ordered)_2

        } // if (ordered)_1

        advance(srcVecs_i,tgtVecs_i,srcVecs_size,tgtVecs_size);
        // advances srcVecs_i and tgtVecs_i, if possible
        // if not possible, sets srcVecs_i[0] to -1 and everything else to 0

        if (srcVecs_i[0] == -1) done = true;

      }

    } // for (sen_i)


    // no occurrence passed all checks: hand the link back unresolved
    return link;
  }

  static private String finalResolve(TreeSet<Integer> srcIndices_allowed, TreeSet<Integer> tgtIndices_allowed, int[] origSrcIndices, int[] origCandIndices)
  {
println("In finalResolve.  Sizes: sI_a: " + srcIndices_allowed.size() + ", tI_a: " + tgtIndices_allowed.size() + ", oSI: " + origSrcIndices.length + ", oCI: " + origCandIndices.length);
    String resolvedStr = "";

    TreeMap<Integer,Integer> toOrigTgt = new TreeMap<Integer,Integer>();
    int oci = 0;
    for (Integer i : tgtIndices_allowed) {
      toOrigTgt.put(i,origCandIndices[oci]);
      ++oci;
    }

    int osi = 0;
    for (Integer i : srcIndices_allowed) {
      int[] tgtIndices = alignments.getAlignedTargetIndices(i);
      if (tgtIndices != null) {
        for (int j = 0; j < tgtIndices.length; ++j) {
          resolvedStr += " " + origSrcIndices[osi] + "-" + toOrigTgt.get(tgtIndices[j]);
        }
      }
      ++osi;
    }
    return resolvedStr.trim();

  }

  static private int[] phraseLenghts(BasicPhrase[] phrases)
  {
    int[] lenghts = new int[phrases.length];
    for (int k = 0; k < phrases.length; ++k) lenghts[k] = phrases[k].size();
    return lenghts;
  }


  static private void advance(int[] A_i, int[] B_i, int[] A_size, int[] B_size)
  {
    int A_cnt = A_i.length;
    int B_cnt = B_i.length;

    boolean B_adv = false;
    int B_curr = B_cnt-1;

    while (true) {
      B_i[B_curr] += 1;
      if (B_i[B_curr] == B_size[B_curr]) {
        B_i[B_curr] = 0;
        --B_curr;
        if (B_curr < 0) break;
      } else {
        B_adv = true;
        break;
      }
    }

    if (!B_adv) {

      boolean A_adv = false;
      int A_curr = A_cnt-1;

      while (true) {
        A_i[A_curr] += 1;
        if (A_i[A_curr] == A_size[A_curr]) {
          A_i[A_curr] = 0;
          --A_curr;
          if (A_curr < 0) break;
        } else {
          A_adv = true;
          break;
        }
      }

      if (!A_adv) {
        A_i[0] = -1;
      }
    }

  }

  static private TreeSet<Integer> setIntersect(TreeSet<Integer> A, TreeSet<Integer> B)
  {
    TreeSet<Integer> retSet = new TreeSet<Integer>();

    for (Integer i : A) { if (B.contains(i)) retSet.add(i); }

    return retSet;
  }

  static private TreeMap<Integer,Vector<Integer>>[] getPosMaps(BasicPhrase[] phrases, Suffixes SA)
  {
    int phCount = phrases.length;

    @SuppressWarnings("unchecked")
    TreeMap<Integer,Vector<Integer>>[] retA = new TreeMap[phCount];

    for (int ph_i = 0; ph_i < phCount; ++ph_i) {
      retA[ph_i] = new TreeMap<Integer,Vector<Integer>>();
      int offset = phrases[ph_i].size() - 1;
      int[] bounds = SA.findPhrase(phrases[ph_i]);
      int[] pos = SA.getAllPositions(bounds);
      for (int p_i = 0; p_i < pos.length; ++p_i) {
        int start_i = pos[p_i];
        int final_i = start_i + offset;
        int senIndex = SA.getSentenceIndex(start_i);
        if (SA.getSentenceIndex(final_i) == senIndex) { // necessary because findPhrase might match across sentences
          Vector<Integer> V = retA[ph_i].get(senIndex);
          if (V == null) V = new Vector<Integer>();
          V.add(start_i);
          retA[ph_i].put(senIndex,V);
        }
      }
    }

    return retA;
  }

  static private String[] indicesToPhrases(String indices, String[] words)
  {
    int[] indices_A = toInt(indices.split(","));

    int phraseCount = gapCount(indices_A) + 1;

    String[] phrases = new String[phraseCount];
    int ph_i = 0;
    String curr_ph = words[indices_A[0]];
    int prev = indices_A[0];

    for (int i = 1; i < indices_A.length; ++i) {
      if (indices_A[i] == prev+1) { // continue phrase
        curr_ph += " " + words[indices_A[i]];
      } else { // gap; end previous phrase and start new one
        phrases[ph_i] = curr_ph;
        curr_ph = words[indices_A[i]];
        ++ph_i;
      }
      prev = indices_A[i];
    }

    phrases[ph_i] = curr_ph;

    // now ph_i+1 == phraseCount
    if (ph_i != phraseCount - 1) {
      println("MISMATCH: ph_i = " + ph_i + "; phraseCount - 1 = " + (phraseCount-1));
    }

    return phrases;
  }

  static private int gapCount(int[] indices)
  {
    if (indices == null || indices.length < 2) {
      return 0;
    } else {
      int count = 0;
     
      int prev = indices[0];
      for (int i = 1; i < indices.length; ++i) {
        if (indices[i] != prev+1) {
          ++count;
        }
        prev = indices[i];
      }

      return count;
    }
  }

  static private BasicPhrase[] strToPhrase(String[] phrases_str, Vocabulary vocab)
  {
    BasicPhrase[] retA = new BasicPhrase[phrases_str.length];
    for (int i = 0; i < phrases_str.length; ++i) { retA[i] = new BasicPhrase(phrases_str[i],vocab); }
    return retA;
  }

  static private void println(Object obj) { System.out.println(obj); }
  static private void print(Object obj) { System.out.print(obj); }
  static private int toInt(String str) { return Integer.parseInt(str); }

  static private int[] toInt(String[] strA)
  {
    int[] intA = new int[strA.length];
    for (int i = 0; i < intA.length; ++i) intA[i] = toInt(strA[i]);
    return intA;
  }

  static private boolean fileExists(String fileName)
  {
    if (fileName == null) return false;
    File checker = new File(fileName);
    return checker.exists();
  }

}
TOP

Related Classes of joshua.aligner.AlignCandidates

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.