Examples of VocabularyWritable

edu.umd.hooka.VocabularyWritable

Examples of edu.umd.hooka.VocabularyWritable

   * @throws IOException
   */
  public static void createTTableFromBerkeleyAligner(String inputFile, String srcVocabFile, String trgVocabFile, String probsFile, 
      float probThreshold, int numTrans, FileSystem fs) throws IOException{
    TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
    VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();
    int cnt = 0;    // for statistical purposes only
    HookaStats stats = new HookaStats(numTrans, probThreshold);


    //In BerkeleyAligner output, dictionary entries of each source term are already sorted by prob. value. 
    try {
      DataInputStream d = new DataInputStream(fs.open(new Path(inputFile)));
      BufferedReader inputReader = new BufferedReader(new InputStreamReader(d));
      String cur = null;
      boolean earlyTerminate = false;
      String line = "";
      while (true) {
        if(!earlyTerminate){
          line = inputReader.readLine();
          if (line == null)
            break;
          cnt++;
        }
        earlyTerminate = false;
        logger.debug("Line:"+line);


        Pattern p = Pattern.compile("(.+)\\tentropy .+nTrans"); 
        Matcher m = p.matcher(line);
        if ( m.find() ) {
          cur = m.group(1);


          int gerIndex = srcVocab.addOrGet(cur);  
          logger.debug("Found: "+cur+" with index: "+gerIndex);




          List<PairOfIntFloat> indexProbPairs = new ArrayList<PairOfIntFloat>();
          float sumOfProbs = 0.0f;
          int i = 0;
          while ( i++ < numTrans ) {
            line = inputReader.readLine(); 
            if ( line == null ) {
              break;
            }else {
              cnt++;
              // check if we've already consumed all translations of this term -- if so, terminate loop
              Pattern p2 = Pattern.compile("\\s*(\\S+): (.+)");
              Matcher m2 = p2.matcher(line);
              if ( !m2.find() ) {
                m = p.matcher(line);
                if ( m.find() ) {
                  logger.debug("Early terminate");
                  earlyTerminate = true;
                  i = numTrans;
                  break;
                }
                //                logger.debug("FFFF"+line);
              } else {
                String term = m2.group(1);
                if ( !term.equals("NULL") ) {
                  float prob = Float.parseFloat(m2.group(2));
                  int engIndex = trgVocab.addOrGet(term);
                  logger.debug("Added: "+term+" with index: "+engIndex+" and prob:"+prob);
                  indexProbPairs.add(new PairOfIntFloat(engIndex, prob));
                  sumOfProbs += prob;
                }
              }
            }
            // if number of translations not set, we never cut-off, so all cases are long tails 
            if ( numTrans != Integer.MAX_VALUE && sumOfProbs > probThreshold ){
              stats.incCntShortTail(1);
              stats.incSumShortTail(i);
              break;
            }
          }
          if ( sumOfProbs <= probThreshold ){
            // early cut-off
            stats.incCntLongTail(1);
            stats.incSumLongTail(i);
            stats.incSumCumProbs(sumOfProbs);
          }


          // to enable faster access with binary search, we sort entries by vocabulary index.
          Collections.sort(indexProbPairs);
          int numEntries = indexProbPairs.size();
          int[] indices = new int[numEntries];
          float[] probs = new float[numEntries];
          i=0;
          for ( PairOfIntFloat pair : indexProbPairs ) {
            indices[i] = pair.getLeftElement();
            probs[i++] = pair.getRightElement() / sumOfProbs;
          }         
          table.set(gerIndex, new IndexedFloatArray(indices, probs, true));
        }
      }


      // dispose all the resources after using them.
      inputReader.close();
    }catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
    System.err.println("File "+inputFile+": read "+cnt+" lines");
    System.err.println("Vocabulary Target: "+trgVocab.size()+" elements");
    System.err.println("Vocabulary Source: "+srcVocab.size()+" elements");
    System.err.println(stats);


    FSDataOutputStream outputStream1 = fs.create(new Path(trgVocabFile));
    ((VocabularyWritable) trgVocab).write(outputStream1);
    outputStream1.close();

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

   * @throws IOException
   */
  public static void createTTableFromGIZA(String inputFile, String srcVocabFile, String trgVocabFile, String probsFile, 
      float probThreshold, int numTrans, FileSystem fs) throws IOException{
    TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
    VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();


    int cnt = 0;


    //In GIZA output, dictionary entries are in random order (w.r.t. prob value), so you need to keep a sorted list of top numTrans or less entries w/o exceeding <probThreshold> probability
    try {
      DataInputStream d = new DataInputStream(fs.open(new Path(inputFile)));
      BufferedReader inputReader = new BufferedReader(new InputStreamReader(d));


      String srcTerm = null, trgTerm = null, prev = null;
      int curIndex = -1;
      TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
      String line = "";
      boolean earlyTerminate = false, skipTerm = false;
      float sumOfProbs = 0.0f, prob;
      HookaStats stats = new HookaStats(numTrans, probThreshold);


      while (true) {  
        //        line = bis.readLine();
        line = inputReader.readLine();
        if(line == null)  break;
        String[] parts = line.split(" ");
        if(parts.length != 3){
          throw new RuntimeException("Unknown format: "+cnt+" = \n"+line);
        }
        cnt++;
        trgTerm = parts[0];
        srcTerm = parts[1];
        prob = Float.parseFloat(parts[2]);


        if (trgTerm.equals("NULL")) {
          continue;   // skip alignments to imaginary NULL word
        }


        // new source term (ignore punctuation)
        if ((prev==null || !srcTerm.equals(prev)) && !delims.contains(srcTerm)){
          if(topTrans.size() > 0){
            // store previous term's top translations to ttable
            addToTable(curIndex, topTrans, sumOfProbs, table, trgVocab, probThreshold, stats);
          }


          logger.debug("Line:"+line);


          // initialize the translation distribution of the source term
          sumOfProbs = 0.0f;
          topTrans.clear();
          earlyTerminate = false;    // reset status
          skipTerm = false;
          prev = srcTerm;
          int prevIndex = curIndex;
          curIndex = srcVocab.addOrGet(srcTerm);
          if(curIndex <= prevIndex){
            // we've seen this foreign term before. probably due to tokenization or sorting error in aligner. just ignore.
            logger.debug("FLAG: "+line);
            curIndex = prevIndex;    // revert curIndex value since we're skipping this one
            skipTerm = true;
            continue;
          }
          logger.debug("Processing: "+srcTerm+" with index: "+curIndex);      
          topTrans.add(new PairOfFloatString(prob, trgTerm));
          sumOfProbs += prob;
          logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");      
        }else if(!earlyTerminate && !skipTerm && !delims.contains(srcTerm)){  //continue adding translation term,prob pairs (except if early termination is ON)
          topTrans.add(new PairOfFloatString(prob, trgTerm));
          sumOfProbs += prob;
          logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");      


          // keep top numTrans translations
          if(topTrans.size() > numTrans){
            PairOfFloatString pair = topTrans.pollFirst();
            float removedProb = pair.getLeftElement();
            sumOfProbs -= removedProb;
            logger.debug("Removed from queue: "+pair.getRightElement()+" (sum: "+sumOfProbs+")");      
          }
        }else{
          logger.debug("Skipped line: "+line);
        }
      }


      //last one
      if(topTrans.size()>0){
        //store previous term's top translations to ttable
        addToTable(curIndex, topTrans, sumOfProbs, table, trgVocab, probThreshold, stats);
      }


      // dispose all the resources after using them.
      inputReader.close();


      System.err.println("File " + inputFile + ": read " + cnt + " lines");
      System.err.println("Vocabulary Target: " + trgVocab.size() + " elements");
      System.err.println("Vocabulary Source: " + srcVocab.size() + " elements");
      System.err.println(stats);
    }catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    Vocab trgVocab = HadoopAlign.loadVocab(new Path(trgVocabFile), fs);
    TTable_monolithic_IFAs ttable = new TTable_monolithic_IFAs(fs, new Path(tableFile), true);


    logger.debug(ttable.getMaxE() + "," + ttable.getMaxF());


    Vocab finalSrcVocab = new VocabularyWritable();
    Vocab finalTrgVocab = new VocabularyWritable();
    TTable_monolithic_IFAs finalTTable = new TTable_monolithic_IFAs();


    String srcTerm = null, trgTerm = null;
    int curIndex = -1;
    TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
    float sumOfProbs = 0.0f, prob;
    //    int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0;    // for statistical purposes only
    HookaStats stats = new HookaStats(numTrans, probThreshold);


    //modify current ttable wrt foll. criteria: top numTrans translations per source term, unless cumulative prob. distr. exceeds probThreshold before that.
    for (int srcIndex = 1; srcIndex < srcVocab.size(); srcIndex++) {
      int[] translations;
      try {
        translations = ttable.get(srcIndex).getTranslations(0f);
      } catch (Exception e) {
        logger.warn("No translations found for "+srcVocab.get(srcIndex)+". Ignoring...");
        continue;
      }


      srcTerm = srcVocab.get(srcIndex);
      curIndex = finalSrcVocab.addOrGet(srcTerm);


      //initialize this term
      topTrans.clear();
      sumOfProbs = 0.0f;
      logger.debug("Processing: " + srcTerm + " with index: " + curIndex + " ("+srcIndex+"); " + translations.length + " translations");
      for (int trgIndex : translations) {
        try {
          trgTerm = trgVocab.get(trgIndex);
        } catch (Exception e) {
          logger.debug("Skipping " + trgIndex);
          continue;
        }
        prob = ttable.get(srcIndex, trgIndex);
        logger.debug("Found: " + trgTerm + " with " + prob);


        topTrans.add(new PairOfFloatString(prob, trgTerm));
        // keep top numTrans translations
        if (topTrans.size() > numTrans) {
          float removedProb = topTrans.pollFirst().getLeftElement();
          sumOfProbs -= removedProb;
        }
        sumOfProbs += prob;


        if (sumOfProbs > probThreshold) {
          logger.debug("Sum of probs > "+probThreshold+", early termination.");
          break;
        }  
      }


      //store previous term's top translations to ttable
      if(topTrans.size() > 0){
        addToTable(curIndex, topTrans, sumOfProbs, finalTTable, finalTrgVocab, probThreshold, stats);
      }
    }
    System.err.println("Vocabulary Target: "+finalTrgVocab.size()+" elements");
    System.err.println("Vocabulary Source: "+finalSrcVocab.size()+" elements");
    System.err.println(stats);


    FSDataOutputStream outputStream1 = fs.create(new Path(finalTrgVocabFile));
    ((VocabularyWritable) finalTrgVocab).write(outputStream1);

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    String stopwordsFile = conf.get(Constants.StopwordList);
    stopwords = readInput(fs, stopwordsFile);      
    String stemmedStopwordsFile = conf.get(Constants.StemmedStopwordList);
    stemmedStopwords = readInput(fs, stemmedStopwordsFile);


    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get(Constants.CollectionVocab)), fs);
      setVocab(vocab);
    } catch (Exception e) {
      sLogger.warn("No vocabulary provided to tokenizer.");

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

   * @throws IOException
   */
  public static void createTTableFromBerkeleyAligner(String inputFile, String srcVocabFile, String trgVocabFile, String probsFile, 
      float probThreshold, int numTrans, FileSystem fs) throws IOException{
    TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
    VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();
    int cnt = 0;    // for statistical purposes only
    HookaStats stats = new HookaStats(numTrans, probThreshold);


    //In BerkeleyAligner output, dictionary entries of each source term are already sorted by prob. value. 
    try {
      DataInputStream d = new DataInputStream(fs.open(new Path(inputFile)));
      BufferedReader inputReader = new BufferedReader(new InputStreamReader(d));
      String cur = null;
      boolean earlyTerminate = false;
      String line = "";
      while (true) {
        if(!earlyTerminate){
          line = inputReader.readLine();
          if (line == null)
            break;
          cnt++;
        }
        earlyTerminate = false;
        logger.debug("Line:"+line);


        Pattern p = Pattern.compile("(.+)\\tentropy .+nTrans"); 
        Matcher m = p.matcher(line);
        if ( m.find() ) {
          cur = m.group(1);


          int gerIndex = srcVocab.addOrGet(cur);  
          logger.debug("Found: "+cur+" with index: "+gerIndex);




          List<PairOfIntFloat> indexProbPairs = new ArrayList<PairOfIntFloat>();
          float sumOfProbs = 0.0f;
          int i = 0;
          while ( i++ < numTrans ) {
            line = inputReader.readLine(); 
            if ( line == null ) {
              break;
            }else {
              cnt++;
              // check if we've already consumed all translations of this term -- if so, terminate loop
              Pattern p2 = Pattern.compile("\\s*(\\S+): (.+)");
              Matcher m2 = p2.matcher(line);
              if ( !m2.find() ) {
                m = p.matcher(line);
                if ( m.find() ) {
                  logger.debug("Early terminate");
                  earlyTerminate = true;
                  i = numTrans;
                  break;
                }
                //                logger.debug("FFFF"+line);
              } else {
                String term = m2.group(1);
                if ( !term.equals("NULL") ) {
                  float prob = Float.parseFloat(m2.group(2));
                  int engIndex = trgVocab.addOrGet(term);
                  logger.debug("Added: "+term+" with index: "+engIndex+" and prob:"+prob);
                  indexProbPairs.add(new PairOfIntFloat(engIndex, prob));
                  sumOfProbs += prob;
                }
              }
            }
            // if number of translations not set, we never cut-off, so all cases are long tails 
            if ( numTrans != Integer.MAX_VALUE && sumOfProbs > probThreshold ){
              stats.incCntShortTail(1);
              stats.incSumShortTail(i);
              break;
            }
          }
          if ( sumOfProbs <= probThreshold ){
            // early cut-off
            stats.incCntLongTail(1);
            stats.incSumLongTail(i);
            stats.incSumCumProbs(sumOfProbs);
          }


          // to enable faster access with binary search, we sort entries by vocabulary index.
          Collections.sort(indexProbPairs);
          int numEntries = indexProbPairs.size();
          int[] indices = new int[numEntries];
          float[] probs = new float[numEntries];
          i=0;
          for ( PairOfIntFloat pair : indexProbPairs ) {
            indices[i] = pair.getLeftElement();
            probs[i++] = pair.getRightElement() / sumOfProbs;
          }         
          table.set(gerIndex, new IndexedFloatArray(indices, probs, true));
        }
      }


      // dispose all the resources after using them.
      inputReader.close();
    }catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
    System.err.println("File "+inputFile+": read "+cnt+" lines");
    System.err.println("Vocabulary Target: "+trgVocab.size()+" elements");
    System.err.println("Vocabulary Source: "+srcVocab.size()+" elements");
    System.err.println(stats);


    FSDataOutputStream outputStream1 = fs.create(new Path(trgVocabFile));
    ((VocabularyWritable) trgVocab).write(outputStream1);
    outputStream1.close();

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

   * @throws IOException
   */
  public static void createTTableFromGIZA(String inputFile, String srcVocabFile, String trgVocabFile, String probsFile, 
      float probThreshold, int numTrans, FileSystem fs) throws IOException{
    TTable_monolithic_IFAs table = new TTable_monolithic_IFAs();
    VocabularyWritable trgVocab = new VocabularyWritable(), srcVocab = new VocabularyWritable();


    int cnt = 0;


    //In GIZA output, dictionary entries are in random order (w.r.t. prob value), so you need to keep a sorted list of top numTrans or less entries w/o exceeding <probThreshold> probability
    try {
      DataInputStream d = new DataInputStream(fs.open(new Path(inputFile)));
      BufferedReader inputReader = new BufferedReader(new InputStreamReader(d));


      String srcTerm = null, trgTerm = null, prev = null;
      int curIndex = -1;
      TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
      String line = "";
      boolean earlyTerminate = false, skipTerm = false;
      float sumOfProbs = 0.0f, prob;
      HookaStats stats = new HookaStats(numTrans, probThreshold);


      while (true) {  
        //        line = bis.readLine();
        line = inputReader.readLine();
        if(line == null)  break;
        String[] parts = line.split(" ");
        if(parts.length != 3){
          throw new RuntimeException("Unknown format: "+cnt+" = \n"+line);
        }
        cnt++;
        trgTerm = parts[0];
        srcTerm = parts[1];
        prob = Float.parseFloat(parts[2]);


        if (trgTerm.equals("NULL")) {
          continue;   // skip alignments to imaginary NULL word
        }


        // new source term (ignore punctuation)
        if ((prev==null || !srcTerm.equals(prev)) && !delims.contains(srcTerm)){
          if(topTrans.size() > 0){
            // store previous term's top translations to ttable
            addToTable(curIndex, topTrans, sumOfProbs, table, trgVocab, probThreshold, stats);
          }


          logger.debug("Line:"+line);


          // initialize the translation distribution of the source term
          sumOfProbs = 0.0f;
          topTrans.clear();
          earlyTerminate = false;   // reset status
          skipTerm = false;
          prev = srcTerm;
          int prevIndex = curIndex;
          curIndex = srcVocab.addOrGet(srcTerm);
          if(curIndex <= prevIndex){
            // we've seen this foreign term before. probably due to tokenization or sorting error in aligner. just ignore.
            logger.debug("FLAG: "+line);
            curIndex = prevIndex;   // revert curIndex value since we're skipping this one
            skipTerm = true;
            continue;
          }
          logger.debug("Processing: "+srcTerm+" with index: "+curIndex);      
          topTrans.add(new PairOfFloatString(prob, trgTerm));
          sumOfProbs += prob;
          logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");      
        }else if(!earlyTerminate && !skipTerm && !delims.contains(srcTerm)){  //continue adding translation term,prob pairs (except if early termination is ON)
          topTrans.add(new PairOfFloatString(prob, trgTerm));
          sumOfProbs += prob;
          logger.debug("Added to queue: "+trgTerm+" with prob: "+prob+" (sum: "+sumOfProbs+")");      


          // keep top numTrans translations
          if(topTrans.size() > numTrans){
            PairOfFloatString pair = topTrans.pollFirst();
            float removedProb = pair.getLeftElement();
            sumOfProbs -= removedProb;
            logger.debug("Removed from queue: "+pair.getRightElement()+" (sum: "+sumOfProbs+")");      
          }
        }else{
          logger.debug("Skipped line: "+line);
        }
      }


      //last one
      if(topTrans.size()>0){
        //store previous term's top translations to ttable
        addToTable(curIndex, topTrans, sumOfProbs, table, trgVocab, probThreshold, stats);
      }


      // dispose all the resources after using them.
      inputReader.close();


      System.err.println("File " + inputFile + ": read " + cnt + " lines");
      System.err.println("Vocabulary Target: " + trgVocab.size() + " elements");
      System.err.println("Vocabulary Source: " + srcVocab.size() + " elements");
      System.err.println(stats);
    }catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    Vocab trgVocab = HadoopAlign.loadVocab(new Path(trgVocabFile), fs);
    TTable_monolithic_IFAs ttable = new TTable_monolithic_IFAs(fs, new Path(tableFile), true);


    logger.debug(ttable.getMaxE() + "," + ttable.getMaxF());


    Vocab finalSrcVocab = new VocabularyWritable();
    Vocab finalTrgVocab = new VocabularyWritable();
    TTable_monolithic_IFAs finalTTable = new TTable_monolithic_IFAs();


    String srcTerm = null, trgTerm = null;
    int curIndex = -1;
    TreeSet<PairOfFloatString> topTrans = new TreeSet<PairOfFloatString>();
    float sumOfProbs = 0.0f, prob;
    //    int cntLongTail = 0, cntShortTail = 0, sumShortTail = 0;    // for statistical purposes only
    HookaStats stats = new HookaStats(numTrans, probThreshold);


    //modify current ttable wrt foll. criteria: top numTrans translations per source term, unless cumulative prob. distr. exceeds probThreshold before that.
    for (int srcIndex = 1; srcIndex < srcVocab.size(); srcIndex++) {
      int[] translations;
      try {
        translations = ttable.get(srcIndex).getTranslations(0f);
      } catch (Exception e) {
        logger.warn("No translations found for "+srcVocab.get(srcIndex)+". Ignoring...");
        continue;
      }


      srcTerm = srcVocab.get(srcIndex);
      curIndex = finalSrcVocab.addOrGet(srcTerm);


      //initialize this term
      topTrans.clear();
      sumOfProbs = 0.0f;
      logger.debug("Processing: " + srcTerm + " with index: " + curIndex + " ("+srcIndex+"); " + translations.length + " translations");
      for (int trgIndex : translations) {
        try {
          trgTerm = trgVocab.get(trgIndex);
        } catch (Exception e) {
          logger.debug("Skipping " + trgIndex);
          continue;
        }
        prob = ttable.get(srcIndex, trgIndex);
        logger.debug("Found: " + trgTerm + " with " + prob);


        topTrans.add(new PairOfFloatString(prob, trgTerm));
        // keep top numTrans translations
        if (topTrans.size() > numTrans) {
          float removedProb = topTrans.pollFirst().getLeftElement();
          sumOfProbs -= removedProb;
        }
        sumOfProbs += prob;


        if (sumOfProbs > probThreshold) {
          logger.debug("Sum of probs > "+probThreshold+", early termination.");
          break;
        } 
      }


      //store previous term's top translations to ttable
      if(topTrans.size() > 0){
        addToTable(curIndex, topTrans, sumOfProbs, finalTTable, finalTrgVocab, probThreshold, stats);
      }
    }
    System.err.println("Vocabulary Target: "+finalTrgVocab.size()+" elements");
    System.err.println("Vocabulary Source: "+finalSrcVocab.size()+" elements");
    System.err.println(stats);


    FSDataOutputStream outputStream1 = fs.create(new Path(finalTrgVocabFile));
    ((VocabularyWritable) finalTrgVocab).write(outputStream1);

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    String stemmedStopwordsFile = conf.get(Constants.StemmedStopwordList);
    stemmedStopwords = readInput(fs, stemmedStopwordsFile);
    isStopwordRemoval = !stopwords.isEmpty();
    isStemming = conf.getBoolean(Constants.Stemming, true);


    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get(Constants.CollectionVocab)), fs);
      setVocab(vocab);
    } catch (Exception e) {
      LOG.warn("No vocabulary provided to tokenizer.");

View Full Code Here

Examples of edu.umd.hooka.VocabularyWritable

    stopwords = readInput(fs, stopwordsFile);      
    String stemmedStopwordsFile = conf.get(Constants.StemmedStopwordList);
    stemmedStopwords = readInput(fs, stemmedStopwordsFile);
    isStopwordRemoval = !stopwords.isEmpty();
    
    VocabularyWritable vocab;
    try {
      vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get(Constants.CollectionVocab)), fs);
      setVocab(vocab);
    } catch (Exception e) {
      LOG.warn("No vocabulary provided to tokenizer.");

View Full Code Here

0 1 2 3 4

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.