Package joshua.subsample

Source Code of joshua.subsample.Subsampler

/* This file is part of the Joshua Machine Translation System.
*
* Joshua is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free
* Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
* MA 02111-1307 USA
*/
/*
* This file is based on the edu.umd.clip.mt.subsample.Subsampler
* class from the University of Maryland's jmtTools project (in
* conjunction with the umd-hadoop-mt-0.01 project). That project
* is released under the terms of the Apache License 2.0, but with
* special permission for the Joshua Machine Translation System to
* release modifications under the LGPL version 2.1. LGPL version
* 3 requires no special permission since it is compatible with
* Apache License 2.0
*/
package joshua.subsample;

import joshua.corpus.Phrase;
import joshua.corpus.vocab.Vocabulary;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.HashSet;
import java.util.Set;


/**
* A class for subsampling a large (F,E)-parallel sentence-aligned
* corpus to generate a smaller corpus whose N-grams are relevant
* to some seed corpus. The idea of subsampling owes to Kishore
* Papineni.
*
* @author UMD (Jimmy Lin, Chris Dyer, et al.)
* @author wren ng thornton <wren@users.sourceforge.net>
* @version $LastChangedDate: 2009-05-27 20:14:28 -0500 (Wed, 27 May 2009) $
*/
public class Subsampler {
  protected Vocabulary ve = new Vocabulary();
  protected Vocabulary vf = new Vocabulary();
  protected Map<Phrase,Integer> ngramCounts;
  protected int maxN;
  protected int targetCount;
  protected int maxSubsample = 1500000;
 
  protected static final int MAX_SENTENCE_LENGTH = 100;
  protected static final int MIN_RATIO_LENGTH    = 10;
 
 
  public Subsampler(String[] testFiles, int maxN, int targetCount)
  throws IOException {
    this.maxN        = maxN;
    this.targetCount = targetCount;
    this.ngramCounts = loadNgrams(testFiles);
  }
  private HashMap<Phrase,Integer> loadNgrams(String[] files)
  throws IOException {
    HashMap<Phrase,Integer> map = new HashMap<Phrase,Integer>();
    for (String fn : files) {
      System.err.println("Loading test set from " +fn+ "...");
     
      PhraseReader reader = new PhraseReader(
          new FileReader(fn), this.vf, (byte)1);
      Phrase phrase;
      int lineCount = 0;
      try {
        while ((phrase = reader.readPhrase()) != null) {
          lineCount++;
          List<Phrase> ngrams = phrase.getSubPhrases(this.maxN);
          for (Phrase ngram : ngrams) map.put(ngram, 0);
        }
      } finally {
        reader.close();
      }
      System.err.println("Processed " +lineCount+ " lines in " +fn);
    }
    System.err.println("Test set: " +map.size()+ " ngrams");
    return map;
  }
 
 
  /**
   * The general subsampler function for external use.
   *
   * @param filelist list of source files to subsample from
   * @param targetFtoERatio goal for ratio of output F length
   *                        to output E length
   * @param extf   extension of F files
   * @param exte   extension of E files
   * @param fpath  path to source F files
   * @param epath  path to source E files
   * @param output basename for output files (will append
   *               extensions)
   */
  public void subsample(
    String filelist, float targetFtoERatio,
    String extf,  String exte,
    String fpath, String epath,
    String output
  ) throws IOException {
    this.subsample(
      filelist,
      targetFtoERatio,
      new PhraseWriter(
        new BufferedWriter(
          new OutputStreamWriter(
            new FileOutputStream(output + "." + extf),
            "UTF8")),
        new BufferedWriter(
          new OutputStreamWriter(
            new FileOutputStream(output + "." + exte),
            "UTF8"))
        ),
      new BiCorpusFactory(
        fpath, epath, null,
        extf,  exte,  null,
        this.vf, this.ve)
      );
  }
 
  /**
   * The main wrapper for the subsample worker. Closes the
   * PhraseWriter before exiting.
   */
  protected void subsample(
    String filelist, float targetFtoERatio,
    PhraseWriter out, BiCorpusFactory bcFactory
  ) throws IOException {
    try {
      // Read filenames into a list
      List<String> files = new ArrayList<String>();
      {
        FileReader     fr = null;
        BufferedReader br = null;
        try {
          fr = new FileReader(filelist);
          br = new BufferedReader(fr);
          String file;
          while((file = br.readLine()) != null) {
            files.add(file);
          }
        } finally {
          // Maybe redundant, but UMD's FixBugs says to
          // close br (and close is idempotent anyways)
          if (null != fr) fr.close();
          if (null != br) br.close();
        }
      }
     
      int totalSubsampled = 0;
      // Iterating on files in order biases towards files
      // earlier in the list
      for (String f : files) {
        System.err.println("Loading training data: " + f);
       
        BiCorpus bc = bcFactory.fromFiles(f);
       
        Set<PhrasePair> set = new HashSet<PhrasePair>();
       
        int binsize = 10; // BUG: Magic-Number
        int max_k   = MAX_SENTENCE_LENGTH / binsize;
        System.err.print("Looking in length range");
        // Iterating bins from small to large biases
        // towards short sentences
        for (int k = 0; k < max_k; k++) {
          System.err.print(
            " [" +(k * binsize + 1)+ "," +((k + 1) * binsize)+ "]");
          System.err.flush();
         
          this.subsample(
            set,
            bc,
            k * binsize + 1,
            (k + 1) * binsize,
            targetFtoERatio);
         
          if (set.size() + totalSubsampled > maxSubsample) break;
        }
       
        float ff = 0.0f;
        float ef = 0.0f;
        for (PhrasePair pp : set) {
          // Get pp.ratioFtoE() for all pp
          ff += pp.getF().size();
          ef += pp.getE().size();
         
          out.write(pp);
          out.newLine();
        }
        out.flush();
       
        totalSubsampled += set.size();
        System.err.println(
          "\n  current="       +set.size()+
          " [total="           +totalSubsampled+
          "]    currentRatio=" +(ff/ef) );
        System.err.flush();
       
        // TODO: is this gc actually dubious? Or
        // does profiling show it helps? We only
        // do it once per file, so it's not a
        // performance blackhole.
        set = null; bc = null; System.gc();
      }
    } finally {
      out.close();
    }
  }
 
  /**
   * The worker function for subsampling.
   *
   * @param set       The set to put selected sentences into
   * @param bc        The sentence-aligned corpus to read from
   * @param minLength The minimum F sentence length
   * @param maxLength The maximum F sentence length
   * @param targetFtoERatio The desired ratio of F length to
   *                  E length
   */
  private void subsample(
    Set<PhrasePair> set,
    BiCorpus bc,
    int minLength,
    int maxLength,
    float targetFtoERatio
  ) {
    for (PhrasePair pp : bc) {
      { int eLength = pp.getE().size();
      if (eLength == 0
      ||  eLength >  MAX_SENTENCE_LENGTH)             continue; }
     
      int fLength = pp.getF().size();
      if (fLength == 0
      ||  fLength <  minLength
      ||  fLength >  maxLength
      ||  fLength >  MAX_SENTENCE_LENGTH)             continue;
      if (fLength > 10
      && targetFtoERatio != 0.0f) {
        float ratio = pp.ratioFtoE();
        if (fLength >= MIN_RATIO_LENGTH
          && (   ratio > 1.3f * targetFtoERatio
            || ratio * 1.3f < targetFtoERatio)) continue;
      }
      if (set.contains(pp))                           continue;
     
      // at this point, length checks out and the sentence hasn't
      // been selected yet
     
      List<Phrase> ngrams = pp.getF().getSubPhrases(this.maxN);
      boolean useSentence = false;
      for (Phrase ng : ngrams) {
        Integer count = this.ngramCounts.get(ng);
        if (count == null) continue;
        if (count < targetCount) {
          useSentence = true;
          count++;
          this.ngramCounts.put(ng, count);
        }
      }
      if (useSentence) set.add(pp);
    }
  }
 
 
  public static void main(String[] args) {
    new SubsamplerCLI().runMain(args);
  }
}
TOP

Related Classes of joshua.subsample.Subsampler

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.