Source Code of edu.stanford.nlp.international.arabic.pipeline.IBMMTArabicDataset

package edu.stanford.nlp.international.arabic.pipeline;


import java.io.*;
import java.util.*;
import java.util.regex.*;


import edu.stanford.nlp.international.arabic.IBMArabicEscaper;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.trees.treebank.ConfigParser;
import edu.stanford.nlp.trees.treebank.Dataset;
import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.util.Generics;


/**
 * Applies the same orthographic transformations developed for ATB parse trees to flat
 * MT input. This data set escapes IBM Arabic (for example, it removes explicit clitic markings).
 * <p>
 * NOTE: This class expects UTF-8 input (not Buckwalter)
 *
 * @author Spence Green
 *
 */
public class IBMMTArabicDataset implements Dataset {


  protected Mapper lexMapper = null;
  protected final List<File> pathsToData;


  protected String outFileName;
  protected final Pattern fileNameNormalizer = Pattern.compile("\\s+");


  protected final IBMArabicEscaper escaper;
  private static final Pattern utf8ArabicChart = Pattern.compile("[\u0600-\u06FF]");


  protected final Set<String> configuredOptions;
  protected final Set<String> requiredOptions;
  protected final StringBuilder toStringBuffer;


  public IBMMTArabicDataset() {
    configuredOptions = Generics.newHashSet();
    toStringBuffer = new StringBuilder();
    pathsToData = new ArrayList<File>();


    escaper = new IBMArabicEscaper(true);
    escaper.disableWarnings();


    requiredOptions = Generics.newHashSet();
    requiredOptions.add(ConfigParser.paramName);
    requiredOptions.add(ConfigParser.paramPath);
  }


  public void build() {
    LineNumberReader infile = null;
    PrintWriter outfile = null;
    String currentInfile = "";
    try {
      outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName),"UTF-8")));


      for(File path : pathsToData) {
        infile = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(path),"UTF-8")));
        currentInfile = path.getPath();


        while(infile.ready()) {
          ArrayList<Word> sent = Sentence.toUntaggedList(infile.readLine().split("\\s+"));


          for(Word token : sent) {
            Matcher hasArabic = utf8ArabicChart.matcher(token.word());
            if(hasArabic.find()) {
              token.setWord(escaper.apply(token.word()));
              token.setWord(lexMapper.map(null, token.word()));
            }
          }


          outfile.println(Sentence.listToString(sent));
        }


        toStringBuffer.append(String.format(" Read %d input lines from %s",infile.getLineNumber(),path.getPath()));
      }


      infile.close();


    } catch (UnsupportedEncodingException e) {
      System.err.printf("%s: Filesystem does not support UTF-8 output\n", this.getClass().getName());
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open %s for writing\n", this.getClass().getName(), outFileName);
    } catch(IOException e) {
      System.err.printf("%s: Error reading from %s (line %d)\n", this.getClass().getName(), currentInfile,infile.getLineNumber());
    } catch(RuntimeException e) {
      System.err.printf("%s: Input sentence from %s contains token mapped to null (line %d)\n", this.getClass().getName(),currentInfile,infile.getLineNumber());
      e.printStackTrace();
    } finally {
      if(outfile != null)
        outfile.close();
    }
  }


  public List<String> getFilenames() {
    List<String> l = new ArrayList<String>();
    l.add(outFileName);
    return l;
  }


  @Override
  public String toString() {
    return toStringBuffer.toString();
  }


  public boolean setOptions(Properties opts) {
    for(String opt : opts.stringPropertyNames()) {
      String value = opts.getProperty(opt);


      if(value == null) {
        System.err.printf("%s: Read parameter with null value (%s)\n", this.getClass().getName(),opt);
        continue;
      }


      configuredOptions.add(opt);


      Matcher pathMatcher = ConfigParser.matchPath.matcher(opt);


      if(pathMatcher.lookingAt()) {
        pathsToData.add(new File(value));
        configuredOptions.add(ConfigParser.paramPath);
      } else if(opt.equals(ConfigParser.paramName)) {
        Matcher inThisFilename = fileNameNormalizer.matcher(value.trim());
        outFileName = inThisFilename.replaceAll("-");
        toStringBuffer.append(String.format("Dataset Name: %s\n",value.trim()));
      }
    }


    if(!configuredOptions.containsAll(requiredOptions))
      return false;


    //Finalize the output file names
    outFileName += ".txt";


    //Used for codifying lexical hacks
    lexMapper = new DefaultLexicalMapper();


    return true;
  }


}
Source Code of edu.stanford.nlp.international.arabic.pipeline.IBMMTArabicDataset

Related Classes of edu.stanford.nlp.international.arabic.pipeline.IBMMTArabicDataset