Package edu.stanford.nlp.international.arabic.pipeline

Source Code of edu.stanford.nlp.international.arabic.pipeline.IBMMTArabicDataset

package edu.stanford.nlp.international.arabic.pipeline;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import edu.stanford.nlp.international.arabic.IBMArabicEscaper;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.trees.treebank.ConfigParser;
import edu.stanford.nlp.trees.treebank.Dataset;
import edu.stanford.nlp.trees.treebank.Mapper;
import edu.stanford.nlp.util.Generics;

/**
* Applies the same orthographic transformations developed for ATB parse trees to flat
* MT input. This data set escapes IBM Arabic (for example, it removes explicit clitic markings).
* <p>
* NOTE: This class expects UTF-8 input (not Buckwalter)
*
* @author Spence Green
*
*/
public class IBMMTArabicDataset implements Dataset {

  protected Mapper lexMapper = null;
  protected final List<File> pathsToData;

  protected String outFileName;
  protected final Pattern fileNameNormalizer = Pattern.compile("\\s+");

  protected final IBMArabicEscaper escaper;
  private static final Pattern utf8ArabicChart = Pattern.compile("[\u0600-\u06FF]");

  protected final Set<String> configuredOptions;
  protected final Set<String> requiredOptions;
  protected final StringBuilder toStringBuffer;

  public IBMMTArabicDataset() {
    configuredOptions = Generics.newHashSet();
    toStringBuffer = new StringBuilder();
    pathsToData = new ArrayList<File>();

    escaper = new IBMArabicEscaper(true);
    escaper.disableWarnings();

    requiredOptions = Generics.newHashSet();
    requiredOptions.add(ConfigParser.paramName);
    requiredOptions.add(ConfigParser.paramPath);
  }

  public void build() {
    LineNumberReader infile = null;
    PrintWriter outfile = null;
    String currentInfile = "";
    try {
      outfile = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFileName),"UTF-8")));

      for(File path : pathsToData) {
        infile = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(path),"UTF-8")));
        currentInfile = path.getPath();

        while(infile.ready()) {
          ArrayList<Word> sent = Sentence.toUntaggedList(infile.readLine().split("\\s+"));

          for(Word token : sent) {
            Matcher hasArabic = utf8ArabicChart.matcher(token.word());
            if(hasArabic.find()) {
              token.setWord(escaper.apply(token.word()));
              token.setWord(lexMapper.map(null, token.word()));
            }
          }

          outfile.println(Sentence.listToString(sent));
        }

        toStringBuffer.append(String.format(" Read %d input lines from %s",infile.getLineNumber(),path.getPath()));
      }

      infile.close();

    } catch (UnsupportedEncodingException e) {
      System.err.printf("%s: Filesystem does not support UTF-8 output\n", this.getClass().getName());
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      System.err.printf("%s: Could not open %s for writing\n", this.getClass().getName(), outFileName);
    } catch(IOException e) {
      System.err.printf("%s: Error reading from %s (line %d)\n", this.getClass().getName(), currentInfile,infile.getLineNumber());
    } catch(RuntimeException e) {
      System.err.printf("%s: Input sentence from %s contains token mapped to null (line %d)\n", this.getClass().getName(),currentInfile,infile.getLineNumber());
      e.printStackTrace();
    } finally {
      if(outfile != null)
        outfile.close();
    }
  }

  public List<String> getFilenames() {
    List<String> l = new ArrayList<String>();
    l.add(outFileName);
    return l;
  }

  @Override
  public String toString() {
    return toStringBuffer.toString();
  }

  public boolean setOptions(Properties opts) {
    for(String opt : opts.stringPropertyNames()) {
      String value = opts.getProperty(opt);

      if(value == null) {
        System.err.printf("%s: Read parameter with null value (%s)\n", this.getClass().getName(),opt);
        continue;
      }

      configuredOptions.add(opt);

      Matcher pathMatcher = ConfigParser.matchPath.matcher(opt);

      if(pathMatcher.lookingAt()) {
        pathsToData.add(new File(value));
        configuredOptions.add(ConfigParser.paramPath);
      } else if(opt.equals(ConfigParser.paramName)) {
        Matcher inThisFilename = fileNameNormalizer.matcher(value.trim());
        outFileName = inThisFilename.replaceAll("-");
        toStringBuffer.append(String.format("Dataset Name: %s\n",value.trim()));
      }
    }

    if(!configuredOptions.containsAll(requiredOptions))
      return false;

    //Finalize the output file names
    outFileName += ".txt";

    //Used for codifying lexical hacks
    lexMapper = new DefaultLexicalMapper();

    return true;
  }

}
TOP

Related Classes of edu.stanford.nlp.international.arabic.pipeline.IBMMTArabicDataset

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.