Package edu.stanford.nlp.international.morph

Source Code of edu.stanford.nlp.international.morph.AddMorphoAnnotations$YieldIterator

package edu.stanford.nlp.international.morph;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.international.arabic.ArabicTreeReaderFactory;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;

/**
* Reads in the tree files without any kind of pre-processing. Assumes that the trees
* have been processed separately.
* <p>
* TODO: wsg2011 Extend to other languages. Only supports Arabic right now.
*
* @author Spence Green
*
*/
public final class AddMorphoAnnotations {
 
  // Minimum number of command-line arguments; main() prints the usage message and exits when fewer are given.
  private static final int minArgs = 2;
  private static String usage() {
    StringBuilder sb = new StringBuilder();
    sb.append(String.format("Usage: java %s [OPTS] morph_file lemma_file < tree_file \n\n",AddMorphoAnnotations.class.getName()));
    sb.append("Options:\n");
    sb.append("  -e enc     : Encoding.\n");
    sb.append("  -g         : Morph file is gold tree file with morph analyses in the pre-terminals.");
    return sb.toString();
  }
  private static Map<String,Integer> argSpec() {
    Map<String,Integer> argSpec = Generics.newHashMap();
    argSpec.put("g", 0);
    argSpec.put("e", 1);
    return argSpec;
  }
 
  /**
   * Iterate over either strings or leaves.
   *
   * @author Spence Green
   *
   */
  private static class YieldIterator implements Iterator<List<String>> {

    private List<String> nextYield = null;
    BufferedReader fileReader = null;
    TreeReader treeReader = null;
   
    public YieldIterator(String fileName, boolean isTree) {
      try {
        if (isTree) {
          TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true);
          treeReader = trf.newTreeReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
        } else {
          fileReader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
        }
      } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
      } catch (FileNotFoundException e) {
        e.printStackTrace();
      }
      primeNext();
    }
   
    private void primeNext() {
      try {
        if (treeReader != null) {
            Tree tree = treeReader.readTree();
            if (tree == null) {
              nextYield = null;
            } else {
              List<CoreLabel> mLabeledLeaves = tree.taggedLabeledYield();
              nextYield = new ArrayList<String>(mLabeledLeaves.size());
              for (CoreLabel label : mLabeledLeaves) {
                nextYield.add(label.tag());
              }
            }
        } else {
          String line = fileReader.readLine();
          if (line == null) {
            nextYield = null;
          } else {
            nextYield = Arrays.asList(line.split("\\s+"));
          }
        }
      } catch (IOException e) {
        nextYield = null;
        e.printStackTrace();
      }
    }
   
    @Override
    public boolean hasNext() {
      return nextYield != null;
    }

    @Override
    public List<String> next() {
      if (nextYield == null) {
        try {
          if (fileReader != null) {
            fileReader.close();
            fileReader = null;
          } else if (treeReader != null) {
            treeReader.close();
            treeReader = null;
          }
        } catch (IOException e) {
          e.printStackTrace();
        }
        return null;
      } else {
        List<String> next = nextYield;
        primeNext();
        return next;
      }
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  /**
   *
   * @param args
   */
  public static void main(String[] args) {
    if(args.length < minArgs) {
      System.err.println(usage());
      System.exit(-1);
    }
    Properties options = StringUtils.argsToProperties(args, argSpec());
    String encoding = options.getProperty("e", "UTF-8");
    boolean isMorphTreeFile = PropertiesUtils.getBool(options, "g", false);
    String[] parsedArgs = options.getProperty("").split("\\s+");
    if (parsedArgs.length != 2) {
      System.err.println(usage());
      System.exit(-1);
    }
   
    YieldIterator morphIter = new YieldIterator(parsedArgs[0], isMorphTreeFile);
    YieldIterator lemmaIter = new YieldIterator(parsedArgs[1], false);
   
    final Pattern pParenStripper = Pattern.compile("[\\(\\)]");
       
    try {
      BufferedReader brIn = new BufferedReader(new InputStreamReader(System.in, encoding));
      TreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory(true);

      int nTrees = 0;
      for(String line; (line = brIn.readLine()) != null; ++nTrees) {
        Tree tree = trf.newTreeReader(new StringReader(line)).readTree();
        List<Tree> leaves = tree.getLeaves();
        if(!morphIter.hasNext()) {
          throw new RuntimeException("Mismatch between number of morpho analyses and number of input lines.");
        }
        List<String> morphTags = morphIter.next();
        if (!lemmaIter.hasNext()) {
          throw new RuntimeException("Mismatch between number of lemmas and number of input lines.");
        }
        List<String> lemmas = lemmaIter.next();
        
        // Sanity checks
        assert morphTags.size() == lemmas.size();
        assert lemmas.size() == leaves.size();
       
        for(int i = 0; i < leaves.size(); ++i) {
          String morphTag = morphTags.get(i);
          if (pParenStripper.matcher(morphTag).find()) {
            morphTag = pParenStripper.matcher(morphTag).replaceAll("");
          }
          String newLeaf = String.format("%s%s%s%s%s", leaves.get(i).value(),
              MorphoFeatureSpecification.MORPHO_MARK,
              lemmas.get(i),
              MorphoFeatureSpecification.LEMMA_MARK,
              morphTag);
          leaves.get(i).setValue(newLeaf);
        }
        System.out.println(tree.toString());
      }
     
      // Sanity checks
      assert !morphIter.hasNext();
      assert !lemmaIter.hasNext();
     
      System.err.printf("Processed %d trees%n",nTrees);
     
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
TOP

Related Classes of edu.stanford.nlp.international.morph.AddMorphoAnnotations$YieldIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.