Package edu.stanford.nlp.international.spanish.scripts

Source Code of edu.stanford.nlp.international.spanish.scripts.TreeToTSV

package edu.stanford.nlp.international.spanish.scripts;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.TreeReaderFactory;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeReaderFactory;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.regex.Pattern;

/**
* This script converts a PTB tree into TSV suitable for NER classification. The
* input is an AnCora treebank file with NER tags, and the output is a TSV file
* with tab-seperated word-class pairs, one word per file. These can be used with
* the CRFClassifier for training or testing.
*/
public class TreeToTSV {

  public static void main(String[] args) {
    if(args.length < 1) {
      System.err.printf("Usage: java %s tree_file%n", TreeToTSV.class.getName());
      System.exit(-1);
    }

    String treeFile = args[0];

    try {
      BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
      TreeReaderFactory trf = new SpanishTreeReaderFactory();
      TreeReader tr = trf.newTreeReader(br);

      StringBuilder sb = new StringBuilder();
      String nl = System.getProperty("line.separator");

      Pattern nePattern = Pattern.compile("^grup\\.nom\\.");
      Pattern npPattern = Pattern.compile("^np0000.$");

      for (Tree tree; (tree = tr.readTree()) != null;) {
        for(Tree t : tree) {
          if(!t.isPreTerminal())
            continue;

          char type = 'O';
          Tree grandma = t.ancestor(1, tree);
          String grandmaValue = ((CoreLabel) grandma.label()).value();

          // grup.nom.x
          if(nePattern.matcher(grandmaValue).find())
            type = grandmaValue.charAt(9);

          // else check the pos for np0000x or not
          else {
            String pos = ((CoreLabel) t.label()).value();
            if(npPattern.matcher(pos).find())
              type = pos.charAt(6);
          }

          Tree wordNode = t.firstChild();
          String word = ((CoreLabel) wordNode.label()).value();
          sb.append(word).append("\t");
          switch(type) {
          case 'p':
            sb.append("PERS");
            break;
          case 'l':
            sb.append("LUG");
            break;
          case 'o':
            sb.append("ORG");
            break;
          case '0':
            sb.append("OTROS");
            break;
          default:
            sb.append("O");
          }
          sb.append(nl);
        }
        sb.append(nl);
      }
      System.out.print(sb.toString());

      tr.close();
    } catch (UnsupportedEncodingException e) {
      e.printStackTrace();
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }

  }
}
TOP

Related Classes of edu.stanford.nlp.international.spanish.scripts.TreeToTSV

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.