Package com.NLP.Gui

Source Code of com.NLP.Gui.Gui

package com.NLP.Gui;
/*
* Burkan Yılmaz
* Tübitak Bilgem NLP Course Project
* 27/06/2013
* */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;



import com.NLP.Tester.Test;

public class Gui {
 
  public static String[] arguments;                //{"bigramModelDeneme3.lm","trigramModelDeneme2.lm"};
  public Hashtable<String,Hashtable<String,Double> > bigramHash;
  public Hashtable<String,String> disambiguted;
  public String punct = "[\\p{Punct}]";//Regular Expression for Punctuation
  public String whiteSpace = "[\\n]";//Regular Expression for Whitespace Characters
  public String[] punctuations = {"." , "," ,"@", "!","?","\"","<",">","+","-","*","/",")","(","{","}","=","/",";","&","^","'","#","$","£","|","_","~",":"};
  public String[] punctCommas = {"," ,"@","\"","<",">","+","-","*","/",")","(","{","}","=","/",";","&","^","'","#","$","£","|","_","~",":"};
  public String bounders[] = {"...",".","!","?"};
  public static void main(String[] args) {
    if(args.length<3){
      System.err.println("Language Model, Disambiguated Vocabulary and Test file are required in this order.");
    }else{
      arguments = new String[3];
      arguments[0] = args[0];//bigram language model
      arguments[1] = args[1];//disambiguated vocabulary list
      arguments[2] = args[2];//test file
      Gui g = new Gui();
      Test t = new Test();
      g.readBigramModel();
      g.testBoundation();
      t.compare();
    }
  }
  public Gui(){
    bigramHash = new Hashtable<String, Hashtable<String,Double>>();
    disambiguted = new Hashtable<String,String>();
  }
  public void printHash(String s){
    System.out.println(s);
   
  }
  public String splitMorphology(String str){
    String[] array = str.split("\\+");
    String feature = array[1];//Morphological Feature
    for (int i=0;i<array.length;i++) {
      if(array[i].contains("DB")){
        feature = array[i+1];
      }else if(array[i].contains("Cond")){
        feature = "Cond";
      }
    }
    return feature;
  }
  public void readDisambiguator(){
    try{
      BufferedReader reader = new BufferedReader(new FileReader(arguments[1]));//System.getProperty("user.home")+"/Desktop/PROJECT/NLP/esas/yaz-okulu/araclar/disambiguated.txt"
      String line = "";
      String word = "";
      String feature = "";
      while((line=reader.readLine())!=null){
        String[] tokens = line.split(" ");
        word = tokens[0];
        feature = splitMorphology(tokens[1]);
        disambiguted.put(word,feature);
      }
    }catch(IOException e){
      e.printStackTrace();
    }
  }
  public void disambiguator(String file){
    try{
      String[] commands = {"java","-jar","smooth-parse-full-0.6.jar","-i",file};
      ProcessBuilder builder = new ProcessBuilder(commands);
      builder.directory(new File(System.getProperty("user.home")+"/Desktop/PROJECT/NLP/esas/yaz-okulu/araclar"));
      Process process = builder.start();
      int exitValue = process.waitFor();
      if(exitValue!=0){
        System.err.println("An error occured, exit value is "+exitValue);
      }
    }catch(IOException e){
      e.printStackTrace();
    }catch(InterruptedException e){
      e.printStackTrace();
    }
  }
  public String SplitPunct(String sentence){
    Pattern p = Pattern.compile(punct);
    Matcher m = p.matcher(sentence);
    if(m.find()){
      for(int i = 0;i<punctuations.length-1;i++)
      {
        if(sentence.indexOf(punctuations[i])!=-1)
          sentence = sentence.replace(punctuations[i]," "+punctuations[i]+" ");
      }
    }
    return sentence;
  }
  public void writeTextFile(String text){
    try{
      text = text.replaceAll("\n", " ");
      File f = new File(System.getProperty("user.home")+"/Desktop/PROJECT/NLP/esas/yaz-okulu/araclar/kelime.txt");
      if(!f.isFile())
        f.createNewFile();
      FileWriter fw = new FileWriter(f);
      BufferedWriter bw = new BufferedWriter(fw);
      bw.write(text);
      bw.close();
    }catch(IOException e){
      e.printStackTrace();
    }
  }
  public void testBoundation(){
    try{
      BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(arguments[2]),"UTF-8"));//System.getProperty("user.home")+"/Desktop/PROJECT/NLP/yaz-okulu/metin-derlemler/zaman.txt"
      String line = "";
      String paraph = "";
      String golden = "";
      String text = "";
      int counter = 0;
      while((line = reader.readLine())!=null && counter<1200){
          golden += removeAllFloats(line);
          text += (line+"\n");
          line = line.replaceAll("\""," ");
          line = SplitPunct(line);
          line = line.replaceAll(punct, "");
          StringTokenizer tokens = new StringTokenizer(line);
          line = "";
          while(tokens.hasMoreTokens()){
            String str = tokens.nextToken();
            line += str+" ";
          }
          line+="\n";
          paraph += line;
          counter++;
      }
      golden = removeAllCommas(golden);
      writeResults(golden,"golden.txt");
      readDisambiguator();
      System.out.println("Creation is done.");
      System.out.println(text);
      System.out.println("=============================================================================");
      paraph = sentenceBounder(paraph);
      writeResults(paraph,"output.txt");
      System.out.println(paraph);
    }catch(IOException e){
      e.printStackTrace();
    }
  }
  public String replaceLast(String str,String regexp){
    StringBuffer tmp = new StringBuffer();
    String s;
    tmp.append(str);
    s = tmp.reverse().toString().replaceFirst(regexp, "");
    tmp.delete(0, tmp.length()-1);
    tmp.append(s);
    return tmp.reverse().toString();
  }
 
  public String removeAllFloats(String line){
    String sentence = "";
    StringTokenizer tokens = new StringTokenizer(line);
    while(tokens.hasMoreTokens()){
      String str = tokens.nextToken();
      if(str.contains(".") && (Character.isLetterOrDigit(str.charAt(str.indexOf(".") > 0 ? str.indexOf(".") - 1 : str.indexOf("."))) && Character.isLetterOrDigit(str.charAt(str.indexOf(".") < str.length()-1 ? str.indexOf(".") + 1 : str.indexOf("."))))){
        str = str.replace(".", " ").replace(",", " ").trim();
      }else if(str.contains(",") && (Character.isLetterOrDigit(str.charAt(str.indexOf(",") > 0 ? str.indexOf(",") - 1 : str.indexOf(","))) && Character.isLetterOrDigit(str.charAt(str.indexOf(",") < str.length()-1 ? str.indexOf(",") + 1 : str.indexOf(","))))){
        str = str.replace(","," ").replace(".", " ").trim();
      }
      sentence += (str + " ");
    }
    sentence += "\n";
    return sentence;
  }
  public String removeAllCommas(String golden) {
    for(int i = 0;i<punctCommas.length;i++){
      if(punctCommas[i] == "'")
        golden = golden.replace(punctCommas[i], " ");
      else
        golden = golden.replace(punctCommas[i], "");
    }
    return golden;
  }
  public void writeResults(String sentence,String filename){
    try{
      File output = new File(System.getProperty("user.home")+"/Desktop/"+filename);
      if(!output.isFile()){
        output.createNewFile();
      }
      FileWriter fw = new FileWriter(output.getAbsoluteFile());
      BufferedWriter w = new BufferedWriter(fw);
      w.write(sentence);
      w.close();
    }catch(IOException e){
      e.printStackTrace();
    }
  }
  public String sentenceBounder(String sentence){
    StringTokenizer tokens = new StringTokenizer(sentence);
    ArrayList<String> array = new ArrayList<String>();
    String line = "";
   
    while(tokens.hasMoreTokens()){
      array.add(tokens.nextToken());
    }
    ArrayList<Double> all = new ArrayList<Double>();
    for (int i = 0;i<array.size();i++) {
      String str = array.get(i);
      line+=str;
      boolean added = false;
      if(bigramHash.containsKey(str)){
        if(bigramHash.get(str).containsKey(i < array.size()-1 ? array.get(i+1) : "<NULL>")){
//          Enumeration<Double> em = bigramHash.get(str).elements();
//          boolean isPossible = true;
//          while(em.hasMoreElements()){
//            if(bigramHash.get(str).get(i < array.size()-1 ? array.get(i+1) : "<NULL>") < em.nextElement()){
//              isPossible = false;
//              break;
//            }
//          }
//          if(isPossible){
            line += " ";
            added = true;
//          }
        }
        if(!added){
          Enumeration<Double> e = bigramHash.get(str).elements();
          while(e.hasMoreElements()){
            all.add(e.nextElement());
          }
          ArrayList<Double> pList = new ArrayList<Double>();
          for(String s:bounders){
            if(bigramHash.get(str).containsKey(s)){
              pList.add(bigramHash.get(str).get(s));
            }
          }
         
          if(pList.size()>=1){
            Object[] values = pList.toArray();
            Object[] allValues = all.toArray();
            Arrays.sort(values);
            Arrays.sort(allValues);
            Hashtable<String, Double> tmp = bigramHash.get(str);
            Enumeration<String> keys = tmp.keys();
            boolean isAdded = false;//Punctuation is added or not
            if(values[values.length-1]==allValues[allValues.length-1]){
              while(keys.hasMoreElements()){
                String k = keys.nextElement();
                if(tmp.get(k)==values[values.length-1] && (isUpper(array.get(i),i < array.size()-1 ? array.get(i+1) : null) || leftBounder(i < array.size()-1 ? array.get(i+1) : null))){
                  if(isDegree(line)){
                    line+=k;
                  }else{
                    line+=k+" \n";
                    isAdded = true;
                  }
                  break;
                }
              }
              if(!isAdded){
                if(i+1<array.size()){
                  if(disambiguted.containsKey(array.get(i))){
                    if(disambiguted.get(array.get(i)).equals("Verb") && (Character.isUpperCase(array.get(i+1).charAt(0)) || Character.isDigit(array.get(i+1).charAt(0)))){
                      if(isDegree(line)){
                        line+=". ";
                      }else{
                        line+="."+" \n";
                      }
                    }else{
                      line+=" ";
                    }
                  }else{
                    line+=" ";
                  }
                }else
                  line+=" ";
              }
            }else{
              line+=" ";
            }
          }else{
            if(i+1<array.size()){
              if(disambiguted.containsKey(array.get(i))){
                if(disambiguted.get(array.get(i)).equals("Verb") && (Character.isUpperCase(array.get(i+1).charAt(0)) || Character.isDigit(array.get(i+1).charAt(0)) )){
                  if(isDegree(line)){
                    line+=". ";
                  }else{
                    line+="."+" \n";
                  }
                }else{
                  line+=" ";
                }
              }else{
                line+=" ";
              }
            }else
              line+=" ";
          }
          pList.clear();
        }
        all.clear();
      }else{
        if(i+1<array.size()){
          if(disambiguted.containsKey(array.get(i))){
            if(disambiguted.get(array.get(i)).equals("Verb") && (Character.isUpperCase(array.get(i+1).charAt(0)) || Character.isDigit(array.get(i+1).charAt(0)))){
              if(isDegree(line)){
                line+=". ";
              }else{
                line+="."+" \n";
              }
            }else{
              line+=" ";
            }
          }else{
            line+=" ";
          }
        }else
          line+=" ";
      }
    }
    return line;
  }
  public boolean isDegree(String line) {
    if(Character.isDigit(line.charAt(line.length()-1))){
      return true;
    }
    for(int i = line.length()-1;i>=0;i--){
      if(line.charAt(i)==' '){
        if(Character.isUpperCase(line.charAt(i+1))){
          return true;
        }
        else
          break;
      }
    }
    return false;
  }
  public boolean isUpper(String str0,String str) {
    if(str!=null && Character.isDigit(str0.charAt(0)) && Character.isUpperCase(str.charAt(0))){
      return false;
    }
    if(str!=null && Character.isUpperCase(str.charAt(0))){
      return true;
    }else{
      return false;
    }
  }
  public boolean leftBounder(String word){
    if(word==null)
      return true;
    else{
      for(int i = 0 ;i<bounders.length;i++){
        if(bigramHash.containsKey(bounders[i])){
          if(bigramHash.get(bounders[i]).containsKey(word)){
            Enumeration<Double> em = bigramHash.get(bounders[i]).elements();
            while(em.hasMoreElements()){
              if(bigramHash.get(bounders[i]).get(word) < em.nextElement()){
                return false;
              }
            }
            return true;
          }else{
            return false;
          }
        }
      }
      return false;
    }
  }
  public void readBigramModel(){
    try{
      BufferedReader reader = new BufferedReader(new FileReader(arguments[0]));
      String line = "";
      ArrayList<String> array = new ArrayList<String>();
      while((line = reader.readLine())!=null){
        if(line.contains("2-grams")){
          while((line = reader.readLine())!=null){
            StringTokenizer tokens = new StringTokenizer(line);
            if(tokens.countTokens()<=1){
              break;
            }
            while(tokens.hasMoreTokens()){
              String str = tokens.nextToken();
              array.add(str);
            }
            if(bigramHash.containsKey(array.get(1))){
              bigramHash.get(array.get(1)).put(array.get(2), Math.pow(10.0,Double.parseDouble(array.get(0))));
            }else{
              Hashtable<String, Double> hash = new Hashtable<String, Double>();
              hash.put(array.get(2), Math.pow(10.0,Double.parseDouble(array.get(0))));
              bigramHash.put(array.get(1),hash);
            }
            array.clear();
          }
        }
      }
    }catch(IOException e){
      e.printStackTrace();
    }
  }
}
TOP

Related Classes of com.NLP.Gui.Gui

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.