Package com.NLP.MorphAnalized

Source Code of com.NLP.MorphAnalized.Disambiguator

package com.NLP.MorphAnalized;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.StringTokenizer;

import com.NLP.ReqExp.RegExp;

public class Disambiguator {
 
  HashSet<String> set;
  String[] data;
  RegExp regexp;
  public Disambiguator(){
    set = new HashSet<String>();
    data = new String[8];
    data[0] = "cnn-turk.txt";
    data[1] = "derlem-lisans.txt";
    data[2] = "dunya.txt";
    data[3] = "milliyet-sondakika.txt";
    data[4] = "ntvmsnbc.txt";
    data[5] = "radikal.txt";
    data[6] = "star-gazete.txt";
    data[7] = "tbmm.txt";
    regexp = new RegExp();
  }
  public void disambiguatedVocab(){
    if(!set.isEmpty()){
      try{
        set.remove(" ");
        File f = new File(System.getProperty("user.home")+"/Desktop/vocab.txt");
        if(!f.isFile())
          f.createNewFile();
        FileWriter w = new FileWriter(f);
        BufferedWriter bw = new BufferedWriter(w);
        for (Object elem : set.toArray()) {
          String element = elem.toString();
          bw.write(element);
          bw.write('\n');
        }
        bw.close();
      }catch(IOException e){
        e.printStackTrace();
      }
    }
  }
  public void readData(){
    try {
      for(int i = 0;i<data.length;i++){
        BufferedReader reader = new BufferedReader(new FileReader(System.getProperty("user.home")+"/Desktop/PROJECT/NLP/yaz-okulu/metin-derlemler/"+data[i]));
        String line = "";
        String token;
        while((line = reader.readLine())!=null){
          if(line.contains("<DOC_END>")){
            continue;
          }else{
            line = line.replaceAll("\"", " ");
            line = regexp.SplitPunct(line);
            StringTokenizer tokenizer = new StringTokenizer(line);
            while(tokenizer.hasMoreTokens()){
              token = tokenizer.nextToken();
              set.add(token);
            }
          }
        }
      }
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch(IOException e){
      e.printStackTrace();
    }
  }
}
TOP

Related Classes of com.NLP.MorphAnalized.Disambiguator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.