Package com.NLP.Vocabulary

Source Code of com.NLP.Vocabulary.Vocab

package com.NLP.Vocabulary;
/*
* Burkan Yılmaz
* Tübitak Bilgem NLP Course Project
* 25/07/2013
* */
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.StringTokenizer;

import com.NLP.ReqExp.RegExp;

public class Vocab {
  private String[] data;
  private RegExp regexp;
  private HashSet<String> set;
  public Vocab(){
    set = new HashSet<String>();
    data = new String[8];
    regexp = new RegExp();
    data[0] = "cnn-turk.txt";
    data[1] = "derlem-lisans.txt";
    data[2] = "dunya.txt";
    //data[3] = "hukuki-net.txt";
    data[3] = "milliyet-sondakika.txt";
    data[4] = "ntvmsnbc.txt";
    data[5] = "radikal.txt";
    data[6] = "star-gazete.txt";
    data[7] = "tbmm.txt";
    //data[8] = "sentences_utf_8.txt";
   
  }
 
  public static void main(String[] args) {
    Vocab v = new Vocab();
    v.readTrainData();
    v.writeVocabulary();
    //v.print();
   
  }
  public void print(){
    if(!set.isEmpty())
      System.out.println(set.size());
    String a = "asd  swa";
    String[] ar = a.split(" ");
    System.err.println(ar.length);
    for(int i = 0 ;i<ar.length;i++)
    System.err.println(ar[i]);
    /*String a = "\"asdw.wad,awdas;awdasdawdswd\"wdadwada\"";
    System.out.println(a);
    a = a.replaceAll("\"","");
    System.out.print(a);
    String a = "asddw  adwaw  wawa ssd";
    StringTokenizer tokens = new StringTokenizer(a);
    while(tokens.hasMoreTokens()){
      System.out.println(tokens.nextToken());
    }
   
    Pattern p = Pattern.compile("\\p{Punct}");
    Matcher m = p.matcher(regexp.test);
    if(m.find()){
   
    System.out.println(regexp.test);
    }*/
  }
  public void writeVocabulary(){
    if(!set.isEmpty()){
      try {
        set.remove(new String(" "));
        File vocabulary = new File(System.getProperty("user.home")+"/Desktop/vocab.txt");
        if(!vocabulary.isFile()){
          vocabulary.createNewFile();
        }
        FileWriter writer = new FileWriter(vocabulary.getAbsoluteFile());
        BufferedWriter bw = new BufferedWriter(writer);
        for (Object elem : set.toArray()) {
          String element = elem.toString();
          bw.write(element);
          bw.write('\n');
        }
        bw.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  public void readTrainData(){   
    try {
      for(int i = 0;i<data.length;i++){
        BufferedReader reader = new BufferedReader(new FileReader("Dosya Yolu"));
        String line = "";
        String token;
        while((line = reader.readLine())!=null){
          if(line.contains("<DOC_END>")){
            continue;
          }else{
            line = line.replaceAll("\"", " ");
            line = regexp.SplitPunct(line);
            StringTokenizer tokenizer = new StringTokenizer(line);
            while(tokenizer.hasMoreTokens()){
              token = tokenizer.nextToken();
              set.add(token);
            }
          }
        }
      }
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch(IOException e){
      e.printStackTrace();
    }
  }
}
TOP

Related Classes of com.NLP.Vocabulary.Vocab

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.