package syntaxLearner;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import syntaxLearner.corpus.Corpus;
import syntaxLearner.corpus.Vocabulary;
import syntaxLearner.corpus.Word;
/**
 * Records the state of a learning run as JavaScript files. Every file holds a
 * single assignment of an object literal to a global variable whose name is
 * derived from the corpus name, the iteration number and (where relevant) the
 * cluster or word index.
 * <p>
 * All public {@code record*} methods are no-ops when the recorder was created
 * with {@code isRecording == false}. I/O failures are reported with
 * {@code printStackTrace()} and otherwise ignored, so recording never aborts
 * the caller.
 *
 * @author Omer Shapira
 */
public class Recorder {

    /** Root directory under which all output is written. */
    private final File mainPath;
    /** Directory of the iteration being recorded; {@code null} until {@link #recordNewIteration(String)} runs. */
    private File currentPath;
    /** The "words" sub-directory of {@link #currentPath}; {@code null} until {@link #recordNewIteration(String)} runs. */
    private File wordPath;
    /** Corpus name, embedded in every file name and variable name. */
    private final String name;
    private final boolean isRecording;
    /** Highest iteration number seen so far from {@code l.getIterationCount()}. */
    private int iterationCounter = 0;
    private File corpusFile;
    private final Learner l;
    private File iterationFile;
    /** Corpus remembered by {@link #recordCorpusData(Corpus, Learner)}; needed for word ranks. */
    private Corpus c;

    /**
     * @param l           learner whose iteration count and cluster count are read while recording
     * @param mainPath    root output directory
     * @param name        corpus name used in file and variable names
     * @param isRecording when {@code false} every {@code record*} call returns immediately
     */
    public Recorder(Learner l, File mainPath, String name, boolean isRecording) {
        this.mainPath = mainPath;
        this.name = name;
        this.isRecording = isRecording;
        this.l = l;
    }

    /**
     * Writes {@code corpus_<name>.js} into the main directory and remembers the
     * corpus for later {@link #recordWordInfo} calls. Output shape:
     * <pre>
     * corpus_%corpusName% =
     * {
     *     name: "...",
     *     tokenCount: 500,
     *     typeCount: 1000,
     *     commonTypes: 900,
     *     clusterCount: 100,
     *     identityEps: 0.00004,
     *     iterationCount: 7,
     *     typeToId:
     *     {
     *         "typeName": 0,
     *         "typeName2": 1
     *     },
     *     idToType: {
     *         1: "word",
     *         4: "bleh"
     *     }
     * };
     * </pre>
     * Only words whose frequency is at least {@code l.RARE_WORD_THRESHOLD} are
     * listed in either map. Type names are escaped so that quotes, backslashes
     * and control characters cannot break the generated string literals.
     *
     * @param c corpus to describe
     * @param l learner supplying the thresholds and cluster count (this
     *          parameter, not the learner given to the constructor, is used here)
     */
    public void recordCorpusData(Corpus c, Learner l) {
        if (!isRecording) {
            return;
        }
        this.c = c;
        Vocabulary v = c.getVocabulary();
        Set<Map.Entry<String, Integer>> entrySet = v.getWordIndicesEntrySet();
        Set<Map.Entry<Integer, Word>> wordSet = v.getWordEntrySet();
        StringBuilder s = new StringBuilder(v.getNumOfWords() * 30);
        corpusFile = new File(mainPath, "corpus_" + name + ".js");

        s.append(String.format("corpus_%1$s = \n{\n\tname:\"%1$s\",\n", name));
        s.append(String.format("\ttokenCount: %1$s,\n", c.tokenCount));
        s.append(String.format("\ttypeCount: %1$s,\n", v.getNumOfWords()));
        s.append(String.format("\tcommonTypes: %1$s,\n",
                (v.getNumOfWords() - v.countWordsBelowThreshold(l.RARE_WORD_THRESHOLD))));
        s.append(String.format("\tclusterCount: %1$s,\n", l.NUMBER_OF_CLUSTERS));
        //TODO add proper number formatters
        s.append(String.format("\tidentityEps: %1$s,\n", l.IDENTITY_EPSILON));
        s.append(String.format("\titerationCount: %1$s,\n", iterationCounter));

        // The separator is emitted BEFORE each entry, so the last entry never
        // carries a trailing comma and an empty map is still well-formed.
        s.append("\ttypeToId: \n\t{");
        String sep = "\n";
        for (Map.Entry<String, Integer> e : entrySet) {
            if (v.getWord(e.getValue()).frequency >= l.RARE_WORD_THRESHOLD) { //TODO see if necessary
                s.append(sep);
                s.append(String.format("\t\t\"%1$s\": %2$s", escapeJs(e.getKey()), e.getValue()));
                sep = ",\n";
            }
        }
        s.append("\n\t},\n");

        s.append("\tidToType: {");
        sep = "\n";
        for (Map.Entry<Integer, Word> e : wordSet) {
            // Rare words are skipped here as well, mirroring typeToId.
            if (e.getValue().frequency >= l.RARE_WORD_THRESHOLD) {
                s.append(sep);
                s.append(String.format("\t\t%1$s: \"%2$s\"", e.getKey(), escapeJs(e.getValue().name)));
                sep = ",\n";
            }
        }
        s.append("\n\t}");
        s.append("\n};");

        writeToFile(corpusFile, s);
    }

    /** Advances {@link #iterationCounter} to the learner's iteration count; it never moves backwards. */
    private void updateIterationData() {
        if (iterationCounter < l.getIterationCount()) {
            iterationCounter = l.getIterationCount();
            //TODO Write iteration number to file
        }
    }

    /**
     * Starts a new iteration: refreshes the iteration counter, creates the
     * directory {@code <mainPath>/<iteration>/words} (including any missing
     * parents) and writes {@code iteration_<name>_<iteration>.js} into the
     * iteration directory. Output shape:
     * <pre>
     * iteration_%corpusName%_%iterationNumber% = {
     *     iteration_number : 2,
     *     unsorted_words: ["a", "b", ...]
     * }
     * </pre>
     *
     * @param words text written verbatim as the value of {@code unsorted_words};
     *              the caller is responsible for it being a valid JavaScript
     *              expression
     */
    public void recordNewIteration(String words) {
        if (!isRecording) {
            return;
        }
        updateIterationData();
        currentPath = new File(mainPath, Integer.toString(iterationCounter));
        wordPath = new File(currentPath, "words");
        // mkdirs() on the deepest directory also creates currentPath and, if
        // necessary, mainPath; plain mkdir() would fail when a parent is missing.
        wordPath.mkdirs();
        iterationFile = new File(currentPath,
                String.format("iteration_%1$s_%2$s.js", name, iterationCounter));

        StringBuilder s = new StringBuilder();
        s.append(String.format("iteration_%1$s_%2$s = {\n", name, iterationCounter));
        s.append(String.format("\titeration_number : %1$s,\n", iterationCounter));
        s.append("\tunsorted_words: ");
        s.append(words);
        s.append("\n}");

        writeToFile(iterationFile, s);
    }

    /**
     * Writes {@code cluster_<name>_<iteration>_<clusterId>.js} into the current
     * iteration directory. Output shape:
     * <pre>
     * cluster_%corpusName%_%iterationNumber%_%clusterIndex% =
     * {
     *     index: 30,
     *     words: [ 8, 9, ... ],
     *     distribution: [ 0.100000, 0.100000, ... ]
     * }
     * </pre>
     * All rows returned by {@code c.clusterDistribution()} are concatenated into
     * one flat list. NOTE(review): the original format sketch showed one nested
     * array per row; confirm which form the reader of these files expects.
     * <p>
     * Empty word lists or distributions are written as {@code [ ]}.
     *
     * @param c cluster to record
     */
    public void recordClusterInfo(Cluster c) {
        if (!isRecording) {
            return;
        }
        StringBuilder s = new StringBuilder();
        String filename = String.format("cluster_%1$s_%2$s_%3$s", name, iterationCounter, c.ID);
        s.append(filename + " =\n{\n");
        s.append(String.format("\tindex: %1$s,\n", c.ID));

        s.append("\twords: [");
        String sep = "";
        for (int i : c.words) {
            s.append(sep).append(' ').append(i);
            sep = ",";
        }
        s.append(" ],\n");

        s.append("\tdistribution: [");
        sep = "";
        for (double d1[] : c.clusterDistribution()) {
            for (double d2 : d1) {
                s.append(sep).append(formatNumber(d2));
                sep = ",";
            }
        }
        s.append(" ]\n}");

        writeToFile(new File(currentPath, filename + ".js"), s);
    }

    /**
     * Writes {@code word_<name>_<iteration>_<wordId>.js} into the current
     * iteration's "words" directory. Output shape:
     * <pre>
     * word_%corpusName%_%iterationNumber%_%wordIndex% =
     * {
     *     word: "..",
     *     wordIndex: 5,
     *     freq: 80,
     *     rank: 150,
     *     distribution: ...,
     *     dist_to_cluster : [ 0.100000, 0.100000, ... ]
     * }
     * </pre>
     * {@code dist_to_cluster} has one slot per cluster
     * ({@code l.NUMBER_OF_CLUSTERS}); clusters missing from {@code distances}
     * keep the value 0.
     * <p>
     * The rank is looked up in the corpus stored by
     * {@link #recordCorpusData(Corpus, Learner)}, so that method must have been
     * called first; otherwise this method fails with a
     * {@code NullPointerException} while recording.
     *
     * @param w            word to record
     * @param distances    map from distance to cluster index
     * @param distribution text written verbatim as the value of
     *                     {@code distribution}
     */
    public void recordWordInfo(Word w, TreeMap<Double, Short> distances, StringBuilder distribution) {
        if (!isRecording) {
            return;
        }
        // Invert the (distance -> cluster) map into an array indexed by cluster.
        double[] distanceArray = new double[l.NUMBER_OF_CLUSTERS];
        for (Map.Entry<Double, Short> e : distances.entrySet()) {
            assert (e.getValue() < distanceArray.length);
            distanceArray[e.getValue()] = e.getKey();
        }

        String filename = String.format("word_%1$s_%2$s_%3$s", name, iterationCounter, w.ID);
        StringBuilder s = new StringBuilder();
        s.append(filename + " =\n{\n");
        s.append(String.format("\tword: \"%1$s\",\n\twordIndex: %2$s,\n", escapeJs(w.name), w.ID));
        s.append(String.format("\tfreq: %1$s,\n", w.frequency));
        s.append(String.format("\trank: %1$s,\n", c.getVocabulary().getRank(w.ID)));
        s.append("\tdistribution: ");
        s.append(distribution);
        s.append(",\n\tdist_to_cluster : [");
        String sep = "";
        for (double d : distanceArray) {
            s.append(sep).append(formatNumber(d));
            sep = ",";
        }
        s.append(" ]\n}");

        writeToFile(new File(wordPath, filename + ".js"), s);
    }

    /**
     * Formats one list element as a space followed by a fixed-point number.
     * A fixed locale is used because the default locale may use a decimal
     * comma, which would silently split one number into two list elements.
     */
    private static String formatNumber(double value) {
        return String.format(Locale.ROOT, " %1$-4f", value);
    }

    /**
     * Returns the text of {@code raw} made safe for use between double quotes
     * in a JavaScript string literal: quotes and backslashes are
     * backslash-escaped, and control characters plus the line/paragraph
     * separators (0x2028, 0x2029) become four-digit hexadecimal unicode escapes.
     */
    private static String escapeJs(Object raw) {
        String text = String.valueOf(raw);
        StringBuilder out = new StringBuilder(text.length() + 8);
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            switch (ch) {
                case '"':
                    out.append("\\\"");
                    break;
                case '\\':
                    out.append("\\\\");
                    break;
                case '\n':
                    out.append("\\n");
                    break;
                case '\r':
                    out.append("\\r");
                    break;
                case '\t':
                    out.append("\\t");
                    break;
                default:
                    if (ch < 0x20 || ch == 0x2028 || ch == 0x2029) {
                        out.append(String.format(Locale.ROOT, "\\u%04x", (int) ch));
                    } else {
                        out.append(ch);
                    }
            }
        }
        return out.toString();
    }

    /**
     * Writes {@code content} to {@code target}, replacing any existing file and
     * creating missing parent directories. The writer is closed even when the
     * write fails. Failures are printed and swallowed so that recording stays
     * best-effort. The platform default charset is used, as with any
     * {@link FileWriter}.
     */
    private static void writeToFile(File target, CharSequence content) {
        File parent = target.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        try {
            BufferedWriter out = new BufferedWriter(new FileWriter(target));
            try {
                out.append(content);
            } finally {
                out.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}