package cc.mallet.cluster.tui;
import gnu.trove.TIntArrayList;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.logging.Logger;
import cc.mallet.cluster.Clustering;
import cc.mallet.cluster.Clusterings;
import cc.mallet.cluster.Record;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
//In progress
/**
 * Command-line tool that converts directories of text files into a serialized
 * {@link Clusterings} object.
 *
 * <p>Each directory given with {@code --input} yields one {@link Clustering}.
 * Every immediate sub-directory of such a directory is treated as one cluster,
 * and every file returned by {@link FileIterator} for that sub-directory
 * becomes one {@link Instance} whose data is a {@link Record} built from the
 * parsed file contents. The result is written with Java serialization to the
 * file named by {@code --output}.
 */
public class Text2Clusterings {

  private static final Logger logger =
      MalletLogger.getLogger(Text2Clusterings.class.getName());

  /**
   * Entry point: parses the command-line options, builds one clustering per
   * input directory and serializes all of them to the output file.
   *
   * @param args command-line arguments, processed by {@link CommandOption}
   * @throws IOException if an input directory cannot be listed or a record
   *         file cannot be read
   */
  public static void main (String[] args) throws IOException {
    CommandOption.setSummary(Text2Clusterings.class,
        "A tool to convert a list of text files to a Clusterings.");
    CommandOption.process(Text2Clusterings.class, args);

    // The option is declared with a null default below, so guard against null
    // as well as against an empty list before dereferencing it.
    if (classDirs.value == null || classDirs.value.length == 0) {
      logger.warning("You must include '--input DIR1 DIR2 ...' in order to specify a "
          + "list of directories containing the documents for each class.");
      System.exit(-1);
    }

    Clustering[] clusterings = new Clustering[classDirs.value.length];
    // Running count of records read across all clusterings (drives the
    // progress output and the final log line).
    int fi = 0;
    for (int i = 0; i < classDirs.value.length; i++) {
      // Fresh alphabets per clustering: field/value alphabets are shared by
      // all records of this clustering only.
      Alphabet fieldAlph = new Alphabet();
      Alphabet valueAlph = new Alphabet();
      File directory = new File(classDirs.value[i]);
      File[] subdirs = getSubDirs(directory);
      Alphabet clusterAlph = new Alphabet();
      InstanceList instances = new InstanceList(new Noop());
      TIntArrayList labels = new TIntArrayList();

      for (int j = 0; j < subdirs.length; j++) {
        ArrayList<File> records = new FileIterator(subdirs[j]).getFileArray();
        // The cluster label is the alphabet index of the sub-directory path.
        int label = clusterAlph.lookupIndex(subdirs[j].toString());
        for (int k = 0; k < records.size(); k++) {
          printProgress(fi);
          fi++;
          File record = records.get(k);
          labels.add(label);
          instances.add(new Instance(
              new Record(fieldAlph, valueAlph, parseFile(record)),
              Integer.valueOf(label), record.toString(), record.toString()));
        }
      }
      clusterings[i] =
          new Clustering(instances, subdirs.length, labels.toNativeArray());
    }

    logger.info("\nread " + fi + " objects in " + clusterings.length + " clusterings.");

    FileOutputStream fos = null;
    try {
      fos = new FileOutputStream(outputFile.value);
      ObjectOutputStream oos = new ObjectOutputStream(fos);
      oos.writeObject(new Clusterings(clusterings));
      // Closing here (inside the try) keeps flush/close failures reported
      // through the catch block below.
      oos.close();
    } catch (Exception e) {
      logger.warning("Exception writing clustering to file " + outputFile.value
          + " " + e);
      e.printStackTrace();
    } finally {
      // Release the file handle even when construction or writing failed.
      // On the success path the stream is already closed and this is a no-op.
      if (fos != null) {
        try {
          fos.close();
        } catch (IOException ignored) {
          // Any primary failure has already been logged above.
        }
      }
    }
  }

  /**
   * Prints a progress marker to standard output for the record about to be
   * read: the count itself every 100 records, a dot every 10 records
   * otherwise, and a line break every 1000 records (except at zero).
   */
  private static void printProgress (int count) {
    if (count % 100 == 0) {
      System.out.print(count);
    } else if (count % 10 == 0) {
      System.out.print(".");
    }
    if (count % 1000 == 0 && count > 0) {
      System.out.println();
    }
    System.out.flush();
  }

  /**
   * Returns the immediate sub-directories of {@code dir}, skipping any whose
   * name consists solely of dots.
   *
   * @param dir the directory to list
   * @return the matching sub-directories, possibly an empty array
   * @throws IOException if {@code dir} cannot be listed (for example because
   *         it does not exist or is not a directory)
   */
  public static File[] getSubDirs (File dir) throws IOException {
    File[] fs = dir.listFiles();
    // File.listFiles() returns null rather than throwing when the path is not
    // a directory or an I/O error occurs; report that explicitly.
    if (fs == null) {
      throw new IOException("Cannot list contents of directory " + dir);
    }
    ArrayList<File> ret = new ArrayList<File>();
    for (File f : fs) {
      if (f.isDirectory() && !f.getName().matches("^\\.+$")) {
        ret.add(f);
      }
    }
    return ret.toArray(new File[ret.size()]);
  }

  /**
   * Reads a file line by line and splits each trimmed line on whitespace.
   * Lines that yield fewer than two tokens (including blank lines) are
   * dropped.
   *
   * <p>The file is decoded by {@link FileReader}, i.e. with the platform
   * default character set.
   *
   * @param f the file to parse
   * @return one token array per retained line, in file order
   * @throws IOException if the file cannot be opened or read
   */
  public static String[][] parseFile (File f) throws IOException {
    ArrayList<String[]> lines = new ArrayList<String[]>();
    BufferedReader r = new BufferedReader(new FileReader(f));
    try {
      String line;
      while ((line = r.readLine()) != null) {
        String[] words = line.trim().split("\\s+");
        if (words.length > 1) {
          lines.add(words);
        }
      }
    } finally {
      // Always release the file handle; one reader is opened per record.
      r.close();
    }
    return lines.toArray(new String[lines.size()][]);
  }

  /** The {@code --input} option: the directories to convert, one per clustering. */
  static CommandOption.SpacedStrings classDirs =
      new CommandOption.SpacedStrings(
          Text2Clusterings.class,
          "input",
          "DIR...",
          true,
          null,
          "The directories containing text files to be clustered, one directory per clustering",
          null);

  /** The {@code --output} option: the file the serialized result is written to. */
  static CommandOption.String outputFile =
      new CommandOption.String(Text2Clusterings.class, "output", "FILENAME",
          true, "text.clusterings",
          "The filename to write the Clustering.", null);
}