Package cc.mallet.cluster.tui

Source Code of cc.mallet.cluster.tui.Text2Clusterings

package cc.mallet.cluster.tui;

import gnu.trove.TIntArrayList;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.logging.Logger;

import cc.mallet.cluster.Clustering;
import cc.mallet.cluster.Clusterings;
import cc.mallet.cluster.Record;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;

//In progress
public class Text2Clusterings {

  private static Logger logger =
      MalletLogger.getLogger(Text2Clusterings.class.getName());

  public static void main (String[] args) throws IOException {
    CommandOption
                  .setSummary(Text2Clusterings.class,
                              "A tool to convert a list of text files to a Clusterings.");
    CommandOption.process(Text2Clusterings.class, args);

    if (classDirs.value.length == 0) {
      logger
            .warning("You must include --input DIR1 DIR2 ...' in order to specify a"
                      + "list of directories containing the documents for each class.");
      System.exit(-1);
    }

    Clustering[] clusterings = new Clustering[classDirs.value.length];
    int fi = 0;
    for (int i = 0; i < classDirs.value.length; i++) {
      Alphabet fieldAlph = new Alphabet();
      Alphabet valueAlph = new Alphabet();
      File directory = new File(classDirs.value[i]);
      File[] subdirs = getSubDirs(directory);
      Alphabet clusterAlph = new Alphabet();
      InstanceList instances = new InstanceList(new Noop());
      TIntArrayList labels = new TIntArrayList();
      for (int j = 0; j < subdirs.length; j++) {
        ArrayList<File> records = new FileIterator(subdirs[j]).getFileArray();
        int label = clusterAlph.lookupIndex(subdirs[j].toString());
        for (int k = 0; k < records.size(); k++) {
          if (fi % 100 == 0) System.out.print(fi);
          else if (fi % 10 == 0) System.out.print(".");
          if (fi % 1000 == 0 && fi > 0) System.out.println();
          System.out.flush();
          fi++;


          File record = records.get(k);
          labels.add(label);
          instances.add(new Instance(new Record(fieldAlph, valueAlph, parseFile(record)),
                        new Integer(label), record.toString(),
                        record.toString()));
        }
      }
      clusterings[i] =
          new Clustering(instances, subdirs.length, labels.toNativeArray());
    }

    logger.info("\nread " + fi + " objects in " + clusterings.length + " clusterings.");
    try {
      ObjectOutputStream oos =
          new ObjectOutputStream(new FileOutputStream(outputFile.value));
      oos.writeObject(new Clusterings(clusterings));
      oos.close();
    } catch (Exception e) {
      logger.warning("Exception writing clustering to file " + outputFile.value
                      + " " + e);
      e.printStackTrace();
    }

  }

  public static File[] getSubDirs (File dir) throws IOException {
    ArrayList<File> ret = new ArrayList<File>();
    File[] fs = dir.listFiles();
    for (File f : fs)
      if (f.isDirectory() && !f.getName().matches("^\\.+$"))
        ret.add(f);
    return ret.toArray(new File[] {});
  }

  public static String[][] parseFile (File f) throws IOException {
    BufferedReader r = new BufferedReader(new FileReader(f));
    String line = "";
    ArrayList<String[]> lines = new ArrayList<String[]>();
    while ((line = r.readLine()) != null) {
      line = line.trim();
      String[] words = line.split("\\s+");
      if (words.length > 1)
        lines.add(words);
    }
    String[][] ret = new String[lines.size()][];
    for (int i = 0; i < lines.size(); i++)
      ret[i] = lines.get(i);
    return ret;
  }

  static CommandOption.SpacedStrings classDirs =
      new CommandOption.SpacedStrings(
                                      Text2Clusterings.class,
                                      "input",
                                      "DIR...",
                                      true,
                                      null,
                                      "The directories containing text files to be clustered, one directory per clustering",
                                      null);

  static CommandOption.String outputFile =
      new CommandOption.String(Text2Clusterings.class, "output", "FILENAME",
                                true, "text.clusterings",
                                "The filename to write the Clustering.", null);

}
TOP

Related Classes of cc.mallet.cluster.tui.Text2Clusterings

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.