// Source code of de.jungblut.reader.TwentyNewsgroupReader

package de.jungblut.reader;


import gnu.trove.list.array.TDoubleArrayList;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;


import de.jungblut.math.dense.DenseDoubleVector;
import de.jungblut.math.tuple.Tuple3;


/**
 * Reads the "20news-bydate" dataset into a vector space model as well as
 * predictions based on the category.
 * 
 * @author thomas.jungblut
 * 
 */
public final class TwentyNewsgroupReader {


  private TwentyNewsgroupReader() {
    throw new IllegalAccessError();
  }


  /**
   * Needs the "20news-bydate" directory that has test and train subdirectories
   * given.
   * 
   * @return in tuple3 order: document as string, prediction, name mapping for
   *         prediction
   */
  public static Tuple3<List<String>, DenseDoubleVector, String[]> readTwentyNewsgroups(
      File directory) {
    String[] classList = directory.list();
    Arrays.sort(classList);
    List<String> docList = new ArrayList<>();
    TDoubleArrayList prediction = new TDoubleArrayList();
    String[] nameMapping = new String[classList.length];
    int classIndex = 0;
    for (String classDirString : classList) {
      File classDir = new File(directory, classDirString);
      String[] fileList = classDir.list();
      for (String fileDoc : fileList) {
        try (BufferedReader br = new BufferedReader(new FileReader(new File(
            classDir, fileDoc)))) {
          StringBuilder document = new StringBuilder();
          String l = null;
          while ((l = br.readLine()) != null) {
            document.append(l);
          }
          docList.add(document.toString());
          prediction.add(classIndex);
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
      nameMapping[classIndex++] = classDirString;
    }


    return new Tuple3<>(docList, new DenseDoubleVector(prediction.toArray()),
        nameMapping);
  }
}
// Source code of de.jungblut.reader.TwentyNewsgroupReader

// Related classes of de.jungblut.reader.TwentyNewsgroupReader