Package io

Source Code of io.NewsgroupsReader

package io;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;

import types.Alphabet;
import types.ClassificationInstance;
import types.SparseVector;

public class NewsgroupsReader {
  Alphabet xAlphabet;
  Alphabet yAlphabet;

  public NewsgroupsReader(Alphabet xAlphabet, Alphabet yAlphabet) {
    this.xAlphabet = xAlphabet;
    this.yAlphabet = yAlphabet;
  }

  public ArrayList<ClassificationInstance> readFile(String dataLoc)
      throws IOException {
    ArrayList<ClassificationInstance> result = new ArrayList<ClassificationInstance>();
    File baseDir = new File(dataLoc);
    for (String label : baseDir.list()) {
      String ydir = baseDir + File.separator + label;
      System.out.println(ydir);
      for (String fname : new File(ydir).list()) {
        HashSet<String> words = new HashSet<String>();
        BufferedReader reader = new BufferedReader(new FileReader(ydir
            + File.separator + fname));
        for (String ln = reader.readLine(); ln != null; ln = reader
            .readLine()) {
          ln = ln.toLowerCase();
          ln = ln.replaceAll("[^a-z']", " ");
          for (String w : ln.split("\\s\\s*")) {
            words.add(w);
          }
        }
        reader.close();
        SparseVector x = new SparseVector();
        for (String w : words)
          x.add(xAlphabet.lookupObject(w), 1);
        result.add(new ClassificationInstance(xAlphabet, yAlphabet, x,
            label));
      }
    }
    return result;
  }

}
TOP

Related Classes of io.NewsgroupsReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.