Package io

Source Code of io.PartOfSpeechReader

package io;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;

import types.Alphabet;
import sequence.SequenceInstance;
import types.SparseVector;

public class PartOfSpeechReader {

  Alphabet xAlphabet;
  Alphabet yAlphabet;

  public PartOfSpeechReader(Alphabet xAlphabet, Alphabet yAlphabet) {
    this.xAlphabet = xAlphabet;
    this.yAlphabet = yAlphabet;
  }

  public ArrayList<SequenceInstance> readFile(String dataLoc)
      throws IOException {
    ArrayList<SequenceInstance> result = new ArrayList<SequenceInstance>();
    BufferedReader reader = new BufferedReader(new FileReader(dataLoc));
    ArrayList<String> words = new ArrayList<String>();
    ArrayList<String> tags = new ArrayList<String>();
    for (String ln = reader.readLine(); ln != null; ln = reader.readLine()) {
      if (ln.length() < 2) {
        SparseVector[] x = new SparseVector[words.size()];
        Object[] y = new Object[tags.size()];
        for (int t = 0; t < x.length; t++) {
          y[t] = tags.get(t);
          String word = "^" + words.get(t) + "$";
          x[t] = new SparseVector();
          x[t].add(xAlphabet.lookupObject(word), 1);
          for (int i = 0; i < word.length() - 3; i++) {
            x[t].add(xAlphabet.lookupObject(word
                .substring(i, i + 3)), 1);
          }
        }
        result.add(new SequenceInstance(xAlphabet, yAlphabet, x, y));
        words = new ArrayList<String>();
        tags = new ArrayList<String>();
        continue;
      }
      String[] wordpos = ln.split("\t");
      words.add(wordpos[0]);
      tags.add(wordpos[1]);
    }
    return result;
  }

}
TOP

Related Classes of io.PartOfSpeechReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.