Package winterwell.utils.io

Source Code of winterwell.utils.io.CSVReader

package winterwell.utils.io;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.NoSuchElementException;

import winterwell.utils.Printer;
import winterwell.utils.StrUtils;
import winterwell.utils.WrappedException;
import winterwell.utils.containers.IOneShot;
import winterwell.utils.reporting.Log;

/**
* Support for reading comma (etc.) separated values. This aims to be
* "standards" compliant as per
* http://en.wikipedia.org/wiki/Comma-separated_values
*
* Policy for handling incorrectly sized records is defined by
* {@link #reportBadRecord(int, String[])}
*
* TODO: This is all geared up to work with UNIX line-breaks. Fix it to work
* with Windows and Mac too.
*
* @testedby {@link CSVReaderTest}
* @author Joe Halliwell <joe@winterwell.com>
*
*/
public class CSVReader implements Iterable<String[]>, Iterator<String[]>,
    Closeable, IOneShot {

  final static char DEFAULT_COMMENT_CHARACTER = '#';
  final static char DEFAULT_QUOTE_CHARACTER = '"';

  final char comment;
  int currentLineNumber = 0; // The starting line number of the current record
  final char delimiter;

  final PushbackReader input;
  int nextLineNumber = 0; // The starting line number of the next record (i.e.
              // handling comments)

  String[] nextRecord;

  // Fields for keeping track of row and line counts
  // NB scanLineNumber >= nextLineNumber >= currentLineNumber
  int nextRowNumber = 0;
  /**
   * -1 for variable width mode. The starting value will get over-written by
   * the constructor!
   */
  int numFields = -1;
  final char quote;
  int scanLineNumber = 0; // The starting line number of the next but one
              // record (i.e. handling multi-line records)

  /**
   * Create a CSVReader from a file with the default quote and comment
   * characters.
   *
   * @param f
   * @param delimiter
   */
  public CSVReader(File f, char delimiter) {
    this(f, delimiter, DEFAULT_QUOTE_CHARACTER);
  }

  /**
   * Create a CSVReader from a file, using the default comment character.
   *
   * @param file
   * @param delimiter
   * @param quote
   */
  public CSVReader(File file, char delimiter, char quote) {
    this(FileUtils.getReader(file), delimiter, quote,
        DEFAULT_COMMENT_CHARACTER);
  }

  /**
   * Full constructor for CSVReaders.
   *
   * @param input
   * @param delimiter
   * @param quote
   * @param comment
   *            lines beginning with this character will be ignored!
   */
  public CSVReader(Reader input, char delimiter, char quote, char comment) {
    this.delimiter = delimiter;
    this.quote = quote;
    this.comment = comment;
    this.input = new PushbackReader(input);
    try {
      nextRecord = getNextRecord();
      if (nextRecord != null) {
        numFields = nextRecord.length;
      }
    } catch (IOException e) {
      throw new WrappedException(e);
    }
  }

  /**
   * @see java.io.Closeable#close()
   */
  @Override
  public void close() {
    FileUtils.close(input);
  }

  /**
   * The number of the line on which the last row returned began.
   * Zero-indexed.
   *
   * @return
   */
  public int getLineNumber() {
    return currentLineNumber;
  }

  /**
   * Pull in the next record of the correct length.
   *
   * @return
   * @throws IOException
   */
  private String[] getNextGoodRecord() throws IOException {
    String[] record;
    while (true) {
      record = getNextRecord();
      if (record == null)
        return null;
      if (isGoodRecord(record))
        return record;
      reportBadRecord(nextLineNumber, record);
    }
  }

  /**
   * Pull in the next record line (possibly ignoring several comments)
   *
   * @return
   * @throws IOException
   */
  private String[] getNextRecord() throws IOException {
    boolean inQuote = false;
    ArrayList<String> row = new ArrayList<String>();
    StringBuffer currentField = new StringBuffer();

    int c = input.read();

    // Nothing left
    if (c == -1)
      return null;

    // Ignore comment lines (but increment line counter)
    while (c == comment) {
      // read to line end
      while (!(c == -1 || c == '\n')) {
        c = (char) input.read();
      }
      c = (char) input.read();
      nextLineNumber++;
      scanLineNumber++;
    }

    // The logic here is a bit deep, but hopefully clear
    while (c != -1) {
      // Not in quotes
      if (!inQuote) {
        if (c == delimiter) {
          row.add(currentField.toString());
          currentField = new StringBuffer();
        } else if (c == '\n') {
          row.add(currentField.toString());
          break;
        } else if (c == quote) {
          inQuote = true;
        } else {
          currentField.append((char) c);
        }
      }
      // In quotes
      else {
        if (c == quote) {
          // Effectively peek the next char (via PushbackReader)
          int d = input.read();
          if (d == quote) {
            currentField.append(quote);
          } else {
            inQuote = false;
            if (d != -1) {
              input.unread(d);
            }
          }
        } else {
          if (c == '\n') {
            scanLineNumber++;
          }
          currentField.append((char) c);
        }
      }
      c = input.read();
    }
    if (c == -1) {
      row.add(currentField.toString());
    }
    scanLineNumber++;
    return row.toArray(new String[0]);
  }

  /**
   * The expected number of fields in a record. -1 if in variable width mode.
   *
   * @return
   */
  public int getNumFields() {
    return numFields;
  }

  /**
   * The number of the last row returned. zero indexed (zero at the beginning
   * too). This may be less than the line number due to comments and
   * multi-line items
   *
   * @see #getLineNumber()
   */
  public int getRowNumber() {
    return nextRowNumber - 1;
  }

  /**
   * @see java.util.Iterator#hasNext()
   */
  @Override
  public boolean hasNext() {
    return nextRecord != null;
  }

  private boolean isGoodRecord(String[] record) {
    assert record != null;
    if (numFields == -1)
      return true;
    return record.length == numFields;
  }

  // Interface implementations

  /**
   * @see java.lang.Iterable#iterator()
   */
  @Override
  public Iterator<String[]> iterator() {
    return this;
  }

  /**
   * @see java.util.Iterator#next()
   */
  @Override
  public String[] next() {
    if (nextRecord == null)
      throw new NoSuchElementException();
    try {
      currentLineNumber = nextLineNumber;
      nextLineNumber = scanLineNumber;
      String[] record = nextRecord;
      nextRecord = getNextGoodRecord();
      nextRowNumber++;
      // If record is good return it...
      if (isGoodRecord(record))
        return record;
      // ...otherwise someone has poked something, so let's try looking
      // again
      return next();
    } catch (IOException e) {
      throw new WrappedException(e);
    }
  }

  /**
   * @see java.util.Iterator#remove()
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Override to change the behaviour. Default is to print a log message.
   *
   * @param lineNumber
   * @param record
   */
  public void reportBadRecord(int lineNumber, String[] record) {
    Log.report("Bad record beginning at line " + lineNumber + ": "
        + StrUtils.ellipsize(Printer.toString(record), 36));
  }

  /**
   * Set the expected number of fields in a record. -1 to set variable width
   * mode. Changing this value mid-way through an iteration may result in odd
   * behaviour
   *
   * @param numFields
   * @return this
   */
  public CSVReader setNumFields(int numFields) {
    this.numFields = numFields;
    return this;
  }

}
TOP

Related Classes of winterwell.utils.io.CSVReader

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.