package winterwell.utils.io;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.NoSuchElementException;
import winterwell.utils.Printer;
import winterwell.utils.StrUtils;
import winterwell.utils.WrappedException;
import winterwell.utils.containers.IOneShot;
import winterwell.utils.reporting.Log;
/**
 * Support for reading comma (etc.) separated values. This aims to be
 * "standards" compliant as per
 * http://en.wikipedia.org/wiki/Comma-separated_values
 *
 * Policy for handling incorrectly sized records is defined by
 * {@link #reportBadRecord(int, String[])}
 *
 * Line-breaks: outside of quotes a record is terminated by a UNIX line-break
 * ({@code LF}) or a Windows line-break ({@code CR LF}). A lone {@code CR}
 * (classic Mac) is NOT treated as a line-break; it is kept as field data.
 * Inside a quoted field every character is kept verbatim.
 *
 * The first record is read eagerly by the constructor, and the reader always
 * stays one record ahead of the caller.
 *
 * @testedby {@link CSVReaderTest}
 * @author Joe Halliwell <joe@winterwell.com>
 *
 */
public class CSVReader implements Iterable<String[]>, Iterator<String[]>,
        Closeable, IOneShot {

    final static char DEFAULT_COMMENT_CHARACTER = '#';
    final static char DEFAULT_QUOTE_CHARACTER = '"';

    /** Lines whose first character is this are skipped. */
    final char comment;

    /** The starting line number of the current record. */
    int currentLineNumber = 0;

    /** Character separating the fields of a record. */
    final char delimiter;

    /** Source of characters; push-back is used for one-character look-ahead. */
    final PushbackReader input;

    /**
     * The starting line number of the next record (i.e. handling comments).
     */
    int nextLineNumber = 0;

    /** The look-ahead record, or null when the input is exhausted. */
    String[] nextRecord;

    // Fields for keeping track of row and line counts
    // NB scanLineNumber >= nextLineNumber >= currentLineNumber
    int nextRowNumber = 0;

    /**
     * -1 for variable width mode. The starting value will get over-written by
     * the constructor!
     */
    int numFields = -1;

    /** Character that opens and closes a quoted field. */
    final char quote;

    /**
     * The starting line number of the next but one record (i.e. handling
     * multi-line records).
     */
    int scanLineNumber = 0;

    /**
     * Create a CSVReader from a file with the default quote and comment
     * characters.
     *
     * @param f
     *            the file to read
     * @param delimiter
     *            the field separator
     */
    public CSVReader(File f, char delimiter) {
        this(f, delimiter, DEFAULT_QUOTE_CHARACTER);
    }

    /**
     * Create a CSVReader from a file, using the default comment character.
     *
     * @param file
     *            the file to read; opened via FileUtils.getReader
     * @param delimiter
     *            the field separator
     * @param quote
     *            the quote character
     */
    public CSVReader(File file, char delimiter, char quote) {
        this(FileUtils.getReader(file), delimiter, quote,
                DEFAULT_COMMENT_CHARACTER);
    }

    /**
     * Full constructor for CSVReaders. Reads the first record immediately and
     * uses its length as the expected number of fields (see
     * {@link #setNumFields(int)} to change that).
     *
     * @param input
     *            the character source
     * @param delimiter
     *            the field separator
     * @param quote
     *            the quote character
     * @param comment
     *            lines beginning with this character will be ignored!
     * @throws WrappedException
     *             if reading the first record fails with an IOException
     */
    public CSVReader(Reader input, char delimiter, char quote, char comment) {
        this.delimiter = delimiter;
        this.quote = quote;
        this.comment = comment;
        this.input = new PushbackReader(input);
        try {
            nextRecord = getNextRecord();
            if (nextRecord != null) {
                numFields = nextRecord.length;
            }
        } catch (IOException e) {
            // The caller never receives this object, so nobody else can
            // release the underlying reader.
            FileUtils.close(this.input);
            throw new WrappedException(e);
        }
    }

    /**
     * @see java.io.Closeable#close()
     */
    @Override
    public void close() {
        FileUtils.close(input);
    }

    /**
     * The number of the line on which the last row returned began.
     * Zero-indexed.
     *
     * @return the starting line of the row most recently returned by
     *         {@link #next()}; 0 if no row has been returned yet
     */
    public int getLineNumber() {
        return currentLineNumber;
    }

    /**
     * Pull in the next record of the correct length. Records of the wrong
     * length are passed to {@link #reportBadRecord(int, String[])} and
     * skipped.
     *
     * @return the next acceptable record, or null at end of input
     * @throws IOException
     */
    private String[] getNextGoodRecord() throws IOException {
        while (true) {
            String[] record = getNextRecord();
            if (record == null) {
                return null;
            }
            if (isGoodRecord(record)) {
                return record;
            }
            reportBadRecord(nextLineNumber, record);
            // Step over the rejected record, otherwise the line number
            // reported for every later record would lag behind.
            nextLineNumber = scanLineNumber;
        }
    }

    /**
     * Pull in the next record line (possibly ignoring several comments).
     * Updates the line counters as it goes.
     *
     * @return the fields of the next record, or null at end of input
     * @throws IOException
     */
    private String[] getNextRecord() throws IOException {
        int c = skipCommentLines(input.read());
        // Nothing left
        if (c == -1) {
            return null;
        }
        ArrayList<String> row = new ArrayList<String>();
        StringBuilder currentField = new StringBuilder();
        boolean inQuote = false;
        while (c != -1) {
            if (inQuote) {
                if (c == quote) {
                    // Effectively peek the next char (via PushbackReader)
                    int d = input.read();
                    if (d == quote) {
                        // A doubled quote is an escaped literal quote
                        currentField.append(quote);
                    } else {
                        inQuote = false;
                        if (d != -1) {
                            input.unread(d);
                        }
                    }
                } else {
                    if (c == '\n') {
                        // A multi-line field: the record spans another line
                        scanLineNumber++;
                    }
                    currentField.append((char) c);
                }
            } else if (c == delimiter) {
                row.add(currentField.toString());
                currentField = new StringBuilder();
            } else if (c == '\n') {
                break;
            } else if (c == quote) {
                inQuote = true;
            } else if (c == '\r' && consumeLineFeed()) {
                // Windows line-break: CR LF ends the record
                break;
            } else {
                currentField.append((char) c);
            }
            c = input.read();
        }
        // The last field is closed by the line-break or by end of input
        row.add(currentField.toString());
        scanLineNumber++;
        return row.toArray(new String[0]);
    }

    /**
     * Skip any run of comment lines at the current position, bumping both
     * line counters once per skipped line.
     *
     * @param c
     *            the character already read from the start of the line, or
     *            -1 for end of input
     * @return the first character of the first non-comment line, or -1 if
     *         the input ended
     * @throws IOException
     */
    private int skipCommentLines(int c) throws IOException {
        while (c == comment) {
            // Read to line end. NB: the value must stay an int; casting to
            // char would turn the end-of-input marker -1 into 65535.
            while (c != -1 && c != '\n') {
                c = input.read();
            }
            nextLineNumber++;
            scanLineNumber++;
            if (c == -1) {
                return -1;
            }
            c = input.read();
        }
        return c;
    }

    /**
     * Called just after a carriage return was read outside quotes. Consumes
     * the following character if (and only if) it is a line feed.
     *
     * @return true if a line feed followed and was consumed
     * @throws IOException
     */
    private boolean consumeLineFeed() throws IOException {
        int d = input.read();
        if (d == '\n') {
            return true;
        }
        if (d != -1) {
            input.unread(d);
        }
        return false;
    }

    /**
     * The expected number of fields in a record. -1 if in variable width mode.
     *
     * @return the expected record length, or -1
     */
    public int getNumFields() {
        return numFields;
    }

    /**
     * The number of the last row returned, zero indexed. Before the first
     * call to {@link #next()} this is -1. This may be less than the line
     * number due to comments and multi-line items.
     *
     * @see #getLineNumber()
     */
    public int getRowNumber() {
        return nextRowNumber - 1;
    }

    /**
     * @see java.util.Iterator#hasNext()
     */
    @Override
    public boolean hasNext() {
        return nextRecord != null;
    }

    /**
     * @return true if the record has the expected number of fields, or if
     *         the reader is in variable width mode
     */
    private boolean isGoodRecord(String[] record) {
        assert record != null;
        if (numFields == -1) {
            return true;
        }
        return record.length == numFields;
    }

    // Interface implementations

    /**
     * Returns this object: the reader can only be iterated once.
     *
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String[]> iterator() {
        return this;
    }

    /**
     * Return the look-ahead record and read the one after it.
     *
     * If the expected number of fields was changed after the look-ahead
     * record was read, that record may no longer be acceptable; it is then
     * skipped and the following one is tried instead.
     *
     * @throws NoSuchElementException
     *             if there is no (acceptable) record left
     * @throws WrappedException
     *             if reading ahead fails with an IOException
     * @see java.util.Iterator#next()
     */
    @Override
    public String[] next() {
        try {
            while (true) {
                if (nextRecord == null) {
                    throw new NoSuchElementException();
                }
                currentLineNumber = nextLineNumber;
                nextLineNumber = scanLineNumber;
                String[] record = nextRecord;
                nextRecord = getNextGoodRecord();
                nextRowNumber++;
                // If record is good return it...
                if (isGoodRecord(record)) {
                    return record;
                }
                // ...otherwise someone has poked something, so let's try
                // looking again
            }
        } catch (IOException e) {
            throw new WrappedException(e);
        }
    }

    /**
     * Not supported.
     *
     * @see java.util.Iterator#remove()
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Override to change the behaviour. Default is to print a log message.
     *
     * @param lineNumber
     *            the zero-indexed line on which the bad record began
     * @param record
     *            the fields of the bad record
     */
    public void reportBadRecord(int lineNumber, String[] record) {
        Log.report("Bad record beginning at line " + lineNumber + ": "
                + StrUtils.ellipsize(Printer.toString(record), 36));
    }

    /**
     * Set the expected number of fields in a record. -1 to set variable width
     * mode. Changing this value mid-way through an iteration may result in odd
     * behaviour
     *
     * @param numFields
     *            the expected record length, or -1
     * @return this
     */
    public CSVReader setNumFields(int numFields) {
        this.numFields = numFields;
        return this;
    }
}