package edu.hawaii.ics.yucheng;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
/**
* A static class that provides a method to parse records from a CSV file.
*
* @author Cheng Jade
* @assignment ICS 421 Assignment 2-2
* @date Feb 29, 2010
* @bugs None
*/
public final class CSVParser {

    /**
     * A main method to test the implementation. Opens the file named by the
     * single command line argument, parses records until {@link #parse}
     * returns null, and prints every record on its own line with each field
     * wrapped in square brackets. Any exception is printed to standard error.
     *
     * @param args
     *            The command line arguments; exactly one is expected, the
     *            path to the CSV file.
     */
    public static void main(final String[] args) {
        assert null != args;

        // Print the usage when the argument count is wrong.
        if (args.length != 1) {
            System.out.println("Usage: CSVParser <path>");
            System.out.println(" <path> the path to the csv file");
            System.exit(0);
            return;
        }

        try {
            // Open the file for reading.
            assert null != args[0];
            final Reader reader = new FileReader(args[0]);
            try {
                // Parse lines until the end of the file is reached.
                String[] row;
                while (null != (row = CSVParser.parse(reader))) {
                    // Print each line parsed.
                    System.out.print("{ ");
                    for (final String field : row) {
                        System.out.print("[" + field + "] ");
                    }
                    System.out.println("}");
                }
            } finally {
                // Always release the file handle, even when parsing fails.
                // A failure to close is reported by the outer catch block.
                reader.close();
            }
        } catch (Exception e) {
            System.err.println(e);
        }
    }

    /** Possible states of the CSV parser. */
    private enum State {
        /**
         * Indicates the parser is looking for the first character in a field.
         * If the parser is in this state, it will add a field to the record
         * unless an error occurs.
         */
        START,

        /**
         * Indicates the parser is parsing an unquoted string.
         */
        CONTENT,

        /**
         * Indicates the parser is parsing a quoted string.
         */
        IN_QUOTE,

        /**
         * Indicates the parser just read a double-quote in a quoted string.
         * This may or may not indicate the end of the string. If another
         * double-quote is read, then it is treated as an escaped quote: a
         * single double-quote is added to the field, and the parser returns
         * to the IN_QUOTE state.
         */
        END_QUOTE,

        /**
         * Indicates the parser has finished parsing a quoted string and is
         * looking for a comma, end of line, or end of file.
         */
        NEED_COMMA,

        /**
         * Indicates the parser has found the end of one record.
         */
        TERMINAL
    }

    /**
     * Parses one record of CSV data from the reader. If successful, the
     * method returns a sequence of strings corresponding to the fields in the
     * line. Otherwise, the method returns null to indicate there is no more
     * data available from the reader.
     *
     * <p>
     * Unquoted fields are trimmed of surrounding whitespace; quoted fields
     * are kept verbatim and may contain commas, line feeds, and doubled
     * double-quotes (each pair yields one double-quote). Carriage returns are
     * discarded wherever they occur, including inside quoted fields. The
     * reader is not closed by this method.
     *
     * @param reader
     *            The reader that contains the CSV data.
     *
     * @return A sequence of strings corresponding to the fields in the line,
     *         or null if the end of the input is reached before any field of
     *         a new record has been found.
     *
     * @throws ProgramException
     *             Thrown if there are any errors processing the CSV file: an
     *             I/O failure, a double-quote inside an unquoted field, an
     *             unterminated quoted field, or unexpected text after a
     *             closing double-quote.
     * @throws NullPointerException
     *             Thrown if the argument is null.
     */
    public static String[] parse(final Reader reader) throws ProgramException {
        if (null == reader) {
            throw new NullPointerException("reader");
        }

        State state = State.START;
        final ArrayList<String> fields = new ArrayList<String>();
        final StringBuilder fieldBuilder = new StringBuilder();

        // Loop until the parser enters the terminal state.
        while (state != State.TERMINAL) {

            // Read a character, and check for errors. A value of -1 means the
            // end of the input has been reached.
            final int ch;
            try {
                ch = reader.read();
            } catch (final IOException e) {
                throw new ProgramException(e);
            }

            // Ignore carriage-returns, regardless of the current state.
            if (ch == '\r') {
                continue;
            }

            switch (state) {

            // -------------------------------------------------------------
            case START:
                // End of input before any field was seen: no record left.
                if (ch == -1 && fields.size() == 0) {
                    return null;
                }

                if (ch == '\n' || ch == -1) {
                    // The record ends with an empty trailing field.
                    addUnquoted(fields, fieldBuilder);
                    state = State.TERMINAL;
                } else if (ch == ',') {
                    // Empty field; keep looking for the start of the next.
                    addUnquoted(fields, fieldBuilder);
                } else if (ch == '"') {
                    state = State.IN_QUOTE;
                } else if (!Character.isWhitespace(ch)) {
                    fieldBuilder.append((char) ch);
                    state = State.CONTENT;
                }
                // Leading whitespace falls through and is skipped.
                break;

            // -------------------------------------------------------------
            case CONTENT:
                if (ch == '"') {
                    throw new ProgramException("Unexpected token: '\"'.");
                }

                if (ch == '\n' || ch == -1) {
                    addUnquoted(fields, fieldBuilder);
                    state = State.TERMINAL;
                } else if (ch == ',') {
                    addUnquoted(fields, fieldBuilder);
                    state = State.START;
                } else {
                    fieldBuilder.append((char) ch);
                }
                break;

            // -------------------------------------------------------------
            case IN_QUOTE:
                // The input ended before the closing double-quote. The
                // message text is kept as-is for existing callers.
                if (ch == -1) {
                    throw new ProgramException("Unexpected end of line.");
                }

                if (ch == '"') {
                    state = State.END_QUOTE;
                } else {
                    fieldBuilder.append((char) ch);
                }
                break;

            // -------------------------------------------------------------
            case END_QUOTE:
                if (ch == -1 || ch == '\n') {
                    addQuoted(fields, fieldBuilder);
                    state = State.TERMINAL;
                } else if (ch == '"') {
                    // A doubled double-quote is an escaped double-quote.
                    fieldBuilder.append((char) ch);
                    state = State.IN_QUOTE;
                } else if (ch == ',') {
                    addQuoted(fields, fieldBuilder);
                    state = State.START;
                } else if (Character.isWhitespace(ch)) {
                    // The field is complete; only a separator may follow.
                    addQuoted(fields, fieldBuilder);
                    state = State.NEED_COMMA;
                } else {
                    final String message = "Unexpected token: '" + (char) ch + "'.";
                    throw new ProgramException(message);
                }
                break;

            // -------------------------------------------------------------
            case NEED_COMMA:
                if (ch == -1 || ch == '\n') {
                    state = State.TERMINAL;
                } else if (ch == ',') {
                    state = State.START;
                } else if (!Character.isWhitespace(ch)) {
                    final String message = "Unexpected token: '" + (char) ch + "'.";
                    throw new ProgramException(message);
                }
                break;

            // -------------------------------------------------------------
            default:
                // TERMINAL ends the loop before the switch is reached, so
                // arriving here indicates a programming error.
                throw new IllegalStateException("Unexpected parser state: " + state);
            }
        }

        // Return the list as an array.
        return fields.toArray(new String[fields.size()]);
    }

    /**
     * Adds a field to the array of fields. The field is not trimmed of
     * whitespace, and the string builder is reset to zero length.
     *
     * @param fields
     *            The array of fields.
     *
     * @param fieldBuilder
     *            The string builder used to build the field.
     */
    private static void addQuoted(
            final ArrayList<String> fields,
            final StringBuilder fieldBuilder) {
        assert null != fields;
        assert null != fieldBuilder;
        fields.add(fieldBuilder.toString());
        fieldBuilder.setLength(0);
    }

    /**
     * Adds a field to the array of fields. The field is trimmed of whitespace,
     * and the string builder is reset to zero length.
     *
     * @param fields
     *            The array of fields.
     *
     * @param fieldBuilder
     *            The string builder used to build the field.
     */
    private static void addUnquoted(
            final ArrayList<String> fields,
            final StringBuilder fieldBuilder) {
        assert null != fields;
        assert null != fieldBuilder;
        fields.add(fieldBuilder.toString().trim());
        fieldBuilder.setLength(0);
    }
}