// CSVParser.java

package edu.hawaii.ics.yucheng;

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;

/**
 * A static class that provides a method to parse records from a CSV file.
 * <p>
 * Records are read one at a time from a {@link Reader} by a small state
 * machine. Fields may be unquoted (trimmed of surrounding whitespace) or
 * enclosed in double-quotes (kept verbatim, with a doubled double-quote
 * standing for one literal double-quote).
 * 
 * @author Cheng Jade
 * @assignment ICS 421 Project
 * @date Feb 29, 2010
 * @bugs None
 */
public final class CSVParser {

    /**
     * A main method to test the implementation. It parses the file named by
     * the single command line argument and prints every record to standard
     * output, one record per line, with each field wrapped in brackets.
     * 
     * @param args
     *            The command line arguments.
     */
    public static void main(final String[] args) {
        assert null != args;

        // Print the usage unless exactly one argument (the path) was given.
        if (args.length != 1) {
            System.out.println("Usage: CSVParser <path>");
            System.out.println("       <path> the path to the csv file");
            System.exit(0);
            return;
        }

        // Declared outside the try block so the finally block can close it.
        Reader reader = null;
        try {
            // Open the file for reading.
            assert null != args[0];
            reader = new FileReader(args[0]);

            // Parse records until the end of the file is reached.
            String[] row;
            while (null != (row = CSVParser.parse(reader))) {

                // Print each record parsed.
                System.out.print("{ ");
                for (final String field : row)
                    System.out.print("[" + field + "] ");
                System.out.println("}");
            }

        } catch (final Exception e) {
            System.err.println(e);
        } finally {
            // Always release the file handle, whether parsing succeeded or
            // failed. A failure to close is reported rather than swallowed.
            if (null != reader) {
                try {
                    reader.close();
                } catch (final IOException e) {
                    System.err.println(e);
                }
            }
        }
    }

    /** Possible states of the CSV parser. */
    private enum State {

        /**
         * Indicates the parser is looking for the first character in a field.
         * If the parser is in this state, it will add a field to the record
         * unless an error occurs.
         */
        START,

        /**
         * Indicates the parser is parsing an unquoted string.
         */
        CONTENT,

        /**
         * Indicates the parser is parsing a quoted string.
         */
        IN_QUOTE,

        /**
         * Indicates the parser just read a double-quote in a quoted string.
         * This may or may not indicate the end of the string. If another
         * double-quote is read immediately, the pair is treated as an escaped
         * double-quote: a single double-quote is added to the field, and the
         * parser returns to the IN_QUOTE state.
         */
        END_QUOTE,

        /**
         * Indicates the parser has finished parsing a quoted string and is
         * looking for a comma, end of line, or end of file.
         */
        NEED_COMMA,

        /**
         * Indicates the parser has found the end of one record.
         */
        TERMINAL
    }

    /**
     * Parses one record of CSV data from the reader. If successful, the
     * method returns a sequence of strings corresponding to the fields in the
     * record. Otherwise, the method returns null to indicate there is no more
     * data available from the reader.
     * <p>
     * Parsing rules, as implemented below:
     * <ul>
     * <li>Carriage-return characters are discarded wherever they occur,
     * including inside quoted fields.</li>
     * <li>A record ends at a line-feed outside of a quoted field, or at the
     * end of the input.</li>
     * <li>Unquoted fields are trimmed of leading and trailing whitespace and
     * may not contain a double-quote.</li>
     * <li>Quoted fields are kept verbatim and may contain commas and
     * line-feeds; two consecutive double-quotes inside a quoted field stand
     * for one double-quote.</li>
     * <li>An empty line yields a record with a single empty field.</li>
     * </ul>
     * The reader is not closed by this method; the caller owns it.
     * 
     * @param reader
     *            The reader that contains the CSV data.
     * 
     * @return A sequence of strings corresponding to the fields in the
     *         record, or null if the end of the input was reached before any
     *         field was found.
     * 
     * @throws ProgramException
     *             Thrown if there are any errors processing the CSV file.
     * @throws NullPointerException
     *             Thrown if the argument is null.
     */
    public static String[] parse(final Reader reader) throws ProgramException {
        if (null == reader)
            throw new NullPointerException("reader");

        State state = State.START;
        final ArrayList<String> fields = new ArrayList<String>();
        final StringBuilder fieldBuilder = new StringBuilder();

        // Loop until the parser enters the terminal state.
        while (state != State.TERMINAL) {

            // Read a character (-1 means end of input), and check for errors.
            final int ch;
            try {
                ch = reader.read();
            } catch (final IOException e) {
                throw new ProgramException(e);
            }

            // Ignore carriage-returns, so "\r\n" is handled like "\n".
            if (ch == '\r')
                continue;

            switch (state) {
            // -------------------------------------------------------------
            case START:
                // End of input before any field was seen: no more records.
                if (ch == -1 && fields.size() == 0)
                    return null;
                if (ch == '\n' || ch == -1) {
                    // The record ends with an empty (possibly only) field.
                    addUnquoted(fields, fieldBuilder);
                    state = State.TERMINAL;
                } else if (ch == ',')
                    // An empty field; remain in START for the next field.
                    addUnquoted(fields, fieldBuilder);
                else if (ch == '"')
                    state = State.IN_QUOTE;
                else if (!Character.isWhitespace(ch)) {
                    fieldBuilder.append((char) ch);
                    state = State.CONTENT;
                }
                // Any other whitespace before a field is skipped.
                break;

            // -------------------------------------------------------------
            case CONTENT:
                if (ch == '"')
                    throw new ProgramException("Unexpected token: '\"'.");
                if (ch == '\n' || ch == -1) {
                    addUnquoted(fields, fieldBuilder);
                    state = State.TERMINAL;
                } else if (ch == ',') {
                    addUnquoted(fields, fieldBuilder);
                    state = State.START;
                } else
                    fieldBuilder.append((char) ch);
                break;

            // -------------------------------------------------------------
            case IN_QUOTE:
                // The input ended before the closing double-quote was found.
                if (ch == -1)
                    throw new ProgramException("Unexpected end of line.");
                if (ch == '"')
                    state = State.END_QUOTE;
                else
                    // Commas and line-feeds are ordinary content here.
                    fieldBuilder.append((char) ch);
                break;

            // -------------------------------------------------------------
            case END_QUOTE:
                if (ch == -1 || ch == '\n') {
                    addQuoted(fields, fieldBuilder);
                    state = State.TERMINAL;
                } else if (ch == '"') {
                    // A doubled double-quote is an escaped double-quote.
                    fieldBuilder.append((char) ch);
                    state = State.IN_QUOTE;
                } else if (ch == ',') {
                    addQuoted(fields, fieldBuilder);
                    state = State.START;
                } else if (Character.isWhitespace(ch)) {
                    // Whitespace after the closing quote ends the field; only
                    // a separator or the end of the record may follow.
                    addQuoted(fields, fieldBuilder);
                    state = State.NEED_COMMA;
                } else {
                    final String message = "Unexpected token: '" + (char) ch + "'.";
                    throw new ProgramException(message);
                }
                break;

            // -------------------------------------------------------------
            case NEED_COMMA:
                if (ch == -1 || ch == '\n')
                    state = State.TERMINAL;
                else if (ch == ',')
                    state = State.START;
                else if (!Character.isWhitespace(ch)) {
                    final String message = "Unexpected token: '" + (char) ch + "'.";
                    throw new ProgramException(message);
                }
                break;
            }
        }

        // Return the list as an array.
        return fields.toArray(new String[fields.size()]);
    }

    /**
     * Adds a field to the array of fields. The field is not trimmed of
     * whitespace, and the string builder is reset to zero length.
     * 
     * @param fields
     *            The array of fields.
     * 
     * @param fieldBuilder
     *            The string builder used to build the field.
     */
    private static void addQuoted(final ArrayList<String> fields, final StringBuilder fieldBuilder) {
        assert null != fields;
        assert null != fieldBuilder;

        fields.add(fieldBuilder.toString());
        fieldBuilder.setLength(0);
    }

    /**
     * Adds a field to the array of fields. The field is trimmed of whitespace,
     * and the string builder is reset to zero length.
     * 
     * @param fields
     *            The array of fields.
     * 
     * @param fieldBuilder
     *            The string builder used to build the field.
     */
    private static void addUnquoted(final ArrayList<String> fields, final StringBuilder fieldBuilder) {
        assert null != fields;
        assert null != fieldBuilder;

        fields.add(fieldBuilder.toString().trim());
        fieldBuilder.setLength(0);
    }
}
// Valid HTML 4.01 Valid CSS