// CSVParser.java

package edu.hawaii.ics.yucheng;

import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;

/**
 * A static class that provides a method to parse records from a CSV file.
 * 
 * @author     Cheng Jade
 * @assignment ICS 421 Assignment 2-2
 * @date       Feb 29, 2010
 * @bugs       None
 */
public final class CSVParser {

    /**
     * A main method to test the implementation. The single command line
     * argument names a CSV file; every record parsed from it is printed to
     * standard output on its own line, with each field wrapped in square
     * brackets. Any exception raised while opening or parsing the file is
     * printed to standard error.
     * 
     * @param args
     *            The command line arguments.
     */
    public static void main(final String[] args) {
        assert null != args;

        // Print the usage.
        if (args.length != 1) {
            System.out.println("Usage: CSVParser <path>");
            System.out.println("       <path> the path to the csv file");
            System.exit(0);
            return;
        }

        // Declared outside the try block so the finally clause can close it.
        Reader reader = null;
        try {
            // Open the file for reading.
            assert null != args[0];
            reader = new FileReader(args[0]);

            // Parse lines until the end of the file is reached.
            String[] row;
            while (null != (row = CSVParser.parse(reader))) {

                // Print each line parsed.
                System.out.print("{ ");
                for (final String field : row) {
                    System.out.print("[" + field + "] ");
                }
                System.out.println("}");
            }

        } catch (Exception e) {
            System.err.println(e);
        } finally {
            // Release the file handle whether parsing succeeded or failed.
            // The reader is still null if the file could not be opened.
            if (null != reader) {
                try {
                    reader.close();
                } catch (final IOException e) {
                    System.err.println(e);
                }
            }
        }
    }

    /** Possible states of the CSV parser. */
    private enum State {

        /**
         * Indicates the parser is looking for the first character in a field.
         * If the parser is in this state, it will add a field to the record
         * unless an error occurs.
         */
        START,

        /**
         * Indicates the parser is parsing an unquoted string.
         */
        CONTENT,

        /**
         * Indicates the parser is parsing a quoted string.
         */
        IN_QUOTE,

        /**
         * Indicates the parser just read a double-quote in a quoted string.
         * This may or may not indicate the end of the string. If another
         * double-quote is read immediately, the pair is treated as an escaped
         * double-quote: one double-quote is added to the field, and the parser
         * returns to the IN_QUOTE state.
         */
        END_QUOTE,

        /**
         * Indicates the parser has finished parsing a quoted string and is
         * looking for a comma, end of line, or end of file.
         */
        NEED_COMMA,

        /**
         * Indicates the parser has found the end of one record.
         */
        TERMINAL
    }

    /**
     * Parses one record (one line) of CSV data from the reader. Characters
     * are consumed up to and including the line feed that ends the record, or
     * up to the end of the stream, so the method can be called repeatedly on
     * the same reader to obtain consecutive records. When the end of the
     * stream is reached before any field has been started, the method returns
     * null to indicate there is no more data available from the reader.
     * <p>
     * Unquoted fields are trimmed of surrounding whitespace; quoted fields
     * are kept verbatim, with two consecutive double-quotes standing for one.
     * Carriage returns are discarded wherever they appear, including inside
     * quoted fields. A blank line yields a record holding a single empty
     * field. Whitespace between a closing double-quote and the following
     * comma or line end is ignored.
     * 
     * @param reader
     *            The reader that contains the CSV data.
     * 
     * @return A sequence of strings corresponding to the fields in the line,
     *         or null if the reader has no more data.
     * 
     * @throws ProgramException
     *             Thrown if there are any errors processing the CSV file:
     *             the reader fails, a double-quote appears inside an unquoted
     *             field, a quoted field is still open at the end of the
     *             stream, or a closing double-quote is followed by anything
     *             other than whitespace, a comma, or the end of the line.
     * @throws NullPointerException
     *             Thrown if the argument is null.
     */
    public static String[] parse(final Reader reader) throws ProgramException {
        if (null == reader)
            throw new NullPointerException("reader");

        State state = State.START;
        final ArrayList<String> fields = new ArrayList<String>();
        final StringBuilder fieldBuilder = new StringBuilder();

        // Loop until the parser enters the terminal state.
        while (state != State.TERMINAL) {

            // Read a character, and check for errors. A value of -1 marks the
            // end of the stream.
            final int ch;
            try {
                ch = reader.read();
            } catch (final IOException e) {
                throw new ProgramException(e);
            }

            // Ignore carriage-returns. This happens before the state switch,
            // so they are dropped in every state, quoted fields included.
            if (ch == '\r')
                continue;

            switch (state) {
                // -------------------------------------------------------------
                case START:
                    // End of stream before any field was started: no record.
                    if (ch == -1 && fields.size() == 0)
                        return null;
                    if (ch == '\n' || ch == -1) {
                        // The record ends here; the pending field is empty.
                        addUnquoted(fields, fieldBuilder);
                        state = State.TERMINAL;
                    } else if (ch == ',')
                        // An empty field; stay in START for the next one.
                        addUnquoted(fields, fieldBuilder);
                    else if (ch == '"')
                        state = State.IN_QUOTE;
                    else if (!Character.isWhitespace(ch)) {
                        fieldBuilder.append((char) ch);
                        state = State.CONTENT;
                    }
                    // Any other whitespace before a field is skipped.
                    break;

                // -------------------------------------------------------------
                case CONTENT:
                    if (ch == '"')
                        throw new ProgramException("Unexpected token: '\"'.");
                    if (ch == '\n' || ch == -1) {
                        addUnquoted(fields, fieldBuilder);
                        state = State.TERMINAL;
                    } else if (ch == ',') {
                        addUnquoted(fields, fieldBuilder);
                        state = State.START;
                    } else
                        fieldBuilder.append((char) ch);
                    break;

                // -------------------------------------------------------------
                case IN_QUOTE:
                    // The stream ended while a quoted field was still open.
                    if (ch == -1)
                        throw new ProgramException("Unexpected end of line.");
                    if (ch == '"')
                        state = State.END_QUOTE;
                    else
                        // Commas and line feeds are literal inside quotes.
                        fieldBuilder.append((char) ch);
                    break;

                // -------------------------------------------------------------
                case END_QUOTE:
                    if (ch == -1 || ch == '\n') {
                        addQuoted(fields, fieldBuilder);
                        state = State.TERMINAL;
                    } else if (ch == '"') {
                        // Two double-quotes in a row: an escaped double-quote.
                        fieldBuilder.append((char) ch);
                        state = State.IN_QUOTE;
                    } else if (ch == ',') {
                        addQuoted(fields, fieldBuilder);
                        state = State.START;
                    } else if (Character.isWhitespace(ch)) {
                        // The field is complete; only a separator may follow.
                        addQuoted(fields, fieldBuilder);
                        state = State.NEED_COMMA;
                    } else {
                        final String message = "Unexpected token: '" + (char) ch + "'.";
                        throw new ProgramException(message);
                    }
                    break;

                // -------------------------------------------------------------
                case NEED_COMMA:
                    // The quoted field was already added in END_QUOTE, so
                    // nothing is added to the record here.
                    if (ch == -1 || ch == '\n')
                        state = State.TERMINAL;
                    else if (ch == ',')
                        state = State.START;
                    else if (!Character.isWhitespace(ch)) {
                        final String message = "Unexpected token: '" + (char) ch + "'.";
                        throw new ProgramException(message);
                    }
                    break;
            }
        }

        // Return the list as an array.
        final String[] array = new String[fields.size()];
        fields.toArray(array);
        return array;
    }

    /**
     * Adds a field to the array of fields. The field is not trimmed of
     * whitespace, and the string builder is reset to zero length.
     * 
     * @param fields
     *            The array of fields.
     * 
     * @param fieldBuilder
     *            The string builder used to build the field.
     */
    private static void addQuoted(
            final ArrayList<String> fields,
            final StringBuilder fieldBuilder) {
        assert null != fields;
        assert null != fieldBuilder;

        fields.add(fieldBuilder.toString());
        fieldBuilder.setLength(0);
    }

    /**
     * Adds a field to the array of fields. The field is trimmed of whitespace,
     * and the string builder is reset to zero length.
     * 
     * @param fields
     *            The array of fields.
     * 
     * @param fieldBuilder
     *            The string builder used to build the field.
     */
    private static void addUnquoted(
            final ArrayList<String> fields,
            final StringBuilder fieldBuilder) {
        assert null != fields;
        assert null != fieldBuilder;

        fields.add(fieldBuilder.toString().trim());
        fieldBuilder.setLength(0);
    }
}
// Valid HTML 4.01 Valid CSS