View Javadoc
1   /*
2    * Copyright (C) 2007-2012 Argeo GmbH
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *         http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.argeo.util;
17  
18  import java.io.BufferedReader;
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.io.InputStreamReader;
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.List;
25  
26  /**
27   * Parses a CSV file interpreting the first line as a header. The
28   * {@link #parse(InputStream)} method and the setters are synchronized so that
29   * the object cannot be modified when parsing.
30   */
31  public abstract class CsvParser {
32  	private char separator = ',';
33  	private char quote = '\"';
34  
35  	private Boolean noHeader = false;
36  	private Boolean strictLineAsLongAsHeader = true;
37  
38  	/**
39  	 * Actually process a parsed line. If
40  	 * {@link #setStrictLineAsLongAsHeader(Boolean)} is true (default) the
41  	 * header and the tokens are guaranteed to have the same size.
42  	 * 
43  	 * @param lineNumber
44  	 *            the current line number, starts at 1 (the header, if header
45  	 *            processing is enabled, the first line otherwise)
46  	 * @param header
47  	 *            the read-only header or null if {@link #setNoHeader(Boolean)}
48  	 *            is true (default is false)
49  	 * @param tokens
50  	 *            the parsed tokens
51  	 */
52  	protected abstract void processLine(Integer lineNumber,
53  			List<String> header, List<String> tokens);
54  
55  	/**
56  	 * Parses the CSV file (stream is closed at the end)
57  	 */
58  	public synchronized void parse(InputStream in) {
59  		parse(in, null);
60  	}
61  
62  	/**
63  	 * Parses the CSV file (stream is closed at the end)
64  	 */
65  	public synchronized void parse(InputStream in, String encoding) {
66  		BufferedReader reader = null;
67  		Integer lineCount = 0;
68  		try {
69  			if (encoding == null)
70  				reader = new BufferedReader(new InputStreamReader(in));
71  			else
72  				reader = new BufferedReader(new InputStreamReader(in, encoding));
73  			List<String> header = null;
74  			if (!noHeader) {
75  				String headerStr = reader.readLine();
76  				if (headerStr == null)// empty file
77  					return;
78  				lineCount++;
79  				header = new ArrayList<String>();
80  				StringBuffer currStr = new StringBuffer("");
81  				Boolean wasInquote = false;
82  				while (parseLine(headerStr, header, currStr, wasInquote)) {
83  					headerStr = reader.readLine();
84  					if (headerStr == null)
85  						break;
86  					wasInquote = true;
87  				}
88  				header = Collections.unmodifiableList(header);
89  			}
90  
91  			String line = null;
92  			lines: while ((line = reader.readLine()) != null) {
93  				line = preProcessLine(line);
94  				if (line == null) {
95  					// skip line
96  					continue lines;
97  				}
98  				lineCount++;
99  				List<String> tokens = new ArrayList<String>();
100 				StringBuffer currStr = new StringBuffer("");
101 				Boolean wasInquote = false;
102 				sublines: while (parseLine(line, tokens, currStr, wasInquote)) {
103 					line = reader.readLine();
104 					if (line == null)
105 						break sublines;
106 					wasInquote = true;
107 				}
108 				if (!noHeader && strictLineAsLongAsHeader) {
109 					int headerSize = header.size();
110 					int tokenSize = tokens.size();
111 					if (tokenSize == 1 && line.trim().equals(""))
112 						continue lines;// empty line
113 					if (headerSize != tokenSize) {
114 						throw new UtilsException("Token size " + tokenSize
115 								+ " is different from header size "
116 								+ headerSize + " at line " + lineCount
117 								+ ", line: " + line + ", header: " + header
118 								+ ", tokens: " + tokens);
119 					}
120 				}
121 				processLine(lineCount, header, tokens);
122 			}
123 		} catch (UtilsException e) {
124 			throw e;
125 		} catch (IOException e) {
126 			throw new UtilsException("Cannot parse CSV file (line: "
127 					+ lineCount + ")", e);
128 		} finally {
129 			StreamUtils.closeQuietly(reader);
130 		}
131 	}
132 
133 	/**
134 	 * Called before each (logical) line is processed, giving a change to modify
135 	 * it (typically for cleaning dirty files). To be overridden, return the
136 	 * line unchanged by default. Skip the line if 'null' is returned.
137 	 */
138 	protected String preProcessLine(String line) {
139 		return line;
140 	}
141 
142 	/**
143 	 * Parses a line character by character for performance purpose
144 	 * 
145 	 * @return whether to continue parsing this line
146 	 */
147 	protected Boolean parseLine(String str, List<String> tokens,
148 			StringBuffer currStr, Boolean wasInquote) {
149 		// List<String> tokens = new ArrayList<String>();
150 
151 		// System.out.println("#LINE: " + str);
152 
153 		if (wasInquote)
154 			currStr.append('\n');
155 
156 		char[] arr = str.toCharArray();
157 		boolean inQuote = wasInquote;
158 		// StringBuffer currStr = new StringBuffer("");
159 		for (int i = 0; i < arr.length; i++) {
160 			char c = arr[i];
161 			if (c == separator) {
162 				if (!inQuote) {
163 					tokens.add(currStr.toString());
164 					// System.out.println("# TOKEN: " + currStr);
165 					currStr.delete(0, currStr.length());
166 				} else {
167 					// we don't remove separator that are in a quoted substring
168 					// System.out
169 					// .println("IN QUOTE, got a separator: [" + c + "]");
170 					currStr.append(c);
171 				}
172 			} else if (c == quote) {
173 				if (inQuote && (i + 1) < arr.length && arr[i + 1] == quote) {
174 					// case of double quote
175 					currStr.append(quote);
176 					i++;
177 				} else {// standard
178 					inQuote = inQuote ? false : true;
179 				}
180 			} else {
181 				currStr.append(c);
182 			}
183 		}
184 
185 		if (!inQuote) {
186 			tokens.add(currStr.toString());
187 			// System.out.println("# TOKEN: " + currStr);
188 		}
189 		// if (inQuote)
190 		// throw new ArgeoException("Missing quote at the end of the line "
191 		// + str + " (parsed: " + tokens + ")");
192 		if (inQuote)
193 			return true;
194 		else
195 			return false;
196 		// return tokens;
197 	}
198 
199 	public char getSeparator() {
200 		return separator;
201 	}
202 
203 	public synchronized void setSeparator(char separator) {
204 		this.separator = separator;
205 	}
206 
207 	public char getQuote() {
208 		return quote;
209 	}
210 
211 	public synchronized void setQuote(char quote) {
212 		this.quote = quote;
213 	}
214 
215 	public Boolean getNoHeader() {
216 		return noHeader;
217 	}
218 
219 	public synchronized void setNoHeader(Boolean noHeader) {
220 		this.noHeader = noHeader;
221 	}
222 
223 	public Boolean getStrictLineAsLongAsHeader() {
224 		return strictLineAsLongAsHeader;
225 	}
226 
227 	public synchronized void setStrictLineAsLongAsHeader(
228 			Boolean strictLineAsLongAsHeader) {
229 		this.strictLineAsLongAsHeader = strictLineAsLongAsHeader;
230 	}
231 
232 }