package com.hfg.util.io;


import com.hfg.exception.DataParsingException;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.DataColumn;
import com.hfg.util.collection.DataTable;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

//------------------------------------------------------------------------------
/**
 Base class for parsing CSV (comma-separated value) and TSV (tab-separated value) text.
 <p>
 Input is processed one line at a time, so a quoted value cannot span multiple lines.
 Both single and double quotes are recognized as quote characters when they are the
 first non-whitespace character of a field.
 </p>
 <div>
 @author J. Alex Taylor, hairyfatguy.com
 </div>
 */
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// jataylor@hairyfatguy.com
//------------------------------------------------------------------------------
public class DelimitedTextParser
{
   private final char mDelimiter;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Creates a parser that splits fields on the specified delimiter character.
    @param inDelimiter the field delimiter (for example ',' for CSV or '\t' for TSV)
    */
   public DelimitedTextParser(char inDelimiter)
   {
      mDelimiter = inDelimiter;
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Prepares a field value for output in a delimited line.
    <p>
    The value is wrapped in double quotes (with embedded double quotes doubled) if it
    contains a double quote, the delimiter, or a line break, or if its first
    non-whitespace character is a single quote (which parseLine() would otherwise
    interpret as the start of a quoted value). All other values, including values
    for which StringUtil.isSet() is false, are returned unchanged.
    </p>
    @param inField the raw field value
    @return the field value, quoted and escaped if necessary
    */
   public String escapeField(String inField)
   {
      if (! StringUtil.isSet(inField))
      {
         return inField;
      }

      boolean needsQuoting = inField.indexOf('"') >= 0
                             || inField.indexOf(mDelimiter) >= 0
                             || inField.indexOf('\n') >= 0
                             || inField.indexOf('\r') >= 0
                             || startsWithSingleQuote(inField);

      if (! needsQuoting)
      {
         return inField;
      }

      // Literal (non-regex) replacement: each embedded quote is doubled.
      return "\"" + inField.replace("\"", "\"\"") + "\"";
   }

   //---------------------------------------------------------------------------
   /**
    Parses delimited text into a DataTable.
    <p>
    The first non-blank line is treated as the header and defines the columns.
    Every subsequent line becomes a row keyed by its 1-based index (as a String).
    Values that StringUtil.isNumber() accepts are stored as Double (if they contain
    a '.'), Long (if longer than 9 characters), or Integer; values that fail numeric
    parsing and all other values are stored as trimmed Strings. Empty fields are
    stored as null. A row may have fewer fields than there are columns.
    </p>
    @param inReader the source of the delimited text. It is closed by parse().
    @return the populated DataTable
    @throws IOException if reading from inReader fails
    @throws DataParsingException if a row has more fields than there are header columns
    */
   public DataTable parseToDataTable(Reader inReader)
         throws IOException
   {
      List<String[]> lines = parse(inReader);

      DataTable dataTable = new DataTable();

      Map<Integer, DataColumn> colMap = new HashMap<>(10);
      boolean headerParsed = false;
      int rowIndex = 0;
      for (String[] fields : lines)
      {
         if (! headerParsed)
         {
            // Skip blank lines that precede the header
            if (1 == fields.length
                && ! StringUtil.isSet(fields[0]))
            {
               continue;
            }

            for (int i = 0; i < fields.length; i++)
            {
               // parseLine() reports an empty field as null (e.g. the header "a,,c" or "a,b,"),
               // so guard against a NullPointerException and use an empty column name instead.
               String headerField = fields[i];
               String columnName = (headerField != null ? headerField.trim() : "");
               colMap.put(i, new DataColumn(columnName));
            }

            headerParsed = true;
            continue;
         }

         rowIndex++;

         if (fields.length > colMap.size())
         {
            throw new DataParsingException("Row " + rowIndex + " has more fields (" + fields.length + ") than the number of columns (" + colMap.size() + ")!");
         }

         String rowKey = rowIndex + "";
         for (int i = 0; i < fields.length; i++)
         {
            dataTable.put(rowKey, colMap.get(i), toTypedValue(fields[i]));
         }
      }

      return dataTable;
   }

   //---------------------------------------------------------------------------
   /**
    Reads all lines from the specified Reader and splits each into fields via parseLine().
    <p>
    The Reader is wrapped in a BufferedReader if it is not one already and is passed to
    StreamUtil.close() when reading finishes, whether or not an exception occurred.
    </p>
    @param inReader the source of the delimited text
    @return one String[] of fields per input line, in input order
    @throws IOException if reading from inReader fails
    */
   public List<String[]> parse(Reader inReader)
         throws IOException
   {
      List<String[]> parsedLines = new ArrayList<>();

      BufferedReader bufferedReader = null;
      try
      {
         bufferedReader = (inReader instanceof BufferedReader ? (BufferedReader) inReader
                                                              : new BufferedReader(inReader));

         String line;
         while ((line = bufferedReader.readLine()) != null)
         {
            parsedLines.add(parseLine(line));
         }
      }
      finally
      {
         StreamUtil.close(bufferedReader);
      }

      return parsedLines;
   }

   //---------------------------------------------------------------------------
   /**
    Splits a single line of delimited text into its fields.
    <p>
    Whitespace preceding a value is skipped and each resulting value is trimmed.
    A value whose first non-whitespace character is a single or double quote is treated
    as quoted: delimiters inside it are kept as data, a doubled quote character yields
    one literal quote character, and a backslash-escaped quote character is unescaped
    when the closing quote is reached. A field with no content is returned as null.
    An unterminated quoted value consumes the remainder of the line.
    </p>
    @param inLine the line to split. Must not be null.
    @return the fields of the line. Always contains at least one element.
    @throws IOException not thrown by this implementation; retained in the signature for compatibility
    */
   public String[] parseLine(String inLine)
         throws IOException
   {
      List<String> fields = new ArrayList<>();

      boolean inQuotedValue = false;
      int quoteCount = 0;
      char currentQuoteChar = ' ';
      StringBuilder currentValue = new StringBuilder();

      for (int index = 0; index < inLine.length(); index++)
      {
         char theChar = inLine.charAt(index);

         if (inQuotedValue)
         {
            if (theChar == currentQuoteChar)
            {
               quoteCount++;

               if (2 == quoteCount)
               {
                  // Second quote of a doubled pair. The first one was already appended, so skip this one.
                  quoteCount = 0;
               }
               else if ((index == inLine.length() - 1 || inLine.charAt(index + 1) != currentQuoteChar)
                        && (0 == currentValue.length() || currentValue.charAt(currentValue.length() - 1) != '\\'))
               {
                  // A lone quote that is not backslash-escaped closes the quoted value.
                  inQuotedValue = false;
                  String unescapedValue = StringUtil.replaceAll(currentValue, "\\" + currentQuoteChar, currentQuoteChar + "");
                  currentValue.setLength(0);
                  currentValue.append(unescapedValue);
               }
               else
               {
                  // First quote of a doubled pair or a backslash-escaped quote: keep it as data.
                  currentValue.append(theChar);
               }
            }
            else
            {
               currentValue.append(theChar);
               quoteCount = 0;
            }
         }
         else if (theChar == mDelimiter)
         {
            fields.add(finishField(currentValue));
            currentValue.setLength(0);
         }
         else if (Character.isWhitespace(theChar)
                  && 0 == currentValue.length())
         {
            // Skip whitespace between the delimiter and the value
         }
         else if ((theChar == '\''
                   || theChar == '\"')
                  && 0 == currentValue.length())
         {  // Start of a quoted value
            inQuotedValue = true;
            quoteCount = 0;
            currentQuoteChar = theChar;
         }
         else
         {
            currentValue.append(theChar);
         }
      }

      fields.add(finishField(currentValue));

      return fields.toArray(new String[0]);
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   // Returns the trimmed content of the buffer or null if the buffer is empty.
   private static String finishField(StringBuilder inBuffer)
   {
      return (inBuffer.length() > 0 ? inBuffer.toString().trim() : null);
   }

   //---------------------------------------------------------------------------
   // Reports whether the first non-whitespace character of the value is a single quote.
   // Uses Character.isWhitespace() to mirror the whitespace skipping done in parseLine().
   private static boolean startsWithSingleQuote(String inValue)
   {
      for (int i = 0; i < inValue.length(); i++)
      {
         char theChar = inValue.charAt(i);
         if (! Character.isWhitespace(theChar))
         {
            return '\'' == theChar;
         }
      }

      return false;
   }

   //---------------------------------------------------------------------------
   // Converts a raw field into a Double, Long, Integer, or trimmed String (null stays null).
   @SuppressWarnings("rawtypes")
   private static Comparable toTypedValue(String inValue)
   {
      if (null == inValue)
      {
         return null;
      }

      String value = inValue.trim();
      if (! StringUtil.isNumber(value))
      {
         return value;
      }

      try
      {
         if (value.contains("."))
         {
            return Double.valueOf(value);
         }

         // Any integer of 9 or fewer characters fits in an int; longer ones are parsed as long.
         if (value.length() > 9)
         {
            return Long.valueOf(value);
         }

         return Integer.valueOf(value);
      }
      catch (NumberFormatException e)
      {
         // StringUtil.isNumber() accepted a form the JDK parsers reject; keep the text as-is.
         return value;
      }
   }
}