001package com.hfg.util.io;
002
003
004import com.hfg.exception.DataParsingException;
005import com.hfg.util.StringUtil;
006import com.hfg.util.collection.DataColumn;
007import com.hfg.util.collection.DataTable;
008
009import java.io.BufferedReader;
010import java.io.IOException;
011import java.io.Reader;
012import java.util.ArrayList;
013import java.util.HashMap;
014import java.util.List;
015import java.util.Map;
016
017//------------------------------------------------------------------------------
018/**
019 Base class for CSV (comma-separated value) and TSV (tab-separated value).
020 <div>
021 @author J. Alex Taylor, hairyfatguy.com
022 </div>
023 */
024//------------------------------------------------------------------------------
025// com.hfg Library
026//
027// This library is free software; you can redistribute it and/or
028// modify it under the terms of the GNU Lesser General Public
029// License as published by the Free Software Foundation; either
030// version 2.1 of the License, or (at your option) any later version.
031//
032// This library is distributed in the hope that it will be useful,
033// but WITHOUT ANY WARRANTY; without even the implied warranty of
034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
035// Lesser General Public License for more details.
036//
037// You should have received a copy of the GNU Lesser General Public
038// License along with this library; if not, write to the Free Software
039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
040//
041// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
042// jataylor@hairyfatguy.com
043//------------------------------------------------------------------------------
044public class DelimitedTextParser
045{
046   private char mDelimiter;
047
048   //###########################################################################
049   // CONSTRUCTORS
050   //###########################################################################
051
052   //---------------------------------------------------------------------------
053   public DelimitedTextParser(char inDelimiter)
054   {
055      mDelimiter = inDelimiter;
056   }
057
058   //###########################################################################
059   // PUBLIC METHODS
060   //###########################################################################
061
062   //---------------------------------------------------------------------------
063   public String escapeField(String inField)
064   {
065      String result = inField;
066
067      if (StringUtil.isSet(result)
068            && (result.contains("\"") || result.contains(mDelimiter + "")))
069      {
070         result = "\"" + result.replaceAll("\"", "\"\"") + "\"";
071      }
072
073      return result;
074   }
075
076   //---------------------------------------------------------------------------
077   public DataTable parseToDataTable(Reader inReader)
078         throws IOException
079   {
080      List<String[]> lines = parse(inReader);
081
082      DataTable dataTable = new DataTable();
083
084      Map<Integer, DataColumn> colMap = new HashMap<>(10);
085      boolean headerParsed = false;
086      int rowIndex = 0;
087      for (String[] fields : lines)
088      {
089         if (! headerParsed)
090         {
091            // Skip blank lines
092            if (1 == fields.length
093                && ! StringUtil.isSet(fields[0]))
094            {
095               continue;
096            }
097
098            for (int i = 0; i < fields.length; i++)
099            {
100               String field = fields[i].trim();
101               DataColumn col = new DataColumn(field);
102               colMap.put(i, col);
103            }
104
105            headerParsed = true;
106         }
107         else
108         {
109            rowIndex++;
110
111            if (fields.length > colMap.size())
112            {
113               throw new DataParsingException("Row " + rowIndex + " has more fields (" + fields.length + ") than the number of columns (" + colMap.size() + ")!");
114            }
115
116            for (int i = 0; i < fields.length; i++)
117            {
118               String fieldString = fields[i];
119               Comparable field = null;
120               if (fieldString != null)
121               {
122                  fieldString = fieldString.trim();
123
124                  if (StringUtil.isNumber(fieldString))
125                  {
126                     try
127                     {
128                        if (fieldString.contains("."))
129                        {
130                           field = Double.parseDouble(fieldString);
131                        }
132                        else if (fieldString.length() > 9)
133                        {
134                           field = Long.parseLong(fieldString);
135                        }
136                        else
137                        {
138                           field = Integer.parseInt(fieldString);
139                        }
140                     }
141                     catch (NumberFormatException e)
142                     {
143                        field = fieldString;
144                     }
145                  }
146                  else
147                  {
148                     field = fieldString;
149                  }
150               }
151
152               dataTable.put(rowIndex + "", colMap.get(i), field);
153            }
154         }
155      }
156
157      return dataTable;
158   }
159
160   //---------------------------------------------------------------------------
161   public List<String[]> parse(Reader inReader)
162      throws IOException
163   {
164      List<String[]> parsedLines = new ArrayList<>();
165
166      BufferedReader bufferedReader = null;
167      try
168      {
169         if (inReader instanceof BufferedReader)
170         {
171            bufferedReader = (BufferedReader) inReader;
172         }
173         else
174         {
175            bufferedReader = new BufferedReader(inReader);
176         }
177
178         String line;
179         while ((line = bufferedReader.readLine()) != null)
180         {
181            parsedLines.add(parseLine(line));
182         }
183      }
184      finally
185      {
186         StreamUtil.close(bufferedReader);
187      }
188
189      return parsedLines;
190   }
191
192   //---------------------------------------------------------------------------
193   public String[] parseLine(String inLine)
194         throws IOException
195   {
196      List<String> fields = new ArrayList<>();
197
198      boolean inQuotedValue = false;
199      int quoteCount = 0;
200      char currentQuoteChar = ' ';
201      StringBuilder currentValue = new StringBuilder();
202
203      int index = 0;
204      while (index < inLine.length())
205      {
206         int theChar = inLine.charAt(index);
207
208         if (inQuotedValue)
209         {
210            if (theChar == currentQuoteChar)
211            {
212               quoteCount++;
213
214               if (2 == quoteCount)
215               {
216                  // Skip
217                  quoteCount = 0;
218               }
219               else if ((index == inLine.length() - 1 || inLine.charAt(index + 1) != currentQuoteChar)
220                     && (0 == currentValue.length() || currentValue.charAt(currentValue.length() - 1) != '\\'))
221               {
222                  inQuotedValue = false;
223                  String unescapedValue = StringUtil.replaceAll(currentValue, "\\" + currentQuoteChar, currentQuoteChar + "");
224                  currentValue.setLength(0);
225                  currentValue.append(unescapedValue);
226               }
227               else
228               {
229                  currentValue.append((char) theChar);
230               }
231            }
232            else
233            {
234               currentValue.append((char) theChar);
235               quoteCount = 0;
236            }
237         }
238         else if (theChar == mDelimiter)
239         {
240            fields.add(currentValue.length() > 0 ? currentValue.toString().trim() : null);
241            currentValue.setLength(0);
242         }
243         else if (Character.isWhitespace(theChar)
244               && 0 == currentValue.length())
245         {
246            // Skip whitespace between the comma and the value
247         }
248         else if ((theChar == '\''
249               || theChar == '\"')
250               && 0 == currentValue.length())
251         {  // Start of a quoted value
252            inQuotedValue = true;
253            quoteCount = 0;
254            currentQuoteChar = (char) theChar;
255         }
256         else
257         {
258            currentValue.append((char) theChar);
259         }
260
261         index++;
262      }
263
264      fields.add(currentValue.length() > 0 ? currentValue.toString().trim() : null);
265
266      return fields.toArray(new String[] {});
267   }
268}