root/trunk/csvRange/csvRange.d

Revision 824, 16.2 kB (checked in by dsimcha, 1 year ago)

Fix range violation in csvRange.

Line 
1 /**
2 This module allows the parsing of CSV files and otherwise similar delimited
3 files using alternative delimiters on-the-fly, with O(1) memory usage if the
4 files are processed as they are read from disk.  It attempts to handle issues
5 like newlines and delimiters in quoted fields and escaped quotes correctly.
6
7 There are two recognized ways of escaping a quote in this library:  The
8 backslash method (\") and the double-quote ("") method.  According to Wikipedia
9 the double-quote method is teh standard.  On the other hand, in my observations
10 the backslash method occurs more often in practice.
11
12 References:
13
14 http://en.wikipedia.org/wiki/Comma-separated_values
15
16 Copyright (C) 2011 David Simcha
17
18 License:
19
20 Boost Software License - Version 1.0 - August 17th, 2003
21
22 Permission is hereby granted, free of charge, to any person or organization
23 obtaining a copy of the software and accompanying documentation covered by
24 this license (the "Software") to use, reproduce, display, distribute,
25 execute, and transmit the Software, and to prepare derivative works of the
26 Software, and to permit third-parties to whom the Software is furnished to
27 do so, all subject to the following:
28
29 The copyright notices in the Software and this entire statement, including
30 the above license grant, this restriction and the following disclaimer,
31 must be included in all copies of the Software, in whole or in part, and
32 all derivative works of the Software, unless such copies or derivative
33 works are solely in the form of machine-executable object code generated by
34 a source language processor.
35
36 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
39 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
40 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
41 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
42 DEALINGS IN THE SOFTWARE.
43 */
44 module csvrange;
45
46 import std.stdio, std.array, std.range, std.exception, std.algorithm, std.conv,
47     std.traits, std.string;
48
49 // Gives a range that may have value semantics reference semantics.
50 private final class RefRange(R)
51 {
52     R _range;
53
54     this(R range)
55     {
56         this._range = range;
57     }
58
59     bool empty() @property
60     {
61         return _range.empty;
62     }
63
64     void popFront()
65     {
66         _range.popFront();
67     }
68
69     ElementType!R front() @property
70     {
71         return _range.front;
72     }
73 }
74
75 /**
76 Tests whether a type is a ranges of characters.
77 */
78 template isCharRange(R)
79 {
80     enum bool isCharRange = isInputRange!R && is(ElementType!R : dchar);
81 }
82
83 /**
84 Tests whether a range is a range of ranges of character ranges.
85 */
86 template isStringMatrixLike(R)
87 {
88     enum bool isStringMatrixLike = isInputRange!R
89         && isInputRange!(ElementType!R)
90         && isCharRange!(ElementType!(ElementType!R));
91 }
92
93 /**
94 This class is an input range that iterates over the columns of a single line of
95 a CSV file.   It is meant to be instantiated only by the CsvFile object,
96 and may be recycled when popFront() is called on the CsvFile object.  Also,
97 when popFront() is called on this object, the buffer is recycled.  For
98 usage details see csvFile().
99 */
100 final class CsvLine(R) if(isCharRange!R)
101 {
102     private typeof(appender!(char[])())_front;
103     private R _rest;
104     private dchar _delim;
105     private bool _nextEmpty;
106     private bool _empty;
107
108     private this(R range, dchar delim)
109     {
110         this._rest = range;
111         this._delim = delim;
112         _front = appender!(char[])();
113         prime();
114     }
115
116     private void reset()
117     {
118         _nextEmpty = false;
119         _empty = false;
120         prime();
121     }
122
123     @property char[] front()
124     {
125         return _front.data;
126     }
127
128     @property bool empty()
129     {
130         return _empty;
131     }
132
133     void popFront()
134     {
135         if(_nextEmpty)
136         {
137             _empty = true;
138         }
139         else
140         {
141             prime();
142         }
143     }
144
145     void prime()
146     {
147         bool inQuotes = false;
148         bool backslashEscape = false;
149         bool firstChar = true;
150         _front.shrinkTo(0);
151
152         while(true)
153         {
154             scope(exit) firstChar = false;
155
156             if(_rest.empty)
157             {
158                 _nextEmpty = true;
159                 return;
160             }
161
162             dchar cur = _rest.front;
163             if(cur == '"' && !backslashEscape)
164             {
165                 // To allow quotes in quoted fields, the CSV spec allows them
166                 // to be escaped with a second quote.
167                 if(inQuotes)
168                 {
169                     // Handle double quoting.
170                     _rest.popFront();
171                     if(_rest.empty)
172                     {
173                         _nextEmpty = true;
174                         return;
175                     }
176
177                     if(_rest.front == '"')
178                     {
179                         _front.put('"');
180                         _rest.popFront();
181                         continue;
182                     }
183
184                     inQuotes = false;
185
186                     if(_rest.front == _delim)
187                     {
188                         _rest.popFront();
189                         return;
190                     }
191                     else if(_rest.front == '\r' || _rest.front == '\n')
192                     {
193                         continue;
194                     }
195                     else
196                     {
197                         enforce(0, "Invalid single quote within quoted field.");
198                     }
199                 }
200                 else
201                 {
202                     _rest.popFront();
203                     if(_rest.empty || _rest.front == '\r' || _rest.front == '\n')
204                     {
205                         // Assume the quote was meant literally.
206                         _front.put('"');
207                         if(_rest.empty)
208                         {
209                             _nextEmpty = true;
210                             return;
211                         }
212                         else
213                         {
214                             continue;
215                         }
216                     }
217                     else if(_rest.front == '"')
218                     {
219                         // Assume the first quote was an escape even though
220                         // we're not inside quotes.
221                         _front.put('"');
222                         _rest.popFront();
223                         continue;
224                     }
225                     else if(firstChar)
226                     {
227                         inQuotes = true;
228                         continue;
229                     }
230                     else
231                     {
232                         // Again, assume it's a literal b/c it's not at
233                         // the beginning of a field.
234                         _front.put('"');
235                         _rest.popFront();
236                         continue;
237                     }
238                 }
239             }
240
241             if(backslashEscape)
242             {
243                 if(cur == '"' || cur == '\\')
244                 {
245                     // Then nust turn backslashEscape off and let the normal
246                     // processing handle it.
247                     backslashEscape = false;
248                 }
249                 else if(cur == 'n')
250                 {
251                     // Then the file was output with newlines written as '\n'.
252                     // Substitute a newline.
253                     backslashEscape = false;
254                     cur = '\n';
255                     // Let normal flow take over.
256                 }
257                 else
258                 {
259                     // Assume it's intended to be taken literally.
260                     _front.put('\\');
261                     // Let normal flow take over.
262                 }
263             }
264
265             if(cur == '\\')
266             {
267                 backslashEscape = true;
268                 _rest.popFront();
269                 continue;
270             }
271
272             if(cur == '\n' && !inQuotes)
273             {
274                 _rest.popFront();
275                 _nextEmpty = true;
276                 return;
277             }
278
279             if(cur == '\r' && !inQuotes)
280             {
281                 // Get rid of the \n, too, if there is one, then return.
282                 _rest.popFront();
283                 if(!_rest.empty && _rest.front == '\n')
284                 {
285                     _rest.popFront();
286                 }
287
288                 _nextEmpty = true;
289                 return;
290             }
291
292             if(cur == _delim && !inQuotes)
293             {
294                 _rest.popFront();
295                 return;
296             }
297
298             // Finally the typical case:  Just append the thing.
299             _front.put(cur);
300             _rest.popFront();
301         }
302     }
303 }
304
305 ///
306 struct CsvFile(R)
307 {
308     private RefRange!R _rest;
309     private bool _empty;
310
311     alias typeof(_rest) R2;
312     alias CsvLine!(R2) Line;
313
314     private dchar _delim;
315     Line _front;
316
317     this(R range, dchar delim)
318     {
319         this._rest = new RefRange!R(range);
320         this._delim = delim;
321         _front = new Line(_rest, _delim);
322     }
323
324     bool empty() @property
325     {
326         return _empty;
327     }
328
329     void popFront()
330     {
331         if(_rest.empty)
332         {
333             _empty = true;
334             return;
335         }
336
337         // Have to iterate over _front to figure out where the thing ends.
338         while(!_front.empty)
339         {
340             _front.popFront();
341         }
342         _front.reset;
343     }
344
345     Line front() @property
346     {
347         return _front;
348     }
349 }
350
351 /**
352 Returns a struct that iterates over the rows of a CSV file (or more generally a
353 similar file with any delimiter character).  It is an input range of input
354 ranges.  Each call to front() produces a CsvLine object that can be used to
355 iterate over the columns of the current row.  Note that the CsvLine object is
356 recycled with every call to popFront().
357
358 Examples:
359 ---
360 // charRange must be a range of characters representing a CSV file.
361 auto charRange = getCharRange();
362 auto csvIter = csvFile(charRange, ',');
363
364 // Iterate over the lines of the CSV file, excluding line breaks embedded in
365 // quotes.
366 string[] rowArrays;
367 foreach(row; csvIter)
368 {
369     // Convert the row to an array of strings.
370     rowArrays ~= array(
371         map!"a.idup"(  // Necessary because CsvLine recycles buffers.
372             row
373         )
374     );
375
376     // row is now empty.  The next call to csvIter.popFront() will recycle
377     // the object.
378 }
379 ---
380 */
381 CsvFile!(R) csvFile(R)(R range, dchar delim)
382 {
383     return typeof(return)(range, delim);
384 }
385
386 ///
387 final class CsvStructRange(S, R)
388 {
389 private:
390     S _front;
391     R _csvRange;
392     size_t[] indices;
393     Malformed _malformed;
394
395 public:
396     this(R csvRange, string[] colHeaders, Malformed malformed)
397     {
398         enforce(colHeaders.length <= S.tupleof.length, text(
399             "Can't have ", colHeaders.length, " fields in a ",
400             S.tupleof.length, " field struct."));
401
402         this._malformed = malformed;
403         indices.length = colHeaders.length;
404         size_t[string] colToIndex;
405         foreach(i, h; colHeaders)
406         {
407             colToIndex[h] = size_t.max;
408         }
409
410         size_t colIndex;
411         foreach(col; csvRange.front)
412         {
413             auto ptr = col in colToIndex;
414             if(ptr)
415             {
416                 *ptr = colIndex;
417             }
418
419             colIndex++;
420         }
421
422         indices.length = colHeaders.length;
423         foreach(i, h; colHeaders)
424         {
425             immutable index = colToIndex[h];
426             enforce(index < size_t.max, "Header not found:  " ~ h);
427             indices[i] = index;
428         }
429
430         csvRange.popFront();
431         this._csvRange = csvRange;
432         prime();
433     }
434
435     void prime()
436     {
437         while(true)
438         {
439             try
440             {
441                 primeImpl();
442                 return;
443             }
444             catch(Exception e)
445             {
446                 if(_malformed == Malformed.throwException)
447                 {
448                     throw e;
449                 }
450                 else
451                 {
452                     _csvRange.popFront();
453                     if(empty)
454                     {
455                         return;
456                     }
457                     else
458                     {
459                         continue;
460                     }
461                 }
462             }
463         }
464     }
465
466
467     void primeImpl()
468     {
469         size_t colIndex;
470         _front = S.init;
471
472         foreach(colData; _csvRange.front)
473         {
474             scope(exit) colIndex++;
475
476             foreach(ti, ToType; typeof(_front.tupleof))
477             {
478
479                 if(ti >= indices.length)
480                 {
481                     continue;
482                 }
483
484                 if(indices[ti] == colIndex)
485                 {
486                     static if(isNumeric!(ToType))
487                     {
488                         colData = colData.strip();
489                     }
490                     else static if
491                     (is(ToType == char[]) || is(ToType == const(char)[])
492                      || is(ToType == const(char[])))
493                     {
494                         // colData gets overwritten and to!(char[])(char[])
495                         // will not copy it.
496                         colData = colData.dup;
497                     }
498
499                     _front.tupleof[ti] = to!ToType(colData);
500                 }
501             }
502         }
503     }
504
505     void popFront()
506     {
507         _csvRange.popFront();
508         if(!empty)
509         {
510             prime();
511         }
512     }
513
514     @property S front()
515     {
516         return _front;
517     }
518
519     @property bool empty()
520     {
521         return _csvRange.empty;
522     }
523 }
524
525 /**
526 Determines whether CsvStructRange will ignore malformed lines or throw an
527 exception.
528 */
529 enum Malformed
530 {
531     ///
532     ignore,
533
534     ///
535     throwException
536 }
537
538
539 /**
540 Given a struct type S, a range of characters representing a CSV file, and
541 an array of strings representing relevant column headers, read the CSV
542 file into an array of structs, one for each row.  The order of fields in
543 colHeaders must correspond to the order of the fields in the struct.
544 However, it is acceptable for colHeaders to include only a subset of the
545 columns in the CSV file and to only contain information for the first
546 colHeaders.length elements of the struct.
547
548 If a specified column header is not found, an exception is thrown.
549
550 All type conversions are done automatically, and if a type is to be converted
551 to a numeric type (an integer or floating point number), leading and trailing
552 whitespace are removed from the cell before this is attempted.  The Malformed
553 enum controls whether an exception is thrown when a type conversion fails,
554 or whether the line is simply ignored.  The default is to throw an exception.
555
556 Examples:
557 ---
558 struct Person
559 {
560     uint programmingSkill;
561     float sleepHours;
562     string favoriteLanguage;
563     float reserved;
564 }
565
566 auto csvRange = csvFile(getCharRange(), '\t');
567 auto personRange = csvStructRange!Person(csvRange,
568     ["Programming Skill", "Sleep Hours", "Favorite Language"]
569 );
570
571 foreach(person; personRange)
572 {
573     if(person.sleepHours > 7 && person.programmingSkill > 8)
574     {
575         assert(person.favoriteLanguage == "D");
576     }
577
578     assert(reserved == float.init);  // No header for it, so it's not populated.
579 }
580 ---
581 */
582 CsvStructRange!(S, R) csvStructRange(S, R)
583 (R csvRange, string[] colHeaders, Malformed malformed = Malformed.throwException)
584 if(is(S == struct) && isStringMatrixLike!R)
585 {
586     return new CsvStructRange!(S, R)(csvRange, colHeaders, malformed);
587 }
588
589 struct S
590 {
591     uint a;
592     string c;
593     float b;
594     uint dummy;
595
596     string toString()
597     {
598         return text(a, '\t', b, '\t', c);
599     }
600
601     bool opEquals(const ref typeof(this) rhs) const
602     {
603         return a == rhs.a && b == rhs.b && c == rhs.c;
604     }
605 }
606
607 unittest
608 {
609     auto str1 =
610 "a\tb\tc
611 1\t2.5\t\"foo \t stuff \n \"\" bar \"\"\"
612 2\t5\tbar
613 3\t7.5\tbaz
614 ";
615
616 // Same as str1 but put crazy quoting stuff in middle and use backslash escape
617 // for soem quotes.
618     auto str2 =
619 "a\tc\tb
620 1\t\"foo \t stuff \n \\\" bar \"\"\"\t2.5
621 2\tbar\t5
622 3\tbaz\t7.5
623 ";
624
625     alias typeof(csvFile(str1, '\t')) Foo;
626     auto res1 = array(
627         csvStructRange!S(csvFile(str1, '\t'), ["a", "c", "b"])
628     );
629
630     assert(res1[0] == S(1, "foo \t stuff \n \" bar \"", 2.5));
631     assert(res1[1] == S(2, "bar", 5));
632     assert(res1[2] == S(3, "baz", 7.5));
633
634     auto res2 = array(
635         csvStructRange!S(csvFile(str2, '\t'), ["a", "c", "b"])
636     );
637
638     assert(res1 == res2);
639 }
640
641 void main(){}
Note: See TracBrowser for help on using the browser.