root/trunk/tango/scrapple/text/convert/parseTo.d

Revision 55, 19.0 kB (checked in by flithm, 6 months ago)

Update parseTo and parseFrom

Line 
1 /**************************************************************************************************
2  * copyright: Copyright (c) 2007-2008 Diggory Hardy.
3  *
4  * author: Diggory Hardy, diggory.hardy@gmail.com
5  *
6  * license: BSD style: $(LICENSE)
7  *
8  * This contains templates for converting a char[] to various data-types.
9  *
10  * parseTo is roughly the inverse of $(B parseFrom) and should read any data output by $(B parseFrom).
11  *
12  * This module basically implements the following templated function for most basic D types:
13  * bool, byte, short, int, long, ubyte, ushort, uint, ulong, float, double, real, char.
14  * It also supports arrays and associative arrays of any supported type (including of other arrays)
15  * and has special handling for strings (char[]) and binary (ubyte[]) data-types.
16  * -----------------------------
17  * T parseTo(T) (char[] source);
18  * -----------------------------
19  *
20  * $(I source) is the string to parse, and data of the templated type that is read from the string
21  * is returned. See the examples to get a better idea of its use.
22  *
23  * Syntax:
24  * The syntax for parsing $(I source) is mostly the same used by D without any prefixes/suffixes
25  * (except 0x, 0b & 0o base specifiers). Also a special ubyte[] syntax is supported; see examples.
26  * The following escape sequences are supported for strings and characters: \' \" \\
27  * \a \b \f \n \r \t \v . Associative array literals use the same syntax as D, described here:
28  * $(LINK http://www.digitalmars.com/d/2.0/expression.html#AssocArrayLiteral). All whitespace is
29  * ignored (except of course within strings).
30  *
31  * There are also some public utility functions with their own documentation.
32  *
33  * Throws:
34  * On errors, a ParseException or a UnicodeException (both extend TextException) is thrown with a
35  * suitable message. No other exceptions should be thrown.
36  *
37  * Remarks:
38  * There is currently no support for reading wchar/dchar strings. There are, however, unicode
39  * conversions for converting UTF-8 to UTF-16/32. Be careful if converting on a char-by-char basis;
40  * such conversions cannot be used for non-ascii characters.
41  *
42  * Examples:
43  * ------------------------------------------------------------------------------------------------
44  * // Basic examples:
45  * ulong        a = parseTo!(ulong) ("20350");
46  * float        d = parseTo!(float) ("  1.2e-9 ");
47  * int[]        b = parseTo!(int[]) ("[0,1,2,3]");
48  *
49  * // String and char[] syntax:
50  * char[]       c = parseTo!(char[]) ("\"A string\"");
51  * char[]       e = parseTo!(char[]) ("['a','n','o','t','h','e','r', ' ' ,'s','t','r','i','n','g']");
52  *
53  * // These can be used interchangeably; here's a more complex example of an associative array:
54  * bool[char[]] f = parseTo!(bool[char[]]) ("[ \"one\":true, ['t','w','o']:false, \"three\":1, \"four\":000 ]");
55  *
56  * // There is also a special notation for ubyte[] types:
57  * // The digits following 0x must be in pairs and each specify one ubyte.
58  * assert ( parseTo!(ubyte[]) (`0x01F2AC`) == parseTo!(ubyte[]) (`[01 ,0xF2, 0xAC]`) );
59  *
60  * // There's no limit to the complexity!
61  * char[char[][][][char]][bool] z = ...; // don't expect me to write this!
62  * ------------------------------------------------------------------------------------------------
63  *************************************************************************************************/
64
65 module tango.scrapple.text.convert.parseTo;
66
67 // tango imports
68 import tango.core.Exception : TextException, UnicodeException;
69 import cInt = tango.text.convert.Integer;
70 import cFloat = tango.text.convert.Float;
71 import Utf = tango.text.convert.Utf;
72 import Util = tango.text.Util;
73
/**
 * Exception thrown by the parseTo templates and helpers when the input text is malformed.
 *
 * Extends TextException, so callers may catch either type.
 */
class ParseException : TextException
{
    /// Creates the exception; msg describes the parse failure and is passed to TextException.
    this (char[] msg) { super (msg); }
}
84
85
86 //BEGIN parseTo templates
87
88 // Associative arrays
89
const char[] AA_ERR = "Invalid associative array: ";

/** Parses an associative array literal of the form $(B [KEY:DATA, KEY:DATA, ...]).
 *
 * The text between the outer brackets is separated into pairs with split(); each pair is then
 * divided at the first ':' that lies outside any quoted string/char and outside any embedded
 * [...] literal, and both halves are parsed recursively with parseTo.
 *
 * Params:
 *     src = Text to parse; surrounding whitespace is ignored.
 *
 * Returns:
 *     The parsed associative array (empty for "[]").
 *
 * Throws:
 *     ParseException if src is not bracketed, a quoted key is unterminated, a pair has no ':'
 *     separator or no data after it, or a key/value fails to parse.
 */
T[S] parseTo(T : T[S], S) (char[] src) {
    src = Util.trim(src);
    if (src.length < 2 || src[0] != '[' || src[$-1] != ']')
        throw new ParseException (AA_ERR ~ "not [ ... ]");  // bad braces.

    T[S] ret;
    foreach (char[] pair; split (src[1..$-1])) {
        uint i = 0;
        uint depth = 0;     // nesting level of embedded [...] literals within the key
        while (i < pair.length) {   // advance to the separating ':'
            char c = pair[i];
            if (c == ':' && depth == 0) break;
            if (c == '[') ++depth;
            else if (c == ']') {
                if (depth) --depth;     // a stray ']' is reported by the key parser
            }
            else if (c == '\'' || c == '"') {   // string or character: skip to closing quote
                ++i;
                while (i < pair.length && pair[i] != c) {
                    if (pair[i] == '\\') {
                        // Need the escaped char plus at least the closing quote after it.
                        if (i+2 >= pair.length) throw new ParseException (AA_ERR ~ "unfinished escape sequence within string/char");
                        ++i;    // escape seq.
                    }
                    ++i;
                }
                if (i == pair.length) {
                    // Ran off the end while still inside the quotes.
                    throw new ParseException (AA_ERR ~ "unterminated string/char within KEY:DATA pair");
                }
            }
            ++i;
        }
        if (i == pair.length) {
            // No ':' found at all.
            throw new ParseException (AA_ERR ~ "encountered [ ... KEY] (missing :DATA)");
        }
        if (Util.trim (pair[i+1..$]).length == 0) {
            // ':' found but nothing follows it.
            throw new ParseException (AA_ERR ~ "encountered [ ... KEY:] (missing DATA)");
        }
        ret[parseTo!(S) (pair[0..i])] = parseTo!(T) (pair[i+1..$]);
    }
    return ret;
}
debug (UnitTest) unittest {
    // One value uses string notation, the other char-array notation.
    char[][char] parsed   = parseTo!(char[][char]) (`['a':"animal", 'b':['b','u','s']]`);
    char[][char] expected = ['a':cast(char[])"animal", 'b':['b','u','s']];

    // FIXME: once the compiler bug is fixed this should simply be
    //     assert (parsed == expected);
    // Rehashing both arrays first makes no difference, so compare piecewise for now.
    assert (parsed.length == expected.length);
    assert (parsed.keys   == expected.keys);
    assert (parsed.values == expected.values);
}
136
137
138 // Arrays
139
/** Parses a generic array literal $(B [x, ..., z]).
 *
 * Throws: ParseException if the trimmed text is not enclosed in square brackets, or if an
 * element fails to parse.
 */
T[] parseTo(T : T[]) (char[] src) {
    src = Util.trim(src);
    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (!bracketed)
        throw new ParseException ("Invalid array: not [x, ..., z]");
    return toArray!(T[]) (src);
}
145
146 // String (array special case)
/** Parses a string, written either as a quoted literal ("...") or as an array of chars
 * (['a', ..., 'c']).
 *
 * Escape sequences inside a quoted literal are replaced via replaceEscapedChar.
 *
 * Throws: ParseException if the text is neither form, if the literal ends in a lone
 * backslash, or if an escape sequence is not supported.
 */
T parseTo(T : char[]) (char[] src) {
    src = Util.trim(src);
    bool longEnough = src.length >= 2;

    // Char-array notation is handled by the generic array reader.
    if (longEnough && src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);

    if (!(longEnough && src[0] == '"' && src[$-1] == '"'))
        throw new ParseException ("Invalid string: not quoted (\"*\") or char array (['a',...,'c'])");

    char[] text = src[1..$-1];      // contents between the quotes
    T result;
    result.length = text.length;    // upper bound; escapes only ever shrink the output
    uint written = 0;               // next free slot in result
    uint read = 0;                  // next unread char in text

    while (read < text.length) {
        if (text[read] != '\\') {
            // ordinary character: copy straight across
            result[written] = text[read];
            ++written;
            ++read;
        }
        else {
            // escape sequence: skip the backslash and translate the following char
            ++read;
            if (read == text.length) throw new ParseException ("Invalid string: ends \\\" !");  // next char is "
            result[written] = replaceEscapedChar (text[read]);  // throws if it's invalid
            ++written;
            ++read;
        }
    }
    return result[0..written];
}
174 // Unicode conversions for strings:
/// Parses a string as char[] and converts the result to UTF-16.
T parseTo(T : wchar[]) (char[] src) {
    // A UnicodeException from the conversion is deliberately left to propagate.
    char[] utf8 = parseTo!(char[]) (src);
    return Utf.toString16 (utf8);
}
/// Parses a string as char[] and converts the result to UTF-32.
T parseTo(T : dchar[]) (char[] src) {
    // A UnicodeException from the conversion is deliberately left to propagate.
    char[] utf8 = parseTo!(char[]) (src);
    return Utf.toString32 (utf8);
}
183
184 // Binary (array special case)
/** Parses binary data, written either as an ordinary array ([01, 0xF2, ...]) or in the
 * compact form 0xHHHH..., where each pair of hex digits gives one ubyte.
 *
 * Throws: ParseException if the text is neither form, if the compact form has an odd number
 * of digits, or if it contains a non-hex character.
 */
T parseTo(T : ubyte[]) (char[] src) {
    src = Util.trim(src);

    if (src.length >= 2) {
        // Standard array notation.
        if (src[0] == '[' && src[$-1] == ']') return toArray!(T) (src);

        // Compact notation: "0x" followed by pairs of hex digits.
        if (src[0..2] == "0x") {
            char[] digits = src[2..$];

            // Must be in pairs:
            if (digits.length % 2 == 1) throw new ParseException ("Invalid binary: odd number of chars");

            T bytes;
            bytes.length = digits.length / 2;   // exact

            uint pos = 0;   // advanced by readHexChar, two digits per output byte
            for (uint n = 0; n < bytes.length; ++n) {
                ubyte high = readHexChar (digits, pos);
                ubyte low  = readHexChar (digits, pos);
                bytes[n] = cast(ubyte) ((high << 4) | low);
            }
            return bytes;
        }
    }
    throw new ParseException ("Invalid ubyte[]: not an array and doesn't start 0x");
}
208
// Unit test for the array, string, wide-string and ubyte[] parseTo specializations.
209 debug (UnitTest) unittest {
210     assert (parseTo!(double[]) (`[1.0,1.0e-10]`) == [1.0, 1.0e-10]);    // generic array stuff
211     assert (parseTo!(double[]) (`[  ]`) == cast(double[]) []);  // empty array
212    
213     // char[] and char conversions, with commas, escape sequences and multichar UTF8 characters:
214     assert (parseTo!(char[][]) (`[ ".\"", [',','\''] ,"!\b€" ]`) == [ ".\"".dup, [',','\''] ,"!\b€" ]);
215    
216     // wchar[] and dchar[] conversions:
217     // The characters were pretty-much pulled at random from unicode tables.
218     // The last few cause some weird (display only) effects in my editor.
    // NOTE(review): the literals below look mis-encoded and broken across lines in this copy
    // of the file; verify them against the repository version.
219     assert (parseTo!(wchar[]) ("\"Test string: ¶α؟à€
220 àžáˆ€æ€\"") == "Test string: ¶α؟à€
221 àžáˆ€æ€"w);
222     assert (parseTo!(dchar[]) ("\"Test string: ¶α؟à€
223 àžáˆ€æ€\"") == "Test string: ¶α؟à€
224 àžáˆ€æ€"d);
225    
226     assert (parseTo!(ubyte[]) (`0x01F2AC`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);    // ubyte[] special notation
227     assert (parseTo!(ubyte[]) (`[01 ,0xF2, 0xAC]`) == cast(ubyte[]) [0x01, 0xF2, 0xAC]);    // ubyte[] std notation
228 }
229
230
231 // Basic types
232
233 // Char
/** Parses a single-quoted character literal: either one plain char ('c') or one supported
 * escape sequence ('\n').
 *
 * Params:
 *     src = Text to parse; surrounding whitespace is ignored.
 *
 * Returns:
 *     The character, with any escape sequence replaced via replaceEscapedChar.
 *
 * Throws:
 *     ParseException if the text is not quoted, contains an unfinished or unsupported escape
 *     sequence, or holds more than one character. UnicodeException if the contents start with
 *     a non-ASCII byte, i.e. a multibyte UTF-8 character that cannot fit in one char.
 */
T parseTo(T : char) (char[] src) {
    src = Util.trim(src);
    if (src.length < 3 || src[0] != '\'' || src[$-1] != '\'')
        throw new ParseException ("Invalid char: not quoted (e.g. 'c')");

    if (src[1] == '\\') {
        // Escaped: exactly one character must follow the backslash.
        if (src.length == 4) return replaceEscapedChar (src[2]);
        if (src.length == 3) throw new ParseException ("Invalid char: unfinished escape sequence");
    }
    else if (src.length == 3) return src[1];    // plain, single char

    // Too long. Bytes >= 0x80 only occur in multibyte UTF-8 sequences, so give the more
    // helpful unicode error in that case. (Testing against 0xC0 would also match every ASCII
    // char from '@' upwards.)
    if (src[1] & 0x80u) throw new UnicodeException ("Invalid char: too long (non-ASCII UTF-8 characters cannot be read as a single character)", 1);
    throw new ParseException ("Invalid char: too long");
}
246 /* Basic unicode conversions for wide-chars.
247 * NOTE: c > 127 signals the start of a multibyte UTF-8 sequence which must be converted for
248 * UTF-16/32. But since we don't know what the next char is we can't do the conversion. */
const char[] WIDE_CHAR_ERROR = "Error: unicode non-ascii character cannot be converted from a single UTF-8 char";

/// Parses a character literal as char and widens it to wchar; only ASCII values are accepted.
T parseTo(T : wchar) (char[] src) {
    char narrow = parseTo!(char) (src);
    // Anything above 127 is part of a multibyte UTF-8 sequence and cannot be widened alone.
    if (narrow > 127u) throw new UnicodeException (WIDE_CHAR_ERROR, 1);
    return cast(wchar) narrow;
}
/// Parses a character literal as char and widens it to dchar; only ASCII values are accepted.
T parseTo(T : dchar) (char[] src) {
    char narrow = parseTo!(char) (src);
    // Anything above 127 is part of a multibyte UTF-8 sequence and cannot be widened alone.
    if (narrow > 127u) throw new UnicodeException (WIDE_CHAR_ERROR, 1);
    return cast(dchar) narrow;
}
debug (UnitTest) unittest {
    // An escaped single quote inside a character literal.
    char quote = parseTo!(char) ("\'\\\'\'");
    assert (quote == '\'');

    // Plain ASCII characters widen to wchar and dchar unchanged.
    wchar wide = parseTo!(wchar) ("'X'");
    dchar full = parseTo!(dchar) ("'X'");
    assert (wide == 'X');
    assert (full == 'X');
}
265
266 // Bool
/** Parses a bool: the words "true"/"false", or an integer equal to 0 or 1 (leading zeros
 * allowed).
 *
 * Throws: ParseException for anything else, including empty input.
 */
T parseTo(T : bool) (char[] src) {
    src = Util.trim(src);
    if (src == "false") return false;
    if (src == "true") return true;

    // Numeric form: count the leading zeros, then look at what is left.
    uint zeros = 0;
    while (zeros < src.length && src[zeros] == '0') ++zeros;
    char[] rest = src[zeros..$];

    if (rest.length == 0 && zeros > 0) return false;    // nothing but zeros
    if (rest == "1") return true;                       // optional zeros, then a single 1
    throw new ParseException ("Invalid bool: not true or false and doesn't evaluate to 0 or 1");
}
debug (UnitTest) unittest {
    // Word and numeric forms (with leading zeros) mixed in one array.
    bool[] flags = parseTo!(bool[]) (`[true,false,01,00]`);
    assert (flags == cast(bool[]) [1,0,1,0]);
}
280
281 // Ints
// One specialization per integral type; each forwards to the shared toTInt reader,
// instantiated with the matched type T.

// Signed:
T parseTo(T : byte)   (char[] src) { return toTInt!(T) (src); }
T parseTo(T : short)  (char[] src) { return toTInt!(T) (src); }
T parseTo(T : int)    (char[] src) { return toTInt!(T) (src); }
T parseTo(T : long)   (char[] src) { return toTInt!(T) (src); }

// Unsigned:
T parseTo(T : ubyte)  (char[] src) { return toTInt!(T) (src); }
T parseTo(T : ushort) (char[] src) { return toTInt!(T) (src); }
T parseTo(T : uint)   (char[] src) { return toTInt!(T) (src); }
T parseTo(T : ulong)  (char[] src) { return toTInt!(T) (src); }
debug (UnitTest) unittest {
    byte negative = parseTo!(byte) ("-5");
    assert (negative == cast(byte) -5);

    // annoyingly, octal syntax differs from D (blame tango):
    uint[] mixedRadix = parseTo!(uint[]) ("[0b0100,0o724,0xFa59c,0xFFFFFFFF,0]");
    assert (mixedRadix == [0b0100u,0724,0xFa59c,0xFFFFFFFF,0]);
}
311
312 // Floats
// One specialization per floating-point type; each forwards to the shared toTFloat reader,
// instantiated with the matched type T.
T parseTo(T : float)  (char[] src) { return toTFloat!(T) (src); }
T parseTo(T : double) (char[] src) { return toTFloat!(T) (src); }
T parseTo(T : real)   (char[] src) { return toTFloat!(T) (src); }
debug (UnitTest) unittest {
    float  zero  = parseTo!(float)  ("0.0");
    double large = parseTo!(double) ("-1e25");
    real   tiny  = parseTo!(real)   ("5.24e-269");

    assert (zero  == 0.0f);
    assert (large == -1e25);
    assert (tiny  == cast(real) 5.24e-269);
}
327 //END parseTo templates
328
329 //BEGIN Utility funcs
330 /** Trims whitespace at ends of string and checks for and removes array brackets: []
331 *
332 * Throws:
333 *   ParseException if brackets aren't the first and last non-whitespace characters.
334 *
335 * Returns:
336 *   String without brackets (and whitespace outside those brackets). Useful for passing to split.
337 */
char[] stripBrackets (char[] src) {
    src = Util.trim(src);
    // Anything shorter than "[]", or not bracketed at both ends, is rejected.
    bool bracketed = src.length >= 2 && src[0] == '[' && src[$-1] == ']';
    if (!bracketed)
        throw new ParseException ("Invalid bracketed string: not [...]");
    return src[1..$-1];
}
343
344 /** Splits a string into substrings separated by '$(B ,)' with support for characters and strings
345  * containing escape sequences and for embedded arrays ($(B [...])).
346  *
347  * Params:
348  *     src A string to separate on commas. Where used for parsing arrays, the brackets enclosing
349  *     the array should be removed before calling this function (stripBrackets can do this).
350  *
351  * Returns:
352  *     An array of substrings within src, excluding commas. Whitespace is not stripped and
353  *     empty strings may get returned.
354  *
355  * Remarks:
356  *     This function is primarily intended as a utility function for use by the templates
357  *     parsing arrays and associative arrays, but it may be useful in other cases too. Hence the
358  *     fact no brackets are stripped from src.
359  */
char[][] split (char[] src) {
    src = Util.trim (src);
    if (src == "") return [];       // empty array: no elements when no data

    uint depth = 0;         // surface depth (embedded arrays)
    char[][] ret;
    // Initial guess at the number of pieces. Must be at least 1: the growth step below
    // doubles the length, and doubling 0 would leave no room for the first piece.
    ret.length = src.length / 3 + 1;
    uint k = 0;             // current split piece
    uint i = 0, j = 0;      // current read location, start of current piece

    while (i < src.length) {
        char c = src[i];
        if (c == '\'' || c == '"') {    // string or character: skip to the matching quote
            ++i;
            while (i < src.length && src[i] != c) {
                if (src[i] == '\\') ++i;    // escape seq.: also skip the escaped char
                ++i;
            }   // Doesn't throw if no terminal quote at end of src; see clamp below.
        }
        else if (c == '[') ++depth;
        else if (c == ']') {
            if (depth) --depth;
            else throw new ParseException ("Invalid array literal: closes before end of data item.");
        }
        else if (c == ',' && depth == 0) {      // only if not an embedded array
            if (ret.length <= k) ret.length = ret.length * 2;
            ret[k++] = src[j..i];   // add this piece and increment k
            j = i + 1;
        }
        ++i;
    }
    // An unterminated quote (or a trailing backslash inside one) leaves i beyond the end of
    // src. Clamp it so the final slice is valid; the malformed piece is then rejected by
    // whatever parses it.
    if (i > src.length) i = src.length;

    if (ret.length <= k) ret.length = k + 1;
    ret[k] = src[j..i];     // add final piece (j <= i <= src.length)
    return ret[0..k+1];
}
395
396 /* Templated read-int function to read (un)signed 1-8 byte integers.
397  *
398  * Actually a reimplementation of tango.text.convert.Integer toLong and parse functions.
399  */
/* Reads an integer of type TInt from src.
 *
 * Leading/trailing whitespace is trimmed, then tango's Integer.trim reads the optional sign
 * and radix prefix and Integer.convert reads the digits.
 *
 * Returns: the parsed value; the whole range TInt.min .. TInt.max is accepted.
 *
 * Throws: ParseException if there are no digits, if any character is left over after the
 * digits, if the value does not fit in TInt, or if a '-' sign is given for an unsigned type.
 */
private TInt toTInt(TInt) (char[] src) {
    const char[] INT_OUT_OF_RANGE = "Integer out of range";
    bool sign;
    uint radix, ate, ate2;

    // Trim off whitespace.
    // NOTE: Cannot use tango.text.convert.Integer.trim to trim leading whitespace since it doesn't
    // treat new-lines, etc. as whitespace which for our purposes is whitespace.
    src = Util.trim (src);

    ate = cInt.trim (src, sign, radix);
    if (ate == src.length) throw new ParseException ("Invalid integer: no digits");
    ulong val = cInt.convert (src[ate..$], radix, &ate2);   // magnitude only; sign handled below
    ate += ate2;

    if (ate < src.length)
        throw new ParseException ("Invalid integer at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");

    if (sign) {
        static if (TInt.min < 0) {
            // Signed type: the largest allowed magnitude is |TInt.min| == TInt.max + 1.
            const ulong MAX_MAGNITUDE = cast(ulong) TInt.max + 1;
            if (val > MAX_MAGNITUDE) throw new ParseException (INT_OUT_OF_RANGE);
            // For long.min the negation wraps back to long.min, which is the wanted value.
            return cast(TInt) (-cast(long) val);
        }
        else {
            // Unsigned types never accept a negative sign.
            throw new ParseException (INT_OUT_OF_RANGE);
        }
    }

    if (val > TInt.max) throw new ParseException (INT_OUT_OF_RANGE);
    return cast(TInt) val;
}
426
427 /* Basically a reimplementation of tango.text.convert.Float.toFloat which checks for
428  * whitespace before throwing an exception for overlong input. */
/* Reads a floating-point number of type TFloat from src.
 *
 * Whitespace at both ends is trimmed first, so anything tango's Float.parse leaves unread is
 * an error.
 *
 * Throws: ParseException if src is empty, or if it is not entirely consumed by the number
 * (including when it does not start with a number at all).
 */
private TFloat toTFloat(TFloat) (char[] src) {
    // NOTE: As for toTInt(), this needs to strip leading as well as trailing whitespace.
    src = Util.trim (src);
    if (src == "") throw new ParseException ("Invalid float: no digits");
    uint ate;   // number of chars consumed by the parser

    TFloat x = cFloat.parse (src, &ate);

    // Reject unparsed trailing input; ate == 0 (no number at all) is covered as well since
    // src is non-empty here.
    if (ate < src.length)
        throw new ParseException ("Invalid float at marked character: \"" ~ src[0..ate] ~ "'" ~ src[ate] ~ "'" ~ src[ate+1..$] ~ "\"");
    return x;
}
438
439 /* Throws an exception on invalid escape sequences. Supported escape sequences are the following
440  * subset of those supported by D: \" \' \\ \a \b \f \n \r \t \v
441  */
private char replaceEscapedChar (char c)
{
    // c is the character that followed the backslash; map it to the character it stands for.
    switch (c) {
        case '\"': return '\"';
        case '\'': return '\'';
        case '\\': return '\\';
        case 'a':  return '\a';
        case 'b':  return '\b';
        case 'f':  return '\f';
        case 'n':  return '\n';
        case 'r':  return '\r';
        case 't':  return '\t';
        case 'v':  return '\v';
        default:   break;
    }

    // Not one of the supported sequences:
    throw new ParseException ("Invalid escape sequence: \\"~c);
}
482
483 // Reads one hex char: [0-9A-Fa-f]. Otherwise throws an exception. Doesn't check src.length.
private ubyte readHexChar (char[] src, inout uint pos) {
    char digit = src[pos];  // caller guarantees pos < src.length
    ubyte value;

    if      (digit >= '0' && digit <= '9') value = digit - '0';
    else if (digit >= 'A' && digit <= 'F') value = digit - 'A' + 10;
    else if (digit >= 'a' && digit <= 'f') value = digit - 'a' + 10;
    else throw new ParseException ("Invalid hex digit.");

    ++pos;  // only advanced when a valid digit was read
    return value;
}
493
494 // Generic array reader
495 // Assumes input is of form "[xxxxx]" (i.e. first and last chars are '[', ']' and length >= 2).
private T[] toArray(T : T[]) (char[] src) {
    T[] items = new T[16];  // start with some room to avoid unnecessary allocations
    uint count = 0;         // number of elements parsed so far

    // Drop the enclosing brackets, separate on top-level commas, and parse each piece.
    char[][] pieces = split (src[1..$-1]);
    for (uint n = 0; n < pieces.length; ++n) {
        if (count == items.length) items.length = items.length * 2;    // grow geometrically
        items[count] = parseTo!(T) (pieces[n]);
        ++count;
    }
    return items[0..count];
}
506
507 debug (UnitTest) {
508     import tango.io.Console;
509    
510     unittest {
511         Cout ("Running unittest: parseTo ...").flush;
512        
513         assert (parseTo!(char[]) ("\"\\a