Download Reference Manual
The Developer's Library for D
About Wiki Forums Source Search Contact

root/trunk/tango/text/Util.d

Revision 5600, 53.9 kB (checked in by mwarning, 2 years ago)

fixes #1968 :: reduce/fix 64bit compile warnings for 64bit

  • Property svn:mime-type set to text/x-dsrc
  • Property svn:eol-style set to native
Line 
1 /*******************************************************************************
2
3         copyright:      Copyright (c) 2004 Kris Bell. All rights reserved
4
5         license:        BSD style: $(LICENSE)
6
7         version:        Apr 2004: Initial release
8                         Dec 2006: South Seas version
9
10         author:         Kris
11
12
13         Placeholder for a variety of wee functions. These functions are all
14         templated with the intent of being used for arrays of char, wchar,
15         and dchar. However, they operate correctly with other array types
16         also.
17
18         Several of these functions return an index value, representing where
19         some criteria was identified. When said criteria is not matched, the
20         functions return a value representing the array length provided to
21         them. That is, for those scenarios where C functions might typically
22         return -1 these functions return length instead. This operate nicely
23         with D slices:
24         ---
25         auto text = "happy:faces";
26         
27         assert (text[0 .. locate (text, ':')] == "happy");
28         
29         assert (text[0 .. locate (text, '!')] == "happy:faces");
30         ---
31
32         The contains() function is more convenient for trivial lookup
33         cases:
34         ---
35         if (contains ("fubar", '!'))
36             ...
37         ---
38
39         Note that where some functions expect a size_t as an argument, the
40         D template-matching algorithm will fail where an int is provided
41         instead. This is the typically the cause of "template not found"
42         errors. Also note that name overloading is not supported cleanly
43         by IFTI at this time, so is not applied here.
44
45
46         Applying the D "import alias" mechanism to this module is highly
47         recommended, in order to limit namespace pollution:
48         ---
49         import Util = tango.text.Util;
50
51         auto s = Util.trim ("  foo ");
52         ---
53                 
54
55         Function templates:
56         ---
57         trim (source)                               // trim whitespace
58         triml (source)                              // trim whitespace
59         trimr (source)                              // trim whitespace
60         strip (source, match)                       // trim elements
61         stripl (source, match)                      // trim elements
62         stripr (source, match)                      // trim elements
63         chopl (source, match)                       // trim pattern match
64         chopr (source, match)                       // trim pattern match
65         delimit (src, set)                          // split on delims
66         split (source, pattern)                     // split on pattern
67         splitLines (source);                        // split on lines
68         head (source, pattern, tail)                // split to head & tail
69         join (source, postfix, output)              // join text segments
70         prefix (dst, prefix, content...)            // prefix text segments
71         postfix (dst, postfix, content...)          // postfix text segments
72         combine (dst, prefix, postfix, content...)  // combine lotsa stuff
73         repeat (source, count, output)              // repeat source
74         replace (source, match, replacement)        // replace chars
75         substitute (source, match, replacement)     // replace/remove matches
76         count (source, match)                       // count instances
77         contains (source, match)                    // has char?
78         containsPattern (source, match)             // has pattern?
79         index (source, match, start)                // find match index
80         locate (source, match, start)               // find char
81         locatePrior (source, match, start)          // find prior char
82         locatePattern (source, match, start);       // find pattern
83         locatePatternPrior (source, match, start);  // find prior pattern
84         indexOf (s*, match, length)                 // low-level lookup
85         mismatch (s1*, s2*, length)                 // low-level compare
86         matching (s1*, s2*, length)                 // low-level compare
87         isSpace (match)                             // is whitespace?
88         unescape(source, output)                    // convert '\' prefixes
89         layout (destination, format ...)            // featherweight printf
90         lines (str)                                 // foreach lines
91         quotes (str, set)                           // foreach quotes
92         delimiters (str, set)                       // foreach delimiters
93         patterns (str, pattern)                     // foreach patterns
94         ---
95
96         Please note that any 'pattern' referred to within this module
97         refers to a pattern of characters, and not some kind of regex
98         descriptor. Use the Regex module for regex operation.
99
100 *******************************************************************************/
101
102 module tango.text.Util;
103
104 /******************************************************************************
105
106         Trim the provided array by stripping whitespace from both
107         ends. Returns a slice of the original content
108
109 ******************************************************************************/
110
111 T[] trim(T) (T[] source)
112 {
113         T*   head = source.ptr,
114              tail = head + source.length;
115
116         while (head < tail && isSpace(*head))
117                ++head;
118
119         while (tail > head && isSpace(*(tail-1)))
120                --tail;
121
122         return head [0 .. tail - head];
123 }
124
125 /******************************************************************************
126
127         Trim the provided array by stripping whitespace from the left.
128         Returns a slice of the original content
129
130 ******************************************************************************/
131
132 T[] triml(T) (T[] source)
133 {
134         T*   head = source.ptr,
135              tail = head + source.length;
136
137         while (head < tail && isSpace(*head))
138                ++head;
139
140         return head [0 .. tail - head];
141 }
142
143 /******************************************************************************
144
145         Trim the provided array by stripping whitespace from the right.
146         Returns a slice of the original content
147
148 ******************************************************************************/
149
150 T[] trimr(T) (T[] source)
151 {
152         T*   head = source.ptr,
153              tail = head + source.length;
154
155         while (tail > head && isSpace(*(tail-1)))
156                --tail;
157
158         return head [0 .. tail - head];
159 }
160
161 /******************************************************************************
162
163         Trim the given array by stripping the provided match from
164         both ends. Returns a slice of the original content
165
166 ******************************************************************************/
167
168 T[] strip(T) (T[] source, T match)
169 {
170         T*   head = source.ptr,
171              tail = head + source.length;
172
173         while (head < tail && *head is match)
174                ++head;
175
176         while (tail > head && *(tail-1) is match)
177                --tail;
178
179         return head [0 .. tail - head];
180 }
181
182 /******************************************************************************
183
184         Trim the given array by stripping the provided match from
185         the left hand side. Returns a slice of the original content
186
187 ******************************************************************************/
188
189 T[] stripl(T) (T[] source, T match)
190 {
191         T*   head = source.ptr,
192              tail = head + source.length;
193
194         while (head < tail && *head is match)
195                ++head;
196
197         return head [0 .. tail - head];
198 }
199
200 /******************************************************************************
201
202         Trim the given array by stripping the provided match from
203         the right hand side. Returns a slice of the original content
204
205 ******************************************************************************/
206
207 T[] stripr(T) (T[] source, T match)
208 {
209         T*   head = source.ptr,
210              tail = head + source.length;
211
212         while (tail > head && *(tail-1) is match)
213                --tail;
214
215         return head [0 .. tail - head];
216 }
217
218 /******************************************************************************
219
220         Chop the given source by stripping the provided match from
221         the left hand side. Returns a slice of the original content
222
223 ******************************************************************************/
224
225 T[] chopl(T) (T[] source, T[] match)
226 {
227         if (match.length <= source.length)
228             if (source[0 .. match.length] == match)
229                 source = source [match.length .. $];
230
231         return source;
232 }
233
234 /******************************************************************************
235
236         Chop the given source by stripping the provided match from
237         the right hand side. Returns a slice of the original content
238
239 ******************************************************************************/
240
241 T[] chopr(T) (T[] source, T[] match)
242 {
243         if (match.length <= source.length)
244             if (source[$-match.length .. $] == match)
245                 source = source [0 .. $-match.length];
246
247         return source;
248 }
249
250 /******************************************************************************
251
252         Replace all instances of one element with another (in place)
253
254 ******************************************************************************/
255
256 T[] replace(T) (T[] source, T match, T replacement)
257 {
258         foreach (ref c; source)
259                  if (c is match)
260                      c = replacement;
261         return source;
262 }
263
264 /******************************************************************************
265
266         Substitute all instances of match from source. Set replacement
267         to null in order to remove instead of replace
268
269 ******************************************************************************/
270
271 T[] substitute(T) (T[] source, T[] match, T[] replacement)
272 {
273         T[] output;
274
275         foreach (s; patterns (source, match, replacement))
276                     output ~= s;
277         return output;
278 }
279
280 /******************************************************************************
281
282         Count all instances of match within source
283
284 ******************************************************************************/
285
286 size_t count(T) (T[] source, T[] match)
287 {
288         size_t c;
289
290         foreach (s; patterns (source, match))
291                     ++c;
292         assert(c > 0);
293         return c - 1;
294 }
295
296 /******************************************************************************
297
298         Returns whether or not the provided array contains an instance
299         of the given match
300         
301 ******************************************************************************/
302
303 bool contains(T) (T[] source, T match)
304 {
305         return indexOf (source.ptr, match, source.length) != source.length;
306 }
307
308 /******************************************************************************
309
310         Returns whether or not the provided array contains an instance
311         of the given match
312         
313 ******************************************************************************/
314
315 bool containsPattern(T) (T[] source, T[] match)
316 {
317         return locatePattern (source, match) != source.length;
318 }
319
320 /******************************************************************************
321
322         Return the index of the next instance of 'match' starting at
323         position 'start', or source.length where there is no match.
324
325         Parameter 'start' defaults to 0
326
327 ******************************************************************************/
328
329 size_t index(T, U=size_t) (T[] source, T[] match, U start=0)
330 {return index!(T) (source, match, start);}
331
332 size_t index(T) (T[] source, T[] match, size_t start=0)
333 {
334         return (match.length is 1) ? locate (source, match[0], start)
335                                    : locatePattern (source, match, start);
336 }
337
338 /******************************************************************************
339
340         Return the index of the prior instance of 'match' starting
341         just before 'start', or source.length where there is no match.
342
343         Parameter 'start' defaults to source.length
344
345 ******************************************************************************/
346
347 size_t rindex(T, U=size_t) (T[] source, T[] match, U start=U.max)
348 {return rindex!(T)(source, match, start);}
349
350 size_t rindex(T) (T[] source, T[] match, size_t start=size_t.max)
351 {
352         return (match.length is 1) ? locatePrior (source, match[0], start)
353                                    : locatePatternPrior (source, match, start);
354 }
355
356 /******************************************************************************
357
358         Return the index of the next instance of 'match' starting at
359         position 'start', or source.length where there is no match.
360
361         Parameter 'start' defaults to 0
362
363 ******************************************************************************/
364
365 size_t locate(T, U=size_t) (T[] source, T match, U start=0)
366 {return locate!(T) (source, match, start);}
367
368 size_t locate(T) (T[] source, T match, size_t start=0)
369 {
370         if (start > source.length)
371             start = source.length;
372        
373         return indexOf (source.ptr+start, match, source.length - start) + start;
374 }
375
376 /******************************************************************************
377
378         Return the index of the prior instance of 'match' starting
379         just before 'start', or source.length where there is no match.
380
381         Parameter 'start' defaults to source.length
382
383 ******************************************************************************/
384
385 size_t locatePrior(T, U=size_t) (T[] source, T match, U start=U.max)
386 {return locatePrior!(T)(source, match, start);}
387
388 size_t locatePrior(T) (T[] source, T match, size_t start=size_t.max)
389 {
390         if (start > source.length)
391             start = source.length;
392
393         while (start > 0)
394                if (source[--start] is match)
395                    return start;
396         return source.length;
397 }
398
399 /******************************************************************************
400
401         Return the index of the next instance of 'match' starting at
402         position 'start', or source.length where there is no match.
403
404         Parameter 'start' defaults to 0
405
406 ******************************************************************************/
407
408 size_t locatePattern(T, U=size_t) (T[] source, T[] match, U start=0)
409 {return locatePattern!(T) (source, match, start);}
410
411 size_t locatePattern(T) (T[] source, T[] match, size_t start=0)
412 {
413         size_t    idx;
414         T*      p = source.ptr + start;
415         size_t    extent = source.length - start - match.length + 1;
416
417         if (match.length && extent <= source.length)
418             while (extent)
419                    if ((idx = indexOf (p, match[0], extent)) is extent)
420                         break;
421                    else
422                       if (matching (p+=idx, match.ptr, match.length))
423                           return p - source.ptr;
424                       else
425                          {
426                          extent -= (idx+1);
427                          ++p;
428                          }
429
430         return source.length;
431 }
432    
433 /******************************************************************************
434
435         Return the index of the prior instance of 'match' starting
436         just before 'start', or source.length where there is no match.
437
438         Parameter 'start' defaults to source.length
439
440 ******************************************************************************/
441
442 size_t locatePatternPrior(T, U=size_t) (T[] source, T[] match, U start=U.max)
443 {return locatePatternPrior!(T)(source, match, start);}
444
445 size_t locatePatternPrior(T) (T[] source, T[] match, size_t start=size_t.max)
446 {
447         auto len = source.length;
448        
449         if (start > len)
450             start = len;
451
452         if (match.length && match.length <= len)
453             while (start)
454                   {
455                   start = locatePrior (source, match[0], start);
456                   if (start is len)
457                       break;
458                   else
459                      if ((start + match.length) <= len)
460                           if (matching (source.ptr+start, match.ptr, match.length))
461                               return start;
462                   }
463
464         return len;
465 }
466
467 /******************************************************************************
468
469         Split the provided array on the first pattern instance, and
470         return the resultant head and tail. The pattern is excluded
471         from the two segments.
472
473         Where a segment is not found, tail will be null and the return
474         value will be the original array.
475         
476 ******************************************************************************/
477
478 T[] head(T) (T[] src, T[] pattern, out T[] tail)
479 {
480         auto i = locatePattern (src, pattern);
481         if (i != src.length)
482            {
483            tail = src [i + pattern.length .. $];
484            src = src [0 .. i];
485            }
486         return src;
487 }
488
489 /******************************************************************************
490
491         Split the provided array on the last pattern instance, and
492         return the resultant head and tail. The pattern is excluded
493         from the two segments.
494
495         Where a segment is not found, head will be null and the return
496         value will be the original array.
497         
498 ******************************************************************************/
499
500 T[] tail(T) (T[] src, T[] pattern, out T[] head)
501 {
502         auto i = locatePatternPrior (src, pattern);
503         if (i != src.length)
504            {
505            head = src [0 .. i];
506            src = src [i + pattern.length .. $];
507            }
508         return src;
509 }
510
511 /******************************************************************************
512
513         Split the provided array wherever a delimiter-set instance is
514         found, and return the resultant segments. The delimiters are
515         excluded from each of the segments. Note that delimiters are
516         matched as a set of alternates rather than as a pattern.
517
518         Splitting on a single delimiter is considerably faster than
519         splitting upon a set of alternatives.
520
521         Note that the src content is not duplicated by this function,
522         but is sliced instead.
523
524 ******************************************************************************/
525
526 T[][] delimit(T) (T[] src, T[] set)
527 {
528         T[][] result;
529
530         foreach (segment; delimiters (src, set))
531                  result ~= segment;
532         return result;
533 }
534
535 /******************************************************************************
536
537         Split the provided array wherever a pattern instance is
538         found, and return the resultant segments. The pattern is
539         excluded from each of the segments.
540         
541         Note that the src content is not duplicated by this function,
542         but is sliced instead.
543
544 ******************************************************************************/
545
546 T[][] split(T) (T[] src, T[] pattern)
547 {
548         T[][] result;
549
550         foreach (segment; patterns (src, pattern))
551                  result ~= segment;
552         return result;
553 }
554
555 /******************************************************************************
556
557         Convert text into a set of lines, where each line is identified
558         by a \n or \r\n combination. The line terminator is stripped from
559         each resultant array
560
561         Note that the src content is not duplicated by this function, but
562         is sliced instead.
563
564 ******************************************************************************/
565
566 alias toLines splitLines;
567 T[][] toLines(T) (T[] src)
568 {
569
570         T[][] result;
571
572         foreach (line; lines (src))
573                  result ~= line;
574         return result;
575 }
576
577 /******************************************************************************
578
579         Return the indexed line, where each line is identified by a \n
580         or \r\n combination. The line terminator is stripped from the
581         resultant line
582
583         Note that src content is not duplicated by this function, but
584         is sliced instead.
585
586 ******************************************************************************/
587
588 T[] lineOf(T) (T[] src, size_t index)
589 {
590         int i = 0;
591         foreach (line; lines (src))
592                  if (i++ is index)
593                      return line;
594         return null;
595 }
596
597 /******************************************************************************
598
599         Combine a series of text segments together, each appended with
600         a postfix pattern. An optional output buffer can be provided to
601         avoid heap activity - it should be large enough to contain the
602         entire output, otherwise the heap will be used instead.
603
604         Returns a valid slice of the output, containing the concatenated
605         text.
606
607 ******************************************************************************/
608
609 T[] join(T) (T[][] src, T[] postfix=null, T[] dst=null)
610 {
611         return combine!(T) (dst, null, postfix, src);
612 }
613
614 /******************************************************************************
615
616         Combine a series of text segments together, each prepended with
617         a prefix pattern. An optional output buffer can be provided to
618         avoid heap activity - it should be large enough to contain the
619         entire output, otherwise the heap will be used instead.
620
621         Note that, unlike join(), the output buffer is specified first
622         such that a set of trailing strings can be provided.
623
624         Returns a valid slice of the output, containing the concatenated
625         text.
626
627 ******************************************************************************/
628
629 T[] prefix(T) (T[] dst, T[] prefix, T[][] src...)
630 {
631         return combine!(T) (dst, prefix, null, src);
632 }
633
634 /******************************************************************************
635
636         Combine a series of text segments together, each appended with an
637         optional postfix pattern. An optional output buffer can be provided
638         to avoid heap activity - it should be large enough to contain the
639         entire output, otherwise the heap will be used instead.
640
641         Note that, unlike join(), the output buffer is specified first
642         such that a set of trailing strings can be provided.
643
644         Returns a valid slice of the output, containing the concatenated
645         text.
646
647 ******************************************************************************/
648
649 T[] postfix(T) (T[] dst, T[] postfix, T[][] src...)
650 {
651         return combine!(T) (dst, null, postfix, src);
652 }
653
654 /******************************************************************************
655
656         Combine a series of text segments together, each prefixed and/or
657         postfixed with optional strings. An optional output buffer can be
658         provided to avoid heap activity - which should be large enough to
659         contain the entire output, otherwise the heap will be used instead.
660
661         Note that, unlike join(), the output buffer is specified first
662         such that a set of trailing strings can be provided.
663
664         Returns a valid slice of the output, containing the concatenated
665         text.
666
667 ******************************************************************************/
668
669 T[] combine(T) (T[] dst, T[] prefix, T[] postfix, T[][] src ...)
670 {
671         size_t len = src.length * prefix.length +
672                    src.length * postfix.length;
673
674         foreach (segment; src)
675                  len += segment.length;
676                
677         if (dst.length < len)
678             dst.length = len;
679            
680         T* p = dst.ptr;
681         foreach (segment; src)
682                 {
683                 p[0 .. prefix.length] = prefix;
684                 p += prefix.length;
685                 p[0 .. segment.length] = segment;
686                 p += segment.length;
687                 p[0 .. postfix.length] = postfix;
688                 p += postfix.length;
689                 }
690
691         // remove trailing seperator
692         if (len)
693             len -= postfix.length;
694         return dst [0 .. len];       
695 }
696
697 /******************************************************************************
698
699         Repeat an array for a specific number of times. An optional output
700         buffer can be provided to avoid heap activity - it should be large
701         enough to contain the entire output, otherwise the heap will be used
702         instead.
703
704         Returns a valid slice of the output, containing the concatenated
705         text.
706
707 ******************************************************************************/
708
709 T[] repeat(T, U=size_t) (T[] src, U count, T[] dst=null)
710 {return repeat!(T)(src, count, dst);}
711
712 T[] repeat(T) (T[] src, size_t count, T[] dst=null)
713 {
714         size_t len = src.length * count;
715         if (len is 0)
716             return null;
717
718         if (dst.length < len)
719             dst.length = len;
720            
721         for (auto p = dst.ptr; count--; p += src.length)
722              p[0 .. src.length] = src;
723
724         return dst [0 .. len];
725 }
726
727 /******************************************************************************
728
729         Is the argument a whitespace character?
730
731 ******************************************************************************/
732
733 bool isSpace(T) (T c)
734 {
735         static if (T.sizeof is 1)
736                    return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v'));
737         else
738            return (c <= 32 && (c is ' ' || c is '\t' || c is '\r' || c is '\n' || c is '\f' || c is '\v')) || (c is '\u2028' || c is '\u2029');
739 }
740
741 /******************************************************************************
742
743         Return whether or not the two arrays have matching content
744         
745 ******************************************************************************/
746
747 bool matching(T, U=size_t) (T* s1, T* s2, U length)
748 {return matching!(T) (s1, s2, length);}
749
750 bool matching(T) (T* s1, T* s2, size_t length)
751 {
752         return mismatch(s1, s2, length) is length;
753 }
754
755 /******************************************************************************
756
757         Returns the index of the first match in str, failing once
758         length is reached. Note that we return 'length' for failure
759         and a 0-based index on success
760
761 ******************************************************************************/
762
763 size_t indexOf(T, U=size_t) (T* str, T match, U length)
764 {return indexOf!(T) (str, match, length);}
765
766 size_t indexOf(T) (T* str, T match, size_t length)
767 {
768         //assert (str);
769
770         static if (T.sizeof == 1)
771                    enum : size_t {m1 = cast(size_t) 0x0101010101010101,
772                                   m2 = cast(size_t) 0x8080808080808080}
773         static if (T.sizeof == 2)
774                    enum : size_t {m1 = cast(size_t) 0x0001000100010001,
775                                   m2 = cast(size_t) 0x8000800080008000}
776         static if (T.sizeof == 4)
777                    enum : size_t {m1 = cast(size_t) 0x0000000100000001,
778                                   m2 = cast(size_t) 0x8000000080000000}
779
780         static if (T.sizeof < size_t.sizeof)
781         {
782                 if (length)
783                    {
784                    size_t m = match;
785                    m += m << (8 * T.sizeof);
786
787                    static if (T.sizeof < size_t.sizeof / 2)
788                               m += (m << (8 * T.sizeof * 2));
789
790                    static if (T.sizeof < size_t.sizeof / 4)
791                               m += (m << (8 * T.sizeof * 4));
792
793                    auto p = str;
794                    auto e = p + length - size_t.sizeof/T.sizeof;
795                    while (p < e)
796                          {
797                          // clear matching T segments
798                          auto v = (*cast(size_t*) p) ^ m;
799                          // test for zero, courtesy of Alan Mycroft
800                          if ((v - m1) & ~v & m2)
801                               break;
802                          p += size_t.sizeof/T.sizeof;
803                          }
804
805                    e += size_t.sizeof/T.sizeof;
806                    while (p < e)
807                           if (*p++ is match)
808                               return cast(size_t) (p - str - 1);
809                    }
810                 return length;
811         }
812         else
813         {
814                 auto len = length;
815                 for (auto p=str-1; len--;)
816                      if (*++p is match)
817                          return cast(size_t) (p - str);
818                 return length;
819         }
820 }
821
822 /******************************************************************************
823
824         Returns the index of a mismatch between s1 & s2, failing when
825         length is reached. Note that we return 'length' upon failure
826         (array content matches) and a 0-based index upon success.
827
828         Use this as a faster opEquals. Also provides the basis for a
829         faster opCmp, since the index of the first mismatched character
830         can be used to determine the return value
831
832 ******************************************************************************/
833
834 size_t mismatch(T, U=size_t) (T* s1, T* s2, U length)
835 {return mismatch!(T)(s1, s2, length);}
836
837 size_t mismatch(T) (T* s1, T* s2, size_t length)
838 {
839         assert (s1 && s2);
840
841         static if (T.sizeof < size_t.sizeof)
842         {
843                 if (length)
844                    {
845                    auto start = s1;
846                    auto e = start + length - size_t.sizeof/T.sizeof;
847
848                    while (s1 < e)
849                          {
850                          if (*cast(size_t*) s1 != *cast(size_t*) s2)
851                              break;
852                          s1 += size_t.sizeof/T.sizeof;
853                          s2 += size_t.sizeof/T.sizeof;
854                          }
855
856                    e += size_t.sizeof/T.sizeof;
857                    while (s1 < e)
858                           if (*s1++ != *s2++)
859                               return s1 - start - 1;
860                    }
861                 return length;
862         }
863         else
864         {
865                 auto len = length;
866                 for (auto p=s1-1; len--;)
867                      if (*++p != *s2++)
868                          return p - s1;
869                 return length;
870         }
871 }
872
873 /******************************************************************************
874
875         Iterator to isolate lines.
876
877         Converts text into a set of lines, where each line is identified
878         by a \n or \r\n combination. The line terminator is stripped from
879         each resultant array.
880
881         ---
882         foreach (line; lines ("one\ntwo\nthree"))
883                  ...
884         ---
885         
886 ******************************************************************************/
887
888 LineFruct!(T) lines(T) (T[] src)
889 {
890         LineFruct!(T) lines;
891         lines.src = src;
892         return lines;
893 }
894
895 /******************************************************************************
896
897         Iterator to isolate text elements.
898
899         Splits the provided array wherever a delimiter-set instance is
900         found, and return the resultant segments. The delimiters are
901         excluded from each of the segments. Note that delimiters are
902         matched as a set of alternates rather than as a pattern.
903
904         Splitting on a single delimiter is considerably faster than
905         splitting upon a set of alternatives.
906
907         ---
908         foreach (segment; delimiters ("one,two;three", ",;"))
909                  ...
910         ---
911         
912 ******************************************************************************/
913
914 DelimFruct!(T) delimiters(T) (T[] src, T[] set)
915 {
916         DelimFruct!(T) elements;
917         elements.set = set;
918         elements.src = src;
919         return elements;
920 }
921
922 /******************************************************************************
923
924         Iterator to isolate text elements.
925
926         Split the provided array wherever a pattern instance is found,
927         and return the resultant segments. Pattern are excluded from
928         each of the segments, and an optional sub argument enables
929         replacement.
930         
931         ---
932         foreach (segment; patterns ("one, two, three", ", "))
933                  ...
934         ---
935         
936 ******************************************************************************/
937
938 PatternFruct!(T) patterns(T) (T[] src, T[] pattern, T[] sub=null)
939 {
940         PatternFruct!(T) elements;
941         elements.pattern = pattern;
942         elements.sub = sub;
943         elements.src = src;
944         return elements;
945 }
946
947 /******************************************************************************
948
949         Iterator to isolate optionally quoted text elements.
950
951         As per elements(), but with the extension of being quote-aware;
952         the set of delimiters is ignored inside a pair of quotes. Note
953         that an unterminated quote will consume remaining content.
954         
955         ---
956         foreach (quote; quotes ("one two 'three four' five", " "))
957                  ...
958         ---
959         
960 ******************************************************************************/
961
962 QuoteFruct!(T) quotes(T) (T[] src, T[] set)
963 {
964         QuoteFruct!(T) quotes;
965         quotes.set = set;
966         quotes.src = src;
967         return quotes;
968 }
969
970 /*******************************************************************************
971
972         Arranges text strings in order, using indices to specify where
973         each particular argument should be positioned within the text.
974         This is handy for collating I18N components, or as a simplistic
975         and lightweight formatter. Indices range from zero through nine.
976         
977         ---
978         // write ordered text to the console
979         char[64] tmp;
980
981         Cout (layout (tmp, "%1 is after %0", "zero", "one")).newline;
982         ---
983
984 *******************************************************************************/
985
986 T[] layout(T) (T[] output, T[][] layout ...)
987 {
988         static T[] badarg   = "{index out of range}";
989         static T[] toosmall = "{output buffer too small}";
990        
991         size_t  pos,
992                 args;
993         bool    state;
994
995         args = layout.length - 1;
996         foreach (c; layout[0])
997                 {
998                 if (state)
999                    {
1000                    state = false;
1001                    if (c >= '0' && c <= '9')
1002                       {
1003                       size_t index = c - '0';
1004                       if (index < args)
1005                          {
1006                          T[] x = layout[index+1];
1007
1008                          size_t limit = pos + x.length;
1009                          if (limit < output.length)
1010                             {
1011                             output [pos .. limit] = x;
1012                             pos = limit;
1013                             continue;
1014                             }
1015                          else
1016                             return toosmall;
1017                          }
1018                       else
1019                          return badarg;
1020                       }
1021                    }
1022                 else
1023                    if (c is '%')
1024                       {
1025                       state = true;
1026                       continue;
1027                       }
1028
1029                 if (pos < output.length)
1030                    {
1031                    output[pos] = c;
1032                    ++pos;
1033                    }
1034                 else     
1035                    return toosmall;
1036                 }
1037
1038         return output [0..pos];
1039 }
1040
1041 /******************************************************************************
1042
1043         Convert 'escaped' chars to normal ones: \t => ^t for example.
1044         Supports \" \' \\ \a \b \f \n \r \t \v
1045         
1046 ******************************************************************************/
1047
1048 T[] unescape(T) (T[] src, T[] dst = null)
1049 {
1050         int delta;
1051         auto s = src.ptr;
1052         auto len = src.length;
1053
1054         // take a peek first to see if there's anything
1055         if ((delta = indexOf (s, '\\', len)) < len)
1056            {
1057            // make some room if not enough provided
1058            if (dst.length < src.length)
1059                dst.length = src.length;
1060            auto d = dst.ptr;
1061
1062            // copy segments over, a chunk at a time
1063            do {
1064               d [0 .. delta] = s [0 .. delta];
1065               len -= delta;
1066               s += delta;
1067               d += delta;
1068
1069               // bogus trailing '\'
1070               if (len < 2)
1071                  {
1072                  *d++ = '\\';
1073                  len = 0;
1074                  break;
1075                  }
1076
1077               // translate \char
1078               auto c = s[1];
1079               switch (c)
1080                      {
1081                       case '\\':
1082                            break;
1083                       case '\'':
1084                            c = '\'';
1085                            break;
1086                       case '"':
1087                            c = '"';
1088                            break;
1089                       case 'a':
1090                            c = '\a';
1091                            break;
1092                       case 'b':
1093                            c = '\b';
1094                            break;
1095                       case 'f':
1096                            c = '\f';
1097                            break;
1098                       case 'n':
1099                            c = '\n';
1100                            break;
1101                       case 'r':
1102                            c = '\r';
1103                            break;
1104                       case 't':
1105                            c = '\t';
1106                            break;
1107                       case 'v':
1108                            c = '\v';
1109                            break;
1110                       default:
1111                            *d++ = '\\';
1112                      }
1113               *d++ = c; 
1114               len -= 2;           
1115               s += 2;
1116               } while ((delta = indexOf (s, '\\', len)) < len);
1117
1118            // copy tail too
1119            d [0 .. len] = s [0 .. len];
1120            return dst [0 .. (d + len) - dst.ptr];
1121            }
1122         return src;
1123 }
1124
1125
1126 /******************************************************************************
1127
1128         jhash() -- hash a variable-length key into a 32-bit value
1129
1130           k     : the key (the unaligned variable-length array of bytes)
1131           len   : the length of the key, counting by bytes
1132           level : can be any 4-byte value
1133
1134         Returns a 32-bit value.  Every bit of the key affects every bit of
1135         the return value.  Every 1-bit and 2-bit delta achieves avalanche.
1136
1137         About 4.3*len + 80 X86 instructions, with excellent pipelining
1138
1139         The best hash table sizes are powers of 2.  There is no need to do
1140         mod a prime (mod is sooo slow!).  If you need less than 32 bits,
1141         use a bitmask.  For example, if you need only 10 bits, do
1142
1143                     h = (h & hashmask(10));
1144
1145         In which case, the hash table should have hashsize(10) elements.
1146         If you are hashing n strings (ub1 **)k, do it like this:
1147
1148                     for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
1149
1150         By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use
1151         this code any way you wish, private, educational, or commercial. 
1152         It's free.
1153
1154         See http://burtleburtle.net/bob/hash/evahash.html
1155         Use for hash table lookup, or anything where one collision in 2^32
1156         is acceptable. Do NOT use for cryptographic purposes.
1157
1158 ******************************************************************************/
1159
1160 size_t jhash (ubyte* k, size_t len, size_t c = 0)
1161 {
1162         size_t a = 0x9e3779b9,
1163              b = 0x9e3779b9,
1164              i = len;
1165
1166         // handle most of the key
1167         while (i >= 12)
1168               {
1169               a += *cast(uint *)(k+0);
1170               b += *cast(uint *)(k+4);
1171               c += *cast(uint *)(k+8);
1172
1173               a -= b; a -= c; a ^= (c>>13);
1174               b -= c; b -= a; b ^= (a<<8);
1175               c -= a; c -= b; c ^= (b>>13);
1176               a -= b; a -= c; a ^= (c>>12); 
1177               b -= c; b -= a; b ^= (a<<16);
1178               c -= a; c -= b; c ^= (b>>5);
1179               a -= b; a -= c; a ^= (c>>3); 
1180               b -= c; b -= a; b ^= (a<<10);
1181               c -= a; c -= b; c ^= (b>>15);
1182               k += 12; i -= 12;
1183               }
1184
1185         // handle the last 11 bytes
1186         c += len;
1187         switch (i)
1188                {
1189                case 11: c+=(cast(uint)k[10]<<24);
1190                case 10: c+=(cast(uint)k[9]<<16);
1191                case 9 : c+=(cast(uint)k[8]<<8);
1192                case 8 : b+=(cast(uint)k[7]<<24);
1193                case 7 : b+=(cast(uint)k[6]<<16);
1194                case 6 : b+=(cast(uint)k[5]<<8);
1195                case 5 : b+=(cast(uint)k[4]);
1196                case 4 : a+=(cast(uint)k[3]<<24);
1197                case 3 : a+=(cast(uint)k[2]<<16);
1198                case 2 : a+=(cast(uint)k[1]<<8);
1199                case 1 : a+=(cast(uint)k[0]);
1200                default:
1201                }
1202
1203         a -= b; a -= c; a ^= (c>>13);
1204         b -= c; b -= a; b ^= (a<<8);
1205         c -= a; c -= b; c ^= (b>>13);
1206         a -= b; a -= c; a ^= (c>>12); 
1207         b -= c; b -= a; b ^= (a<<16);
1208         c -= a; c -= b; c ^= (b>>5);
1209         a -= b; a -= c; a ^= (c>>3); 
1210         b -= c; b -= a; b ^= (a<<10);
1211         c -= a; c -= b; c ^= (b>>15);
1212
1213         return c;
1214 }
1215
1216 /// ditto
1217 size_t jhash (void[] x, size_t c = 0)
1218 {
1219         return jhash (cast(ubyte*) x.ptr, x.length, c);
1220 }
1221
1222
1223 /******************************************************************************
1224       
1225         Helper fruct for iterator lines(). A fruct is a low
1226         impact mechanism for capturing context relating to an
1227         opApply (conjunction of the names struct and foreach)
1228         
1229 ******************************************************************************/
1230
1231 private struct LineFruct(T)
1232 {
1233         private T[] src;
1234
1235         int opApply (int delegate (ref T[] line) dg)
1236         {
1237                 int     ret;
1238                 size_t  pos,
1239                         mark;
1240                 T[]     line;
1241
1242                 const T nl = '\n';
1243                 const T cr = '\r';
1244
1245                 while ((pos = locate (src, nl, mark)) < src.length)
1246                       {
1247                       auto end = pos;
1248                       if (end && src[end-1] is cr)
1249                           --end;
1250
1251                       line = src [mark .. end];
1252                       if ((ret = dg (line)) != 0)
1253                            return ret;
1254                       mark = pos + 1;
1255                       }
1256
1257                 line = src [mark .. $];
1258                 if (mark <= src.length)
1259                     ret = dg (line);
1260
1261                 return ret;
1262         }
1263 }
1264
1265 /******************************************************************************
1266
1267         Helper fruct for iterator delims(). A fruct is a low
1268         impact mechanism for capturing context relating to an
1269         opApply (conjunction of the names struct and foreach)
1270         
1271 ******************************************************************************/
1272
1273 private struct DelimFruct(T)
1274 {
1275         private T[] src;
1276         private T[] set;
1277
1278         int opApply (int delegate (ref T[] token) dg)
1279         {
1280                 int     ret;
1281                 size_t  pos,
1282                         mark;
1283                 T[]     token;
1284
1285                 // optimize for single delimiter case
1286                 if (set.length is 1)
1287                     while ((pos = locate (src, set[0], mark)) < src.length)
1288                           {
1289                           token = src [mark .. pos];
1290                           if ((ret = dg (token)) != 0)
1291                                return ret;
1292                           mark = pos + 1;
1293                           }
1294                 else
1295                    if (set.length > 1)
1296                        foreach (i, elem; src)
1297                                 if (contains (set, elem))
1298                                    {
1299                                    token = src [mark .. i];
1300                                    if ((ret = dg (token)) != 0)
1301                                         return ret;
1302                                    mark = i + 1;
1303                                    }
1304
1305                 token = src [mark .. $];
1306                 if (mark <= src.length)
1307                     ret = dg (token);
1308
1309                 return ret;
1310         }
1311 }
1312
1313 /******************************************************************************
1314
1315         Helper fruct for iterator patterns(). A fruct is a low
1316         impact mechanism for capturing context relating to an
1317         opApply (conjunction of the names struct and foreach)
1318         
1319 ******************************************************************************/
1320
1321 private struct PatternFruct(T)
1322 {
1323         private T[] src,
1324                     sub,
1325                     pattern;
1326
1327         int opApply (int delegate (ref T[] token) dg)
1328         {
1329                 int     ret;
1330                 size_t  pos,
1331                         mark;
1332                 T[]     token;
1333
1334                 while ((pos = index (src, pattern, mark)) < src.length)
1335                       {
1336                       token = src [mark .. pos];
1337                       if ((ret = dg(token)) != 0)
1338                            return ret;
1339                       if (sub.ptr && (ret = dg(sub)) != 0)
1340                           return ret;
1341                       mark = pos + pattern.length;
1342                       }
1343
1344                 token = src [mark .. $];
1345                 if (mark <= src.length)
1346                     ret = dg (token);
1347
1348                 return ret;
1349         }
1350 }
1351
1352 /******************************************************************************
1353
1354         Helper fruct for iterator quotes(). A fruct is a low
1355         impact mechanism for capturing context relating to an
1356         opApply (conjunction of the names struct and foreach)
1357         
1358 ******************************************************************************/
1359
1360 private struct QuoteFruct(T)
1361 {
1362         private T[] src;
1363         private T[] set;
1364        
1365         int opApply (int delegate (ref T[] token) dg)
1366         {
1367                 int     ret;
1368                 size_t  mark;
1369                 T[]     token;
1370
1371                 if (set.length)
1372                     for (size_t i=0; i < src.length; ++i)
1373                         {
1374                         T c = src[i];
1375                         if (c is '"' || c is '\'')
1376                             i = locate (src, c, i+1);
1377                         else
1378                            if (contains (set, c))
1379                               {
1380                               token = src [mark .. i];
1381                               if ((ret = dg (token)) != 0)
1382                                    return ret;
1383                               mark = i + 1;
1384                               }
1385                         }
1386                
1387                 token = src [mark .. $];
1388                 if (mark <= src.length)
1389                     ret = dg (token);
1390
1391                 return ret;
1392         }
1393 }
1394
1395
1396 /******************************************************************************
1397
1398 ******************************************************************************/
1399
1400 debug (UnitTest)
1401 {
1402         unittest
1403         {
1404         char[64] tmp;
1405
1406         assert (isSpace (' ') && !isSpace ('d'));
1407
1408         assert (indexOf ("abc".ptr, 'a', 3) is 0);
1409         assert (indexOf ("abc".ptr, 'b', 3) is 1);
1410         assert (indexOf ("abc".ptr, 'c', 3) is 2);
1411         assert (indexOf ("abc".ptr, 'd', 3) is 3);
1412         assert (indexOf ("abcabcabc".ptr, 'd', 9) is 9);
1413
1414         assert (indexOf ("abc"d.ptr, cast(dchar)'c', 3) is 2);
1415         assert (indexOf ("abc"d.ptr, cast(dchar)'d', 3) is 3);
1416
1417         assert (indexOf ("abc"w.ptr, cast(wchar)'c', 3) is 2);
1418         assert (indexOf ("abc"w.ptr, cast(wchar)'d', 3) is 3);
1419         assert (indexOf ("abcdefghijklmnopqrstuvwxyz"w.ptr, cast(wchar)'x', 25) is 23);
1420
1421         assert (mismatch ("abc".ptr, "abc".ptr, 3) is 3);
1422         assert (mismatch ("abc".ptr, "abd".ptr, 3) is 2);
1423         assert (mismatch ("abc".ptr, "acc".ptr, 3) is 1);
1424         assert (mismatch ("abc".ptr, "ccc".ptr, 3) is 0);
1425
1426         assert (mismatch ("abc"w.ptr, "abc"w.ptr, 3) is 3);
1427         assert (mismatch ("abc"w.ptr, "acc"w.ptr, 3) is 1);
1428
1429         assert (mismatch ("abc"d.ptr, "abc"d.ptr, 3) is 3);
1430         assert (mismatch ("abc"d.ptr, "acc"d.ptr, 3) is 1);
1431
1432         assert (matching ("abc".ptr, "abc".ptr, 3));
1433         assert (matching ("abc".ptr, "abb".ptr, 3) is false);
1434        
1435         assert (contains ("abc", 'a'));
1436         assert (contains ("abc", 'b'));
1437         assert (contains ("abc", 'c'));
1438         assert (contains ("abc", 'd') is false);
1439
1440         assert (containsPattern ("abc", "ab"));
1441         assert (containsPattern ("abc", "bc"));
1442         assert (containsPattern ("abc", "abc"));
1443         assert (containsPattern ("abc", "zabc") is false);
1444         assert (containsPattern ("abc", "abcd") is false);
1445         assert (containsPattern ("abc", "za") is false);
1446         assert (containsPattern ("abc", "cd") is false);
1447
1448         assert (trim ("") == "");
1449         assert (trim (" abc  ") == "abc");
1450         assert (trim ("   ") == "");
1451
1452         assert (strip ("", '%') == "");
1453         assert (strip ("%abc%%%", '%') == "abc");
1454         assert (strip ("#####", '#') == "");
1455         assert (stripl ("#####", '#') == "");
1456         assert (stripl (" ###", ' ') == "###");
1457         assert (stripl ("#####", 's') == "#####");
1458         assert (stripr ("#####", '#') == "");
1459         assert (stripr ("### ", ' ') == "###");
1460         assert (stripr ("#####", 's') == "#####");
1461
1462         assert (replace ("abc".dup, 'b', ':') == "a:c");
1463         assert (substitute ("abc".dup, "bc", "x") == "ax");
1464
1465         assert (locate ("abc", 'c', 1) is 2);
1466
1467         assert (locate ("abc", 'c') is 2);
1468         assert (locate ("abc", 'a') is 0);
1469         assert (locate ("abc", 'd') is 3);
1470         assert (locate ("", 'c') is 0);
1471
1472         assert (locatePrior ("abce", 'c') is 2);
1473         assert (locatePrior ("abce", 'a') is 0);
1474         assert (locatePrior ("abce", 'd') is 4);
1475         assert (locatePrior ("abce", 'c', 3) is 2);
1476         assert (locatePrior ("abce", 'c', 2) is 4);
1477         assert (locatePrior ("", 'c') is 0);
1478
1479         auto x = delimit ("::b", ":");
1480         assert (x.length is 3 && x[0] == "" && x[1] == "" && x[2] == "b");
1481         x = delimit ("a:bc:d", ":");
1482         assert (x.length is 3 && x[0] == "a" && x[1] == "bc" && x[2] == "d");
1483         x = delimit ("abcd", ":");
1484         assert (x.length is 1 && x[0] == "abcd");
1485         x = delimit ("abcd:", ":");
1486         assert (x.length is 2 && x[0] == "abcd" && x[1] == "");
1487         x = delimit ("a;b$c#d:e@f", ";:$#@");
1488         assert (x.length is 6 && x[0]=="a" && x[1]=="b" && x[2]=="c" &&
1489                                  x[3]=="d" && x[4]=="e" && x[5]=="f");
1490
1491         assert (locatePattern ("abcdefg", "") is 7);
1492         assert (locatePattern ("abcdefg", "g") is 6);
1493         assert (locatePattern ("abcdefg", "abcdefg") is 0);
1494         assert (locatePattern ("abcdefg", "abcdefgx") is 7);
1495         assert (locatePattern ("abcdefg", "cce") is 7);
1496         assert (locatePattern ("abcdefg", "cde") is 2);
1497         assert (locatePattern ("abcdefgcde", "cde", 3) is 7);
1498
1499         assert (locatePatternPrior ("abcdefg", "") is 7);
1500         assert (locatePatternPrior ("abcdefg", "cce") is 7);
1501         assert (locatePatternPrior ("abcdefg", "cde") is 2);
1502         assert (locatePatternPrior ("abcdefgcde", "cde", 6) is 2);
1503         assert (locatePatternPrior ("abcdefgcde", "cde", 4) is 2);
1504         assert (locatePatternPrior ("abcdefg", "abcdefgx") is 7);
1505
1506         x = splitLines ("a\nb\n");
1507         assert (x.length is 3 && x[0] == "a" && x[1] == "b" && x[2] == "");
1508         x = splitLines ("a\r\n");
1509         assert (x.length is 2 && x[0] == "a" && x[1] == "");
1510
1511         x = splitLines ("a");
1512         assert (x.length is 1 && x[0] == "a");
1513         x = splitLines ("");
1514         assert (x.length is 1);
1515
1516         char[][] q;
1517         foreach (element; quotes ("1 'avcc   cc ' 3", " "))
1518                  q ~= element;
1519         assert (q.length is 3 && q[0] == "1" && q[1] == "'avcc   cc '" && q[2] == "3");
1520
1521         assert (layout (tmp, "%1,%%%c %0", "abc", "efg") == "efg,%c abc");
1522
1523         x = split ("one, two, three", ",");
1524         assert (x.length is 3 && x[0] == "one" && x[1] == " two" && x[2] == " three");
1525         x = split ("one, two, three", ", ");
1526         assert (x.length is 3 && x[0] == "one" && x[1] == "two" && x[2] == "three");
1527         x = split ("one, two, three", ",,");
1528         assert (x.length is 1 && x[0] == "one, two, three");
1529         x = split ("one,,", ",");
1530         assert (x.length is 3 && x[0] == "one" && x[1] == "" && x[2] == "");
1531
1532         char[] h, t;
1533         h =  head ("one:two:three", ":", t);
1534         assert (h == "one" && t == "two:three");
1535         h = head ("one:::two:three", ":::", t);
1536         assert (h == "one" && t == "two:three");
1537         h = head ("one:two:three", "*", t);
1538         assert (h == "one:two:three" && t is null);
1539
1540         t =  tail ("one:two:three", ":", h);
1541         assert (h == "one:two" && t == "three");
1542         t = tail ("one:::two:three", ":::", h);
1543         assert (h == "one" && t == "two:three");
1544         t = tail ("one:two:three", "*", h);
1545         assert (t == "one:two:three" && h is null);
1546
1547         assert (chopl("hello world", "hello ") == "world");
1548         assert (chopl("hello", "hello") == "");
1549         assert (chopl("hello world", " ") == "hello world");
1550         assert (chopl("hello world", "") == "hello world");
1551
1552         assert (chopr("hello world", " world") == "hello");
1553         assert (chopr("hello", "hello") == "");
1554         assert (chopr("hello world", " ") == "hello world");
1555         assert (chopr("hello world", "") == "hello world");
1556
1557         char[][] foo = ["one", "two", "three"];
1558         auto j = join (foo);
1559         assert (j == "onetwothree");
1560         j = join (foo, ", ");
1561         assert (j == "one, two, three");
1562         j = join (foo, " ", tmp);
1563         assert (j == "one two three");
1564         assert (j.ptr is tmp.ptr);
1565
1566         assert (repeat ("abc", 0) == "");
1567         assert (repeat ("abc", 1) == "abc");
1568         assert (repeat ("abc", 2) == "abcabc");
1569         assert (repeat ("abc", 4) == "abcabcabcabc");
1570         assert (repeat ("", 4) == "");
1571         char[10] rep;
1572         assert (repeat ("abc", 0, rep) == "");
1573         assert (repeat ("abc", 1, rep) == "abc");
1574         assert (repeat ("abc", 2, rep) == "abcabc");
1575         assert (repeat ("", 4, rep) == "");
1576
1577         assert (unescape ("abc") == "abc");
1578         assert (unescape ("abc\\") == "abc\\");
1579         assert (unescape ("abc\\t") == "abc\t");
1580         assert (unescape ("abc\\tc") == "abc\tc");
1581         assert (unescape ("\\t") == "\t");
1582         assert (unescape ("\\tx") == "\tx");
1583         assert (unescape ("\\v\\vx") == "\v\vx");
1584         assert (unescape ("abc\\t\\a\\bc") == "abc\t\a\bc");
1585         }
1586 }
1587
1588
1589
1590 debug (Util)
1591 {
1592         auto x = import("Util.d");
1593        
1594         void main()
1595         {
1596                 mismatch ("".ptr, x.ptr, 0);
1597                 indexOf ("".ptr, '@', 0);
1598                 char[] s;
1599                 split (s, " ");
1600                 //indexOf (s.ptr, '@', 0);
1601
1602         }
1603 }
Note: See TracBrowser for help on using the browser.