root/trunk/phobos/std/encoding.d

Revision 2195, 71.5 kB (checked in by Don Clugston, 2 years ago)

Move Boost copyright declaration from ddoc to normal comment. Fixes ugly ddoc output.

  • Property svn:eol-style set to native
  • Property svn:executable set to *
Line 
1 // Written in the D programming language.
2
3 /**
4 Classes and functions for handling and transcoding between various encodings.
5
6 For cases where the _encoding is known at compile-time, functions are provided
7 for arbitrary _encoding and decoding of characters, arbitrary transcoding
8 between strings of different type, as well as validation and sanitization.
9
10 Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11 (also known as LATIN-1), and WINDOWS-1252.
12
13 $(UL
14 $(LI The type $(D AsciiChar) represents an ASCII character.)
15 $(LI The type $(D AsciiString) represents an ASCII string.)
16 $(LI The type $(D Latin1Char) represents an ISO-8859-1 character.)
17 $(LI The type $(D Latin1String) represents an ISO-8859-1 string.)
18 $(LI The type $(D Windows1252Char) represents a Windows-1252 character.)
19 $(LI The type $(D Windows1252String) represents a Windows-1252 string.))
20
21 For cases where the _encoding is not known at compile-time, but is
22 known at run-time, we provide the abstract class $(D EncodingScheme)
23 and its subclasses.  To construct a run-time encoder/decoder, one does
24 e.g.
25
26 ----------------------------------------------------
27     auto e = EncodingScheme.create("utf-8");
28 ----------------------------------------------------
29
30 This library supplies $(D EncodingScheme) subclasses for ASCII,
31 ISO-8859-1 (also known as LATIN-1), WINDOWS-1252, UTF-8, and (on
32 little-endian architectures) UTF-16LE and UTF-32LE; or (on big-endian
33 architectures) UTF-16BE and UTF-32BE.
34
35 This library provides a mechanism whereby other modules may add $(D
36 EncodingScheme) subclasses for any other _encoding.
37
38 Macros:
39     WIKI=Phobos/StdEncoding
40
41 Copyright: Copyright Janice Caron 2008 - 2009.
42 License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
43 Authors:   Janice Caron
44 */
45 /*
46          Copyright Janice Caron 2008 - 2009.
47 Distributed under the Boost Software License, Version 1.0.
48    (See accompanying file LICENSE_1_0.txt or copy at
49          http://www.boost.org/LICENSE_1_0.txt)
50 */
51 module std.encoding;
52
53 import std.string;
54 import std.traits;
55 import std.range;
56
57 unittest
58 {
59     static ubyte[][] validStrings =   // well-formed UTF-8 byte sequences; each one is asserted valid further down
60     [
61         // Plain ASCII
62         cast(ubyte[])"hello",
63
64         // First possible sequence of a certain length
65         [ 0x00 ],                       // U+00000000   one byte
66         [ 0xC2, 0x80 ],                 // U+00000080   two bytes
67         [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
68         [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes
69
70         // Last possible sequence of a certain length
71         [ 0x7F ],                       // U+0000007F   one byte
72         [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
73         [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes
74
75         // Other boundary conditions
76         [ 0xED, 0x9F, 0xBF ],
77         // U+0000D7FF   Last character before surrogates
78         [ 0xEE, 0x80, 0x80 ],
79         // U+0000E000   First character after surrogates
80         [ 0xEF, 0xBF, 0xBD ],
81         // U+0000FFFD   Unicode replacement character
82         [ 0xF4, 0x8F, 0xBF, 0xBF ],
83         // U+0010FFFF   Very last character
84
85         // Non-character code points
86         /*  NOTE: These are legal in UTF, and may be converted from
87             one UTF to another, however they do not represent Unicode
88             characters. These code points have been reserved by
89             Unicode as non-character code points. They are permissible
90             for data exchange within an application, but they are
91             not permitted to be used as characters. Since this module
92             deals with UTF, and not with Unicode per se, we choose to
93             accept them here. */
94         [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE   (three bytes; DF BE would be U+07FE)
95         [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   (three bytes; DF BF would be U+07FF)
96     ];
97
98     static ubyte[][] invalidStrings =
99     [
100         // First possible sequence of a certain length, but greater
101         // than U+10FFFF
102         [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
103         [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes
104
105         // Last possible sequence of a certain length, but greater than U+10FFFF
106         [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
107         [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
108         [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes
109
110         // Other boundary conditions
111         [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
112                                                     // First code
113                                                     // point after
114                                                     // last character
115
116         // Unexpected continuation bytes
117         [ 0x80 ],
118         [ 0xBF ],
119         [ 0x20, 0x80, 0x20 ],
120         [ 0x20, 0xBF, 0x20 ],
121         [ 0x80, 0x9F, 0xA0 ],
122
123         // Lonely start bytes
124         [ 0xC0 ],
125         [ 0xCF ],
126         [ 0x20, 0xC0, 0x20 ],
127         [ 0x20, 0xCF, 0x20 ],
128         [ 0xD0 ],
129         [ 0xDF ],
130         [ 0x20, 0xD0, 0x20 ],
131         [ 0x20, 0xDF, 0x20 ],
132         [ 0xE0 ],
133         [ 0xEF ],
134         [ 0x20, 0xE0, 0x20 ],
135         [ 0x20, 0xEF, 0x20 ],
136         [ 0xF0 ],
137         [ 0xF1 ],
138         [ 0xF2 ],
139         [ 0xF3 ],
140         [ 0xF4 ],
141         [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
142         [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
143         [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF
144
145         [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
146         [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
147         [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above
148
149         // Impossible bytes
150         [ 0xF8 ],
151         [ 0xF9 ],
152         [ 0xFA ],
153         [ 0xFB ],
154         [ 0xFC ],
155         [ 0xFD ],
156         [ 0xFE ],
157         [ 0xFF ],
158         [ 0x20, 0xF8, 0x20 ],
159         [ 0x20, 0xF9, 0x20 ],
160         [ 0x20, 0xFA, 0x20 ],
161         [ 0x20, 0xFB, 0x20 ],
162         [ 0x20, 0xFC, 0x20 ],
163         [ 0x20, 0xFD, 0x20 ],
164         [ 0x20, 0xFE, 0x20 ],
165         [ 0x20, 0xFF, 0x20 ],
166
167         // Overlong sequences, all representing U+002F
168         /*  With a safe UTF-8 decoder, all of the following five overlong
169             representations of the ASCII character slash ("/") should be
170             rejected like a malformed UTF-8 sequence */
171         [ 0xC0, 0xAF ],
172         [ 0xE0, 0x80, 0xAF ],
173         [ 0xF0, 0x80, 0x80, 0xAF ],
174         [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
175         [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],
176
177         // Maximum overlong sequences
178         /*  Below you see the highest Unicode value that is still resulting in
179             an overlong sequence if represented with the given number of bytes.
180             This is a boundary test for safe UTF-8 decoders. All five
181             characters should be rejected like malformed UTF-8 sequences. */
182         [ 0xC1, 0xBF ],                             // U+0000007F
183         [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
184         [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
185         [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
186         [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF
187
188         // Overlong representation of the NUL character
189         /*  The following five sequences should also be rejected like malformed
190             UTF-8 sequences and should not be treated like the ASCII NUL
191             character. */
192         [ 0xC0, 0x80 ],
193         [ 0xE0, 0x80, 0x80 ],
194         [ 0xF0, 0x80, 0x80, 0x80 ],
195         [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
196         [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],
197
198         // Illegal code positions
199         /*  The following UTF-8 sequences should be rejected like malformed
200             sequences, because they never represent valid ISO 10646 characters
201             and a UTF-8 decoder that accepts them might introduce security
202             problems comparable to overlong UTF-8 sequences. */
203         [ 0xED, 0xA0, 0x80 ],       // U+D800
204         [ 0xED, 0xAD, 0xBF ],       // U+DB7F
205         [ 0xED, 0xAE, 0x80 ],       // U+DB80
206         [ 0xED, 0xAF, 0xBF ],       // U+DBFF
207         [ 0xED, 0xB0, 0x80 ],       // U+DC00
208         [ 0xED, 0xBE, 0x80 ],       // U+DF80
209         [ 0xED, 0xBF, 0xBF ],       // U+DFFF
210     ];
211
212     static string[] sanitizedStrings =
213     [
214         "\uFFFD","\uFFFD",
215         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
216         " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
217         "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
218         " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
219         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
220         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
221         " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
222         " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
223         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
224         "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
225     ];
226
227     // Make sure everything that should be valid, is
228     foreach(a;validStrings)
229     {
230         string s = cast(string)a;
231         assert(isValid(s),"Failed to validate: "~makeReadable(s));
232     }
233
234     // Make sure everything that shouldn't be valid, isn't
235     foreach(a;invalidStrings)
236     {
237         string s = cast(string)a;
238         assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
239     }
240
241     // Make sure we can sanitize everything bad
242     assert(invalidStrings.length == sanitizedStrings.length);
243     foreach(i, a; invalidStrings)   // i is a size_t index, so there is no int/size_t comparison
244     {
245         string s = cast(string)a;
246         string t = sanitize(s);
247         assert(isValid(t));                 // sanitized text must itself validate
248         assert(t == sanitizedStrings[i]);   // and must equal the expected entry at the same index
249         ubyte[] u = cast(ubyte[])t;
250         validStrings ~= u;                  // sanitized output is added to the strings transcoded below
251     }
252
253     // Make sure all transcodings work in both directions, using both forward
254     // and reverse iteration
255     foreach(a; validStrings)
256     {
257         string s = cast(string)a;
258         string s2;
259         wstring ws, ws2;
260         dstring ds, ds2;
261
262         transcode(s,ws);
263         assert(isValid(ws));
264         transcode(ws,s2);
265         assert(s == s2);
266
267         transcode(s,ds);
268         assert(isValid(ds));
269         transcode(ds,s2);
270         assert(s == s2);
271
272         transcode(ws,s);
273         assert(isValid(s));
274         transcode(s,ws2);
275         assert(ws == ws2);
276
277         transcode(ws,ds);
278         assert(isValid(ds));
279         transcode(ds,ws2);
280         assert(ws == ws2);
281
282         transcode(ds,s);
283         assert(isValid(s));
284         transcode(s,ds2);
285         assert(ds == ds2);
286
287         transcode(ds,ws);
288         assert(isValid(ws));
289         transcode(ws,ds2);
290         assert(ds == ds2);
291
292         transcodeReverse(s,ws);
293         assert(isValid(ws));
294         transcodeReverse(ws,s2);
295         assert(s == s2);
296
297         transcodeReverse(s,ds);
298         assert(isValid(ds));
299         transcodeReverse(ds,s2);
300         assert(s == s2);
301
302         transcodeReverse(ws,s);
303         assert(isValid(s));
304         transcodeReverse(s,ws2);
305         assert(ws == ws2);
306
307         transcodeReverse(ws,ds);
308         assert(isValid(ds));
309         transcodeReverse(ds,ws2);
310         assert(ws == ws2);
311
312         transcodeReverse(ds,s);
313         assert(isValid(s));
314         transcodeReverse(s,ds2);
315         assert(ds == ds2);
316
317         transcodeReverse(ds,ws);
318         assert(isValid(ws));
319         transcodeReverse(ws,ds2);
320         assert(ds == ds2);
321     }
322
323     // Make sure the non-UTF encodings work too
324     {
325         auto s = "\u20AC100";
326         Windows1252String t;
327         transcode(s,t);
328         assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
329         string u;
330         transcode(s,u);
331         assert(s == u);
332         Latin1String v;
333         transcode(s,v);
334         assert(cast(string)v == "?100");
335         AsciiString w;
336         transcode(v,w);
337         assert(cast(string)w == "?100");
338     }
339
340     // Make sure we can count properly
341     {
342         assert(encodedLength!(char)('A') == 1);
343         assert(encodedLength!(char)('\u00E3') == 2);
344         assert(encodedLength!(char)('\u2028') == 3);
345         assert(encodedLength!(char)('\U0010FFF0') == 4);
346         assert(encodedLength!(wchar)('A') == 1);
347         assert(encodedLength!(wchar)('\U0010FFF0') == 2);
348     }
349
350     // Make sure we can write into mutable arrays
351     {
352         char[4] buffer;
353         auto n = encode(cast(dchar)'\u00E3',buffer);
354         assert(n == 2);
355         assert(buffer[0] == 0xC3);
356         assert(buffer[1] == 0xA3);
357     }
358 }
359
360 //=============================================================================
361
362 /** Special value returned by $(D safeDecode) */
363 enum dchar INVALID_SEQUENCE = cast(dchar) 0xFFFFFFFF;
364
365 template EncoderFunctions()
366 {
367     // Various forms of read
368
369     template ReadFromString()
370     {
371         bool canRead() { return s.length != 0; }
372         E peek() { return s[0]; }
373         E read() { E t = s[0]; s = s[1..$]; return t; }
374     }
375
376     template ReverseReadFromString()
377     {
378         bool canRead() { return s.length != 0; }
379         E peek() { return s[$-1]; }
380         E read() { E t = s[$-1]; s = s[0..$-1]; return t; }
381     }
382
383     // Various forms of Write
384
385     template WriteToString()
386     {
387         E[] s;
388         void write(E c) { s ~= c; }
389     }
390
391     template WriteToArray()
392     {
393         void write(E c) { array[0] = c; array = array[1..$]; }
394     }
395
396     deprecated template WriteToBuffer()
397     {
398         void write(E c) { buffer ~= c; }
399     }
400
401     template WriteToDelegate()
402     {
403         void write(E c) { dg(c); }
404     }
405
406     // Functions we will export
407
408     template EncodeViaWrite()
409     {
410         mixin encodeViaWrite;
411         void encode(dchar c) { encodeViaWrite(c); }
412     }
413
414     template SkipViaRead()
415     {
416         mixin skipViaRead;
417         void skip() { skipViaRead(); }
418     }
419
420     template DecodeViaRead()
421     {
422         mixin decodeViaRead;
423         dchar decode() { return decodeViaRead(); }
424     }
425
426     template SafeDecodeViaRead()
427     {
428         mixin safeDecodeViaRead;
429         dchar safeDecode() { return safeDecodeViaRead(); }
430     }
431
432     template DecodeReverseViaRead()
433     {
434         mixin decodeReverseViaRead;
435         dchar decodeReverse() { return decodeReverseViaRead(); }
436     }
437
438     // Encoding to different destinations
439
440     template EncodeToString()
441     {
442         mixin WriteToString;
443         mixin EncodeViaWrite;
444     }
445
446     template EncodeToArray()
447     {
448         mixin WriteToArray;
449         mixin EncodeViaWrite;
450     }
451
452     deprecated template EncodeToBuffer()
453     {
454         mixin WriteToBuffer;
455         mixin EncodeViaWrite;
456     }
457
458     template EncodeToDelegate()
459     {
460         mixin WriteToDelegate;
461         mixin EncodeViaWrite;
462     }
463
464     // Decoding functions
465
466     template SkipFromString()
467     {
468         mixin ReadFromString;
469         mixin SkipViaRead;
470     }
471
472     template DecodeFromString()
473     {
474         mixin ReadFromString;
475         mixin DecodeViaRead;
476     }
477
478     template SafeDecodeFromString()
479     {
480         mixin ReadFromString;
481         mixin SafeDecodeViaRead;
482     }
483
484     template DecodeReverseFromString()
485     {
486         mixin ReverseReadFromString;
487         mixin DecodeReverseViaRead;
488     }
489
490     //=========================================================================
491
492     // Below are the functions we will ultimately expose to the user
493
494     E[] encode(dchar c)
495     {
496         mixin EncodeToString e;
497         e.encode(c);
498         return e.s;
499     }
500
501     void encode(dchar c, ref E[] array)
502     {
503         mixin EncodeToArray e;
504         e.encode(c);
505     }
506
507     void encode(dchar c, void delegate(E) dg)
508     {
509         mixin EncodeToDelegate e;
510         e.encode(c);
511     }
512
513     void skip(ref const(E)[] s)
514     {
515         mixin SkipFromString e;
516         e.skip();
517     }
518
519     dchar decode(S)(ref S s)
520     {
521         mixin DecodeFromString e;
522         return e.decode();
523     }
524
525     dchar safeDecode(S)(ref S s)
526     {
527         mixin SafeDecodeFromString e;
528         return e.safeDecode();
529     }
530
531     dchar decodeReverse(ref const(E)[] s)
532     {
533         mixin DecodeReverseFromString e;
534         return e.decodeReverse();
535     }
536 }
537
538 //=========================================================================
539
540 struct CodePoints(E)
541 {
542     const(E)[] s;
543
544     this(const(E)[] s)
545     in
546     {
547         assert(isValid(s));
548     }
549     body
550     {
551         this.s = s;
552     }
553
554     int opApply(scope int delegate(ref dchar) dg)
555     {
556         // Hands each decoded code point to dg; stops once s is empty or dg returns non-zero.
557         int status = 0;
558         while (status == 0 && s.length != 0)
559         {
560             dchar decoded = decode(s);   // the loop relies on decode shortening the member s
561             status = dg(decoded);
562         }
563         return status;
564     }
565
566     int opApply(scope int delegate(ref size_t, ref dchar) dg)
567     {
568         int status = 0;
569         size_t offset = 0;   // code units consumed before the current code point
570         while (status == 0 && s.length != 0)
571         {
572             size_t before = s.length;
573             dchar decoded = decode(s);
574             size_t shown = offset;   // dg gets a copy, so writes through its ref cannot disturb offset
575             status = dg(shown, decoded);
576             // Advancing even when dg asked to stop is harmless: offset is not read again.
577             offset += before - s.length;
578         }
579         return status;
580     }
581
582     int opApplyReverse(scope int delegate(ref dchar) dg)
583     {
584         int result = 0;
585         while (s.length != 0)
586         {
587             dchar c = decodeReverse(s);
588             result = dg(c);
589             if (result != 0) break;
590         }
591         return result;
592     }
593
594     int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
595     {
596         int result = 0;
597         while (s.length != 0)
598         {
599             dchar c = decodeReverse(s);
600             size_t i = s.length;
601             result = dg(i,c);
602             if (result != 0) break;
603         }
604         return result;
605     }
606 }
607
608 struct CodeUnits(E)
609 {
610     E[] s;
611
612     this(dchar d)
613     in
614     {
615         assert(isValidCodePoint(d));
616     }
617     body
618     {
619         s = encode!(E)(d);
620     }
621
622     int opApply(scope int delegate(ref E) dg)
623     {
624         int result = 0;
625         foreach(E c;s)
626         {
627             result = dg(c);
628             if (result != 0) break;
629         }
630         return result;
631     }
632
633     int opApplyReverse(scope int delegate(ref E) dg)
634     {
635         int result = 0;
636         foreach_reverse(E c;s)
637         {
638             result = dg(c);
639             if (result != 0) break;
640         }
641         return result;
642     }
643 }
644
645 //=============================================================================
646
647 template EncoderInstance(E)
648 {
649     static assert(false,"Cannot instantiate EncoderInstance for type "
650         ~ E.stringof);
651 }
652
653 //=============================================================================
654 //          ASCII
655 //=============================================================================
656
657 /** Defines various character sets. */
658 typedef ubyte AsciiChar;
659 /// Ditto
660 alias immutable(AsciiChar)[] AsciiString;
661
662 template EncoderInstance(CharType : AsciiChar)
663 {
664     alias AsciiChar E;
665     alias AsciiString EString;
666
667     string encodingName()
668     {
669         return "ASCII";
670     }
671
672     bool canEncode(dchar c)
673     {
674         return c < 0x80;
675     }
676
677     bool isValidCodeUnit(AsciiChar c)
678     {
679         return c < 0x80;
680     }
681
682     size_t encodedLength(dchar c)
683     in
684     {
685         assert(canEncode(c));
686     }
687     body
688     {
689         return 1;
690     }
691
692     void encodeX(Range)(dchar c, Range r)
693     {
694         if (!canEncode(c)) c = '?';
695         r.write(cast(AsciiChar) c);
696     }
697
698     void encodeViaWrite()(dchar c)
699     {
700         if (!canEncode(c)) c = '?';
701         write(cast(AsciiChar)c);
702     }
703
704     void skipViaRead()()
705     {
706         read();
707     }
708
709     dchar decodeViaRead()()
710     {
711         return read;
712     }
713
714     dchar safeDecodeViaRead()()
715     {
716         dchar c = read;
717         return canEncode(c) ? c : INVALID_SEQUENCE;
718     }
719
720     dchar decodeReverseViaRead()()
721     {
722         return read;
723     }
724
725     EString replacementSequence()
726     {
727         return cast(EString)("?");
728     }
729
730     mixin EncoderFunctions;
731 }
732
733 //=============================================================================
734 //          ISO-8859-1
735 //=============================================================================
736
737 /** Defines a Latin1-encoded character. */
738 typedef ubyte Latin1Char;
739 /**
740 Defines a Latin1-encoded string (as an array of $(D
741 immutable(Latin1Char))).
742  */
743 alias immutable(Latin1Char)[] Latin1String; ///
744
745 template EncoderInstance(CharType : Latin1Char)
746 {
747     alias Latin1Char E;
748     alias Latin1String EString;
749
750     string encodingName()
751     {
752         return "ISO-8859-1";
753     }
754
755     bool canEncode(dchar c)
756     {
757         return c < 0x100;
758     }
759
760     bool isValidCodeUnit(Latin1Char c)
761     {
762         return true;
763     }
764
765     size_t encodedLength(dchar c)
766     in
767     {
768         assert(canEncode(c));
769     }
770     body
771     {
772                 return 1;
773     }
774
775     void encodeViaWrite()(dchar c)
776     {
777         if (!canEncode(c)) c = '?';
778         write(cast(Latin1Char)c);
779     }
780
781     void skipViaRead()()
782     {
783         read();
784     }
785
786     dchar decodeViaRead()()
787     {
788         return read;
789     }
790
791     dchar safeDecodeViaRead()()
792     {
793         return read;
794     }
795
796     dchar decodeReverseViaRead()()
797     {
798         return read;
799     }
800
801     EString replacementSequence()
802     {
803         return cast(EString)("?");
804     }
805
806     mixin EncoderFunctions;
807 }
808
809 //=============================================================================
810 //          WINDOWS-1252
811 //=============================================================================
812
813 /** Defines a Windows1252-encoded character. */
814 typedef ubyte Windows1252Char;
815 /**
816 Defines a Windows1252-encoded string (as an array of $(D
817 immutable(Windows1252Char))).
818  */
819 alias immutable(Windows1252Char)[] Windows1252String; ///
820
821 template EncoderInstance(CharType : Windows1252Char)
822 {
823     alias Windows1252Char E;
824     alias Windows1252String EString;
825
826     string encodingName()
827     {
828         return "windows-1252";
829     }
830
831     immutable wstring charMap =   // indexed by (byte - 0x80) for bytes 0x80..0x9F; \uFFFD marks an unassigned byte
832         "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"   // 0x80..0x87
833         "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"   // 0x88..0x8F
834         "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"   // 0x90..0x97 (0x96 is U+2013 EN DASH, not U+2103)
835         "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178"   // 0x98..0x9F
836     ;
837
838     bool canEncode(dchar c)
839     {
840         if (c < 0x80 || (c >= 0xA0 && c < 0x100)) return true;
841         if (c >= 0xFFFD) return false;
842         foreach(wchar d;charMap) { if (c == d) return true; }
843         return false;
844     }
845
846     bool isValidCodeUnit(Windows1252Char c)
847     {
848         if (c < 0x80 || c >= 0xA0) return true;
849         return (charMap[c-0x80] != 0xFFFD);
850     }
851
852     size_t encodedLength(dchar c)
853     in
854     {
855         assert(canEncode(c));
856     }
857     body
858     {
859         return 1;
860     }
861
862     void encodeViaWrite()(dchar c)
863     {
864         if (c < 0x80 || (c >= 0xA0 && c < 0x100)) {}
865         else if (c >= 0xFFFD) { c = '?'; }
866         else
867         {
868             sizediff_t n = -1;
869             foreach (i, wchar d; charMap)
870             {
871                 if (c == d)
872                 {
873                     n = i;
874                     break;
875                 }
876             }
877             c = n == -1 ? '?' : 0x80 + cast(dchar) n;
878         }
879         write(cast(Windows1252Char)c);
880     }
881
882     void skipViaRead()()
883     {
884         read();
885     }
886
887     dchar decodeViaRead()()
888     {
889         Windows1252Char c = read;
890         return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
891     }
892
893     dchar safeDecodeViaRead()()
894     {
895         Windows1252Char c = read;
896         dchar d = (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
897         return d == 0xFFFD ? INVALID_SEQUENCE : d;
898     }
899
900     dchar decodeReverseViaRead()()
901     {
902         Windows1252Char c = read;
903         return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
904     }
905
906     EString replacementSequence()
907     {
908         return cast(EString)("?");
909     }
910
911     mixin EncoderFunctions;
912 }
913
914 //=============================================================================
915 //          UTF-8
916 //=============================================================================
917
918 template EncoderInstance(CharType : char)
919 {
920     alias char E;
921     alias immutable(char)[] EString;
922
923     string encodingName()
924     {
925         return "UTF-8";
926     }
927
928     bool canEncode(dchar c)
929     {
930         return isValidCodePoint(c);
931     }
932
933     bool isValidCodeUnit(char c)
934     {
935         return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
936     }
937
938     immutable ubyte[128] tailTable =
939     [
940         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
941         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
942         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
943         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
944         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
945         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
946         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
947         3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
948     ];
949
950     private int tails(char c)
951     in
952     {
953         assert(c >= 0x80);
954     }
955     body
956     {
957         return tailTable[c-0x80];
958     }
959
960     size_t encodedLength(dchar c)
961     in
962     {
963         assert(canEncode(c));   // contract: only called for code points canEncode accepts
964     }
965     body
966     {
967         if (c >= 0x10000) return 4;
968         if (c >= 0x800) return 3;
969         if (c >= 0x80) return 2;
970         return 1;               // below 0x80: a single code unit
971     }
972
973     void encodeViaWrite()(dchar c)
974     {
975         if (c < 0x80)
976         {
977             write(cast(char)c);
978         }
979         else if (c < 0x800)
980         {
981             write(cast(char)((c >> 6) + 0xC0));
982             write(cast(char)((c & 0x3F) + 0x80));
983         }
984         else if (c < 0x10000)
985         {
986             write(cast(char)((c >> 12) + 0xE0));
987             write(cast(char)(((c >> 6) & 0x3F) + 0x80));
988             write(cast(char)((c & 0x3F) + 0x80));
989         }
990         else
991         {
992             write(cast(char)((c >> 18) + 0xF0));
993             write(cast(char)(((c >> 12) & 0x3F) + 0x80));
994             write(cast(char)(((c >> 6) & 0x3F) + 0x80));
995             write(cast(char)((c & 0x3F) + 0x80));
996         }
997     }
998
999     void skipViaRead()()
1000     {
1001         auto c = read;
1002         if (c < 0xC0) return;
1003         int n = tails(cast(char) c);
1004         for (size_t i=0; i<n; ++i)
1005         {
1006             read();
1007         }
1008     }
1009
1010     dchar decodeViaRead()()
1011     {
1012         dchar c = read;
1013         if (c < 0xC0) return c;
1014         int n = tails(cast(char) c);
1015         c &= (1 << (6 - n)) - 1;
1016         for (size_t i=0; i<n; ++i)
1017         {
1018             c = (c << 6) + (read & 0x3F);
1019         }
1020         return c;
1021     }
1022
1023     dchar safeDecodeViaRead()()   // decodes one code point via read/peek/canRead; INVALID_SEQUENCE if malformed
1024     {
1025         dchar c = read;   // lead byte (always consumed)
1026         if (c < 0x80) return c;   // single-byte sequence
1027         int n = tails(cast(char) c);   // tail-byte count from tailTable
1028         if (n == 0) return INVALID_SEQUENCE;   // tailTable holds 0 for 0x80..0xBF and for 0xFF
1029
1030         if (!canRead) return INVALID_SEQUENCE;   // input ends right after the lead byte
1031         size_t d = peek;   // first tail byte, examined without consuming it
1032         bool err =   // computed up front, but only reported after the tail bytes have been consumed
1033         (
1034             (c < 0xC2)                              // fail lead bytes 0xC0/0xC1 (overlong 2-byte sequences)
1035         ||  (c > 0xF4)                              // fail lead bytes above 0xF4 (code points > 0x10FFFF, 5/6-byte forms)
1036         ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
1037         ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
1038         ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
1039         ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
1040         );
1041
1042         c &= (1 << (6 - n)) - 1;   // keep only the low (6 - n) bits of the lead byte
1043         for (size_t i=0; i<n; ++i)
1044         {
1045             if (!canRead) return INVALID_SEQUENCE;   // sequence is truncated
1046             d = peek;
1047             if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;   // not 10xxxxxx; the offending byte is left unread
1048             c = (c << 6) + (read & 0x3F);   // consume the tail byte and append its low six bits
1049         }
1050
1051         return err ? INVALID_SEQUENCE : c;
1052     }
1053
1054     dchar decodeReverseViaRead()()
1055     {
1056         dchar c = read;
1057         if (c < 0x80) return c;
1058         size_t shift = 0;
1059         c &= 0x3F;
1060         for (size_t i=0; i<4; ++i)
1061         {
1062             shift += 6;
1063             auto d = read;
1064             size_t n = tails(cast(char) d);
1065             size_t mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
1066             c += ((d & mask) << shift);
1067             if (n != 0) break;
1068         }
1069         return c;
1070     }
1071
1072     EString replacementSequence()
1073     {
1074         return "\uFFFD";
1075     }
1076
1077     mixin EncoderFunctions;
1078 }
1079
1080 //=============================================================================
1081 //          UTF-16
1082 //=============================================================================
1083
template EncoderInstance(CharType : wchar)
{
    alias wchar E;
    alias immutable(wchar)[] EString;

    /// Name under which this encoding is reported.
    string encodingName()
    {
        return "UTF-16";
    }

    /// True when c can be represented, i.e. when isValidCodePoint(c) holds.
    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    /// Every 16-bit value is accepted as a code unit.
    bool isValidCodeUnit(wchar c)
    {
        return true;
    }

    /// Number of wchar units needed for c: one below 0x10000, otherwise two.
    size_t encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c < 0x10000) return 1;
        return 2;
    }

    /// Emits c through write, as a single unit or as a surrogate pair.
    void encodeViaWrite()(dchar c)
    {
        if (c >= 0x10000)
        {
            // Split the 20-bit offset into a high and a low surrogate.
            size_t offset = c - 0x10000;
            write(cast(wchar)(0xD800 + (offset >> 10)));
            write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
            return;
        }
        write(cast(wchar)c);
    }

    /// Consumes one code point: one unit, or two when the first is a surrogate.
    void skipViaRead()()
    {
        wchar first = read;
        if (first >= 0xD800 && first < 0xE000) read();
    }

    /// Decodes one code point without validating the input.
    dchar decodeViaRead()()
    {
        wchar hi = read;
        bool surrogate = hi >= 0xD800 && hi < 0xE000;
        if (!surrogate) return cast(dchar)hi;

        wchar lo = read;
        hi &= 0x3FF;
        lo &= 0x3FF;
        return 0x10000 + (hi << 10) + lo;
    }

    /// Decodes one code point, yielding INVALID_SEQUENCE for an unpaired
    /// or truncated surrogate.
    dchar safeDecodeViaRead()()
    {
        wchar hi = read;
        if (hi < 0xD800 || hi >= 0xE000) return cast(dchar)hi;
        if (hi >= 0xDC00) return INVALID_SEQUENCE;      // low surrogate came first
        if (!canRead) return INVALID_SEQUENCE;          // nothing follows the high surrogate

        // Only consume the next unit once it is known to be a low surrogate.
        wchar lo = peek;
        bool isLow = lo >= 0xDC00 && lo < 0xE000;
        if (!isLow) return INVALID_SEQUENCE;
        lo = read;

        hi &= 0x3FF;
        lo &= 0x3FF;
        return 0x10000 + (hi << 10) + lo;
    }

    /// Decodes one code point from units delivered last-to-first, without
    /// validation: the first unit read is the trailing one.
    dchar decodeReverseViaRead()()
    {
        wchar trailing = read;
        bool surrogate = trailing >= 0xD800 && trailing < 0xE000;
        if (!surrogate) return cast(dchar)trailing;

        wchar leading = read;
        trailing &= 0x3FF;
        leading &= 0x3FF;
        return 0x10000 + (leading << 10) + trailing;
    }

    /// Returns the code units used in place of malformed input: U+FFFD.
    EString replacementSequence()
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
1176
1177 //=============================================================================
1178 //          UTF-32
1179 //=============================================================================
1180
template EncoderInstance(CharType : dchar)
{
    alias dchar E;
    alias immutable(dchar)[] EString;

    /// Name under which this encoding is reported.
    string encodingName()
    {
        return "UTF-32";
    }

    /// True when c can be represented, i.e. when isValidCodePoint(c) holds.
    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    /// A code unit is valid exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c)
    {
        return isValidCodePoint(c);
    }

    /// Every encodable code point occupies exactly one unit.
    size_t encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    /// Emits c through write as a single unit.
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    /// Consumes one unit.
    void skipViaRead()()
    {
        read();
    }

    /// Returns the next unit as the code point, without validation.
    dchar decodeViaRead()()
    {
        return cast(dchar)read;
    }

    /// Returns the next unit if it is a valid code point, otherwise
    /// INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        dchar unit = read;
        if (isValidCodePoint(unit)) return unit;
        return INVALID_SEQUENCE;
    }

    /// Same as decodeViaRead: a single unit is read and returned.
    dchar decodeReverseViaRead()()
    {
        return cast(dchar)read;
    }

    /// Returns the code units used in place of malformed input: U+FFFD.
    EString replacementSequence()
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
1244
1245 //=============================================================================
1246 // Below are forwarding functions which expose the function to the user
1247
/**
 Returns true if c is a valid code point.

 A value is accepted when it lies below 0x110000 and outside the surrogate
 range 0xD800 .. 0xDFFF. Note that this includes the non-character code
 points U+FFFE and U+FFFF, since these are valid code points (even though
 they are not valid characters).

 Supersedes:
 This function supersedes $(D std.utf.startsValidDchar()).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be tested
 */
bool isValidCodePoint(dchar c)
{
    // Beyond the last code point, U+10FFFF.
    if (c >= 0x110000) return false;
    // Everything else is valid unless it is a surrogate.
    return c < 0xD800 || c >= 0xE000;
}
1267
/**
 Returns the name of an encoding.

 There is no argument from which the encoding type could be deduced, so it
 must be given explicitly as the template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Examples:
 -----------------------------------
 assert(encodingName!(Latin1Char) == "ISO-8859-1");
 -----------------------------------
 */
string encodingName(T)()
{
    alias EncoderInstance!(T) Encoder;
    return Encoder.encodingName;
}

unittest
{
    // Single-byte encodings
    assert(encodingName!(AsciiChar) == "ASCII");
    assert(encodingName!(Latin1Char) == "ISO-8859-1");
    assert(encodingName!(Windows1252Char) == "windows-1252");

    // Unicode transformation formats
    assert(encodingName!(char) == "UTF-8");
    assert(encodingName!(wchar) == "UTF-16");
    assert(encodingName!(dchar) == "UTF-32");
}
1295
/**
 Returns true iff it is possible to represent the specified code point
 in the encoding.

 The encoding type cannot be deduced from the argument, so it must be
 given explicitly as the template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be tested

 Examples:
 -----------------------------------
 assert(canEncode!(Latin1Char)('A'));
 -----------------------------------
 */
bool canEncode(E)(dchar c)
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.canEncode(c);
}

unittest
{
    // Code points the target encoding can hold
    assert(canEncode!(Latin1Char)('\u00A0'));
    assert(canEncode!(Windows1252Char)('\u20AC'));

    // Code points it cannot
    assert(!canEncode!(AsciiChar)('\u00A0'));
    assert(!canEncode!(Windows1252Char)('\u20AD'));
    assert(!canEncode!(Windows1252Char)('\uFFFD'));
    assert(!canEncode!(char)(cast(dchar)0x110000));
}
1324
/**
 Returns true if the code unit is legal in its encoding. For example, the
 byte 0x80 would not be legal in ASCII, because ASCII code units must
 always be in the range 0x00 to 0x7F.

 The encoding is determined by the type of the argument.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.isValidCodeUnit(c);
}

unittest
{
    // Legal units
    assert( isValidCodeUnit(cast(Windows1252Char)0x80));
    assert( isValidCodeUnit(cast(wchar)0xD800));

    // Illegal units
    assert(!isValidCodeUnit(cast(AsciiChar)0xA0));
    assert(!isValidCodeUnit(cast(Windows1252Char)0x81));
    assert(!isValidCodeUnit(cast(char)0xC0));
    assert(!isValidCodeUnit(cast(char)0xFF));
    assert(!isValidCodeUnit(cast(dchar)0xD800));
}
1350
/**
 Returns true if the string is encoded correctly, that is, if its longest
 validly encoded prefix (see validLength) is the whole string.

 Supersedes:
 This function supersedes std.utf.validate(); note, however, that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    const size_t goodPrefix = validLength(s);
    return goodPrefix == s.length;
}

unittest
{
    assert(isValid("\u20AC100"));
}
1373
/**
 Returns the length of the longest possible substring, starting from
 the first code unit, which is validly encoded.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    alias EncoderInstance!(E) Encoder;

    size_t valid = 0;
    while (s.length != 0)
    {
        // safeDecode shortens s; the difference is the sequence's length.
        const size_t before = s.length;
        if (Encoder.safeDecode(s) == INVALID_SEQUENCE)
            break;
        valid += before - s.length;
    }
    return valid;
}
1394
/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original.
 Otherwise it constructs a new string in which every illegal code unit
 sequence is replaced with the encoding's replacement sequence: the Unicode
 replacement character (U+FFFD) if the character repertoire contains it,
 otherwise '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sanitized
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    alias EncoderInstance!(E) Encoder;

    const size_t goodPrefix = validLength(s);
    if (goodPrefix == s.length) return s;

    auto repSeq = Encoder.replacementSequence;

    // Pass 1: work out an upper bound for the output length. Each invalid
    // sequence adds one replacement sequence; the invalid units themselves
    // are not subtracted, so the bound may overestimate.
    size_t capacity = s.length;
    for (const(E)[] scan = s[goodPrefix..$]; scan.length != 0;
         scan = scan[validLength(scan)..$])
    {
        // scan always starts at an invalid sequence; safeDecode removes it.
        dchar c = Encoder.safeDecode(scan);
        assert(c == INVALID_SEQUENCE);
        capacity += repSeq.length;
    }

    // Pass 2: copy the valid prefix, then alternate between a replacement
    // sequence and the valid run that follows each invalid sequence.
    E[] buffer = new E[capacity];
    buffer[0..goodPrefix] = s[0..goodPrefix];
    size_t pos = goodPrefix;

    const(E)[] rest = s[goodPrefix..$];
    while (rest.length != 0)
    {
        dchar c = Encoder.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);
        buffer[pos..pos+repSeq.length] = repSeq[];
        pos += repSeq.length;

        const size_t run = validLength(rest);
        buffer[pos..pos+run] = rest[0..run];
        pos += run;
        rest = rest[run..$];
    }
    return cast(immutable(E)[])buffer[0..pos];
}

unittest
{
    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
}
1454
/**
 Returns the length, in code units, of the first encoded sequence.

 The string must be non-empty and its first sequence MUST be validly
 encoded. This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    const(E)[] u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    // skip shortens its argument by one sequence; measure the difference.
    const(E)[] rest = s;
    EncoderInstance!(E).skip(rest);
    return s.length - rest.length;
}

unittest
{
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
}
1484
/**
 Returns the length, in code units, of the last encoded sequence.

 The string must be non-empty and MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    // decodeReverse shortens s by one sequence; measure the difference.
    const size_t before = s.length;
    EncoderInstance!(E).decodeReverse(s);
    return before - s.length;
}

unittest
{
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
}
1513
/**
 Returns the array index at which the (n+1)th code point begins.

 The input to this function MUST be validly encoded, and n must not be
 negative. This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be counted
    n = the number of code points to step over
 */
sizediff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    // Step over n code points on a copy of the slice, then report how
    // many code units were consumed.
    const(E)[] rest = s;
    for (int left = n; left > 0; --left)
    {
        EncoderInstance!(E).skip(rest);
    }
    return s.length - rest.length;
}

unittest
{
    assert(index("\u20AC100",1) == 3);
}
1545
/**
 Decodes a single code point.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 The string must be non-empty and its first sequence MUST be validly
 encoded. This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.decode(); note, however, that the
 function codePoints() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    auto u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    // The encoding is selected by the string's element type.
    alias typeof(s[0]) Unit;
    return EncoderInstance!(Unit).decode(s);
}
1575
/**
 Decodes a single code point from the end of a string.

 This function removes one or more code units from the end of a string,
 and returns the decoded code point which those code units represent.

 The string must be non-empty and MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.decodeReverse(s);
}
1600
/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 An invalidly encoded string is accepted as input. If an invalid sequence
 is found at the start of the string, this function will remove it, and
 return the value INVALID_SEQUENCE.

 The string must be non-empty; this is enforced by the in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    // The encoding is selected by the string's element type.
    alias typeof(s[0]) Unit;
    return EncoderInstance!(Unit).safeDecode(s);
}
1625
/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The encoding cannot be deduced from the argument, so it must be given
 explicitly as the template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.encodedLength(c);
}
1649
/**
 Encodes a single code point.

 This function encodes a single code point into one or more code units.
 It returns a string containing those code units.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced, so the encoding must be given
 explicitly as the template parameter.

 Supersedes:
 This function supersedes std.utf.encode(); note, however, that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.encode(c);
}
1680
/**
 Encodes a single code point into an array.

 This function encodes a single code point into one or more code units.
 The code units are stored in a user-supplied array.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The encoding is given by the template parameter, which is also the
 element type of the array.

 Supersedes:
 This function supersedes std.utf.encode(); note, however, that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded
    array = the array which receives the code units

 Returns:
          the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // The encoder shortens the slice it is handed by the units it wrote,
    // so the change in length is the number of units written.
    const size_t room = array.length;
    EncoderInstance!(E).encode(c,array);
    return room - array.length;
}
1717
1718 // /**
1719 //  * Encodes a single code point into a Buffer.
1720 //  *
1721 //  * This function encodes a single code point into one or more code units
1722 //  * The code units are stored in a growable buffer.
1723 //  *
1724 //  * The input to this function MUST be a valid code point.
1725 //  * This is enforced by the function's in-contract.
1726 //  *
1727 //  * The type of the output cannot be deduced. Therefore, it is necessary to
1728 //  * explicitly specify the encoding as a template parameter.
1729 //  *
1730 //  * Supercedes:
1731 //  * This function supercedes std.utf.encode(), however, note that the
1732 //  * function codeUnits() supercedes it more conveniently.
1733 //  *
1734 //  * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1735 //  *
1736 //  * Params:
1737 //  *    c = the code point to be encoded
1738 //  */
1739 // deprecated void encode(E)(dchar c, ref Buffer!(E) buffer)
1740 // in
1741 // {
1742 //     assert(isValidCodePoint(c));
1743 // }
1744 // body
1745 // {
1746 //     EncoderInstance!(E).encode(c,buffer);
1747 // }
1748
/**
Encodes $(D c) in units of type $(D E) and writes the result to the
output range $(D range). Returns the number of $(D E)s written.

Only the UTF code unit types $(D char), $(D wchar) and $(D dchar)
(qualifiers are ignored) are handled; for any other $(D E), and for a
$(D c) above 0x10FFFF when encoding to $(D char), the function fails
with $(D assert(0)).

Params:
    c = the code point to be encoded
    range = the output range; each code unit is passed to $(D range.put)
 */

size_t encode(E, R)(dchar c, R range)
{
    static if (is(Unqual!E == char))
    {
        // UTF-8: one to four bytes depending on the magnitude of c.
        if (c <= 0x7F)
        {
            range.put(cast(char) c);
            return 1;
        }
        if (c <= 0x7FF)
        {
            range.put(cast(char)(0xC0 | (c >> 6)));
            range.put(cast(char)(0x80 | (c & 0x3F)));
            return 2;
        }
        if (c <= 0xFFFF)
        {
            range.put(cast(char)(0xE0 | (c >> 12)));
            range.put(cast(char)(0x80 | ((c >> 6) & 0x3F)));
            range.put(cast(char)(0x80 | (c & 0x3F)));
            return 3;
        }
        if (c <= 0x10FFFF)
        {
            range.put(cast(char)(0xF0 | (c >> 18)));
            range.put(cast(char)(0x80 | ((c >> 12) & 0x3F)));
            range.put(cast(char)(0x80 | ((c >> 6) & 0x3F)));
            range.put(cast(char)(0x80 | (c & 0x3F)));
            return 4;
        }
        else
        {
            assert(0);
        }
    }
    else static if (is(Unqual!E == wchar))
    {
        // UTF-16: a single unit, or a high/low surrogate pair.
        // (The parameter is named range; the previous r.put did not compile.)
        if (c <= 0xFFFF)
        {
            range.put(cast(wchar) c);
            return 1;
        }
        range.put(cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
        range.put(cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
        return 2;
    }
    else static if (is(Unqual!E == dchar))
    {
        // UTF-32: the code point is its own single code unit.
        range.put(c);
        return 1;
    }
    else
    {
        assert(0);
    }
}
1810
/**
 Encodes a single code point to a delegate.

 This function encodes a single code point into one or more code units.
 The code units are passed one at a time to the supplied delegate.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The encoding is given by the template parameter, which is also the
 delegate's parameter type.

 Supersedes:
 This function supersedes std.utf.encode(); note, however, that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded
    dg = the delegate which receives each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias EncoderInstance!(E) Encoder;
    Encoder.encode(c,dg);
}
1841
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code points in a string.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 You can foreach either with or without an index. If an index is
 specified, it will be initialized at each iteration with the offset into
 the string at which the code point begins.

 Supersedes:
 This function supersedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be decoded

 Examples:
 --------------------------------------------------------
 string s = "hello world";
 foreach(c;codePoints(s))
 {
     // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach(c;codePoints(s)) is superior to foreach(c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
body
{
    auto iterable = CodePoints!(E)(s);
    return iterable;
}

unittest
{
    // Rebuilding an ASCII-only string from its code points is lossless.
    string original = "hello";
    string rebuilt;
    foreach(point;codePoints(original))
    {
        rebuilt ~= cast(char)point;
    }
    assert(original == rebuilt);
}
1894
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced, so the encoding type must be
 given explicitly as the template parameter.

 Supersedes:
 This function supersedes std.utf.encode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded

 Examples:
 --------------------------------------------------------
 dchar d = '\u20AC';
 foreach(c;codeUnits!(char)(d))
 {
     writefln("%X",c);
 }
 // will print
 // E2
 // 82
 // AC
 --------------------------------------------------------
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    auto iterable = CodeUnits!(E)(c);
    return iterable;
}

unittest
{
    // U+20AC is E2 82 AC in UTF-8.
    char[] units;
    foreach(unit;codeUnits!(char)(cast(dchar)'\u20AC'))
    {
        units ~= unit;
    }
    assert(units.length == 3);
    assert(units[0] == 0xE2 && units[1] == 0x82 && units[2] == 0xAC);
}
1948
/**
Encodes every element of $(D s) in units of type $(D Tgt) and writes the
results to the output range $(D range). Returns the total number of
$(D Tgt)s written.

Each element of $(D s) is handed on its own to the single-code-point
$(D encode).
NOTE(review): elements are forwarded one code unit at a time, without
first decoding multi-unit sequences of $(D Src) - confirm this is intended.
 */

size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t written = 0;
    for (size_t i = 0; i < s.length; ++i)
    {
        written += encode!(Tgt)(s[i], range);
    }
    return written;
}
1963
/**
 Convert a string from one encoding to another. (See also to!() below).

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 When source and destination encodings are the same type, r simply refers
 to s. ASCII input is handled by treating it as UTF-8. In every other case
 the source is decoded one code point at a time and each code point is
 re-encoded and appended to r.

 Supersedes:
 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 std.utf.toUTF32()
 (but note that to!() supersedes it more conveniently).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the source string
    r = the destination string (an out parameter)

 Examples:
 --------------------------------------------------------
 wstring ws;
 transcode("hello world",ws);
     // transcode from UTF-8 to UTF-16

 Latin1String ls;
 transcode(ws, ls);
     // transcode from UTF-16 to ISO-8859-1
  --------------------------------------------------------
 */
void transcode(Src,Dst)(immutable(Src)[] s,out immutable(Dst)[] r)
in
{
    assert(isValid(s));
}
body
{
    static if(is(Src==Dst))
    {
        // Same encoding: nothing to convert.
        r = s;
    }
    else static if(is(Src==AsciiChar))
    {
        // Reinterpret the ASCII input as a UTF-8 string and convert that.
        transcode!(char,Dst)(cast(string)s,r);
    }
    else
    {
        // decode consumes one code point from the front of rest each time.
        for (const(Src)[] rest = s; rest.length != 0; )
        {
            r ~= encode!(Dst)(decode(rest));
        }
    }
}
2016
2017 /*
2018  Convert a string from one encoding to another. (See also transcode() above).
2019
2020  The input to this function MUST be validly encoded.
2021  This is enforced by the function's in-contract.
2022
2023  Supercedes:
2024  This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and
2025  std.utf.toUTF32().
2026
2027  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
2028
2029  Params:
2030     Dst = the destination encoding type
2031     s = the source string
2032
2033  Examples:
2034  -----------------------------------------------------------------------------
2035  auto ws = to!(wchar)("hello world");  // transcode from UTF-8 to UTF-16
2036  auto ls = to!(Latin1Char)(ws);            // transcode from UTF-16 to ISO-8859-1
2037  -----------------------------------------------------------------------------
2038  */
2039 // TODO: Commented out for now - to be moved to std.conv
2040 // Dst to(Dst,Src)(immutable(Src)[] s)
2041 // in
2042 // {
2043 //  assert(isValid(s));
2044 // }
2045 // body
2046 // {
2047 //  Dst r;
2048 //  transcode(s,r);
2049 //  return r;
2050 // }
2051
2052 //=============================================================================
2053
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    /// Constructs the exception with the given message.
    this(string msg)
    {
        super(msg);
    }
}
2056
/// An EncodingException subclass whose constructor is private.
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg)
    {
        super(msg);
    }
}
2061
2062 /** Abstract base class of all encoding schemes */
2063 abstract class EncodingScheme
2064 {
2065     /**
2066      * Registers a subclass of EncodingScheme.
2067      *
2068      * This function allows user-defined subclasses of EncodingScheme to
2069      * be declared in other modules.
2070      *
2071      * Examples:
2072      * ----------------------------------------------
2073      * class Amiga1251 : EncodingScheme
2074      * {
2075      *     shared static this()
2076      *     {
2077      *         EncodingScheme.register("path.to.Amiga1251");
2078      *     }
2079      * }
2080      * ----------------------------------------------
2081      */
2082     static void register(string className)
2083     {
2084         auto scheme = cast(EncodingScheme)ClassInfo.find(className).create();
2085         if (scheme is null)
2086             throw new EncodingException("Unable to create class "~className);
2087         foreach(encodingName;scheme.names())
2088         {
2089             supported[tolower(encodingName)] = className;
2090         }
2091     }
2092
2093     /**
2094      * Obtains a subclass of EncodingScheme which is capable of encoding
2095      * and decoding the named encoding scheme.
2096      *
2097      * This function is only aware of EncodingSchemes which have been
2098      * registered with the register() function.
2099      *
2100      * Examples:
2101      * ---------------------------------------------------
2102      * auto scheme = EncodingScheme.create("Amiga-1251");
2103      * ---------------------------------------------------
2104      */
2105     static EncodingScheme create(string encodingName)
2106     {
2107         auto p = std.string.tolower(encodingName) in supported;
2108         if (p is null)
2109             throw new EncodingException("Unrecognized Encoding: "~encodingName);
2110         string className = *p;
2111         auto scheme = cast(EncodingScheme)ClassInfo.find(className).create();
2112         if (scheme is null) throw new EncodingException("Unable to create class "~className);
2113         return scheme;
2114     }
2115
    const
    {
        /**
         * Returns the standard name of the encoding scheme
         *
         * Returns:
         *    the standard name, as a string
         */
        abstract override string toString();

        /**
         * Returns an array of all known names for this encoding scheme
         *
         * Returns:
         *    an array holding every name by which this scheme is known
         */
        abstract string[] names();

        /**
         * Returns true if the character c can be represented
         * in this encoding scheme.
         *
         * Params:
         *    c = the character to be tested
         *
         * Returns:
         *    true if c can be represented, false otherwise
         */
        abstract bool canEncode(dchar c);

        /**
         * Returns the number of ubytes required to encode this code point.
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *
         * Returns:
         *    the number of ubytes required.
         */
        abstract size_t encodedLength(dchar c);

        /**
         * Encodes a single code point into a user-supplied, fixed-size buffer.
         *
         * This function encodes a single code point into one or more ubytes.
         * The supplied buffer must be code unit aligned.
         * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
         * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *    buffer = the user-supplied buffer which receives the encoded
         *             ubytes
         *
         * Returns:
         *    the number of ubytes written.
         */
        abstract size_t encode(dchar c, ubyte[] buffer);

        /**
         * Decodes a single code point.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * The input to this function MUST be validly encoded.
         *
         * Params:
         *    s = the array whose first code point is to be decoded; it is
         *        passed by reference and shortened by the ubytes removed
         *
         * Returns:
         *    the decoded code point
         */
        abstract dchar decode(ref const(ubyte)[] s);

        /**
         * Decodes a single code point. The input does not have to be valid.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * This function will accept an invalidly encoded array as input.
         * If an invalid sequence is found at the start of the string, this
         * function will remove it, and return the value INVALID_SEQUENCE.
         *
         * Params:
         *    s = the array whose first code point is to be decoded; it is
         *        passed by reference and shortened by the ubytes removed
         *
         * Returns:
         *    the decoded code point, or INVALID_SEQUENCE if the array
         *    started with an invalid sequence
         */
        abstract dchar safeDecode(ref const(ubyte)[] s);

        /**
         * Returns the sequence of ubytes to be used to represent
         * any character which cannot be represented in the encoding scheme.
         *
         * Normally this will be a representation of some substitution
         * character, such as U+FFFD or '?'.
         *
         * Returns:
         *    the replacement sequence, as an immutable array of ubytes
         */
        abstract immutable(ubyte)[] replacementSequence();
    }
2202
2203     /**
2204      * Returns true if the array is encoded correctly
2205      *
2206      * Params:
2207      *    s = the array to be tested
2208      */
2209     bool isValid(const(ubyte)[] s)
2210     {
2211         while (s.length != 0)
2212         {
2213             dchar d = safeDecode(s);
2214             if (d == INVALID_SEQUENCE)
2215                 return false;
2216         }
2217         return true;
2218     }
2219
2220     /**
2221      * Returns the length of the longest possible substring, starting from
2222      * the first element, which is validly encoded.
2223      *
2224      * Params:
2225      *    s = the array to be tested
2226      */
2227     size_t validLength(const(ubyte)[] s)
2228     {
2229         const(ubyte)[] r = s;
2230         const(ubyte)[] t = s;
2231         while (s.length != 0)
2232         {
2233             if (safeDecode(s) == INVALID_SEQUENCE) break;
2234             t = s;
2235         }
2236         return r.length - t.length;
2237     }
2238
2239     /**
2240      * Sanitizes an array by replacing malformed ubyte sequences with valid
2241      * ubyte sequences. The result is guaranteed to be valid for this
2242      * encoding scheme.
2243      *
2244      * If the input array is already valid, this function returns the
2245      * original, otherwise it constructs a new array by replacing all illegal
2246      * sequences with the encoding scheme's replacement sequence.
2247      *
2248      * Params:
2249      *    s = the string to be sanitized
2250      */
2251     immutable(ubyte)[] sanitize(immutable(ubyte)[] s)
2252     {
2253         auto n = validLength(s);
2254         if (n == s.length) return s;
2255
2256         auto repSeq = replacementSequence;
2257
2258         // Count how long the string needs to be.
2259         // Overestimating is not a problem
2260         auto len = s.length;
2261         const(ubyte)[] t = s[n..$];
2262         while (t.length != 0)
2263         {
2264             dchar c = safeDecode(t);
2265             assert(c == INVALID_SEQUENCE);
2266             len += repSeq.length;
2267             t = t[validLength(t)..$];
2268         }
2269
2270         // Now do the write
2271         ubyte[] array = new ubyte[len];
2272         array[0..n] = s[0..n];
2273         auto offset = n;
2274
2275         t = s[n..$];
2276         while (t.length != 0)
2277         {
2278             dchar c = safeDecode(t);
2279             assert(c == INVALID_SEQUENCE);
2280             array[offset..offset+repSeq.length] = repSeq[];
2281             offset += repSeq.length;
2282             n = validLength(t);
2283             array[offset..offset+n] = t[0..n];
2284             offset += n;
2285             t = t[n..$];
2286         }
2287         return cast(immutable(ubyte)[])array[0..offset];
2288     }
2289
2290     /**
2291      * Returns the length of the first encoded sequence.
2292      *
2293      * The input to this function MUST be validly encoded.
2294      * This is enforced by the function's in-contract.
2295      *
2296      * Params:
2297      *    s = the array to be sliced
2298      */
2299     size_t firstSequence(const(ubyte)[] s)
2300     in
2301     {
2302         assert(s.length != 0);
2303         const(ubyte)[] u = s;
2304         assert(safeDecode(u) != INVALID_SEQUENCE);
2305     }
2306     body
2307     {
2308         const(ubyte)[] t = s;
2309         decode(s);
2310         return t.length - s.length;
2311     }
2312
2313     /**
2314      * Returns the total number of code points encoded in a ubyte array.
2315      *
2316      * The input to this function MUST be validly encoded.
2317      * This is enforced by the function's in-contract.
2318      *
2319      * Params:
2320      *    s = the string to be counted
2321      */
2322     size_t count(const(ubyte)[] s)
2323     in
2324     {
2325         assert(isValid(s));
2326     }
2327     body
2328     {
2329         size_t n = 0;
2330         while (s.length != 0)
2331         {
2332             decode(s);
2333             ++n;
2334         }
2335         return n;
2336     }
2337
    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be indexed
     *    n = the number of code points to skip from the start of s
     */
2347     sizediff_t index(const(ubyte)[] s, size_t n)
2348     in
2349     {
2350         assert(isValid(s));
2351         assert(n >= 0);
2352     }
2353     body
2354     {
2355         const(ubyte)[] t = s;
2356         for (size_t i=0; i<n; ++i) decode(s);
2357         return t.length - s.length;
2358     }
2359
    // Associative array with string keys and string values. __gshared makes
    // it a single process-wide variable rather than a thread-local one.
    // NOTE(review): presumably maps each recognised encoding name to the
    // class name of the scheme registered for it -- the code that fills and
    // reads it is not visible in this part of the file; confirm.
    __gshared string[string] supported;
2361 }
2362
/**
 EncodingScheme to handle ASCII

 This scheme recognises the following names:
                 "ANSI_X3.4-1968",
                 "ANSI_X3.4-1986",
                 "ASCII",
                 "IBM367",
                 "ISO646-US",
                 "ISO_646.irv:1991",
                 "US-ASCII",
                 "cp367",
                 "csASCII",
                 "iso-ir-6",
                 "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    // Module constructor: registers this class with EncodingScheme under its
    // fully qualified name.
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeASCII");
    }

    const
    {
        /// Returns every name under which this scheme is known.
        override string[] names()
        {
            return
            [
                cast(string)
                "ANSI_X3.4-1968",
                "ANSI_X3.4-1986",
                "ASCII",
                "IBM367",
                "ISO646-US",
                "ISO_646.irv:1991",
                "US-ASCII",
                "cp367",
                // The comma after "csASCII" is essential: without it the two
                // adjacent literals are concatenated into the single bogus
                // name "csASCIIiso-ir-6".
                "csASCII",
                "iso-ir-6",
                "us"
            ];
        }

        /// Returns the standard name of this scheme, "ASCII".
        override string toString()
        {
            return "ASCII";
        }

        /// Forwards to std.encoding.canEncode for AsciiChar.
        override bool canEncode(dchar c)
        {
            return std.encoding.canEncode!(AsciiChar)(c);
        }

        /// Forwards to std.encoding.encodedLength for AsciiChar.
        override size_t encodedLength(dchar c)
        {
            return std.encoding.encodedLength!(AsciiChar)(c);
        }

        /// Encodes c into buffer, viewed as an array of AsciiChar, and
        /// returns the value reported by std.encoding.encode.
        override size_t encode(dchar c, ubyte[] buffer)
        {
            auto r = cast(AsciiChar[])buffer;
            return std.encoding.encode(c,r);
        }

        /// Decodes one code point from the front of s and shortens s.
        override dchar decode(ref const(ubyte)[] s)
        {
            auto t = cast(const(AsciiChar)[]) s;
            dchar c = std.encoding.decode(t);
            // Keep only as many trailing ubytes as t still has elements.
            s = s[$-t.length..$];
            return c;
        }

        /// As decode, but goes through std.encoding.safeDecode, so the
        /// result may be INVALID_SEQUENCE.
        override dchar safeDecode(ref const(ubyte)[] s)
        {
            auto t = cast(const(AsciiChar)[]) s;
            dchar c = std.encoding.safeDecode(t);
            // Keep only as many trailing ubytes as t still has elements.
            s = s[$-t.length..$];
            return c;
        }

        /// Returns the ubytes of "?" as the replacement sequence.
        override immutable(ubyte)[] replacementSequence()
        {
            return cast(immutable(ubyte)[])"?";
        }
    }
}
2450
2451 /**
2452  EncodingScheme to handle Latin-1
2453
2454  This scheme recognises the following names:
2455                  "CP819",
2456                  "IBM819",
2457                  "ISO-8859-1",
2458                  "ISO_8859-1",
2459                  "ISO_8859-1:1987",
2460                  "csISOLatin1",
2461                  "iso-ir-100",
2462                  "l1",
2463                  "latin1"
2464  */
class EncodingSchemeLatin1 : EncodingScheme
{
    // Module constructor: registers this class with EncodingScheme under its
    // fully qualified name.
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeLatin1");
    }

    const
    {
        /// Returns every name under which this scheme is known.
        override string[] names()
        {
            return
            [
                cast(string) "CP819",
                "IBM819",
                "ISO-8859-1",
                "ISO_8859-1",
                "ISO_8859-1:1987",
                "csISOLatin1",
                "iso-ir-100",
                "l1",
                "latin1"
            ];
        }

        /// Returns the standard name of this scheme, "ISO-8859-1".
        override string toString()
        {
            return "ISO-8859-1";
        }

        /// Forwards to std.encoding.canEncode for Latin1Char.
        override bool canEncode(dchar c)
        {
            return std.encoding.canEncode!(Latin1Char)(c);
        }

        /// Forwards to std.encoding.encodedLength for Latin1Char.
        override size_t encodedLength(dchar c)
        {
            return std.encoding.encodedLength!(Latin1Char)(c);
        }

        /// Encodes c into buffer, viewed as an array of Latin1Char, and
        /// returns the value reported by std.encoding.encode.
        override size_t encode(dchar c, ubyte[] buffer)
        {
            auto target = cast(Latin1Char[])buffer;
            return std.encoding.encode(c, target);
        }

        /// Decodes one code point from the front of s and shortens s.
        override dchar decode(ref const(ubyte)[] s)
        {
            auto units = cast(const(Latin1Char)[]) s;
            dchar codePoint = std.encoding.decode(units);
            // Keep only as many trailing ubytes as units still has elements.
            s = s[$ - units.length .. $];
            return codePoint;
        }

        /// As decode, but goes through std.encoding.safeDecode, so the
        /// result may be INVALID_SEQUENCE.
        override dchar safeDecode(ref const(ubyte)[] s)
        {
            auto units = cast(const(Latin1Char)[]) s;
            dchar codePoint = std.encoding.safeDecode(units);
            // Keep only as many trailing ubytes as units still has elements.
            s = s[$ - units.length .. $];
            return codePoint;
        }

        /// Returns the ubytes of "?" as the replacement sequence.
        override immutable(ubyte)[] replacementSequence()
        {
            return cast(immutable(ubyte)[])"?";
        }
    }
}
2534
2535 /**
2536  EncodingScheme to handle Windows-1252
2537
2538  This scheme recognises the following names:
2539                  "windows-1252"
2540  */
class EncodingSchemeWindows1252 : EncodingScheme
{
    // Module constructor: registers this class with EncodingScheme under its
    // fully qualified name.
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeWindows1252");
    }

    const
    {
        /// Returns every name under which this scheme is known.
        override string[] names()
        {
            return [ cast(string) "windows-1252" ];
        }

        /// Returns the standard name of this scheme, "windows-1252".
        override string toString()
        {
            return "windows-1252";
        }

        /// Forwards to std.encoding.canEncode for Windows1252Char.
        override bool canEncode(dchar c)
        {
            return std.encoding.canEncode!(Windows1252Char)(c);
        }

        /// Forwards to std.encoding.encodedLength for Windows1252Char.
        override size_t encodedLength(dchar c)
        {
            return std.encoding.encodedLength!(Windows1252Char)(c);
        }

        /// Encodes c into buffer, viewed as an array of Windows1252Char, and
        /// returns the value reported by std.encoding.encode.
        override size_t encode(dchar c, ubyte[] buffer)
        {
            auto target = cast(Windows1252Char[])buffer;
            return std.encoding.encode(c, target);
        }

        /// Decodes one code point from the front of s and shortens s.
        override dchar decode(ref const(ubyte)[] s)
        {
            auto units = cast(const(Windows1252Char)[]) s;
            dchar codePoint = std.encoding.decode(units);
            // Keep only as many trailing ubytes as units still has elements.
            s = s[$ - units.length .. $];
            return codePoint;
        }

        /// As decode, but goes through std.encoding.safeDecode, so the
        /// result may be INVALID_SEQUENCE.
        override dchar safeDecode(ref const(ubyte)[] s)
        {
            auto units = cast(const(Windows1252Char)[]) s;
            dchar codePoint = std.encoding.safeDecode(units);
            // Keep only as many trailing ubytes as units still has elements.
            s = s[$ - units.length .. $];
            return codePoint;
        }

        /// Returns the ubytes of "?" as the replacement sequence.
        override immutable(ubyte)[] replacementSequence()
        {
            return cast(immutable(ubyte)[])"?";
        }
    }
}
2602
2603 /**
2604  EncodingScheme to handle UTF-8
2605
2606  This scheme recognises the following names:
2607                  "UTF-8"
2608  */
class EncodingSchemeUtf8 : EncodingScheme
{
    // Module constructor: registers this class with EncodingScheme under its
    // fully qualified name.
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf8");
    }

    const
    {
        /// Returns every name under which this scheme is known.
        override string[] names()
        {
            return [ cast(string) "UTF-8" ];
        }

        /// Returns the standard name of this scheme, "UTF-8".
        override string toString()
        {
            return "UTF-8";
        }

        /// Forwards to std.encoding.canEncode for char.
        override bool canEncode(dchar c)
        {
            return std.encoding.canEncode!(char)(c);
        }

        /// Forwards to std.encoding.encodedLength for char.
        override size_t encodedLength(dchar c)
        {
            return std.encoding.encodedLength!(char)(c);
        }

        /// Encodes c into buffer, viewed as an array of char, and returns
        /// the value reported by std.encoding.encode.
        override size_t encode(dchar c, ubyte[] buffer)
        {
            auto target = cast(char[])buffer;
            return std.encoding.encode(c, target);
        }

        /// Decodes one code point from the front of s and shortens s.
        override dchar decode(ref const(ubyte)[] s)
        {
            auto units = cast(const(char)[]) s;
            dchar codePoint = std.encoding.decode(units);
            // Keep only as many trailing ubytes as units still has elements.
            s = s[$ - units.length .. $];
            return codePoint;
        }

        /// As decode, but goes through std.encoding.safeDecode, so the
        /// result may be INVALID_SEQUENCE.
        override dchar safeDecode(ref const(ubyte)[] s)
        {
            auto units = cast(const(char)[]) s;
            dchar codePoint = std.encoding.safeDecode(units);
            // Keep only as many trailing ubytes as units still has elements.
            s = s[$ - units.length .. $];
            return codePoint;
        }

        /// Returns the ubytes of the string literal "\uFFFD" as the
        /// replacement sequence.
        override immutable(ubyte)[] replacementSequence()
        {
            return cast(immutable(ubyte)[])"\uFFFD";
        }
    }
}
2670
2671 /**
2672  EncodingScheme to handle UTF-16 in native byte order
2673
2674  This scheme recognises the following names:
2675                  "UTF-16LE" (little-endian architecture only)
2676                  "UTF-16BE" (big-endian architecture only)
2677  */
class EncodingSchemeUtf16Native : EncodingScheme
{
    // Module constructor: registers this class with EncodingScheme under its
    // fully qualified name.
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native");
    }

    const
    {
        // The scheme's name depends on the byte order of the target.
        version(LittleEndian) { string NAME = "UTF-16LE"; }
        version(BigEndian)    { string NAME = "UTF-16BE"; }

        /// Returns the single name under which this scheme is known.
        override string[] names()
        {
            return [ NAME ];
        }

        /// Returns the standard name of this scheme (see NAME).
        override string toString()
        {
            return NAME;
        }

        /// Forwards to std.encoding.canEncode for wchar.
        override bool canEncode(dchar c)
        {
            return std.encoding.canEncode!(wchar)(c);
        }

        /// Forwards to std.encoding.encodedLength for wchar.
        override size_t encodedLength(dchar c)
        {
            // NOTE(review): this result is returned unscaled, whereas
            // encode() below multiplies by wchar.sizeof. If the template
            // counts wchars rather than ubytes the two disagree -- confirm.
            return std.encoding.encodedLength!(wchar)(c);
        }

        /// Encodes c into buffer, viewed as an array of wchar, and returns
        /// the result of std.encoding.encode scaled by wchar.sizeof.
        override size_t encode(dchar c, ubyte[] buffer)
        {
            auto r = cast(wchar[])buffer;
            return wchar.sizeof * std.encoding.encode(c,r);
        }

        /// Decodes one code point from the front of s and shortens s.
        /// The length of s must be even.
        override dchar decode(ref const(ubyte)[] s)
        in
        {
            assert((s.length & 1) == 0);
        }
        body
        {
            auto t = cast(const(wchar)[]) s;
            dchar c = std.encoding.decode(t);
            // t.length counts wchars while s is indexed in ubytes, so the
            // remaining length has to be scaled by wchar.sizeof; otherwise
            // only half of the consumed ubytes would be dropped.
            s = s[$-t.length * wchar.sizeof..$];
            return c;
        }

        /// As decode, but goes through std.encoding.safeDecode, so the
        /// result may be INVALID_SEQUENCE. The length of s must be even.
        override dchar safeDecode(ref const(ubyte)[] s)
        in
        {
            assert((s.length & 1) == 0);
        }
        body
        {
            auto t = cast(const(wchar)[]) s;
            dchar c = std.encoding.safeDecode(t);
            // Scale the remaining wchar count to ubytes (see decode).
            s = s[$-t.length * wchar.sizeof..$];
            return c;
        }

        /// Returns the ubytes of the wstring literal "\uFFFD"w as the
        /// replacement sequence.
        override immutable(ubyte)[] replacementSequence()
        {
            return cast(immutable(ubyte)[])"\uFFFD"w;
        }
    }
}
2748
2749 /**
2750  EncodingScheme to handle UTF-32 in native byte order
2751
2752  This scheme recognises the following names:
2753                  "UTF-32LE" (little-endian architecture only)
2754                  "UTF-32BE" (big-endian architecture only)
2755  */
class EncodingSchemeUtf32Native : EncodingScheme
{
    // Module constructor: registers this class with EncodingScheme under its
    // fully qualified name.
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native");
    }

    const
    {
        // The scheme's name depends on the byte order of the target.
        version(LittleEndian) { string NAME = "UTF-32LE"; }
        version(BigEndian)    { string NAME = "UTF-32BE"; }

        /// Returns the single name under which this scheme is known.
        override string[] names()
        {
            return [ NAME ];
        }

        /// Returns the standard name of this scheme (see NAME).
        override string toString()
        {
            return NAME;
        }

        /// Forwards to std.encoding.canEncode for dchar.
        override bool canEncode(dchar c)
        {
            return std.encoding.canEncode!(dchar)(c);
        }

        /// Forwards to std.encoding.encodedLength for dchar.
        override size_t encodedLength(dchar c)
        {
            // NOTE(review): this result is returned unscaled, whereas
            // encode() below multiplies by dchar.sizeof. If the template
            // counts dchars rather than ubytes the two disagree -- confirm.
            return std.encoding.encodedLength!(dchar)(c);
        }

        /// Encodes c into buffer, viewed as an array of dchar, and returns
        /// the result of std.encoding.encode scaled by dchar.sizeof.
        override size_t encode(dchar c, ubyte[] buffer)
        {
            auto r = cast(dchar[])buffer;
            return dchar.sizeof * std.encoding.encode(c,r);
        }

        /// Decodes one code point from the front of s and shortens s.
        /// The length of s must be a multiple of four.
        override dchar decode(ref const(ubyte)[] s)
        in
        {
            assert((s.length & 3) == 0);
        }
        body
        {
            auto t = cast(const(dchar)[]) s;
            dchar c = std.encoding.decode(t);
            // t.length counts dchars while s is indexed in ubytes, so the
            // remaining length has to be scaled by dchar.sizeof; otherwise
            // only a quarter of the consumed ubytes would be dropped.
            s = s[$-t.length * dchar.sizeof..$];
            return c;
        }

        /// As decode, but goes through std.encoding.safeDecode, so the
        /// result may be INVALID_SEQUENCE. The length of s must be a
        /// multiple of four.
        override dchar safeDecode(ref const(ubyte)[] s)
        in
        {
            assert((s.length & 3) == 0);
        }
        body
        {
            auto t = cast(const(dchar)[]) s;
            dchar c = std.encoding.safeDecode(t);
            // Scale the remaining dchar count to ubytes (see decode).
            s = s[$-t.length * dchar.sizeof..$];
            return c;
        }

        /// Returns the ubytes of the dstring literal "\uFFFD"d as the
        /// replacement sequence.
        override immutable(ubyte)[] replacementSequence()
        {
            return cast(immutable(ubyte)[])"\uFFFD"d;
        }
    }
}
2826
2827 //=============================================================================
2828
2829
2830 // Helper functions
2831 version(unittest)
2832 {
2833     void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
2834     {
2835         static if(is(Src==Dst))
2836         {
2837             return s;
2838         }
2839         else static if(is(Src==AsciiChar))
2840         {
2841             transcodeReverse!(char,Dst)(cast(string)s,r);
2842         }
2843         else
2844         {
2845             foreach_reverse(d;codePoints(s))
2846             {
2847                 foreach_reverse(c;codeUnits!(Dst)(d))
2848                 {
2849                     r = c ~ r;
2850                 }
2851             }
2852         }
2853     }
2854
2855     string makeReadable(string s)
2856     {
2857         string r = "\"";
2858         foreach(char c;s)
2859         {
2860             if (c >= 0x20 && c < 0x80)
2861             {
2862                 r ~= c;
2863             }
2864             else
2865             {
2866                 r ~= "\\x";
2867                 r ~= toHexDigit(c >> 4);
2868                 r ~= toHexDigit(c);
2869             }
2870         }
2871         r ~= "\"";
2872         return r;
2873     }
2874
2875     string makeReadable(wstring s)
2876     {
2877         string r = "\"";
2878         foreach(wchar c;s)
2879         {
2880             if (c >= 0x20 && c < 0x80)
2881             {
2882                 r ~= cast(char) c;
2883             }
2884             else
2885             {
2886                 r ~= "\\u";
2887                 r ~= toHexDigit(c >> 12);
2888                 r ~= toHexDigit(c >> 8);
2889                 r ~= toHexDigit(c >> 4);
2890                 r ~= toHexDigit(c);
2891             }
2892         }
2893         r ~= "\"w";
2894         return r;
2895     }
2896
2897     string makeReadable(dstring s)
2898     {
2899         string r = "\"";
2900         foreach(dchar c; s)
2901         {
2902             if (c >= 0x20 && c < 0x80)
2903             {
2904                 r ~= cast(char) c;
2905             }
2906             else if (c < 0x10000)
2907             {
2908                 r ~= "\\u";
2909                 r ~= toHexDigit(c >> 12);
2910                 r ~= toHexDigit(c >> 8);
2911                 r ~= toHexDigit(c >> 4);
2912                 r ~= toHexDigit(c);
2913             }
2914             else
2915             {
2916                 r ~= "\\U00";
2917                 r ~= toHexDigit(c >> 20);
2918                 r ~= toHexDigit(c >> 16);
2919                 r ~= toHexDigit(c >> 12);
2920                 r ~= toHexDigit(c >> 8);
2921                 r ~= toHexDigit(c >> 4);
2922                 r ~= toHexDigit(c);
2923             }
2924         }
2925         r ~= "\"d";
2926         return r;
2927     }
2928
2929     char toHexDigit(int n)
2930     {
2931         return "0123456789ABCDEF"[n & 0xF];
2932     }
2933 }
Note: See TracBrowser for help on using the browser.