Changeset 609
- Timestamp:
- 02/21/08 12:50:25 (9 months ago)
- Files:
-
- candidate/phobos/std/encoding.d (moved) (moved from candidate/phobos/std/utf2.d) (41 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
candidate/phobos/std/encoding.d
r581 r609 1 1 /** 2 Classes and functions for handling and transcoding between the three UTF formats. 2 Classes and functions for handling and transcoding between various encodings. Encodings currently supported are 3 UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1 (also known as LATIN-1), and WINDOWS-1252 4 5 Functions are provided for arbitrary encoding and decoding of single characters, arbitrary transcoding between 6 strings of different type, as well as validation and sanitization. 7 8 The type eString!(ascii) represents an ASCII string; the type eString!(latin1) represents an ISO-8859-1 string, and 9 so on. In general, eString!(E) is the string type for encoding E, and eString(utf8), eString(utf16) and eString(utf32) 10 are aliases for string, wstring and dstring respectively. 11 12 Future directions for this module include the ability to handle arbitrary encodings. 3 13 4 14 Authors: Janice Caron 5 15 6 Date: 2006.02.12 16 Date: 2006.02.21 17 7 18 */ 8 19 9 module std.utf2; 10 //import std.stdio; 20 module std.encoding; 11 21 12 22 unittest … … 21 31 22 32 // First possible sequence of a certain length 23 [ 0x00 ], // U -00000000 one byte24 [ 0xC2, 0x80 ], // U -00000080 two bytes25 [ 0xE0, 0xA0, 0x80 ], // U -00000800 three bytes26 [ 0xF0, 0x90, 0x80, 0x80 ], // U -00010000 three bytes33 [ 0x00 ], // U+00000000 one byte 34 [ 0xC2, 0x80 ], // U+00000080 two bytes 35 [ 0xE0, 0xA0, 0x80 ], // U+00000800 three bytes 36 [ 0xF0, 0x90, 0x80, 0x80 ], // U+00010000 three bytes 27 37 28 38 // Last possible sequence of a certain length 29 [ 0x7F ], // U -0000007F one byte30 [ 0xDF, 0xBF ], // U -000007FF two bytes31 [ 0xEF, 0xBF, 0xBF ], // U -0000FFFF three bytes39 [ 0x7F ], // U+0000007F one byte 40 [ 0xDF, 0xBF ], // U+000007FF two bytes 41 [ 0xEF, 0xBF, 0xBF ], // U+0000FFFF three bytes 32 42 33 43 // Other boundary conditions 34 [ 0xED, 0x9F, 0xBF ], // U -0000D7FF Last character before surrogates35 [ 0xEE, 0x80, 0x80 ], // U -0000E000 First character after surrogates36 
[ 0xEF, 0xBF, 0xBD ], // U -0000FFFD Unicode replacement character37 [ 0xF4, 0x8F, 0xBF, 0xBF ], // U -0010FFFF Very last character44 [ 0xED, 0x9F, 0xBF ], // U+0000D7FF Last character before surrogates 45 [ 0xEE, 0x80, 0x80 ], // U+0000E000 First character after surrogates 46 [ 0xEF, 0xBF, 0xBD ], // U+0000FFFD Unicode replacement character 47 [ 0xF4, 0x8F, 0xBF, 0xBF ], // U+0010FFFF Very last character 38 48 39 49 // Non-character codepoints … … 44 54 used as characters. Since this module deals with UTF, and not with Unicode 45 55 per se, we choose to accept them here. */ 46 [ 0xDF, 0xBE ], // U -0000FFFE47 [ 0xDF, 0xBF ], // U -0000FFFF56 [ 0xDF, 0xBE ], // U+0000FFFE 57 [ 0xDF, 0xBF ], // U+0000FFFF 48 58 ]; 49 59 50 60 ubyte[][] invalidStrings = 51 61 [ 52 // First possible sequence of a certain length, but greater than U -10FFFF53 [ 0xF8, 0x88, 0x80, 0x80, 0x80 ], // U -00200000 five bytes54 [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ], // U -04000000 six bytes55 56 // Last possible sequence of a certain length, but greater than U -10FFFF57 [ 0xF7, 0xBF, 0xBF, 0xBF ], // U -001FFFFF four bytes58 [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ], // U -03FFFFFF five bytes59 [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ], // U -7FFFFFFF six bytes62 // First possible sequence of a certain length, but greater than U+10FFFF 63 [ 0xF8, 0x88, 0x80, 0x80, 0x80 ], // U+00200000 five bytes 64 [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ], // U+04000000 six bytes 65 66 // Last possible sequence of a certain length, but greater than U+10FFFF 67 [ 0xF7, 0xBF, 0xBF, 0xBF ], // U+001FFFFF four bytes 68 [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF five bytes 69 [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ], // U+7FFFFFFF six bytes 60 70 61 71 // Other boundary conditions 62 [ 0xF4, 0x90, 0x80, 0x80 ], // U -00110000 First codepoint after last character72 [ 0xF4, 0x90, 0x80, 0x80 ], // U+00110000 First codepoint after last character 63 73 64 74 // Unexpected continuation bytes … … 86 96 [ 0xF3 ], 87 97 [ 0xF4 ], 88 
[ 0xF5 ], // If this were legal it would start a character > U -10FFFF89 [ 0xF6 ], // If this were legal it would start a character > U -10FFFF90 [ 0xF7 ], // If this were legal it would start a character > U -10FFFF98 [ 0xF5 ], // If this were legal it would start a character > U+10FFFF 99 [ 0xF6 ], // If this were legal it would start a character > U+10FFFF 100 [ 0xF7 ], // If this were legal it would start a character > U+10FFFF 91 101 92 102 [ 0xEF, 0xBF ], // Three byte sequence with third byte missing … … 111 121 [ 0x20, 0xFF, 0x20 ], 112 122 113 // Overlong sequences, all representing U -002F123 // Overlong sequences, all representing U+002F 114 124 /* With a safe UTF-8 decoder, all of the following five overlong 115 125 representations of the ASCII character slash ("/") should be rejected … … 126 136 is a boundary test for safe UTF-8 decoders. All five characters should 127 137 be rejected like malformed UTF-8 sequences. */ 128 [ 0xC1, 0xBF ], // U -0000007F129 [ 0xE0, 0x9F, 0xBF ], // U -000007FF130 [ 0xF0, 0x8F, 0xBF, 0xBF ], // U -0000FFFF131 [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ], // U -001FFFFF132 [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ], // U -03FFFFFF138 [ 0xC1, 0xBF ], // U+0000007F 139 [ 0xE0, 0x9F, 0xBF ], // U+000007FF 140 [ 0xF0, 0x8F, 0xBF, 0xBF ], // U+0000FFFF 141 [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ], // U+001FFFFF 142 [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF 133 143 134 144 // Overlong representation of the NUL character … … 147 157 a UTF-8 decoder that accepts them might introduce security problems 148 158 comparable to overlong UTF-8 sequences. 
*/ 149 [ 0xED, 0xA0, 0x80 ], // U -D800150 [ 0xED, 0xAD, 0xBF ], // U -DB7F151 [ 0xED, 0xAE, 0x80 ], // U -DB80152 [ 0xED, 0xAF, 0xBF ], // U -DBFF153 [ 0xED, 0xB0, 0x80 ], // U -DC00154 [ 0xED, 0xBE, 0x80 ], // U -DF80155 [ 0xED, 0xBF, 0xBF ], // U -DFFF159 [ 0xED, 0xA0, 0x80 ], // U+D800 160 [ 0xED, 0xAD, 0xBF ], // U+DB7F 161 [ 0xED, 0xAE, 0x80 ], // U+DB80 162 [ 0xED, 0xAF, 0xBF ], // U+DBFF 163 [ 0xED, 0xB0, 0x80 ], // U+DC00 164 [ 0xED, 0xBE, 0x80 ], // U+DF80 165 [ 0xED, 0xBF, 0xBF ], // U+DFFF 156 166 ]; 157 167 … … 248 258 assert(ds == ds2); 249 259 } 260 261 // Make sure the non-UTF encodings work too 262 { 263 auto s = "\u20AC100"; 264 auto t = to!(windows1252)(s); 265 assert(t == [cast(windows1252)0x80, '1', '0', '0']); 266 auto u = to!(utf8)(s); 267 assert(s == u); 268 auto v = to!(latin1)(s); 269 assert(cast(string)v == "?100"); 270 auto w = to!(ascii)(v); 271 assert(cast(string)w == "?100"); 272 } 250 273 } 251 274 252 275 // Unit tests over. Now for the code... 253 276 254 privatetemplate UTF(T)277 /+private+/ template UTF(T) 255 278 { 256 279 static if (is(T==char)) … … 327 350 } 328 351 329 uint encode( uintc,T[] buffer)352 uint encode(dchar c,T[] buffer) 330 353 in 331 354 { … … 459 482 } 460 483 461 uint encode( uintc,T[] buffer)484 uint encode(dchar c,T[] buffer) 462 485 in 463 486 { … … 550 573 } 551 574 552 uint encode( uintc,T[] buffer)575 uint encode(dchar c,T[] buffer) 553 576 in 554 577 { … … 587 610 } 588 611 } 589 else static if (is(T:a char))590 { 591 alias astringstring;612 else static if (is(T:ascii)) 613 { 614 alias invariant(ascii)[] string; 592 615 593 616 enum MAX_SEQUENCE_LENGTH = 1; … … 624 647 } 625 648 626 uint encode( uintc,T[] buffer)649 uint encode(dchar c,T[] buffer) 627 650 in 628 651 { … … 661 684 } 662 685 } 663 else static if (is(T:l char))664 { 665 alias lstringstring;686 else static if (is(T:latin1)) 687 { 688 alias invariant(latin1)[] string; 666 689 667 690 enum MAX_SEQUENCE_LENGTH = 1; 668 691 669 
invariant(char)[] encodingName = " UTF-32";692 invariant(char)[] encodingName = "ISO-8859-1"; 670 693 671 694 bool isValidCodeUnit(T c) … … 698 721 } 699 722 700 uint encode( uintc,T[] buffer)723 uint encode(dchar c,T[] buffer) 701 724 in 702 725 { … … 732 755 void appendReplacementChar(ref string s) 733 756 { 734 s ~= cast(T)0xFFFD; 735 } 736 } 757 s ~= cast(T)'?'; 758 } 759 } 760 else static if (is(T:windows1252)) 761 { 762 alias invariant(windows1252)[] string; 763 764 enum MAX_SEQUENCE_LENGTH = 1; 765 766 invariant(char)[] encodingName = "WINDOWS-1252"; 767 768 wstring charMap = 769 "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD" 770 "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2103\u2014\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178" 771 ; 772 773 dchar win2uni(T c) 774 { 775 return isSingle(c) ? c : charMap[c-0x80]; 776 } 777 778 T uni2win(dchar c) 779 { 780 if (c < 0x80 || (c >= 0xA0 && c < 0x100)) return cast(T)c; 781 if (c != 0xFFFD) 782 { 783 foreach(n,d;charMap) 784 { 785 if (c == d) return cast(T)(n + 0x80); 786 } 787 } 788 return '?'; 789 } 790 791 bool isValidCodeUnit(T c) 792 { 793 return(win2uni(c) != 0xFFFD); 794 } 795 796 bool isSingle(T c) 797 { 798 return c < 0x80 || c >= 0xA0; 799 } 800 801 int tails(T c) 802 { 803 return isSingle(c) ? -1 : 0; 804 } 805 806 bool isHead(T c) 807 { 808 return isSingle(c) ? 
false : isValidCodeUnit(c); 809 } 810 811 bool isTail(T c) 812 { 813 return false; 814 } 815 816 bool badTail(string s, uint n) 817 { 818 return false; 819 } 820 821 bool isInvalidHeadTail(T c, T d) 822 { 823 return false; 824 } 825 826 uint encode(dchar c,T[] buffer) 827 in 828 { 829 assert(isValidCodepoint(c)); 830 assert(buffer.length >= MAX_SEQUENCE_LENGTH); 831 } 832 body 833 { 834 buffer[0] = uni2win(c); 835 return 1; 836 } 837 838 dchar decodeSingleSequence(ref string s) 839 in 840 { 841 assert(s.length != 0); 842 } 843 body 844 { 845 dchar c = win2uni(s[0]); 846 s = s[1..$]; 847 return c; 848 } 849 850 dchar decodeSingleSequenceReverse(ref string s) 851 in 852 { 853 assert(s.length != 0); 854 } 855 body 856 { 857 dchar c = win2uni(s[$-1]); 858 s = s[0..$-1]; 859 return c; 860 } 861 862 void appendReplacementChar(ref string s) 863 { 864 s ~= cast(T)'?'; 865 } 866 } 867 // NOTE: The "else" case is commented out because it doesn't work (yet?) because of some template 868 // issues which have yet to be resolved. Expect this to work in some future release. 869 /+ 870 else // The generic case. All other encodings. 
871 { 872 alias invariant(T)[] string; 873 874 static if(is(T.MAX_SEQUENCE_LENGTH)) 875 { 876 enum MAX_SEQUENCE_LENGTH = T.MAX_SEQUENCE_LENGTH; 877 } 878 else 879 { 880 enum MAX_SEQUENCE_LENGTH = 1; 881 } 882 883 invariant(char)[] encodingName() 884 { 885 return T.encodingName; 886 } 887 888 bool isValidCodeUnit(T c) 889 { 890 return c.isValidCodeUnit; 891 } 892 893 bool isSingle(T c) 894 { 895 static if (is(T.isSingle == function)) 896 { 897 return c.isSingle; 898 } 899 else 900 { 901 return c.isValidCodeUnit; 902 } 903 } 904 905 int tails(T c) 906 { 907 static if (is(T.tails == function)) 908 { 909 return c.tails(); 910 } 911 else 912 { 913 return -1; 914 } 915 } 916 917 bool isHead(T c) 918 { 919 static if(is(T.isHead == function)) 920 { 921 return c.isHead; 922 } 923 else 924 { 925 return false; 926 } 927 } 928 929 bool isTail(T c) 930 { 931 static if(is(T.isTail == function)) 932 { 933 return c.isTail; 934 } 935 else 936 { 937 return false; 938 } 939 } 940 941 bool badTail(string s, uint n) 942 { 943 static if(is(T.badTail == function)) 944 { 945 return T.badTail(s,n); 946 } 947 else 948 { 949 return false; 950 } 951 } 952 953 bool isInvalidHeadTail(T c, T d) 954 { 955 static if(is(T.isInvalidHeadTail == function)) 956 { 957 return c.isInvalidHeadTail(d); 958 } 959 else 960 { 961 return false; 962 } 963 } 964 965 uint encode(dchar c,T[] buffer) 966 in 967 { 968 assert(isValidCodepoint(c)); 969 assert(buffer.length >= MAX_SEQUENCE_LENGTH); 970 } 971 body 972 { 973 return encode(c,buffer); 974 } 975 976 dchar decodeSingleSequence(ref string s) 977 { 978 static if(is(T.decodeSingleSequence == function)) 979 { 980 return T.decodeSingleSequence(s); 981 } 982 else 983 { 984 assert(false); 985 return 0; 986 } 987 } 988 989 dchar decodeSingleSequenceReverse(ref string s) 990 { 991 static if(is(T.decodeSingleSequenceReverse == function)) 992 { 993 return T.decodeSingleSequenceReverse(s); 994 } 995 else 996 { 997 assert(false); 998 return 0; 999 } 1000 } 1001 1002 void 
appendReplacementChar(ref string s) 1003 { 1004 if (is(T.replacementChar == function)) 1005 { 1006 s ~= T.replacementChar(); 1007 } 1008 else 1009 { 1010 s ~= cast(T)'?'; 1011 } 1012 } 1013 } 1014 +/ 737 1015 738 1016 uint pseudoSequenceLength(string s) … … 855 1133 body 856 1134 { 857 uintc = s[0];1135 T c = s[0]; 858 1136 if (isSingle(c)) 859 1137 { … … 872 1150 body 873 1151 { 874 uintc = s[$-1];1152 T c = s[$-1]; 875 1153 if (isSingle(c)) 876 1154 { … … 987 1265 } 988 1266 989 typedef char achar; // / A type representing an ASCII character 990 alias invariant(achar)[] astring; // / A type representing an ASCII string 991 typedef ubyte lchar; // / A type representing an ISO-8859-1 (aka Latin-1) character 992 alias invariant(lchar)[] lstring; // / A type representing an ISO-8859-1 (aka Latin-1) string 1267 alias char utf8; /// A type representing the UTF-8 encoding (an alias of char) 1268 alias wchar utf16; /// A type representing the UTF-16 encoding (an alias of wchar) 1269 alias dchar utf32; /// A type representing the UTF-32 encoding (an alias of dchar) 1270 typedef char ascii; /// A type representing the ASCII encoding 1271 typedef ubyte latin1; /// A type representing the ISO-8859-1 (aka Latin-1) encoding 1272 typedef ubyte windows1252; /// A type representing the WINDOWS-1252 encoding 993 1273 994 1274 /** 995 * Returns the name of a UTF. 1275 * A type representing a string of some specified encoding. The encoding is specified by the template parameter. 1276 */ 1277 template eString(T) 1278 { 1279 alias invariant(T)[] eString; 1280 } 1281 1282 /** 1283 * Returns the name of an encoding. 996 1284 * 997 1285 * The type of the output cannot be deduced. Therefore, it is necessary to explicitly 998 * specify the output type. 
999 * 1000 * If the template type is char, this function returns the string "UTF-8"; 1001 * If the template type is wchar, this function returns the string "UTF-16"; 1002 * If the template type is dchar, this function returns the string "UTF-32"; 1003 * 1004 * Standards: Unicode 5.0 1286 * specify the encoding type. 1287 * 1288 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1289 * 1290 * Examples: 1291 * ----------------------------------- 1292 * writefln(encodingName!(latin1)); 1293 * // writes ISO-8859-1 1294 * ----------------------------------- 1005 1295 */ 1006 1296 string encodingName(T)() … … 1011 1301 unittest 1012 1302 { 1013 assert(encodingName!(char) == "UTF-8"); 1014 assert(encodingName!(wchar) == "UTF-16"); 1015 assert(encodingName!(dchar) == "UTF-32"); 1303 assert(encodingName!(utf8) == "UTF-8"); 1304 assert(encodingName!(utf16) == "UTF-16"); 1305 assert(encodingName!(utf32) == "UTF-32"); 1306 assert(encodingName!(ascii) == "ASCII"); 1307 assert(encodingName!(latin1) == "ISO-8859-1"); 1016 1308 } 1017 1309 … … 1019 1311 * Returns true if the character is a valid codepoint 1020 1312 * 1021 * Note that this includes the non-character codepoints U -FFFE and U-FFFF, since these are1313 * Note that this includes the non-character codepoints U+FFFE and U+FFFF, since these are 1022 1314 * valid codepoints (even though they are not valid characters). 1023 1315 * 1024 * This function supercedes std.utf.isValidDchar(), even if only because the older function1025 * was badly named.1026 * 1027 * Standards: Unicode 5.0 1316 * Supercedes: 1317 * This function supercedes std.utf.isValidDchar(). 
1318 * 1319 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1028 1320 * 1029 1321 * Params: … … 1036 1328 1037 1329 /** 1038 * Returns true if the string is encoded with a valid UTF encoding 1039 * 1040 * If the input is a string, this function tests for UTF-8; 1041 * If the input is a wstring, this function tests for UTF-16; 1042 * If the input is a dstring, this function tests for UTF-32. 1043 * 1330 * Returns true if the string is encoded correctly 1331 * 1332 * Supercedes: 1044 1333 * This function supercedes std.utf.validate(), however note that this function returns 1045 1334 * a bool indicating whether the input was valid or not, whereas the older function would 1046 1335 * throw an exception. 1047 1336 * 1048 * Standards: Unicode 5.0 1337 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1049 1338 * 1050 1339 * Params: … … 1057 1346 1058 1347 /** 1059 * Sanitizes a string by replacing malformed UTF with valid UTF. The result is guaranteed to1060 * be a valid UTFencoding.1348 * Sanitizes a string by replacing malformed code unit sequences with valid code unit sequences. 1349 * The result is guaranteed to be valid for this encoding. 1061 1350 * 1062 1351 * If the input string is already valid, this function returns the original, otherwise 1063 1352 * it constructs a new string by replacing all illegal code unit sequences with the 1064 * Unicode replacement character, U-FFFD. 1065 * 1066 * If the input is a string, this function operates in UTF-8; 1067 * If the input is a wstring, this function operates in UTF-16; 1068 * If the input is a dstring, this function operates in UTF-32. 1069 * 1070 * Standards: Unicode 5.0 1353 * encoding's replacement character. Invalid sequences will be replaced with the 1354 * Unicode replacement character (U+FFFD) if the character repertoire contains it, 1355 * otherwise invalid sequences will be replaced with '?'. 
1356 * 1357 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1071 1358 * 1072 1359 * Params: … … 1085 1372 /** 1086 1373 * Returns the slice of the input string from the first character to the end of the 1087 * first UTFsequence. The resulting string may consist of multiple code units, but1374 * first encoded sequence. The resulting string may consist of multiple code units, but 1088 1375 * it will always represent at most one character. If the input is the empty string, 1089 1376 * the return value will be the empty string 1090 1377 * 1091 * The input to this function MUST be valid UTF.1378 * The input to this function MUST be validly encoded. 1092 1379 * This is enforced by the function's in-contract. 1093 1380 * 1094 * If the input is a string, this function operates in UTF-8; 1095 * If the input is a wstring, this function operates in UTF-16; 1096 * If the input is a dstring, this function operates in UTF-32. 1097 * 1098 * Standards: Unicode 5.0 1381 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1099 1382 * 1100 1383 * Params: … … 1107 1390 1108 1391 /** 1109 * Returns the slice of the input string from the start of the last UTFsequence1392 * Returns the slice of the input string from the start of the last encoded sequence 1110 1393 * to the end of the string. The resulting string may consist of multiple code units, 1111 1394 * but it will always represent at most one character. If the input is the empty string, 1112 1395 * the return value will be the empty string 1113 1396 * 1114 * The input to this function MUST be valid UTF.1397 * The input to this function MUST be validly encoded. 1115 1398 * This is enforced by the function's in-contract. 1116 1399 * 1117 * If the input is a string, this function operates in UTF-8; 1118 * If the input is a wstring, this function operates in UTF-16; 1119 * If the input is a dstring, this function operates in UTF-32. 
1120 * 1121 * Standards: Unicode 5.0 1400 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1122 1401 * 1123 1402 * Params: … … 1132 1411 * Returns the total number of codepoints encoded in a string. 1133 1412 * 1134 * The input to this function MUST be valid UTF.1413 * The input to this function MUST be validly encoded. 1135 1414 * This is enforced by the function's in-contract. 1136 1415 * 1137 * If the input is a string, this function operates in UTF-8; 1138 * If the input is a wstring, this function operates in UTF-16; 1139 * If the input is a dstring, this function operates in UTF-32. 1140 * 1416 * Supercedes: 1141 1417 * This function supercedes std.utf.toUCSindex(). 1142 1418 * 1143 * Standards: Unicode 5.0 1419 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1144 1420 * 1145 1421 * Params: … … 1154 1430 * Returns the array index at which the (n+1)th codepoint begins. 1155 1431 * 1156 * The input to this function MUST be valid UTF.1432 * The input to this function MUST be validly encoded. 1157 1433 * This is enforced by the function's in-contract. 1158 1434 * 1159 * If the input is a string, this function operates in UTF-8; 1160 * If the input is a wstring, this function operates in UTF-16; 1161 * If the input is a dstring, this function operates in UTF-32. 1162 * 1435 * Supercedes: 1163 1436 * This function supercedes std.utf.toUTFindex(). 1164 1437 * 1165 * Standards: Unicode 5.0 1438 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1166 1439 * 1167 1440 * Params: … … 1176 1449 * Decodes a single codepoint. The decoded code units are removed from the start of the string. 1177 1450 * 1178 * The input to this function MUST be valid UTF.1451 * The input to this function MUST be validly encoded. 1179 1452 * This is enforced by the function's in-contract. 
1180 1453 * 1181 * If the input is a string, this function decodes UTF-8; 1182 * If the input is a wstring, this function decodes UTF-16; 1183 * If the input is a dstring, this function decodes UTF-32. 1184 * 1454 * Supercedes: 1185 1455 * This function supercedes std.utf.decode(), however, note that the 1186 * function dchars() superced s it better!1187 * 1188 * Standards: Unicode 5.0 1456 * function dchars() supercedes it more conveniently! 1457 * 1458 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1189 1459 * 1190 1460 * Params: … … 1202 1472 * 1203 1473 * The type of the output cannot be deduced. Therefore, it is necessary to explicitly 1204 * specify the output type. 1205 * 1474 * specify the encoding as a template parameter. 1475 * 1476 * Supercedes: 1206 1477 * This function supercedes std.utf.encode(), however, note that the 1207 * function chars() superced s it better!1208 * 1209 * Standards: Unicode 5.0 1478 * function chars() supercedes it more conveniently! 1479 * 1480 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1210 1481 * 1211 1482 * Params: … … 1215 1486 { 1216 1487 T[] buffer = new T[4]; 1217 uint len = encode ToBuffer(c,buffer);1488 uint len = encode(c,buffer); 1218 1489 return cast(invariant(T)[])(buffer[0..len]); 1219 1490 } … … 1224 1495 * The input to this function MUST be a valid codepoint. 1225 1496 * 1226 * If the buffer is a char[], this function encodes to UTF-8; 1227 * If the buffer is a wchar[], this function encodes to UTF-16; 1228 * If the buffer is a dchar[], this function encodes to UTF-32. 1229 * 1497 * The user-supplied buffer needs to be of type T[], where T is the encoding type 1498 * (currently one of utf8, utf16, utf32, ascii, latin1 or windows1252). Note that utf8, utf16 and utf32 1499 * are aliases for char, wchar and dchar respectively. 
1500 * 1501 * Supercedes: 1230 1502 * This function supercedes std.utf.encode(), however, note that the 1231 * function chars() superced s it better!1232 * 1233 * Standards: Unicode 5.0 1503 * function chars() supercedes it more conveniently! 1504 * 1505 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1234 1506 * 1235 1507 * Params: … … 1237 1509 * buffer = where to store the output 1238 1510 */ 1239 uint encode ToBuffer(T)(dchar c,T[] buffer)1511 uint encode(T)(dchar c,T[] buffer) 1240 1512 in 1241 1513 { … … 1260 1532 * Returns a foreachable struct which can bidirectionally iterate over all codepoints in a string. 1261 1533 * 1262 * The input to this function MUST be valid UTF.1534 * The input to this function MUST be validly encoded. 1263 1535 * This is enforced by the function's in-contract. 1264 1536 * … … 1267 1539 * iteration with the offset into the string at which the codepoint begins. 1268 1540 * 1269 * If the input is a string, this function decodes UTF-8; 1270 * If the input is a wstring, this function decodes UTF-16; 1271 * If the input is a dstring, this function decodes UTF-32. 1272 * 1541 * Supercedes: 1273 1542 * This function supercedes std.utf.decode(). 1274 1543 * 1275 * Standards: Unicode 5.0 1544 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1276 1545 * 1277 1546 * Params: … … 1288 1557 * 1289 1558 * Note that, currently, foreach(c:dchars(s)) is superior to foreach(c;s) 1290 * in that the latter will fall over on encountering U -FFFF.1559 * in that the latter will fall over on encountering U+FFFF. 1291 1560 */ 1292 1561 Dchars!(T) dchars(T)(invariant(T)[] s) … … 1301 1570 * 1302 1571 * The type of the output cannot be deduced. Therefore, it is necessary to explicitly 1303 * specify the output type. 1304 * 1305 * If the output type is char, this function encodes to UTF-8; 1306 * If the output type is wchar, this function encodes to UTF-16; 1307 * If the output type is dchar, this function encodes to UTF-32. 
1308 * 1572 * specify the encoding type in the template parameter. 1573 * 1574 * Supercedes: 1309 1575 * This function supercedes std.utf.encode(). 1310 1576 * 1311 * Standards: Unicode 5.0 1577 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1312 1578 * 1313 1579 * Params: … … 1329 1595 1330 1596 /** 1331 * Convert a string from one UTF to another.1332 * 1333 * The input to this function MUST be valid UTF.1597 * Convert a string from one encoding to another. (See also to!() below). 1598 * 1599 * The input to this function MUST be validly encoded. 1334 1600 * This is enforced by the function's in-contract. 1335 1601 * 1336 * This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and std.utf.toUTF32(). 1337 * 1338 * Standards: Unicode 5.0 1602 * Supercedes: 1603 * This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and std.utf.toUTF32() 1604 * (but note that to!() supercedes it more conveniently). 1605 * 1606 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1339 1607 * 1340 1608 * Params: … … 1346 1614 * wstring ws; 1347 1615 * transcode(ws,"hello world"); 1616 * // transcode from UTF-8 to UTF-16 1617 * 1618 * eString!(latin1) ls; 1619 * transcode(ls, ws); 1620 * // transcode from UTF-16 to ISO-8859-1 1348 1621 * -------------------------------------------------------- 1349 1622 */ … … 1352 1625 static if(is(T==U)) 1353 1626 { 1354 r eturns;1355 } 1356 else static if(is(T==a char))1627 r = s; 1628 } 1629 else static if(is(T==ascii)) 1357 1630 { 1358 1631 transcode!(U,char)(r,cast(string)s); 1359 1632 } 1360 else static if(is(T==lchar))1361 {1362 transcode!(U,char)(r,cast(string)s);1363 }1364 1633 else 1365 1634 { … … 1372 1641 } 1373 1642 } 1643 } 1644 1645 /** 1646 * Convert a string from one encoding to another. (See also transcode() above). 1647 * 1648 * The input to this function MUST be validly encoded. 1649 * This is enforced by the function's in-contract. 
1650 * 1651 * Supercedes: 1652 * This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and std.utf.toUTF32(). 1653 * 1654 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 1655 * 1656 * Params: 1657 * U = the destination encoding type 1658 * s = the source string 1659 * 1660 * Examples: 1661 * ----------------------------------------------------------------------------- 1662 * auto ws = to!(utf16)("hello world"); // transcode from UTF-8 to UTF-16 1663 * auto ls = to!(latin1)(ws); // transcode from UTF-16 to ISO-8859-1 1664 * ----------------------------------------------------------------------------- 1665 */ 1666 invariant(U)[] to(U,T)(invariant(T)[] s) 1667 { 1668 invariant(U)[] r; 1669 transcode(r,s); 1670 return r; 1374 1671 } 1375 1672 … … 1383 1680 return s; 1384 1681 } 1385 else static if(is(T==a char))1682 else static if(is(T==ascii)) 1386 1683 { 1387 1684 transcodeReverse!(U,char)(r,cast(string)s); 1388 1685 } 1389 else static if(is(T==l char))1686 else static if(is(T==latin1)) 1390 1687 { 1391 1688 transcodeReverse!(U,char)(r,cast(string)s);
