Changeset 575
- Timestamp:
- 02/13/08 13:16:04 (10 months ago)
- Files:
-
- candidate/phobos/std/utf2.d (modified) (19 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
candidate/phobos/std/utf2.d
r572 r575 220 220 enum MAX_SEQUENCE_LENGTH = 4; 221 221 222 invariant(char)[] utfName = "UTF-8"; 223 222 224 byte[256] tailTable = 223 225 [ … … 240 242 ]; 241 243 244 bool isValidCodeUnit(T c) 245 { 246 return c < 0x80 || tails(c) >= 0; 247 } 248 242 249 int tails(T c) 243 250 { … … 248 255 { 249 256 return c < 0x80; 257 } 258 259 bool isHead(T c) 260 { 261 return tails(c) > 0; 250 262 } 251 263 … … 274 286 alias wstring string; 275 287 288 invariant(char)[] utfName = "UTF-16"; 289 290 bool isValidCodeUnit(T c) 291 { 292 return true; 293 } 294 276 295 enum MAX_SEQUENCE_LENGTH = 2; 277 296 … … 286 305 } 287 306 307 bool isHead(T c) 308 { 309 return c >= 0xD800 && c < 0xDC00; 310 } 311 288 312 bool isTail(T c) 289 313 { … … 307 331 enum MAX_SEQUENCE_LENGTH = 1; 308 332 333 invariant(char)[] utfName = "UTF-32"; 334 309 335 alias isValidCodepoint isSingle; 310 336 337 alias isValidCodepoint isValidCodeUnit; 338 311 339 int tails(T c) 312 340 { … … 314 342 } 315 343 344 bool isHead(T c) 345 { 346 return false; 347 } 348 316 349 bool isTail(T c) 317 350 { … … 328 361 s ~= cast(T)0xFFFD; 329 362 } 363 } 364 365 uint pseudoSequenceLength(string s) 366 { 367 assert(s.length != 0); 368 if (!isValidCodeUnit(s[0])) return 1; 369 int i = isHead(s[0]) ? 1 : 0; 370 for (; i < s.length; ++i) 371 { 372 if (!isTail(s[i])) break; 373 } 374 assert(i != 0); 375 return i; 330 376 } 331 377 … … 409 455 { 410 456 appendReplacementChar(r); 411 ++i;457 i += pseudoSequenceLength(s[i..$]); 412 458 uint n = validatePartial(s[i..$]); 413 459 r ~= s[i..i+n]; … … 694 740 695 741 /** 742 * Returns the name of a UTF. 743 * 744 * The type of the output cannot be deduced. Therefore, it is necessary to explicitly 745 * specify the output type. 
746 * 747 * If the template type is char, this function returns the string "UTF-8"; 748 * If the template type is wchar, this function returns the string "UTF-16"; 749 * If the template type is dchar, this function returns the string "UTF-32"; 750 * 751 * Standards: Unicode 5.0 752 */ 753 string utfName(T)() 754 { 755 return UTF!(T).utfName; 756 } 757 758 unittest 759 { 760 assert(utfName!(char) == "UTF-8"); 761 assert(utfName!(wchar) == "UTF-16"); 762 assert(utfName!(dchar) == "UTF-32"); 763 } 764 765 /** 696 766 * Returns true if the character is a valid codepoint 697 767 * … … 713 783 714 784 /** 715 * Returns true if the string is a valid UTF encoding785 * Returns true if the string is encoded with a valid UTF encoding 716 786 * 717 787 * If the input is a string, this function tests for UTF-8; … … 738 808 * 739 809 * If the input string is already valid, this function returns the original, otherwise 740 * it constructs a new string by replacing all illegal code unit s with the810 * it constructs a new string by replacing all illegal code unit sequences with the 741 811 * Unicode replacement character, U-FFFD. 742 812 * … … 755 825 } 756 826 827 unittest 828 { 829 assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld"); 830 } 831 757 832 /** 758 833 * Returns the slice of the input string from the first character to the end of the 759 834 * first UTF sequence. The resulting string may consist of multiple code units, but 760 * it will always represent at most one character. (Only if the input is empty761 * will it return the empty string).835 * it will always represent at most one character. If the input is the empty string, 836 * the return value will be the empty string 762 837 * 763 838 * The input to this function MUST be valid UTF. 839 * This is enforced by the function's in-contract. 
764 840 * 765 841 * If the input is a string, this function operates in UTF-8; … … 780 856 * Returns the slice of the input string from the start of the last UTF sequence 781 857 * to the end of the string. The resulting string may consist of multiple code units, 782 * but it will always represent at most one character. (Only if the input is empty 783 * will it return the empty string). 858 * but it will always represent at most one character. If the input is the empty string, 859 * the return value will be the empty string 784 860 * 785 861 * The input to this function MUST be valid UTF. 862 * This is enforced by the function's in-contract. 786 863 * 787 864 * If the input is a string, this function operates in UTF-8; … … 803 880 * 804 881 * The input to this function MUST be valid UTF. 882 * This is enforced by the function's in-contract. 805 883 * 806 884 * If the input is a string, this function operates in UTF-8; … … 824 902 * 825 903 * The input to this function MUST be valid UTF. 904 * This is enforced by the function's in-contract. 826 905 * 827 906 * If the input is a string, this function operates in UTF-8; … … 845 924 * 846 925 * The input to this function MUST be valid UTF. 926 * This is enforced by the function's in-contract. 847 927 * 848 928 * If the input is a string, this function decodes UTF-8; … … 900 980 * 901 981 * The input to this function MUST be valid UTF. 982 * This is enforced by the function's in-contract. 902 983 * 903 984 * You can foreach either … … 970 1051 * 971 1052 * The input to this function MUST be valid UTF. 1053 * This is enforced by the function's in-contract. 972 1054 * 973 1055 * This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and std.utf.toUTF32().
