Changeset 575

Show
Ignore:
Timestamp:
02/13/08 13:16:04 (10 months ago)
Author:
Janice Caron
Message:

Various changes to ddoc comments
Added function utfName!(T)()
Fixed sanitize to recognize whole invalid sequences

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • candidate/phobos/std/utf2.d

    r572 r575  
    220220        enum MAX_SEQUENCE_LENGTH = 4; 
    221221 
     222        invariant(char)[] utfName = "UTF-8"; 
     223 
    222224        byte[256] tailTable = 
    223225        [ 
     
    240242        ]; 
    241243 
     244        bool isValidCodeUnit(T c) 
     245        { 
     246            return c < 0x80 || tails(c) >= 0; 
     247        } 
     248 
    242249        int tails(T c) 
    243250        { 
     
    248255        { 
    249256            return c < 0x80; 
     257        } 
     258 
     259        bool isHead(T c) 
     260        { 
     261            return tails(c) > 0; 
    250262        } 
    251263 
     
    274286        alias wstring string; 
    275287 
     288        invariant(char)[] utfName = "UTF-16"; 
     289 
     290        bool isValidCodeUnit(T c) 
     291        { 
     292            return true; 
     293        } 
     294 
    276295        enum MAX_SEQUENCE_LENGTH = 2; 
    277296 
     
    286305        } 
    287306 
     307        bool isHead(T c) 
     308        { 
     309            return c >= 0xD800 && c < 0xDC00; 
     310        } 
     311 
    288312        bool isTail(T c) 
    289313        { 
     
    307331        enum MAX_SEQUENCE_LENGTH = 1; 
    308332 
     333        invariant(char)[] utfName = "UTF-32"; 
     334 
    309335        alias isValidCodepoint isSingle; 
    310336 
     337        alias isValidCodepoint isValidCodeUnit; 
     338 
    311339        int tails(T c) 
    312340        { 
     
    314342        } 
    315343 
     344        bool isHead(T c) 
     345        { 
     346            return false; 
     347        } 
     348 
    316349        bool isTail(T c) 
    317350        { 
     
    328361            s ~= cast(T)0xFFFD; 
    329362        } 
     363    } 
     364 
     365    uint pseudoSequenceLength(string s) 
     366    { 
     367        assert(s.length != 0); 
     368        if (!isValidCodeUnit(s[0])) return 1; 
     369        int i = isHead(s[0]) ? 1 : 0; 
     370        for (; i < s.length; ++i) 
     371        { 
     372            if (!isTail(s[i])) break; 
     373        } 
     374        assert(i != 0); 
     375        return i; 
    330376    } 
    331377 
     
    409455        { 
    410456            appendReplacementChar(r); 
    411             ++i;
     457            i += pseudoSequenceLength(s[i..$]);
    412458            uint n = validatePartial(s[i..$]); 
    413459            r ~= s[i..i+n]; 
     
    694740 
    695741/** 
     742 * Returns the name of a UTF. 
     743 * 
     744 * The template type cannot be deduced, because the function takes no arguments.
     745 * Therefore, it is necessary to specify the template type explicitly.
     746 * 
     747 * If the template type is char, this function returns the string "UTF-8"; 
     748 * If the template type is wchar, this function returns the string "UTF-16"; 
     749 * If the template type is dchar, this function returns the string "UTF-32"; 
     750 * 
     751 * Standards: Unicode 5.0 
     752 */ 
     753string utfName(T)() 
     754{ 
     755    return UTF!(T).utfName; 
     756} 
     757 
     758unittest 
     759{ 
     760    assert(utfName!(char) == "UTF-8"); 
     761    assert(utfName!(wchar) == "UTF-16"); 
     762    assert(utfName!(dchar) == "UTF-32"); 
     763} 
     764 
     765/** 
    696766 * Returns true if the character is a valid codepoint 
    697767 * 
     
    713783 
    714784/** 
    715  * Returns true if the string is a valid UTF encoding 
     785 * Returns true if the string is encoded with a valid UTF encoding 
    716786 * 
    717787 * If the input is a string, this function tests for UTF-8; 
     
    738808 * 
    739809 * If the input string is already valid, this function returns the original, otherwise 
    740  * it constructs a new string by replacing all illegal code units with the 
     810 * it constructs a new string by replacing all illegal code unit sequences with the 
    741811 * Unicode replacement character, U+FFFD.
    742812 * 
     
    755825} 
    756826 
     827unittest 
     828{ 
     829    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld"); 
     830} 
     831 
    757832/** 
    758833 * Returns the slice of the input string from the first character to the end of the 
    759834 * first UTF sequence. The resulting string may consist of multiple code units, but 
    760  * it will always represent at most one character. (Only if the input is empty 
    761  * will it return the empty string). 
     835 * it will always represent at most one character. If the input is the empty string,
     836 * the return value will be the empty string.
    762837 * 
    763838 * The input to this function MUST be valid UTF. 
     839 * This is enforced by the function's in-contract. 
    764840 * 
    765841 * If the input is a string, this function operates in UTF-8; 
     
    780856 * Returns the slice of the input string from the start of the last UTF sequence 
    781857 * to the end of the string. The resulting string may consist of multiple code units, 
    782  * but it will always represent at most one character. (Only if the input is empty 
    783  * will it return the empty string). 
     858 * but it will always represent at most one character. If the input is the empty string,
     859 * the return value will be the empty string.
    784860 * 
    785861 * The input to this function MUST be valid UTF. 
     862 * This is enforced by the function's in-contract. 
    786863 * 
    787864 * If the input is a string, this function operates in UTF-8; 
     
    803880 * 
    804881 * The input to this function MUST be valid UTF. 
     882 * This is enforced by the function's in-contract. 
    805883 * 
    806884 * If the input is a string, this function operates in UTF-8; 
     
    824902 * 
    825903 * The input to this function MUST be valid UTF. 
     904 * This is enforced by the function's in-contract. 
    826905 * 
    827906 * If the input is a string, this function operates in UTF-8; 
     
    845924 * 
    846925 * The input to this function MUST be valid UTF. 
     926 * This is enforced by the function's in-contract. 
    847927 * 
    848928 * If the input is a string, this function decodes UTF-8; 
     
    900980 * 
    901981 * The input to this function MUST be valid UTF. 
     982 * This is enforced by the function's in-contract. 
    902983 * 
    903984 * You can foreach either 
     
    9701051 * 
    9711052 * The input to this function MUST be valid UTF. 
     1053 * This is enforced by the function's in-contract. 
    9721054 * 
    9731055 * This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and std.utf.toUTF32().