Changeset 568

Show
Ignore:
Timestamp:
02/10/08 16:56:23 (10 months ago)
Author:
Janice Caron
Message:

isChar(), isLetter(), etc. now function correctly for all Unicode characters

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • candidate/phobos/xml.d

    r566 r568  
    9393 * Returns true if the character is a character according to the XML standard 
    9494 * 
     95 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
     96 * 
    9597 * Params: 
    9698 *    c = the character to be tested 
    97  * 
    98  * Bugs: Currently gives incorrect result if character is not ASCII 
    9999 */ 
    100 bool isChar(dchar c) 
    101 
    102     return true; 
     100bool isChar(dchar c) // rule 2 
     101
     102    return lookup(CharTable,c); 
     103
     104 
     105unittest 
     106
     107//  const CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,0x10000,0x10FFFF]; 
     108    assert(!isChar(cast(dchar)0x8)); 
     109    assert( isChar(cast(dchar)0x9)); 
     110    assert( isChar(cast(dchar)0xA)); 
     111    assert(!isChar(cast(dchar)0xB)); 
     112    assert(!isChar(cast(dchar)0xC)); 
     113    assert( isChar(cast(dchar)0xD)); 
     114    assert(!isChar(cast(dchar)0xE)); 
     115    assert(!isChar(cast(dchar)0x1F)); 
     116    assert( isChar(cast(dchar)0x20)); 
     117    assert( isChar('J')); 
     118    assert( isChar(cast(dchar)0xD7FF)); 
     119    assert(!isChar(cast(dchar)0xD800)); 
     120    assert(!isChar(cast(dchar)0xDFFF)); 
     121    assert( isChar(cast(dchar)0xE000)); 
     122    assert( isChar(cast(dchar)0xFFFD)); 
     123    assert(!isChar(cast(dchar)0xFFFE)); 
     124    assert(!isChar(cast(dchar)0xFFFF)); 
     125    assert( isChar(cast(dchar)0x10000)); 
     126    assert( isChar(cast(dchar)0x10FFFF)); 
     127    assert(!isChar(cast(dchar)0x110000)); 
     128
     129 
     130/** 
     131 * Returns true if the character is a digit according to the XML standard 
     132 * 
     133 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
     134 * 
     135 * Params: 
     136 *    c = the character to be tested 
     137 */ 
     138bool isDigit(dchar c) 
     139
     140    return lookup(DigitTable,c); 
    103141} 
    104142 
     
    106144 * Returns true if the character is a letter according to the XML standard 
    107145 * 
     146 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
     147 * 
    108148 * Params: 
    109149 *    c = the character to be tested 
    110  * 
    111  * Bugs: Currently gives incorrect result if character is not ASCII 
    112150 */ 
    113 bool isLetter(dchar c) 
    114 { 
    115     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); 
     151bool isLetter(dchar c) // rule 84 
     152{ 
     153    return isIdeographicChar(c) || isBaseChar(c); 
    116154} 
    117155 
    118156/** 
    119  * Returns true if the character is a digit according to the XML standard 
     157 * Returns true if the character is an ideographic character according to the XML standard 
     158 * 
     159 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
    120160 * 
    121161 * Params: 
    122162 *    c = the character to be tested 
    123  * 
    124  * Bugs: Currently gives incorrect result if character is not ASCII 
    125163 */ 
    126 bool isDigit(dchar c) 
    127 
    128     return c >= '0' && c <= '9'; 
     164bool isIdeographicChar(dchar c) 
     165
     166    return lookup(IdeographicTable,c); 
     167
     168 
     169/** 
     170 * Returns true if the character is a base character according to the XML standard 
     171 * 
     172 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
     173 * 
     174 * Params: 
     175 *    c = the character to be tested 
     176 */ 
     177bool isBaseChar(dchar c) 
     178
     179    return lookup(BaseCharTable,c); 
    129180} 
    130181 
     
    132183 * Returns true if the character is a combining character according to the XML standard 
    133184 * 
     185 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
     186 * 
    134187 * Params: 
    135188 *    c = the character to be tested 
    136  * 
    137  * Bugs: Currently gives incorrect result if character is not ASCII 
    138189 */ 
    139190bool isCombiningChar(dchar c) 
    140191{ 
    141     return false
     192    return lookup(CombiningCharTable,c)
    142193} 
    143194 
     
    145196 * Returns true if the character is an extender according to the XML standard 
    146197 * 
     198 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
     199 * 
    147200 * Params: 
    148201 *    c = the character to be tested 
    149  * 
    150  * Bugs: Currently gives incorrect result if character is not ASCII 
    151202 */ 
    152203bool isExtender(dchar c) 
    153204{ 
    154     return false
     205    return lookup(ExtenderTable,c)
    155206} 
    156207 
     
    163214 * If the string is not modified, the original will be returned. 
    164215 * 
    165  * Standards: XML 1.0 
     216 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
    166217 * 
    167218 * Params: 
     
    204255 * as well as decimal and hexadecimal entities such as &amp;#x20AC; 
    205256 * 
    206  * If the string is not modified, the original will be returned. 
    207  * 
    208  * Standards: XML 1.0 
     257 * If the string does not contain an ampersand, the original will be returned. 
     258 * 
     259 * Note that if the "strict" parameter is false, then illegal ampersands will be ignored 
     260 * (that is, "cat &amp; dog" will decode to "cat &amp; dog"), whereas, if the strict parameter 
     261 * is true, then illegal sequences will cause decoding to fail. 
     262 * 
     263 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
    209264 * 
    210265 * Params: 
     
    308363 * Class representing an XML document. 
    309364 * 
    310  * Standards: XML 1.0 
     365 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
    311366 * 
    312367 */ 
     
    411466 * Class representing an XML element. 
    412467 * 
    413  * Standards: XML 1.0 
     468 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
    414469 * 
    415470 * The class invariant guarantees that the content is well-formed XML 
     
    622677 * Class representing an XML tag. 
    623678 * 
    624  * Standards: XML 1.0 
     679 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
    625680 * 
    626681 * The class invariant guarantees 
     
    12461301 * This is a subclass of ElementParser. Most of the useful functions are documented there. 
    12471302 * 
    1248  * Standards: XML 1.0 
     1303 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
    12491304 * 
    12501305 * Bugs: 
     
    12721327 * Class for parsing an XML element. 
    12731328 * 
    1274  * Standards: XML 1.0 
     1329 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210) 
    12751330 * 
    12761331 * Note that you cannot construct instances of this class directly. You can construct a DocumentParser 
     
    22322287    } 
    22332288 
    2234     string chop(ref string s, uint n) 
     2289    string chop(ref string s, int n) 
    22352290    { 
    22362291        if (n == -1) n = s.length; 
     
    22592314    } 
    22602315 
     2316    // Definitions from the XML specification 
     2317    const CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,0x10000,0x10FFFF]; 
     2318    const BaseCharTable=[0x0041,0x005A,0x0061,0x007A,0x00C0,0x00D6,0x00D8,0x00F6,0x00F8,0x00FF,0x0100,0x0131,0x0134,0x013E,0x0141,0x0148,0x014A,0x017E,0x0180,0x01C3,0x01CD,0x01F0,0x01F4,0x01F5,0x01FA,0x0217,0x0250,0x02A8,0x02BB,0x02C1,0x0386,0x0386,0x0388,0x038A,0x038C,0x038C,0x038E,0x03A1,0x03A3,0x03CE,0x03D0,0x03D6,0x03DA,0x03DA,0x03DC,0x03DC,0x03DE,0x03DE,0x03E0,0x03E0,0x03E2,0x03F3,0x0401,0x040C,0x040E,0x044F,0x0451,0x045C,0x045E,0x0481,0x0490,0x04C4,0x04C7,0x04C8,0x04CB,0x04CC,0x04D0,0x04EB,0x04EE,0x04F5,0x04F8,0x04F9,0x0531,0x0556,0x0559,0x0559,0x0561,0x0586,0x05D0,0x05EA,0x05F0,0x05F2,0x0621,0x063A,0x0641,0x064A,0x0671,0x06B7,0x06BA,0x06BE,0x06C0,0x06CE,0x06D0,0x06D3,0x06D5,0x06D5,0x06E5,0x06E6,0x0905,0x0939,0x093D,0x093D,0x0958,0x0961,0x0985,0x098C,0x098F,0x0990,0x0993,0x09A8,0x09AA,0x09B0,0x09B2,0x09B2,0x09B6,0x09B9,0x09DC,0x09DD,0x09DF,0x09E1,0x09F0,0x09F1,0x0A05,0x0A0A,0x0A0F,0x0A10,0x0A13,0x0A28,0x0A2A,0x0A30,0x0A32,0x0A33,0x0A35,0x0A36,0x0A38,0x0A39,0x0A59,0x0A5C,0x0A5E,0x0A5E,0x0A72,0x0A74,0x0A85,0x0A8B,0x0A8D,0x0A8D,0x0A8F,0x0A91,0x0A93,0x0AA8,0x0AAA,0x0AB0,0x0AB2,0x0AB3,0x0AB5,0x0AB9,0x0ABD,0x0ABD,0x0AE0,0x0AE0,0x0B05,0x0B0C,0x0B0F,0x0B10,0x0B13,0x0B28,0x0B2A,0x0B30,0x0B32,0x0B33,0x0B36,0x0B39,0x0B3D,0x0B3D,0x0B5C,0x0B5D,0x0B5F,0x0B61,0x0B85,0x0B8A,0x0B8E,0x0B90,0x0B92,0x0B95,0x0B99,0x0B9A,0x0B9C,0x0B9C,0x0B9E,0x0B9F,0x0BA3,0x0BA4,0x0BA8,0x0BAA,0x0BAE,0x0BB5,0x0BB7,0x0BB9,0x0C05,0x0C0C,0x0C0E,0x0C10,0x0C12,0x0C28,0x0C2A,0x0C33,0x0C35,0x0C39,0x0C60,0x0C61,0x0C85,0x0C8C,0x0C8E,0x0C90,0x0C92,0x0CA8,0x0CAA,0x0CB3,0x0CB5,0x0CB9,0x0CDE,0x0CDE,0x0CE0,0x0CE1,0x0D05,0x0D0C,0x0D0E,0x0D10,0x0D12,0x0D28,0x0D2A,0x0D39,0x0D60,0x0D61,0x0E01,0x0E2E,0x0E30,0x0E30,0x0E32,0x0E33,0x0E40,0x0E45,0x0E81,0x0E82,0x0E84,0x0E84,0x0E87,0x0E88,0x0E8A,0x0E8A,0x0E8D,0x0E8D,0x0E94,0x0E97,0x0E99,0x0E9F,0x0EA1,0x0EA3,0x0EA5,0x0EA5,0x0EA7,0x0EA7,0x0EAA,0x0EAB,0x0EAD,0x0EAE,0x0EB0,0x0EB0,0x0EB2,0x0EB3,0x0EBD,0x0EBD,0x0EC0,0x0EC4,0x0F40,0x0F47,0x0F49,0x0F69,0x10A0,0x10C5,0x10D0
,0x10F6,0x1100,0x1100,0x1102,0x1103,0x1105,0x1107,0x1109,0x1109,0x110B,0x110C,0x110E,0x1112,0x113C,0x113C,0x113E,0x113E,0x1140,0x1140,0x114C,0x114C,0x114E,0x114E,0x1150,0x1150,0x1154,0x1155,0x1159,0x1159,0x115F,0x1161,0x1163,0x1163,0x1165,0x1165,0x1167,0x1167,0x1169,0x1169,0x116D,0x116E,0x1172,0x1173,0x1175,0x1175,0x119E,0x119E,0x11A8,0x11A8,0x11AB,0x11AB,0x11AE,0x11AF,0x11B7,0x11B8,0x11BA,0x11BA,0x11BC,0x11C2,0x11EB,0x11EB,0x11F0,0x11F0,0x11F9,0x11F9,0x1E00,0x1E9B,0x1EA0,0x1EF9,0x1F00,0x1F15,0x1F18,0x1F1D,0x1F20,0x1F45,0x1F48,0x1F4D,0x1F50,0x1F57,0x1F59,0x1F59,0x1F5B,0x1F5B,0x1F5D,0x1F5D,0x1F5F,0x1F7D,0x1F80,0x1FB4,0x1FB6,0x1FBC,0x1FBE,0x1FBE,0x1FC2,0x1FC4,0x1FC6,0x1FCC,0x1FD0,0x1FD3,0x1FD6,0x1FDB,0x1FE0,0x1FEC,0x1FF2,0x1FF4,0x1FF6,0x1FFC,0x2126,0x2126,0x212A,0x212B,0x212E,0x212E,0x2180,0x2182,0x3041,0x3094,0x30A1,0x30FA,0x3105,0x312C,0xAC00,0xD7A3]; 
     2319    const IdeographicTable=[0x4E00,0x9FA5,0x3007,0x3007,0x3021,0x3029]; 
     2320    const CombiningCharTable=[0x0300,0x0345,0x0360,0x0361,0x0483,0x0486,0x0591,0x05A1,0x05A3,0x05B9,0x05BB,0x05BD,0x05BF,0x05BF,0x05C1,0x05C2,0x05C4,0x05C4,0x064B,0x0652,0x0670,0x0670,0x06D6,0x06DC,0x06DD,0x06DF,0x06E0,0x06E4,0x06E7,0x06E8,0x06EA,0x06ED,0x0901,0x0903,0x093C,0x093C,0x093E,0x094C,0x094D,0x094D,0x0951,0x0954,0x0962,0x0963,0x0981,0x0983,0x09BC,0x09BC,0x09BE,0x09BE,0x09BF,0x09BF,0x09C0,0x09C4,0x09C7,0x09C8,0x09CB,0x09CD,0x09D7,0x09D7,0x09E2,0x09E3,0x0A02,0x0A02,0x0A3C,0x0A3C,0x0A3E,0x0A3E,0x0A3F,0x0A3F,0x0A40,0x0A42,0x0A47,0x0A48,0x0A4B,0x0A4D,0x0A70,0x0A71,0x0A81,0x0A83,0x0ABC,0x0ABC,0x0ABE,0x0AC5,0x0AC7,0x0AC9,0x0ACB,0x0ACD,0x0B01,0x0B03,0x0B3C,0x0B3C,0x0B3E,0x0B43,0x0B47,0x0B48,0x0B4B,0x0B4D,0x0B56,0x0B57,0x0B82,0x0B83,0x0BBE,0x0BC2,0x0BC6,0x0BC8,0x0BCA,0x0BCD,0x0BD7,0x0BD7,0x0C01,0x0C03,0x0C3E,0x0C44,0x0C46,0x0C48,0x0C4A,0x0C4D,0x0C55,0x0C56,0x0C82,0x0C83,0x0CBE,0x0CC4,0x0CC6,0x0CC8,0x0CCA,0x0CCD,0x0CD5,0x0CD6,0x0D02,0x0D03,0x0D3E,0x0D43,0x0D46,0x0D48,0x0D4A,0x0D4D,0x0D57,0x0D57,0x0E31,0x0E31,0x0E34,0x0E3A,0x0E47,0x0E4E,0x0EB1,0x0EB1,0x0EB4,0x0EB9,0x0EBB,0x0EBC,0x0EC8,0x0ECD,0x0F18,0x0F19,0x0F35,0x0F35,0x0F37,0x0F37,0x0F39,0x0F39,0x0F3E,0x0F3E,0x0F3F,0x0F3F,0x0F71,0x0F84,0x0F86,0x0F8B,0x0F90,0x0F95,0x0F97,0x0F97,0x0F99,0x0FAD,0x0FB1,0x0FB7,0x0FB9,0x0FB9,0x20D0,0x20DC,0x20E1,0x20E1,0x302A,0x302F,0x3099,0x3099,0x309A,0x309A]; 
     2321    const DigitTable=[0x0030,0x0039,0x0660,0x0669,0x06F0,0x06F9,0x0966,0x096F,0x09E6,0x09EF,0x0A66,0x0A6F,0x0AE6,0x0AEF,0x0B66,0x0B6F,0x0BE7,0x0BEF,0x0C66,0x0C6F,0x0CE6,0x0CEF,0x0D66,0x0D6F,0x0E50,0x0E59,0x0ED0,0x0ED9,0x0F20,0x0F29]; 
     2322    const ExtenderTable=[0x00B7,0x00B7,0x02D0,0x02D0,0x02D1,0x02D1,0x0387,0x0387,0x0640,0x0640,0x0E46,0x0E46,0x0EC6,0x0EC6,0x3005,0x3005,0x3031,0x3035,0x309D,0x309E,0x30FC,0x30FE]; 
     2323 
     2324    bool lookup(const(int)[] table, int c) 
     2325    { 
     2326        while (table.length != 0) 
     2327        { 
     2328            int m = (table.length >> 1) & ~1; 
     2329            if (c < table[m]) 
     2330            { 
     2331                table = table[0..m]; 
     2332            } 
     2333            else if (c > table[m+1]) 
     2334            { 
     2335                table = table[m+2..$]; 
     2336            } 
     2337            else return true; 
     2338        } 
     2339        return false; 
     2340    } 
     2341 
    22612342    string startOf(string s) 
    22622343    {