Changeset 292
- Timestamp: 12/12/09 08:17:20 (15 years ago)
- Files:
-
- branches/dmd-1.x/src/lexer.c (modified) (3 diffs)
- trunk/src/lexer.c (modified) (3 diffs)
- trunk/src/utf.c (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
branches/dmd-1.x/src/lexer.c
r248 r292 617 617 #endif 618 618 case 'a': case 'b': case 'c': case 'd': case 'e': 619 619 case 'f': case 'g': case 'h': case 'i': case 'j': 620 620 case 'k': case 'm': case 'n': case 'o': 621 621 #if DMDV2 622 622 case 'p': /*case 'q': case 'r':*/ case 's': case 't': 623 623 #else 624 624 case 'p': case 'q': /*case 'r':*/ case 's': case 't': 625 625 #endif 626 626 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y': 627 627 case 'z': 628 628 case 'A': case 'B': case 'C': case 'D': case 'E': 629 629 case 'F': case 'G': case 'H': case 'I': case 'J': 630 630 case 'K': case 'M': case 'N': case 'O': 631 631 case 'P': case 'Q': case 'R': case 'S': case 'T': 632 632 case 'U': case 'V': case 'W': case 'X': case 'Y': 633 633 case 'Z': 634 634 case '_': 635 635 case_ident: 636 636 { unsigned char c; 637 StringValue *sv; 638 Identifier *id; 639 640 do 637 638 while (1) 641 639 { 642 640 c = *++p; 643 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF()))); 644 sv = stringtable.update((char *)t->ptr, p - t->ptr); 645 id = (Identifier *) sv->ptrvalue; 641 if (isidchar(c)) 642 continue; 643 else if (c & 0x80) 644 { unsigned char *s = p; 645 unsigned u = decodeUTF(); 646 if (isUniAlpha(u)) 647 continue; 648 error("char 0x%04x not allowed in identifier", u); 649 p = s; 650 } 651 break; 652 } 653 654 StringValue *sv = stringtable.update((char *)t->ptr, p - t->ptr); 655 Identifier *id = (Identifier *) sv->ptrvalue; 646 656 if (!id) 647 657 { id = new Identifier(sv->lstring.string,TOKidentifier); 648 658 sv->ptrvalue = id; 649 659 } 650 660 t->ident = id; 651 661 t->value = (enum TOK) id->value; 652 662 anyToken = 1; 653 663 if (*t->ptr == '_') // if special identifier token 654 664 { 655 665 static char date[11+1]; 656 666 static char time[8+1]; 657 667 static char timestamp[24+1]; 658 668 659 669 if (!date[0]) // lazy evaluation 660 670 { time_t t; 661 671 char *p; 662 672 663 673 ::time(&t); 664 674 p = ctime(&t); 665 675 assert(p); … … 1160 1170 if (*p == c2) \ 1161 1171 
{ p++; \ 1162 1172 t->value = tok2; \ 1163 1173 } \ 1164 1174 else \ 1165 1175 t->value = tok1; \ 1166 1176 return; 1167 1177 1168 1178 DOUBLE('*', TOKmul, '=', TOKmulass) 1169 1179 DOUBLE('%', TOKmod, '=', TOKmodass) 1170 1180 DOUBLE('^', TOKxor, '=', TOKxorass) 1171 1181 1172 1182 #undef DOUBLE 1173 1183 1174 1184 case '#': 1175 1185 p++; 1176 1186 pragma(); 1177 1187 continue; 1178 1188 1179 1189 default: 1180 { unsigned char c = *p; 1190 { unsigned c = *p; 1181 1191 1182 1192 if (c & 0x80) 1183 { unsigned u = decodeUTF(); 1193 { c = decodeUTF(); 1184 1194 1185 1195 // Check for start of unicode identifier 1186 if (isUniAlpha(u)) 1196 if (isUniAlpha(c)) 1187 1197 goto case_ident; 1188 1198 1189 if (u == PS || u == LS) 1199 if (c == PS || c == LS) 1190 1200 { 1191 1201 loc.linnum++; 1192 1202 p++; 1193 1203 continue; 1194 1204 } 1195 1205 } 1196 if (isprint(c)) 1206 if (c < 0x80 && isprint(c)) 1197 1207 error("unsupported char '%c'", c); 1198 1208 else 1199 1209 error("unsupported char 0x%02x", c); 1200 1210 p++; 1201 1211 continue; 1202 1212 } 1203 1213 } 1204 1214 } 1205 1215 } 1206 1216 1207 1217 /******************************************* 1208 1218 * Parse escape sequence. 
1209 1219 */ 1210 1220 1211 1221 unsigned Lexer::escapeSequence() 1212 1222 { unsigned c; 1213 1223 int n; 1214 1224 int ndigits; 1215 1225 1216 1226 c = *p; … … 1438 1448 stringbuffer.writeByte(0); 1439 1449 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); 1440 1450 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); 1441 1451 stringPostfix(t); 1442 1452 return TOKstring; 1443 1453 1444 1454 default: 1445 1455 if (c >= '0' && c <= '9') 1446 1456 c -= '0'; 1447 1457 else if (c >= 'a' && c <= 'f') 1448 1458 c -= 'a' - 10; 1449 1459 else if (c >= 'A' && c <= 'F') 1450 1460 c -= 'A' - 10; 1451 1461 else if (c & 0x80) 1452 1462 { p--; 1453 1463 unsigned u = decodeUTF(); 1454 1464 p++; 1455 1465 if (u == PS || u == LS) 1456 1466 loc.linnum++; 1457 1467 else 1458 error("non-hex character \\u%x", u); 1468 error("non-hex character \\u%04x", u); 1459 1469 } 1460 1470 else 1461 1471 error("non-hex character '%c'", c); 1462 1472 if (n & 1) 1463 1473 { v = (v << 4) | c; 1464 1474 stringbuffer.writeByte(v); 1465 1475 } 1466 1476 else 1467 1477 v = c; 1468 1478 n++; 1469 1479 break; 1470 1480 } 1471 1481 } 1472 1482 } 1473 1483 1474 1484 1475 1485 #if DMDV2 1476 1486 /************************************** 1477 1487 * Lex delimited strings: 1478 1488 * q"(foo(xxx))" // "foo(xxx)" trunk/src/lexer.c
r269 r292 630 630 #endif 631 631 case 'a': case 'b': case 'c': case 'd': case 'e': 632 632 case 'f': case 'g': case 'h': case 'i': case 'j': 633 633 case 'k': case 'm': case 'n': case 'o': 634 634 #if DMDV2 635 635 case 'p': /*case 'q': case 'r':*/ case 's': case 't': 636 636 #else 637 637 case 'p': case 'q': /*case 'r':*/ case 's': case 't': 638 638 #endif 639 639 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y': 640 640 case 'z': 641 641 case 'A': case 'B': case 'C': case 'D': case 'E': 642 642 case 'F': case 'G': case 'H': case 'I': case 'J': 643 643 case 'K': case 'M': case 'N': case 'O': 644 644 case 'P': case 'Q': case 'R': case 'S': case 'T': 645 645 case 'U': case 'V': case 'W': case 'X': case 'Y': 646 646 case 'Z': 647 647 case '_': 648 648 case_ident: 649 649 { unsigned char c; 650 StringValue *sv; 651 Identifier *id; 652 653 do 650 651 while (1) 654 652 { 655 653 c = *++p; 656 } while (isidchar(c) || (c & 0x80 && isUniAlpha(decodeUTF()))); 657 sv = stringtable.update((char *)t->ptr, p - t->ptr); 658 id = (Identifier *) sv->ptrvalue; 654 if (isidchar(c)) 655 continue; 656 else if (c & 0x80) 657 { unsigned char *s = p; 658 unsigned u = decodeUTF(); 659 if (isUniAlpha(u)) 660 continue; 661 error("char 0x%04x not allowed in identifier", u); 662 p = s; 663 } 664 break; 665 } 666 667 StringValue *sv = stringtable.update((char *)t->ptr, p - t->ptr); 668 Identifier *id = (Identifier *) sv->ptrvalue; 659 669 if (!id) 660 670 { id = new Identifier(sv->lstring.string,TOKidentifier); 661 671 sv->ptrvalue = id; 662 672 } 663 673 t->ident = id; 664 674 t->value = (enum TOK) id->value; 665 675 anyToken = 1; 666 676 if (*t->ptr == '_') // if special identifier token 667 677 { 668 678 static char date[11+1]; 669 679 static char time[8+1]; 670 680 static char timestamp[24+1]; 671 681 672 682 if (!date[0]) // lazy evaluation 673 683 { time_t t; 674 684 char *p; 675 685 676 686 ::time(&t); 677 687 p = ctime(&t); 678 688 assert(p); … … 1198 1208 { p++; \ 1199 1209 
t->value = tok2; \ 1200 1210 } \ 1201 1211 else \ 1202 1212 t->value = tok1; \ 1203 1213 return; 1204 1214 1205 1215 DOUBLE('*', TOKmul, '=', TOKmulass) 1206 1216 DOUBLE('%', TOKmod, '=', TOKmodass) 1207 1217 #if DMDV1 1208 1218 DOUBLE('^', TOKxor, '=', TOKxorass) 1209 1219 #endif 1210 1220 #undef DOUBLE 1211 1221 1212 1222 case '#': 1213 1223 p++; 1214 1224 pragma(); 1215 1225 continue; 1216 1226 1217 1227 default: 1218 { unsigned char c = *p; 1228 { unsigned c = *p; 1219 1229 1220 1230 if (c & 0x80) 1221 { unsigned u = decodeUTF(); 1231 { c = decodeUTF(); 1222 1232 1223 1233 // Check for start of unicode identifier 1224 if (isUniAlpha(u)) 1234 if (isUniAlpha(c)) 1225 1235 goto case_ident; 1226 1236 1227 if (u == PS || u == LS) 1237 if (c == PS || c == LS) 1228 1238 { 1229 1239 loc.linnum++; 1230 1240 p++; 1231 1241 continue; 1232 1242 } 1233 1243 } 1234 if (isprint(c)) 1244 if (c < 0x80 && isprint(c)) 1235 1245 error("unsupported char '%c'", c); 1236 1246 else 1237 1247 error("unsupported char 0x%02x", c); 1238 1248 p++; 1239 1249 continue; 1240 1250 } 1241 1251 } 1242 1252 } 1243 1253 } 1244 1254 1245 1255 /******************************************* 1246 1256 * Parse escape sequence. 
1247 1257 */ 1248 1258 1249 1259 unsigned Lexer::escapeSequence() 1250 1260 { unsigned c = *p; 1251 1261 1252 1262 #ifdef TEXTUAL_ASSEMBLY_OUT 1253 1263 return c; 1254 1264 #endif … … 1479 1489 stringbuffer.writeByte(0); 1480 1490 t->ustring = (unsigned char *)mem.malloc(stringbuffer.offset); 1481 1491 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); 1482 1492 stringPostfix(t); 1483 1493 return TOKstring; 1484 1494 1485 1495 default: 1486 1496 if (c >= '0' && c <= '9') 1487 1497 c -= '0'; 1488 1498 else if (c >= 'a' && c <= 'f') 1489 1499 c -= 'a' - 10; 1490 1500 else if (c >= 'A' && c <= 'F') 1491 1501 c -= 'A' - 10; 1492 1502 else if (c & 0x80) 1493 1503 { p--; 1494 1504 unsigned u = decodeUTF(); 1495 1505 p++; 1496 1506 if (u == PS || u == LS) 1497 1507 loc.linnum++; 1498 1508 else 1499 error("non-hex character \\u%x", u); 1509 error("non-hex character \\u%04x", u); 1500 1510 } 1501 1511 else 1502 1512 error("non-hex character '%c'", c); 1503 1513 if (n & 1) 1504 1514 { v = (v << 4) | c; 1505 1515 stringbuffer.writeByte(v); 1506 1516 } 1507 1517 else 1508 1518 v = c; 1509 1519 n++; 1510 1520 break; 1511 1521 } 1512 1522 } 1513 1523 } 1514 1524 1515 1525 1516 1526 #if DMDV2 1517 1527 /************************************** 1518 1528 * Lex delimited strings: 1519 1529 * q"(foo(xxx))" // "foo(xxx)" trunk/src/utf.c
r189 r292 1 1 // utf.c 2 // Copyright (c) 2003 by Digital Mars 2 // Copyright (c) 2003-2009 by Digital Mars 3 3 // All Rights Reserved 4 4 // written by Walter Bright 5 5 // http://www.digitalmars.com 6 6 // License for redistribution is by either the Artistic License 7 7 // in artistic.txt, or the GNU General Public License in gnu.txt. 8 8 // See the included readme.txt for details. 9 9 10 10 // Description of UTF-8 at: 11 11 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 12 12 13 13 #include <stdio.h> 14 14 #include <assert.h> 15 15 16 16 #include "utf.h" 17 17 18 18 int utf_isValidDchar(dchar_t c) 19 19 { 20 20 return c < 0xD800 || 21 21 (c > 0xDFFF && c <= 0x10FFFF && c != 0xFFFE && c != 0xFFFF); 22 22 } 23 23 24 24 /******************************************** 25 25 * Decode a single UTF-8 character sequence. 26 26 * Returns: 27 27 * NULL success 28 28 * !=NULL error message string 29 29 */ 30 30 31 31 const char *utf_decodeChar(unsigned char *s, size_t len, size_t *pidx, dchar_t *presult) 32 32 { 33 33 dchar_t V; 34 34 size_t i = *pidx; 35 35 unsigned char u = s[i]; 36 37 //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len); 36 38 37 39 assert(i >= 0 && i < len); 38 40 39 41 if (u & 0x80) 40 42 { unsigned n; 41 43 unsigned char u2; 42 44 43 45 /* The following encodings are valid, except for the 5 and 6 byte 44 46 * combinations: 45 47 * 0xxxxxxx 46 48 * 110xxxxx 10xxxxxx 47 49 * 1110xxxx 10xxxxxx 10xxxxxx 48 50 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 49 51 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 50 52 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 51 53 */ 52 54 for (n = 1; ; n++) 53 55 { 54 56 if (n > 4) 55 57 goto Lerr; // only do the first 4 of 6 encodings
