Download Reference Manual
The Developer's Library for D
About Wiki Forums Source Search Contact

Ticket #641: reader.pl

File reader.pl, 8.5 kB (added by ptriller, 1 year ago)
Line 
#!/usr/bin/perl -w

use strict;

# Load the unconditional entries of SpecialCasing.txt into %data.
#
# Key:   the first field of the entry (the code point as a hex string).
# Value: the comment-stripped line itself; it is split again later when the
#        special-case table is written.
#
# Entries with a non-empty fifth field (the condition list) are reported on
# STDOUT and skipped.
open(my $special_fh, '<', 'SpecialCasing.txt')
    or die "Unable to open SpecialCasing.txt file: $!";

my %data;
while (my $entry = <$special_fh>) {
    $entry =~ s/\s*#.*//;          # drop the trailing comment; the newline stays
    next if $entry =~ /^\s*$/;     # blank or comment-only line

    my @line = split /\s*;\s*/, $entry;
    if (@line > 4 && length($line[4]) > 0) {
        print "Conditional Mapping skipped: " . $entry;
        next;
    }
    $data{$line[0]} = $entry;
}

close($special_fh);
21
22
# UnicodeData.txt is the main input; src/UnicodeData.d is the generated D
# module. Both handles stay barewords because the rest of the script refers
# to them as FILE and OUTPUT.
open(FILE, '<', 'UnicodeData.txt')
    or die "Unable to open UnicodeData.txt: $!";
open(OUTPUT, '>', 'src/UnicodeData.d')
    or die "Unable to open output file src/UnicodeData.d: $!";
25
# Write the fixed head of the generated D module: license banner, the struct
# and enum declarations, the lookup tables, and the opening of the private
# UnicodeData array that the following loop fills in.
#
# The heredoc is single-quoted on purpose: the banner contains the DDoc macro
# $(LICENSE), and in an interpolating heredoc Perl would expand "$(" (the
# real group ID list) into the generated file.
print OUTPUT <<'EOF';
/*******************************************************************************

        copyright:      Copyright (c) 2007 Peter Triller. All rights reserved

        license:        BSD style: $(LICENSE)

        version:        Initial release: Sept 2007

        authors:        Peter

        Contains the Unicode Data files converted to structs and simple
        accessor functions. This file is generated by a Perl script. All
        necessary changes should be made in the script, not in this file.

*******************************************************************************/

module UnicodeData;

struct UnicodeData {
    
    enum GeneralCategory {
        Lu = 1 <<  0, //  Letter, Uppercase
        Ll = 1 <<  1, //  Letter, Lowercase
        Lt = 1 <<  2, //  Letter, Titlecase
        Lm = 1 <<  3, //  Letter, Modifier
        Lo = 1 <<  4, //  Letter, Other
        Mn = 1 <<  5, //  Mark, Nonspacing
        Mc = 1 <<  6, //  Mark, Spacing Combining
        Me = 1 <<  7, //  Mark, Enclosing
        Nd = 1 <<  8, //  Number, Decimal Digit
        Nl = 1 <<  9, //  Number, Letter
        No = 1 << 10, //  Number, Other
        Pc = 1 << 11, //  Punctuation, Connector
        Pd = 1 << 12, //  Punctuation, Dash
        Ps = 1 << 13, //  Punctuation, Open
        Pe = 1 << 14, //  Punctuation, Close
        Pi = 1 << 15, //  Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
        Pf = 1 << 16, //  Punctuation, Final quote (may behave like Ps or Pe depending on usage)
        Po = 1 << 17, //  Punctuation, Other
        Sm = 1 << 18, //  Symbol, Math
        Sc = 1 << 19, //  Symbol, Currency
        Sk = 1 << 20, //  Symbol, Modifier
        So = 1 << 21, //  Symbol, Other
        Zs = 1 << 22, //  Separator, Space
        Zl = 1 << 23, //  Separator, Line
        Zp = 1 << 24, //  Separator, Paragraph
        Cc = 1 << 25, //  Other, Control
        Cf = 1 << 26, //  Other, Format
        Cs = 1 << 27, //  Other, Surrogate
        Co = 1 << 28, //  Other, Private Use
        Cn = 1 << 29, //  Other, Not Assigned (no characters in the file have this property)
        SpecialMapping = 1 << 30 // Special Bit for detection of specialMappings
    }
    
    
    enum BidiClass {
        L   = 1 <<  0, //  Left-to-Right
        LRE = 1 <<  1, //  Left-to-Right Embedding
        LRO = 1 <<  2, //  Left-to-Right Override
        R   = 1 <<  3, //  Right-to-Left
        AL  = 1 <<  4, //  Right-to-Left Arabic
        RLE = 1 <<  5, //  Right-to-Left Embedding
        RLO = 1 <<  6, //  Right-to-Left Override
        PDF = 1 <<  7, //  Pop Directional Format
        EN  = 1 <<  8, //  European Number
        ES  = 1 <<  9, //  European Number Separator
        ET  = 1 << 10, //  European Number Terminator
        AN  = 1 << 11, //  Arabic Number
        CS  = 1 << 12, //  Common Number Separator
        NSM = 1 << 13, //  Non-Spacing Mark
        BN  = 1 << 14, //  Boundary Neutral
        B   = 1 << 15, //  Paragraph Separator
        S   = 1 << 16, //  Segment Separator
        WS  = 1 << 17, //  Whitespace
        ON  = 1 << 18  //  Other Neutrals
    }
    
    enum DecompositionType {
        None     = 1 <<  0, // Custom type signaling no Decomposition
        Font     = 1 <<  1, //    A font variant (e.g. a blackletter form).
        NoBreak  = 1 <<  2, //    A no-break version of a space or hyphen.
        Initial  = 1 <<  3, //    An initial presentation form (Arabic).
        Medial   = 1 <<  4, //    A medial presentation form (Arabic).
        Final    = 1 <<  5, //    A final presentation form (Arabic).
        Isolated = 1 <<  6, //    An isolated presentation form (Arabic).
        Circle   = 1 <<  7, //    An encircled form.
        Super    = 1 <<  8, //    A superscript form.
        Sub      = 1 <<  9, //    A subscript form.
        Vertical = 1 << 10, //    A vertical layout presentation form.
        Wide     = 1 << 11, //    A wide (or zenkaku) compatibility character.
        Narrow   = 1 << 12, //    A narrow (or hankaku) compatibility character.
        Small    = 1 << 13, //    A small variant form (CNS compatibility).
        Square   = 1 << 14, //    A CJK squared font variant.
        Fraction = 1 << 15, //    A vulgar fraction form.
        Compat   = 1 << 16  //    Otherwise unspecified compatibility character.
    }
    
    enum BidiMirrored {
        Y = 1, // Yes
        N = 2 // No
    }
    
    dchar code;
    
//    char[] name;
    
    GeneralCategory generalCategory;
    
//    short canonicalCombiningClass;
    
    //TODO the defaults are not yet set correctly
    
//    BidiClass bidiClass;
    
    //TODO end
    
//    DecompositionType decompositionType;
    
//    dchar[] decompositionMapping;
    
    
    // TODO Check handling
    
//    int numeric_1;
    
//    int numeric_2;
    
//    double numeric_3;
    
    // TODO end
    
//    BidiMirrored bidiMirrored;
    
//    char[] unicode1Name;
    
//    char [] isoComment;
    
    dchar simpleUpperCaseMapping;
    
    dchar simpleLowerCaseMapping;
    
    dchar simpleTitleCaseMapping;
      
}

struct SpecialCaseData {
    
    dchar code;
    
    dchar[] upperCaseMapping;

    dchar[] lowerCaseMapping;

    dchar[] titleCaseMapping;

}

struct FoldingCaseData {
    
    dchar code;
    
    dchar[] mapping;
    
}

UnicodeData *unicodeData[dchar];

SpecialCaseData *specialCaseData[dchar];

FoldingCaseData *foldingCaseData[dchar];

private {
    UnicodeData internalUnicodeData[] = [
EOF
# Emit one UnicodeData struct initializer per record of UnicodeData.txt
# (read from FILE) and close the array afterwards.
#
# Fields used (0-based, ';'-separated):
#    0  code point (hex)
#    2  general category      (defaults to Cn when empty)
#   12  simple upper mapping  (defaults to the code point itself)
#   13  simple lower mapping  (defaults to the code point itself)
#   14  simple title mapping  (defaults to the upper-case mapping)
#
# Code points that have an entry in %data (unconditional SpecialCasing
# entries) additionally get the SpecialMapping bit OR-ed into their category.
#
# NOTE(review): range markers in UnicodeData.txt ("<..., First>" /
# "<..., Last>") are written as two ordinary entries, not expanded - confirm
# that this is what the D side expects.
my $num = 0;    # progress counter echoed to STDOUT, one number per record
while (my $record = <FILE>) {
    chomp $record;
    next if $record =~ /^\s*$/;    # a blank line would yield a bogus "code:0x" entry

    # Limit -1 keeps trailing empty fields, so the case-mapping columns are
    # defined even when the last record has no terminating newline.
    my @line = split /;/, $record, -1;
    for my $field (@line) {
        $field =~ s/^\s+//;
        $field =~ s/\s+$//;
    }
    push @line, '' while @line < 15;    # tolerate short records

    my $code     = $line[0];
    my $category = length($line[2])  > 0 ? $line[2]  : "Cn";
    my $ucm      = length($line[12]) > 0 ? $line[12] : $code;
    my $lcm      = length($line[13]) > 0 ? $line[13] : $code;
    my $tcm      = length($line[14]) > 0 ? $line[14] : $ucm;

    print $num . "\n";
    $num++;

    print OUTPUT " { code:0x$code\n";

    print OUTPUT "  ,generalCategory:UnicodeData.GeneralCategory." . $category;
    if (exists $data{$code}) {
        print OUTPUT " | UnicodeData.GeneralCategory.SpecialMapping";
    }
    print OUTPUT "\n";

    print OUTPUT "  ,simpleUpperCaseMapping:0x" . $ucm . "\n";

    print OUTPUT "  ,simpleLowerCaseMapping:0x" . $lcm . "\n";

    print OUTPUT "  ,simpleTitleCaseMapping:0x" . $tcm . "\n";

    print OUTPUT " },\n";
}
print OUTPUT "\n];\n";
236
# Emit the SpecialCaseData array from the SpecialCasing.txt lines kept in
# %data. Each stored line has the layout
#     code; lower; title; upper;
# and every mapping is a space-separated list of hex code points. A mapping
# with no code points is written as "null".
print OUTPUT "    SpecialCaseData internalSpecialCaseData[] = [\n";

for my $key (sort keys %data) {

    my @line = split /\s*;\s*/, $data{$key};

    print OUTPUT " { code:0x$line[0]\n";

    # Output order (upper, lower, title) differs from the column order in
    # the data file, hence the explicit field indices.
    for my $mapping (
        [ upperCaseMapping => $line[3] ],
        [ lowerCaseMapping => $line[1] ],
        [ titleCaseMapping => $line[2] ],
    ) {
        my ($name, $field) = @$mapping;

        # split ' ' ignores leading whitespace and yields an empty list for
        # an empty field, so the test below really detects "no mapping".
        my @code_points = split ' ', (defined $field ? $field : '');

        print OUTPUT "  ,${name}:";
        if (@code_points) {
            print OUTPUT "[ 0x" . join(", 0x", @code_points) . " ]\n";
        }
        else {
            print OUTPUT "null\n";
        }
    }

    print OUTPUT "\n },\n";
}
print OUTPUT "\n];\n";
270
close(FILE);    # done with UnicodeData.txt

# Emit the FoldingCaseData array from CaseFolding.txt, whose entries have
# the layout
#     code; status; mapping;
# where mapping is a space-separated list of hex code points.
open(my $folding_fh, '<', 'CaseFolding.txt')
    or die "Unable to open CaseFolding.txt file: $!";

print OUTPUT "    FoldingCaseData internalFoldingCaseData[] = [\n";

while (my $entry = <$folding_fh>) {
    $entry =~ s/\s*#.*//;          # drop the trailing comment
    next if $entry =~ /^\s*$/;     # blank or comment-only line

    my @line = split /\s*;\s*/, $entry;

    # Keep only the common (C) and full (F) foldings. Simple (S) entries
    # are skipped, as before. Turkic (T) entries must be skipped as well:
    # they reuse the code points of default entries (U+0049, U+0130), and
    # because the generated table is keyed by code point the later T entry
    # would replace the default folding.
    next if $line[1] eq "S" || $line[1] eq "T";

    my @mapping = split ' ', $line[2];

    print OUTPUT " {\n";

    print OUTPUT "   code:0x$line[0]\n";
    print OUTPUT "  ,mapping: [ 0x" . join(", 0x", @mapping) . " ]\n";
    print OUTPUT " },\n";
}

print OUTPUT "\n];\n";
close($folding_fh);

# Closes the "private {" block opened in the D module header.
print OUTPUT "\n}\n\n";
298
# Append the D module constructor that indexes the three private arrays by
# code point. The heredoc is single-quoted so that nothing in the D source
# can ever be interpolated by Perl.
print OUTPUT <<'EOF';
static this() {
    foreach(inout entry; internalUnicodeData)
        unicodeData[entry.code] = &entry;
    unicodeData.rehash;
    
    foreach(inout entry; internalSpecialCaseData)
        specialCaseData[entry.code] = &entry;
    specialCaseData.rehash;
    
    foreach(inout entry; internalFoldingCaseData)
        foldingCaseData[entry.code] = &entry;
    foldingCaseData.rehash;
}
EOF

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result has to be checked.
close(OUTPUT) or die "Unable to close output file: $!";