Download Reference Manual
The Developer's Library for D
About Wiki Forums Source Search Contact

Ticket #641: UnicodeConverter.2.d

File UnicodeConverter.2.d, 26.9 kB (added by ptriller, 1 year ago)
Line 
1 /*******************************************************************************
2
3         copyright:      Copyright (c) 2007 Peter Triller. All rights reserved
4
5         license:        BSD style: $(LICENSE)
6
7         version:        Initial release: Sept 2007
8
9         authors:        Peter
10
11         Provides case mapping Functions for Unicode Strings. As of now it is
12         only 99 % complete, because it does not take into account Conditional
13         case mappings. This means the Greek Letter Sigma will not be correctly
14         case mapped at the end of a Word, and the Locales Lithuanian, Turkish
15         and Azeri are not taken into account during Case Mappings. This means
16         all in all around 12 Characters will not be mapped correctly under
17         some circumstances.
18         
19         ICU4j also does not handle these cases at the moment.
20         
21         Unittests are written against output from ICU4j
22         
23         This Module tries to minimize Memory allocation and usage. You can
24         always pass the output buffer that should be used to the case mapping
25         function, which will be resized if necessary.
26
27 *******************************************************************************/
28
29 module UnicodeConverter;
30
31 private import UnicodeData;
32 private import tango.text.convert.Utf;
33
34
35
/**
 * Converts an Utf8 String to Upper case. The whole result is first built
 * as Utf32 in a working buffer and then encoded to Utf8 in one final step.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 *     working = Utf32 scratch buffer for the intermediate result, it is
 *               allocated when null and enlarged when too small
 * Returns: the case mapped string
 */
char[] blockToUpper(char[] input, char[] output = null, dchar[] working = null) {

    // ?? How much preallocation ?? One slot per input byte covers every
    // input whose characters do not expand through a special mapping
    if (working is null)
        working.length = input.length;

    size_t produced = 0;
    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (ch in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (ch in specialCaseData);
            debug {
                assert(s !is null);
            }
            // In release builds a missing table entry falls back to the
            // simple mapping below instead of dereferencing null
            if(s !is null && (*s).upperCaseMapping !is null) {
                size_t len = (*s).upperCaseMapping.length;
                // The slice assignment needs room for the complete mapping
                if(produced + len > working.length)
                    working.length = working.length + working.length / 2 + len;
                working[produced..produced + len] = (*s).upperCaseMapping;
                produced += len;
                continue;
            }
        }
        // The buffer written to is the working buffer, so its length (and
        // not the length of output) decides whether to grow
        if(produced >= working.length)
            working.length = working.length + working.length / 2 + 1;
        working[produced++] = d is null ? ch:(*d).simpleUpperCaseMapping;
    }
    return toUtf8(working[0..produced],output);
}
82
83
84
/**
 * Upper case mapping for Utf8 Strings
 *
 * Every character is replaced by its special upper case mapping when the
 * tables provide one, otherwise by its simple upper case mapping. A
 * character unknown to the tables is copied unchanged.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
char[] toUpper(char[] input, char[] output = null) {

    // Optimistic start: most of the time the length does not change
    if (input.length > output.length)
        output.length = input.length;

    dchar[1] single;      // carries the simple mapping of one character
    size_t consumed;      // number of code points toUtf8 reports as converted
    size_t used = 0;      // bytes of output filled so far

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        dchar[] mapped = null;

        UnicodeData **entry = (ch in unicodeData);
        if(entry !is null && ((*entry).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            mapped = (*special).upperCaseMapping;
        }

        // No special mapping available: use the one character simple mapping
        if(mapped is null) {
            single[0] = entry is null ? ch : (*entry).simpleUpperCaseMapping;
            mapped = single[];
        }

        // Reserve four bytes per code point up front, so that toUtf8 can
        // write into the slice without relocating it
        size_t worst = mapped.length * 4;
        if(used + worst >= output.length)
            output.length = output.length + output.length / 2 + worst;

        char[] encoded = toUtf8(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
139
140
/**
 * Upper case mapping for Utf16 Strings
 *
 * Every character is replaced by its special upper case mapping when the
 * tables provide one, otherwise by its simple upper case mapping. A
 * character unknown to the tables is copied unchanged.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
wchar[] toUpper(wchar[] input, wchar[] output = null) {

    // Optimistic start: most of the time the length does not change
    if (input.length > output.length)
        output.length = input.length;

    dchar[1] single;      // carries the simple mapping of one character
    size_t consumed;      // number of code points toUtf16 reports as converted
    size_t used = 0;      // wchars of output filled so far

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        dchar[] mapped = null;

        UnicodeData **entry = (ch in unicodeData);
        if(entry !is null && ((*entry).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            mapped = (*special).upperCaseMapping;
        }

        // The two paths keep their own reserve and growth amounts
        size_t reserve;
        size_t growth;
        if(mapped is null) {
            single[0] = entry is null ? ch : (*entry).simpleUpperCaseMapping;
            mapped = single[];
            reserve = 4;
            growth = 3;
        } else {
            reserve = mapped.length * 2;
            growth = mapped.length * 3;
        }

        // Enlarge before encoding, so that toUtf16 can write into the
        // slice without relocating it
        if(used + reserve >= output.length)
            output.length = output.length + output.length / 2 + growth;

        wchar[] encoded = toUtf16(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
194
/**
 * Converts an Utf32 String to Upper case
 *
 * A character with a special upper case mapping is replaced by that
 * mapping, every other character known to the tables by its simple upper
 * case mapping. A character unknown to the tables is copied unchanged.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
dchar[] toUpper(dchar[] input, dchar[] output = null) {

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    size_t produced = 0;
    foreach(dchar orig; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (orig in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (orig in specialCaseData);
            debug {
                assert(s !is null);
            }
            // In release builds a missing table entry falls back to the
            // simple mapping below instead of dereferencing null
            if(s !is null && (*s).upperCaseMapping !is null) {
                // Better resize strategy ???
                if(produced + (*s).upperCaseMapping.length > output.length)
                    output.length = output.length + output.length / 2 + (*s).upperCaseMapping.length;
                foreach(ch; (*s).upperCaseMapping) {
                    output[produced++] = ch;
                }
                // Skip the simple mapping only when a special mapping was
                // written, otherwise the character would be lost
                continue;
            }
        }
        // The + 1 guarantees progress even for a buffer of length 0 or 1
        if(produced >= output.length)
            output.length = output.length + output.length / 2 + 1;
        output[produced++] = d is null ? orig:(*d).simpleUpperCaseMapping;
    }
    return output[0..produced];
}
235
236
/**
 * Lower case mapping for Utf8 Strings
 *
 * Every character is replaced by its special lower case mapping when the
 * tables provide one, otherwise by its simple lower case mapping. A
 * character unknown to the tables is copied unchanged.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
char[] toLower(char[] input, char[] output = null) {

    // Optimistic start: most of the time the length does not change
    if (input.length > output.length)
        output.length = input.length;

    dchar[1] single;      // carries the simple mapping of one character
    size_t consumed;      // number of code points toUtf8 reports as converted
    size_t used = 0;      // bytes of output filled so far

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        dchar[] mapped = null;

        UnicodeData **entry = (ch in unicodeData);
        if(entry !is null && ((*entry).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            mapped = (*special).lowerCaseMapping;
        }

        // No special mapping available: use the one character simple mapping
        if(mapped is null) {
            single[0] = entry is null ? ch : (*entry).simpleLowerCaseMapping;
            mapped = single[];
        }

        // Reserve four bytes per code point up front, so that toUtf8 can
        // write into the slice without relocating it
        size_t worst = mapped.length * 4;
        if(used + worst >= output.length)
            output.length = output.length + output.length / 2 + worst;

        char[] encoded = toUtf8(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
291
292
/**
 * Lower case mapping for Utf16 Strings
 *
 * Every character is replaced by its special lower case mapping when the
 * tables provide one, otherwise by its simple lower case mapping. A
 * character unknown to the tables is copied unchanged.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
wchar[] toLower(wchar[] input, wchar[] output = null) {

    // Optimistic start: most of the time the length does not change
    if (input.length > output.length)
        output.length = input.length;

    dchar[1] single;      // carries the simple mapping of one character
    size_t consumed;      // number of code points toUtf16 reports as converted
    size_t used = 0;      // wchars of output filled so far

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        dchar[] mapped = null;

        UnicodeData **entry = (ch in unicodeData);
        if(entry !is null && ((*entry).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            mapped = (*special).lowerCaseMapping;
        }

        // The two paths keep their own reserve and growth amounts
        size_t reserve;
        size_t growth;
        if(mapped is null) {
            single[0] = entry is null ? ch : (*entry).simpleLowerCaseMapping;
            mapped = single[];
            reserve = 4;
            growth = 3;
        } else {
            reserve = mapped.length * 2;
            growth = mapped.length * 3;
        }

        // Enlarge before encoding, so that toUtf16 can write into the
        // slice without relocating it
        if(used + reserve >= output.length)
            output.length = output.length + output.length / 2 + growth;

        wchar[] encoded = toUtf16(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
346
347
/**
 * Converts an Utf32 String to Lower case
 *
 * A character with a special lower case mapping is replaced by that
 * mapping, every other character known to the tables by its simple lower
 * case mapping. A character unknown to the tables is copied unchanged.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
dchar[] toLower(dchar[] input, dchar[] output = null) {

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    size_t produced = 0;
    foreach(dchar orig; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (orig in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (orig in specialCaseData);
            debug {
                assert(s !is null);
            }
            // In release builds a missing table entry falls back to the
            // simple mapping below instead of dereferencing null
            if(s !is null && (*s).lowerCaseMapping !is null) {
                // Better resize strategy ???
                if(produced + (*s).lowerCaseMapping.length > output.length)
                    output.length = output.length + output.length / 2 + (*s).lowerCaseMapping.length;
                foreach(ch; (*s).lowerCaseMapping) {
                    output[produced++] = ch;
                }
                // Skip the simple mapping only when a special mapping was
                // written, otherwise the character would be lost
                continue;
            }
        }
        // The + 1 guarantees progress even for a buffer of length 0 or 1
        if(produced >= output.length)
            output.length = output.length + output.length / 2 + 1;
        output[produced++] = d is null ? orig:(*d).simpleLowerCaseMapping;
    }
    return output[0..produced];
}
388
/**
 * Case folding for Utf8 Strings
 * Folding case is used for case insensitive comparisons.
 *
 * A character listed in the folding table is replaced by its folding
 * mapping, every other character is copied unchanged.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
char[] toFold(char[] input, char[] output = null) {

    // Optimistic start: most of the time the length does not change
    if (input.length > output.length)
        output.length = input.length;

    dchar[1] single;      // carries one character without folding mapping
    size_t consumed;      // number of code points toUtf8 reports as converted
    size_t used = 0;      // bytes of output filled so far

    foreach(dchar ch; input) {
        dchar[] mapped;

        FoldingCaseData **fold = (ch in foldingCaseData);
        if(fold is null) {
            single[0] = ch;
            mapped = single[];
        } else {
            mapped = (*fold).mapping;
        }

        // Reserve four bytes per code point up front, so that toUtf8 can
        // write into the slice without relocating it
        size_t worst = mapped.length * 4;
        if(used + worst >= output.length)
            output.length = output.length + output.length / 2 + worst;

        char[] encoded = toUtf8(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
437
/**
 * Case folding for Utf16 Strings
 * Folding case is used for case insensitive comparisons.
 *
 * A character listed in the folding table is replaced by its folding
 * mapping, every other character is copied unchanged.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string
 */
wchar[] toFold(wchar[] input, wchar[] output = null) {

    // Optimistic start: most of the time the length does not change
    if (input.length > output.length)
        output.length = input.length;

    dchar[1] single;      // carries one character without folding mapping
    size_t consumed;      // number of code points toUtf16 reports as converted
    size_t used = 0;      // wchars of output filled so far

    foreach(dchar ch; input) {
        dchar[] mapped;
        size_t reserve;
        size_t growth;

        // The two paths keep their own reserve and growth amounts
        FoldingCaseData **fold = (ch in foldingCaseData);
        if(fold is null) {
            single[0] = ch;
            mapped = single[];
            reserve = 4;
            growth = 3;
        } else {
            mapped = (*fold).mapping;
            reserve = mapped.length * 2;
            growth = mapped.length * 3;
        }

        // Enlarge before encoding, so that toUtf16 can write into the
        // slice without relocating it
        if(used + reserve >= output.length)
            output.length = output.length + output.length / 2 + growth;

        wchar[] encoded = toUtf16(mapped, output[used..output.length], &consumed);
        debug {
            assert(consumed == mapped.length);
            assert(encoded.ptr == output[used..output.length].ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
485
486 /**
487  * Converts an Utf32 String to Folding case
488  * Folding case is used for case insensitive comparisons.
489  *
490  * Params:
491  *     input = String to be case mapped
492  *     output = this output buffer will be used unless too small
493  * Returns: the case mapped string
494  */
495 dchar[] toFold(dchar[] input, dchar[] output = null) {
496
497     // assume most common case: String stays the same length
498     if (input.length > output.length)
499         output.length = input.length;
500
501     size_t produced = 0;
502     if (input.length)
503         foreach(dchar orig; input) {
504             FoldingCaseData **d = (orig in foldingCaseData);
505             if(d !is null ) {
506                 // Better resize strategy ???
507                 if(produced + (*d).mapping.length  > output.length)
508                     output.length = output.length + output.length / 2 + (*d).mapping.length;
509                 foreach(ch; (*d).mapping) {
510                     output[produced++] = ch;
511                 }
512                 continue;
513             }
514             if(produced >= output.length)