Download Reference Manual
The Developer's Library for D
About Wiki Forums Source Search Contact

Ticket #641: UnicodeConverter.d

File UnicodeConverter.d, 22.1 kB (added by ptriller, 1 year ago)
Line 
1 /*******************************************************************************
2
3         copyright:      Copyright (c) 2007 Peter Triller. All rights reserved
4
5         license:        BSD style: $(LICENSE)
6
7         version:        Initial release: Sept 2007
8
9         authors:        Peter
10
11         Provides case mapping Functions for Unicode Strings. As of now it is
12         only 99 % complete, because it does not take into account Conditional
13         case mappings. This means the Greek Letter Sigma will not be correctly
14         case mapped at the end of a Word, and the Locales Lithuanian, Turkish
15         and Azeri are not taken into account during Case Mappings. This means
16         all in all around 12 Characters will not be mapped correctly under
17         some circumstances.
18         
19         ICU4j also does not handle these cases at the moment.
20         
21         Unittests are written against output from ICU4j
22         
23         This Module tries to minimize Memory allocation and usage. You can
24         always pass the output buffer that should be used to the case mapping
25         function, which will be resized if necessary.
26
27 *******************************************************************************/
28
29 module UnicodeConverter;
30
31 private import UnicodeData;
32 private import tango.text.convert.Utf;
33
34
35
/**
 * Converts an Utf8 String to Upper case. The input is case mapped into an
 * intermediate Utf32 buffer first and converted back to Utf8 in one call.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 *     working = Utf32 scratch buffer; allocated when null and grown
 *               whenever it turns out to be too small
 * Returns: the case mapped string
 */
char[] blockToUpper(char[] input, char[] output = null, dchar[] working = null) {

    // One slot per Utf8 code unit of the input: enough for every code point
    // as long as no special mapping expands a character.
    if (working is null)
        working.length = input.length;

    size_t produced = 0;
    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (ch in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (ch in specialCaseData);
            debug {
                assert(s !is null);
            }
            if((*s).upperCaseMapping !is null) {
                // A special mapping may expand one character into several.
                size_t len = (*s).upperCaseMapping.length;
                if(produced + len >= working.length)
                    working.length = working.length + working.length / 2 + len;
                working[produced..produced + len] = (*s).upperCaseMapping;
                produced += len;
                continue;
            }
        }
        // The character is stored in the working buffer, so that is the
        // buffer whose capacity has to be checked (not the Utf8 output).
        if(produced + 1 >= working.length)
            working.length = working.length + working.length / 2 + 1;
        working[produced++] = d is null ? ch : (*d).simpleUpperCaseMapping;
    }
    return toUtf8(working[0..produced], output);
}
82
83
84
/**
 * Upper cases an Utf8 String, encoding each mapped character straight
 * into the output buffer.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string (a slice of the buffer that was used)
 */
char[] toUpper(char[] input, char[] output = null) {

    dchar[1] single;
    size_t consumed;
    size_t used = 0;

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        UnicodeData **entry = (ch in unicodeData);
        if(entry !is null && ((*entry).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            auto mapping = (*special).upperCaseMapping;
            if(mapping !is null) {
                // Reserve the worst case of four Utf8 bytes per mapped
                // character up front, so that the converter can write into
                // the slice in place (verified by the debug assert below).
                size_t reserve = mapping.length * 4;
                if(used + reserve >= output.length)
                    output.length = output.length + output.length / 2 + reserve;
                char[] room = output[used..$];
                char[] encoded = toUtf8(mapping, room, &consumed);
                debug {
                    assert(consumed == mapping.length);
                    assert(encoded.ptr == room.ptr);
                }
                used += encoded.length;
                continue;
            }
        }

        // Simple one-to-one mapping; unknown characters are copied as is.
        if(entry is null)
            single[0] = ch;
        else
            single[0] = (*entry).simpleUpperCaseMapping;

        // Keep at least four bytes free so the conversion happens in place.
        if(used + 4 >= output.length)
            output.length = output.length + output.length / 2 + 4;
        char[] room = output[used..$];
        char[] encoded = toUtf8(single, room, &consumed);
        debug {
            assert(consumed == 1);
            assert(encoded.ptr == room.ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
139
140
/**
 * Upper cases an Utf16 String, encoding each mapped character straight
 * into the output buffer.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string (a slice of the buffer that was used)
 */
wchar[] toUpper(wchar[] input, wchar[] output = null) {

    dchar[1] single;
    size_t consumed;
    size_t used = 0;

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        UnicodeData **entry = (ch in unicodeData);
        if(entry !is null && ((*entry).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            auto mapping = (*special).upperCaseMapping;
            if(mapping !is null) {
                // Two Utf16 units per mapped character is the worst case;
                // grow generously so the converter can write into the
                // slice in place (verified by the debug assert below).
                if(used + mapping.length * 2 >= output.length)
                    output.length = output.length + output.length / 2 + mapping.length * 3;
                wchar[] room = output[used..$];
                wchar[] encoded = toUtf16(mapping, room, &consumed);
                debug {
                    assert(consumed == mapping.length);
                    assert(encoded.ptr == room.ptr);
                }
                used += encoded.length;
                continue;
            }
        }

        // Simple one-to-one mapping; unknown characters are copied as is.
        if(entry is null)
            single[0] = ch;
        else
            single[0] = (*entry).simpleUpperCaseMapping;

        // Keep some head room so the Utf16 conversion happens in place.
        if(used + 4 >= output.length)
            output.length = output.length + output.length / 2 + 3;
        wchar[] room = output[used..$];
        wchar[] encoded = toUtf16(single, room, &consumed);
        debug {
            assert(consumed == 1);
            assert(encoded.ptr == room.ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
194
/**
 * Converts an Utf32 String to Upper case
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string (a slice of the buffer that was used)
 */
dchar[] toUpper(dchar[] input, dchar[] output = null) {

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    size_t produced = 0;
    foreach(dchar orig; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (orig in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (orig in specialCaseData);
            debug {
                assert(s !is null);
            }
            if((*s).upperCaseMapping !is null) {
                // Better resize strategy ???
                if(produced + (*s).upperCaseMapping.length > output.length)
                    output.length = output.length + output.length / 2 + (*s).upperCaseMapping.length;
                foreach(ch; (*s).upperCaseMapping) {
                    output[produced++] = ch;
                }
                // Only skip the simple mapping when a special mapping was
                // actually written; otherwise fall through, like the Utf8
                // and Utf16 overloads do, so the character is not dropped.
                continue;
            }
        }
        // "+ 1" guarantees progress even for buffers of length 0 or 1,
        // where length / 2 alone would not grow the buffer.
        if(produced >= output.length)
            output.length = output.length + output.length / 2 + 1;
        output[produced++] = d is null ? orig : (*d).simpleUpperCaseMapping;
    }
    return output[0..produced];
}
235
236
/**
 * Lower cases an Utf8 String, encoding each mapped character straight
 * into the output buffer.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string (a slice of the buffer that was used)
 */
char[] toLower(char[] input, char[] output = null) {

    dchar[1] single;
    size_t consumed;
    size_t used = 0;

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        UnicodeData **entry = (ch in unicodeData);
        if(entry !is null && ((*entry).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            auto mapping = (*special).lowerCaseMapping;
            if(mapping !is null) {
                // Reserve the worst case of four Utf8 bytes per mapped
                // character up front, so that the converter can write into
                // the slice in place (verified by the debug assert below).
                size_t reserve = mapping.length * 4;
                if(used + reserve >= output.length)
                    output.length = output.length + output.length / 2 + reserve;
                char[] room = output[used..$];
                char[] encoded = toUtf8(mapping, room, &consumed);
                debug {
                    assert(consumed == mapping.length);
                    assert(encoded.ptr == room.ptr);
                }
                used += encoded.length;
                continue;
            }
        }

        // Simple one-to-one mapping; unknown characters are copied as is.
        if(entry is null)
            single[0] = ch;
        else
            single[0] = (*entry).simpleLowerCaseMapping;

        // Keep at least four bytes free so the conversion happens in place.
        if(used + 4 >= output.length)
            output.length = output.length + output.length / 2 + 4;
        char[] room = output[used..$];
        char[] encoded = toUtf8(single, room, &consumed);
        debug {
            assert(consumed == 1);
            assert(encoded.ptr == room.ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
291
292
/**
 * Lower cases an Utf16 String, encoding each mapped character straight
 * into the output buffer.
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string (a slice of the buffer that was used)
 */
wchar[] toLower(wchar[] input, wchar[] output = null) {

    dchar[1] single;
    size_t consumed;
    size_t used = 0;

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    foreach(dchar ch; input) {
        // TODO Conditional Case Mapping
        UnicodeData **entry = (ch in unicodeData);
        if(entry !is null && ((*entry).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **special = (ch in specialCaseData);
            debug {
                assert(special !is null);
            }
            auto mapping = (*special).lowerCaseMapping;
            if(mapping !is null) {
                // Two Utf16 units per mapped character is the worst case;
                // grow generously so the converter can write into the
                // slice in place (verified by the debug assert below).
                if(used + mapping.length * 2 >= output.length)
                    output.length = output.length + output.length / 2 + mapping.length * 3;
                wchar[] room = output[used..$];
                wchar[] encoded = toUtf16(mapping, room, &consumed);
                debug {
                    assert(consumed == mapping.length);
                    assert(encoded.ptr == room.ptr);
                }
                used += encoded.length;
                continue;
            }
        }

        // Simple one-to-one mapping; unknown characters are copied as is.
        if(entry is null)
            single[0] = ch;
        else
            single[0] = (*entry).simpleLowerCaseMapping;

        // Keep some head room so the Utf16 conversion happens in place.
        if(used + 4 >= output.length)
            output.length = output.length + output.length / 2 + 3;
        wchar[] room = output[used..$];
        wchar[] encoded = toUtf16(single, room, &consumed);
        debug {
            assert(consumed == 1);
            assert(encoded.ptr == room.ptr);
        }
        used += encoded.length;
    }
    return output[0..used];
}
346
/**
 * Converts an Utf32 String to Lower case
 *
 * Params:
 *     input = String to be case mapped
 *     output = this output buffer will be used unless too small
 * Returns: the case mapped string (a slice of the buffer that was used)
 */
dchar[] toLower(dchar[] input, dchar[] output = null) {

    // assume most common case: String stays the same length
    if (input.length > output.length)
        output.length = input.length;

    size_t produced = 0;
    foreach(dchar orig; input) {
        // TODO Conditional Case Mapping
        UnicodeData **d = (orig in unicodeData);
        if(d !is null && ((*d).generalCategory & UnicodeData.UnicodeData.GeneralCategory.SpecialMapping)) {
            SpecialCaseData **s = (orig in specialCaseData);
            debug {
                assert(s !is null);
            }
            if((*s).lowerCaseMapping !is null) {
                // Better resize strategy ???
                if(produced + (*s).lowerCaseMapping.length > output.length)
                    output.length = output.length + output.length / 2 + (*s).lowerCaseMapping.length;
                foreach(ch; (*s).lowerCaseMapping) {
                    output[produced++] = ch;
                }
                // Only skip the simple mapping when a special mapping was
                // actually written; otherwise fall through, like the Utf8
                // and Utf16 overloads do, so the character is not dropped.
                continue;
            }
        }
        // "+ 1" guarantees progress even for buffers of length 0 or 1,
        // where length / 2 alone would not grow the buffer.
        if(produced >= output.length)
            output.length = output.length + output.length / 2 + 1;
        output[produced++] = d is null ? orig : (*d).simpleLowerCaseMapping;
    }
    return output[0..produced];
}
387
388
389
390
/**
 * Tells whether a character is a decimal digit (General Category Nd).
 * Characters without an entry in the Unicode data are never digits.
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isDigit(dchar ch) {
    UnicodeData **info = (ch in unicodeData);
    if (info is null)
        return false;
    return ((*info).generalCategory & UnicodeData.UnicodeData.GeneralCategory.Nd) != 0;
}
402
403
/**
 * Tells whether a character is a letter, i.e. belongs to one of the
 * General Categories Lu, Ll, Lt, Lm or Lo.
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isLetter(int ch) {
    UnicodeData **info = (ch in unicodeData);
    if (info is null)
        return false;

    auto letters = UnicodeData.UnicodeData.GeneralCategory.Lu
                 | UnicodeData.UnicodeData.GeneralCategory.Ll
                 | UnicodeData.UnicodeData.GeneralCategory.Lt
                 | UnicodeData.UnicodeData.GeneralCategory.Lm
                 | UnicodeData.UnicodeData.GeneralCategory.Lo;
    return ((*info).generalCategory & letters) != 0;
}
419
/**
 * Tells whether a character is a letter (General Categories Lu, Ll, Lt,
 * Lm, Lo) or a decimal digit (General Category Nd).
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isLetterOrDigit(int ch) {
    UnicodeData **info = (ch in unicodeData);
    if (info is null)
        return false;

    auto lettersAndDigits = UnicodeData.UnicodeData.GeneralCategory.Lu
                          | UnicodeData.UnicodeData.GeneralCategory.Ll
                          | UnicodeData.UnicodeData.GeneralCategory.Lt
                          | UnicodeData.UnicodeData.GeneralCategory.Lm
                          | UnicodeData.UnicodeData.GeneralCategory.Lo
                          | UnicodeData.UnicodeData.GeneralCategory.Nd;
    return ((*info).generalCategory & lettersAndDigits) != 0;
}
437
/**
 * Tells whether a character is a lower case letter (General Category Ll).
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isLower(dchar ch) {
    UnicodeData **info = (ch in unicodeData);
    if (info is null)
        return false;
    return ((*info).generalCategory & UnicodeData.UnicodeData.GeneralCategory.Ll) != 0;
}
447
/**
 * Tells whether a character is a title case letter (General Category Lt).
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isTitle(dchar ch) {
    UnicodeData **info = (ch in unicodeData);
    if (info is null)
        return false;
    return ((*info).generalCategory & UnicodeData.UnicodeData.GeneralCategory.Lt) != 0;
}
457
/**
 * Tells whether a character is an upper case letter (General Category Lu).
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isUpper(dchar ch) {
    UnicodeData **info = (ch in unicodeData);
    if (info is null)
        return false;
    return ((*info).generalCategory & UnicodeData.UnicodeData.GeneralCategory.Lu) != 0;
}
467
/**
 * Determines if a character is a Whitespace character.
 * Whitespace characters are characters in the
 * General Categories Zs, Zl, Zp without the No Break
 * spaces (U+00A0, U+2007, U+202F), plus the control
 * characters out of the ASCII range that are used as spaces:
 * TAB LF VT FF CR (U+0009..U+000D) and FS GS RS US (U+001C..U+001F)
 *
 * WARNING: look at isSpace, maybe that function does
 *          more what you expect.
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isWhitespace(dchar ch) {
    // ASCII control characters that act as spaces
    if((ch >= 0x0009 && ch <= 0x000D) || (ch >= 0x001C && ch <= 0x001F))
        return true;

    // No Break spaces are separators but must not count as whitespace
    if(ch == 0x00A0      // NBSP
        || ch == 0x2007  // FIGURE SPACE (non-breaking)
        || ch == 0x202F  // NARROW NBSP
        || ch == 0xFEFF) // ZERO WIDTH NBSP
        return false;

    UnicodeData **d = (ch in unicodeData);
    return (d !is null) && (((*d).generalCategory &
            ( UnicodeData.UnicodeData.GeneralCategory.Zs
            | UnicodeData.UnicodeData.GeneralCategory.Zl
            | UnicodeData.UnicodeData.GeneralCategory.Zp)) != 0);
}
494
/**
 * Determines if a character is a Space character as specified in the
 * Unicode Standard, i.e. belongs to one of the General Categories
 * Zs, Zl or Zp.
 *
 * WARNING: look at isWhitespace, maybe that function does
 *          more what you expect.
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isSpace(dchar ch) {
    UnicodeData **info = (ch in unicodeData);
    if (info is null)
        return false;

    auto separators = UnicodeData.UnicodeData.GeneralCategory.Zs
                    | UnicodeData.UnicodeData.GeneralCategory.Zl
                    | UnicodeData.UnicodeData.GeneralCategory.Zp;
    return ((*info).generalCategory & separators) != 0;
}
512
513
/**
 * Determines if a character is a printable character as
 * specified in the Unicode Standard. A character is printable
 * when it is known to the Unicode data and does not belong to
 * one of the General Categories Cn (unassigned), Cc (control),
 * Cf (format), Co (private use) or Cs (surrogate).
 *
 * Params:
 *     ch = the character to be inspected
 */
bool isPrintable(dchar ch) {
    UnicodeData **d = (ch in unicodeData);
    // Characters without an entry are treated as not printable.
    if (d is null)
        return false;
    // The listed categories are exactly the non-printable ones, so the
    // character is printable when none of their bits is set.
    return ((*d).generalCategory &
            ( UnicodeData.UnicodeData.GeneralCategory.Cn
            | UnicodeData.UnicodeData.GeneralCategory.Cc
            | UnicodeData.UnicodeData.GeneralCategory.Cf
            | UnicodeData.UnicodeData.GeneralCategory.Co
            | UnicodeData.UnicodeData.GeneralCategory.Cs)) == 0;
}
534
535
536 debug (UnitTest) {       
537
538 unittest {
539    
540    
541     // 1) No Buffer passed, no resize, no SpecialCase
542    
543     char[] testString1utf8 = "\u00E4\u00F6\u00FC"; 
544     wchar[] testString1utf16 = "\u00E4\u00F6\u00FC";
545     dchar[] testString1utf32 = "\u00E4\u00F6\u00FC";
546     char[] refString1utf8 = "\u00C4\u00D6\u00DC";
547     wchar[] refString1utf16 = "\u00C4\u00D6\u00DC";
548     dchar[]