root/trunk/docsrc/cppstrings.dd

Revision 2040, 13.1 kB (checked in by walter, 2 years ago)

typography

  • Property svn:eol-style set to native
Line 
1 Ddoc
2
3 $(COMMUNITY D Strings vs C++ Strings,
4
5
6 Why have strings built into the core language of D rather than entirely in
7 a library as in C++ Strings? What's the point? Where's the improvement?
8
9 <h4>Concatenation Operator</h4>
10
11 $(P C++ Strings are stuck with overloading existing operators. The
12     obvious choice for concatenation is += and +.
13     But someone just looking at the code will see + and think "addition".
14     He'll have to look up the types (and types are frequently buried
15     behind multiple typedefs) to see that it's a string type, and
16     it's not adding strings but concatenating them.
17 )
18 $(P Additionally, if one has an array of floats, is $(SINGLEQUOTE +) overloaded to
19     be the same as a vector addition, or an array concatenation?
20 )
21 $(P In D, these problems are avoided by introducing a new binary
22     operator ~ as the concatenation operator. It works with
23     arrays (of which strings are a subset). ~= is the corresponding
24     append operator. ~ on arrays of floats would concatenate them,
25     + would imply a vector add. Adding a new operator makes
26     orthogonality and consistency possible in the treatment of arrays.
27     (In D, strings are simply arrays of characters, not a special
28     type.)
29 )
30
31 <h4>Interoperability With C String Syntax</h4>
32
33 $(P Overloading of operators only really works if one of the operands
34     is overloadable. So the C++ string class cannot consistently
35     handle arbitrary expressions containing strings. Consider:
36 )
37
38 $(CCODE
39 const char abc[5] = "world";
40 string str = "hello" + abc;
41 )
42
43 $(P That isn't going to work. But it does work when the core language
44     knows about strings:
45 )
46
47 $(CCODE
48 const char[5] abc = "world";
49 char[] str = "hello" ~ abc;
50 )
51
52 <h4>Consistency With C String Syntax</h4>
53
54 $(P
55     There are three ways to find the length of a string in C++:
56 )
57
58 $(CCODE
59 const char abc[] = "world"; :   sizeof(abc)/sizeof(abc[0])-1
60                 :   strlen(abc)
61 string str;         :   str.length()
62 )
63
64 $(P
65     That kind of inconsistency makes it hard to write generic templates.
66     Consider D:
67 )
68
69 -----------------------
70 char[5] abc = "world";  :   abc.length
71 char[] str      :   str.length
72 -----------------------
73
74 <h4>Checking For Empty Strings</h4>
75
76 $(P
77     C++ strings use a function to determine if a string is empty:
78 )
79
80 $(CCODE
81 string str;
82 if (str.empty())
83     // string is empty
84 )
85
86 $(P
87     In D, an empty string has zero length:
88 )
89
90 -----------------------
91 char[] str;
92 if (!str.length)
93     // string is empty
94 -----------------------
95
96
97 <h4>Resizing Existing String</h4>
98
99 $(P
100     C++ handles this with the resize() member function:
101 )
102
103 $(CCODE
104 string str;
105 str.resize(newsize);
106 )
107
108 $(P
109     D takes advantage of knowing that str is an array, and
110     so resizing it is just changing the length property:
111 )
112
113 -----------------------
114 char[] str;
115 str.length = newsize;
116 -----------------------
117
118 <h4>Slicing a String</h4>
119
120 $(P
121     C++ slices an existing string using a special constructor:
122 )
123
124 $(CCODE
125 string s1 = "hello world";
126 string s2(s1, 6, 5);        // s2 is "world"
127 )
128
129 $(P
130     D has the array slice syntax, not possible with C++:
131 )
132
133 -----------------------
134 string s1 = "hello world";
135 string s2 = s1[6 .. 11];    // s2 is "world"
136 -----------------------
137
138 $(P
139     Slicing, of course, works with any array in D, not just strings.
140 )
141
142 <h4>Copying a String</h4>
143
144 $(P
145     C++ copies strings with the replace function:
146 )
147
148 $(CCODE
149 string s1 = "hello world";
150 string s2 = "goodbye      ";
151 s2.replace(8, 5, s1, 6, 5); // s2 is "goodbye world"
152 )
153
154 $(P
155     D uses the slice syntax as an lvalue:
156 )
157
158 -----------------------
159 char[] s1 = "hello world".dup;
160 char[] s2 = "goodbye      ".dup;
161 s2[8..13] = s1[6..11];      // s2 is "goodbye world"
162 -----------------------
163
164     $(P The $(CODE .dup) is needed because string literals are
165     read-only in D; the $(CODE .dup) creates a copy
166     that is writable.
167     )
168
169
170 <h4>Conversions to C Strings</h4>
171
172 $(P
173     This is needed for compatibility with C APIs. In C++, this
174     uses the c_str() member function:
175 )
176
177 $(CCODE
178 void foo(const char *);
179 string s1;
180 foo(s1.c_str());
181 )
182
183 $(P
184     In D, strings can be converted to char* using the .ptr property:
185 )
186
187 -----------------------
188 void foo(char*);
189 char[] s1;
190 foo(s1.ptr);
191 -----------------------
192     $(P although for this to work where $(TT foo) expects a 0-terminated
193     string, $(TT s1) must have a terminating 0. Alternatively, the
194     function $(TT std.string.toStringz) will ensure that it does:)
195
196 -----------------------
197 void foo(char*);
198 char[] s1;
199 foo(std.string.$(B toStringz)(s1));
200 -----------------------
201
202
203 <h4>Array Bounds Checking</h4>
204
205 $(P
206     In C++, string array bounds checking for [] is not done.
207     In D, array bounds checking is on by default and it can be turned off
208     with a compiler switch after the program is debugged.
209 )
210
211 <h4>String Switch Statements</h4>
212
213 $(P
214     String switch statements are not possible in C++, nor is there any
215     way to add them by adding more to the library. In D, they take the obvious
216     syntactical forms:
217 )
218
219 -----------------------
220 switch (str)
221 {
222     case "hello":
223     case "world":
224     ...
225 }
226 -----------------------
227
228 $(P
229     where str can be any of literal "string"s, fixed string arrays
230     like char[10], or dynamic strings like char[]. A quality implementation
231     can, of course, explore many strategies of efficiently implementing
232     this based on the contents of the case strings.
233 )
234
235 <h4>Filling a String</h4>
236
237 $(P
238     In C++, this is done with the replace() member function:
239 )
240
241 $(CCODE
242 string str = "hello";
243 str.replace(1,2,2,'?');     // str is "h??lo"
244 )
245
246 $(P
247     In D, use the array slicing syntax in the natural manner:
248 )
249
250 -----------------------
251 char[5] str = "hello";
252 str[1..3] = '?';        // str is "h??lo"
253 -----------------------
254
255 <h4>Value vs Reference</h4>
256
257 $(P
258     C++ strings, as implemented by STLport, are by value and are
259     0-terminated. [The latter is an implementation choice, but
260     STLport seems to be the most popular implementation.]
261     This, coupled with no garbage collection, has
262     some consequences. First of all, any string created must make
263     its own copy of the string data. The $(SINGLEQUOTE owner) of the string
264     data must be kept track of, because when the owner is deleted
265     all references become invalid. If one tries to avoid the
266     dangling reference problem by treating strings as value types,
267     there will be a lot of overhead of memory allocation,
268     data copying, and memory deallocation. Next, the 0-termination
269     implies that strings cannot refer to other strings. String
270     data in the data segment, stack, etc., cannot
271     be referred to.
272 )
273
274 $(P
275     D strings are reference types, and the memory is garbage collected.
276     This means that only references need to be copied, not the
277     string data. D strings can refer to data in the static data
278     segment, data on the stack, data inside other strings, objects,
279     file buffers, etc. There's no need to keep track of the $(SINGLEQUOTE owner)
280     of the string data.
281 )
282
283 $(P
284     The obvious question is if multiple D strings refer to the same
285     string data, what happens if the data is modified? All the
286     references will now point to the modified data. This can have
287     its own consequences, which can be avoided if the copy-on-write
288     convention is followed. Copy-on-write simply means that if
289     a string is written to, an actual copy of the string data is made
290     first.
291 )
292
293 $(P
294     The result of D strings being reference only and garbage collected
295     is that code that does a lot of string manipulating, such as
296     an LZW compressor, can be a lot more efficient in terms of both
297     memory consumption and speed.
298 )
299
300 <h2>Benchmark</h2>
301
302 $(P
303     Let's take a look at a small utility, wordcount, that counts up
304     the frequency of each word in a text file. In D, it looks like this:
305 )
306
307 -----------------------
308 import std.file;
309 import std.stdio;
310
311 int main (char[][] args)
312 {
313     int w_total;
314     int l_total;
315     int c_total;
316     int[char[]] dictionary;
317
318     writefln("   lines   words   bytes file");
319     for (int i = 1; i < args.length; ++i)
320     {
321     char[] input;
322     int w_cnt, l_cnt, c_cnt;
323     int inword;
324     int wstart;
325
326     input = cast(char[])std.file.read(args[i]);
327
328     for (int j = 0; j < input.length; j++)
329     {   char c;
330
331         c = input[j];
332         if (c == '\n')
333         ++l_cnt;
334         if (c >= '0' && c <= '9')
335         {
336         }
337         else if (c >= 'a' && c <= 'z' ||
338         c >= 'A' && c <= 'Z')
339         {
340         if (!inword)
341         {
342             wstart = j;
343             inword = 1;
344             ++w_cnt;
345         }
346         }
347         else if (inword)
348         {   char[] word = input[wstart .. j];
349
350         dictionary[word]++;
351         inword = 0;
352         }
353         ++c_cnt;
354     }
355     if (inword)
356     {   char[] w = input[wstart .. input.length];
357         dictionary[w]++;
358     }
359     writefln("%8s%8s%8s %s", l_cnt, w_cnt, c_cnt, args[i]);
360     l_total += l_cnt;
361     w_total += w_cnt;
362     c_total += c_cnt;
363     }
364
365     if (args.length > 2)
366     {
367     writefln("--------------------------------------%8s%8s%8s total",
368         l_total, w_total, c_total);
369     }
370
371     writefln("--------------------------------------");
372
373     foreach (char[] word1; dictionary.keys.sort)
374     {
375     writefln("%3d %s", dictionary[word1], word1);
376     }
377     return 0;
378 }
379 -----------------------
380
381     $(P (An $(LINK2 wc.html, alternate implementation)
382     uses buffered file I/O to handle larger files.))
383
384     $(P
385     Two people have written C++ implementations using the C++ standard
386     template library,
387     <a href="http://groups.google.com/groups?q=g:thl953709878d&dq=&hl=en&lr=&ie=UTF-8&oe=UTF-8&selm=bjacrl%244un%2401%241%40news.t-online.com">wccpp1</a>
388     and
389     $(LINK2 #wccpp2, wccpp2).
390     The input file
391     $(LINK2 http://www.gutenberg.org/dirs/etext91/alice30.txt, alice30.txt)
392     is the text of "Alice in Wonderland."
393     The D compiler,
394     <a HREF="http://ftp.digitalmars.com/dmd.zip" title="download D compiler">dmd</a>,
395     and the C++ compiler,
396     <a HREF="http://ftp.digitalmars.com/dmc.zip" title="download dmc.zip">dmc</a>,
397     share the same
398     optimizer and code generator, which provides a more apples-to-apples
399     comparison of the efficiency of the semantics of the languages
400     rather than the optimization and code generator sophistication.
401     Tests were run on a Win XP machine. dmc uses STLport for the template
402     implementation.
403     )
404
405     $(TABLE1
406     $(TR
407     $(TH Program)
408     $(TH Compile)
409     $(TH Compile Time)
410     $(TH Run)
411     $(TH Run Time)
412     )
413     $(TR
414     $(TD D wc)
415     $(TD dmd wc -O -release)
416     $(TD 0.0719)
417     $(TD wc alice30.txt &gt;log)
418     $(TD 0.0326)
419     )
420     $(TR
421     $(TD C++ wccpp1)
422     $(TD dmc wccpp1 -o -I\dm\stlport\stlport)
423     $(TD 2.1917)
424     $(TD wccpp1 alice30.txt &gt;log)
425     $(TD 0.0944)
426     )
427     $(TR
428     $(TD C++ wccpp2)
429     $(TD dmc wccpp2 -o -I\dm\stlport\stlport)
430     $(TD 2.0463)
431     $(TD wccpp2 alice30.txt &gt;log)
432     $(TD 0.1012)
433     )
434     )
435
436     $(P
437     The following tests were run on linux, again comparing a D compiler
438     ($(LINK2 http://home.earthlink.net/~dvdfrdmn/d, gdc))
439     and a C++ compiler ($(B g++)) that share a common optimizer and
440     code generator. The system is a Pentium III 800MHz running RedHat Linux 8.0
441     and gcc 3.4.2.
442     The Digital Mars D compiler for linux ($(B dmd))
443     is included for comparison.
444     )
445
446
447     $(TABLE1
448     $(TR
449     $(TH Program)
450     $(TH Compile)
451     $(TH Compile Time)
452     $(TH Run)
453     $(TH Run Time)
454     )
455     $(TR
456     $(TD D wc)
457     $(TD gdc -O2 -frelease -o wc wc.d)
458     $(TD 0.326)
459     $(TD wc alice30.txt &gt; /dev/null)
460     $(TD 0.041)
461     )
462     $(TR
463     $(TD D wc)
464     $(TD dmd wc -O -release)
465     $(TD 0.235)
466     $(TD wc alice30.txt &gt; /dev/null)
467     $(TD 0.041)
468     )
469     $(TR
470     $(TD C++ wccpp1)
471     $(TD g++ -O2 -o wccpp1 wccpp1.cc)
472     $(TD 2.874)
473     $(TD wccpp1 alice30.txt &gt; /dev/null)
474     $(TD 0.086)
475     )
476     $(TR
477     $(TD C++ wccpp2)
478     $(TD g++ -O2 -o wccpp2 wccpp2.cc)
479     $(TD 2.886)
480     $(TD wccpp2 alice30.txt &gt; /dev/null)
481     $(TD 0.095)
482     )
483     )
484
485     $(P
486     These tests compare gdc with g++ on a PowerMac G5 2x2.0GHz
487     running MacOS X 10.3.5 and gcc 3.4.2. (Timings are a little
488     less accurate.)
489     )
490
491     $(TABLE1
492     $(TR
493     $(TH Program)
494     $(TH Compile)
495     $(TH Compile Time)
496     $(TH Run)
497     $(TH Run Time)
498     )
499     $(TR
500     $(TD D wc)
501     $(TD gdc -O2 -frelease -o wc wc.d)
502     $(TD 0.28)
503     $(TD wc alice30.txt &gt; /dev/null)
504     $(TD 0.03)
505     )
506     $(TR
507     $(TD C++ wccpp1)
508     $(TD g++ -O2 -o wccpp1 wccpp1.cc)
509     $(TD 1.90)
510     $(TD wccpp1 alice30.txt &gt; /dev/null)
511     $(TD 0.07)
512     )
513     $(TR
514     $(TD C++ wccpp2)
515     $(TD g++ -O2 -o wccpp2 wccpp2.cc)
516     $(TD 1.88)
517     $(TD wccpp2 alice30.txt &gt; /dev/null)
518     $(TD 0.08)
519     )
520     )
521 <hr>
522 <h4><a name="wccpp2">wccpp2 by Allan Odgaard</a></h4>
523
524 $(CCODE
525 #include &lt;algorithm&gt;
526 #include &lt;cstdio&gt;
527 #include &lt;fstream&gt;
528 #include &lt;iterator&gt;
529 #include &lt;map&gt;
530 #include &lt;vector&gt;
531
532 bool isWordStartChar (char c)   { return isalpha(c); }
533 bool isWordEndChar (char c) { return !isalnum(c); }
534
535 int main (int argc, char const* argv[])
536 {
537     using namespace std;
538     printf("Lines Words Bytes File:\n");
539
540     map&lt;string, int&gt; dict;
541     int tLines = 0, tWords = 0, tBytes = 0;
542     for(int i = 1; i &lt; argc; i++)
543     {
544     ifstream file(argv[i]);
545     istreambuf_iterator&lt;char&gt; from(file.rdbuf()), to;
546     vector&lt;char&gt; v(from, to);
547     vector&lt;char&gt;::iterator first = v.begin(), last = v.end(), bow, eow;
548
549     int numLines = count(first, last, '\n');
550     int numWords = 0;
551     int numBytes = last - first;
552
553     for(eow = first; eow != last; )
554     {
555         bow = find_if(eow, last, isWordStartChar);
556         eow = find_if(bow, last, isWordEndChar);
557         if(bow != eow)
558         ++dict[string(bow, eow)], ++numWords;
559     }
560
561     printf("%5d %5d %5d %s\n", numLines, numWords, numBytes, argv[i]);
562
563     tLines += numLines;
564     tWords += numWords;
565     tBytes += numBytes;
566     }
567
568     if(argc &gt; 2)
569         printf("-----------------------\n%5d %5d %5d\n", tLines, tWords, tBytes);
570     printf("-----------------------\n\n");
571
572     for(map&lt;string, int&gt;::const_iterator it = dict.begin(); it != dict.end(); ++it)
573         printf("%5d %s\n", it-&gt;second, it-&gt;first.c_str());
574
575     return 0;
576 }
577 )
578
579 )
580
581 Macros:
582     TITLE=D Strings vs C++ Strings
583     WIKI=CPPstrings
Note: See TracBrowser for help on using the browser.