root/trunk/tinyxml.d

Revision 33, 80.6 kB (checked in by yossarian, 2 years ago)

added opApply() to tinyxml

Line 
1 /*
2 www.sourceforge.net/projects/tinyxml
3 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any
7 damages arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any
10 purpose, including commercial applications, and to alter it and
11 redistribute it freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must
14 not claim that you wrote the original software. If you use this
15 software in a product, an acknowledgment in the product documentation
16 would be appreciated but is not required.
17
18 2. Altered source versions must be plainly marked as such, and
19 must not be misrepresented as being the original software.
20
21 3. This notice may not be removed or altered from any source
22 distribution.
23 */
24 public import std.stream;
25 public import std.string;
26
27
28 private import std.stdio;
29
30 private import std.uni;
31 private import xpath.utf8;
32 private import std.ctype;
33 public import std.cstream;
34 public import xpath.xpath_stream;
35 public {
36 enum TiXmlEncoding   // Encoding hint handed to the parsing helpers.
37 {
38     TIXML_ENCODING_UNKNOWN,  // Value of TIXML_DEFAULT_ENCODING.
39     TIXML_ENCODING_UTF8,     // Selects the UTF-8 branches in GetEntity() and GetChar().
40     TIXML_ENCODING_LEGACY    // NOTE(review): presumably single-byte legacy text - confirm against callers.
41 };
42 }
43 private 
44 {
45     debug{
46         alias writefln TIXML_LOG;
47     }
48        
49     import std.utf;
50     const char TIXML_UTF_LEAD_0 = 0xefU;
51     const char TIXML_UTF_LEAD_1 = 0xbbU;
52     const char TIXML_UTF_LEAD_2 = 0xbfU;
53
54     class TiXmlParsingData
55     {
56        
57         public {
58             void Stamp( char[] now, TiXmlEncoding encoding )
59             in { assert( ! (now is null) ); assert(now.length > 0); }
60             body
61             {
62                 // Do nothing if the tabsize is 0.
63             }
64
65            
66             TiXmlCursor Cursor()    { return cursor; }
67         }
68         protected{
69         // Only used by the document!
70             this( char[] start, int _tabsize, int row, int col )
71             {
72                 stamp = start;
73                 tabsize = _tabsize;
74                 cursor.row = row;
75                 cursor.col = col;
76             }
77         }
78    
79         TiXmlCursor     cursor;
80         char[]      stamp;
81         int             tabsize;
82     };
83
84
85 }
86 public {
87 /+
88 #ifndef USE_MMGR
89 #include <ctype.h>
90 #include <stdio.h>
91 #include <stdlib.h>
92 #include <string.h>
93 #include <assert.h>
94 #endif
95
96 // Help out windows:
97 #if defined( _DEBUG ) && !defined( DEBUG )
98 #define DEBUG
99 #endif
100
101 #ifdef TIXML_USE_STL
102     #include <string>
103     #include <iostream>
104     #define TIXML_STRING    std::string
105     #define TIXML_ISTREAM   std::istream
106     #define TIXML_OSTREAM   std::ostream
107 #else
108     #include "tinystr.h"
109     #define TIXML_STRING    TiXmlString
110     #define TIXML_OSTREAM   TiXmlOutputStream
111 #endif
112 +/
113
114 const int TIXML_MAJOR_VERSION = 2;
115 const int TIXML_MINOR_VERSION = 4;
116 const int TIXML_PATCH_VERSION = 3;
117 const int NUM_ENTITY = 5;
118
119 /*  Internal structure for tracking location of items
120     in the XML file.
121 */
struct TiXmlCursor
{
    /*  Resets both coordinates to -1, the "no position" value the fields
        start with.

        The literal is spelled out on purpose. `row.init` yields the field's
        own initializer (-1) only under D1 rules; under D2 rules it yields the
        type default (0), which Row()/Column() would report as a valid 1,1
        position.
    */
    void Clear()        { row = -1; col = -1; }

    int row = -1;   // 0 based; -1 means "not set".
    int col = -1;   // 0 based; -1 means "not set".
};
128
129
130 // Only used by Attribute::Query functions
131 enum AttributeQueryEnum     // Result codes of the attribute Query functions (defined outside this view).
132 { 
133     TIXML_SUCCESS,          // NOTE(review): presumably "attribute found and converted" - confirm.
134     TIXML_NO_ATTRIBUTE,     // NOTE(review): presumably "no attribute with that name" - confirm.
135     TIXML_WRONG_TYPE        // NOTE(review): presumably "value could not be converted" - confirm.
136 };
137
138
139 // Used by the parsing routines.
140
141 const TiXmlEncoding TIXML_DEFAULT_ENCODING = TiXmlEncoding.TIXML_ENCODING_UNKNOWN;
142
143 /** TiXmlBase is a base class for every class in TinyXml.
144     It does little except to establish that TinyXml classes
145     can be printed and provide some utility functions.
146
147     In XML, the document and elements can contain
148     other elements and other types of nodes.
149
150     @verbatim
151     A Document can contain: Element (container or leaf)
152                             Comment (leaf)
153                             Unknown (leaf)
154                             Declaration( leaf )
155
156     An Element can contain: Element (container or leaf)
157                             Text    (leaf)
158                             Attributes (not on tree)
159                             Comment (leaf)
160                             Unknown (leaf)
161
162     A Declaration contains: Attributes (not on tree)
163     @endverbatim
164 */
165 class TiXmlBase
166 {
167     public {
168     this() { userData = null; }
169
170     /** All TinyXml classes can print themselves to a filestream.
171         This is a formatted print, and will insert tabs and newlines.
172         
173         (For an unformatted stream, use the << operator.)
174     */
175     abstract void Print( Stream Stream, int depth );
176
177     /** The world does not agree on whether white space should be kept or
178         not. In order to make everyone happy, these global, static functions
179         are provided to set whether or not TinyXml will condense all white space
180         into a single space or not. The default is to condense. Note that changing this
181         value is not thread safe.
182     */
183     static void SetCondenseWhiteSpace( bool condense )      { condenseWhiteSpace = condense; }
184
185     /// Return the current white space setting.
186     static bool IsWhiteSpaceCondensed()                     { return condenseWhiteSpace; }
187
188     /** Return the position, in the original source file, of this node or attribute.
189         The row and column are 1-based. (That is the first row and first column is
190         1,1). If the returned values are 0 or less, then the parser does not have
191         a row and column value.
192
193         Generally, the row and column value will be set when the TiXmlDocument::Load(),
194         TiXmlDocument::LoadFile(), or any TiXmlNode::Parse() is called. It will NOT be set
195         when the DOM was created from operator>>.
196
197         The values reflect the initial load. Once the DOM is modified programmatically
198         (by adding or changing nodes and attributes) the new values will NOT update to
199         reflect changes in the document.
200
201         There is a minor performance cost to computing the row and column. Computation
202         can be disabled if TiXmlDocument::SetTabSize() is called with 0 as the value.
203
204         @sa TiXmlDocument::SetTabSize()
205     */
206     int Row() { return location.row + 1; }
207     int Column() { return location.col + 1; }   ///< See Row()
208
209     void  SetUserData( void* user )         { userData = user; }
210     void* GetUserData()                     { return userData; }
211
212     // Table that returns, for a given lead byte, the total number of bytes
213     // in the UTF-8 sequence.
214     static const int utf8ByteTable[256];
215
216     abstract char[] Parse(  char[] p,
217                                 TiXmlParsingData data,
218                                 TiXmlEncoding encoding /*= TIXML_ENCODING_UNKNOWN */ );
219
220     enum TiXmlError     // Error codes. NOTE(review): order appears to parallel errorString[] entry for entry - keep both in step.
221     {
222         TIXML_NO_ERROR = 0,
223         TIXML_ERROR,
224         TIXML_ERROR_OPENING_FILE,
225         TIXML_ERROR_OUT_OF_MEMORY,
226         TIXML_ERROR_PARSING_ELEMENT,
227         TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME,
228         TIXML_ERROR_READING_ELEMENT_VALUE,
229         TIXML_ERROR_READING_ATTRIBUTES,
230         TIXML_ERROR_PARSING_EMPTY,
231         TIXML_ERROR_READING_END_TAG,
232         TIXML_ERROR_PARSING_UNKNOWN,
233         TIXML_ERROR_PARSING_COMMENT,
234         TIXML_ERROR_PARSING_DECLARATION,
235         TIXML_ERROR_DOCUMENT_EMPTY,
236         TIXML_ERROR_EMBEDDED_NULL,
237         TIXML_ERROR_PARSING_CDATA,
238         TIXML_ERROR_DOCUMENT_TOP_ONLY,
239
240         TIXML_ERROR_STRING_COUNT    // Not an error: the number of codes above (17), equal to the number of errorString[] entries.
241     };
242     }
243 protected{
244
245     static char[]   SkipWhiteSpace( char[] op, TiXmlEncoding encoding )
246     {
247         return stripl(op);
248         /*
249         if (!op || op.length == 0)
250             return null;
251         int p = 0;
252         if ( encoding == TiXmlEncoding.TIXML_ENCODING_UTF8 )
253         {
254             for (p = 0; p < op.length; ++p)
255             {
256                 //const unsigned char* pU = (const unsigned char*)p;
257                 
258                 // Skip the stupid Microsoft UTF-8 Byte order marks
259                 if (    op[p]==TIXML_UTF_LEAD_0
260                      && op[p+1]==TIXML_UTF_LEAD_1
261                      && op[p+2]==TIXML_UTF_LEAD_2 )
262                 {
263                     p += 3;
264                     continue;
265                 }
266                 else if(op[p]==TIXML_UTF_LEAD_0
267                      && op[p+1]==0xbfU
268                      && op[p+2]==0xbeU )
269                 {
270                     p += 3;
271                     continue;
272                 }
273                 else if(op[p]==TIXML_UTF_LEAD_0
274                      && op[p+1]==0xbfU
275                      && op[p+2]==0xbfU )
276                 {
277                     p += 3;
278                     continue;
279                 }
280     
281                 if ( IsWhiteSpace( op[p] ) || op[p] == '\n' || op[p] =='\r' )       // Still using old rules for white space.
282                     ++p;
283                 else
284                     break;
285             }
286         }
287         else
288         {
289             while ( p < op.length && IsWhiteSpace( op[p] ) || op[p] == '\n' || op[p] =='\r' )
290                 ++p;
291         }
292     
293         return op[p..length];*/
294     }
295    
296     static bool IsWhiteSpace( char c )
297     {
298         return (  std.string.iswhite( c ) || c == '\n' || c == '\r' );
299     }
300     static bool IsWhiteSpace( int c )
301     {
302         if ( c < 256 )
303             return IsWhiteSpace( cast(char) c );
304         return false;   // Again, only truly correct for English/Latin...but usually works.
305     }
306
307     void StreamOut (OutputStream o);
308
309     /*#ifdef TIXML_USE_STL
310         static bool StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag );
311         static bool StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag );
312     #endif*/
313
314     /*  Reads an XML name into the string provided. Returns
315         a pointer just past the last character of the name,
316         or 0 if the function has an error.
317     */
318     static char[] ReadName( char[] p, out char[] name, TiXmlEncoding encoding )
319     in
320     { assert(p); }
321     body
322     {
323         name = "";
324    
325         // Names start with letters or underscores.
326         // Of course, in unicode, tinyxml has no idea what a letter *is*. The
327         // algorithm is generous.
328         //
329         // After that, they can be letters, underscores, numbers,
330         // hyphens, or colons. (Colons are valid ony for namespaces,
331         // but tinyxml can't tell namespaces from names.)
332         int i = 0;
333        
334         if ( p && p.length > 0
335              && ( IsAlpha( cast(ubyte) p[0], encoding ) || p[0] == '_' ) )
336         {
337             while( i < p.length &&  ( IsAlphaNum( p[i], encoding )
338                              || p[i] == '_'
339                              || p[i] == '-'
340                              || p[i] == '.'
341                              || p[i] == ':' ) )
342             {
343                 name ~= p[i];
344                 ++i;
345             }
346             return p[i..length];
347         }
348         return null;
349     }
350
351     /*  Reads text. Returns a pointer past the given end tag.
352         Wickedly complex options, but it keeps the (sensitive) code in one place.
353     */
354     static int ReadText(    char[] sin,             // where to start
355                                     inout char[] text,          // the string read
356                                     bool ignoreWhiteSpace,      // whether to keep the white space
357                                     char[] endTag,          // what ends this text
358                                     bool ignoreCase,            // whether to ignore case in the end tag
359                                     TiXmlEncoding encoding )    // the current encoding
360 /*  in { writefln("ReadText: sin='%s', endTag='%s'",replace(sin,"\n","\\n"),endTag); }
361     out (o) { writefln(" returned %d (text: %s)", o, text); }
362     body*/
363     {
364         text = "";
365         int p = 0;
366         if (!(!ignoreWhiteSpace         // certain tags always keep whitespace
367              || !condenseWhiteSpace ))  // if true, whitespace is always kept
368              sin = SkipWhiteSpace( sin, encoding );
369              
370         p = (!ignoreCase)?(find(sin, endTag)):(ifind(sin,endTag));
371         if (p == -1)
372         {
373             text = sin;
374             return sin.length;
375         }
376         else
377         {
378             text = sin[0..p];
379             return p + endTag.length;
380         }
381     }
382
383                                    
384                                    
385                                    
386
387     // If an entity has been found, transform it into a character.
388     static int GetEntity( char[] sin, inout char[] value, inout int length, TiXmlEncoding encoding )
389     {
390         // Presume an entity, and pull it out.
391         char[] ent;
392         int delta;
393         int i;
394         length = 0;
395    
396         if ( sin.length > 2 && sin[1] == '#' )
397         {
398             ulong ucs = 0;
399             //ptrdiff_t delta = 0;
400             uint mult = 1;
401    
402             if ( sin[2] == 'x' )
403             {
404                 if (sin.length < 3) return 0;
405                 // Hexadecimal.
406    
407                 char[] q = sin[3..length];
408
409                 delta = find(q, ';');
410                 if (delta == -1)
411                     return 0;
412                    
413                 int _q = delta-1;
414    
415                 while ( q[_q] != 'x' )
416                 {
417                     if ( q[_q] >= '0' && q[_q] <= '9' )
418                         ucs += mult * (q[_q] - '0');
419                     else if ( q[_q] >= 'a' && q[_q] <= 'f' )
420                         ucs += mult * (*q - 'a' + 10);
421                     else if ( q[_q] >= 'A' && q[_q] <= 'F' )
422                         ucs += mult * (*q - 'A' + 10 );
423                     else
424                         return 0;
425                     mult *= 16;
426                     --_q;
427                 }
428             }
429             else
430             {
431                 // Decimal.
432                 if (sin.length < 3) return 0;
433                
434                 char[] q = sin[2..length];
435                 delta = find(q, ';');
436                 if ( delta == -1 ) return 0;
437                 int _q = delta-1;
438                 --_q;
439                
440                 while ( q[_q] != '#' )
441                 {
442                     if ( q[_q] >= '0' && q[_q] <= '9' )
443                         ucs += mult * (q[_q] - '0');
444                     else
445                         return 0;
446                     mult *= 10;
447                     --_q;
448                 }
449             }
450             if ( encoding == TiXmlEncoding.TIXML_ENCODING_UTF8 )
451             {
452                 ConvertUTF32ToUTF8( ucs, value, length );
453             }
454             else
455             {
456                 value[0] = cast(char)ucs;
457                 length = 1;
458             }
459             return delta + 1;
460         }
461    
462         // Now try to match it.
463         for( i=0; i<NUM_ENTITY; ++i )
464         {
465             if ( entity[i].str == sin )
466             {
467                 value[0] = entity[i].chr;
468                 length = 1;
469                 return entity[i].str.length;
470             }
471         }
472    
473         // So it wasn't an entity, its unrecognized, or something like that.
474         value[0] = sin[0];  // Don't put back the last one, since we return it!
475         length = 1;         // Leave unrecognized entities - this doesn't really work.
476                         // Just writes strange XML.
477         return 1;
478     }
479
480
481     // Get a character, while interpreting entities.
482     // The length can be from 0 to 4 bytes.
483     static int GetChar( char[] sin, inout char[] _value, inout int length, TiXmlEncoding encoding )
484     in { assert(sin); }
485     body
486     {
487         if ( encoding == TiXmlEncoding.TIXML_ENCODING_UTF8 )
488         {
489             length = utf8ByteTable[ sin[0] ];
490             assert( length >= 0 && length < 5 );
491         }
492         else
493         {
494             length = 1;
495         }
496
497         if ( length == 1 )
498         {
499             if ( sin[0] == '&' )
500                 return GetEntity( sin, _value, length, encoding );
501             _value = sin;
502             return 1;
503         }
504         else if ( length > 0 )
505         {
506             for( int i=0; i < sin.length && i<length; ++i ) {
507                 _value[i] = sin[i];
508             }
509             return length;
510         }
511         else
512         {
513             return 0;
514         }
515     }
516
517     // Puts a string to a stream, expanding entities as it goes.
518     // Note this should not contian the '<', '>', etc, or they will be transformed into entities!
519     static void PutString( char[] str, OutputStream outs )
520     {
521         char[] buffer;
522         PutString( str, buffer );
523         outs.writeString(buffer);
524     }
525
526     static void PutString( char[] str, out char[] outString )
527     {
528         int i=0;
529    
530         foreach (int i,char c;str)
531         {
532             ubyte _c = cast(ubyte) str[i];
533    
534             if ( _c == '&'
535                  && i < ( str.length - 2 )
536                  && str[i+1] == '#'
537                  && str[i+2] == 'x' )
538             {
539                 // Hexadecimal character reference.
540                 // Pass through unchanged.
541                 // &#xA9;   -- copyright symbol, for example.
542                 //
543                 // The -1 is a bug fix from Rob Laveaux. It keeps
544                 // an overflow from happening if there is no ';'.
545                 // There are actually 2 ways to exit this loop -
546                 // while fails (error case) and break (semicolon found).
547                 // However, there is no mechanism (currently) for
548                 // this function to return an error.
549                 while ( i<str.length-1 )
550                 {
551                     outString ~= str[i];
552                     ++i;
553                     if ( str[i] == ';' )
554                         break;
555                 }
556             }
557             else if ( c == '&' )
558             {
559                 outString ~= entity[0].str;
560                 ++i;
561             }
562             else if ( c == '<' )
563             {
564                 outString ~= entity[1].str;
565             }
566             else if ( c == '>' )
567             {
568                 outString ~= entity[2].str;
569                 ++i;
570             }
571             else if ( c == '\"' )
572             {
573                 outString ~= entity[3].str;
574                 ++i;
575             }
576             else if ( c == '\'' )
577             {
578                 outString ~= entity[4].str;
579                 ++i;
580             }
581             else if ( c < 32 )
582             {
583                 // Easy pass at non-alpha/numeric/symbol
584                 // Below 32 is symbolic.
585                 char[] buf = format("&#x%02X;", cast(uint) ( c & 0xff ));
586                
587
588                 outString ~= buf;
589                 ++i;
590             }
591             else
592             {
593                 outString ~= c; // somewhat more efficient function call.
594                 ++i;
595             }
596         }
597     }
598
599
600     // Return true if the next characters in the stream are any of the endTag sequences.
601     // Ignore case only works for english, and should only be relied on when comparing
602     // to English words: StringEqual( p, "version", true ) is fine.
603     static bool StringEqual(    char[] p,
604                                 char[] endTag,
605                                 bool ignoreCase,
606                                 TiXmlEncoding encoding )
607     in { assert(p); assert(endTag); assert(p.length > 0); }
608     body
609     {
610         if ( ignoreCase )
611             return (ifind(p, endTag) == 0);
612         else
613             return (find(p, endTag) == 0);
614     }
615
616
617     static char[][] errorString = [
618     "No error",
619     "Error",
620     "Failed to open file",
621     "Memory allocation failed.",
622     "Error parsing Element.",
623     "Failed to read Element name",
624     "Error reading Element value.",
625     "Error reading Attributes.",
626     "Error: empty tag.",
627     "Error reading end tag.",
628     "Error parsing Unknown.",
629     "Error parsing Comment.",
630     "Error parsing Declaration.",
631     "Error document empty.",
632     "Error null (0) or unexpected EOF found in input stream.",
633     "Error parsing CDATA.",
634     "Error when TiXmlDocument added to document, because TiXmlDocument can only be at the root.",
635 ];
636
637     TiXmlCursor location;
638
639     /// Field containing a generic user pointer
640     void*           userData;
641    
642     // None of these methods are reliable for any language except English.
643     // Good for approximation, not great for accuracy.
644     static int IsAlpha( ubyte anyByte, TiXmlEncoding encoding )
645     {
646         // This will only work for low-ascii, everything else is assumed to be a valid
647         // letter. I'm not sure this is the best approach, but it is quite tricky trying
648         // to figure out alhabetical vs. not across encoding. So take a very
649         // conservative approach.
650         return u8isUniAlpha(cast(char)anyByte);
651     }
652
653     static int IsAlphaNum( ubyte anyByte, TiXmlEncoding encoding )
654     {
655         if ( anyByte < 127 )
656             return (IsAlpha(anyByte,encoding) || std.ctype.isdigit(anyByte));
657         else
658             return 1;
659     }
660
661    
662     static int ToLower( int v, TiXmlEncoding encoding )
663     {
664         return std.ctype.tolower(v);
665     }
666     static void ConvertUTF32ToUTF8( ulong input, inout char[] output, inout int length )
667     {
668         const ulong BYTE_MASK = 0xBF;
669         const ulong BYTE_MARK