GEAR  1.9.0
 All Classes Namespaces Functions Variables Typedefs Enumerations Friends Pages
tinyxmlparser.cc
1 /*
2 www.sourceforge.net/projects/tinyxml
3 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4 
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any
7 damages arising from the use of this software.
8 
9 Permission is granted to anyone to use this software for any
10 purpose, including commercial applications, and to alter it and
11 redistribute it freely, subject to the following restrictions:
12 
13 1. The origin of this software must not be misrepresented; you must
14 not claim that you wrote the original software. If you use this
15 software in a product, an acknowledgment in the product documentation
16 would be appreciated but is not required.
17 
18 2. Altered source versions must be plainly marked as such, and
19 must not be misrepresented as being the original software.
20 
21 3. This notice may not be removed or altered from any source
22 distribution.
23 
24  F.Gaede, DESY : changed extension to .cc for use with gear
25  and include from "gearxml/tinyxml.h"
26  : put in namespace gear
27  $Id: tinyxmlparser.cc,v 1.2 2008-12-19 13:52:34 gaede Exp $
28 */
29 
30 #include "gearxml/tinyxml.h"
31 #include <ctype.h>
32 #include <stddef.h>
33 
34 
35 namespace gear{
36 
37 //#define DEBUG_PARSER
38 
39 // Note tha "PutString" hardcodes the same list. This
40 // is less flexible than it appears. Changing the entries
41 // or order will break putstring.
42 TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
43 {
44  { "&amp;", 5, '&' },
45  { "&lt;", 4, '<' },
46  { "&gt;", 4, '>' },
47  { "&quot;", 6, '\"' },
48  { "&apos;", 6, '\'' }
49 };
50 
51 // Bunch of unicode info at:
52 // http://www.unicode.org/faq/utf_bom.html
53 // Including the basic of this table, which determines the #bytes in the
54 // sequence from the lead byte. 1 placed for invalid sequences --
55 // although the result will be junk, pass it through as much as possible.
56 // Beware of the non-characters in UTF-8:
57 // ef bb bf (Microsoft "lead bytes")
58 // ef bf be
59 // ef bf bf
60 
// The three bytes EF BB BF (the Microsoft "lead bytes" / UTF-8 byte order
// mark), in the order they appear in the input.
constexpr unsigned char TIXML_UTF_LEAD_0 = 0xefU;
constexpr unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
constexpr unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
64 
65 const int TiXmlBase::utf8ByteTable[256] =
66 {
67  // 0 1 2 3 4 5 6 7 8 9 a b c d e f
68  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
69  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
70  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
71  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
72  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
73  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
74  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
75  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
76  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
77  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
78  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
79  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
80  1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
81  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
82  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
83  4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
84 };
85 
86 
87 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
88 {
89  const unsigned long BYTE_MASK = 0xBF;
90  const unsigned long BYTE_MARK = 0x80;
91  const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
92 
93  if (input < 0x80)
94  *length = 1;
95  else if ( input < 0x800 )
96  *length = 2;
97  else if ( input < 0x10000 )
98  *length = 3;
99  else if ( input < 0x200000 )
100  *length = 4;
101  else
102  { *length = 0; return; } // This code won't covert this correctly anyway.
103 
104  output += *length;
105 
106  // Scary scary fall throughs.
107  switch (*length)
108  {
109  case 4:
110  --output;
111  *output = (char)((input | BYTE_MARK) & BYTE_MASK);
112  input >>= 6;
113  case 3:
114  --output;
115  *output = (char)((input | BYTE_MARK) & BYTE_MASK);
116  input >>= 6;
117  case 2:
118  --output;
119  *output = (char)((input | BYTE_MARK) & BYTE_MASK);
120  input >>= 6;
121  case 1:
122  --output;
123  *output = (char)(input | FIRST_BYTE_MARK[*length]);
124  }
125 }
126 
127 
128 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
129 {
130  // This will only work for low-ascii, everything else is assumed to be a valid
131  // letter. I'm not sure this is the best approach, but it is quite tricky trying
132  // to figure out alhabetical vs. not across encoding. So take a very
133  // conservative approach.
134 
135 // if ( encoding == TIXML_ENCODING_UTF8 )
136 // {
137  if ( anyByte < 127 )
138  return isalpha( anyByte );
139  else
140  return 1; // What else to do? The unicode set is huge...get the english ones right.
141 // }
142 // else
143 // {
144 // return isalpha( anyByte );
145 // }
146 }
147 
148 
149 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
150 {
151  // This will only work for low-ascii, everything else is assumed to be a valid
152  // letter. I'm not sure this is the best approach, but it is quite tricky trying
153  // to figure out alhabetical vs. not across encoding. So take a very
154  // conservative approach.
155 
156 // if ( encoding == TIXML_ENCODING_UTF8 )
157 // {
158  if ( anyByte < 127 )
159  return isalnum( anyByte );
160  else
161  return 1; // What else to do? The unicode set is huge...get the english ones right.
162 // }
163 // else
164 // {
165 // return isalnum( anyByte );
166 // }
167 }
168 
169 
171 {
172  friend class TiXmlDocument;
173  public:
174  void Stamp( const char* now, TiXmlEncoding encoding );
175 
176  const TiXmlCursor& Cursor() { return cursor; }
177 
178  private:
179  // Only used by the document!
180  TiXmlParsingData( const char* start, int _tabsize, int row, int col )
181  {
182  assert( start );
183  stamp = start;
184  tabsize = _tabsize;
185  cursor.row = row;
186  cursor.col = col;
187  }
188 
189  TiXmlCursor cursor{};
190  const char* stamp{};
191  int tabsize{};
192 };
193 
194 
195 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
196 {
197  assert( now );
198 
199  // Do nothing if the tabsize is 0.
200  if ( tabsize < 1 )
201  {
202  return;
203  }
204 
205  // Get the current row, column.
206  int row = cursor.row;
207  int col = cursor.col;
208  const char* p = stamp;
209  assert( p );
210 
211  while ( p < now )
212  {
213  // Treat p as unsigned, so we have a happy compiler.
214  const unsigned char* pU = (const unsigned char*)p;
215 
216  // Code contributed by Fletcher Dunn: (modified by lee)
217  switch (*pU) {
218  case 0:
219  // We *should* never get here, but in case we do, don't
220  // advance past the terminating null character, ever
221  return;
222 
223  case '\r':
224  // bump down to the next line
225  ++row;
226  col = 0;
227  // Eat the character
228  ++p;
229 
230  // Check for \r\n sequence, and treat this as a single character
231  if (*p == '\n') {
232  ++p;
233  }
234  break;
235 
236  case '\n':
237  // bump down to the next line
238  ++row;
239  col = 0;
240 
241  // Eat the character
242  ++p;
243 
244  // Check for \n\r sequence, and treat this as a single
245  // character. (Yes, this bizarre thing does occur still
246  // on some arcane platforms...)
247  if (*p == '\r') {
248  ++p;
249  }
250  break;
251 
252  case '\t':
253  // Eat the character
254  ++p;
255 
256  // Skip to next tab stop
257  col = (col / tabsize + 1) * tabsize;
258  break;
259 
260  case TIXML_UTF_LEAD_0:
261  if ( encoding == TIXML_ENCODING_UTF8 )
262  {
263  if ( *(p+1) && *(p+2) )
264  {
265  // In these cases, don't advance the column. These are
266  // 0-width spaces.
267  if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
268  p += 3;
269  else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
270  p += 3;
271  else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
272  p += 3;
273  else
274  { p +=3; ++col; } // A normal character.
275  }
276  }
277  else
278  {
279  ++p;
280  ++col;
281  }
282  break;
283 
284  default:
285  if ( encoding == TIXML_ENCODING_UTF8 )
286  {
287  // Eat the 1 to 4 byte utf8 character.
288  int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)];
289  if ( step == 0 )
290  step = 1; // Error case from bad encoding, but handle gracefully.
291  p += step;
292 
293  // Just advance one column, of course.
294  ++col;
295  }
296  else
297  {
298  ++p;
299  ++col;
300  }
301  break;
302  }
303  }
304  cursor.row = row;
305  cursor.col = col;
306  assert( cursor.row >= -1 );
307  assert( cursor.col >= -1 );
308  stamp = p;
309  assert( stamp );
310 }
311 
312 
313 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
314 {
315  if ( !p || !*p )
316  {
317  return 0;
318  }
319  if ( encoding == TIXML_ENCODING_UTF8 )
320  {
321  while ( *p )
322  {
323  const unsigned char* pU = (const unsigned char*)p;
324 
325  // Skip the stupid Microsoft UTF-8 Byte order marks
326  if ( *(pU+0)==TIXML_UTF_LEAD_0
327  && *(pU+1)==TIXML_UTF_LEAD_1
328  && *(pU+2)==TIXML_UTF_LEAD_2 )
329  {
330  p += 3;
331  continue;
332  }
333  else if(*(pU+0)==TIXML_UTF_LEAD_0
334  && *(pU+1)==0xbfU
335  && *(pU+2)==0xbeU )
336  {
337  p += 3;
338  continue;
339  }
340  else if(*(pU+0)==TIXML_UTF_LEAD_0
341  && *(pU+1)==0xbfU
342  && *(pU+2)==0xbfU )
343  {
344  p += 3;
345  continue;
346  }
347 
348  if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' ) // Still using old rules for white space.
349  ++p;
350  else
351  break;
352  }
353  }
354  else
355  {
356  while ( ( *p && IsWhiteSpace( *p ) ) || *p == '\n' || *p =='\r' )
357  ++p;
358  }
359 
360  return p;
361 }
362 
363 #ifdef TIXML_USE_STL
364 /*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
365 {
366  for( ;; )
367  {
368  if ( !in->good() ) return false;
369 
370  int c = in->peek();
371  // At this scope, we can't get to a document. So fail silently.
372  if ( !IsWhiteSpace( c ) || c <= 0 )
373  return true;
374 
375  *tag += (char) in->get();
376  }
377 }
378 
379 /*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
380 {
381  //assert( character > 0 && character < 128 ); // else it won't work in utf-8
382  while ( in->good() )
383  {
384  int c = in->peek();
385  if ( c == character )
386  return true;
387  if ( c <= 0 ) // Silent failure: can't get document at this scope
388  return false;
389 
390  in->get();
391  *tag += (char) c;
392  }
393  return false;
394 }
395 #endif
396 
397 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
398 {
399  *name = "";
400  assert( p );
401 
402  // Names start with letters or underscores.
403  // Of course, in unicode, tinyxml has no idea what a letter *is*. The
404  // algorithm is generous.
405  //
406  // After that, they can be letters, underscores, numbers,
407  // hyphens, or colons. (Colons are valid ony for namespaces,
408  // but tinyxml can't tell namespaces from names.)
409  if ( p && *p
410  && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
411  {
412  while( p && *p
413  && ( IsAlphaNum( (unsigned char ) *p, encoding )
414  || *p == '_'
415  || *p == '-'
416  || *p == '.'
417  || *p == ':' ) )
418  {
419  (*name) += *p;
420  ++p;
421  }
422  return p;
423  }
424  return 0;
425 }
426 
427 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
428 {
429  // Presume an entity, and pull it out.
430  TIXML_STRING ent;
431  int i;
432  *length = 0;
433 
434  if ( *(p+1) && *(p+1) == '#' && *(p+2) )
435  {
436  unsigned long ucs = 0;
437  ptrdiff_t delta = 0;
438  unsigned mult = 1;
439 
440  if ( *(p+2) == 'x' )
441  {
442  // Hexadecimal.
443  if ( !*(p+3) ) return 0;
444 
445  const char* q = p+3;
446  q = strchr( q, ';' );
447 
448  if ( !q || !*q ) return 0;
449 
450  delta = q-p;
451  --q;
452 
453  while ( *q != 'x' )
454  {
455  if ( *q >= '0' && *q <= '9' )
456  ucs += mult * (*q - '0');
457  else if ( *q >= 'a' && *q <= 'f' )
458  ucs += mult * (*q - 'a' + 10);
459  else if ( *q >= 'A' && *q <= 'F' )
460  ucs += mult * (*q - 'A' + 10 );
461  else
462  return 0;
463  mult *= 16;
464  --q;
465  }
466  }
467  else
468  {
469  // Decimal.
470  if ( !*(p+2) ) return 0;
471 
472  const char* q = p+2;
473  q = strchr( q, ';' );
474 
475  if ( !q || !*q ) return 0;
476 
477  delta = q-p;
478  --q;
479 
480  while ( *q != '#' )
481  {
482  if ( *q >= '0' && *q <= '9' )
483  ucs += mult * (*q - '0');
484  else
485  return 0;
486  mult *= 10;
487  --q;
488  }
489  }
490  if ( encoding == TIXML_ENCODING_UTF8 )
491  {
492  // convert the UCS to UTF-8
493  ConvertUTF32ToUTF8( ucs, value, length );
494  }
495  else
496  {
497  *value = (char)ucs;
498  *length = 1;
499  }
500  return p + delta + 1;
501  }
502 
503  // Now try to match it.
504  for( i=0; i<NUM_ENTITY; ++i )
505  {
506  if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
507  {
508  assert( strlen( entity[i].str ) == entity[i].strLength );
509  *value = entity[i].chr;
510  *length = 1;
511  return ( p + entity[i].strLength );
512  }
513  }
514 
515  // So it wasn't an entity, its unrecognized, or something like that.
516  *value = *p; // Don't put back the last one, since we return it!
517  return p+1;
518 }
519 
520 
521 bool TiXmlBase::StringEqual( const char* p,
522  const char* tag,
523  bool ignoreCase,
524  TiXmlEncoding encoding )
525 {
526  assert( p );
527  assert( tag );
528  if ( !p || !*p )
529  {
530  assert( 0 );
531  return false;
532  }
533 
534  const char* q = p;
535 
536  if ( ignoreCase )
537  {
538  while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
539  {
540  ++q;
541  ++tag;
542  }
543 
544  if ( *tag == 0 )
545  return true;
546  }
547  else
548  {
549  while ( *q && *tag && *q == *tag )
550  {
551  ++q;
552  ++tag;
553  }
554 
555  if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
556  return true;
557  }
558  return false;
559 }
560 
561 const char* TiXmlBase::ReadText( const char* p,
562  TIXML_STRING * text,
563  bool trimWhiteSpace,
564  const char* endTag,
565  bool caseInsensitive,
566  TiXmlEncoding encoding )
567 {
568  *text = "";
569  if ( !trimWhiteSpace // certain tags always keep whitespace
570  || !condenseWhiteSpace ) // if true, whitespace is always kept
571  {
572  // Keep all the white space.
573  while ( p && *p
574  && !StringEqual( p, endTag, caseInsensitive, encoding )
575  )
576  {
577  int len;
578  char cArr[4] = { 0, 0, 0, 0 };
579  p = GetChar( p, cArr, &len, encoding );
580  text->append( cArr, len );
581  }
582  }
583  else
584  {
585  bool whitespace = false;
586 
587  // Remove leading white space:
588  p = SkipWhiteSpace( p, encoding );
589  while ( p && *p
590  && !StringEqual( p, endTag, caseInsensitive, encoding ) )
591  {
592  if ( *p == '\r' || *p == '\n' )
593  {
594  whitespace = true;
595  ++p;
596  }
597  else if ( IsWhiteSpace( *p ) )
598  {
599  whitespace = true;
600  ++p;
601  }
602  else
603  {
604  // If we've found whitespace, add it before the
605  // new character. Any whitespace just becomes a space.
606  if ( whitespace )
607  {
608  (*text) += ' ';
609  whitespace = false;
610  }
611  int len;
612  char cArr[4] = { 0, 0, 0, 0 };
613  p = GetChar( p, cArr, &len, encoding );
614  if ( len == 1 )
615  (*text) += cArr[0]; // more efficient
616  else
617  text->append( cArr, len );
618  }
619  }
620  }
621  return p + strlen( endTag );
622 }
623 
624 #ifdef TIXML_USE_STL
625 
626 void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
627 {
628  // The basic issue with a document is that we don't know what we're
629  // streaming. Read something presumed to be a tag (and hope), then
630  // identify it, and call the appropriate stream method on the tag.
631  //
632  // This "pre-streaming" will never read the closing ">" so the
633  // sub-tag can orient itself.
634 
635  if ( !StreamTo( in, '<', tag ) )
636  {
637  SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
638  return;
639  }
640 
641  while ( in->good() )
642  {
643  int tagIndex = (int) tag->length();
644  while ( in->good() && in->peek() != '>' )
645  {
646  int c = in->get();
647  if ( c <= 0 )
648  {
649  SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
650  break;
651  }
652  (*tag) += (char) c;
653  }
654 
655  if ( in->good() )
656  {
657  // We now have something we presume to be a node of
658  // some sort. Identify it, and call the node to
659  // continue streaming.
660  TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
661 
662  if ( node )
663  {
664  node->StreamIn( in, tag );
665  bool isElement = node->ToElement() != 0;
666  delete node;
667  node = 0;
668 
669  // If this is the root element, we're done. Parsing will be
670  // done by the >> operator.
671  if ( isElement )
672  {
673  return;
674  }
675  }
676  else
677  {
678  SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
679  return;
680  }
681  }
682  }
683  // We should have returned sooner.
684  SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
685 }
686 
687 #endif
688 
689 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
690 {
691  ClearError();
692 
693  // Parse away, at the document level. Since a document
694  // contains nothing but other tags, most of what happens
695  // here is skipping white space.
696  if ( !p || !*p )
697  {
698  SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
699  return 0;
700  }
701 
702  // Note that, for a document, this needs to come
703  // before the while space skip, so that parsing
704  // starts from the pointer we are given.
705  location.Clear();
706  if ( prevData )
707  {
708  location.row = prevData->cursor.row;
709  location.col = prevData->cursor.col;
710  }
711  else
712  {
713  location.row = 0;
714  location.col = 0;
715  }
716  TiXmlParsingData data( p, TabSize(), location.row, location.col );
717  location = data.Cursor();
718 
719  if ( encoding == TIXML_ENCODING_UNKNOWN )
720  {
721  // Check for the Microsoft UTF-8 lead bytes.
722  const unsigned char* pU = (const unsigned char*)p;
723  if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
724  && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
725  && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
726  {
727  encoding = TIXML_ENCODING_UTF8;
728  }
729  }
730 
731  p = SkipWhiteSpace( p, encoding );
732  if ( !p )
733  {
734  SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
735  return 0;
736  }
737 
738  while ( p && *p )
739  {
740  TiXmlNode* node = Identify( p, encoding );
741  if ( node )
742  {
743  p = node->Parse( p, &data, encoding );
744  LinkEndChild( node );
745  }
746  else
747  {
748  break;
749  }
750 
751  // Did we get encoding info?
752  if ( encoding == TIXML_ENCODING_UNKNOWN
753  && node->ToDeclaration() )
754  {
755  TiXmlDeclaration* dec = node->ToDeclaration();
756  const char* enc = dec->Encoding();
757  assert( enc );
758 
759  if ( *enc == 0 )
760  encoding = TIXML_ENCODING_UTF8;
761  else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
762  encoding = TIXML_ENCODING_UTF8;
763  else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
764  encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
765  else
766  encoding = TIXML_ENCODING_LEGACY;
767  }
768 
769  p = SkipWhiteSpace( p, encoding );
770  }
771 
772  // Was this empty?
773  if ( !firstChild ) {
774  SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
775  return 0;
776  }
777 
778  // All is well.
779  return p;
780 }
781 
782 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
783 {
784  // The first error in a chain is more accurate - don't set again!
785  if ( error )
786  return;
787 
788  assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
789  error = true;
790  errorId = err;
791  errorDesc = errorString[ errorId ];
792 
793  errorLocation.Clear();
794  if ( pError && data )
795  {
796  //TiXmlParsingData data( pError, prevData );
797  data->Stamp( pError, encoding );
798  errorLocation = data->Cursor();
799  }
800 }
801 
802 
803 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
804 {
805  TiXmlNode* returnNode = 0;
806 
807  p = SkipWhiteSpace( p, encoding );
808  if( !p || !*p || *p != '<' )
809  {
810  return 0;
811  }
812 
813  TiXmlDocument* doc = GetDocument();
814  p = SkipWhiteSpace( p, encoding );
815 
816  if ( !p || !*p )
817  {
818  return 0;
819  }
820 
821  // What is this thing?
822  // - Elements start with a letter or underscore, but xml is reserved.
823  // - Comments: <!--
824  // - Decleration: <?xml
825  // - Everthing else is unknown to tinyxml.
826  //
827 
828  const char* xmlHeader = { "<?xml" };
829  const char* commentHeader = { "<!--" };
830  const char* dtdHeader = { "<!" };
831 
832  if ( StringEqual( p, xmlHeader, true, encoding ) )
833  {
834  #ifdef DEBUG_PARSER
835  TIXML_LOG( "XML parsing Declaration\n" );
836  #endif
837  returnNode = new TiXmlDeclaration();
838  }
839  else if ( StringEqual( p, commentHeader, false, encoding ) )
840  {
841  #ifdef DEBUG_PARSER
842  TIXML_LOG( "XML parsing Comment\n" );
843  #endif
844  returnNode = new TiXmlComment();
845  }
846  else if ( StringEqual( p, dtdHeader, false, encoding ) )
847  {
848  #ifdef DEBUG_PARSER
849  TIXML_LOG( "XML parsing Unknown(1)\n" );
850  #endif
851  returnNode = new TiXmlUnknown();
852  }
853  else if ( IsAlpha( *(p+1), encoding )
854  || *(p+1) == '_' )
855  {
856  #ifdef DEBUG_PARSER
857  TIXML_LOG( "XML parsing Element\n" );
858  #endif
859  returnNode = new TiXmlElement( "" );
860  }
861  else
862  {
863  #ifdef DEBUG_PARSER
864  TIXML_LOG( "XML parsing Unknown(2)\n" );
865  #endif
866  returnNode = new TiXmlUnknown();
867  }
868 
869  if ( returnNode )
870  {
871  // Set the parent, so it can report errors
872  returnNode->parent = this;
873  }
874  else
875  {
876  if ( doc )
877  doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
878  }
879  return returnNode;
880 }
881 
882 #ifdef TIXML_USE_STL
883 
884 void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
885 {
886  // We're called with some amount of pre-parsing. That is, some of "this"
887  // element is in "tag". Go ahead and stream to the closing ">"
888  while( in->good() )
889  {
890  int c = in->get();
891  if ( c <= 0 )
892  {
893  TiXmlDocument* document = GetDocument();
894  if ( document )
895  document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
896  return;
897  }
898  (*tag) += (char) c ;
899 
900  if ( c == '>' )
901  break;
902  }
903 
904  if ( tag->length() < 3 ) return;
905 
906  // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
907  // If not, identify and stream.
908 
909  if ( tag->at( tag->length() - 1 ) == '>'
910  && tag->at( tag->length() - 2 ) == '/' )
911  {
912  // All good!
913  return;
914  }
915  else if ( tag->at( tag->length() - 1 ) == '>' )
916  {
917  // There is more. Could be:
918  // text
919  // closing tag
920  // another node.
921  for ( ;; )
922  {
923  StreamWhiteSpace( in, tag );
924 
925  // Do we have text?
926  if ( in->good() && in->peek() != '<' )
927  {
928  // Yep, text.
929  TiXmlText text( "" );
930  text.StreamIn( in, tag );
931 
932  // What follows text is a closing tag or another node.
933  // Go around again and figure it out.
934  continue;
935  }
936 
937  // We now have either a closing tag...or another node.
938  // We should be at a "<", regardless.
939  if ( !in->good() ) return;
940  assert( in->peek() == '<' );
941  int tagIndex = tag->length();
942 
943  bool closingTag = false;
944  bool firstCharFound = false;
945 
946  for( ;; )
947  {
948  if ( !in->good() )
949  return;
950 
951  int c = in->peek();
952  if ( c <= 0 )
953  {
954  TiXmlDocument* document = GetDocument();
955  if ( document )
956  document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
957  return;
958  }
959 
960  if ( c == '>' )
961  break;
962 
963  *tag += (char) c;
964  in->get();
965 
966  if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
967  {
968  firstCharFound = true;
969  if ( c == '/' )
970  closingTag = true;
971  }
972  }
973  // If it was a closing tag, then read in the closing '>' to clean up the input stream.
974  // If it was not, the streaming will be done by the tag.
975  if ( closingTag )
976  {
977  if ( !in->good() )
978  return;
979 
980  int c = in->get();
981  if ( c <= 0 )
982  {
983  TiXmlDocument* document = GetDocument();
984  if ( document )
985  document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
986  return;
987  }
988  assert( c == '>' );
989  *tag += (char) c;
990 
991  // We are done, once we've found our closing tag.
992  return;
993  }
994  else
995  {
996  // If not a closing tag, id it, and stream.
997  const char* tagloc = tag->c_str() + tagIndex;
998  TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
999  if ( !node )
1000  return;
1001  node->StreamIn( in, tag );
1002  delete node;
1003  node = 0;
1004 
1005  // No return: go around from the beginning: text, closing tag, or node.
1006  }
1007  }
1008  }
1009 }
1010 #endif
1011 
1012 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1013 {
1014  p = SkipWhiteSpace( p, encoding );
1015  TiXmlDocument* document = GetDocument();
1016 
1017  if ( !p || !*p )
1018  {
1019  if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1020  return 0;
1021  }
1022 
1023 // TiXmlParsingData data( p, prevData );
1024  if ( data )
1025  {
1026  data->Stamp( p, encoding );
1027  location = data->Cursor();
1028  }
1029 
1030  if ( *p != '<' )
1031  {
1032  if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1033  return 0;
1034  }
1035 
1036  p = SkipWhiteSpace( p+1, encoding );
1037 
1038  // Read the name.
1039  const char* pErr = p;
1040 
1041  p = ReadName( p, &value, encoding );
1042  if ( !p || !*p )
1043  {
1044  if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1045  return 0;
1046  }
1047 
1048  TIXML_STRING endTag ("</");
1049  endTag += value;
1050  endTag += ">";
1051 
1052  // Check for and read attributes. Also look for an empty
1053  // tag or an end tag.
1054  while ( p && *p )
1055  {
1056  pErr = p;
1057  p = SkipWhiteSpace( p, encoding );
1058  if ( !p || !*p )
1059  {
1060  if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1061  return 0;
1062  }
1063  if ( *p == '/' )
1064  {
1065  ++p;
1066  // Empty tag.
1067  if ( *p != '>' )
1068  {
1069  if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1070  return 0;
1071  }
1072  return (p+1);
1073  }
1074  else if ( *p == '>' )
1075  {
1076  // Done with attributes (if there were any.)
1077  // Read the value -- which can include other
1078  // elements -- read the end tag, and return.
1079  ++p;
1080  p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
1081  if ( !p || !*p )
1082  return 0;
1083 
1084  // We should find the end tag now
1085  if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1086  {
1087  p += endTag.length();
1088  return p;
1089  }
1090  else
1091  {
1092  if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1093  return 0;
1094  }
1095  }
1096  else
1097  {
1098  // Try to read an attribute:
1099  TiXmlAttribute* attrib = new TiXmlAttribute();
1100  if ( !attrib )
1101  {
1102  if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1103  return 0;
1104  }
1105 
1106  attrib->SetDocument( document );
1107  const char* ppErr = p;
1108  p = attrib->Parse( p, data, encoding );
1109 
1110  if ( !p || !*p )
1111  {
1112  if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, ppErr, data, encoding );
1113  delete attrib;
1114  return 0;
1115  }
1116 
1117  // Handle the strange case of double attributes:
1118  TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1119  if ( node )
1120  {
1121  node->SetValue( attrib->Value() );
1122  delete attrib;
1123  return 0;
1124  }
1125 
1126  attributeSet.Add( attrib );
1127  }
1128  }
1129  return p;
1130 }
1131 
1132 
1133 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1134 {
1135  TiXmlDocument* document = GetDocument();
1136 
1137  const char* pWithWhiteSpace = p;
1138  // Read in text and elements in any order.
1139  p = SkipWhiteSpace( p, encoding );
1140  while ( p && *p )
1141  {
1142  if ( *p != '<' )
1143  {
1144  // Take what we have, make a text element.
1145  TiXmlText* textNode = new TiXmlText( "" );
1146 
1147  if ( !textNode )
1148  {
1149  if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1150  return 0;
1151  }
1152 
1154  {
1155  p = textNode->Parse( p, data, encoding );
1156  }
1157  else
1158  {
1159  // Special case: we want to keep the white space
1160  // so that leading spaces aren't removed.
1161  p = textNode->Parse( pWithWhiteSpace, data, encoding );
1162  }
1163 
1164  if ( !textNode->Blank() )
1165  LinkEndChild( textNode );
1166  else
1167  delete textNode;
1168  }
1169  else
1170  {
1171  // We hit a '<'
1172  // Have we hit a new element or an end tag?
1173  if ( StringEqual( p, "</", false, encoding ) )
1174  {
1175  return p;
1176  }
1177  else
1178  {
1179  TiXmlNode* node = Identify( p, encoding );
1180  if ( node )
1181  {
1182  p = node->Parse( p, data, encoding );
1183  LinkEndChild( node );
1184  }
1185  else
1186  {
1187  return 0;
1188  }
1189  }
1190  }
1191  p = SkipWhiteSpace( p, encoding );
1192  }
1193 
1194  if ( !p )
1195  {
1196  if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1197  }
1198  return p;
1199 }
1200 
1201 
1202 #ifdef TIXML_USE_STL
1203 void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1204 {
1205  while ( in->good() )
1206  {
1207  int c = in->get();
1208  if ( c <= 0 )
1209  {
1210  TiXmlDocument* document = GetDocument();
1211  if ( document )
1212  document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1213  return;
1214  }
1215  (*tag) += (char) c;
1216 
1217  if ( c == '>' )
1218  {
1219  // All is well.
1220  return;
1221  }
1222  }
1223 }
1224 #endif
1225 
1226 
1227 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1228 {
1229  TiXmlDocument* document = GetDocument();
1230  p = SkipWhiteSpace( p, encoding );
1231 
1232 // TiXmlParsingData data( p, prevData );
1233  if ( data )
1234  {
1235  data->Stamp( p, encoding );
1236  location = data->Cursor();
1237  }
1238  if ( !p || !*p || *p != '<' )
1239  {
1240  if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1241  return 0;
1242  }
1243  ++p;
1244  value = "";
1245 
1246  while ( p && *p && *p != '>' )
1247  {
1248  value += *p;
1249  ++p;
1250  }
1251 
1252  if ( !p )
1253  {
1254  if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1255  }
1256  if ( *p == '>' )
1257  return p+1;
1258  return p;
1259 }
1260 
1261 #ifdef TIXML_USE_STL
1262 void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1263 {
1264  while ( in->good() )
1265  {
1266  int c = in->get();
1267  if ( c <= 0 )
1268  {
1269  TiXmlDocument* document = GetDocument();
1270  if ( document )
1271  document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1272  return;
1273  }
1274 
1275  (*tag) += (char) c;
1276 
1277  if ( c == '>'
1278  && tag->at( tag->length() - 2 ) == '-'
1279  && tag->at( tag->length() - 3 ) == '-' )
1280  {
1281  // All is well.
1282  return;
1283  }
1284  }
1285 }
1286 #endif
1287 
1288 
1289 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1290 {
1291  TiXmlDocument* document = GetDocument();
1292  value = "";
1293 
1294  p = SkipWhiteSpace( p, encoding );
1295 
1296 // TiXmlParsingData data( p, prevData );
1297  if ( data )
1298  {
1299  data->Stamp( p, encoding );
1300  location = data->Cursor();
1301  }
1302  const char* startTag = "<!--";
1303  const char* endTag = "-->";
1304 
1305  if ( !StringEqual( p, startTag, false, encoding ) )
1306  {
1307  document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1308  return 0;
1309  }
1310  p += strlen( startTag );
1311  p = ReadText( p, &value, false, endTag, false, encoding );
1312  return p;
1313 }
1314 
1315 
1316 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1317 {
1318  p = SkipWhiteSpace( p, encoding );
1319  if ( !p || !*p ) return 0;
1320 
1321 //fg this code has no effect and causes: -Wunused-but-set-variable
1322 //fg int tabsize = 4;
1323 //fg if ( document )
1324 //fg tabsize = document->TabSize();
1325 
1326 // TiXmlParsingData data( p, prevData );
1327  if ( data )
1328  {
1329  data->Stamp( p, encoding );
1330  location = data->Cursor();
1331  }
1332  // Read the name, the '=' and the value.
1333  const char* pErr = p;
1334  p = ReadName( p, &name, encoding );
1335  if ( !p || !*p )
1336  {
1337  if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1338  return 0;
1339  }
1340  p = SkipWhiteSpace( p, encoding );
1341  if ( !p || !*p || *p != '=' )
1342  {
1343  if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1344  return 0;
1345  }
1346 
1347  ++p; // skip '='
1348  p = SkipWhiteSpace( p, encoding );
1349  if ( !p || !*p )
1350  {
1351  if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1352  return 0;
1353  }
1354 
1355  const char* end;
1356 
1357  if ( *p == '\'' )
1358  {
1359  ++p;
1360  end = "\'";
1361  p = ReadText( p, &value, false, end, false, encoding );
1362  }
1363  else if ( *p == '"' )
1364  {
1365  ++p;
1366  end = "\"";
1367  p = ReadText( p, &value, false, end, false, encoding );
1368  }
1369  else
1370  {
1371  // All attribute values should be in single or double quotes.
1372  // But this is such a common error that the parser will try
1373  // its best, even without them.
1374  value = "";
1375  while ( p && *p // existence
1376  && !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r' // whitespace
1377  && *p != '/' && *p != '>' ) // tag end
1378  {
1379  value += *p;
1380  ++p;
1381  }
1382  }
1383  return p;
1384 }
1385 
1386 #ifdef TIXML_USE_STL
1387 void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1388 {
1389  while ( in->good() )
1390  {
1391  int c = in->peek();
1392  if ( c == '<' )
1393  return;
1394  if ( c <= 0 )
1395  {
1396  TiXmlDocument* document = GetDocument();
1397  if ( document )
1398  document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1399  return;
1400  }
1401 
1402  (*tag) += (char) c;
1403  in->get();
1404  }
1405 }
1406 #endif
1407 
1408 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1409 {
1410  value = "";
1411 // TiXmlParsingData data( p, prevData );
1412  if ( data )
1413  {
1414  data->Stamp( p, encoding );
1415  location = data->Cursor();
1416  }
1417  bool ignoreWhite = true;
1418 
1419  const char* end = "<";
1420  p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1421  if ( p )
1422  return p-1; // don't truncate the '<'
1423  return 0;
1424 }
1425 
1426 #ifdef TIXML_USE_STL
1427 void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1428 {
1429  while ( in->good() )
1430  {
1431  int c = in->get();
1432  if ( c <= 0 )
1433  {
1434  TiXmlDocument* document = GetDocument();
1435  if ( document )
1436  document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1437  return;
1438  }
1439  (*tag) += (char) c;
1440 
1441  if ( c == '>' )
1442  {
1443  // All is well.
1444  return;
1445  }
1446  }
1447 }
1448 #endif
1449 
1450 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1451 {
1452  p = SkipWhiteSpace( p, _encoding );
1453  // Find the beginning, find the end, and look for
1454  // the stuff in-between.
1455  TiXmlDocument* document = GetDocument();
1456  if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1457  {
1458  if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1459  return 0;
1460  }
1461 // TiXmlParsingData data( p, prevData );
1462  if ( data )
1463  {
1464  data->Stamp( p, _encoding );
1465  location = data->Cursor();
1466  }
1467  p += 5;
1468 
1469  version = "";
1470  encoding = "";
1471  standalone = "";
1472 
1473  while ( p && *p )
1474  {
1475  if ( *p == '>' )
1476  {
1477  ++p;
1478  return p;
1479  }
1480 
1481  p = SkipWhiteSpace( p, _encoding );
1482  if ( StringEqual( p, "version", true, _encoding ) )
1483  {
1484  TiXmlAttribute attrib;
1485  p = attrib.Parse( p, data, _encoding );
1486  version = attrib.Value();
1487  }
1488  else if ( StringEqual( p, "encoding", true, _encoding ) )
1489  {
1490  TiXmlAttribute attrib;
1491  p = attrib.Parse( p, data, _encoding );
1492  encoding = attrib.Value();
1493  }
1494  else if ( StringEqual( p, "standalone", true, _encoding ) )
1495  {
1496  TiXmlAttribute attrib;
1497  p = attrib.Parse( p, data, _encoding );
1498  standalone = attrib.Value();
1499  }
1500  else
1501  {
1502  // Read over whatever it is.
1503  while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1504  ++p;
1505  }
1506  }
1507  return 0;
1508 }
1509 
1510 bool TiXmlText::Blank() const
1511 {
1512  for ( unsigned i=0; i<value.length(); i++ )
1513  if ( !IsWhiteSpace( value[i] ) )
1514  return false;
1515  return true;
1516 }
1517 
1518 } //fg: end namespace gear
virtual const char * Parse(const char *p, TiXmlParsingData *data=0, TiXmlEncoding encoding=TIXML_DEFAULT_ENCODING)
Parse the given null terminated block of xml data.
void SetValue(const char *_value)
Set the value.
Definition: tinyxml.h:728
TiXmlNode * LinkEndChild(TiXmlNode *addThis)
Add a new node related to this.
Definition: tinyxml.cc:201
In correct XML the declaration is the first entry in the file.
Definition: tinyxml.h:1061
static bool IsWhiteSpaceCondensed()
Return the current white space setting.
Definition: tinyxml.h:169
const char * Encoding() const
Encoding. Will return an empty string if none was found.
Definition: tinyxml.h:1087
Always the top level node.
Definition: tinyxml.h:1154
const TiXmlDeclaration * ToDeclaration() const
Cast to a more defined type. Will return null if not of the requested type.
Definition: tinyxml.h:621
The parent class for everything in the Document Object Model.
Definition: tinyxml.h:370
void ClearError()
If you have handled the error, it can be reset with this call.
Definition: tinyxml.h:1262
const TiXmlDocument * GetDocument() const
Return a pointer to the Document this node lives in.
Definition: tinyxml.cc:595