Home | History | Annotate | Download | only in tinyxml
      1 /*
      2 www.sourceforge.net/projects/tinyxml
      3 Original code by Lee Thomason (www.grinninglizard.com)
      4 
      5 This software is provided 'as-is', without any express or implied
      6 warranty. In no event will the authors be held liable for any
      7 damages arising from the use of this software.
      8 
      9 Permission is granted to anyone to use this software for any
     10 purpose, including commercial applications, and to alter it and
     11 redistribute it freely, subject to the following restrictions:
     12 
     13 1. The origin of this software must not be misrepresented; you must
     14 not claim that you wrote the original software. If you use this
     15 software in a product, an acknowledgment in the product documentation
     16 would be appreciated but is not required.
     17 
     18 2. Altered source versions must be plainly marked as such, and
     19 must not be misrepresented as being the original software.
     20 
     21 3. This notice may not be removed or altered from any source
     22 distribution.
     23 */
     24 
     25 #include <ctype.h>
     26 #include <stddef.h>
     27 
     28 #include "tinyxml.h"
     29 
     30 //#define DEBUG_PARSER
     31 #if defined( DEBUG_PARSER )
     32 #	if defined( DEBUG ) && defined( _MSC_VER )
     33 #		include <windows.h>
     34 #		define TIXML_LOG OutputDebugString
     35 #	else
     36 #		define TIXML_LOG printf
     37 #	endif
     38 #endif
     39 
// Note that "PutString" hardcodes the same list. This
// is less flexible than it appears. Changing the entries
// or order will break PutString.
//
// Each row holds the entity text as it appears in the XML, the length of
// that text, and the single character it stands for. GetEntity() matches
// input against each row's text using the stored length.
TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] =
{
	{ "&amp;",  5, '&' },
	{ "&lt;",   4, '<' },
	{ "&gt;",   4, '>' },
	{ "&quot;", 6, '\"' },
	{ "&apos;", 6, '\'' }
};
     51 
     52 // Bunch of unicode info at:
     53 //		http://www.unicode.org/faq/utf_bom.html
// That page is the basis of the table below, which gives the number of bytes
// in a sequence from its lead byte. A 1 is stored for invalid lead bytes --
// although the result will be junk, pass it through as much as possible.
     57 // Beware of the non-characters in UTF-8:
     58 //				ef bb bf (Microsoft "lead bytes")
     59 //				ef bf be
     60 //				ef bf bf
     61 
// The three bytes ef bb bf, in order. TiXmlDocument::Parse() treats this
// sequence at the start of the input as a sign that the text is UTF-8, and
// SkipWhiteSpace()/Stamp() step over it without counting a column.
const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
     65 
// Indexed by the first byte of a character: the entry is the number of bytes
// to step over for that character. Every entry is at least 1, so a caller
// that advances by the table value always makes progress, even on bytes
// that cannot start a sequence.
const int TiXmlBase::utf8ByteTable[256] =
{
	//	0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x00
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x10
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x20
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x30
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x40
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x50
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x60
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x70	End of ASCII range
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x80 0x80 to 0xc1 invalid
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x90
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xa0
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xb0
		1,	1,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xc0 0xc2 to 0xdf 2 byte
		2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xd0
		3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	// 0xe0 0xe0 to 0xef 3 byte
		4,	4,	4,	4,	4,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1	// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
};
     86 
     87 
     88 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
     89 {
     90 	const unsigned long BYTE_MASK = 0xBF;
     91 	const unsigned long BYTE_MARK = 0x80;
     92 	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
     93 
     94 	if (input < 0x80)
     95 		*length = 1;
     96 	else if ( input < 0x800 )
     97 		*length = 2;
     98 	else if ( input < 0x10000 )
     99 		*length = 3;
    100 	else if ( input < 0x200000 )
    101 		*length = 4;
    102 	else
    103 		{ *length = 0; return; }	// This code won't covert this correctly anyway.
    104 
    105 	output += *length;
    106 
    107 	// Scary scary fall throughs.
    108 	switch (*length)
    109 	{
    110 		case 4:
    111 			--output;
    112 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
    113 			input >>= 6;
    114 		case 3:
    115 			--output;
    116 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
    117 			input >>= 6;
    118 		case 2:
    119 			--output;
    120 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
    121 			input >>= 6;
    122 		case 1:
    123 			--output;
    124 			*output = (char)(input | FIRST_BYTE_MARK[*length]);
    125 	}
    126 }
    127 
    128 
    129 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
    130 {
    131 	// This will only work for low-ascii, everything else is assumed to be a valid
    132 	// letter. I'm not sure this is the best approach, but it is quite tricky trying
    133 	// to figure out alhabetical vs. not across encoding. So take a very
    134 	// conservative approach.
    135 
    136 //	if ( encoding == TIXML_ENCODING_UTF8 )
    137 //	{
    138 		if ( anyByte < 127 )
    139 			return isalpha( anyByte );
    140 		else
    141 			return 1;	// What else to do? The unicode set is huge...get the english ones right.
    142 //	}
    143 //	else
    144 //	{
    145 //		return isalpha( anyByte );
    146 //	}
    147 }
    148 
    149 
    150 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
    151 {
    152 	// This will only work for low-ascii, everything else is assumed to be a valid
    153 	// letter. I'm not sure this is the best approach, but it is quite tricky trying
    154 	// to figure out alhabetical vs. not across encoding. So take a very
    155 	// conservative approach.
    156 
    157 //	if ( encoding == TIXML_ENCODING_UTF8 )
    158 //	{
    159 		if ( anyByte < 127 )
    160 			return isalnum( anyByte );
    161 		else
    162 			return 1;	// What else to do? The unicode set is huge...get the english ones right.
    163 //	}
    164 //	else
    165 //	{
    166 //		return isalnum( anyByte );
    167 //	}
    168 }
    169 
    170 
    171 class TiXmlParsingData
    172 {
    173 	friend class TiXmlDocument;
    174   public:
    175 	void Stamp( const char* now, TiXmlEncoding encoding );
    176 
    177 	const TiXmlCursor& Cursor() const	{ return cursor; }
    178 
    179   private:
    180 	// Only used by the document!
    181 	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
    182 	{
    183 		assert( start );
    184 		stamp = start;
    185 		tabsize = _tabsize;
    186 		cursor.row = row;
    187 		cursor.col = col;
    188 	}
    189 
    190 	TiXmlCursor		cursor;
    191 	const char*		stamp;
    192 	int				tabsize;
    193 };
    194 
    195 
    196 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
    197 {
    198 	assert( now );
    199 
    200 	// Do nothing if the tabsize is 0.
    201 	if ( tabsize < 1 )
    202 	{
    203 		return;
    204 	}
    205 
    206 	// Get the current row, column.
    207 	int row = cursor.row;
    208 	int col = cursor.col;
    209 	const char* p = stamp;
    210 	assert( p );
    211 
    212 	while ( p < now )
    213 	{
    214 		// Treat p as unsigned, so we have a happy compiler.
    215 		const unsigned char* pU = (const unsigned char*)p;
    216 
    217 		// Code contributed by Fletcher Dunn: (modified by lee)
    218 		switch (*pU) {
    219 			case 0:
    220 				// We *should* never get here, but in case we do, don't
    221 				// advance past the terminating null character, ever
    222 				return;
    223 
    224 			case '\r':
    225 				// bump down to the next line
    226 				++row;
    227 				col = 0;
    228 				// Eat the character
    229 				++p;
    230 
    231 				// Check for \r\n sequence, and treat this as a single character
    232 				if (*p == '\n') {
    233 					++p;
    234 				}
    235 				break;
    236 
    237 			case '\n':
    238 				// bump down to the next line
    239 				++row;
    240 				col = 0;
    241 
    242 				// Eat the character
    243 				++p;
    244 
    245 				// Check for \n\r sequence, and treat this as a single
    246 				// character.  (Yes, this bizarre thing does occur still
    247 				// on some arcane platforms...)
    248 				if (*p == '\r') {
    249 					++p;
    250 				}
    251 				break;
    252 
    253 			case '\t':
    254 				// Eat the character
    255 				++p;
    256 
    257 				// Skip to next tab stop
    258 				col = (col / tabsize + 1) * tabsize;
    259 				break;
    260 
    261 			case TIXML_UTF_LEAD_0:
    262 				if ( encoding == TIXML_ENCODING_UTF8 )
    263 				{
    264 					if ( *(p+1) && *(p+2) )
    265 					{
    266 						// In these cases, don't advance the column. These are
    267 						// 0-width spaces.
    268 						if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
    269 							p += 3;
    270 						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
    271 							p += 3;
    272 						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
    273 							p += 3;
    274 						else
    275 							{ p +=3; ++col; }	// A normal character.
    276 					}
    277 				}
    278 				else
    279 				{
    280 					++p;
    281 					++col;
    282 				}
    283 				break;
    284 
    285 			default:
    286 				if ( encoding == TIXML_ENCODING_UTF8 )
    287 				{
    288 					// Eat the 1 to 4 byte utf8 character.
    289 					int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
    290 					if ( step == 0 )
    291 						step = 1;		// Error case from bad encoding, but handle gracefully.
    292 					p += step;
    293 
    294 					// Just advance one column, of course.
    295 					++col;
    296 				}
    297 				else
    298 				{
    299 					++p;
    300 					++col;
    301 				}
    302 				break;
    303 		}
    304 	}
    305 	cursor.row = row;
    306 	cursor.col = col;
    307 	assert( cursor.row >= -1 );
    308 	assert( cursor.col >= -1 );
    309 	stamp = p;
    310 	assert( stamp );
    311 }
    312 
    313 
    314 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
    315 {
    316 	if ( !p || !*p )
    317 	{
    318 		return 0;
    319 	}
    320 	if ( encoding == TIXML_ENCODING_UTF8 )
    321 	{
    322 		while ( *p )
    323 		{
    324 			const unsigned char* pU = (const unsigned char*)p;
    325 
    326 			// Skip the stupid Microsoft UTF-8 Byte order marks
    327 			if (	*(pU+0)==TIXML_UTF_LEAD_0
    328 				 && *(pU+1)==TIXML_UTF_LEAD_1
    329 				 && *(pU+2)==TIXML_UTF_LEAD_2 )
    330 			{
    331 				p += 3;
    332 				continue;
    333 			}
    334 			else if(*(pU+0)==TIXML_UTF_LEAD_0
    335 				 && *(pU+1)==0xbfU
    336 				 && *(pU+2)==0xbeU )
    337 			{
    338 				p += 3;
    339 				continue;
    340 			}
    341 			else if(*(pU+0)==TIXML_UTF_LEAD_0
    342 				 && *(pU+1)==0xbfU
    343 				 && *(pU+2)==0xbfU )
    344 			{
    345 				p += 3;
    346 				continue;
    347 			}
    348 
    349 			if ( IsWhiteSpace( *p ) )		// Still using old rules for white space.
    350 				++p;
    351 			else
    352 				break;
    353 		}
    354 	}
    355 	else
    356 	{
    357 		while ( *p && IsWhiteSpace( *p ) )
    358 			++p;
    359 	}
    360 
    361 	return p;
    362 }
    363 
    364 #ifdef TIXML_USE_STL
    365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
    366 {
    367 	for( ;; )
    368 	{
    369 		if ( !in->good() ) return false;
    370 
    371 		int c = in->peek();
    372 		// At this scope, we can't get to a document. So fail silently.
    373 		if ( !IsWhiteSpace( c ) || c <= 0 )
    374 			return true;
    375 
    376 		*tag += (char) in->get();
    377 	}
    378 }
    379 
    380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
    381 {
    382 	//assert( character > 0 && character < 128 );	// else it won't work in utf-8
    383 	while ( in->good() )
    384 	{
    385 		int c = in->peek();
    386 		if ( c == character )
    387 			return true;
    388 		if ( c <= 0 )		// Silent failure: can't get document at this scope
    389 			return false;
    390 
    391 		in->get();
    392 		*tag += (char) c;
    393 	}
    394 	return false;
    395 }
    396 #endif
    397 
    398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
    399 // "assign" optimization removes over 10% of the execution time.
    400 //
    401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
    402 {
    403 	// Oddly, not supported on some comilers,
    404 	//name->clear();
    405 	// So use this:
    406 	*name = "";
    407 	assert( p );
    408 
    409 	// Names start with letters or underscores.
    410 	// Of course, in unicode, tinyxml has no idea what a letter *is*. The
    411 	// algorithm is generous.
    412 	//
    413 	// After that, they can be letters, underscores, numbers,
    414 	// hyphens, or colons. (Colons are valid ony for namespaces,
    415 	// but tinyxml can't tell namespaces from names.)
    416 	if (    p && *p
    417 		 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
    418 	{
    419 		const char* start = p;
    420 		while(		p && *p
    421 				&&	(		IsAlphaNum( (unsigned char ) *p, encoding )
    422 						 || *p == '_'
    423 						 || *p == '-'
    424 						 || *p == '.'
    425 						 || *p == ':' ) )
    426 		{
    427 			//(*name) += *p; // expensive
    428 			++p;
    429 		}
    430 		if ( p-start > 0 ) {
    431 			name->assign( start, p-start );
    432 		}
    433 		return p;
    434 	}
    435 	return 0;
    436 }
    437 
    438 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
    439 {
    440 	// Presume an entity, and pull it out.
    441     TIXML_STRING ent;
    442 	int i;
    443 	*length = 0;
    444 
    445 	if ( *(p+1) && *(p+1) == '#' && *(p+2) )
    446 	{
    447 		unsigned long ucs = 0;
    448 		ptrdiff_t delta = 0;
    449 		unsigned mult = 1;
    450 
    451 		if ( *(p+2) == 'x' )
    452 		{
    453 			// Hexadecimal.
    454 			if ( !*(p+3) ) return 0;
    455 
    456 			const char* q = p+3;
    457 			q = strchr( q, ';' );
    458 
    459 			if ( !q || !*q ) return 0;
    460 
    461 			delta = q-p;
    462 			--q;
    463 
    464 			while ( *q != 'x' )
    465 			{
    466 				if ( *q >= '0' && *q <= '9' )
    467 					ucs += mult * (*q - '0');
    468 				else if ( *q >= 'a' && *q <= 'f' )
    469 					ucs += mult * (*q - 'a' + 10);
    470 				else if ( *q >= 'A' && *q <= 'F' )
    471 					ucs += mult * (*q - 'A' + 10 );
    472 				else
    473 					return 0;
    474 				mult *= 16;
    475 				--q;
    476 			}
    477 		}
    478 		else
    479 		{
    480 			// Decimal.
    481 			if ( !*(p+2) ) return 0;
    482 
    483 			const char* q = p+2;
    484 			q = strchr( q, ';' );
    485 
    486 			if ( !q || !*q ) return 0;
    487 
    488 			delta = q-p;
    489 			--q;
    490 
    491 			while ( *q != '#' )
    492 			{
    493 				if ( *q >= '0' && *q <= '9' )
    494 					ucs += mult * (*q - '0');
    495 				else
    496 					return 0;
    497 				mult *= 10;
    498 				--q;
    499 			}
    500 		}
    501 		if ( encoding == TIXML_ENCODING_UTF8 )
    502 		{
    503 			// convert the UCS to UTF-8
    504 			ConvertUTF32ToUTF8( ucs, value, length );
    505 		}
    506 		else
    507 		{
    508 			*value = (char)ucs;
    509 			*length = 1;
    510 		}
    511 		return p + delta + 1;
    512 	}
    513 
    514 	// Now try to match it.
    515 	for( i=0; i<NUM_ENTITY; ++i )
    516 	{
    517 		if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
    518 		{
    519 			assert( strlen( entity[i].str ) == entity[i].strLength );
    520 			*value = entity[i].chr;
    521 			*length = 1;
    522 			return ( p + entity[i].strLength );
    523 		}
    524 	}
    525 
    526 	// So it wasn't an entity, its unrecognized, or something like that.
    527 	*value = *p;	// Don't put back the last one, since we return it!
    528 	//*length = 1;	// Leave unrecognized entities - this doesn't really work.
    529 					// Just writes strange XML.
    530 	return p+1;
    531 }
    532 
    533 
    534 bool TiXmlBase::StringEqual( const char* p,
    535 							 const char* tag,
    536 							 bool ignoreCase,
    537 							 TiXmlEncoding encoding )
    538 {
    539 	assert( p );
    540 	assert( tag );
    541 	if ( !p || !*p )
    542 	{
    543 		assert( 0 );
    544 		return false;
    545 	}
    546 
    547 	const char* q = p;
    548 
    549 	if ( ignoreCase )
    550 	{
    551 		while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
    552 		{
    553 			++q;
    554 			++tag;
    555 		}
    556 
    557 		if ( *tag == 0 )
    558 			return true;
    559 	}
    560 	else
    561 	{
    562 		while ( *q && *tag && *q == *tag )
    563 		{
    564 			++q;
    565 			++tag;
    566 		}
    567 
    568 		if ( *tag == 0 )		// Have we found the end of the tag, and everything equal?
    569 			return true;
    570 	}
    571 	return false;
    572 }
    573 
    574 const char* TiXmlBase::ReadText(	const char* p,
    575 									TIXML_STRING * text,
    576 									bool trimWhiteSpace,
    577 									const char* endTag,
    578 									bool caseInsensitive,
    579 									TiXmlEncoding encoding )
    580 {
    581     *text = "";
    582 	if (    !trimWhiteSpace			// certain tags always keep whitespace
    583 		 || !condenseWhiteSpace )	// if true, whitespace is always kept
    584 	{
    585 		// Keep all the white space.
    586 		while (	   p && *p
    587 				&& !StringEqual( p, endTag, caseInsensitive, encoding )
    588 			  )
    589 		{
    590 			int len;
    591 			char cArr[4] = { 0, 0, 0, 0 };
    592 			p = GetChar( p, cArr, &len, encoding );
    593 			text->append( cArr, len );
    594 		}
    595 	}
    596 	else
    597 	{
    598 		bool whitespace = false;
    599 
    600 		// Remove leading white space:
    601 		p = SkipWhiteSpace( p, encoding );
    602 		while (	   p && *p
    603 				&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
    604 		{
    605 			if ( *p == '\r' || *p == '\n' )
    606 			{
    607 				whitespace = true;
    608 				++p;
    609 			}
    610 			else if ( IsWhiteSpace( *p ) )
    611 			{
    612 				whitespace = true;
    613 				++p;
    614 			}
    615 			else
    616 			{
    617 				// If we've found whitespace, add it before the
    618 				// new character. Any whitespace just becomes a space.
    619 				if ( whitespace )
    620 				{
    621 					(*text) += ' ';
    622 					whitespace = false;
    623 				}
    624 				int len;
    625 				char cArr[4] = { 0, 0, 0, 0 };
    626 				p = GetChar( p, cArr, &len, encoding );
    627 				if ( len == 1 )
    628 					(*text) += cArr[0];	// more efficient
    629 				else
    630 					text->append( cArr, len );
    631 			}
    632 		}
    633 	}
    634 	if ( p && *p )
    635 		p += strlen( endTag );
    636 	return ( p && *p ) ? p : 0;
    637 }
    638 
    639 #ifdef TIXML_USE_STL
    640 
    641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
    642 {
    643 	// The basic issue with a document is that we don't know what we're
    644 	// streaming. Read something presumed to be a tag (and hope), then
    645 	// identify it, and call the appropriate stream method on the tag.
    646 	//
    647 	// This "pre-streaming" will never read the closing ">" so the
    648 	// sub-tag can orient itself.
    649 
    650 	if ( !StreamTo( in, '<', tag ) )
    651 	{
    652 		SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
    653 		return;
    654 	}
    655 
    656 	while ( in->good() )
    657 	{
    658 		int tagIndex = (int) tag->length();
    659 		while ( in->good() && in->peek() != '>' )
    660 		{
    661 			int c = in->get();
    662 			if ( c <= 0 )
    663 			{
    664 				SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
    665 				break;
    666 			}
    667 			(*tag) += (char) c;
    668 		}
    669 
    670 		if ( in->good() )
    671 		{
    672 			// We now have something we presume to be a node of
    673 			// some sort. Identify it, and call the node to
    674 			// continue streaming.
    675 			TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
    676 
    677 			if ( node )
    678 			{
    679 				node->StreamIn( in, tag );
    680 				bool isElement = node->ToElement() != 0;
    681 				delete node;
    682 				node = 0;
    683 
    684 				// If this is the root element, we're done. Parsing will be
    685 				// done by the >> operator.
    686 				if ( isElement )
    687 				{
    688 					return;
    689 				}
    690 			}
    691 			else
    692 			{
    693 				SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
    694 				return;
    695 			}
    696 		}
    697 	}
    698 	// We should have returned sooner.
    699 	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
    700 }
    701 
    702 #endif
    703 
    704 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
    705 {
    706 	ClearError();
    707 
    708 	// Parse away, at the document level. Since a document
    709 	// contains nothing but other tags, most of what happens
    710 	// here is skipping white space.
    711 	if ( !p || !*p )
    712 	{
    713 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
    714 		return 0;
    715 	}
    716 
    717 	// Note that, for a document, this needs to come
    718 	// before the while space skip, so that parsing
    719 	// starts from the pointer we are given.
    720 	location.Clear();
    721 	if ( prevData )
    722 	{
    723 		location.row = prevData->cursor.row;
    724 		location.col = prevData->cursor.col;
    725 	}
    726 	else
    727 	{
    728 		location.row = 0;
    729 		location.col = 0;
    730 	}
    731 	TiXmlParsingData data( p, TabSize(), location.row, location.col );
    732 	location = data.Cursor();
    733 
    734 	if ( encoding == TIXML_ENCODING_UNKNOWN )
    735 	{
    736 		// Check for the Microsoft UTF-8 lead bytes.
    737 		const unsigned char* pU = (const unsigned char*)p;
    738 		if (	*(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
    739 			 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
    740 			 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
    741 		{
    742 			encoding = TIXML_ENCODING_UTF8;
    743 			useMicrosoftBOM = true;
    744 		}
    745 	}
    746 
    747     p = SkipWhiteSpace( p, encoding );
    748 	if ( !p )
    749 	{
    750 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
    751 		return 0;
    752 	}
    753 
    754 	while ( p && *p )
    755 	{
    756 		TiXmlNode* node = Identify( p, encoding );
    757 		if ( node )
    758 		{
    759 			p = node->Parse( p, &data, encoding );
    760 			LinkEndChild( node );
    761 		}
    762 		else
    763 		{
    764 			break;
    765 		}
    766 
    767 		// Did we get encoding info?
    768 		if (    encoding == TIXML_ENCODING_UNKNOWN
    769 			 && node->ToDeclaration() )
    770 		{
    771 			TiXmlDeclaration* dec = node->ToDeclaration();
    772 			const char* enc = dec->Encoding();
    773 			assert( enc );
    774 
    775 			if ( *enc == 0 )
    776 				encoding = TIXML_ENCODING_UTF8;
    777 			else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
    778 				encoding = TIXML_ENCODING_UTF8;
    779 			else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
    780 				encoding = TIXML_ENCODING_UTF8;	// incorrect, but be nice
    781 			else
    782 				encoding = TIXML_ENCODING_LEGACY;
    783 		}
    784 
    785 		p = SkipWhiteSpace( p, encoding );
    786 	}
    787 
    788 	// Was this empty?
    789 	if ( !firstChild ) {
    790 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
    791 		return 0;
    792 	}
    793 
    794 	// All is well.
    795 	return p;
    796 }
    797 
    798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
    799 {
    800 	// The first error in a chain is more accurate - don't set again!
    801 	if ( error )
    802 		return;
    803 
    804 	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
    805 	error   = true;
    806 	errorId = err;
    807 	errorDesc = errorString[ errorId ];
    808 
    809 	errorLocation.Clear();
    810 	if ( pError && data )
    811 	{
    812 		data->Stamp( pError, encoding );
    813 		errorLocation = data->Cursor();
    814 	}
    815 }
    816 
    817 
    818 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
    819 {
    820 	TiXmlNode* returnNode = 0;
    821 
    822 	p = SkipWhiteSpace( p, encoding );
    823 	if( !p || !*p || *p != '<' )
    824 	{
    825 		return 0;
    826 	}
    827 
    828 	p = SkipWhiteSpace( p, encoding );
    829 
    830 	if ( !p || !*p )
    831 	{
    832 		return 0;
    833 	}
    834 
    835 	// What is this thing?
    836 	// - Elements start with a letter or underscore, but xml is reserved.
    837 	// - Comments: <!--
    838 	// - Decleration: <?xml
    839 	// - Everthing else is unknown to tinyxml.
    840 	//
    841 
    842 	const char* xmlHeader = { "<?xml" };
    843 	const char* commentHeader = { "<!--" };
    844 	const char* dtdHeader = { "<!" };
    845 	const char* cdataHeader = { "<![CDATA[" };
    846 
    847 	if ( StringEqual( p, xmlHeader, true, encoding ) )
    848 	{
    849 		#ifdef DEBUG_PARSER
    850 			TIXML_LOG( "XML parsing Declaration\n" );
    851 		#endif
    852 		returnNode = new TiXmlDeclaration();
    853 	}
    854 	else if ( StringEqual( p, commentHeader, false, encoding ) )
    855 	{
    856 		#ifdef DEBUG_PARSER
    857 			TIXML_LOG( "XML parsing Comment\n" );
    858 		#endif
    859 		returnNode = new TiXmlComment();
    860 	}
    861 	else if ( StringEqual( p, cdataHeader, false, encoding ) )
    862 	{
    863 		#ifdef DEBUG_PARSER
    864 			TIXML_LOG( "XML parsing CDATA\n" );
    865 		#endif
    866 		TiXmlText* text = new TiXmlText( "" );
    867 		text->SetCDATA( true );
    868 		returnNode = text;
    869 	}
    870 	else if ( StringEqual( p, dtdHeader, false, encoding ) )
    871 	{
    872 		#ifdef DEBUG_PARSER
    873 			TIXML_LOG( "XML parsing Unknown(1)\n" );
    874 		#endif
    875 		returnNode = new TiXmlUnknown();
    876 	}
    877 	else if (    IsAlpha( *(p+1), encoding )
    878 			  || *(p+1) == '_' )
    879 	{
    880 		#ifdef DEBUG_PARSER
    881 			TIXML_LOG( "XML parsing Element\n" );
    882 		#endif
    883 		returnNode = new TiXmlElement( "" );
    884 	}
    885 	else
    886 	{
    887 		#ifdef DEBUG_PARSER
    888 			TIXML_LOG( "XML parsing Unknown(2)\n" );
    889 		#endif
    890 		returnNode = new TiXmlUnknown();
    891 	}
    892 
    893 	if ( returnNode )
    894 	{
    895 		// Set the parent, so it can report errors
    896 		returnNode->parent = this;
    897 	}
    898 	return returnNode;
    899 }
    900 
    901 #ifdef TIXML_USE_STL
    902 
// Streams the rest of this element from 'in', appending everything read to
// *tag. On entry *tag already holds the start of the element's opening tag.
// Reading continues through the matching closing tag, streaming any text
// and child nodes on the way. Returns early, possibly with an error set on
// the owning document, when the stream stops being good or a character <= 0
// is read.
void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
{
	// We're called with some amount of pre-parsing. That is, some of "this"
	// element is in "tag". Go ahead and stream to the closing ">"
	while( in->good() )
	{
		int c = in->get();
		if ( c <= 0 )
		{
			TiXmlDocument* document = GetDocument();
			if ( document )
				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}
		(*tag) += (char) c ;

		if ( c == '>' )
			break;
	}

	// Too short to hold even "<a>"; this also keeps the at() calls below in range.
	if ( tag->length() < 3 ) return;

	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
	// If not, identify and stream.

	if (    tag->at( tag->length() - 1 ) == '>'
		 && tag->at( tag->length() - 2 ) == '/' )
	{
		// All good!
		return;
	}
	else if ( tag->at( tag->length() - 1 ) == '>' )
	{
		// There is more. Could be:
		//		text
		//		cdata text (which looks like another node)
		//		closing tag
		//		another node.
		for ( ;; )
		{
			StreamWhiteSpace( in, tag );

			// Do we have text?
			if ( in->good() && in->peek() != '<' )
			{
				// Yep, text.
				TiXmlText text( "" );
				text.StreamIn( in, tag );

				// What follows text is a closing tag or another node.
				// Go around again and figure it out.
				continue;
			}

			// We now have either a closing tag...or another node.
			// We should be at a "<", regardless.
			if ( !in->good() ) return;
			assert( in->peek() == '<' );
			// Remember where this tag's text starts inside *tag so it can
			// be handed to Identify() below.
			int tagIndex = (int) tag->length();

			bool closingTag = false;
			bool firstCharFound = false;

			// Copy the tag up to, but not including, its '>' while working
			// out whether it is a closing tag.
			for( ;; )
			{
				if ( !in->good() )
					return;

				int c = in->peek();
				if ( c <= 0 )
				{
					TiXmlDocument* document = GetDocument();
					if ( document )
						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
					return;
				}

				if ( c == '>' )
					break;

				*tag += (char) c;
				in->get();

				// Early out if we find the CDATA id.
				if ( c == '[' && tag->size() >= 9 )
				{
					// Compare the last 9 characters read so far.
					size_t len = tag->size();
					const char* start = tag->c_str() + len - 9;
					if ( strcmp( start, "<![CDATA[" ) == 0 ) {
						assert( !closingTag );
						break;
					}
				}

				// The first character after '<' that is not white space
				// decides it: '/' means this is a closing tag.
				if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
				{
					firstCharFound = true;
					if ( c == '/' )
						closingTag = true;
				}
			}
			// If it was a closing tag, then read in the closing '>' to clean up the input stream.
			// If it was not, the streaming will be done by the tag.
			if ( closingTag )
			{
				if ( !in->good() )
					return;

				int c = in->get();
				if ( c <= 0 )
				{
					TiXmlDocument* document = GetDocument();
					if ( document )
						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
					return;
				}
				assert( c == '>' );
				*tag += (char) c;

				// We are done, once we've found our closing tag.
				return;
			}
			else
			{
				// If not a closing tag, id it, and stream.
				const char* tagloc = tag->c_str() + tagIndex;
				TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
				if ( !node )
					return;
				// The node is only needed to drive the streaming; what it
				// reads is appended to *tag, and the node is then deleted.
				node->StreamIn( in, tag );
				delete node;
				node = 0;

				// No return: go around from the beginning: text, closing tag, or node.
			}
		}
	}
}
   1041 #endif
   1042 
   1043 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
   1044 {
   1045 	p = SkipWhiteSpace( p, encoding );
   1046 	TiXmlDocument* document = GetDocument();
   1047 
   1048 	if ( !p || !*p )
   1049 	{
   1050 		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
   1051 		return 0;
   1052 	}
   1053 
   1054 	if ( data )
   1055 	{
   1056 		data->Stamp( p, encoding );
   1057 		location = data->Cursor();
   1058 	}
   1059 
   1060 	if ( *p != '<' )
   1061 	{
   1062 		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
   1063 		return 0;
   1064 	}
   1065 
   1066 	p = SkipWhiteSpace( p+1, encoding );
   1067 
   1068 	// Read the name.
   1069 	const char* pErr = p;
   1070 
   1071     p = ReadName( p, &value, encoding );
   1072 	if ( !p || !*p )
   1073 	{
   1074 		if ( document )	document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
   1075 		return 0;
   1076 	}
   1077 
   1078     TIXML_STRING endTag ("</");
   1079 	endTag += value;
   1080 
   1081 	// Check for and read attributes. Also look for an empty
   1082 	// tag or an end tag.
   1083 	while ( p && *p )
   1084 	{
   1085 		pErr = p;
   1086 		p = SkipWhiteSpace( p, encoding );
   1087 		if ( !p || !*p )
   1088 		{
   1089 			if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
   1090 			return 0;
   1091 		}
   1092 		if ( *p == '/' )
   1093 		{
   1094 			++p;
   1095 			// Empty tag.
   1096 			if ( *p  != '>' )
   1097 			{
   1098 				if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
   1099 				return 0;
   1100 			}
   1101 			return (p+1);
   1102 		}
   1103 		else if ( *p == '>' )
   1104 		{
   1105 			// Done with attributes (if there were any.)
   1106 			// Read the value -- which can include other
   1107 			// elements -- read the end tag, and return.
   1108 			++p;
   1109 			p = ReadValue( p, data, encoding );		// Note this is an Element method, and will set the error if one happens.
   1110 			if ( !p || !*p ) {
   1111 				// We were looking for the end tag, but found nothing.
   1112 				// Fix for [ 1663758 ] Failure to report error on bad XML
   1113 				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
   1114 				return 0;
   1115 			}
   1116 
   1117 			// We should find the end tag now
   1118 			// note that:
   1119 			// </foo > and
   1120 			// </foo>
   1121 			// are both valid end tags.
   1122 			if ( StringEqual( p, endTag.c_str(), false, encoding ) )
   1123 			{
   1124 				p += endTag.length();
   1125 				p = SkipWhiteSpace( p, encoding );
   1126 				if ( p && *p && *p == '>' ) {
   1127 					++p;
   1128 					return p;
   1129 				}
   1130 				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
   1131 				return 0;
   1132 			}
   1133 			else
   1134 			{
   1135 				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
   1136 				return 0;
   1137 			}
   1138 		}
   1139 		else
   1140 		{
   1141 			// Try to read an attribute:
   1142 			TiXmlAttribute* attrib = new TiXmlAttribute();
   1143 			if ( !attrib )
   1144 			{
   1145 				return 0;
   1146 			}
   1147 
   1148 			attrib->SetDocument( document );
   1149 			pErr = p;
   1150 			p = attrib->Parse( p, data, encoding );
   1151 
   1152 			if ( !p || !*p )
   1153 			{
   1154 				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
   1155 				delete attrib;
   1156 				return 0;
   1157 			}
   1158 
   1159 			// Handle the strange case of double attributes:
   1160 			#ifdef TIXML_USE_STL
   1161 			TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
   1162 			#else
   1163 			TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
   1164 			#endif
   1165 			if ( node )
   1166 			{
   1167 				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
   1168 				delete attrib;
   1169 				return 0;
   1170 			}
   1171 
   1172 			attributeSet.Add( attrib );
   1173 		}
   1174 	}
   1175 	return p;
   1176 }
   1177 
   1178 
   1179 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
   1180 {
   1181 	TiXmlDocument* document = GetDocument();
   1182 
   1183 	// Read in text and elements in any order.
   1184 	const char* pWithWhiteSpace = p;
   1185 	p = SkipWhiteSpace( p, encoding );
   1186 
   1187 	while ( p && *p )
   1188 	{
   1189 		if ( *p != '<' )
   1190 		{
   1191 			// Take what we have, make a text element.
   1192 			TiXmlText* textNode = new TiXmlText( "" );
   1193 
   1194 			if ( !textNode )
   1195 			{
   1196 			    return 0;
   1197 			}
   1198 
   1199 			if ( TiXmlBase::IsWhiteSpaceCondensed() )
   1200 			{
   1201 				p = textNode->Parse( p, data, encoding );
   1202 			}
   1203 			else
   1204 			{
   1205 				// Special case: we want to keep the white space
   1206 				// so that leading spaces aren't removed.
   1207 				p = textNode->Parse( pWithWhiteSpace, data, encoding );
   1208 			}
   1209 
   1210 			if ( !textNode->Blank() )
   1211 				LinkEndChild( textNode );
   1212 			else
   1213 				delete textNode;
   1214 		}
   1215 		else
   1216 		{
   1217 			// We hit a '<'
   1218 			// Have we hit a new element or an end tag? This could also be
   1219 			// a TiXmlText in the "CDATA" style.
   1220 			if ( StringEqual( p, "</", false, encoding ) )
   1221 			{
   1222 				return p;
   1223 			}
   1224 			else
   1225 			{
   1226 				TiXmlNode* node = Identify( p, encoding );
   1227 				if ( node )
   1228 				{
   1229 					p = node->Parse( p, data, encoding );
   1230 					LinkEndChild( node );
   1231 				}
   1232 				else
   1233 				{
   1234 					return 0;
   1235 				}
   1236 			}
   1237 		}
   1238 		pWithWhiteSpace = p;
   1239 		p = SkipWhiteSpace( p, encoding );
   1240 	}
   1241 
   1242 	if ( !p )
   1243 	{
   1244 		if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
   1245 	}
   1246 	return p;
   1247 }
   1248 
   1249 
   1250 #ifdef TIXML_USE_STL
   1251 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
   1252 {
   1253 	while ( in->good() )
   1254 	{
   1255 		int c = in->get();
   1256 		if ( c <= 0 )
   1257 		{
   1258 			TiXmlDocument* document = GetDocument();
   1259 			if ( document )
   1260 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
   1261 			return;
   1262 		}
   1263 		(*tag) += (char) c;
   1264 
   1265 		if ( c == '>' )
   1266 		{
   1267 			// All is well.
   1268 			return;
   1269 		}
   1270 	}
   1271 }
   1272 #endif
   1273 
   1274 
   1275 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
   1276 {
   1277 	TiXmlDocument* document = GetDocument();
   1278 	p = SkipWhiteSpace( p, encoding );
   1279 
   1280 	if ( data )
   1281 	{
   1282 		data->Stamp( p, encoding );
   1283 		location = data->Cursor();
   1284 	}
   1285 	if ( !p || !*p || *p != '<' )
   1286 	{
   1287 		if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
   1288 		return 0;
   1289 	}
   1290 	++p;
   1291     value = "";
   1292 
   1293 	while ( p && *p && *p != '>' )
   1294 	{
   1295 		value += *p;
   1296 		++p;
   1297 	}
   1298 
   1299 	if ( !p )
   1300 	{
   1301 		if ( document )
   1302 			document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
   1303 	}
   1304 	if ( p && *p == '>' )
   1305 		return p+1;
   1306 	return p;
   1307 }
   1308 
   1309 #ifdef TIXML_USE_STL
   1310 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
   1311 {
   1312 	while ( in->good() )
   1313 	{
   1314 		int c = in->get();
   1315 		if ( c <= 0 )
   1316 		{
   1317 			TiXmlDocument* document = GetDocument();
   1318 			if ( document )
   1319 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
   1320 			return;
   1321 		}
   1322 
   1323 		(*tag) += (char) c;
   1324 
   1325 		if ( c == '>'
   1326 			 && tag->at( tag->length() - 2 ) == '-'
   1327 			 && tag->at( tag->length() - 3 ) == '-' )
   1328 		{
   1329 			// All is well.
   1330 			return;
   1331 		}
   1332 	}
   1333 }
   1334 #endif
   1335 
   1336 
   1337 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
   1338 {
   1339 	TiXmlDocument* document = GetDocument();
   1340 	value = "";
   1341 
   1342 	p = SkipWhiteSpace( p, encoding );
   1343 
   1344 	if ( data )
   1345 	{
   1346 		data->Stamp( p, encoding );
   1347 		location = data->Cursor();
   1348 	}
   1349 	const char* startTag = "<!--";
   1350 	const char* endTag   = "-->";
   1351 
   1352 	if ( !StringEqual( p, startTag, false, encoding ) )
   1353 	{
   1354 		if ( document )
   1355 			document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
   1356 		return 0;
   1357 	}
   1358 	p += strlen( startTag );
   1359 
   1360 	// [ 1475201 ] TinyXML parses entities in comments
   1361 	// Oops - ReadText doesn't work, because we don't want to parse the entities.
   1362 	// p = ReadText( p, &value, false, endTag, false, encoding );
   1363 	//
   1364 	// from the XML spec:
   1365 	/*
   1366 	 [Definition: Comments may appear anywhere in a document outside other markup; in addition,
   1367 	              they may appear within the document type declaration at places allowed by the grammar.
   1368 				  They are not part of the document's character data; an XML processor MAY, but need not,
   1369 				  make it possible for an application to retrieve the text of comments. For compatibility,
   1370 				  the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
   1371 				  references MUST NOT be recognized within comments.
   1372 
   1373 				  An example of a comment:
   1374 
   1375 				  <!-- declarations for <head> & <body> -->
   1376 	*/
   1377 
   1378     value = "";
   1379 	// Keep all the white space.
   1380 	while (	p && *p && !StringEqual( p, endTag, false, encoding ) )
   1381 	{
   1382 		value.append( p, 1 );
   1383 		++p;
   1384 	}
   1385 	if ( p && *p )
   1386 		p += strlen( endTag );
   1387 
   1388 	return p;
   1389 }
   1390 
   1391 
   1392 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
   1393 {
   1394 	p = SkipWhiteSpace( p, encoding );
   1395 	if ( !p || !*p ) return 0;
   1396 
   1397 	if ( data )
   1398 	{
   1399 		data->Stamp( p, encoding );
   1400 		location = data->Cursor();
   1401 	}
   1402 	// Read the name, the '=' and the value.
   1403 	const char* pErr = p;
   1404 	p = ReadName( p, &name, encoding );
   1405 	if ( !p || !*p )
   1406 	{
   1407 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
   1408 		return 0;
   1409 	}
   1410 	p = SkipWhiteSpace( p, encoding );
   1411 	if ( !p || !*p || *p != '=' )
   1412 	{
   1413 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
   1414 		return 0;
   1415 	}
   1416 
   1417 	++p;	// skip '='
   1418 	p = SkipWhiteSpace( p, encoding );
   1419 	if ( !p || !*p )
   1420 	{
   1421 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
   1422 		return 0;
   1423 	}
   1424 
   1425 	const char* end;
   1426 	const char SINGLE_QUOTE = '\'';
   1427 	const char DOUBLE_QUOTE = '\"';
   1428 
   1429 	if ( *p == SINGLE_QUOTE )
   1430 	{
   1431 		++p;
   1432 		end = "\'";		// single quote in string
   1433 		p = ReadText( p, &value, false, end, false, encoding );
   1434 	}
   1435 	else if ( *p == DOUBLE_QUOTE )
   1436 	{
   1437 		++p;
   1438 		end = "\"";		// double quote in string
   1439 		p = ReadText( p, &value, false, end, false, encoding );
   1440 	}
   1441 	else
   1442 	{
   1443 		// All attribute values should be in single or double quotes.
   1444 		// But this is such a common error that the parser will try
   1445 		// its best, even without them.
   1446 		value = "";
   1447 		while (    p && *p											// existence
   1448 				&& !IsWhiteSpace( *p )								// whitespace
   1449 				&& *p != '/' && *p != '>' )							// tag end
   1450 		{
   1451 			if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
   1452 				// [ 1451649 ] Attribute values with trailing quotes not handled correctly
   1453 				// We did not have an opening quote but seem to have a
   1454 				// closing one. Give up and throw an error.
   1455 				if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
   1456 				return 0;
   1457 			}
   1458 			value += *p;
   1459 			++p;
   1460 		}
   1461 	}
   1462 	return p;
   1463 }
   1464 
   1465 #ifdef TIXML_USE_STL
   1466 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
   1467 {
   1468 	while ( in->good() )
   1469 	{
   1470 		int c = in->peek();
   1471 		if ( !cdata && (c == '<' ) )
   1472 		{
   1473 			return;
   1474 		}
   1475 		if ( c <= 0 )
   1476 		{
   1477 			TiXmlDocument* document = GetDocument();
   1478 			if ( document )
   1479 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
   1480 			return;
   1481 		}
   1482 
   1483 		(*tag) += (char) c;
   1484 		in->get();	// "commits" the peek made above
   1485 
   1486 		if ( cdata && c == '>' && tag->size() >= 3 ) {
   1487 			size_t len = tag->size();
   1488 			if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
   1489 				// terminator of cdata.
   1490 				return;
   1491 			}
   1492 		}
   1493 	}
   1494 }
   1495 #endif
   1496 
   1497 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
   1498 {
   1499 	value = "";
   1500 	TiXmlDocument* document = GetDocument();
   1501 
   1502 	if ( data )
   1503 	{
   1504 		data->Stamp( p, encoding );
   1505 		location = data->Cursor();
   1506 	}
   1507 
   1508 	const char* const startTag = "<![CDATA[";
   1509 	const char* const endTag   = "]]>";
   1510 
   1511 	if ( cdata || StringEqual( p, startTag, false, encoding ) )
   1512 	{
   1513 		cdata = true;
   1514 
   1515 		if ( !StringEqual( p, startTag, false, encoding ) )
   1516 		{
   1517 			if ( document )
   1518 				document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
   1519 			return 0;
   1520 		}
   1521 		p += strlen( startTag );
   1522 
   1523 		// Keep all the white space, ignore the encoding, etc.
   1524 		while (	   p && *p
   1525 				&& !StringEqual( p, endTag, false, encoding )
   1526 			  )
   1527 		{
   1528 			value += *p;
   1529 			++p;
   1530 		}
   1531 
   1532 		TIXML_STRING dummy;
   1533 		p = ReadText( p, &dummy, false, endTag, false, encoding );
   1534 		return p;
   1535 	}
   1536 	else
   1537 	{
   1538 		bool ignoreWhite = true;
   1539 
   1540 		const char* end = "<";
   1541 		p = ReadText( p, &value, ignoreWhite, end, false, encoding );
   1542 		if ( p && *p )
   1543 			return p-1;	// don't truncate the '<'
   1544 		return 0;
   1545 	}
   1546 }
   1547 
   1548 #ifdef TIXML_USE_STL
   1549 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
   1550 {
   1551 	while ( in->good() )
   1552 	{
   1553 		int c = in->get();
   1554 		if ( c <= 0 )
   1555 		{
   1556 			TiXmlDocument* document = GetDocument();
   1557 			if ( document )
   1558 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
   1559 			return;
   1560 		}
   1561 		(*tag) += (char) c;
   1562 
   1563 		if ( c == '>' )
   1564 		{
   1565 			// All is well.
   1566 			return;
   1567 		}
   1568 	}
   1569 }
   1570 #endif
   1571 
   1572 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
   1573 {
   1574 	p = SkipWhiteSpace( p, _encoding );
   1575 	// Find the beginning, find the end, and look for
   1576 	// the stuff in-between.
   1577 	TiXmlDocument* document = GetDocument();
   1578 	if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
   1579 	{
   1580 		if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
   1581 		return 0;
   1582 	}
   1583 	if ( data )
   1584 	{
   1585 		data->Stamp( p, _encoding );
   1586 		location = data->Cursor();
   1587 	}
   1588 	p += 5;
   1589 
   1590 	version = "";
   1591 	encoding = "";
   1592 	standalone = "";
   1593 
   1594 	while ( p && *p )
   1595 	{
   1596 		if ( *p == '>' )
   1597 		{
   1598 			++p;
   1599 			return p;
   1600 		}
   1601 
   1602 		p = SkipWhiteSpace( p, _encoding );
   1603 		if ( StringEqual( p, "version", true, _encoding ) )
   1604 		{
   1605 			TiXmlAttribute attrib;
   1606 			p = attrib.Parse( p, data, _encoding );
   1607 			version = attrib.Value();
   1608 		}
   1609 		else if ( StringEqual( p, "encoding", true, _encoding ) )
   1610 		{
   1611 			TiXmlAttribute attrib;
   1612 			p = attrib.Parse( p, data, _encoding );
   1613 			encoding = attrib.Value();
   1614 		}
   1615 		else if ( StringEqual( p, "standalone", true, _encoding ) )
   1616 		{
   1617 			TiXmlAttribute attrib;
   1618 			p = attrib.Parse( p, data, _encoding );
   1619 			standalone = attrib.Value();
   1620 		}
   1621 		else
   1622 		{
   1623 			// Read over whatever it is.
   1624 			while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
   1625 				++p;
   1626 		}
   1627 	}
   1628 	return 0;
   1629 }
   1630 
   1631 bool TiXmlText::Blank() const
   1632 {
   1633 	for ( unsigned i=0; i<value.length(); i++ )
   1634 		if ( !IsWhiteSpace( value[i] ) )
   1635 			return false;
   1636 	return true;
   1637 }
   1638 
   1639