gloox 1.0.28
parser.cpp
1/*
2 Copyright (c) 2004-2023 by Jakob Schröter <js@camaya.net>
3 This file is part of the gloox library. http://camaya.net/gloox
4
5 This software is distributed under a license. The full license
6 agreement can be found in the file LICENSE in this distribution.
7 This software may not be copied, modified, sold or distributed
8 other than expressed in the named license agreement.
9
10 This software is distributed without any warranty.
11*/
12
13#include "gloox.h"
14#include "util.h"
15#include "parser.h"
16
17#include <cstdlib>
18
19namespace gloox
20{
21
22 Parser::Parser( TagHandler* ph, bool deleteRoot )
23 : m_tagHandler( ph ), m_current( 0 ), m_root( 0 ), m_xmlnss( 0 ), m_state( Initial ),
24 m_preamble( 0 ), m_quote( false ), m_haveTagPrefix( false ), m_haveAttribPrefix( false ),
25 m_attribIsXmlns( false ), m_deleteRoot( deleteRoot )
26 {
27 }
28
30 {
31 cleanup( true );
32 }
33
34 Parser::DecodeState Parser::decode( std::string::size_type& pos, const std::string& data )
35 {
36 std::string::size_type p = data.find( ';', pos );
37 std::string::size_type diff = p - pos;
38
39 if( p == std::string::npos )
40 {
41 m_backBuffer = data.substr( pos );
42 return DecodeInsufficient;
43 }
44
45 if( diff < 3 || diff > 9 )
46 return DecodeInvalid;
47
48 std::string rep;
49 switch( data[pos + 1] )
50 {
51 case '#':
52 {
53 int base = 10;
54 int idx = 2;
55
56 if( data[pos + 2] == 'x' || data[pos + 2] == 'X' )
57 {
58 base = 16;
59 idx = 3;
60 }
61
62 char* end;
63 const long int val = std::strtol( data.data() + pos + idx, &end, base );
64 if( *end != ';' || val < 0 )
65 return DecodeInvalid;
66
67 if( val == 0x9 || val == 0xA || val == 0xD || ( val >= 0x20 && val <= 0x7F ) )
68 {
69 rep += char( val );
70 }
71 else if( val >= 0x80 && val <= 0x7FF )
72 {
73 rep += char( 192 + ( val >> 6 ) );
74 rep += char( 128 + ( val % 64 ) );
75 }
76 else if( ( val >= 0x800 && val <= 0xD7FF ) || ( val >= 0xE000 && val <= 0xFFFD ) )
77 {
78 rep += char( 224 + ( val >> 12 ) );
79 rep += char( 128 + ( ( val >> 6 ) % 64 ) );
80 rep += char( 128 + ( val % 64 ) );
81 }
82 else if( val >= 0x100000 && val < 0x10FFFF )
83 {
84 rep += char( 240 + ( val >> 18 ) );
85 rep += char( 128 + ( ( val >> 12 ) % 64 ) );
86 rep += char( 128 + ( ( val >> 6 ) % 64 ) );
87 rep += char( 128 + ( val % 64 ) );
88 }
89 else
90 return DecodeInvalid;
91 }
92 break;
93 case 'l':
94 if( diff == 3 && data[pos + 2] == 't' )
95 rep += '<';
96 else
97 return DecodeInvalid;
98 break;
99 case 'g':
100 if( diff == 3 && data[pos + 2] == 't' )
101 rep += '>';
102 else
103 return DecodeInvalid;
104 break;
105 case 'a':
106 if( diff == 5 && !data.compare( pos + 1, 5, "apos;" ) )
107 rep += '\'';
108 else if( diff == 4 && !data.compare( pos + 1, 4, "amp;" ) )
109 rep += '&';
110 else
111 return DecodeInvalid;
112 break;
113 case 'q':
114 if( diff == 5 && !data.compare( pos + 1, 5, "quot;" ) )
115 rep += '"';
116 else
117 return DecodeInvalid;
118 break;
119 default:
120 return DecodeInvalid;
121 }
122
123 switch( m_state )
124 {
125 case InterTag:
126 case TagInside:
127 m_cdata += rep;
128 break;
129 case TagAttributeValue:
130 m_value += rep;
131 break;
132 default:
133 break;
134 }
135 pos += diff;
136 return DecodeValid;
137 }
138
139 Parser::ForwardScanState Parser::forwardScan( std::string::size_type& pos, const std::string& data,
140 const std::string& needle )
141 {
142 if( pos + needle.length() <= data.length() )
143 {
144 if( !data.compare( pos, needle.length(), needle ) )
145 {
146 pos += needle.length() - 1;
147 return ForwardFound;
148 }
149 else
150 {
151 return ForwardNotFound;
152 }
153 }
154 else
155 {
156 m_backBuffer = data.substr( pos );
157 return ForwardInsufficientSize;
158 }
159 }
160
161 int Parser::feed( std::string& data )
162 {
163 if( !m_backBuffer.empty() )
164 {
165 data.insert( 0, m_backBuffer );
166 m_backBuffer = EmptyString;
167 }
168
169 std::string::size_type count = data.length();
170 for( std::string::size_type i = 0; i < count; ++i )
171 {
172 const unsigned char c = data[i];
173// printf( "found char: %c, ", c );
174
175 switch( m_state )
176 {
177 case Initial:
178// printf( "Initial: %c\n", c );
179 if( isWhitespace( c ) )
180 break;
181
182 switch( c )
183 {
184 case '<':
185 m_state = TagOpening;
186 break;
187 default:
188 cleanup();
189 return static_cast<int>( i );
190 break;
191 }
192 break;
193 case InterTag:
194// printf( "InterTag: %c\n", c );
195 m_tag = EmptyString;
196 if( isWhitespace( c ) )
197 {
198 m_state = TagInside;
199 if( m_current )
200 m_cdata += c;
201 break;
202 }
203
204 switch( c )
205 {
206 case '&':
207// printf( "InterTag, calling decode\n" );
208 switch( decode( i, data ) )
209 {
210 case DecodeValid:
211 m_state = TagInside;
212 break;
213 case DecodeInvalid:
214 cleanup();
215 return static_cast<int>( i );
216 case DecodeInsufficient:
217 return -1;
218 }
219 break;
220 case '<':
221 m_state = TagOpening;
222 break;
223 case '>':
224 default:
225 if( m_current )
226 {
227 m_cdata += c;
228 m_state = TagInside;
229 }
230 break;
231 }
232 break;
233 case TagOpening: // opening '<' has been found before
234// printf( "TagOpening: %c\n", c );
235 if( isWhitespace( c ) )
236 break;
237
238 switch( c )
239 {
240 case '<':
241 case '>':
242 case '&':
243 cleanup();
244 return static_cast<int>( i );
245 break;
246 case '/':
247 m_state = TagClosingSlash;
248 break;
249 case '?':
250 m_state = TagNameCollect;
251 m_preamble = 1;
252 break;
253 case '!':
254 switch( forwardScan( i, data, "![CDATA[" ) )
255 {
256 case ForwardFound:
257 m_state = TagCDATASection;
258 break;
259 case ForwardNotFound:
260 cleanup();
261 return static_cast<int>( i );
262 case ForwardInsufficientSize:
263 return -1;
264 }
265 break;
266 default:
267 m_tag += c;
268 m_state = TagNameCollect;
269 break;
270 }
271 break;
272 case TagCDATASection:
273 switch( c )
274 {
275 case ']':
276 switch( forwardScan( i, data, "]]>" ) )
277 {
278 case ForwardFound:
279 m_state = TagInside;
280 break;
281 case ForwardNotFound:
282 m_cdata += c;
283 break;
284 case ForwardInsufficientSize:
285 return -1;
286 }
287 break;
288 default:
289 m_cdata += c;
290 break;
291 }
292 break;
293 case TagNameCollect: // we're collecting the tag's name, we have at least one octet already
294// printf( "TagNameCollect: %c\n", c );
295 if( isWhitespace( c ) )
296 {
297 m_state = TagNameComplete;
298 break;
299 }
300
301 switch( c )
302 {
303 case '<':
304 case '?':
305 case '!':
306 case '&':
307 cleanup();
308 return static_cast<int>( i );
309 break;
310 case '/':
311 m_state = TagOpeningSlash;
312 break;
313 case '>':
314 addTag();
315 m_state = TagInside;
316 break;
317 case ':':
318 if( !m_haveTagPrefix )
319 {
320 m_haveTagPrefix = true;
321 m_tagPrefix = m_tag;
322 m_tag = EmptyString;
323 }
324 else
325 {
326 cleanup();
327 return static_cast<int>( i );
328 }
329 break;
330 default:
331 m_tag += c;
332 break;
333 }
334 break;
335 case TagInside: // we're inside a tag, expecting a child tag or cdata
336// printf( "TagInside: %c\n", c );
337 m_tag = EmptyString;
338 switch( c )
339 {
340 case '<':
341 addCData();
342 m_state = TagOpening;
343 break;
344 case '&':
345// printf( "TagInside, calling decode\n" );
346 switch( decode( i, data ) )
347 {
348 case DecodeValid:
349 break;
350 case DecodeInvalid:
351 cleanup();
352 return static_cast<int>( i );
353 case DecodeInsufficient:
354 return -1;
355 }
356 break;
357 default:
358 m_cdata += c;
359 break;
360 }
361 break;
362 case TagOpeningSlash: // a slash in an opening tag has been found, initing close of the tag
363// printf( "TagOpeningSlash: %c\n", c );
364 if( isWhitespace( c ) )
365 break;
366
367 if( c == '>' )
368 {
369 addTag();
370 if( !closeTag() )
371 {
372// printf( "noipe, here\n" );
373 cleanup();
374 return static_cast<int>( i );
375 }
376
377 m_state = InterTag;
378 }
379 else
380 {
381 cleanup();
382 return static_cast<int>( i );
383 }
384 break;
385 case TagClosingSlash: // we have found the '/' of a closing tag
386// printf( "TagClosingSlash: %c\n", c );
387 if( isWhitespace( c ) )
388 break;
389
390 switch( c )
391 {
392 case '>':
393 case '<':
394 case '/':
395 cleanup();
396 return static_cast<int>( i );
397 break;
398 default:
399 m_tag += c;
400 m_state = TagClosing;
401 break;
402 }
403 break;
404 case TagClosing: // we're collecting the name of a closing tag
405// printf( "TagClosing: %c\n", c );
406 switch( c )
407 {
408 case '<':
409 case '/':
410 case '!':
411 case '?':
412 case '&':
413 cleanup();
414 return static_cast<int>( i );
415 break;
416 case ':':
417 if( !m_haveTagPrefix )
418 {
419 m_haveTagPrefix = true;
420 m_tagPrefix = m_tag;
421 m_tag = EmptyString;
422 }
423 else
424 {
425 cleanup();
426 return static_cast<int>( i );
427 }
428 break;
429 case '>':
430 if( !closeTag() )
431 {
432// printf( "here\n" );
433 cleanup();
434 return static_cast<int>( i );
435 }
436 m_state = InterTag;
437 break;
438 default:
439 m_tag += c;
440 break;
441 }
442 break;
443 case TagNameComplete: // a tag name is complete, expect tag close or attribs
444// printf( "TagNameComplete: %c\n", c );
445 if( isWhitespace( c ) )
446 break;
447
448 switch( c )
449 {
450 case '<':
451 case '!':
452 case '&':
453 cleanup();
454 return static_cast<int>( i );
455 break;
456 case '/':
457 m_state = TagOpeningSlash;
458 break;
459 case '>':
460 if( m_preamble == 1 )
461 {
462 cleanup();
463 return static_cast<int>( i );
464 }
465 m_state = TagInside;
466 addTag();
467 break;
468 case '?':
469 if( m_preamble == 1 )
470 m_preamble = 2;
471 else
472 {
473 cleanup();
474 return static_cast<int>( i );
475 }
476 break;
477 default:
478 m_attrib += c;
479 m_state = TagAttribute;
480 break;
481 }
482 break;
483 case TagAttribute: // we're collecting the name of an attribute, we have at least 1 octet
484// printf( "TagAttribute: %c\n", c );
485 if( isWhitespace( c ) )
486 {
487 m_state = TagAttributeComplete;
488 break;
489 }
490
491 switch( c )
492 {
493 case '<':
494 case '/':
495 case '>':
496 case '?':
497 case '!':
498 case '&':
499 cleanup();
500 return static_cast<int>( i );
501 break;
502 case '=':
503 m_state = TagAttributeEqual;
504 break;
505 case ':':
506 if( !m_haveAttribPrefix && m_attrib != XMLNS )
507 {
508 m_haveAttribPrefix = true;
509 m_attribPrefix = m_attrib;
510 m_attrib = EmptyString;
511 }
512 else if( m_attrib == XMLNS )
513 {
514 m_attribIsXmlns = true;
515 m_attrib = EmptyString;
516 }
517 else
518 {
519 cleanup();
520 return static_cast<int>( i );
521 }
522 break;
523 default:
524 m_attrib += c;
525 }
526 break;
527 case TagAttributeComplete: // we're expecting an equals sign or ws
528// printf( "TagAttributeComplete: %c\n", c );
529 if( isWhitespace( c ) )
530 break;
531
532 switch( c )
533 {
534 case '=':
535 m_state = TagAttributeEqual;
536 break;
537 default:
538 cleanup();
539 return static_cast<int>( i );
540 break;
541 }
542 break;
543 case TagAttributeEqual: // we have found an equals sign
544// printf( "TagAttributeEqual: %c\n", c );
545 if( isWhitespace( c ) )
546 break;
547
548 switch( c )
549 {
550 case '"':
551 m_quote = true;
552 case '\'':
553 m_state = TagAttributeValue;
554 break;
555 default:
556 cleanup();
557 return static_cast<int>( i );
558 break;
559 }
560 break;
561 case TagAttributeValue: // we're expecting value data
562// printf( "TagValue: %c\n", c );
563 switch( c )
564 {
565 case '<':
566 cleanup();
567 return static_cast<int>( i );
568 break;
569 case '\'':
570 if( m_quote )
571 {
572 m_value += c;
573 break;
574 }
575 case '"':
576 addAttribute();
577 m_state = TagNameAlmostComplete;
578 m_quote = false;
579 break;
580 case '&':
581// printf( "TagAttributeValue, calling decode\n" );
582 switch( decode( i, data ) )
583 {
584 case DecodeValid:
585 break;
586 case DecodeInvalid:
587 cleanup();
588 return static_cast<int>( i );
589 case DecodeInsufficient:
590 return -1;
591 }
592 break;
593 case '>':
594 default:
595 m_value += c;
596 }
597 break;
598 case TagNameAlmostComplete:
599// printf( "TagAttributeEqual: %c\n", c );
600 if( isWhitespace( c ) )
601 {
602 m_state = TagNameComplete;
603 break;
604 }
605
606 switch( c )
607 {
608 case '/':
609 m_state = TagOpeningSlash;
610 break;
611 case '>':
612 if( m_preamble == 1 )
613 {
614 cleanup();
615 return static_cast<int>( i );
616 }
617 m_state = TagInside;
618 addTag();
619 break;
620 case '?':
621 if( m_preamble == 1 )
622 m_preamble = 2;
623 else
624 {
625 cleanup();
626 return static_cast<int>( i );
627 }
628 break;
629 default:
630 cleanup();
631 return static_cast<int>( i );
632 break;
633 }
634 break;
635 default:
636// printf( "default action!?\n" );
637 break;
638 }
639// printf( "parser state: %d\n", m_state );
640 }
641
642 return -1;
643 }
644
645 void Parser::addTag()
646 {
647 if( !m_root )
648 {
649// printf( "created Tag named %s, ", m_tag.c_str() );
650 m_root = new Tag( m_tag );
651 m_current = m_root;
652 }
653 else
654 {
655// printf( "created Tag named %s, ", m_tag.c_str() );
656 m_current = new Tag( m_current, m_tag );
657 }
658
659 if( m_haveTagPrefix )
660 {
661// printf( "setting tag prefix: %s\n", m_tagPrefix.c_str() );
662 m_current->setPrefix( m_tagPrefix );
663 m_haveTagPrefix = false;
664 }
665
666 if( m_attribs.size() )
667 {
668 m_current->setAttributes( m_attribs );
669// printf( "added %d attributes, ", m_attribs.size() );
670 m_attribs.clear();
671 }
672
673 if( m_xmlnss )
674 {
675// printf( "have ns decls\n" );
676// StringMap::const_iterator it = m_xmlnss->begin();
677// for( ; it != m_xmlnss->end(); ++it )
678// printf( "%s='%s'\n", (*it).first.c_str(), (*it).second.c_str() );
679 m_current->setXmlns( m_xmlnss );
680 m_xmlnss = 0;
681 }
682
683 m_current->setXmlns( m_xmlns );
684 m_xmlns = EmptyString;
685
686 if( m_tag == "stream" && m_root->xmlns() == XMLNS_STREAM )
687 {
688 streamEvent( m_root );
689 cleanup( m_deleteRoot );
690 return;
691 }
692// else
693// printf( "%s, ", m_root->xml().c_str() );
694
695 if( m_root && m_root == m_current && m_tagPrefix == "stream" )
696 m_root->setXmlns( XMLNS_STREAM, m_tagPrefix );
697
698 if( m_tag == "xml" && m_preamble == 2 )
699 cleanup();
700 }
701
702 void Parser::addAttribute()
703 {
704 Tag::Attribute* attr = new Tag::Attribute( m_attrib, m_value );;
705 if( m_attribIsXmlns )
706 {
707 if( !m_xmlnss )
708 m_xmlnss = new StringMap();
709
710 (*m_xmlnss)[m_attrib] = m_value;
711 attr->setPrefix( XMLNS );
712 }
713 else
714 {
715// printf( "adding attribute: %s:%s='%s'\n", m_attribPrefix.c_str(), m_attrib.c_str(), m_value.c_str() );
716 if( !m_attribPrefix.empty() )
717 attr->setPrefix( m_attribPrefix );
718 if( m_attrib == XMLNS )
719 m_xmlns = m_value;
720 }
721 m_attribs.push_back( attr );
722 m_attrib = EmptyString;
723 m_value = EmptyString;
724 m_attribPrefix = EmptyString;
725 m_haveAttribPrefix = false;
726 m_attribIsXmlns = false;
727 }
728
729 void Parser::addCData()
730 {
731 if( m_current && !m_cdata.empty() )
732 {
733 m_current->addCData( m_cdata );
734// printf( "added cdata %s to %s: %s\n",
735// m_cdata.c_str(), m_current->name().c_str(), m_current->xml().c_str() );
736 m_cdata = EmptyString;
737 }
738 }
739
740 bool Parser::closeTag()
741 {
742// printf( "about to close, " );
743
744 if( m_tag == "stream" && m_tagPrefix == "stream" )
745 return true;
746
747 if( !m_current || m_current->name() != m_tag
748 || ( !m_current->prefix().empty() && m_current->prefix() != m_tagPrefix ) )
749 {
750// printf( "current xml: %s\n", m_current->xml().c_str() );
751// printf( "current name: %s, m_tag: %s\n", m_current->name().c_str(), m_tag.c_str() );
752// printf( "current prefix: %s, m_tagPrefix: %s\n", m_current->prefix().c_str(), m_tagPrefix.c_str() );
753 return false;
754 }
755
756// printf( "m_current: %s, ", m_current->name().c_str() );
757// printf( "m_tag: %s, ", m_tag.c_str() );
758
759 m_tagPrefix = EmptyString;
760 m_haveTagPrefix = false;
761
762 if( m_current->parent() )
763 m_current = m_current->parent();
764 else
765 {
766// printf( "pushing upstream\n" );
767 streamEvent( m_root );
768 cleanup( m_deleteRoot );
769 }
770
771 return true;
772 }
773
774 void Parser::cleanup( bool deleteRoot )
775 {
776 if( deleteRoot )
777 delete m_root;
778 m_root = 0;
779 m_current = 0;
780 delete m_xmlnss;
781 m_xmlnss = 0;
782 m_cdata = EmptyString;
783 m_tag = EmptyString;
784 m_attrib = EmptyString;
785 m_attribPrefix = EmptyString;
786 m_tagPrefix = EmptyString;
787 m_haveAttribPrefix = false;
788 m_haveTagPrefix = false;
789 m_value = EmptyString;
790 m_xmlns = EmptyString;
791 util::clearList( m_attribs );
792 m_attribs.clear();
793 m_state = Initial;
794 m_preamble = 0;
795 }
796
797 bool Parser::isWhitespace( unsigned char c )
798 {
799 return ( c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20 );
800 }
801
802 void Parser::streamEvent( Tag* tag )
803 {
804 if( m_tagHandler )
805 m_tagHandler->handleTag( tag );
806 }
807
808}
Parser(TagHandler *ph, bool deleteRoot=true)
Definition parser.cpp:22
void cleanup(bool deleteRoot=true)
Definition parser.cpp:774
int feed(std::string &data)
Definition parser.cpp:161
virtual ~Parser()
Definition parser.cpp:29
A virtual interface which can be reimplemented to receive non-XMPP Core stanzas.
Definition taghandler.h:33
virtual void handleTag(Tag *tag)=0
This is an abstraction of an XML element.
Definition tag.h:47
const std::string & name() const
Definition tag.h:394
bool setPrefix(const std::string &prefix)
Definition tag.cpp:565
const std::string xmlns() const
Definition tag.cpp:543
bool addCData(const std::string &cdata)
Definition tag.cpp:481
Tag * parent() const
Definition tag.h:526
void setAttributes(const AttributeList &attributes)
Definition tag.cpp:409
bool setXmlns(const std::string &xmlns, const std::string &prefix=EmptyString)
Definition tag.cpp:522
const std::string & prefix() const
Definition tag.h:249
void clearList(std::list< T * > &L)
Definition util.h:152
The namespace for the gloox library.
Definition adhoc.cpp:28
const std::string XMLNS_STREAM
Definition gloox.cpp:84
const std::string EmptyString
Definition gloox.cpp:124
const std::string XMLNS
Definition gloox.cpp:122
std::map< std::string, std::string > StringMap
Definition gloox.h:1261