From fc2b0a51fb70ad19e5c628229e80cf34fc9ebd96 Mon Sep 17 00:00:00 2001 From: halx99 Date: Sat, 18 Feb 2023 10:26:11 +0800 Subject: [PATCH] Tidy structure --- xsxml/.clang-format => .clang-format | 104 +- .../no-recursive => no-recursive}/xsxml.hpp | 3354 ++++++++--------- xsxml/xsxml.hpp => xsxml.hpp | 3140 +++++++-------- 3 files changed, 3299 insertions(+), 3299 deletions(-) rename xsxml/.clang-format => .clang-format (95%) rename {xsxml/no-recursive => no-recursive}/xsxml.hpp (96%) rename xsxml/xsxml.hpp => xsxml.hpp (97%) diff --git a/xsxml/.clang-format b/.clang-format similarity index 95% rename from xsxml/.clang-format rename to .clang-format index 9bfec47..4827302 100644 --- a/xsxml/.clang-format +++ b/.clang-format @@ -1,52 +1,52 @@ ---- -BasedOnStyle: LLVM - -# Allow double brackets such as std::vector>. -Standard: Cpp11 - -SortIncludes: false - -# Keep lines under 100 columns long. -ColumnLimit: 100 - -# Always break before braces -BreakBeforeBraces: Custom -BraceWrapping: - AfterClass: true - AfterControlStatement: true - AfterEnum: true - AfterFunction: true - AfterNamespace: true - AfterStruct: true - AfterUnion: true - BeforeCatch: true - BeforeElse: true - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false - - # Keeps extern "C" blocks unindented. - AfterExternBlock: false - -# Indent case labels. -IndentCaseLabels: true - -# Right-align pointers and references -PointerAlignment: Left - -# ANGLE likes to align things as much as possible. -AlignOperands: true -AlignConsecutiveAssignments: true - -# Use 2 space negative offset for access modifiers -AccessModifierOffset: -2 - -# TODO(jmadill): Decide if we want this on. Doesn't have an "all or none" mode. -AllowShortCaseLabelsOnASingleLine: false - -# Useful for spacing out functions in classes -KeepEmptyLinesAtTheStartOfBlocks: true - -# Indent nested PP directives. 
-IndentPPDirectives: AfterHash +--- +BasedOnStyle: LLVM + +# Allow double brackets such as std::vector>. +Standard: Cpp11 + +SortIncludes: false + +# Keep lines under 100 columns long. +ColumnLimit: 100 + +# Always break before braces +BreakBeforeBraces: Custom +BraceWrapping: + AfterClass: true + AfterControlStatement: true + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterStruct: true + AfterUnion: true + BeforeCatch: true + BeforeElse: true + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false + + # Keeps extern "C" blocks unindented. + AfterExternBlock: false + +# Indent case labels. +IndentCaseLabels: true + +# Right-align pointers and references +PointerAlignment: Left + +# ANGLE likes to align things as much as possible. +AlignOperands: true +AlignConsecutiveAssignments: true + +# Use 2 space negative offset for access modifiers +AccessModifierOffset: -2 + +# TODO(jmadill): Decide if we want this on. Doesn't have an "all or none" mode. +AllowShortCaseLabelsOnASingleLine: false + +# Useful for spacing out functions in classes +KeepEmptyLinesAtTheStartOfBlocks: true + +# Indent nested PP directives. 
+IndentPPDirectives: AfterHash diff --git a/xsxml/no-recursive/xsxml.hpp b/no-recursive/xsxml.hpp similarity index 96% rename from xsxml/no-recursive/xsxml.hpp rename to no-recursive/xsxml.hpp index ed65527..eff884a 100644 --- a/xsxml/no-recursive/xsxml.hpp +++ b/no-recursive/xsxml.hpp @@ -1,1677 +1,1677 @@ -////////////////////////////////////////////////////////////////////////////////////////// -// The embedded xml SAX parser, extract from pugixml DOM parser -// please see: https://github.com/zeux/pugixml -////////////////////////////////////////////////////////////////////////////////////////// -/* -The MIT License (MIT) -Copyright (c) 2019 halx99 -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ -#ifndef SIMDSOFT__XSXML_HPP -#define SIMDSOFT__XSXML_HPP -#pragma once -#include -#include -#include - -#define XSXML__DECL inline - -namespace xsxml -{ - -typedef char char_t; - -// Parsing status, returned as part of xml_parse_result object -enum xml_parse_status -{ - status_ok = 0, // No error - - status_file_not_found, // File was not found during load_file() - status_io_error, // Error reading from file/stream - status_out_of_memory, // Could not allocate memory - status_internal_error, // Internal error occurred - - status_unrecognized_tag, // Parser could not determine tag type - - status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction - status_bad_comment, // Parsing error occurred while parsing comment - status_bad_cdata, // Parsing error occurred while parsing CDATA section - status_bad_doctype, // Parsing error occurred while parsing document type declaration - status_bad_pcdata, // Parsing error occurred while parsing PCDATA section - status_bad_start_element, // Parsing error occurred while parsing start element tag - status_bad_attribute, // Parsing error occurred while parsing element attribute - status_bad_end_element, // Parsing error occurred while parsing end element tag - status_end_element_mismatch, // There was a mismatch of start-end tags (closing tag had incorrect - // name, some tag was not closed or there was an excessive closing - // tag) - - status_append_invalid_root, // Unable to append nodes since root type is not node_element or - // node_document (exclusive to xml_node::append_buffer) - - status_no_document_element // Parsing resulted in a document without element nodes -}; - -// Parsing options - -// Minimal parsing mode (equivalent to turning all other flags off). -// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed. 
-const unsigned int parse_minimal = 0x0000; - -// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is -// off by default. -const unsigned int parse_pi = 0x0001; - -// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by -// default. -const unsigned int parse_comments = 0x0002; - -// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by -// default. -const unsigned int parse_cdata = 0x0004; - -// This flag determines if plain character data (node_pcdata) that consist only of whitespace are -// added to the DOM tree. This flag is off by default; turning it on usually results in slower -// parsing and more memory consumption. -const unsigned int parse_ws_pcdata = 0x0008; - -// This flag determines if character and entity references are expanded during parsing. This flag is -// on by default. -const unsigned int parse_escapes = 0x0010; - -// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This -// flag is on by default. -const unsigned int parse_eol = 0x0020; - -// This flag determines if attribute values are normalized using CDATA normalization rules during -// parsing. This flag is on by default. -const unsigned int parse_wconv_attribute = 0x0040; - -// This flag determines if attribute values are normalized using NMTOKENS normalization rules during -// parsing. This flag is off by default. -const unsigned int parse_wnorm_attribute = 0x0080; - -// This flag determines if document declaration (node_declaration) is added to the DOM tree. This -// flag is off by default. -const unsigned int parse_declaration = 0x0100; - -// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This -// flag is off by default. 
-const unsigned int parse_doctype = 0x0200; - -// This flag determines if plain character data (node_pcdata) that is the only child of the parent -// node and that consists only of whitespace is added to the DOM tree. This flag is off by default; -// turning it on may result in slower parsing and more memory consumption. -const unsigned int parse_ws_pcdata_single = 0x0400; - -// This flag determines if leading and trailing whitespace is to be removed from plain character -// data. This flag is off by default. -const unsigned int parse_trim_pcdata = 0x0800; - -// This flag determines if plain character data that does not have a parent node is added to the DOM -// tree, and if an empty document is a valid document. This flag is off by default. -const unsigned int parse_fragment = 0x1000; - -// This flag determines if plain character data is be stored in the parent element's value. This -// significantly changes the structure of the document; this flag is only recommended for parsing -// documents with many PCDATA nodes in memory-constrained environments. This flag is off by default. -const unsigned int parse_embed_pcdata = 0x2000; - -// The default parsing mode. -// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are -// expanded, End-of-Line characters are normalized, attribute values are normalized using CDATA -// normalization rules. -const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; - -// The full parsing mode. -// Nodes of all types are added to the DOM tree, character/reference entities are expanded, -// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization -// rules. 
-const unsigned int parse_full = - parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; - -// The max parse deep of xml -// Don't define it to large, otherwise, will lead stack overflow -const unsigned int parse_max_deep = 512; - -typedef char_t* (*strconv_attribute_t)(char_t*, char_t); -typedef char_t* (*strconv_pcdata_t)(char_t*); - -enum chartype_t -{ - ct_parse_pcdata = 1, // \0, &, \r, < - ct_parse_attr = 2, // \0, &, \r, ', " - ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab - ct_space = 8, // \r, \n, space, tab - ct_parse_cdata = 16, // \0, ], >, \r - ct_parse_comment = 32, // \0, -, >, \r - ct_symbol = 64, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, . - ct_start_symbol = 128 // Any symbol > 127, a-z, A-Z, _, : -}; - -static const unsigned char chartype_table[256] = { - 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 - 8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47 - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63 - 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79 - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95 - 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111 - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, // 112-127 - - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128+ - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, - 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, - 192, 
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192}; - -enum chartypex_t -{ - ctx_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, > - ctx_special_attr = 2, // Any symbol >= 0 and < 32 (except \t), &, <, >, " - ctx_start_symbol = 4, // Any symbol > 127, a-z, A-Z, _ - ctx_digit = 8, // 0-9 - ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, . -}; - -static const unsigned char chartypex_table[256] = { - 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15 - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31 - 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0, // 32-47 - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0, // 48-63 - - 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 64-79 - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 20, // 80-95 - 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 96-111 - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, // 112-127 - - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 128+ - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; - -// Branch weight controls -#if defined(__GNUC__) -# define XSXML__UNLIKELY(cond) __builtin_expect(cond, 0) -#else -# define XSXML__UNLIKELY(cond) (cond) -#endif - -#define XSXML__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast(c)] & (ct)) - -#define XSXML__IS_CHARTYPE(c, ct) XSXML__IS_CHARTYPE_IMPL(c, ct, chartype_table) -#define XSXML__IS_CHARTYPEX(c, ct) XSXML__IS_CHARTYPE_IMPL(c, ct, chartypex_table) - -// Parser utilities -#define 
XSXML__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e))) -#define XSXML__SKIPWS() \ - { \ - while (XSXML__IS_CHARTYPE(*s, ct_space)) \ - ++s; \ - } -#define XSXML__OPTSET(OPT) (optmsk & (OPT)) -#define XSXML__PUSHNODE(TYPE) \ - { \ - cursor = append_new_node(cursor, alloc, TYPE); \ - if (!cursor) \ - XSXML__THROW_ERROR(status_out_of_memory, s); \ - } -#define XSXML__POPNODE() \ - { \ - cursor = cursor->parent; \ - } -#define XSXML__SCANFOR(X) \ - { \ - while (*s != 0 && !(X)) \ - ++s; \ - } -#define XSXML__SCANWHILE(X) \ - { \ - while (X) \ - ++s; \ - } -#define XSXML__SCANWHILE_UNROLL(X) \ - { \ - for (;;) \ - { \ - char_t ss = s[0]; \ - if (XSXML__UNLIKELY(!(X))) \ - { \ - break; \ - } \ - ss = s[1]; \ - if (XSXML__UNLIKELY(!(X))) \ - { \ - s += 1; \ - break; \ - } \ - ss = s[2]; \ - if (XSXML__UNLIKELY(!(X))) \ - { \ - s += 2; \ - break; \ - } \ - ss = s[3]; \ - if (XSXML__UNLIKELY(!(X))) \ - { \ - s += 3; \ - break; \ - } \ - s += 4; \ - } \ - } -#define XSXML__ENDSEG() \ - { \ - ch = *s; \ - *s = 0; \ - ++s; \ - } -#define XSXML__THROW_ERROR(err, m) \ - return error_offset = m, error_status = err, static_cast(0) -#define XSXML__CHECK_ERROR(err, m) \ - { \ - if (*s == 0) \ - XSXML__THROW_ERROR(err, m); \ - } - -// Simple static assertion -#define XSXML__STATIC_ASSERT(cond) \ - { \ - static const char condition_failed[(cond) ? 
1 : -1] = {0}; \ - (void)condition_failed[0]; \ - } - -// Parsing result -struct xml_parse_result -{ - // Parsing status (see xml_parse_status) - xml_parse_status status; - - // Last parsed offset (in char_t units from start of input data) - ptrdiff_t offset; - - // Source document encoding - // xml_encoding encoding; - - // Default constructor, initializes object to failed state - xml_parse_result() : status(status_internal_error), offset(0) {} - - // Cast to bool operator - operator bool() const { return status == status_ok; } - - // Get error description - const char* description() const; -}; - -struct opt_false -{ - enum - { - value = 0 - }; -}; - -struct opt_true -{ - enum - { - value = 1 - }; -}; - -struct gap -{ - char_t* end; - size_t size; - - gap() : end(0), size(0) {} - - // Push new gap, move s count bytes further (skipping the gap). - // Collapse previous gap. - void push(char_t*& s, size_t count) - { - if (end) // there was a gap already; collapse it - { - // Move [old_gap_end, new_gap_start) to [old_gap_start, ...) - assert(s >= end); - memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); - } - - s += count; // end of current gap - - // "merge" two gaps - end = s; - size += count; - } - - // Collapse all gaps, return past-the-end pointer - char_t* flush(char_t* s) - { - if (end) - { - // Move [old_gap_end, current_pos) to [old_gap_start, ...) 
- assert(s >= end); - memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); - - return s - size; - } - else - return s; - } -}; - -struct utf8_writer -{ - typedef uint8_t* value_type; - - static value_type low(value_type result, uint32_t ch) - { - // U+0000..U+007F - if (ch < 0x80) - { - *result = static_cast(ch); - return result + 1; - } - // U+0080..U+07FF - else if (ch < 0x800) - { - result[0] = static_cast(0xC0 | (ch >> 6)); - result[1] = static_cast(0x80 | (ch & 0x3F)); - return result + 2; - } - // U+0800..U+FFFF - else - { - result[0] = static_cast(0xE0 | (ch >> 12)); - result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); - result[2] = static_cast(0x80 | (ch & 0x3F)); - return result + 3; - } - } - - static value_type high(value_type result, uint32_t ch) - { - // U+10000..U+10FFFF - result[0] = static_cast(0xF0 | (ch >> 18)); - result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); - result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); - result[3] = static_cast(0x80 | (ch & 0x3F)); - return result + 4; - } - - static value_type any(value_type result, uint32_t ch) - { - return (ch < 0x10000) ? low(result, ch) : high(result, ch); - } -}; - -XSXML__DECL char_t* strconv_escape(char_t* s, gap& g) -{ - char_t* stre = s + 1; - - switch (*stre) - { - case '#': // &#... - { - unsigned int ucsc = 0; - - if (stre[1] == 'x') // &#x... (hex code) - { - stre += 2; - - char_t ch = *stre; - - if (ch == ';') - return stre; - - for (;;) - { - if (static_cast(ch - '0') <= 9) - ucsc = 16 * ucsc + (ch - '0'); - else if (static_cast((ch | ' ') - 'a') <= 5) - ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10); - else if (ch == ';') - break; - else // cancel - return stre; - - ch = *++stre; - } - - ++stre; - } - else // &#... 
(dec code) - { - char_t ch = *++stre; - - if (ch == ';') - return stre; - - for (;;) - { - if (static_cast(static_cast(ch) - '0') <= 9) - ucsc = 10 * ucsc + (ch - '0'); - else if (ch == ';') - break; - else // cancel - return stre; - - ch = *++stre; - } - - ++stre; - } - - s = reinterpret_cast(utf8_writer::any(reinterpret_cast(s), ucsc)); - - g.push(s, stre - s); - return stre; - } - - case 'a': // &a - { - ++stre; - - if (*stre == 'm') // &am - { - if (*++stre == 'p' && *++stre == ';') // & - { - *s++ = '&'; - ++stre; - - g.push(s, stre - s); - return stre; - } - } - else if (*stre == 'p') // &ap - { - if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // ' - { - *s++ = '\''; - ++stre; - - g.push(s, stre - s); - return stre; - } - } - break; - } - - case 'g': // &g - { - if (*++stre == 't' && *++stre == ';') // > - { - *s++ = '>'; - ++stre; - - g.push(s, stre - s); - return stre; - } - break; - } - - case 'l': // &l - { - if (*++stre == 't' && *++stre == ';') // < - { - *s++ = '<'; - ++stre; - - g.push(s, stre - s); - return stre; - } - break; - } - - case 'q': // &q - { - if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // " - { - *s++ = '"'; - ++stre; - - g.push(s, stre - s); - return stre; - } - break; - } - - default: - break; - } - - return stre; -} - -template struct strconv_pcdata_impl -{ - static char_t* parse(char_t* s) - { - gap g; - - char_t* begin = s; - - while (true) - { - XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_pcdata)); - - if (*s == '<') // PCDATA ends here - { - char_t* end = g.flush(s); - - if (opt_trim::value) - while (end > begin && XSXML__IS_CHARTYPE(end[-1], ct_space)) - --end; - - *end = 0; - - return s + 1; - } - else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair - { - *s++ = '\n'; // replace first one with 0x0a - - if (*s == '\n') - g.push(s, 1); - } - else if (opt_escape::value && *s == '&') - { - s = strconv_escape(s, g); - } - else if (*s == 0) - { - char_t* 
end = g.flush(s); - - if (opt_trim::value) - while (end > begin && XSXML__IS_CHARTYPE(end[-1], ct_space)) - --end; - - *end = 0; - - return s; - } - else - ++s; - } - } -}; - -XSXML__DECL strconv_pcdata_t get_strconv_pcdata(unsigned int optmask) -{ - XSXML__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800); - - switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim) - { - case 0: - return strconv_pcdata_impl::parse; - case 1: - return strconv_pcdata_impl::parse; - case 2: - return strconv_pcdata_impl::parse; - case 3: - return strconv_pcdata_impl::parse; - case 4: - return strconv_pcdata_impl::parse; - case 5: - return strconv_pcdata_impl::parse; - case 6: - return strconv_pcdata_impl::parse; - case 7: - return strconv_pcdata_impl::parse; - default: - assert(false); - return 0; // should not get here - } -} - -template struct strconv_attribute_impl -{ - static char_t* parse_wnorm(char_t* s, char_t end_quote) - { - gap g; - - // trim leading whitespaces - if (XSXML__IS_CHARTYPE(*s, ct_space)) - { - char_t* str = s; - - do - ++str; - while (XSXML__IS_CHARTYPE(*str, ct_space)); - - g.push(s, str - s); - } - - while (true) - { - XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space)); - - if (*s == end_quote) - { - char_t* str = g.flush(s); - - do - *str-- = 0; - while (XSXML__IS_CHARTYPE(*str, ct_space)); - - return s + 1; - } - else if (XSXML__IS_CHARTYPE(*s, ct_space)) - { - *s++ = ' '; - - if (XSXML__IS_CHARTYPE(*s, ct_space)) - { - char_t* str = s + 1; - while (XSXML__IS_CHARTYPE(*str, ct_space)) - ++str; - - g.push(s, str - s); - } - } - else if (opt_escape::value && *s == '&') - { - s = strconv_escape(s, g); - } - else if (!*s) - { - return 0; - } - else - ++s; - } - } - - static char_t* parse_wconv(char_t* s, char_t end_quote) - { - gap g; - - while (true) - { - XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr_ws)); - - if (*s == end_quote) - { - 
*g.flush(s) = 0; - - return s + 1; - } - else if (XSXML__IS_CHARTYPE(*s, ct_space)) - { - if (*s == '\r') - { - *s++ = ' '; - - if (*s == '\n') - g.push(s, 1); - } - else - *s++ = ' '; - } - else if (opt_escape::value && *s == '&') - { - s = strconv_escape(s, g); - } - else if (!*s) - { - return 0; - } - else - ++s; - } - } - - static char_t* parse_eol(char_t* s, char_t end_quote) - { - gap g; - - while (true) - { - XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr)); - - if (*s == end_quote) - { - *g.flush(s) = 0; - - return s + 1; - } - else if (*s == '\r') - { - *s++ = '\n'; - - if (*s == '\n') - g.push(s, 1); - } - else if (opt_escape::value && *s == '&') - { - s = strconv_escape(s, g); - } - else if (!*s) - { - return 0; - } - else - ++s; - } - } - - static char_t* parse_simple(char_t* s, char_t end_quote) - { - gap g; - - while (true) - { - XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr)); - - if (*s == end_quote) - { - *g.flush(s) = 0; - - return s + 1; - } - else if (opt_escape::value && *s == '&') - { - s = strconv_escape(s, g); - } - else if (!*s) - { - return 0; - } - else - ++s; - } - } -}; - -XSXML__DECL xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0) -{ - xml_parse_result result; - result.status = status; - result.offset = offset; - - return result; -} - -XSXML__DECL char_t* strconv_comment(char_t* s, char_t endch) -{ - gap g; - - while (true) - { - XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_comment)); - - if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair - { - *s++ = '\n'; // replace first one with 0x0a - - if (*s == '\n') - g.push(s, 1); - } - else if (s[0] == '-' && s[1] == '-' && XSXML__ENDSWITH(s[2], '>')) // comment ends here - { - *g.flush(s) = 0; - - return s + (s[2] == '>' ? 
3 : 2); - } - else if (*s == 0) - { - return 0; - } - else - ++s; - } -} - -XSXML__DECL char_t* strconv_cdata(char_t* s, char_t endch) -{ - gap g; - - while (true) - { - XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_cdata)); - - if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair - { - *s++ = '\n'; // replace first one with 0x0a - - if (*s == '\n') - g.push(s, 1); - } - else if (s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>')) // CDATA ends here - { - *g.flush(s) = 0; - - return s + 1; - } - else if (*s == 0) - { - return 0; - } - else - ++s; - } -} - -XSXML__DECL strconv_attribute_t get_strconv_attribute(unsigned int optmask) -{ - XSXML__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && - parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80); - - switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes) - { - case 0: - return strconv_attribute_impl::parse_simple; - case 1: - return strconv_attribute_impl::parse_simple; - case 2: - return strconv_attribute_impl::parse_eol; - case 3: - return strconv_attribute_impl::parse_eol; - case 4: - return strconv_attribute_impl::parse_wconv; - case 5: - return strconv_attribute_impl::parse_wconv; - case 6: - return strconv_attribute_impl::parse_wconv; - case 7: - return strconv_attribute_impl::parse_wconv; - case 8: - return strconv_attribute_impl::parse_wnorm; - case 9: - return strconv_attribute_impl::parse_wnorm; - case 10: - return strconv_attribute_impl::parse_wnorm; - case 11: - return strconv_attribute_impl::parse_wnorm; - case 12: - return strconv_attribute_impl::parse_wnorm; - case 13: - return strconv_attribute_impl::parse_wnorm; - case 14: - return strconv_attribute_impl::parse_wnorm; - case 15: - return strconv_attribute_impl::parse_wnorm; - default: - assert(false); - return 0; // should not get here - } -} - -// Skip utf-8 bom -static char_t* parse_skip_bom(char_t* s) -{ - return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? 
s + 3 : s; -} - -// Simple string view -class string_view -{ -public: - string_view() : _Mystr(nullptr), _Mysize(0) {} - string_view(char_t* str, size_t size) : _Mystr(str), _Mysize(size) {} - const char* c_str() const { return _Mystr != nullptr ? _Mystr : ""; } - size_t length() const { return _Mysize; } - bool empty() const { return _Mysize == 0; } - -private: - char_t* _Mystr; - size_t _Mysize; -}; - -// The sax3 parse callbacks -struct xml_sax3_parse_cb -{ - std::function xml_start_element_cb; - std::function xml_attr_cb; - std::function xml_end_attr_cb; - std::function xml_end_element_cb; - std::function xml_text_cb; -}; - -/////////////// xml_sax3_parser /////////// -struct xml_sax3_parser -{ - // xml_allocator alloc; - char_t* error_offset; - xml_parse_status error_status; - - xml_sax3_parse_cb* handler; - - xml_sax3_parser(xml_sax3_parse_cb* handler_) - : handler(handler_), error_offset(0), error_status(status_ok) - {} - - ~xml_sax3_parser() - { - // *alloc_state = alloc; - } - - // DOCTYPE consists of nested sections of the following possible types: - // , , "...", '...' - // - // - // First group can not contain nested groups - // Second group can contain nested groups of the same type - // Third group can contain all other groups - char_t* parse_doctype_primitive(char_t* s) - { - if (*s == '"' || *s == '\'') - { - // quoted string - char_t ch = *s++; - XSXML__SCANFOR(*s == ch); - if (!*s) - XSXML__THROW_ERROR(status_bad_doctype, s); - - s++; - } - else if (s[0] == '<' && s[1] == '?') - { - // - s += 2; - XSXML__SCANFOR(s[0] == '?' && - s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype - if (!*s) - XSXML__THROW_ERROR(status_bad_doctype, s); - - s += 2; - } - else if (s[0] == '<' && s[1] == '!' 
&& s[2] == '-' && s[3] == '-') - { - s += 4; - XSXML__SCANFOR(s[0] == '-' && s[1] == '-' && - s[2] == - '>'); // no need for ENDSWITH because --> can't terminate proper doctype - if (!*s) - XSXML__THROW_ERROR(status_bad_doctype, s); - - s += 3; - } - else - XSXML__THROW_ERROR(status_bad_doctype, s); - - return s; - } - - char_t* parse_doctype_ignore(char_t* s) - { - size_t depth = 0; - - assert(s[0] == '<' && s[1] == '!' && s[2] == '['); - s += 3; - - while (*s) - { - if (s[0] == '<' && s[1] == '!' && s[2] == '[') - { - // nested ignore section - s += 3; - depth++; - } - else if (s[0] == ']' && s[1] == ']' && s[2] == '>') - { - // ignore section end - s += 3; - - if (depth == 0) - return s; - - depth--; - } - else - s++; - } - - XSXML__THROW_ERROR(status_bad_doctype, s); - } - - char_t* parse_doctype_group(char_t* s, char_t endch) - { - size_t depth = 0; - - assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); - s += 2; - - while (*s) - { - if (s[0] == '<' && s[1] == '!' && s[2] != '-') - { - if (s[2] == '[') - { - // ignore - s = parse_doctype_ignore(s); - if (!s) - return s; - } - else - { - // some control group - s += 2; - depth++; - } - } - else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') - { - // unknown tag (forbidden), or some primitive group - s = parse_doctype_primitive(s); - if (!s) - return s; - } - else if (*s == '>') - { - if (depth == 0) - return s; - - depth--; - s++; - } - else - s++; - } - - if (depth != 0 || endch != '>') - XSXML__THROW_ERROR(status_bad_doctype, s); - - return s; - } - - char_t* parse_exclamation(char_t* s, unsigned int optmsk, char_t endch) - { - // parse node contents, starting with exclamation mark - ++s; - - if (*s == '-') // 'value = s; // Save the offset. - value = s; - } - - if (XSXML__OPTSET(parse_eol) && XSXML__OPTSET(parse_comments)) - { - s = strconv_comment(s, endch); - - if (!s) - XSXML__THROW_ERROR(status_bad_comment, value); - } - else - { - // Scan for terminating '-->'. 
- XSXML__SCANFOR(s[0] == '-' && s[1] == '-' && XSXML__ENDSWITH(s[2], '>')); - XSXML__CHECK_ERROR(status_bad_comment, s); - - if (XSXML__OPTSET(parse_comments)) - *s = 0; // Zero-terminate this segment at the first terminating '-'. - - s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'. - } - } - else - XSXML__THROW_ERROR(status_bad_comment, s); - } - else if (*s == '[') - { - // ''. - XSXML__SCANFOR(s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>')); - XSXML__CHECK_ERROR(status_bad_cdata, s); - - *s++ = 0; // Zero-terminate this segment. - } - } - else // Flagged for discard, but we still have to scan for the terminator. - { - // Scan for terminating ']]>'. - XSXML__SCANFOR(s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>')); - XSXML__CHECK_ERROR(status_bad_cdata, s); - - ++s; - } - - s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'. - } - else - XSXML__THROW_ERROR(status_bad_cdata, s); - } - else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && - s[5] == 'P' && XSXML__ENDSWITH(s[6], 'E')) - { - s -= 2; - - // TODO: check for doctype, parent must be nullptr - // if (cursor->parent) XSXML__THROW_ERROR(status_bad_doctype, s); - - char_t* mark = s + 9; - - s = parse_doctype_group(s, endch); - if (!s) - return s; - - assert((*s == 0 && endch == '>') || *s == '>'); - if (*s) - *s++ = 0; - - if (XSXML__OPTSET(parse_doctype)) - { - while (XSXML__IS_CHARTYPE(*mark, ct_space)) - ++mark; - - // SAX3: Ignore doctype - // XSXML__PUSHNODE(node_doctype); - - // cursor->value = mark; - } - } - else if (*s == 0 && endch == '-') - XSXML__THROW_ERROR(status_bad_comment, s); - else if (*s == 0 && endch == '[') - XSXML__THROW_ERROR(status_bad_cdata, s); - else - XSXML__THROW_ERROR(status_unrecognized_tag, s); - - return s; - } - - char_t* parse_question(char_t* s, unsigned int optmsk, char_t endch) - { - // load into registers - // xml_node_struct* cursor = ref_cursor; - char_t ch = 0; - - // parse node contents, starting with question 
mark - ++s; - - // read PI target - char_t* target = s; - - if (!XSXML__IS_CHARTYPE(*s, ct_start_symbol)) - XSXML__THROW_ERROR(status_bad_pi, s); - - XSXML__SCANWHILE(XSXML__IS_CHARTYPE(*s, ct_symbol)); - XSXML__CHECK_ERROR(status_bad_pi, s); - - // determine node type; stricmp / strcasecmp is not portable - bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && - (target[2] | ' ') == 'l' && target + 3 == s; - - if (declaration ? XSXML__OPTSET(parse_declaration) : XSXML__OPTSET(parse_pi)) - { - if (declaration) - { - // TODO: disallow non top-level declarations - // if (cursor->parent) XSXML__THROW_ERROR(status_bad_pi, s); - - // SAX3: Ignore declaration. - // XSXML__PUSHNODE(node_declaration); - } - else - { - // SAX3: Ignore pi. - // XSXML__PUSHNODE(node_pi); - } - - XSXML__ENDSEG(); - - // parse value/attributes - if (ch == '?') - { - // empty node - if (!XSXML__ENDSWITH(*s, '>')) - XSXML__THROW_ERROR(status_bad_pi, s); - s += (*s == '>'); - - // XSXML__POPNODE(); - } - else if (XSXML__IS_CHARTYPE(ch, ct_space)) - { - XSXML__SKIPWS(); - - // scan for tag end - char_t* value = s; - - XSXML__SCANFOR(s[0] == '?' && XSXML__ENDSWITH(s[1], '>')); - XSXML__CHECK_ERROR(status_bad_pi, s); - - if (declaration) - { - // replace ending ? with / so that 'element' terminates properly - *s = '/'; - - // we exit from this function with cursor at node_declaration, which is a signal to - // parse() to go to LOC_ATTRIBUTES - s = value; - } - else - { - // store value and step over > - // cursor->value = value; - - // XSXML__POPNODE(); - - XSXML__ENDSEG(); - - s += (*s == '>'); - } - } - else - XSXML__THROW_ERROR(status_bad_pi, s); - } - else - { - // scan for tag end - XSXML__SCANFOR(s[0] == '?' && XSXML__ENDSWITH(s[1], '>')); - XSXML__CHECK_ERROR(status_bad_pi, s); - - s += (s[1] == '>' ? 
2 : 1); - } - - // store from registers - // ref_cursor = cursor; - - return s; - } - - template struct fixed_stack - { - public: - fixed_stack() : size_(0) {} - - void push(const _T& val) - { - if (size_ < _Capacity) - elements_[size_++] = val; - } - - _T pop() - { - if (size_ > 0) - return elements_[size_-- - 1]; - return _T{}; - } - - private: - _T elements_[_Capacity]; - size_t size_; - }; - - char_t* parse_tree(char_t* s, unsigned int optmsk, char_t endch) - { - strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk); - strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk); - - char_t ch = 0; - char_t* mark = s; - char_t* value = nullptr; - size_t n = 0; - - fixed_stack stk; // 4K on 32bits, 6K on 64bits - - while (*s != 0) - { - if (*s == '<') - { - ++s; - - LOC_TAG: - if (XSXML__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...' - { - // SAX3: TODO: xmlStartElement. - // XSXML__PUSHNODE(node_element); // Append a new node to the tree. - - mark = s; - - XSXML__SCANWHILE_UNROLL(XSXML__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. - - handler->xml_start_element_cb(mark, s - mark); - stk.push(::xsxml::string_view(mark, s - mark)); - - XSXML__ENDSEG(); // Save char in 'ch', terminate & step over. - - if (ch == '>') - { - handler->xml_end_attr_cb(); // end of tag - } - else if (XSXML__IS_CHARTYPE(ch, ct_space)) - { - while (true) - { // parse attributes - XSXML__SKIPWS(); // Eat any whitespace. - - if (XSXML__IS_CHARTYPE(*s, ct_start_symbol)) // <... #... - { - // SAX3: TODO: implement attribute. - // xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for - // this attribute. if (!a) XSXML__THROW_ERROR(status_out_of_memory, s); - - mark = s; // Save the offset. - - XSXML__SCANWHILE_UNROLL( - XSXML__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. - n = s - mark; - XSXML__ENDSEG(); // Save char in 'ch', terminate & step over. 
- - if (XSXML__IS_CHARTYPE(ch, ct_space)) - { - XSXML__SKIPWS(); // Eat any whitespace. - - ch = *s; - ++s; - } - - if (ch == '=') // '<... #=...' - { - XSXML__SKIPWS(); // Eat any whitespace. - - if (*s == '"' || *s == '\'') // '<... #="...' - { - ch = *s; // Save quote char to avoid breaking on "''" -or- '""'. - ++s; // Step over the quote. - value = s; // a->value = s; // Save the offset. - - s = strconv_attribute(s, ch); - - if (!s) - XSXML__THROW_ERROR(status_bad_attribute, value); - - // After this line the loop continues from the start; - // Whitespaces, / and > are ok, symbols and EOF are wrong, - // everything else will be detected - if (XSXML__IS_CHARTYPE(*s, ct_start_symbol)) - XSXML__THROW_ERROR(status_bad_attribute, s); - handler->xml_attr_cb(mark, n, value, s - value - 1); - } - else - XSXML__THROW_ERROR(status_bad_attribute, s); - } - else - XSXML__THROW_ERROR(status_bad_attribute, s); - } - else if (*s == '/') - { - ++s; - if (*s == '>') - { - auto ele_name = stk.pop(); - handler->xml_end_attr_cb(); - handler->xml_end_element_cb(ele_name.c_str(), ele_name.length()); - ++s; - break; - } - else if (*s == 0 && endch == '>') - { - auto ele_name = stk.pop(); - handler->xml_end_attr_cb(); - handler->xml_end_element_cb(ele_name.c_str(), ele_name.length()); - break; - } - else - XSXML__THROW_ERROR(status_bad_start_element, s); - } - else if (*s == '>') - { - ++s; - handler->xml_end_attr_cb(); - break; - } - else if (*s == 0 && endch == '>') - { - break; - } - else - XSXML__THROW_ERROR(status_bad_start_element, s); - } - - // !!! 
- } - else if (ch == '/') // '<#.../' - { - if (!XSXML__ENDSWITH(*s, '>')) - XSXML__THROW_ERROR(status_bad_start_element, s); - - stk.pop(); - handler->xml_end_element_cb(mark, s - mark); - s += (*s == '>'); - } - else if (ch == 0) - { - // we stepped over null terminator, backtrack & handle closing tag - --s; - - if (endch != '>') - XSXML__THROW_ERROR(status_bad_start_element, s); - } - else - XSXML__THROW_ERROR(status_bad_start_element, s); - } - else if (*s == '/') - { - ++s; - - mark = s; - - // SAX3, we don't check end element name - while (XSXML__IS_CHARTYPE(*s, ct_symbol)) - ++s; - - stk.pop(); - handler->xml_end_element_cb(mark, s - mark); - - XSXML__SKIPWS(); - - if (*s == 0) - { - if (endch != '>') - XSXML__THROW_ERROR(status_bad_end_element, s); - } - else - { - if (*s != '>') - XSXML__THROW_ERROR(status_bad_end_element, s); - ++s; - } - } - else if (*s == '?') // 'first_child) continue; - } - } - - if (!XSXML__OPTSET(parse_trim_pcdata)) - s = mark; - - // SAX3: Ignore node_pcdata. - if (/*cursor->parent ||*/ XSXML__OPTSET(parse_fragment)) - { // Currently, SAX3 simplely skip, do not regard text it node - if (XSXML__OPTSET(parse_embed_pcdata) /*&& cursor->parent && !cursor->first_child && !cursor->value*/) - { - // cursor->value = s; // Save the offset. - } - else - { - // XSXML__PUSHNODE(node_pcdata); // Append a new node on the tree. - - // cursor->value = s; // Save the offset. - - // XSXML__POPNODE(); // Pop since this is a standalone. - } - - s = strconv_pcdata(s); - - if (!*s) - break; - } - else - { - XSXML__SCANFOR(*s == '<'); // '...<' - if (!*s) - break; - - handler->xml_text_cb(mark, s - mark); - - ++s; - } - - // We're after '<' - goto LOC_TAG; - } - } - - // SAX3: TODO: check that last tag is closed, - // if (cursor != root) XSXML__THROW_ERROR(status_end_element_mismatch, s); - - return s; - } - - static char_t* parse_skip_bom(char_t* s) - { - return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? 
s + 3 : s; - } - - /*static bool has_element_node_siblings(xml_node_struct* node) - { - while (node) - { - if (XSXML__NODETYPE(node) == node_element) return true; - - node = node->next_sibling; - } - - return false; - }*/ - - static xml_parse_result parse(char_t* buffer, size_t length, xml_sax3_parse_cb* handler, - unsigned int optmsk = parse_default) - { - // early-out for empty documents - if (length == 0) - return make_parse_result(XSXML__OPTSET(parse_fragment) ? status_ok - : status_no_document_element); - - // get last child of the root before parsing - // xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c + 0 - // : 0; - - // create parser on stack - xml_sax3_parser parser(handler); - - // save last character and make buffer zero-terminated (speeds up parsing) - char_t endch = buffer[length - 1]; - buffer[length - 1] = 0; - - // skip BOM to make sure it does not end up as part of parse output - char_t* buffer_data = parse_skip_bom(buffer); - - // perform actual parsing - parser.parse_tree(buffer_data, optmsk, endch); - - xml_parse_result result = make_parse_result( - parser.error_status, parser.error_offset ? 
parser.error_offset - buffer : 0); - assert(result.offset >= 0 && static_cast(result.offset) <= length); - - if (result) - { - // since we removed last character, we have to handle the only possible false positive (stray - // <) - if (endch == '<') - return make_parse_result(status_unrecognized_tag, length - 1); - } - else - { - // roll back offset if it occurs on a null terminator in the source buffer - if (result.offset > 0 && static_cast(result.offset) == length - 1 && endch == 0) - result.offset--; - } - - return result; - } -}; /* xml_sax3_parser */ -}; // namespace xsxml - -#endif +////////////////////////////////////////////////////////////////////////////////////////// +// The embedded xml SAX parser, extract from pugixml DOM parser +// please see: https://github.com/zeux/pugixml +////////////////////////////////////////////////////////////////////////////////////////// +/* +The MIT License (MIT) +Copyright (c) 2019 halx99 +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef SIMDSOFT__XSXML_HPP +#define SIMDSOFT__XSXML_HPP +#pragma once +#include +#include +#include + +#define XSXML__DECL inline + +namespace xsxml +{ + +typedef char char_t; + +// Parsing status, returned as part of xml_parse_result object +enum xml_parse_status +{ + status_ok = 0, // No error + + status_file_not_found, // File was not found during load_file() + status_io_error, // Error reading from file/stream + status_out_of_memory, // Could not allocate memory + status_internal_error, // Internal error occurred + + status_unrecognized_tag, // Parser could not determine tag type + + status_bad_pi, // Parsing error occurred while parsing document declaration/processing instruction + status_bad_comment, // Parsing error occurred while parsing comment + status_bad_cdata, // Parsing error occurred while parsing CDATA section + status_bad_doctype, // Parsing error occurred while parsing document type declaration + status_bad_pcdata, // Parsing error occurred while parsing PCDATA section + status_bad_start_element, // Parsing error occurred while parsing start element tag + status_bad_attribute, // Parsing error occurred while parsing element attribute + status_bad_end_element, // Parsing error occurred while parsing end element tag + status_end_element_mismatch, // There was a mismatch of start-end tags (closing tag had incorrect + // name, some tag was not closed or there was an excessive closing + // tag) + + status_append_invalid_root, // Unable to append nodes since root type is not node_element or + // node_document (exclusive to xml_node::append_buffer) + + status_no_document_element // Parsing resulted in a document without element nodes +}; + +// Parsing options + +// Minimal parsing mode (equivalent to turning all other flags off). +// Only elements and PCDATA sections are added to the DOM tree, no text conversions are performed. 
+const unsigned int parse_minimal = 0x0000; + +// This flag determines if processing instructions (node_pi) are added to the DOM tree. This flag is +// off by default. +const unsigned int parse_pi = 0x0001; + +// This flag determines if comments (node_comment) are added to the DOM tree. This flag is off by +// default. +const unsigned int parse_comments = 0x0002; + +// This flag determines if CDATA sections (node_cdata) are added to the DOM tree. This flag is on by +// default. +const unsigned int parse_cdata = 0x0004; + +// This flag determines if plain character data (node_pcdata) that consist only of whitespace are +// added to the DOM tree. This flag is off by default; turning it on usually results in slower +// parsing and more memory consumption. +const unsigned int parse_ws_pcdata = 0x0008; + +// This flag determines if character and entity references are expanded during parsing. This flag is +// on by default. +const unsigned int parse_escapes = 0x0010; + +// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This +// flag is on by default. +const unsigned int parse_eol = 0x0020; + +// This flag determines if attribute values are normalized using CDATA normalization rules during +// parsing. This flag is on by default. +const unsigned int parse_wconv_attribute = 0x0040; + +// This flag determines if attribute values are normalized using NMTOKENS normalization rules during +// parsing. This flag is off by default. +const unsigned int parse_wnorm_attribute = 0x0080; + +// This flag determines if document declaration (node_declaration) is added to the DOM tree. This +// flag is off by default. +const unsigned int parse_declaration = 0x0100; + +// This flag determines if document type declaration (node_doctype) is added to the DOM tree. This +// flag is off by default. 
+const unsigned int parse_doctype = 0x0200; + +// This flag determines if plain character data (node_pcdata) that is the only child of the parent +// node and that consists only of whitespace is added to the DOM tree. This flag is off by default; +// turning it on may result in slower parsing and more memory consumption. +const unsigned int parse_ws_pcdata_single = 0x0400; + +// This flag determines if leading and trailing whitespace is to be removed from plain character +// data. This flag is off by default. +const unsigned int parse_trim_pcdata = 0x0800; + +// This flag determines if plain character data that does not have a parent node is added to the DOM +// tree, and if an empty document is a valid document. This flag is off by default. +const unsigned int parse_fragment = 0x1000; + +// This flag determines if plain character data is be stored in the parent element's value. This +// significantly changes the structure of the document; this flag is only recommended for parsing +// documents with many PCDATA nodes in memory-constrained environments. This flag is off by default. +const unsigned int parse_embed_pcdata = 0x2000; + +// The default parsing mode. +// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are +// expanded, End-of-Line characters are normalized, attribute values are normalized using CDATA +// normalization rules. +const unsigned int parse_default = parse_cdata | parse_escapes | parse_wconv_attribute | parse_eol; + +// The full parsing mode. +// Nodes of all types are added to the DOM tree, character/reference entities are expanded, +// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization +// rules. 
+const unsigned int parse_full = + parse_default | parse_pi | parse_comments | parse_declaration | parse_doctype; + +// The max parse deep of xml +// Don't define it to large, otherwise, will lead stack overflow +const unsigned int parse_max_deep = 512; + +typedef char_t* (*strconv_attribute_t)(char_t*, char_t); +typedef char_t* (*strconv_pcdata_t)(char_t*); + +enum chartype_t +{ + ct_parse_pcdata = 1, // \0, &, \r, < + ct_parse_attr = 2, // \0, &, \r, ', " + ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab + ct_space = 8, // \r, \n, space, tab + ct_parse_cdata = 16, // \0, ], >, \r + ct_parse_comment = 32, // \0, -, >, \r + ct_symbol = 64, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, . + ct_start_symbol = 128 // Any symbol > 127, a-z, A-Z, _, : +}; + +static const unsigned char chartype_table[256] = { + 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 + 8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47 + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63 + 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95 + 0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 96-111 + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0, // 112-127 + + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 128+ + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, + 192, 
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192}; + +enum chartypex_t +{ + ctx_special_pcdata = 1, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, > + ctx_special_attr = 2, // Any symbol >= 0 and < 32 (except \t), &, <, >, " + ctx_start_symbol = 4, // Any symbol > 127, a-z, A-Z, _ + ctx_digit = 8, // 0-9 + ctx_symbol = 16 // Any symbol > 127, a-z, A-Z, 0-9, _, -, . +}; + +static const unsigned char chartypex_table[256] = { + 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3, // 0-15 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 16-31 + 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0, // 32-47 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0, // 48-63 + + 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 64-79 + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 20, // 80-95 + 0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 96-111 + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, // 112-127 + + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, // 128+ + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; + +// Branch weight controls +#if defined(__GNUC__) +# define XSXML__UNLIKELY(cond) __builtin_expect(cond, 0) +#else +# define XSXML__UNLIKELY(cond) (cond) +#endif + +#define XSXML__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast(c)] & (ct)) + +#define XSXML__IS_CHARTYPE(c, ct) XSXML__IS_CHARTYPE_IMPL(c, ct, chartype_table) +#define XSXML__IS_CHARTYPEX(c, ct) XSXML__IS_CHARTYPE_IMPL(c, ct, chartypex_table) + +// Parser utilities +#define 
XSXML__ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e))) +#define XSXML__SKIPWS() \ + { \ + while (XSXML__IS_CHARTYPE(*s, ct_space)) \ + ++s; \ + } +#define XSXML__OPTSET(OPT) (optmsk & (OPT)) +#define XSXML__PUSHNODE(TYPE) \ + { \ + cursor = append_new_node(cursor, alloc, TYPE); \ + if (!cursor) \ + XSXML__THROW_ERROR(status_out_of_memory, s); \ + } +#define XSXML__POPNODE() \ + { \ + cursor = cursor->parent; \ + } +#define XSXML__SCANFOR(X) \ + { \ + while (*s != 0 && !(X)) \ + ++s; \ + } +#define XSXML__SCANWHILE(X) \ + { \ + while (X) \ + ++s; \ + } +#define XSXML__SCANWHILE_UNROLL(X) \ + { \ + for (;;) \ + { \ + char_t ss = s[0]; \ + if (XSXML__UNLIKELY(!(X))) \ + { \ + break; \ + } \ + ss = s[1]; \ + if (XSXML__UNLIKELY(!(X))) \ + { \ + s += 1; \ + break; \ + } \ + ss = s[2]; \ + if (XSXML__UNLIKELY(!(X))) \ + { \ + s += 2; \ + break; \ + } \ + ss = s[3]; \ + if (XSXML__UNLIKELY(!(X))) \ + { \ + s += 3; \ + break; \ + } \ + s += 4; \ + } \ + } +#define XSXML__ENDSEG() \ + { \ + ch = *s; \ + *s = 0; \ + ++s; \ + } +#define XSXML__THROW_ERROR(err, m) \ + return error_offset = m, error_status = err, static_cast(0) +#define XSXML__CHECK_ERROR(err, m) \ + { \ + if (*s == 0) \ + XSXML__THROW_ERROR(err, m); \ + } + +// Simple static assertion +#define XSXML__STATIC_ASSERT(cond) \ + { \ + static const char condition_failed[(cond) ? 
1 : -1] = {0}; \ + (void)condition_failed[0]; \ + } + +// Parsing result +struct xml_parse_result +{ + // Parsing status (see xml_parse_status) + xml_parse_status status; + + // Last parsed offset (in char_t units from start of input data) + ptrdiff_t offset; + + // Source document encoding + // xml_encoding encoding; + + // Default constructor, initializes object to failed state + xml_parse_result() : status(status_internal_error), offset(0) {} + + // Cast to bool operator + operator bool() const { return status == status_ok; } + + // Get error description + const char* description() const; +}; + +struct opt_false +{ + enum + { + value = 0 + }; +}; + +struct opt_true +{ + enum + { + value = 1 + }; +}; + +struct gap +{ + char_t* end; + size_t size; + + gap() : end(0), size(0) {} + + // Push new gap, move s count bytes further (skipping the gap). + // Collapse previous gap. + void push(char_t*& s, size_t count) + { + if (end) // there was a gap already; collapse it + { + // Move [old_gap_end, new_gap_start) to [old_gap_start, ...) + assert(s >= end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); + } + + s += count; // end of current gap + + // "merge" two gaps + end = s; + size += count; + } + + // Collapse all gaps, return past-the-end pointer + char_t* flush(char_t* s) + { + if (end) + { + // Move [old_gap_end, current_pos) to [old_gap_start, ...) 
+ assert(s >= end); + memmove(end - size, end, reinterpret_cast(s) - reinterpret_cast(end)); + + return s - size; + } + else + return s; + } +}; + +template struct strconv_pcdata_impl +{ + static char_t* parse(char_t* s) + { + gap g; + + char_t* begin = s; + + while (true) + { + XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_pcdata)); + + if (*s == '<') // PCDATA ends here + { + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && XSXML__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + + return s + 1; + } + else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') + g.push(s, 1); + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (*s == 0) + { + char_t* end = g.flush(s); + + if (opt_trim::value) + while (end > begin && XSXML__IS_CHARTYPE(end[-1], ct_space)) + --end; + + *end = 0; + + return s; + } + else + ++s; + } + } +}; + +XSXML__DECL strconv_pcdata_t get_strconv_pcdata(unsigned int optmask) +{ + XSXML__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800); + + switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim) + { + case 0: + return strconv_pcdata_impl::parse; + case 1: + return strconv_pcdata_impl::parse; + case 2: + return strconv_pcdata_impl::parse; + case 3: + return strconv_pcdata_impl::parse; + case 4: + return strconv_pcdata_impl::parse; + case 5: + return strconv_pcdata_impl::parse; + case 6: + return strconv_pcdata_impl::parse; + case 7: + return strconv_pcdata_impl::parse; + default: + assert(false); + return 0; // should not get here + } +} + +struct utf8_writer +{ + typedef uint8_t* value_type; + + static value_type low(value_type result, uint32_t ch) + { + // U+0000..U+007F + if (ch < 0x80) + { + *result = static_cast(ch); + return result + 1; + } + // U+0080..U+07FF + else if (ch < 0x800) + { + 
result[0] = static_cast(0xC0 | (ch >> 6)); + result[1] = static_cast(0x80 | (ch & 0x3F)); + return result + 2; + } + // U+0800..U+FFFF + else + { + result[0] = static_cast(0xE0 | (ch >> 12)); + result[1] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (ch & 0x3F)); + return result + 3; + } + } + + static value_type high(value_type result, uint32_t ch) + { + // U+10000..U+10FFFF + result[0] = static_cast(0xF0 | (ch >> 18)); + result[1] = static_cast(0x80 | ((ch >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((ch >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (ch & 0x3F)); + return result + 4; + } + + static value_type any(value_type result, uint32_t ch) + { + return (ch < 0x10000) ? low(result, ch) : high(result, ch); + } +}; + +XSXML__DECL char_t* strconv_escape(char_t* s, gap& g) +{ + char_t* stre = s + 1; + + switch (*stre) + { + case '#': // &#... + { + unsigned int ucsc = 0; + + if (stre[1] == 'x') // &#x... (hex code) + { + stre += 2; + + char_t ch = *stre; + + if (ch == ';') + return stre; + + for (;;) + { + if (static_cast(ch - '0') <= 9) + ucsc = 16 * ucsc + (ch - '0'); + else if (static_cast((ch | ' ') - 'a') <= 5) + ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + + ++stre; + } + else // &#... 
(dec code) + { + char_t ch = *++stre; + + if (ch == ';') + return stre; + + for (;;) + { + if (static_cast(static_cast(ch) - '0') <= 9) + ucsc = 10 * ucsc + (ch - '0'); + else if (ch == ';') + break; + else // cancel + return stre; + + ch = *++stre; + } + + ++stre; + } + + s = reinterpret_cast(utf8_writer::any(reinterpret_cast(s), ucsc)); + + g.push(s, stre - s); + return stre; + } + + case 'a': // &a + { + ++stre; + + if (*stre == 'm') // &am + { + if (*++stre == 'p' && *++stre == ';') // & + { + *s++ = '&'; + ++stre; + + g.push(s, stre - s); + return stre; + } + } + else if (*stre == 'p') // &ap + { + if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // ' + { + *s++ = '\''; + ++stre; + + g.push(s, stre - s); + return stre; + } + } + break; + } + + case 'g': // &g + { + if (*++stre == 't' && *++stre == ';') // > + { + *s++ = '>'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + case 'l': // &l + { + if (*++stre == 't' && *++stre == ';') // < + { + *s++ = '<'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + case 'q': // &q + { + if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // " + { + *s++ = '"'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + + default: + break; + } + + return stre; +} + +template struct strconv_attribute_impl +{ + static char_t* parse_wnorm(char_t* s, char_t end_quote) + { + gap g; + + // trim leading whitespaces + if (XSXML__IS_CHARTYPE(*s, ct_space)) + { + char_t* str = s; + + do + ++str; + while (XSXML__IS_CHARTYPE(*str, ct_space)); + + g.push(s, str - s); + } + + while (true) + { + XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr_ws | ct_space)); + + if (*s == end_quote) + { + char_t* str = g.flush(s); + + do + *str-- = 0; + while (XSXML__IS_CHARTYPE(*str, ct_space)); + + return s + 1; + } + else if (XSXML__IS_CHARTYPE(*s, ct_space)) + { + *s++ = ' '; + + if (XSXML__IS_CHARTYPE(*s, ct_space)) + { + char_t* str = s + 1; + while 
(XSXML__IS_CHARTYPE(*str, ct_space)) + ++str; + + g.push(s, str - s); + } + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else + ++s; + } + } + + static char_t* parse_wconv(char_t* s, char_t end_quote) + { + gap g; + + while (true) + { + XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr_ws)); + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (XSXML__IS_CHARTYPE(*s, ct_space)) + { + if (*s == '\r') + { + *s++ = ' '; + + if (*s == '\n') + g.push(s, 1); + } + else + *s++ = ' '; + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else + ++s; + } + } + + static char_t* parse_eol(char_t* s, char_t end_quote) + { + gap g; + + while (true) + { + XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr)); + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (*s == '\r') + { + *s++ = '\n'; + + if (*s == '\n') + g.push(s, 1); + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else + ++s; + } + } + + static char_t* parse_simple(char_t* s, char_t end_quote) + { + gap g; + + while (true) + { + XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_attr)); + + if (*s == end_quote) + { + *g.flush(s) = 0; + + return s + 1; + } + else if (opt_escape::value && *s == '&') + { + s = strconv_escape(s, g); + } + else if (!*s) + { + return 0; + } + else + ++s; + } + } +}; + +XSXML__DECL xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0) +{ + xml_parse_result result; + result.status = status; + result.offset = offset; + + return result; +} + +XSXML__DECL char_t* strconv_comment(char_t* s, char_t endch) +{ + gap g; + + while (true) + { + XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_comment)); + + if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = 
'\n'; // replace first one with 0x0a + + if (*s == '\n') + g.push(s, 1); + } + else if (s[0] == '-' && s[1] == '-' && XSXML__ENDSWITH(s[2], '>')) // comment ends here + { + *g.flush(s) = 0; + + return s + (s[2] == '>' ? 3 : 2); + } + else if (*s == 0) + { + return 0; + } + else + ++s; + } +} + +XSXML__DECL char_t* strconv_cdata(char_t* s, char_t endch) +{ + gap g; + + while (true) + { + XSXML__SCANWHILE_UNROLL(!XSXML__IS_CHARTYPE(ss, ct_parse_cdata)); + + if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') + g.push(s, 1); + } + else if (s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>')) // CDATA ends here + { + *g.flush(s) = 0; + + return s + 1; + } + else if (*s == 0) + { + return 0; + } + else + ++s; + } +} + +XSXML__DECL strconv_attribute_t get_strconv_attribute(unsigned int optmask) +{ + XSXML__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && + parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80); + + switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes) + { + case 0: + return strconv_attribute_impl::parse_simple; + case 1: + return strconv_attribute_impl::parse_simple; + case 2: + return strconv_attribute_impl::parse_eol; + case 3: + return strconv_attribute_impl::parse_eol; + case 4: + return strconv_attribute_impl::parse_wconv; + case 5: + return strconv_attribute_impl::parse_wconv; + case 6: + return strconv_attribute_impl::parse_wconv; + case 7: + return strconv_attribute_impl::parse_wconv; + case 8: + return strconv_attribute_impl::parse_wnorm; + case 9: + return strconv_attribute_impl::parse_wnorm; + case 10: + return strconv_attribute_impl::parse_wnorm; + case 11: + return strconv_attribute_impl::parse_wnorm; + case 12: + return strconv_attribute_impl::parse_wnorm; + case 13: + return strconv_attribute_impl::parse_wnorm; + case 14: + return strconv_attribute_impl::parse_wnorm; + case 15: + return 
strconv_attribute_impl::parse_wnorm; + default: + assert(false); + return 0; // should not get here + } +} + +// Skip utf-8 bom +static char_t* parse_skip_bom(char_t* s) +{ + return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s; +} + +// Simple string view +class string_view +{ +public: + string_view() : _Mystr(nullptr), _Mysize(0) {} + string_view(char_t* str, size_t size) : _Mystr(str), _Mysize(size) {} + const char* c_str() const { return _Mystr != nullptr ? _Mystr : ""; } + size_t length() const { return _Mysize; } + bool empty() const { return _Mysize == 0; } + +private: + char_t* _Mystr; + size_t _Mysize; +}; + +// The sax3 parse callbacks +struct xml_sax3_parse_cb +{ + std::function xml_start_element_cb; + std::function xml_attr_cb; + std::function xml_end_attr_cb; + std::function xml_end_element_cb; + std::function xml_text_cb; +}; + +/////////////// xml_sax3_parser /////////// +struct xml_sax3_parser +{ + // xml_allocator alloc; + char_t* error_offset; + xml_parse_status error_status; + + xml_sax3_parse_cb* handler; + + xml_sax3_parser(xml_sax3_parse_cb* handler_) + : handler(handler_), error_offset(0), error_status(status_ok) + {} + + ~xml_sax3_parser() + { + // *alloc_state = alloc; + } + + // DOCTYPE consists of nested sections of the following possible types: + // , , "...", '...' + // + // + // First group can not contain nested groups + // Second group can contain nested groups of the same type + // Third group can contain all other groups + char_t* parse_doctype_primitive(char_t* s) + { + if (*s == '"' || *s == '\'') + { + // quoted string + char_t ch = *s++; + XSXML__SCANFOR(*s == ch); + if (!*s) + XSXML__THROW_ERROR(status_bad_doctype, s); + + s++; + } + else if (s[0] == '<' && s[1] == '?') + { + // + s += 2; + XSXML__SCANFOR(s[0] == '?' && + s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype + if (!*s) + XSXML__THROW_ERROR(status_bad_doctype, s); + + s += 2; + } + else if (s[0] == '<' && s[1] == '!' 
&& s[2] == '-' && s[3] == '-') + { + s += 4; + XSXML__SCANFOR(s[0] == '-' && s[1] == '-' && + s[2] == + '>'); // no need for ENDSWITH because --> can't terminate proper doctype + if (!*s) + XSXML__THROW_ERROR(status_bad_doctype, s); + + s += 3; + } + else + XSXML__THROW_ERROR(status_bad_doctype, s); + + return s; + } + + char_t* parse_doctype_ignore(char_t* s) + { + size_t depth = 0; + + assert(s[0] == '<' && s[1] == '!' && s[2] == '['); + s += 3; + + while (*s) + { + if (s[0] == '<' && s[1] == '!' && s[2] == '[') + { + // nested ignore section + s += 3; + depth++; + } + else if (s[0] == ']' && s[1] == ']' && s[2] == '>') + { + // ignore section end + s += 3; + + if (depth == 0) + return s; + + depth--; + } + else + s++; + } + + XSXML__THROW_ERROR(status_bad_doctype, s); + } + + char_t* parse_doctype_group(char_t* s, char_t endch) + { + size_t depth = 0; + + assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); + s += 2; + + while (*s) + { + if (s[0] == '<' && s[1] == '!' && s[2] != '-') + { + if (s[2] == '[') + { + // ignore + s = parse_doctype_ignore(s); + if (!s) + return s; + } + else + { + // some control group + s += 2; + depth++; + } + } + else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') + { + // unknown tag (forbidden), or some primitive group + s = parse_doctype_primitive(s); + if (!s) + return s; + } + else if (*s == '>') + { + if (depth == 0) + return s; + + depth--; + s++; + } + else + s++; + } + + if (depth != 0 || endch != '>') + XSXML__THROW_ERROR(status_bad_doctype, s); + + return s; + } + + char_t* parse_exclamation(char_t* s, unsigned int optmsk, char_t endch) + { + // parse node contents, starting with exclamation mark + ++s; + + if (*s == '-') // 'value = s; // Save the offset. + value = s; + } + + if (XSXML__OPTSET(parse_eol) && XSXML__OPTSET(parse_comments)) + { + s = strconv_comment(s, endch); + + if (!s) + XSXML__THROW_ERROR(status_bad_comment, value); + } + else + { + // Scan for terminating '-->'. 
+ XSXML__SCANFOR(s[0] == '-' && s[1] == '-' && XSXML__ENDSWITH(s[2], '>')); + XSXML__CHECK_ERROR(status_bad_comment, s); + + if (XSXML__OPTSET(parse_comments)) + *s = 0; // Zero-terminate this segment at the first terminating '-'. + + s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'. + } + } + else + XSXML__THROW_ERROR(status_bad_comment, s); + } + else if (*s == '[') + { + // ''. + XSXML__SCANFOR(s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>')); + XSXML__CHECK_ERROR(status_bad_cdata, s); + + *s++ = 0; // Zero-terminate this segment. + } + } + else // Flagged for discard, but we still have to scan for the terminator. + { + // Scan for terminating ']]>'. + XSXML__SCANFOR(s[0] == ']' && s[1] == ']' && XSXML__ENDSWITH(s[2], '>')); + XSXML__CHECK_ERROR(status_bad_cdata, s); + + ++s; + } + + s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'. + } + else + XSXML__THROW_ERROR(status_bad_cdata, s); + } + else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && + s[5] == 'P' && XSXML__ENDSWITH(s[6], 'E')) + { + s -= 2; + + // TODO: check for doctype, parent must be nullptr + // if (cursor->parent) XSXML__THROW_ERROR(status_bad_doctype, s); + + char_t* mark = s + 9; + + s = parse_doctype_group(s, endch); + if (!s) + return s; + + assert((*s == 0 && endch == '>') || *s == '>'); + if (*s) + *s++ = 0; + + if (XSXML__OPTSET(parse_doctype)) + { + while (XSXML__IS_CHARTYPE(*mark, ct_space)) + ++mark; + + // SAX3: Ignore doctype + // XSXML__PUSHNODE(node_doctype); + + // cursor->value = mark; + } + } + else if (*s == 0 && endch == '-') + XSXML__THROW_ERROR(status_bad_comment, s); + else if (*s == 0 && endch == '[') + XSXML__THROW_ERROR(status_bad_cdata, s); + else + XSXML__THROW_ERROR(status_unrecognized_tag, s); + + return s; + } + + char_t* parse_question(char_t* s, unsigned int optmsk, char_t endch) + { + // load into registers + // xml_node_struct* cursor = ref_cursor; + char_t ch = 0; + + // parse node contents, starting with question 
mark + ++s; + + // read PI target + char_t* target = s; + + if (!XSXML__IS_CHARTYPE(*s, ct_start_symbol)) + XSXML__THROW_ERROR(status_bad_pi, s); + + XSXML__SCANWHILE(XSXML__IS_CHARTYPE(*s, ct_symbol)); + XSXML__CHECK_ERROR(status_bad_pi, s); + + // determine node type; stricmp / strcasecmp is not portable + bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && + (target[2] | ' ') == 'l' && target + 3 == s; + + if (declaration ? XSXML__OPTSET(parse_declaration) : XSXML__OPTSET(parse_pi)) + { + if (declaration) + { + // TODO: disallow non top-level declarations + // if (cursor->parent) XSXML__THROW_ERROR(status_bad_pi, s); + + // SAX3: Ignore declaration. + // XSXML__PUSHNODE(node_declaration); + } + else + { + // SAX3: Ignore pi. + // XSXML__PUSHNODE(node_pi); + } + + XSXML__ENDSEG(); + + // parse value/attributes + if (ch == '?') + { + // empty node + if (!XSXML__ENDSWITH(*s, '>')) + XSXML__THROW_ERROR(status_bad_pi, s); + s += (*s == '>'); + + // XSXML__POPNODE(); + } + else if (XSXML__IS_CHARTYPE(ch, ct_space)) + { + XSXML__SKIPWS(); + + // scan for tag end + char_t* value = s; + + XSXML__SCANFOR(s[0] == '?' && XSXML__ENDSWITH(s[1], '>')); + XSXML__CHECK_ERROR(status_bad_pi, s); + + if (declaration) + { + // replace ending ? with / so that 'element' terminates properly + *s = '/'; + + // we exit from this function with cursor at node_declaration, which is a signal to + // parse() to go to LOC_ATTRIBUTES + s = value; + } + else + { + // store value and step over > + // cursor->value = value; + + // XSXML__POPNODE(); + + XSXML__ENDSEG(); + + s += (*s == '>'); + } + } + else + XSXML__THROW_ERROR(status_bad_pi, s); + } + else + { + // scan for tag end + XSXML__SCANFOR(s[0] == '?' && XSXML__ENDSWITH(s[1], '>')); + XSXML__CHECK_ERROR(status_bad_pi, s); + + s += (s[1] == '>' ? 
2 : 1); + } + + // store from registers + // ref_cursor = cursor; + + return s; + } + + template struct fixed_stack + { + public: + fixed_stack() : size_(0) {} + + void push(const _T& val) + { + if (size_ < _Capacity) + elements_[size_++] = val; + } + + _T pop() + { + if (size_ > 0) + return elements_[size_-- - 1]; + return _T{}; + } + + private: + _T elements_[_Capacity]; + size_t size_; + }; + + char_t* parse_tree(char_t* s, unsigned int optmsk, char_t endch) + { + strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk); + strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk); + + char_t ch = 0; + char_t* mark = s; + char_t* value = nullptr; + size_t n = 0; + + fixed_stack stk; // 4K on 32bits, 6K on 64bits + + while (*s != 0) + { + if (*s == '<') + { + ++s; + + LOC_TAG: + if (XSXML__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...' + { + // SAX3: TODO: xmlStartElement. + // XSXML__PUSHNODE(node_element); // Append a new node to the tree. + + mark = s; + + XSXML__SCANWHILE_UNROLL(XSXML__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. + + handler->xml_start_element_cb(mark, s - mark); + stk.push(::xsxml::string_view(mark, s - mark)); + + XSXML__ENDSEG(); // Save char in 'ch', terminate & step over. + + if (ch == '>') + { + handler->xml_end_attr_cb(); // end of tag + } + else if (XSXML__IS_CHARTYPE(ch, ct_space)) + { + while (true) + { // parse attributes + XSXML__SKIPWS(); // Eat any whitespace. + + if (XSXML__IS_CHARTYPE(*s, ct_start_symbol)) // <... #... + { + // SAX3: TODO: implement attribute. + // xml_attribute_struct* a = append_new_attribute(cursor, alloc); // Make space for + // this attribute. if (!a) XSXML__THROW_ERROR(status_out_of_memory, s); + + mark = s; // Save the offset. + + XSXML__SCANWHILE_UNROLL( + XSXML__IS_CHARTYPE(ss, ct_symbol)); // Scan for a terminator. + n = s - mark; + XSXML__ENDSEG(); // Save char in 'ch', terminate & step over. 
+ + if (XSXML__IS_CHARTYPE(ch, ct_space)) + { + XSXML__SKIPWS(); // Eat any whitespace. + + ch = *s; + ++s; + } + + if (ch == '=') // '<... #=...' + { + XSXML__SKIPWS(); // Eat any whitespace. + + if (*s == '"' || *s == '\'') // '<... #="...' + { + ch = *s; // Save quote char to avoid breaking on "''" -or- '""'. + ++s; // Step over the quote. + value = s; // a->value = s; // Save the offset. + + s = strconv_attribute(s, ch); + + if (!s) + XSXML__THROW_ERROR(status_bad_attribute, value); + + // After this line the loop continues from the start; + // Whitespaces, / and > are ok, symbols and EOF are wrong, + // everything else will be detected + if (XSXML__IS_CHARTYPE(*s, ct_start_symbol)) + XSXML__THROW_ERROR(status_bad_attribute, s); + handler->xml_attr_cb(mark, n, value, s - value - 1); + } + else + XSXML__THROW_ERROR(status_bad_attribute, s); + } + else + XSXML__THROW_ERROR(status_bad_attribute, s); + } + else if (*s == '/') + { + ++s; + if (*s == '>') + { + auto ele_name = stk.pop(); + handler->xml_end_attr_cb(); + handler->xml_end_element_cb(ele_name.c_str(), ele_name.length()); + ++s; + break; + } + else if (*s == 0 && endch == '>') + { + auto ele_name = stk.pop(); + handler->xml_end_attr_cb(); + handler->xml_end_element_cb(ele_name.c_str(), ele_name.length()); + break; + } + else + XSXML__THROW_ERROR(status_bad_start_element, s); + } + else if (*s == '>') + { + ++s; + handler->xml_end_attr_cb(); + break; + } + else if (*s == 0 && endch == '>') + { + break; + } + else + XSXML__THROW_ERROR(status_bad_start_element, s); + } + + // !!! 
+ } + else if (ch == '/') // '<#.../' + { + if (!XSXML__ENDSWITH(*s, '>')) + XSXML__THROW_ERROR(status_bad_start_element, s); + + stk.pop(); + handler->xml_end_element_cb(mark, s - mark); + s += (*s == '>'); + } + else if (ch == 0) + { + // we stepped over null terminator, backtrack & handle closing tag + --s; + + if (endch != '>') + XSXML__THROW_ERROR(status_bad_start_element, s); + } + else + XSXML__THROW_ERROR(status_bad_start_element, s); + } + else if (*s == '/') + { + ++s; + + mark = s; + + // SAX3, we don't check end element name + while (XSXML__IS_CHARTYPE(*s, ct_symbol)) + ++s; + + stk.pop(); + handler->xml_end_element_cb(mark, s - mark); + + XSXML__SKIPWS(); + + if (*s == 0) + { + if (endch != '>') + XSXML__THROW_ERROR(status_bad_end_element, s); + } + else + { + if (*s != '>') + XSXML__THROW_ERROR(status_bad_end_element, s); + ++s; + } + } + else if (*s == '?') // 'first_child) continue; + } + } + + if (!XSXML__OPTSET(parse_trim_pcdata)) + s = mark; + + // SAX3: Ignore node_pcdata. + if (/*cursor->parent ||*/ XSXML__OPTSET(parse_fragment)) + { // Currently, SAX3 simplely skip, do not regard text it node + if (XSXML__OPTSET(parse_embed_pcdata) /*&& cursor->parent && !cursor->first_child && !cursor->value*/) + { + // cursor->value = s; // Save the offset. + } + else + { + // XSXML__PUSHNODE(node_pcdata); // Append a new node on the tree. + + // cursor->value = s; // Save the offset. + + // XSXML__POPNODE(); // Pop since this is a standalone. + } + + s = strconv_pcdata(s); + + if (!*s) + break; + } + else + { + XSXML__SCANFOR(*s == '<'); // '...<' + if (!*s) + break; + + handler->xml_text_cb(mark, s - mark); + + ++s; + } + + // We're after '<' + goto LOC_TAG; + } + } + + // SAX3: TODO: check that last tag is closed, + // if (cursor != root) XSXML__THROW_ERROR(status_end_element_mismatch, s); + + return s; + } + + static char_t* parse_skip_bom(char_t* s) + { + return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? 
s + 3 : s; + } + + /*static bool has_element_node_siblings(xml_node_struct* node) + { + while (node) + { + if (XSXML__NODETYPE(node) == node_element) return true; + + node = node->next_sibling; + } + + return false; + }*/ + + static xml_parse_result parse(char_t* buffer, size_t length, xml_sax3_parse_cb* handler, + unsigned int optmsk = parse_default) + { + // early-out for empty documents + if (length == 0) + return make_parse_result(XSXML__OPTSET(parse_fragment) ? status_ok + : status_no_document_element); + + // get last child of the root before parsing + // xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c + 0 + // : 0; + + // create parser on stack + xml_sax3_parser parser(handler); + + // save last character and make buffer zero-terminated (speeds up parsing) + char_t endch = buffer[length - 1]; + buffer[length - 1] = 0; + + // skip BOM to make sure it does not end up as part of parse output + char_t* buffer_data = parse_skip_bom(buffer); + + // perform actual parsing + parser.parse_tree(buffer_data, optmsk, endch); + + xml_parse_result result = make_parse_result( + parser.error_status, parser.error_offset ? 
parser.error_offset - buffer : 0); + assert(result.offset >= 0 && static_cast(result.offset) <= length); + + if (result) + { + // since we removed last character, we have to handle the only possible false positive (stray + // <) + if (endch == '<') + return make_parse_result(status_unrecognized_tag, length - 1); + } + else + { + // roll back offset if it occurs on a null terminator in the source buffer + if (result.offset > 0 && static_cast(result.offset) == length - 1 && endch == 0) + result.offset--; + } + + return result; + } +}; /* xml_sax3_parser */ +}; // namespace xsxml + +#endif diff --git a/xsxml/xsxml.hpp b/xsxml.hpp similarity index 97% rename from xsxml/xsxml.hpp rename to xsxml.hpp index 260626e..38b49f3 100644 --- a/xsxml/xsxml.hpp +++ b/xsxml.hpp @@ -1,1570 +1,1570 @@ -////////////////////////////////////////////////////////////////////////////////////////// -// The embedded xml SAX parser, extract from rapidxml DOM parser -// please see: http://rapidxml.sourceforge.net/ -////////////////////////////////////////////////////////////////////////////////////////// -/* -The MIT License (MIT) -Copyright (c) 2019 halx99 -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ -#ifndef SIMDSOFT__XSXML_HPP -#define SIMDSOFT__XSXML_HPP - -#include -#include -#include -#include - -// On MSVC, disable "conditional expression is constant" warning (level 4). -// This warning is almost impossible to avoid with certain types of templated code -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable : 4127) // Conditional expression is constant -#endif - -#if !defined(XSXML__PARSE_ERROR) -# define XSXML__PARSE_ERROR(what, where) throw parse_error(what, where) -#endif - -namespace xsxml -{ -//! Parse flag instructing the parser to not create data nodes. -//! Text of first data node will still be placed in value of parent element, unless -//! xsxml::parse_no_element_values flag is also specified. Can be combined with other flags by -//! use of | operator.

See xml_document::parse() function. -const int parse_no_data_nodes = 0x1; - -//! Parse flag instructing the parser to not use text of first data node as a value of parent -//! element. Can be combined with other flags by use of | operator. Note that child data nodes of -//! element node take precendence over its value when printing. That is, if element has one or more -//! child data nodes and a value, the value will be ignored. Use -//! xsxml::parse_no_data_nodes flag to prevent creation of data nodes if you want to manipulate -//! data using values of elements.

See xml_document::parse() function. -const int parse_no_element_values = 0x2; - -//! Parse flag instructing the parser to not place zero terminators after strings in the source -//! text. By default zero terminators are placed, modifying source text. Can be combined with other -//! flags by use of | operator.

See xml_document::parse() function. -const int parse_no_string_terminators = 0x4; - -//! Parse flag instructing the parser to not translate entities in the source text. -//! By default entities are translated, modifying source text. -//! Can be combined with other flags by use of | operator. -//!

-//! See xml_document::parse() function. -static const int parse_no_entity_translation = 0x8; - -//! Parse flag instructing the parser to disable UTF-8 handling and assume plain 8 bit characters. -//! By default, UTF-8 handling is enabled. -//! Can be combined with other flags by use of | operator. -//!

-//! See xml_document::parse() function. -static const int parse_no_utf8 = 0x10; - -//! Parse flag instructing the parser to create XML declaration node. -//! By default, declaration node is not created. -//! Can be combined with other flags by use of | operator. -//!

-//! See xml_document::parse() function. -static const int parse_declaration_node = 0x20; - -//! Parse flag instructing the parser to create comments nodes. -//! By default, comment nodes are not created. -//! Can be combined with other flags by use of | operator. -//!

-//! See xml_document::parse() function. -static const int parse_comment_nodes = 0x40; - -//! Parse flag instructing the parser to create DOCTYPE node. -//! By default, doctype node is not created. -//! Although W3C specification allows at most one DOCTYPE node, RapidXml will silently accept -//! documents with more than one. Can be combined with other flags by use of | operator.

-//! See xml_document::parse() function. -static const int parse_doctype_node = 0x80; - -//! Parse flag instructing the parser to create PI nodes. -//! By default, PI nodes are not created. -//! Can be combined with other flags by use of | operator. -//!

-//! See xml_document::parse() function. -static const int parse_pi_nodes = 0x100; - -//! Parse flag instructing the parser to validate closing tag names. -//! If not set, name inside closing tag is irrelevant to the parser. -//! By default, closing tags are not validated. -//! Can be combined with other flags by use of | operator. -//!

-//! See xml_document::parse() function. -static const int parse_validate_closing_tags = 0x200; - -//! Parse flag instructing the parser to trim all leading and trailing whitespace of data nodes. -//! By default, whitespace is not trimmed. -//! This flag does not cause the parser to modify source text. -//! Can be combined with other flags by use of | operator. -//!

-//! See xml_document::parse() function. -static const int parse_trim_whitespace = 0x400; - -//! Parse flag instructing the parser to condense all whitespace runs of data nodes to a single -//! space character. Trimming of leading and trailing whitespace of data is controlled by -//! xsxml::parse_trim_whitespace flag. By default, whitespace is not normalized. If this flag is -//! specified, source text will be modified. Can be combined with other flags by use of | operator. -//!

-//! See xml_document::parse() function. -static const int parse_normalize_whitespace = 0x800; - -//! Parse flag instructing the parser to convert html entity -//! this flag only works when the flag 'parse_no_entity_translation' not specified -//!

-//! See xml_document::parse() function. -static const int parse_html_entity_translation = 0x1000; - -// Compound flags - -//! Parse flags which represent default behaviour of the parser. -//! This is always equal to 0, so that all other flags can be simply ored together. -//! Normally there is no need to inconveniently disable flags by anding with their negated (~) -//! values. This also means that meaning of each flag is a negation of the default setting. -//! For example, if flag name is xsxml::parse_no_utf8, it means that utf-8 is enabled by -//! default, and using the flag will disable it.

See xml_document::parse() function. -static const int parse_default = 0; - -//! A combination of parse flags that forbids any modifications of the source text. -//! This also results in faster parsing. However, note that the following will occur: -//!
    -//!
  • names and values of nodes will not be zero terminated, you have to use xml_base::name_size() -//! and xml_base::value_size() functions to determine where name and value ends
  • entities -//! will not be translated
  • whitespace will not be normalized
  • -//!
-//! See xml_document::parse() function. -static const int parse_non_destructive = parse_no_string_terminators | parse_no_entity_translation; - -//! A combination of parse flags resulting in fastest possible parsing, without sacrificing -//! important data.

See xml_document::parse() function. -static const int parse_fastest = parse_non_destructive | parse_no_data_nodes; - -//! A combination of parse flags resulting in largest amount of data being extracted. -//! This usually results in slowest parsing. -//!

-//! See xml_document::parse() function. -static const int parse_full = parse_declaration_node | parse_comment_nodes | parse_doctype_node | - parse_pi_nodes | parse_validate_closing_tags; - -static const int parse_normal = parse_no_data_nodes; - -typedef char char_t; - -class string_view -{ -public: - string_view() : _Mystr(nullptr), _Mysize(0) {} - string_view(char_t* str, size_t size) : _Mystr(str), _Mysize(size) {} - const char* c_str() const { return _Mystr != nullptr ? _Mystr : ""; } - size_t length() const { return _Mysize; } - bool empty() const { return _Mysize == 0; } - -private: - char_t* _Mystr; - size_t _Mysize; -}; - -// The sax3 parse callbacks -struct xml_sax3_parse_cb -{ - std::function xml_start_element_cb; - std::function xml_attr_cb; - std::function xml_end_attr_cb; - std::function xml_end_element_cb; - std::function xml_text_cb; -}; - -namespace internal -{ -//! \cond internal -// Struct that contains lookup tables for the parser -// It must be a template to allow correct linking (because it has static data members, which are -// defined in a header file). 
-template struct lookup_tables -{ - static const unsigned char lookup_whitespace[256]; // Whitespace table - static const unsigned char lookup_node_name[256]; // Node name table - static const unsigned char lookup_text[256]; // Text table - static const unsigned char lookup_text_pure_no_ws[256]; // Text table - static const unsigned char lookup_text_pure_with_ws[256]; // Text table - static const unsigned char lookup_attribute_name[256]; // Attribute name table - static const unsigned char lookup_attribute_data_1[256]; // Attribute data table with single quote - static const unsigned char - lookup_attribute_data_1_pure[256]; // Attribute data table with single quote - static const unsigned char - lookup_attribute_data_2[256]; // Attribute data table with double quotes - static const unsigned char - lookup_attribute_data_2_pure[256]; // Attribute data table with double quotes - static const unsigned char lookup_digits[256]; // Digits - static const unsigned char - lookup_upcase[256]; // To uppercase conversion table for ASCII characters -}; - -// Compare strings for equality -template -inline bool compare(const _CharT* p1, std::size_t size1, const _CharT* p2, std::size_t size2, - bool case_sensitive) -{ - if (size1 != size2) - return false; - if (case_sensitive) - { - for (const _CharT* end = p1 + size1; p1 < end; ++p1, ++p2) - if (*p1 != *p2) - return false; - } - else - { - for (const _CharT* end = p1 + size1; p1 < end; ++p1, ++p2) - if (lookup_tables<0>::lookup_upcase[static_cast(*p1)] != - lookup_tables<0>::lookup_upcase[static_cast(*p2)]) - return false; - } - return true; -} -} // namespace internal - -//! Parse error exception. -//! This exception is thrown by the parser when an error occurs. -//! Use what() function to get human-readable error message. -//! Use where() function to get a pointer to position within source text where error was detected. -//!

-//! If throwing exceptions by the parser is undesirable, -//! it can be disabled by defining RAPIDXML_NO_EXCEPTIONS macro before xsxml.hpp is included. -//! This will cause the parser to call xsxml::parse_error_handler() function instead of throwing -//! an exception. This function must be defined by the user.

This class derives from -//! std::exception class. -class parse_error : public std::exception -{ - -public: - //! Constructs parse error - parse_error(const char* what, void* where) : m_what(what), m_where(where) {} - - //! Gets human readable description of error. - //! \return Pointer to null terminated description of the error. - virtual const char* what() const throw() { return m_what; } - - //! Gets pointer to character data where error happened. - //! _CharT should be the same as char type of xml_document that produced the error. - //! \return Pointer to location within the parsed string where error occured. - template _CharT* where() const { return reinterpret_cast<_CharT*>(m_where); } - -private: - const char* m_what; - void* m_where; -}; - -//! This class represents root of the DOM hierarchy. -//! It is also an xml_node and a memory_pool through public inheritance. -//! Use parse() function to build a DOM tree from a zero-terminated XML text string. -//! parse() function allocates memory for nodes and attributes by using functions of xml_document, -//! which are inherited from memory_pool. -//! To access root node of the document, use the document itself, as if it was an xml_node. -//! \param char_t Character type to use. -class xml_sax3_parser -{ - xml_sax3_parse_cb* handler_; - - enum class parse_result - { - ok, - expected_close_tag, - unrecognized_tag, - }; - parse_result parse_result_ = parse_result::ok; - -public: - template - static parse_result parse(char_t* text, int length, xml_sax3_parse_cb* handler) - { - xml_sax3_parser parser(handler); - parser.parse(text, length); - return parser.parse_result_; - } - - //! Constructs empty XML document - xml_sax3_parser(xml_sax3_parse_cb* handler) { handler_ = handler; } - - //! Parses zero-terminated XML string according to given flags. - //! Passed string will be modified by the parser, unless xsxml::parse_non_destructive flag is - //! used. The string must persist for the lifetime of the document. 
In case of error, - //! xsxml::parse_error exception will be thrown.

If you want to parse contents of a - //! file, you must first load the file into the memory, and pass pointer to its beginning. Make - //! sure that data is zero-terminated.

Document can be parsed into multiple times. Each - //! new call to parse removes previous nodes and attributes (if any), but does not clear memory - //! pool. \param text XML data to parse; pointer is non-const to denote fact that this data may be - //! modified by the parser. - template void parse(char_t* text, int length) - { - assert(text); - - // save last character and make buffer zero-terminated (speeds up parsing) - auto endch = text[length - 1]; - text[length - 1] = 0; - - // Parse BOM, if any - parse_bom(text); - - // Parse children - _L_Loop : { - // Skip whitespace before node - skip(text); - if (*text == 0) - goto _L_end; - - // Parse and append new child - if (*text == char_t('<')) - { - ++text; // Skip '<' - parse_node(text); - } - else - XSXML__PARSE_ERROR("expected <", text); - - goto _L_Loop; - } - - _L_end: - // check parse result. - if (parse_result_ == parse_result::ok) - { - if (endch == '<') - { - parse_result_ = parse_result::unrecognized_tag; - XSXML__PARSE_ERROR("unrecognized tag", text); - } - } - else - { // parse_result_: parse_result::expected_close_tag - if (endch == '>') - parse_result_ = parse_result::ok; - else - XSXML__PARSE_ERROR("expected >", text); - } - } - - //! Clears the document by deleting all nodes and clearing the memory pool. - //! All nodes owned by document pool are destroyed. 
- void clear() - { - // this->remove_all_nodes(); - // this->remove_all_attributes(); - // memory_pool::clear(); - } - -private: - /////////////////////////////////////////////////////////////////////// - // Internal character utility functions - - // Detect whitespace character - struct whitespace_pred - { - static unsigned char test(char_t ch) - { - return internal::lookup_tables<0>::lookup_whitespace[static_cast(ch)]; - } - }; - - // Detect node name character - struct node_name_pred - { - static unsigned char test(char_t ch) - { - return internal::lookup_tables<0>::lookup_node_name[static_cast(ch)]; - } - }; - - // Detect attribute name character - struct attribute_name_pred - { - static unsigned char test(char_t ch) - { - return internal::lookup_tables<0>::lookup_attribute_name[static_cast(ch)]; - } - }; - - // Detect text character (PCDATA) - struct text_pred - { - static unsigned char test(char_t ch) - { - return internal::lookup_tables<0>::lookup_text[static_cast(ch)]; - } - }; - - // Detect text character (PCDATA) that does not require processing - struct text_pure_no_ws_pred - { - static unsigned char test(char_t ch) - { - return internal::lookup_tables<0>::lookup_text_pure_no_ws[static_cast(ch)]; - } - }; - - // Detect text character (PCDATA) that does not require processing - struct text_pure_with_ws_pred - { - static unsigned char test(char_t ch) - { - return internal::lookup_tables<0>::lookup_text_pure_with_ws[static_cast(ch)]; - } - }; - - // Detect attribute value character - template struct attribute_value_pred - { - static unsigned char test(char_t ch) - { - if (Quote == char_t('\'')) - return internal::lookup_tables<0>::lookup_attribute_data_1[static_cast(ch)]; - if (Quote == char_t('\"')) - return internal::lookup_tables<0>::lookup_attribute_data_2[static_cast(ch)]; - return 0; // Should never be executed, to avoid warnings on Comeau - } - }; - - // Detect attribute value character - template struct attribute_value_pure_pred - { - static 
unsigned char test(char_t ch) - { - if (Quote == char_t('\'')) - return internal::lookup_tables<0>::lookup_attribute_data_1_pure[static_cast( - ch)]; - if (Quote == char_t('\"')) - return internal::lookup_tables<0>::lookup_attribute_data_2_pure[static_cast( - ch)]; - return 0; // Should never be executed, to avoid warnings on Comeau - } - }; - - // Insert coded character, using UTF8 or 8-bit ASCII - template static void insert_coded_character(char_t*& text, unsigned long code) - { - if (Flags & parse_no_utf8) - { - // Insert 8-bit ASCII character - // Todo: possibly verify that code is less than 256 and use replacement char otherwise? - text[0] = static_cast(code); - text += 1; - } - else - { - // Insert UTF8 sequence - if (code < 0x80) // 1 byte sequence - { - text[0] = static_cast(code); - text += 1; - } - else if (code < 0x800) // 2 byte sequence - { - text[1] = static_cast((code | 0x80) & 0xBF); - code >>= 6; - text[0] = static_cast(code | 0xC0); - text += 2; - } - else if (code < 0x10000) // 3 byte sequence - { - text[2] = static_cast((code | 0x80) & 0xBF); - code >>= 6; - text[1] = static_cast((code | 0x80) & 0xBF); - code >>= 6; - text[0] = static_cast(code | 0xE0); - text += 3; - } - else if (code < 0x110000) // 4 byte sequence - { - text[3] = static_cast((code | 0x80) & 0xBF); - code >>= 6; - text[2] = static_cast((code | 0x80) & 0xBF); - code >>= 6; - text[1] = static_cast((code | 0x80) & 0xBF); - code >>= 6; - text[0] = static_cast(code | 0xF0); - text += 4; - } - else // Invalid, only codes up to 0x10FFFF are allowed in Unicode - { - XSXML__PARSE_ERROR("invalid numeric character entity", text); - } - } - } - - // Skip characters until predicate evaluates to true - template static void skip(char_t*& text) - { - char_t* tmp = text; - while (StopPred::test(*tmp)) - ++tmp; - text = tmp; - } - - // Skip characters until predicate evaluates to true while doing the following: - // - replacing XML character entity references with proper characters (' & " < - // 
> &#...;) - // - condensing whitespace sequences to single space character - template - static char_t* skip_and_expand_character_refs(char_t*& text) - { - // If entity translation, whitespace condense and whitespace trimming is disabled, use plain - // skip - if (Flags & parse_no_entity_translation && !(Flags & parse_normalize_whitespace) && - !(Flags & parse_trim_whitespace)) - { - skip(text); - return text; - } - - // Use simple skip until first modification is detected - skip(text); - - // Use translation skip - char_t* src = text; - char_t* dest = src; - while (StopPred::test(*src)) - { - // If entity translation is enabled - if (!(Flags & parse_no_entity_translation)) - { - // Test if replacement is needed - if (src[0] == char_t('&')) - { - switch (src[1]) - { - - // & ' - case char_t('a'): - if (src[2] == char_t('m') && src[3] == char_t('p') && src[4] == char_t(';')) - { - *dest = char_t('&'); - ++dest; - src += 5; - continue; - } - if (src[2] == char_t('p') && src[3] == char_t('o') && src[4] == char_t('s') && - src[5] == char_t(';')) - { - *dest = char_t('\''); - ++dest; - src += 6; - continue; - } - break; - - // " - case char_t('q'): - if (src[2] == char_t('u') && src[3] == char_t('o') && src[4] == char_t('t') && - src[5] == char_t(';')) - { - *dest = char_t('"'); - ++dest; - src += 6; - continue; - } - break; - - // > - case char_t('g'): - if (src[2] == char_t('t') && src[3] == char_t(';')) - { - *dest = char_t('>'); - ++dest; - src += 4; - continue; - } - break; - - // < - case char_t('l'): - if (src[2] == char_t('t') && src[3] == char_t(';')) - { - *dest = char_t('<'); - ++dest; - src += 4; - continue; - } - break; - - // &#...; - assumes ASCII - case char_t('#'): - if (src[2] == char_t('x')) - { - unsigned long code = 0; - src += 3; // Skip &#x - while (1) - { - unsigned char digit = - internal::lookup_tables<0>::lookup_digits[static_cast(*src)]; - if (digit == 0xFF) - break; - code = code * 16 + digit; - ++src; - } - insert_coded_character(dest, 
code); // Put character in output - } - else - { - unsigned long code = 0; - src += 2; // Skip &# - while (1) - { - unsigned char digit = - internal::lookup_tables<0>::lookup_digits[static_cast(*src)]; - if (digit == 0xFF) - break; - code = code * 10 + digit; - ++src; - } - insert_coded_character(dest, code); // Put character in output - } - if (*src == char_t(';')) - ++src; - else - XSXML__PARSE_ERROR("expected ;", src); - continue; - - // Something else - default: - if (Flags & parse_html_entity_translation) - { - switch (src[1]) - { //   - case char_t('n'): - if (src[2] == char_t('b') && src[3] == char_t('s') && src[4] == char_t('p') && - src[5] == char_t(';')) - { - *dest = char_t(' '); - ++dest; - src += 6; - continue; - } - break; - //   - case char_t('e'): - if (src[2] == char_t('m') && src[3] == char_t('s') && src[4] == char_t('p') && - src[5] == char_t(';')) - { - *dest = char_t(' '); - ++dest; - *dest = char_t(' '); - ++dest; - src += 6; - continue; - } - break; - } - } - // Ignore, just copy '&' verbatim - break; - } - } - } - - // If whitespace condensing is enabled - if (Flags & parse_normalize_whitespace) - { - // Test if condensing is needed - if (whitespace_pred::test(*src)) - { - *dest = char_t(' '); - ++dest; // Put single space in dest - ++src; // Skip first whitespace char - // Skip remaining whitespace chars - while (whitespace_pred::test(*src)) - ++src; - continue; - } - } - - // No replacement, only copy character - *dest++ = *src++; - } - - // Return new end - text = src; - return dest; - } - - /////////////////////////////////////////////////////////////////////// - // Internal parsing functions - - // Parse UTF-8 BOM, if any - inline void parse_bom(char*& text) - { - if (static_cast(text[0]) == 0xEF && - static_cast(text[1]) == 0xBB && static_cast(text[2]) == 0xBF) - { - text += 3; - } - } - - // Parse UTF-16/32 BOM, if any - inline void parse_bom(wchar_t*& text) - { - const wchar_t bom = 0xFEFF; - if (text[0] == bom) - { - ++text; - } - } 
- // Parse XML declaration ( void parse_xml_declaration(char_t*& text) - { - // If parsing of declaration is disabled - if (!(Flags & parse_declaration_node)) - { - // Skip until end of declaration - while (text[0] != char_t('?') || text[1] != char_t('>')) - { - if (!text[0]) - XSXML__PARSE_ERROR("unexpected end of data", text); - ++text; - } - text += 2; // Skip '?>' - return; // return 0; - } - - // Create declaration - // xml_node *declaration = this->allocate_node(node_declaration); - - // Skip whitespace before attributes or ?> - skip(text); - - // Parse declaration attributes - parse_node_attributes(text /*, declaration*/); - - // Skip ?> - if (text[0] != char_t('?') || text[1] != char_t('>')) - XSXML__PARSE_ERROR("expected ?>", text); - text += 2; - - // return declaration; - } - - // Parse XML comment (' - return; // return 0; // Do not produce comment node - } - - // Skip until end of comment - while (text[0] != char_t('-') || text[1] != char_t('-') || text[2] != char_t('>')) - { - if (!text[0]) - XSXML__PARSE_ERROR("unexpected end of data", text); - ++text; - } - - // Create comment node - // xml_node *comment = this->allocate_node(node_comment); - // comment->value(value, text - value); // TODO: DNT implement comment - - // Place zero terminator after comment value - if (!(Flags & parse_no_string_terminators)) - *text = char_t('\0'); - - text += 3; // Skip '-->' - return; - } - - // Parse DOCTYPE - template void parse_doctype(char_t*& text) - { - // Skip to > - while (*text != char_t('>')) - { - // Determine character type - switch (*text) - { - - // If '[' encountered, scan for matching ending ']' using naive algorithm with depth - // This works for all W3C test files except for 2 most wicked - case char_t('['): { - ++text; // Skip '[' - int depth = 1; - while (depth > 0) - { - switch (*text) - { - case char_t('['): - ++depth; - break; - case char_t(']'): - --depth; - break; - case 0: - XSXML__PARSE_ERROR("unexpected end of data", text); - default: - 
break; - } - ++text; - } - break; - } - - // Error on end of text - case char_t('\0'): - XSXML__PARSE_ERROR("unexpected end of data", text); - - // Other character, skip it - default: - ++text; - } - } - - // If DOCTYPE nodes enabled - if (Flags & parse_doctype_node) - { // SAX3: ignore doctype node - // Place zero terminator after value - if (!(Flags & parse_no_string_terminators)) - *text = char_t('\0'); - - text += 1; // skip '>' - - return; // return doctype; - } - else - { - text += 1; // skip '>' - return; // return 0; - } - } - - // Parse PI - template void parse_pi(char_t*& text) - { - // If creation of PI nodes is enabled - if (Flags & parse_pi_nodes) - { - // Create pi node - // xml_node *pi = this->allocate_node(node_pi); - - // Extract PI target name - char_t* name = text; - skip(text); - if (text == name) - XSXML__PARSE_ERROR("expected PI target", text); - // pi->name(name, text - name); // TODO: DNT notify for pi - - // Skip whitespace between pi target and pi - skip(text); - - // Skip to '?>' - while (text[0] != char_t('?') || text[1] != char_t('>')) - { - if (*text == char_t('\0')) - XSXML__PARSE_ERROR("unexpected end of data", text); - ++text; - } - - text += 2; // Skip '?>' - return; // return pi; - } - else - { - // Skip to '?>' - while (text[0] != char_t('?') || text[1] != char_t('>')) - { - if (*text == char_t('\0')) - XSXML__PARSE_ERROR("unexpected end of data", text); - ++text; - } - text += 2; // Skip '?>' - return; // return 0; - } - } - - // Parse and append data - // Return character that ends data. 
- // This is necessary because this character might have been overwritten by a terminating 0 - template - char_t parse_and_append_data(/*const string_view& elementName unused for SAX,*/ char_t*& text, - char_t* contents_start) - { - // Backup to contents start if whitespace trimming is disabled - if (!(Flags & parse_trim_whitespace)) - text = contents_start; - - // Skip until end of data - char_t *value = text, *end; - if (Flags & parse_normalize_whitespace) - end = skip_and_expand_character_refs(text); - else - end = skip_and_expand_character_refs(text); - - // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after - // > - if (Flags & parse_trim_whitespace) - { - if (Flags & parse_normalize_whitespace) - { - // Whitespace is already condensed to single space characters by skipping function, so just - // trim 1 char off the end - if (*(end - 1) == char_t(' ')) - --end; - } - else - { - // Backup until non-whitespace character is found - while (whitespace_pred::test(*(end - 1))) - --end; - } - } - - char_t ch = *text; - // Place zero terminator after value - if (!(Flags & parse_no_string_terminators)) - { - // char_t ch = *text; - *end = char_t('\0'); - // return ch; // Return character that ends data; this is required because zero - // terminator overwritten it - } - - handler_->xml_text_cb(value, end - value); - - // Return character that ends data - return ch; - } - - // Parse CDATA - template void parse_cdata(char_t*& text) - { - // If CDATA is disabled - if (Flags & parse_no_data_nodes) - { - // Skip until end of cdata - while (text[0] != char_t(']') || text[1] != char_t(']') || text[2] != char_t('>')) - { - if (!text[0]) - XSXML__PARSE_ERROR("unexpected end of data", text); - ++text; - } - text += 3; // Skip ]]> - return; // return 0; // Do not produce CDATA node - } - - // Skip until end of cdata - while (text[0] != char_t(']') || text[1] != char_t(']') || text[2] != char_t('>')) - { - if (!text[0]) - 
XSXML__PARSE_ERROR("unexpected end of data", text); - ++text; - } - - // Place zero terminator after value - if (!(Flags & parse_no_string_terminators)) - *text = char_t('\0'); - - text += 3; // Skip ]]> - return; // return cdata; - } - - // Parse element node - template void parse_element(char_t*& text) - { - // Create element node - // xml_node *element = this->allocate_node(node_element); - - // Extract element name - auto mark = text; - skip(text); - size_t n = text - mark; - if (n == 0) - XSXML__PARSE_ERROR("expected element name", text); - - // Skip whitespace between element name and attributes or > - skip(text); - - auto chTmp = *text; - // Place zero terminator after name - if (!(Flags & parse_no_string_terminators)) - mark[n] = (char_t)'\0'; - - // Notify start element - handler_->xml_start_element_cb(mark, n); // - - // Parse attributes, if any - if (chTmp != '>' && chTmp != char_t('/')) - { - parse_node_attributes(text); - chTmp = *text; - } - - // Notify end attr - handler_->xml_end_attr_cb(); - - // Determine ending type - if (chTmp == char_t('>')) - { - ++text; - parse_node_contents(text, mark, n); - } - else if (chTmp == char_t('/')) - { - ++text; - if (*text != char_t('>')) - { - parse_result_ = parse_result::expected_close_tag; - if (*text != 0) - XSXML__PARSE_ERROR("expected >", text); - } - else - ++text; - } - else - { - if (chTmp != 0) - { - parse_result_ = parse_result::expected_close_tag; - XSXML__PARSE_ERROR("expected >", text); - } // else, parse to eof - } - - // Return parsed element - handler_->xml_end_element_cb(mark, n); - // return element; - } - - // Determine node type, and parse it - template void parse_node(char_t*& text) - { - // Parse proper node type - switch (text[0]) - { - - // <... 
- default: - // Parse and append element node - return parse_element(text); - - // (text); - } - else - { - // Parse PI - return parse_pi(text); - } - - // (text); - } - break; - - // (text); - } - break; - - // (text); - } - break; - - default: - break; - } // switch - - // Attempt to skip other, unrecognized node types starting with ')) - { - if (*text == 0) - XSXML__PARSE_ERROR("unexpected end of data", text); - ++text; - } - ++text; // Skip '>' - return; // return 0; // No node recognized - } - } - - // Parse contents of the node - children, data etc. - template void parse_node_contents(char_t*& text, const char_t* mark, size_t n) - { - // For all children and text - while (1) - { - // Skip whitespace between > and node contents - char_t* contents_start = text; // Store start of node contents before whitespace is skipped - skip(text); - char_t next_char = *text; - - // After data nodes, instead of continuing the loop, control jumps here. - // This is because zero termination inside parse_and_append_data() function - // would wreak havoc with the above code. - // Also, skipping whitespace after data nodes is unnecessary. - after_data_node: - - // Determine what comes next: node closing, child node, data node, or 0? 
- switch (next_char) - { - - // Node closing or child node - case char_t('<'): - if (text[1] == char_t('/')) - { - // Node closing - text += 2; // Skip '(text); - if (!internal::compare(mark, n, closing_name, text - closing_name, true)) - XSXML__PARSE_ERROR("invalid closing tag name", text); - } - else - skip(text); // No validation, just skip name - - // Skip remaining whitespace after node name - skip(text); - if (*text != char_t('>')) - { - parse_result_ = parse_result::expected_close_tag; - if (*text != 0) - XSXML__PARSE_ERROR("expected >", text); - } - else - ++text; // Skip '>' - return; // Node closed, finished parsing contents - } - else - { - // Child node - ++text; // Skip '<' - parse_node(text); - } - break; - - // End of data - error - case char_t('\0'): - XSXML__PARSE_ERROR("unexpected end of data", text); - - // Data node - default: - next_char = parse_and_append_data(/*elementName, */ text, contents_start); - goto after_data_node; // Bypass regular processing after data nodes - } - } - } - - // Parse XML attributes of the node - template void parse_node_attributes(char_t*& text) - { - // For all attributes - while (attribute_name_pred::test(*text)) - { - // Extract attribute name - char_t* name = text; - ++text; // Skip first character of attribute name - skip(text); - if (text == name) - XSXML__PARSE_ERROR("expected attribute name", name); - - // Create new attribute - // xml_attribute *attribute = this->allocate_attribute(); - // attribute->name(name, text - name); - auto namesize = text - name; - // node->append_attribute(attribute); - - // Skip whitespace after attribute name - skip(text); - - // Skip = - if (*text != char_t('=')) - XSXML__PARSE_ERROR("expected =", text); - ++text; - - // Add terminating zero after name - if (!(Flags & parse_no_string_terminators)) - name[namesize] = 0; - - // Skip whitespace after = - skip(text); - - // Skip quote and remember if it was ' or " - char_t quote = *text; - if (quote != char_t('\'') && quote != 
char_t('"')) - XSXML__PARSE_ERROR("expected ' or \"", text); - ++text; - - // Extract attribute value and expand char refs in it - char_t *value = text, *end; - const int AttFlags = - Flags & ~parse_normalize_whitespace; // No whitespace normalization in attributes - if (quote == char_t('\'')) - end = - skip_and_expand_character_refs, - attribute_value_pure_pred, AttFlags>(text); - else - end = - skip_and_expand_character_refs, - attribute_value_pure_pred, AttFlags>(text); - - // Set attribute value - // attribute->value(value, end - value); - auto valuesize = end - value; - - // Make sure that end quote is present - if (*text != quote) - XSXML__PARSE_ERROR("expected ' or \"", text); - ++text; // Skip quote - - // Add terminating zero after value - if (!(Flags & parse_no_string_terminators)) - value[valuesize] = 0; - - handler_->xml_attr_cb(name, namesize, value, valuesize); - - // Skip whitespace after attribute value - skip(text); - } - } - -}; /* CLASS xml_sax3_parser */ - -//! \cond internal -namespace internal -{ -// Whitespace (space \n \r \t) -template -const unsigned char lookup_tables::lookup_whitespace[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, // 0 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1 - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 7 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // C - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // D - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 
E - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F -}; - -// Node name (anything but space \n \r \t / > ? \0) -template -const unsigned char lookup_tables::lookup_node_name[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Text (i.e. 
PCDATA) (anything but < \0) -template -const unsigned char lookup_tables::lookup_text[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Text (i.e. 
PCDATA) that does not require processing when ws normalization is disabled -// (anything but < \0 &) -template -const unsigned char lookup_tables::lookup_text_pure_no_ws[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Text (i.e. 
PCDATA) that does not require processing when ws normalizationis is enabled -// (anything but < \0 & space \n \r \t) -template -const unsigned char lookup_tables::lookup_text_pure_with_ws[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Attribute name (anything but space \n \r \t / < > = ? ! 
\0) -template -const unsigned char lookup_tables::lookup_attribute_name[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Attribute data with single quote (anything but ' \0) -template -const unsigned char lookup_tables::lookup_attribute_data_1[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 
E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Attribute data with single quote that does not require processing (anything but ' \0 &) -template -const unsigned char lookup_tables::lookup_attribute_data_1_pure[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Attribute data with double quote (anything but " \0) -template -const unsigned char lookup_tables::lookup_attribute_data_2[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Attribute data with double quote that does not require processing (anything but " \0 &) -template -const unsigned char lookup_tables::lookup_attribute_data_2_pure[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 - 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F -}; - -// Digits (dec and hex, 255 denotes end of numeric character reference) -template -const unsigned char lookup_tables::lookup_digits[256] = { - // 0 1 2 3 4 5 6 7 8 9 A B C D E F - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0 - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 1 - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 2 - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, // 3 - 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4 - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 5 - 255, 
10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 6 - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 7 - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 8 - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 9 - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // A - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // B - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // C - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // D - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // E - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 // F -}; - -// Upper case conversion -template -const unsigned char lookup_tables::lookup_upcase[256] = { - // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A B C D E F - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0 - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // 1 - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // 2 - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // 3 - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // 4 - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, // 5 - 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // 6 - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // 7 - 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // 8 - 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // 9 - 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // A - 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // B - 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, // 
C - 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, // D - 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // E - 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 // F -}; -} // namespace internal - //! \endcond -} // namespace xsxml - -// Undefine internal macros -#undef XSXML__PARSE_ERROR - -// On MSVC, restore warnings state -#ifdef _MSC_VER -# pragma warning(pop) -#endif - -#endif +////////////////////////////////////////////////////////////////////////////////////////// +// The embedded xml SAX parser, extract from rapidxml DOM parser +// please see: http://rapidxml.sourceforge.net/ +////////////////////////////////////////////////////////////////////////////////////////// +/* +The MIT License (MIT) +Copyright (c) 2019 halx99 +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef SIMDSOFT__XSXML_HPP +#define SIMDSOFT__XSXML_HPP + +#include +#include +#include +#include + +// On MSVC, disable "conditional expression is constant" warning (level 4). +// This warning is almost impossible to avoid with certain types of templated code +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4127) // Conditional expression is constant +#endif + +#if !defined(XSXML__PARSE_ERROR) +# define XSXML__PARSE_ERROR(what, where) throw parse_error(what, where) +#endif + +namespace xsxml +{ +//! Parse flag instructing the parser to not create data nodes. +//! Text of first data node will still be placed in value of parent element, unless +//! xsxml::parse_no_element_values flag is also specified. Can be combined with other flags by +//! use of | operator.

See xml_document::parse() function. +const int parse_no_data_nodes = 0x1; + +//! Parse flag instructing the parser to not use text of first data node as a value of parent +//! element. Can be combined with other flags by use of | operator. Note that child data nodes of +//! element node take precedence over its value when printing. That is, if element has one or more +//! child data nodes and a value, the value will be ignored. Use +//! xsxml::parse_no_data_nodes flag to prevent creation of data nodes if you want to manipulate +//! data using values of elements.

See xml_document::parse() function. +const int parse_no_element_values = 0x2; + +//! Parse flag instructing the parser to not place zero terminators after strings in the source +//! text. By default zero terminators are placed, modifying source text. Can be combined with other +//! flags by use of | operator.

See xml_document::parse() function. +const int parse_no_string_terminators = 0x4; + +//! Parse flag instructing the parser to not translate entities in the source text. +//! By default entities are translated, modifying source text. +//! Can be combined with other flags by use of | operator. +//!

+//! See xml_document::parse() function. +static const int parse_no_entity_translation = 0x8; + +//! Parse flag instructing the parser to disable UTF-8 handling and assume plain 8 bit characters. +//! By default, UTF-8 handling is enabled. +//! Can be combined with other flags by use of | operator. +//!

+//! See xml_document::parse() function. +static const int parse_no_utf8 = 0x10; + +//! Parse flag instructing the parser to create XML declaration node. +//! By default, declaration node is not created. +//! Can be combined with other flags by use of | operator. +//!

+//! See xml_document::parse() function. +static const int parse_declaration_node = 0x20; + +//! Parse flag instructing the parser to create comments nodes. +//! By default, comment nodes are not created. +//! Can be combined with other flags by use of | operator. +//!

+//! See xml_document::parse() function. +static const int parse_comment_nodes = 0x40; + +//! Parse flag instructing the parser to create DOCTYPE node. +//! By default, doctype node is not created. +//! Although W3C specification allows at most one DOCTYPE node, RapidXml will silently accept +//! documents with more than one. Can be combined with other flags by use of | operator.

+//! See xml_document::parse() function. +static const int parse_doctype_node = 0x80; + +//! Parse flag instructing the parser to create PI nodes. +//! By default, PI nodes are not created. +//! Can be combined with other flags by use of | operator. +//!

+//! See xml_document::parse() function. +static const int parse_pi_nodes = 0x100; + +//! Parse flag instructing the parser to validate closing tag names. +//! If not set, name inside closing tag is irrelevant to the parser. +//! By default, closing tags are not validated. +//! Can be combined with other flags by use of | operator. +//!

+//! See xml_document::parse() function. +static const int parse_validate_closing_tags = 0x200; + +//! Parse flag instructing the parser to trim all leading and trailing whitespace of data nodes. +//! By default, whitespace is not trimmed. +//! This flag does not cause the parser to modify source text. +//! Can be combined with other flags by use of | operator. +//!

+//! See xml_document::parse() function. +static const int parse_trim_whitespace = 0x400; + +//! Parse flag instructing the parser to condense all whitespace runs of data nodes to a single +//! space character. Trimming of leading and trailing whitespace of data is controlled by +//! xsxml::parse_trim_whitespace flag. By default, whitespace is not normalized. If this flag is +//! specified, source text will be modified. Can be combined with other flags by use of | operator. +//!

+//! See xml_document::parse() function. +static const int parse_normalize_whitespace = 0x800; + +//! Parse flag instructing the parser to also translate the HTML entities &nbsp; and &emsp; into plain space characters. +//! This flag only takes effect when the 'parse_no_entity_translation' flag is not specified. +//!

+//! See xml_document::parse() function. +static const int parse_html_entity_translation = 0x1000; + +// Compound flags + +//! Parse flags which represent default behaviour of the parser. +//! This is always equal to 0, so that all other flags can be simply ored together. +//! Normally there is no need to inconveniently disable flags by anding with their negated (~) +//! values. This also means that meaning of each flag is a negation of the default setting. +//! For example, if flag name is xsxml::parse_no_utf8, it means that utf-8 is enabled by +//! default, and using the flag will disable it.

See xml_document::parse() function. +static const int parse_default = 0; + +//! A combination of parse flags that forbids any modifications of the source text. +//! This also results in faster parsing. However, note that the following will occur: +//!
    +//!
  • names and values of nodes will not be zero terminated, you have to use xml_base::name_size() +//! and xml_base::value_size() functions to determine where name and value ends
  • entities +//! will not be translated
  • whitespace will not be normalized
  • +//!
+//! See xml_document::parse() function. +static const int parse_non_destructive = parse_no_string_terminators | parse_no_entity_translation; + +//! A combination of parse flags resulting in fastest possible parsing, without sacrificing +//! important data.

See xml_document::parse() function. +static const int parse_fastest = parse_non_destructive | parse_no_data_nodes; + +//! A combination of parse flags resulting in largest amount of data being extracted. +//! This usually results in slowest parsing. +//!

+//! See xml_document::parse() function. +static const int parse_full = parse_declaration_node | parse_comment_nodes | parse_doctype_node | + parse_pi_nodes | parse_validate_closing_tags; + +static const int parse_normal = parse_no_data_nodes; + +typedef char char_t; + +class string_view +{ +public: + string_view() : _Mystr(nullptr), _Mysize(0) {} + string_view(char_t* str, size_t size) : _Mystr(str), _Mysize(size) {} + const char* c_str() const { return _Mystr != nullptr ? _Mystr : ""; } + size_t length() const { return _Mysize; } + bool empty() const { return _Mysize == 0; } + +private: + char_t* _Mystr; + size_t _Mysize; +}; + +// The sax3 parse callbacks +struct xml_sax3_parse_cb +{ + std::function xml_start_element_cb; + std::function xml_attr_cb; + std::function xml_end_attr_cb; + std::function xml_end_element_cb; + std::function xml_text_cb; +}; + +namespace internal +{ +//! \cond internal +// Struct that contains lookup tables for the parser +// It must be a template to allow correct linking (because it has static data members, which are +// defined in a header file). 
+template struct lookup_tables +{ + static const unsigned char lookup_whitespace[256]; // Whitespace table + static const unsigned char lookup_node_name[256]; // Node name table + static const unsigned char lookup_text[256]; // Text table + static const unsigned char lookup_text_pure_no_ws[256]; // Text table + static const unsigned char lookup_text_pure_with_ws[256]; // Text table + static const unsigned char lookup_attribute_name[256]; // Attribute name table + static const unsigned char lookup_attribute_data_1[256]; // Attribute data table with single quote + static const unsigned char + lookup_attribute_data_1_pure[256]; // Attribute data table with single quote + static const unsigned char + lookup_attribute_data_2[256]; // Attribute data table with double quotes + static const unsigned char + lookup_attribute_data_2_pure[256]; // Attribute data table with double quotes + static const unsigned char lookup_digits[256]; // Digits + static const unsigned char + lookup_upcase[256]; // To uppercase conversion table for ASCII characters +}; + +// Compare strings for equality +template +inline bool compare(const _CharT* p1, std::size_t size1, const _CharT* p2, std::size_t size2, + bool case_sensitive) +{ + if (size1 != size2) + return false; + if (case_sensitive) + { + for (const _CharT* end = p1 + size1; p1 < end; ++p1, ++p2) + if (*p1 != *p2) + return false; + } + else + { + for (const _CharT* end = p1 + size1; p1 < end; ++p1, ++p2) + if (lookup_tables<0>::lookup_upcase[static_cast(*p1)] != + lookup_tables<0>::lookup_upcase[static_cast(*p2)]) + return false; + } + return true; +} +} // namespace internal + +//! Parse error exception. +//! This exception is thrown by the parser when an error occurs. +//! Use what() function to get human-readable error message. +//! Use where() function to get a pointer to position within source text where error was detected. +//!

+//! If throwing exceptions by the parser is undesirable, +//! define the XSXML__PARSE_ERROR(what, where) macro before xsxml.hpp is included. +//! The parser expands that macro at every error site; when it is not defined by the user, +//! it defaults to `throw parse_error(what, where)`.

This class derives from +//! std::exception class. +class parse_error : public std::exception +{ + +public: + //! Constructs parse error + parse_error(const char* what, void* where) : m_what(what), m_where(where) {} + + //! Gets human readable description of error. + //! \return Pointer to null terminated description of the error. + virtual const char* what() const throw() { return m_what; } + + //! Gets pointer to character data where error happened. + //! _CharT should be the same as the character type of the text that was being parsed. + //! \return Pointer to location within the parsed string where error occurred. + template _CharT* where() const { return reinterpret_cast<_CharT*>(m_where); } + +private: + const char* m_what; + void* m_where; +}; + +//! SAX-style XML parser extracted from the rapidxml DOM parser. +//! This class has no base classes and builds no node tree. +//! While walking the source text it reports element starts, attributes, end of attributes, +//! element ends and text through the callbacks of the xml_sax3_parse_cb handler +//! that is passed to the constructor. +//! Use the static parse() function to run a temporary parser over a text buffer +//! and obtain the resulting parse_result. +class xml_sax3_parser +{ + xml_sax3_parse_cb* handler_; + + enum class parse_result + { + ok, + expected_close_tag, + unrecognized_tag, + }; + parse_result parse_result_ = parse_result::ok; + +public: + template + static parse_result parse(char_t* text, int length, xml_sax3_parse_cb* handler) + { + xml_sax3_parser parser(handler); + parser.parse(text, length); + return parser.parse_result_; + } + + //! Constructs a parser that reports to the given callback handler + xml_sax3_parser(xml_sax3_parse_cb* handler) { handler_ = handler; } + + //! Parses zero-terminated XML string according to given flags. + //! Passed string will be modified by the parser, unless xsxml::parse_non_destructive flag is + //! used. The string must persist for the lifetime of the document.
In case of error, + //! xsxml::parse_error exception will be thrown.

If you want to parse contents of a + //! file, you must first load the file into the memory, and pass pointer to its beginning. Make + //! sure that data is zero-terminated.

Document can be parsed into multiple times. Each + //! new call to parse removes previous nodes and attributes (if any), but does not clear memory + //! pool. \param text XML data to parse; pointer is non-const to denote fact that this data may be + //! modified by the parser. + template void parse(char_t* text, int length) + { + assert(text); + + // save last character and make buffer zero-terminated (speeds up parsing) + auto endch = text[length - 1]; + text[length - 1] = 0; + + // Parse BOM, if any + parse_bom(text); + + // Parse children + _L_Loop : { + // Skip whitespace before node + skip(text); + if (*text == 0) + goto _L_end; + + // Parse and append new child + if (*text == char_t('<')) + { + ++text; // Skip '<' + parse_node(text); + } + else + XSXML__PARSE_ERROR("expected <", text); + + goto _L_Loop; + } + + _L_end: + // check parse result. + if (parse_result_ == parse_result::ok) + { + if (endch == '<') + { + parse_result_ = parse_result::unrecognized_tag; + XSXML__PARSE_ERROR("unrecognized tag", text); + } + } + else + { // parse_result_: parse_result::expected_close_tag + if (endch == '>') + parse_result_ = parse_result::ok; + else + XSXML__PARSE_ERROR("expected >", text); + } + } + + //! Clears the document by deleting all nodes and clearing the memory pool. + //! All nodes owned by document pool are destroyed. 
+ void clear() + { + // this->remove_all_nodes(); + // this->remove_all_attributes(); + // memory_pool::clear(); + } + +private: + /////////////////////////////////////////////////////////////////////// + // Internal character utility functions + + // Detect whitespace character + struct whitespace_pred + { + static unsigned char test(char_t ch) + { + return internal::lookup_tables<0>::lookup_whitespace[static_cast(ch)]; + } + }; + + // Detect node name character + struct node_name_pred + { + static unsigned char test(char_t ch) + { + return internal::lookup_tables<0>::lookup_node_name[static_cast(ch)]; + } + }; + + // Detect attribute name character + struct attribute_name_pred + { + static unsigned char test(char_t ch) + { + return internal::lookup_tables<0>::lookup_attribute_name[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) + struct text_pred + { + static unsigned char test(char_t ch) + { + return internal::lookup_tables<0>::lookup_text[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) that does not require processing + struct text_pure_no_ws_pred + { + static unsigned char test(char_t ch) + { + return internal::lookup_tables<0>::lookup_text_pure_no_ws[static_cast(ch)]; + } + }; + + // Detect text character (PCDATA) that does not require processing + struct text_pure_with_ws_pred + { + static unsigned char test(char_t ch) + { + return internal::lookup_tables<0>::lookup_text_pure_with_ws[static_cast(ch)]; + } + }; + + // Detect attribute value character + template struct attribute_value_pred + { + static unsigned char test(char_t ch) + { + if (Quote == char_t('\'')) + return internal::lookup_tables<0>::lookup_attribute_data_1[static_cast(ch)]; + if (Quote == char_t('\"')) + return internal::lookup_tables<0>::lookup_attribute_data_2[static_cast(ch)]; + return 0; // Should never be executed, to avoid warnings on Comeau + } + }; + + // Detect attribute value character + template struct attribute_value_pure_pred + { + static 
unsigned char test(char_t ch) + { + if (Quote == char_t('\'')) + return internal::lookup_tables<0>::lookup_attribute_data_1_pure[static_cast( + ch)]; + if (Quote == char_t('\"')) + return internal::lookup_tables<0>::lookup_attribute_data_2_pure[static_cast( + ch)]; + return 0; // Should never be executed, to avoid warnings on Comeau + } + }; + + // Insert coded character, using UTF8 or 8-bit ASCII + template static void insert_coded_character(char_t*& text, unsigned long code) + { + if (Flags & parse_no_utf8) + { + // Insert 8-bit ASCII character + // Todo: possibly verify that code is less than 256 and use replacement char otherwise? + text[0] = static_cast(code); + text += 1; + } + else + { + // Insert UTF8 sequence + if (code < 0x80) // 1 byte sequence + { + text[0] = static_cast(code); + text += 1; + } + else if (code < 0x800) // 2 byte sequence + { + text[1] = static_cast((code | 0x80) & 0xBF); + code >>= 6; + text[0] = static_cast(code | 0xC0); + text += 2; + } + else if (code < 0x10000) // 3 byte sequence + { + text[2] = static_cast((code | 0x80) & 0xBF); + code >>= 6; + text[1] = static_cast((code | 0x80) & 0xBF); + code >>= 6; + text[0] = static_cast(code | 0xE0); + text += 3; + } + else if (code < 0x110000) // 4 byte sequence + { + text[3] = static_cast((code | 0x80) & 0xBF); + code >>= 6; + text[2] = static_cast((code | 0x80) & 0xBF); + code >>= 6; + text[1] = static_cast((code | 0x80) & 0xBF); + code >>= 6; + text[0] = static_cast(code | 0xF0); + text += 4; + } + else // Invalid, only codes up to 0x10FFFF are allowed in Unicode + { + XSXML__PARSE_ERROR("invalid numeric character entity", text); + } + } + } + + // Skip characters until predicate evaluates to true + template static void skip(char_t*& text) + { + char_t* tmp = text; + while (StopPred::test(*tmp)) + ++tmp; + text = tmp; + } + + // Skip characters until predicate evaluates to true while doing the following: + // - replacing XML character entity references with proper characters (' & " < + // 
> &#...;) + // - condensing whitespace sequences to single space character + template + static char_t* skip_and_expand_character_refs(char_t*& text) + { + // If entity translation, whitespace condense and whitespace trimming is disabled, use plain + // skip + if (Flags & parse_no_entity_translation && !(Flags & parse_normalize_whitespace) && + !(Flags & parse_trim_whitespace)) + { + skip(text); + return text; + } + + // Use simple skip until first modification is detected + skip(text); + + // Use translation skip + char_t* src = text; + char_t* dest = src; + while (StopPred::test(*src)) + { + // If entity translation is enabled + if (!(Flags & parse_no_entity_translation)) + { + // Test if replacement is needed + if (src[0] == char_t('&')) + { + switch (src[1]) + { + + // & ' + case char_t('a'): + if (src[2] == char_t('m') && src[3] == char_t('p') && src[4] == char_t(';')) + { + *dest = char_t('&'); + ++dest; + src += 5; + continue; + } + if (src[2] == char_t('p') && src[3] == char_t('o') && src[4] == char_t('s') && + src[5] == char_t(';')) + { + *dest = char_t('\''); + ++dest; + src += 6; + continue; + } + break; + + // " + case char_t('q'): + if (src[2] == char_t('u') && src[3] == char_t('o') && src[4] == char_t('t') && + src[5] == char_t(';')) + { + *dest = char_t('"'); + ++dest; + src += 6; + continue; + } + break; + + // > + case char_t('g'): + if (src[2] == char_t('t') && src[3] == char_t(';')) + { + *dest = char_t('>'); + ++dest; + src += 4; + continue; + } + break; + + // < + case char_t('l'): + if (src[2] == char_t('t') && src[3] == char_t(';')) + { + *dest = char_t('<'); + ++dest; + src += 4; + continue; + } + break; + + // &#...; - assumes ASCII + case char_t('#'): + if (src[2] == char_t('x')) + { + unsigned long code = 0; + src += 3; // Skip &#x + while (1) + { + unsigned char digit = + internal::lookup_tables<0>::lookup_digits[static_cast(*src)]; + if (digit == 0xFF) + break; + code = code * 16 + digit; + ++src; + } + insert_coded_character(dest, 
code); // Put character in output + } + else + { + unsigned long code = 0; + src += 2; // Skip &# + while (1) + { + unsigned char digit = + internal::lookup_tables<0>::lookup_digits[static_cast(*src)]; + if (digit == 0xFF) + break; + code = code * 10 + digit; + ++src; + } + insert_coded_character(dest, code); // Put character in output + } + if (*src == char_t(';')) + ++src; + else + XSXML__PARSE_ERROR("expected ;", src); + continue; + + // Something else + default: + if (Flags & parse_html_entity_translation) + { + switch (src[1]) + { //   + case char_t('n'): + if (src[2] == char_t('b') && src[3] == char_t('s') && src[4] == char_t('p') && + src[5] == char_t(';')) + { + *dest = char_t(' '); + ++dest; + src += 6; + continue; + } + break; + //   + case char_t('e'): + if (src[2] == char_t('m') && src[3] == char_t('s') && src[4] == char_t('p') && + src[5] == char_t(';')) + { + *dest = char_t(' '); + ++dest; + *dest = char_t(' '); + ++dest; + src += 6; + continue; + } + break; + } + } + // Ignore, just copy '&' verbatim + break; + } + } + } + + // If whitespace condensing is enabled + if (Flags & parse_normalize_whitespace) + { + // Test if condensing is needed + if (whitespace_pred::test(*src)) + { + *dest = char_t(' '); + ++dest; // Put single space in dest + ++src; // Skip first whitespace char + // Skip remaining whitespace chars + while (whitespace_pred::test(*src)) + ++src; + continue; + } + } + + // No replacement, only copy character + *dest++ = *src++; + } + + // Return new end + text = src; + return dest; + } + + /////////////////////////////////////////////////////////////////////// + // Internal parsing functions + + // Parse UTF-8 BOM, if any + inline void parse_bom(char*& text) + { + if (static_cast(text[0]) == 0xEF && + static_cast(text[1]) == 0xBB && static_cast(text[2]) == 0xBF) + { + text += 3; + } + } + + // Parse UTF-16/32 BOM, if any + inline void parse_bom(wchar_t*& text) + { + const wchar_t bom = 0xFEFF; + if (text[0] == bom) + { + ++text; + } + } 
+ // Parse XML declaration ( void parse_xml_declaration(char_t*& text) + { + // If parsing of declaration is disabled + if (!(Flags & parse_declaration_node)) + { + // Skip until end of declaration + while (text[0] != char_t('?') || text[1] != char_t('>')) + { + if (!text[0]) + XSXML__PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 2; // Skip '?>' + return; // return 0; + } + + // Create declaration + // xml_node *declaration = this->allocate_node(node_declaration); + + // Skip whitespace before attributes or ?> + skip(text); + + // Parse declaration attributes + parse_node_attributes(text /*, declaration*/); + + // Skip ?> + if (text[0] != char_t('?') || text[1] != char_t('>')) + XSXML__PARSE_ERROR("expected ?>", text); + text += 2; + + // return declaration; + } + + // Parse XML comment (' + return; // return 0; // Do not produce comment node + } + + // Skip until end of comment + while (text[0] != char_t('-') || text[1] != char_t('-') || text[2] != char_t('>')) + { + if (!text[0]) + XSXML__PARSE_ERROR("unexpected end of data", text); + ++text; + } + + // Create comment node + // xml_node *comment = this->allocate_node(node_comment); + // comment->value(value, text - value); // TODO: DNT implement comment + + // Place zero terminator after comment value + if (!(Flags & parse_no_string_terminators)) + *text = char_t('\0'); + + text += 3; // Skip '-->' + return; + } + + // Parse DOCTYPE + template void parse_doctype(char_t*& text) + { + // Skip to > + while (*text != char_t('>')) + { + // Determine character type + switch (*text) + { + + // If '[' encountered, scan for matching ending ']' using naive algorithm with depth + // This works for all W3C test files except for 2 most wicked + case char_t('['): { + ++text; // Skip '[' + int depth = 1; + while (depth > 0) + { + switch (*text) + { + case char_t('['): + ++depth; + break; + case char_t(']'): + --depth; + break; + case 0: + XSXML__PARSE_ERROR("unexpected end of data", text); + default: + 
break; + } + ++text; + } + break; + } + + // Error on end of text + case char_t('\0'): + XSXML__PARSE_ERROR("unexpected end of data", text); + + // Other character, skip it + default: + ++text; + } + } + + // If DOCTYPE nodes enabled + if (Flags & parse_doctype_node) + { // SAX3: ignore doctype node + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + *text = char_t('\0'); + + text += 1; // skip '>' + + return; // return doctype; + } + else + { + text += 1; // skip '>' + return; // return 0; + } + } + + // Parse PI + template void parse_pi(char_t*& text) + { + // If creation of PI nodes is enabled + if (Flags & parse_pi_nodes) + { + // Create pi node + // xml_node *pi = this->allocate_node(node_pi); + + // Extract PI target name + char_t* name = text; + skip(text); + if (text == name) + XSXML__PARSE_ERROR("expected PI target", text); + // pi->name(name, text - name); // TODO: DNT notify for pi + + // Skip whitespace between pi target and pi + skip(text); + + // Skip to '?>' + while (text[0] != char_t('?') || text[1] != char_t('>')) + { + if (*text == char_t('\0')) + XSXML__PARSE_ERROR("unexpected end of data", text); + ++text; + } + + text += 2; // Skip '?>' + return; // return pi; + } + else + { + // Skip to '?>' + while (text[0] != char_t('?') || text[1] != char_t('>')) + { + if (*text == char_t('\0')) + XSXML__PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 2; // Skip '?>' + return; // return 0; + } + } + + // Parse and append data + // Return character that ends data. 
+ // This is necessary because this character might have been overwritten by a terminating 0 + template + char_t parse_and_append_data(/*const string_view& elementName unused for SAX,*/ char_t*& text, + char_t* contents_start) + { + // Backup to contents start if whitespace trimming is disabled + if (!(Flags & parse_trim_whitespace)) + text = contents_start; + + // Skip until end of data + char_t *value = text, *end; + if (Flags & parse_normalize_whitespace) + end = skip_and_expand_character_refs(text); + else + end = skip_and_expand_character_refs(text); + + // Trim trailing whitespace if flag is set; leading was already trimmed by whitespace skip after + // > + if (Flags & parse_trim_whitespace) + { + if (Flags & parse_normalize_whitespace) + { + // Whitespace is already condensed to single space characters by skipping function, so just + // trim 1 char off the end + if (*(end - 1) == char_t(' ')) + --end; + } + else + { + // Backup until non-whitespace character is found + while (whitespace_pred::test(*(end - 1))) + --end; + } + } + + char_t ch = *text; + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + { + // char_t ch = *text; + *end = char_t('\0'); + // return ch; // Return character that ends data; this is required because zero + // terminator overwritten it + } + + handler_->xml_text_cb(value, end - value); + + // Return character that ends data + return ch; + } + + // Parse CDATA + template void parse_cdata(char_t*& text) + { + // If CDATA is disabled + if (Flags & parse_no_data_nodes) + { + // Skip until end of cdata + while (text[0] != char_t(']') || text[1] != char_t(']') || text[2] != char_t('>')) + { + if (!text[0]) + XSXML__PARSE_ERROR("unexpected end of data", text); + ++text; + } + text += 3; // Skip ]]> + return; // return 0; // Do not produce CDATA node + } + + // Skip until end of cdata + while (text[0] != char_t(']') || text[1] != char_t(']') || text[2] != char_t('>')) + { + if (!text[0]) + 
XSXML__PARSE_ERROR("unexpected end of data", text); + ++text; + } + + // Place zero terminator after value + if (!(Flags & parse_no_string_terminators)) + *text = char_t('\0'); + + text += 3; // Skip ]]> + return; // return cdata; + } + + // Parse element node + template void parse_element(char_t*& text) + { + // Create element node + // xml_node *element = this->allocate_node(node_element); + + // Extract element name + auto mark = text; + skip(text); + size_t n = text - mark; + if (n == 0) + XSXML__PARSE_ERROR("expected element name", text); + + // Skip whitespace between element name and attributes or > + skip(text); + + auto chTmp = *text; + // Place zero terminator after name + if (!(Flags & parse_no_string_terminators)) + mark[n] = (char_t)'\0'; + + // Notify start element + handler_->xml_start_element_cb(mark, n); // + + // Parse attributes, if any + if (chTmp != '>' && chTmp != char_t('/')) + { + parse_node_attributes(text); + chTmp = *text; + } + + // Notify end attr + handler_->xml_end_attr_cb(); + + // Determine ending type + if (chTmp == char_t('>')) + { + ++text; + parse_node_contents(text, mark, n); + } + else if (chTmp == char_t('/')) + { + ++text; + if (*text != char_t('>')) + { + parse_result_ = parse_result::expected_close_tag; + if (*text != 0) + XSXML__PARSE_ERROR("expected >", text); + } + else + ++text; + } + else + { + if (chTmp != 0) + { + parse_result_ = parse_result::expected_close_tag; + XSXML__PARSE_ERROR("expected >", text); + } // else, parse to eof + } + + // Return parsed element + handler_->xml_end_element_cb(mark, n); + // return element; + } + + // Determine node type, and parse it + template void parse_node(char_t*& text) + { + // Parse proper node type + switch (text[0]) + { + + // <... 
+ default: + // Parse and append element node + return parse_element(text); + + // (text); + } + else + { + // Parse PI + return parse_pi(text); + } + + // (text); + } + break; + + // (text); + } + break; + + // (text); + } + break; + + default: + break; + } // switch + + // Attempt to skip other, unrecognized node types starting with ')) + { + if (*text == 0) + XSXML__PARSE_ERROR("unexpected end of data", text); + ++text; + } + ++text; // Skip '>' + return; // return 0; // No node recognized + } + } + + // Parse contents of the node - children, data etc. + template void parse_node_contents(char_t*& text, const char_t* mark, size_t n) + { + // For all children and text + while (1) + { + // Skip whitespace between > and node contents + char_t* contents_start = text; // Store start of node contents before whitespace is skipped + skip(text); + char_t next_char = *text; + + // After data nodes, instead of continuing the loop, control jumps here. + // This is because zero termination inside parse_and_append_data() function + // would wreak havoc with the above code. + // Also, skipping whitespace after data nodes is unnecessary. + after_data_node: + + // Determine what comes next: node closing, child node, data node, or 0? 
+ switch (next_char) + { + + // Node closing or child node + case char_t('<'): + if (text[1] == char_t('/')) + { + // Node closing + text += 2; // Skip '(text); + if (!internal::compare(mark, n, closing_name, text - closing_name, true)) + XSXML__PARSE_ERROR("invalid closing tag name", text); + } + else + skip(text); // No validation, just skip name + + // Skip remaining whitespace after node name + skip(text); + if (*text != char_t('>')) + { + parse_result_ = parse_result::expected_close_tag; + if (*text != 0) + XSXML__PARSE_ERROR("expected >", text); + } + else + ++text; // Skip '>' + return; // Node closed, finished parsing contents + } + else + { + // Child node + ++text; // Skip '<' + parse_node(text); + } + break; + + // End of data - error + case char_t('\0'): + XSXML__PARSE_ERROR("unexpected end of data", text); + + // Data node + default: + next_char = parse_and_append_data(/*elementName, */ text, contents_start); + goto after_data_node; // Bypass regular processing after data nodes + } + } + } + + // Parse XML attributes of the node + template void parse_node_attributes(char_t*& text) + { + // For all attributes + while (attribute_name_pred::test(*text)) + { + // Extract attribute name + char_t* name = text; + ++text; // Skip first character of attribute name + skip(text); + if (text == name) + XSXML__PARSE_ERROR("expected attribute name", name); + + // Create new attribute + // xml_attribute *attribute = this->allocate_attribute(); + // attribute->name(name, text - name); + auto namesize = text - name; + // node->append_attribute(attribute); + + // Skip whitespace after attribute name + skip(text); + + // Skip = + if (*text != char_t('=')) + XSXML__PARSE_ERROR("expected =", text); + ++text; + + // Add terminating zero after name + if (!(Flags & parse_no_string_terminators)) + name[namesize] = 0; + + // Skip whitespace after = + skip(text); + + // Skip quote and remember if it was ' or " + char_t quote = *text; + if (quote != char_t('\'') && quote != 
char_t('"')) + XSXML__PARSE_ERROR("expected ' or \"", text); + ++text; + + // Extract attribute value and expand char refs in it + char_t *value = text, *end; + const int AttFlags = + Flags & ~parse_normalize_whitespace; // No whitespace normalization in attributes + if (quote == char_t('\'')) + end = + skip_and_expand_character_refs, + attribute_value_pure_pred, AttFlags>(text); + else + end = + skip_and_expand_character_refs, + attribute_value_pure_pred, AttFlags>(text); + + // Set attribute value + // attribute->value(value, end - value); + auto valuesize = end - value; + + // Make sure that end quote is present + if (*text != quote) + XSXML__PARSE_ERROR("expected ' or \"", text); + ++text; // Skip quote + + // Add terminating zero after value + if (!(Flags & parse_no_string_terminators)) + value[valuesize] = 0; + + handler_->xml_attr_cb(name, namesize, value, valuesize); + + // Skip whitespace after attribute value + skip(text); + } + } + +}; /* CLASS xml_sax3_parser */ + +//! \cond internal +namespace internal +{ +// Whitespace (space \n \r \t) +template +const unsigned char lookup_tables::lookup_whitespace[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, // 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1 + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 7 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // C + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // D + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 
E + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F +}; + +// Node name (anything but space \n \r \t / > ? \0) +template +const unsigned char lookup_tables::lookup_node_name[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Text (i.e. 
PCDATA) (anything but < \0) +template +const unsigned char lookup_tables::lookup_text[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Text (i.e. 
PCDATA) that does not require processing when ws normalization is disabled +// (anything but < \0 &) +template +const unsigned char lookup_tables::lookup_text_pure_no_ws[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Text (i.e. 
PCDATA) that does not require processing when ws normalizationis is enabled +// (anything but < \0 & space \n \r \t) +template +const unsigned char lookup_tables::lookup_text_pure_with_ws[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Attribute name (anything but space \n \r \t / < > = ? ! 
\0) +template +const unsigned char lookup_tables::lookup_attribute_name[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Attribute data with single quote (anything but ' \0) +template +const unsigned char lookup_tables::lookup_attribute_data_1[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 
E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Attribute data with single quote that does not require processing (anything but ' \0 &) +template +const unsigned char lookup_tables::lookup_attribute_data_1_pure[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Attribute data with double quote (anything but " \0) +template +const unsigned char lookup_tables::lookup_attribute_data_2[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Attribute data with double quote that does not require processing (anything but " \0 &) +template +const unsigned char lookup_tables::lookup_attribute_data_2_pure[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1 + 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // E + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // F +}; + +// Digits (dec and hex, 255 denotes end of numeric character reference) +template +const unsigned char lookup_tables::lookup_digits[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 1 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 2 + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, // 3 + 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 4 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 5 + 255, 
10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 6 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 7 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 8 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 9 + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // A + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // B + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // C + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // D + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // E + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 // F +}; + +// Upper case conversion +template +const unsigned char lookup_tables::lookup_upcase[256] = { + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A B C D E F + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0 + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, // 1 + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, // 2 + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, // 3 + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // 4 + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, // 5 + 96, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, // 6 + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 123, 124, 125, 126, 127, // 7 + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, // 8 + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, // 9 + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, // A + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, // B + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, // 
C + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, // D + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, // E + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 // F +}; +} // namespace internal + //! \endcond +} // namespace xsxml + +// Undefine internal macros +#undef XSXML__PARSE_ERROR + +// On MSVC, restore warnings state +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +#endif