diff --git a/src/pugixml.cpp b/src/pugixml.cpp index c3df93b..f9a7012 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3903,53 +3903,114 @@ PUGI__NS_BEGIN xml_encoding encoding; }; + template PUGI__FN PUGI__UNSIGNED_OVERFLOW char_t* integer_to_string(char_t* begin, char_t* end, U value, bool negative) + { + char_t* result = end - 1; + U rest = negative ? 0 - value : value; + + do + { + *result-- = static_cast('0' + (rest % 10)); + rest /= 10; + } + while (rest); + + assert(result >= begin); + (void)begin; + + *result = '-'; + + return result + !negative; + } + + static void char_output_escaped(xml_buffered_writer& writer, const char_t c, chartypex_t type, unsigned int flags) + { + switch (c) + { + case '&': + writer.write('&', 'a', 'm', 'p', ';'); + break; + case '<': + writer.write('&', 'l', 't', ';'); + break; + case '>': + writer.write('&', 'g', 't', ';'); + break; + case '"': + if (flags & format_attribute_single_quote) + writer.write('"'); + else + writer.write('&', 'q', 'u', 'o', 't', ';'); + break; + case '\'': + if (flags & format_attribute_single_quote) + writer.write('&', 'a', 'p', 'o', 's', ';'); + else + writer.write('\''); + break; + default: + { + if (PUGI__IS_CHARTYPEX(c, type)) + { + unsigned int ch = static_cast(c); + assert(ch < 32); + + // handle control codes + if (!(flags & format_skip_control_chars)) + writer.write('&', '#', static_cast((ch / 10) + '0'), static_cast((ch % 10) + '0'), ';'); + } + else + writer.write(c); + } + } + } + PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags) { while (*s) { - const char_t* prev = s; - - // While *s is a usual symbol - PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type)); - - writer.write_buffer(prev, static_cast(s - prev)); - - switch (*s) + if (flags & format_escape_nonascii) { - case 0: break; - case '&': - writer.write('&', 'a', 'm', 'p', ';'); - ++s; - break; - case '<': - writer.write('&', 'l', 't', ';'); - ++s; - break; - case '>': - writer.write('&', 'g', 't', ';'); - ++s; - break; - case '"': - if (flags & format_attribute_single_quote) - writer.write('"'); - else - writer.write('&', 'q', 'u', 'o', 't', ';'); - ++s; - break; - case '\'': - if (flags & format_attribute_single_quote) - writer.write('&', 'a', 'p', 'o', 's', ';'); - else - writer.write('\''); - ++s; - break; - default: // s is not a usual symbol - { - unsigned int ch = static_cast(*s++); - assert(ch < 32); + int cplen = 1; + if (((*s) & 0xf8) == 0xf0) cplen = 4; + else if (((*s) & 0xf0) == 0xe0) cplen = 3; + else if (((*s) & 0xe0) == 0xc0) cplen = 2; - if (!(flags & format_skip_control_chars)) - writer.write('&', '#', static_cast((ch / 10) + '0'), static_cast((ch % 10) + '0'), ';'); + if (cplen == 1) + { + char_output_escaped(writer, *s, type, flags); + ++s; + } + else + { + unsigned int value = static_cast((*s) & (0xff >> (cplen + 1))) << ((cplen - 1) * 6); + ++s; + for (int len = cplen -1; len && *s != 0; --len) + { + value |= (static_cast(*s) - 0x80) << ((len - 1) * 6); + ++s; + } + writer.write('&', '#'); + char_t buf[8]; + char_t* end = buf + sizeof(buf) / sizeof(buf[0]); + char_t* begin = integer_to_string(buf, end, value, false); + writer.write_buffer(begin, static_cast(end - begin)); + writer.write(';'); + } + } + else + { + const char_t* prev = s; + + // While *s is a usual symbol + PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type)); + + writer.write_buffer(prev, static_cast(s - prev)); + + if (*s != 0) + { + char_output_escaped(writer, *s, type, flags); + ++s; } } } @@ -4625,26 +4686,6 @@ PUGI__NS_BEGIN } #endif - template PUGI__FN PUGI__UNSIGNED_OVERFLOW char_t* integer_to_string(char_t* begin, char_t* end, U value, bool negative) - { - char_t* result = end - 1; - U rest = negative ? 0 - value : value; - - do - { - *result-- = static_cast('0' + (rest % 10)); - rest /= 10; - } - while (rest); - - assert(result >= begin); - (void)begin; - - *result = '-'; - - return result + !negative; - } - // set value with conversion functions template PUGI__FN bool set_value_ascii(String& dest, Header& header, uintptr_t header_mask, char* buf) diff --git a/src/pugixml.hpp b/src/pugixml.hpp index f658109..aeb870d 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -259,6 +259,9 @@ namespace pugi // Use single quotes ' instead of double quotes " for enclosing attribute values. This flag is off by default. const unsigned int format_attribute_single_quote = 0x200; + // Escape non-ascii attribute values and PCDATA contents. This flag is off by default. + const unsigned int format_escape_nonascii = 0x400; + // The default set of formatting flags. // Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none. const unsigned int format_default = format_indent; diff --git a/tests/test.cpp b/tests/test.cpp index a97116e..1803fb2 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -58,6 +58,11 @@ bool test_node(const pugi::xml_node& node, const pugi::char_t* contents, const p node.print(writer, indent, flags, get_native_encoding()); + if (writer.as_string() != contents) + { + printf("found: %s\n", writer.as_string().c_str()); + } + return writer.as_string() == contents; } diff --git a/tests/test_write.cpp b/tests/test_write.cpp index 0410e82..0a0214c 100644 --- a/tests/test_write.cpp +++ b/tests/test_write.cpp @@ -188,15 +188,30 @@ TEST(write_doctype_null) CHECK_NODE(doc, STR("")); } -TEST_XML(write_escape, "text") +TEST_XML(write_escape_1, "text") { doc.child(STR("node")).attribute(STR("attr")) = STR("<>'\"&\x04\r\n\t"); doc.child(STR("node")).first_child().set_value(STR("<>'\"&\x04\r\n\t")); CHECK_NODE(doc, STR("'"& \"><>'\"&\r\n\t")); +} + +TEST_XML(write_escape_2, "text") +{ + doc.child(STR("node")).attribute(STR("attr")) = STR("<>'\"&\x04\r\n\t"); + doc.child(STR("node")).first_child().set_value(STR("<>'\"&\x04\r\n\t")); + CHECK_NODE_EX(doc, STR("<>'\"&\r\n\t"), STR(""), format_raw | format_attribute_single_quote); } +TEST_XML(write_escape_3, "text") +{ + doc.child(STR("node")).attribute(STR("attr")) = STR("<>'\"&äöü"); + doc.child(STR("node")).first_child().set_value(STR("<>'\"&äöü")); + + CHECK_NODE_EX(doc, STR("<>'"&äöü"), STR(""), format_raw | format_escape_nonascii); +} + TEST_XML(write_escape_roundtrip, "text") { doc.child(STR("node")).attribute(STR("attr")) = STR("<>'\"&\x04\r\n\t");