Fix #342: escape non-ASCII characters too

This commit is contained in:
pamapa 2020-04-18 17:43:08 +02:00
parent 5e64076af9
commit 1fd5258133
4 changed files with 126 additions and 62 deletions

View File

@ -3903,53 +3903,114 @@ PUGI__NS_BEGIN
xml_encoding encoding;
};
// Formats `value` as decimal text, right-aligned so that the last digit lands at `end - 1`.
// begin/end: bounds of the caller-provided buffer; `begin` is only consulted by the assertion.
// value:     magnitude source; when `negative` is set the digits of `0 - value` are written.
// negative:  selects whether the returned range starts at the '-' sign.
// Returns a pointer to the first character of the result. No terminator is written.
// A '-' is always stored in the slot just before the digits, so the buffer must have room for
// the digits plus one extra character even for non-negative input.
template <typename U> PUGI__FN PUGI__UNSIGNED_OVERFLOW char_t* integer_to_string(char_t* begin, char_t* end, U value, bool negative)
{
	U magnitude = value;

	if (negative)
		magnitude = 0 - value;

	char_t* cursor = end;

	// emit digits from least to most significant, walking backwards from the buffer end
	for (;;)
	{
		--cursor;
		*cursor = static_cast<char_t>('0' + (magnitude % 10));

		magnitude /= 10;

		if (!magnitude)
			break;
	}

	// step onto the slot reserved for the sign
	--cursor;

	assert(cursor >= begin);
	(void)begin;

	*cursor = '-';

	return negative ? cursor : cursor + 1;
}
// Writes a single character `c` to `writer`, substituting an XML escape where required.
// - '&', '<' and '>' are always written as &amp;, &lt; and &gt;.
// - '"' is written as &quot; unless format_attribute_single_quote is set, in which case it is written raw.
// - '\'' is written as &apos; only when format_attribute_single_quote is set, otherwise raw.
// - any other character that PUGI__IS_CHARTYPEX classifies under `type` is asserted to be a control code
//   (< 32) and written as a two-digit decimal reference, or dropped when format_skip_control_chars is set.
// - everything else is written unchanged.
static void char_output_escaped(xml_buffered_writer& writer, const char_t c, chartypex_t type, unsigned int flags)
{
	const bool single_quoted = (flags & format_attribute_single_quote) != 0;

	if (c == '&')
	{
		writer.write('&', 'a', 'm', 'p', ';');
		return;
	}

	if (c == '<')
	{
		writer.write('&', 'l', 't', ';');
		return;
	}

	if (c == '>')
	{
		writer.write('&', 'g', 't', ';');
		return;
	}

	if (c == '"')
	{
		// a double quote cannot terminate a single-quoted attribute, so it may stay raw there
		if (single_quoted)
			writer.write('"');
		else
			writer.write('&', 'q', 'u', 'o', 't', ';');
		return;
	}

	if (c == '\'')
	{
		if (single_quoted)
			writer.write('&', 'a', 'p', 'o', 's', ';');
		else
			writer.write('\'');
		return;
	}

	// not flagged by the character class: pass through untouched
	if (!PUGI__IS_CHARTYPEX(c, type))
	{
		writer.write(c);
		return;
	}

	const unsigned int code = static_cast<unsigned int>(c);
	assert(code < 32);

	// control codes are either skipped entirely or written as &#NN;
	if (flags & format_skip_control_chars)
		return;

	writer.write('&', '#', static_cast<char_t>((code / 10) + '0'), static_cast<char_t>((code % 10) + '0'), ';');
}
// Writes the null-terminated string `s` to `writer`, escaping characters according to `type` and `flags`.
// NOTE(review): this span reads like a diff hunk in which removed and added lines are interleaved
// (e.g. `switch (*s)` is directly followed by `if (flags & format_escape_nonascii)`), and its braces do
// not balance within the visible lines. It is not compilable as shown; reconcile it against the actual
// source file before changing any code here.
PUGI__FN void text_output_escaped(xml_buffered_writer& writer, const char_t* s, chartypex_t type, unsigned int flags)
{
while (*s)
{
const char_t* prev = s;
// While *s is a usual symbol
// (presumably the macro advances s past characters not classified by `type` - confirm against its definition)
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type));
// Flush the scanned run [prev, s) unchanged
writer.write_buffer(prev, static_cast<size_t>(s - prev));
switch (*s)
// When format_escape_nonascii is set, the string is processed one character sequence at a time
if (flags & format_escape_nonascii)
{
case 0: break;
case '&':
writer.write('&', 'a', 'm', 'p', ';');
++s;
break;
case '<':
writer.write('&', 'l', 't', ';');
++s;
break;
case '>':
writer.write('&', 'g', 't', ';');
++s;
break;
case '"':
if (flags & format_attribute_single_quote)
writer.write('"');
else
writer.write('&', 'q', 'u', 'o', 't', ';');
++s;
break;
case '\'':
if (flags & format_attribute_single_quote)
writer.write('&', 'a', 'p', 'o', 's', ';');
else
writer.write('\'');
++s;
break;
default: // s is not a usual symbol
{
unsigned int ch = static_cast<unsigned int>(*s++);
assert(ch < 32);
// Sequence length from the high bits of the current character:
// 11110xxx -> 4, 1110xxxx -> 3, 110xxxxx -> 2, anything else -> 1.
// NOTE(review): presumably assumes `s` holds UTF-8 code units - confirm for wide-character builds.
int cplen = 1;
if (((*s) & 0xf8) == 0xf0) cplen = 4;
else if (((*s) & 0xf0) == 0xe0) cplen = 3;
else if (((*s) & 0xe0) == 0xc0) cplen = 2;
if (!(flags & format_skip_control_chars))
writer.write('&', '#', static_cast<char_t>((ch / 10) + '0'), static_cast<char_t>((ch % 10) + '0'), ';');
// Single-character case: delegate to the per-character escaper
if (cplen == 1)
{
char_output_escaped(writer, *s, type, flags);
++s;
}
else
{
// Keep the low (7 - cplen) payload bits of the lead character and shift them into position
unsigned int value = static_cast<unsigned char>((*s) & (0xff >> (cplen + 1))) << ((cplen - 1) * 6);
++s;
// Fold in up to cplen - 1 following characters, 6 bits each; stops early at the terminator
for (int len = cplen -1; len && *s != 0; --len)
{
// NOTE(review): the subtraction goes negative when the character is below 0x80 (i.e. not of the
// form 10xxxxxx); the accumulated value could then need more digits than `buf` below can hold - verify.
value |= (static_cast<unsigned char>(*s) - 0x80) << ((len - 1) * 6);
++s;
}
// Emit the accumulated value as a decimal character reference: &#<value>;
writer.write('&', '#');
// integer_to_string always stores a '-' in the slot before the digits, so buf needs digits + 1 entries
char_t buf[8];
char_t* end = buf + sizeof(buf) / sizeof(buf[0]);
char_t* begin = integer_to_string<unsigned int>(buf, end, value, false);
writer.write_buffer(begin, static_cast<size_t>(end - begin));
writer.write(';');
}
}
else
{
// Flag not set: flush the scanned run unchanged, then escape the character that stopped the scan
const char_t* prev = s;
// While *s is a usual symbol
PUGI__SCANWHILE_UNROLL(!PUGI__IS_CHARTYPEX(ss, type));
writer.write_buffer(prev, static_cast<size_t>(s - prev));
// The scan may also have stopped at the terminator, which must not be written
if (*s != 0)
{
char_output_escaped(writer, *s, type, flags);
++s;
}
}
}
@ -4625,26 +4686,6 @@ PUGI__NS_BEGIN
}
#endif
// Formats `value` as decimal text, right-aligned so that the last digit lands at `end - 1`.
// begin/end: bounds of the caller-provided buffer; `begin` is only consulted by the assertion.
// value:     magnitude source; when `negative` is set the digits of `0 - value` are written.
// negative:  selects whether the returned range starts at the '-' sign.
// Returns a pointer to the first character of the result. No terminator is written.
// A '-' is always stored in the slot just before the digits, so the buffer must have room for
// the digits plus one extra character even for non-negative input.
template <typename U> PUGI__FN PUGI__UNSIGNED_OVERFLOW char_t* integer_to_string(char_t* begin, char_t* end, U value, bool negative)
{
	U magnitude = value;

	if (negative)
		magnitude = 0 - value;

	char_t* cursor = end;

	// emit digits from least to most significant, walking backwards from the buffer end
	for (;;)
	{
		--cursor;
		*cursor = static_cast<char_t>('0' + (magnitude % 10));

		magnitude /= 10;

		if (!magnitude)
			break;
	}

	// step onto the slot reserved for the sign
	--cursor;

	assert(cursor >= begin);
	(void)begin;

	*cursor = '-';

	return negative ? cursor : cursor + 1;
}
// set value with conversion functions
template <typename String, typename Header>
PUGI__FN bool set_value_ascii(String& dest, Header& header, uintptr_t header_mask, char* buf)

View File

@ -259,6 +259,9 @@ namespace pugi
// Use single quotes ' instead of double quotes " for enclosing attribute values. This flag is off by default.
const unsigned int format_attribute_single_quote = 0x200;
// Escape non-ASCII characters in attribute values and PCDATA contents as decimal numeric character references (e.g. &#228;).
// This flag is off by default.
const unsigned int format_escape_nonascii = 0x400;
// The default set of formatting flags.
// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
const unsigned int format_default = format_indent;

View File

@ -58,6 +58,11 @@ bool test_node(const pugi::xml_node& node, const pugi::char_t* contents, const p
node.print(writer, indent, flags, get_native_encoding());
if (writer.as_string() != contents)
{
printf("found: %s\n", writer.as_string().c_str());
}
return writer.as_string() == contents;
}

View File

@ -188,15 +188,30 @@ TEST(write_doctype_null)
CHECK_NODE(doc, STR("<!DOCTYPE>"));
}
TEST_XML(write_escape, "<node attr=''>text</node>")
// Default output: attribute values are double-quoted, so '"' is escaped and '\'' is not;
// whitespace control characters are escaped in the attribute but kept verbatim in the text node.
TEST_XML(write_escape_1, "<node attr=''>text</node>")
{
	// XML-special characters followed by a control character and whitespace
	const pugi::char_t* payload = STR("<>'\"&\x04\r\n\t");

	pugi::xml_node node = doc.child(STR("node"));

	node.attribute(STR("attr")) = payload;
	node.first_child().set_value(payload);

	CHECK_NODE(doc, STR("<node attr=\"&lt;>'&quot;&amp;&#04;&#13;&#10;&#09;\">&lt;&gt;'\"&amp;&#04;\r\n\t</node>"));
}
// Single-quoted attributes: '\'' is escaped as &apos; inside the attribute and '"' is left raw;
// the text node output is the same as with double-quoted attributes.
TEST_XML(write_escape_2, "<node attr=''>text</node>")
{
	// XML-special characters followed by a control character and whitespace
	const pugi::char_t* payload = STR("<>'\"&\x04\r\n\t");

	pugi::xml_node node = doc.child(STR("node"));

	node.attribute(STR("attr")) = payload;
	node.first_child().set_value(payload);

	CHECK_NODE_EX(doc, STR("<node attr='&lt;>&apos;\"&amp;&#04;&#13;&#10;&#09;'>&lt;&gt;'\"&amp;&#04;\r\n\t</node>"), STR(""), format_raw | format_attribute_single_quote);
}
// format_escape_nonascii: multi-byte UTF-8 sequences are written as decimal character references,
// and every single-byte special character ('<', '>', '"', '&') is escaped in both the attribute and the text node.
TEST_XML(write_escape_3, "<node attr=''>text</node>")
{
	// U+00E4, U+00F6 and U+00FC spelled as explicit UTF-8 bytes, so the input does not depend on
	// how the compiler decodes the source file or which execution character set it targets.
	const pugi::char_t* payload = STR("<>'\"&\xc3\xa4\xc3\xb6\xc3\xbc");

	pugi::xml_node node = doc.child(STR("node"));

	node.attribute(STR("attr")) = payload;
	node.first_child().set_value(payload);

	CHECK_NODE_EX(doc, STR("<node attr=\"&lt;&gt;'&quot;&amp;&#228;&#246;&#252;\">&lt;&gt;'&quot;&amp;&#228;&#246;&#252;</node>"), STR(""), format_raw | format_escape_nonascii);
}
TEST_XML(write_escape_roundtrip, "<node attr=''>text</node>")
{
doc.child(STR("node")).attribute(STR("attr")) = STR("<>'\"&\x04\r\n\t");