DOCTYPE parsing is now stackless

This prevents malformed input XML with very deeply nested DOCTYPE sections
from overflowing the stack and crashing the parser.

Fixes #29.
This commit is contained in:
Arseny Kapoulkine 2015-02-12 08:12:12 -08:00
parent 00b4b0192f
commit e94552c9ca
2 changed files with 62 additions and 15 deletions

View File

@ -2357,23 +2357,28 @@ PUGI__NS_BEGIN
char_t* parse_doctype_ignore(char_t* s)
{
size_t depth = 0;
assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
s++;
s += 3;
while (*s)
{
if (s[0] == '<' && s[1] == '!' && s[2] == '[')
{
// nested ignore section
s = parse_doctype_ignore(s);
if (!s) return s;
s += 3;
depth++;
}
else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
{
// ignore section end
s += 3;
return s;
if (depth == 0)
return s;
depth--;
}
else s++;
}
@ -2381,10 +2386,12 @@ PUGI__NS_BEGIN
PUGI__THROW_ERROR(status_bad_doctype, s);
}
char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
char_t* parse_doctype_group(char_t* s, char_t endch)
{
size_t depth = 0;
assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
s++;
s += 2;
while (*s)
{
@ -2399,12 +2406,8 @@ PUGI__NS_BEGIN
else
{
// some control group
s = parse_doctype_group(s, endch, false);
if (!s) return s;
// skip >
assert(*s == '>');
s++;
s += 2;
depth++;
}
}
else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
@ -2415,12 +2418,16 @@ PUGI__NS_BEGIN
}
else if (*s == '>')
{
return s;
if (depth == 0)
return s;
depth--;
s++;
}
else s++;
}
if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
return s;
}
@ -2512,7 +2519,7 @@ PUGI__NS_BEGIN
char_t* mark = s + 9;
s = parse_doctype_group(s, endch, true);
s = parse_doctype_group(s, endch);
if (!s) return s;
assert((*s == 0 && endch == '>') || *s == '>');

View File

@ -322,3 +322,43 @@ TEST(parse_doctype_error_ignore)
CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE[")).status == status_bad_doctype);
CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE["), parse_doctype).status == status_bad_doctype);
}
// Regression test for deeply nested DOCTYPE control groups.
// Builds "<!DOCTYPE " followed by `count` nested "<!G " group openers, the same
// number of ">" closers, and one final ">" that closes the DOCTYPE itself, then
// checks that loading the document as a fragment succeeds.
TEST(parse_doctype_stackless_group)
{
	std::basic_string<char_t> str;

	// Deep enough that one stack frame per nesting level would be a problem.
	int count = 100000;

	// Every literal goes through STR() so its character type follows char_t
	// (presumably wide in wchar builds - confirm against the STR definition);
	// a bare narrow literal cannot be appended to a non-char basic_string.
	str += STR("<!DOCTYPE ");

	for (int i = 0; i < count; ++i)
		str += STR("<!G ");

	for (int j = 0; j < count; ++j)
		str += STR(">");

	// closes the DOCTYPE declaration itself
	str += STR(">");

	xml_document doc;
	CHECK(doc.load_string(str.c_str(), parse_fragment));
}
// Regression test for deeply nested DOCTYPE conditional sections.
// Builds "<!DOCTYPE " followed by `count` nested "<![IGNORE[ " openers, the same
// number of "]]>" closers, and one final ">" that closes the DOCTYPE itself,
// then checks that loading the document as a fragment succeeds.
TEST(parse_doctype_stackless_ignore)
{
	std::basic_string<char_t> str;

	// Deep enough that one stack frame per nesting level would be a problem.
	int count = 100000;

	// Every literal goes through STR() so its character type follows char_t
	// (presumably wide in wchar builds - confirm against the STR definition);
	// a bare narrow literal cannot be appended to a non-char basic_string.
	str += STR("<!DOCTYPE ");

	for (int i = 0; i < count; ++i)
		str += STR("<![IGNORE[ ");

	for (int j = 0; j < count; ++j)
		str += STR("]]>");

	// closes the DOCTYPE declaration itself
	str += STR(">");

	xml_document doc;
	CHECK(doc.load_string(str.c_str(), parse_fragment));
}