DOCTYPE parsing is now stackless

This prevents malformed input XML with very deeply recursive DOCTYPE sections
from crashing the parser.

Fixes #29.
This commit is contained in:
Arseny Kapoulkine 2015-02-12 08:12:12 -08:00
parent 00b4b0192f
commit e94552c9ca
2 changed files with 62 additions and 15 deletions

View File

@ -2357,23 +2357,28 @@ PUGI__NS_BEGIN
char_t* parse_doctype_ignore(char_t* s) char_t* parse_doctype_ignore(char_t* s)
{ {
size_t depth = 0;
assert(s[0] == '<' && s[1] == '!' && s[2] == '['); assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
s++; s += 3;
while (*s) while (*s)
{ {
if (s[0] == '<' && s[1] == '!' && s[2] == '[') if (s[0] == '<' && s[1] == '!' && s[2] == '[')
{ {
// nested ignore section // nested ignore section
s = parse_doctype_ignore(s); s += 3;
if (!s) return s; depth++;
} }
else if (s[0] == ']' && s[1] == ']' && s[2] == '>') else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
{ {
// ignore section end // ignore section end
s += 3; s += 3;
return s; if (depth == 0)
return s;
depth--;
} }
else s++; else s++;
} }
@ -2381,10 +2386,12 @@ PUGI__NS_BEGIN
PUGI__THROW_ERROR(status_bad_doctype, s); PUGI__THROW_ERROR(status_bad_doctype, s);
} }
char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel) char_t* parse_doctype_group(char_t* s, char_t endch)
{ {
size_t depth = 0;
assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
s++; s += 2;
while (*s) while (*s)
{ {
@ -2399,12 +2406,8 @@ PUGI__NS_BEGIN
else else
{ {
// some control group // some control group
s = parse_doctype_group(s, endch, false); s += 2;
if (!s) return s; depth++;
// skip >
assert(*s == '>');
s++;
} }
} }
else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
@ -2415,12 +2418,16 @@ PUGI__NS_BEGIN
} }
else if (*s == '>') else if (*s == '>')
{ {
return s; if (depth == 0)
return s;
depth--;
s++;
} }
else s++; else s++;
} }
if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s); if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
return s; return s;
} }
@ -2512,7 +2519,7 @@ PUGI__NS_BEGIN
char_t* mark = s + 9; char_t* mark = s + 9;
s = parse_doctype_group(s, endch, true); s = parse_doctype_group(s, endch);
if (!s) return s; if (!s) return s;
assert((*s == 0 && endch == '>') || *s == '>'); assert((*s == 0 && endch == '>') || *s == '>');

View File

@ -322,3 +322,43 @@ TEST(parse_doctype_error_ignore)
CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE[")).status == status_bad_doctype); CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE[")).status == status_bad_doctype);
CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE["), parse_doctype).status == status_bad_doctype); CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE["), parse_doctype).status == status_bad_doctype);
} }
// Regression test for deeply nested DOCTYPE control groups: builds a DOCTYPE
// that opens `count` nested "<!G " groups, closes each of them with ">", then
// closes the DOCTYPE itself, and checks that the document loads successfully.
TEST(parse_doctype_stackless_group)
{
	std::basic_string<char_t> str;

	const int count = 100000;

	// every literal goes through STR() so that it matches char_t; a bare narrow
	// literal cannot be appended when char_t is a wide character type
	str += STR("<!DOCTYPE ");

	for (int i = 0; i < count; ++i)
		str += STR("<!G ");

	for (int j = 0; j < count; ++j)
		str += STR(">");

	// terminator of the DOCTYPE declaration itself
	str += STR(">");

	xml_document doc;
	CHECK(doc.load_string(str.c_str(), parse_fragment));
}
// Regression test for deeply nested DOCTYPE ignore sections: builds a DOCTYPE
// that opens `count` nested "<![IGNORE[ " sections, closes each of them with
// "]]>", then closes the DOCTYPE itself, and checks that the document loads
// successfully.
TEST(parse_doctype_stackless_ignore)
{
	std::basic_string<char_t> str;

	const int count = 100000;

	// every literal goes through STR() so that it matches char_t; a bare narrow
	// literal cannot be appended when char_t is a wide character type
	str += STR("<!DOCTYPE ");

	for (int i = 0; i < count; ++i)
		str += STR("<![IGNORE[ ");

	for (int j = 0; j < count; ++j)
		str += STR("]]>");

	// terminator of the DOCTYPE declaration itself
	str += STR(">");

	xml_document doc;
	CHECK(doc.load_string(str.c_str(), parse_fragment));
}