Implemented better DOCTYPE parsing, added more DOCTYPE tests
git-svn-id: http://pugixml.googlecode.com/svn/trunk@409 99668b35-9821-0410-8761-19e4c4f06640
This commit is contained in:
parent
55f3cba20c
commit
7bda2cb529
150
src/pugixml.cpp
150
src/pugixml.cpp
@ -1739,6 +1739,116 @@ namespace
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DOCTYPE consists of nested sections of the following possible types:
|
||||||
|
// <!-- ... -->, <? ... ?>, "...", '...'
|
||||||
|
// <![...]]>
|
||||||
|
// <!...>
|
||||||
|
// First group cannot contain nested groups
|
||||||
|
// Second group can contain nested groups of the same type
|
||||||
|
// Third group can contain all other groups
|
||||||
|
// Skips one non-nesting DOCTYPE primitive that starts at s:
//   - a quoted string ("..." or '...'),
//   - a processing instruction (<? ... ?>),
//   - a comment (<!-- ... -->).
// On success s is left just past the primitive and status_ok is reported through
// THROW_ERROR. If the terminator is not found before the zero character, or s does
// not start any of the primitives above, status_bad_doctype is reported.
// buffer_start is not read directly here; presumably THROW_ERROR uses it to compute
// the error offset (TODO confirm against the macro definition).
xml_parse_result parse_doctype_primitive(char_t*& s, char_t* buffer_start)
{
	if (*s == '"' || *s == '\'')
	{
		// quoted string: the closing quote must match the opening one
		char_t ch = *s++;
		SCANFOR(*s == ch);
		if (!*s) THROW_ERROR(status_bad_doctype, s);

		s++;
	}
	else if (s[0] == '<' && s[1] == '?')
	{
		// <? ... ?>
		s += 2;
		SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
		if (!*s) THROW_ERROR(status_bad_doctype, s);

		s += 2;
	}
	else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
	{
		// <!-- ... -->
		s += 4;
		SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
		if (!*s) THROW_ERROR(status_bad_doctype, s);

		// "-->" is three characters long; skipping four would swallow the character
		// that follows the comment (or step past the terminating zero)
		s += 3;
	}
	else THROW_ERROR(status_bad_doctype, s);

	THROW_ERROR(status_ok, s);
}
|
||||||
|
|
||||||
|
// Skips a <![ ... ]]> section that starts at s, together with any sections of the
// same kind nested inside it.
// On success s is left just past the "]]>" that closes the outermost section and
// status_ok is reported through THROW_ERROR. If the zero character is reached while
// any section is still open, status_bad_doctype is reported with s at that character.
// Nesting is tracked with a counter rather than recursion so that deeply nested
// input cannot exhaust the call stack.
// buffer_start is not read directly here; presumably THROW_ERROR uses it to compute
// the error offset (TODO confirm against the macro definition).
xml_parse_result parse_doctype_ignore(char_t*& s, char_t* buffer_start)
{
	assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
	s++; // step off '<' so the opening marker is not matched again below

	unsigned int depth = 0; // number of nested sections currently open

	while (*s)
	{
		if (s[0] == '<' && s[1] == '!' && s[2] == '[')
		{
			// nested ignore section
			s++;
			depth++;
		}
		else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
		{
			// ignore section end
			s += 3;

			if (depth == 0)
			{
				THROW_ERROR(status_ok, s);
			}

			depth--;
		}
		else s++;
	}

	THROW_ERROR(status_bad_doctype, s);
}
|
||||||
|
|
||||||
|
// Skips a <!...> group that starts at s (the DOCTYPE declaration itself when called
// with toplevel == true), including everything nested inside it:
//   - <![ ... ]]> sections are delegated to parse_doctype_ignore,
//   - quoted strings, <? ... ?> and <!-- ... --> are delegated to parse_doctype_primitive,
//   - nested <!...> groups are tracked with a depth counter (no recursion, so deeply
//     nested input cannot exhaust the call stack).
// On success s is left just past the '>' that closes the outermost group and status_ok
// is reported through THROW_ERROR. A failure from either helper is returned unchanged.
// If the zero character is reached first, the result is status_ok only when no nested
// group is open, toplevel is true and endch is '>'; otherwise it is status_bad_doctype.
// NOTE(review): endch looks like the original last character of the buffer that the
// caller replaced with zero - confirm against the caller.
// buffer_start is only forwarded to the helpers here; presumably THROW_ERROR also uses
// it to compute the error offset (TODO confirm against the macro definition).
xml_parse_result parse_doctype(char_t*& s, char_t* buffer_start, char_t endch, bool toplevel)
{
	assert(s[0] == '<' && s[1] == '!');
	s++; // step off '<' so the opening marker is not matched again below

	unsigned int depth = 0; // number of nested <!...> groups currently open

	while (*s)
	{
		if (s[0] == '<' && s[1] == '!' && s[2] != '-')
		{
			if (s[2] == '[')
			{
				// ignore
				xml_parse_result res = parse_doctype_ignore(s, buffer_start);

				if (!res) return res;
			}
			else
			{
				// some control group
				s++;
				depth++;
			}
		}
		else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
		{
			// unknown tag (forbidden), or some primitive group
			xml_parse_result res = parse_doctype_primitive(s, buffer_start);

			if (!res) return res;
		}
		else if (*s == '>')
		{
			s++;

			if (depth == 0)
			{
				THROW_ERROR(status_ok, s);
			}

			depth--;
		}
		else s++;
	}

	// an unterminated nested group is always an error; the outermost group may be
	// terminated by the end of input only in the toplevel/endch == '>' case
	THROW_ERROR((depth == 0 && toplevel && endch == '>') ? status_ok : status_bad_doctype, s);
}
|
||||||
|
|
||||||
xml_parse_result parse_exclamation(char_t*& ref_s, xml_node_struct* cursor, unsigned int optmsk, char_t* buffer_start, char_t endch)
|
xml_parse_result parse_exclamation(char_t*& ref_s, xml_node_struct* cursor, unsigned int optmsk, char_t* buffer_start, char_t endch)
|
||||||
{
|
{
|
||||||
// load into registers
|
// load into registers
|
||||||
@ -1833,45 +1943,11 @@ namespace
|
|||||||
{
|
{
|
||||||
if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s);
|
if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s);
|
||||||
|
|
||||||
LOC_DOCTYPE:
|
s -= 2;
|
||||||
SCANFOR(*s == '\'' || *s == '"' || *s == '[' || *s == '>');
|
|
||||||
if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s);
|
|
||||||
|
|
||||||
if (*s == '\'' || *s == '"') // '...SYSTEM "..."
|
xml_parse_result res = parse_doctype(s, buffer_start, endch, true);
|
||||||
{
|
|
||||||
ch = *s++;
|
|
||||||
SCANFOR(*s == ch);
|
|
||||||
if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s);
|
|
||||||
|
|
||||||
s += (*s != 0);
|
if (!res) return res;
|
||||||
goto LOC_DOCTYPE;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(*s == '[') // '...[...'
|
|
||||||
{
|
|
||||||
++s;
|
|
||||||
unsigned int bd = 1; // Bracket depth counter.
|
|
||||||
while (*s!=0) // Loop till we're out of all brackets.
|
|
||||||
{
|
|
||||||
if (*s == ']') --bd;
|
|
||||||
else if (*s == '[') ++bd;
|
|
||||||
if (bd == 0) break;
|
|
||||||
++s;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (bd != 0) THROW_ERROR(status_bad_doctype, s);
|
|
||||||
}
|
|
||||||
|
|
||||||
SCANFOR(*s == '>');
|
|
||||||
|
|
||||||
if (*s == 0)
|
|
||||||
{
|
|
||||||
if (endch != '>') THROW_ERROR(status_bad_doctype, s);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
++s;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if (*s == 0 && endch == '-') THROW_ERROR(status_bad_comment, s);
|
else if (*s == 0 && endch == '-') THROW_ERROR(status_bad_comment, s);
|
||||||
else if (*s == 0 && endch == '[') THROW_ERROR(status_bad_cdata, s);
|
else if (*s == 0 && endch == '[') THROW_ERROR(status_bad_cdata, s);
|
||||||
|
|||||||
@ -497,35 +497,6 @@ TEST(parse_declaration_error)
|
|||||||
CHECK(doc.load(STR("<?xml version='1?>"), parse_minimal | parse_declaration).status == status_bad_attribute);
|
CHECK(doc.load(STR("<?xml version='1?>"), parse_minimal | parse_declaration).status == status_bad_attribute);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(parse_doctype_skip)
{
	xml_document doc;

	// a lone DOCTYPE must load successfully and leave the document without children
	CHECK(doc.load(STR("<!DOCTYPE doc>")) && !doc.first_child());
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo'>")) && !doc.first_child());
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM \"foo\">")) && !doc.first_child());
	CHECK(doc.load(STR("<!DOCTYPE doc PUBLIC \"foo\" 'bar'>")) && !doc.first_child());
	CHECK(doc.load(STR("<!DOCTYPE doc PUBLIC \"foo'\">")) && !doc.first_child());
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]>")) && !doc.first_child());

	// a node that follows the DOCTYPE must still be parsed
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]><node/>")));
	CHECK_NODE(doc, STR("<node />"));
}
|
|
||||||
|
|
||||||
TEST(parse_doctype_error)
{
	xml_document doc;

	// every truncated DOCTYPE below must be rejected with status_bad_doctype
	CHECK(doc.load(STR("<!DOCTYPE")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM \"foo")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc PUBLIC \"foo\" 'bar")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc PUBLIC \"foo'\"")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]")).status == status_bad_doctype);
	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>] ")).status == status_bad_doctype);
}
|
|
||||||
|
|
||||||
TEST(parse_empty)
|
TEST(parse_empty)
|
||||||
{
|
{
|
||||||
xml_document doc;
|
xml_document doc;
|
||||||
|
|||||||
92
tests/test_parse_doctype.cpp
Normal file
92
tests/test_parse_doctype.cpp
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
#include "common.hpp"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
bool test_doctype_wf(const std::basic_string<char_t>& decl)
|
||||||
|
{
|
||||||
|
xml_document doc;
|
||||||
|
|
||||||
|
// standalone
|
||||||
|
if (!doc.load(decl.c_str()) || doc.first_child()) return false;
|
||||||
|
|
||||||
|
// pcdata pre/postfix
|
||||||
|
if (!doc.load(("a" + decl).c_str()) || doc.first_child()) return false;
|
||||||
|
if (!doc.load((decl + "b").c_str()) || doc.first_child()) return false;
|
||||||
|
if (!doc.load(("a" + decl + "b").c_str()) || doc.first_child()) return false;
|
||||||
|
|
||||||
|
// node pre/postfix
|
||||||
|
if (!doc.load(("<nodea/>" + decl).c_str()) || !test_node(doc, STR("<nodea />"), STR(""), format_raw)) return false;
|
||||||
|
if (!doc.load((decl + "<nodeb/>").c_str()) || !test_node(doc, STR("<nodeb />"), STR(""), format_raw)) return false;
|
||||||
|
if (!doc.load(("<nodea/>" + decl + "<nodeb/>").c_str()) || !test_node(doc, STR("<nodea /><nodeb />"), STR(""), format_raw)) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool test_doctype_nwf(const std::basic_string<char_t>& decl)
|
||||||
|
{
|
||||||
|
xml_document doc;
|
||||||
|
|
||||||
|
// standalone
|
||||||
|
if (doc.load(decl.c_str()).status != status_bad_doctype) return false;
|
||||||
|
|
||||||
|
// pcdata postfix
|
||||||
|
if (doc.load((decl + "b").c_str()).status != status_bad_doctype) return false;
|
||||||
|
|
||||||
|
// node postfix
|
||||||
|
if (doc.load((decl + "<nodeb/>").c_str()).status != status_bad_doctype) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Checks that the string literal `contents` passes test_doctype_wf; STR() is applied to the literal before the call.
#define TEST_DOCTYPE_WF(contents) CHECK(test_doctype_wf(STR(contents)))
|
||||||
|
// Checks that the string literal `contents` passes test_doctype_nwf; STR() is applied to the literal before the call.
#define TEST_DOCTYPE_NWF(contents) CHECK(test_doctype_nwf(STR(contents)))
|
||||||
|
|
||||||
|
TEST(parse_doctype_skip)
{
	// name only
	TEST_DOCTYPE_WF("<!DOCTYPE doc>");

	// external identifiers with either quote style
	TEST_DOCTYPE_WF("<!DOCTYPE doc SYSTEM 'foo'>");
	TEST_DOCTYPE_WF("<!DOCTYPE doc SYSTEM \"foo\">");
	TEST_DOCTYPE_WF("<!DOCTYPE doc PUBLIC \"foo\" 'bar'>");
	TEST_DOCTYPE_WF("<!DOCTYPE doc PUBLIC \"foo'\">");

	// internal subset
	TEST_DOCTYPE_WF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]>");
}
|
||||||
|
|
||||||
|
TEST(parse_doctype_error)
{
	// declaration cut off before any quoted literal is complete
	TEST_DOCTYPE_NWF("<!DOCTYPE");
	TEST_DOCTYPE_NWF("<!DOCTYPE doc");
	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo");
	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM \"foo");
	TEST_DOCTYPE_NWF("<!DOCTYPE doc PUBLIC \"foo\" 'bar");
	TEST_DOCTYPE_NWF("<!DOCTYPE doc PUBLIC \"foo'\"");

	// declaration cut off inside or right after the internal subset
	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY");
	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>");
	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]");
	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>] ");
}
|
||||||
|
|
||||||
|
// Examples from W3C recommendations
|
||||||
|
TEST(parse_doctype_w3c_wf)
{
	// external subset reference
	TEST_DOCTYPE_WF("<!DOCTYPE greeting SYSTEM \"hello.dtd\">");

	// element and attribute-list declarations
	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA)> ]>");
	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ATTLIST list type (bullets|ordered|glossary) \"ordered\"> <!ATTLIST form method CDATA #FIXED \"POST\"> ]>");

	// <![ ... ]]> sections
	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ENTITY % draft 'INCLUDE' > <!ENTITY % final 'IGNORE' > <![%draft;[ <!ELEMENT book (comments*, title, body, supplements?)> ]]> <![%final;[ <!ELEMENT book (title, body, supplements?)> ]]>]>");

	// entity declarations
	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ENTITY open-hatch PUBLIC \"-//Textuality//TEXT Standard open-hatch boilerplate//EN\" \"http://www.textuality.com/boilerplate/OpenHatch.xml\"> ]>");
	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ENTITY EndAttr \"27'\" > ]>");
}
|
||||||
|
|
||||||
|
TEST(parse_doctype_w3c_nwf)
{
	// truncated external subset reference
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting SYSTEM \"hello.dtd>");
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting SYSTEM");

	// truncated element declaration / internal subset
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA)> ]");
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA)>");
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA");
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ ");

	// truncated attribute-list declaration
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ATTLIST list type (bullets|ordered|glossary) \"ordered\"> ]");
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ATTLIST list type (bullets|ordered|glossary) \"ordered\">");
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ATTLIST list type (bullets|ordered|glossary) \"orde");
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ATTLIST list type (bullets|ordered|glossary) ");

	// truncated entity declaration
	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ENTITY open-hatch PUBLIC \"-//Textuality//TEXT Standard open-hatch boilerplate//EN\" \"http://www.textuality.com/boilerplate/OpenHatch.x");
}
|
||||||
Loading…
Reference in New Issue
Block a user