Optimized attribute parsing; behavior of parse_wconv_attribute changed: it now assumes that parse_eol is set
git-svn-id: http://pugixml.googlecode.com/svn/trunk@503 99668b35-9821-0410-8761-19e4c4f06640
This commit is contained in:
parent
f9c7855143
commit
9fa82b15f5
174
src/pugixml.cpp
174
src/pugixml.cpp
@ -935,7 +935,7 @@ namespace
|
|||||||
{
|
{
|
||||||
ct_parse_pcdata = 1, // \0, &, \r, <
|
ct_parse_pcdata = 1, // \0, &, \r, <
|
||||||
ct_parse_attr = 2, // \0, &, \r, ', "
|
ct_parse_attr = 2, // \0, &, \r, ', "
|
||||||
ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, space, tab
|
ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, tab
|
||||||
ct_space = 8, // \r, \n, space, tab
|
ct_space = 8, // \r, \n, space, tab
|
||||||
ct_parse_cdata = 16, // \0, ], >, \r
|
ct_parse_cdata = 16, // \0, ], >, \r
|
||||||
ct_parse_comment = 32, // \0, -, >, \r
|
ct_parse_comment = 32, // \0, -, >, \r
|
||||||
@ -947,7 +947,7 @@ namespace
|
|||||||
{
|
{
|
||||||
55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
|
55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
|
||||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
|
||||||
12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47
|
8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47
|
||||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63
|
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0, // 48-63
|
||||||
0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79
|
0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, // 64-79
|
||||||
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95
|
192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192, // 80-95
|
||||||
@ -1020,19 +1020,6 @@ namespace
|
|||||||
template <bool _1, bool _2> const bool opt2_to_type<_1, _2>::o1 = _1;
|
template <bool _1, bool _2> const bool opt2_to_type<_1, _2>::o1 = _1;
|
||||||
template <bool _1, bool _2> const bool opt2_to_type<_1, _2>::o2 = _2;
|
template <bool _1, bool _2> const bool opt2_to_type<_1, _2>::o2 = _2;
|
||||||
|
|
||||||
template <bool _1, bool _2, bool _3, bool _4> struct opt4_to_type
|
|
||||||
{
|
|
||||||
static const bool o1;
|
|
||||||
static const bool o2;
|
|
||||||
static const bool o3;
|
|
||||||
static const bool o4;
|
|
||||||
};
|
|
||||||
|
|
||||||
template <bool _1, bool _2, bool _3, bool _4> const bool opt4_to_type<_1, _2, _3, _4>::o1 = _1;
|
|
||||||
template <bool _1, bool _2, bool _3, bool _4> const bool opt4_to_type<_1, _2, _3, _4>::o2 = _2;
|
|
||||||
template <bool _1, bool _2, bool _3, bool _4> const bool opt4_to_type<_1, _2, _3, _4>::o3 = _3;
|
|
||||||
template <bool _1, bool _2, bool _3, bool _4> const bool opt4_to_type<_1, _2, _3, _4>::o4 = _4;
|
|
||||||
|
|
||||||
bool is_little_endian()
|
bool is_little_endian()
|
||||||
{
|
{
|
||||||
unsigned int ui = 1;
|
unsigned int ui = 1;
|
||||||
@ -1628,19 +1615,16 @@ namespace
|
|||||||
|
|
||||||
typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
|
typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
|
||||||
|
|
||||||
template <typename opt4> struct strconv_attribute_impl
|
template <typename opt1> struct strconv_attribute_impl
|
||||||
{
|
{
|
||||||
static char_t* parse(char_t* s, char_t end_quote)
|
static char_t* parse_wnorm(char_t* s, char_t end_quote)
|
||||||
{
|
{
|
||||||
const bool opt_wconv = opt4::o1;
|
const bool opt_escape = opt1::o1;
|
||||||
const bool opt_wnorm = opt4::o2;
|
|
||||||
const bool opt_eol = opt4::o3;
|
|
||||||
const bool opt_escape = opt4::o4;
|
|
||||||
|
|
||||||
gap g;
|
gap g;
|
||||||
|
|
||||||
// trim leading whitespaces
|
// trim leading whitespaces
|
||||||
if (opt_wnorm && IS_CHARTYPE(*s, ct_space))
|
if (IS_CHARTYPE(*s, ct_space))
|
||||||
{
|
{
|
||||||
char_t* str = s;
|
char_t* str = s;
|
||||||
|
|
||||||
@ -1652,22 +1636,18 @@ namespace
|
|||||||
|
|
||||||
while (true)
|
while (true)
|
||||||
{
|
{
|
||||||
while (!IS_CHARTYPE(*s, (opt_wnorm || opt_wconv) ? ct_parse_attr_ws : ct_parse_attr)) ++s;
|
while (!IS_CHARTYPE(*s, ct_parse_attr_ws | ct_space)) ++s;
|
||||||
|
|
||||||
if (*s == end_quote)
|
if (*s == end_quote)
|
||||||
{
|
{
|
||||||
char_t* str = g.flush(s);
|
char_t* str = g.flush(s);
|
||||||
|
|
||||||
if (opt_wnorm)
|
do *str-- = 0;
|
||||||
{
|
while (IS_CHARTYPE(*str, ct_space));
|
||||||
do *str-- = 0;
|
|
||||||
while (IS_CHARTYPE(*str, ct_space));
|
|
||||||
}
|
|
||||||
else *str = 0;
|
|
||||||
|
|
||||||
return s + 1;
|
return s + 1;
|
||||||
}
|
}
|
||||||
else if (opt_wnorm && IS_CHARTYPE(*s, ct_space))
|
else if (IS_CHARTYPE(*s, ct_space))
|
||||||
{
|
{
|
||||||
*s++ = ' ';
|
*s++ = ' ';
|
||||||
|
|
||||||
@ -1679,21 +1659,73 @@ namespace
|
|||||||
g.push(s, str - s);
|
g.push(s, str - s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (opt_wconv && IS_CHARTYPE(*s, ct_space))
|
else if (opt_escape && *s == '&')
|
||||||
{
|
{
|
||||||
if (opt_eol)
|
s = strconv_escape(s, g);
|
||||||
|
}
|
||||||
|
else if (!*s)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else ++s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static char_t* parse_wconv(char_t* s, char_t end_quote)
|
||||||
|
{
|
||||||
|
const bool opt_escape = opt1::o1;
|
||||||
|
|
||||||
|
gap g;
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
while (!IS_CHARTYPE(*s, ct_parse_attr_ws)) ++s;
|
||||||
|
|
||||||
|
if (*s == end_quote)
|
||||||
|
{
|
||||||
|
*g.flush(s) = 0;
|
||||||
|
|
||||||
|
return s + 1;
|
||||||
|
}
|
||||||
|
else if (IS_CHARTYPE(*s, ct_space))
|
||||||
|
{
|
||||||
|
if (*s == '\r')
|
||||||
{
|
{
|
||||||
if (*s == '\r')
|
*s++ = ' ';
|
||||||
{
|
|
||||||
*s++ = ' ';
|
if (*s == '\n') g.push(s, 1);
|
||||||
|
|
||||||
if (*s == '\n') g.push(s, 1);
|
|
||||||
}
|
|
||||||
else *s++ = ' ';
|
|
||||||
}
|
}
|
||||||
else *s++ = ' ';
|
else *s++ = ' ';
|
||||||
}
|
}
|
||||||
else if (opt_eol && *s == '\r')
|
else if (opt_escape && *s == '&')
|
||||||
|
{
|
||||||
|
s = strconv_escape(s, g);
|
||||||
|
}
|
||||||
|
else if (!*s)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else ++s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static char_t* parse_eol(char_t* s, char_t end_quote)
|
||||||
|
{
|
||||||
|
const bool opt_escape = opt1::o1;
|
||||||
|
|
||||||
|
gap g;
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
while (!IS_CHARTYPE(*s, ct_parse_attr)) ++s;
|
||||||
|
|
||||||
|
if (*s == end_quote)
|
||||||
|
{
|
||||||
|
*g.flush(s) = 0;
|
||||||
|
|
||||||
|
return s + 1;
|
||||||
|
}
|
||||||
|
else if (*s == '\r')
|
||||||
{
|
{
|
||||||
*s++ = '\n';
|
*s++ = '\n';
|
||||||
|
|
||||||
@ -1710,30 +1742,58 @@ namespace
|
|||||||
else ++s;
|
else ++s;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static char_t* parse_simple(char_t* s, char_t end_quote)
|
||||||
|
{
|
||||||
|
const bool opt_escape = opt1::o1;
|
||||||
|
|
||||||
|
gap g;
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
while (!IS_CHARTYPE(*s, ct_parse_attr)) ++s;
|
||||||
|
|
||||||
|
if (*s == end_quote)
|
||||||
|
{
|
||||||
|
*g.flush(s) = 0;
|
||||||
|
|
||||||
|
return s + 1;
|
||||||
|
}
|
||||||
|
else if (opt_escape && *s == '&')
|
||||||
|
{
|
||||||
|
s = strconv_escape(s, g);
|
||||||
|
}
|
||||||
|
else if (!*s)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else ++s;
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
strconv_attribute_t get_strconv_attribute(unsigned int optmask)
|
strconv_attribute_t get_strconv_attribute(unsigned int optmask)
|
||||||
{
|
{
|
||||||
STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x80);
|
STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40);
|
||||||
|
|
||||||
switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
|
switch ((optmask >> 4) & 15) // get bitmask for flags (wnorm wconv eol escapes)
|
||||||
{
|
{
|
||||||
case 0: return strconv_attribute_impl<opt4_to_type<0, 0, 0, 0> >::parse;
|
case 0: return strconv_attribute_impl<opt1_to_type<0> >::parse_simple;
|
||||||
case 1: return strconv_attribute_impl<opt4_to_type<0, 0, 0, 1> >::parse;
|
case 1: return strconv_attribute_impl<opt1_to_type<1> >::parse_simple;
|
||||||
case 2: return strconv_attribute_impl<opt4_to_type<0, 0, 1, 0> >::parse;
|
case 2: return strconv_attribute_impl<opt1_to_type<0> >::parse_eol;
|
||||||
case 3: return strconv_attribute_impl<opt4_to_type<0, 0, 1, 1> >::parse;
|
case 3: return strconv_attribute_impl<opt1_to_type<1> >::parse_eol;
|
||||||
case 4: return strconv_attribute_impl<opt4_to_type<0, 1, 0, 0> >::parse;
|
case 4: return strconv_attribute_impl<opt1_to_type<0> >::parse_wconv;
|
||||||
case 5: return strconv_attribute_impl<opt4_to_type<0, 1, 0, 1> >::parse;
|
case 5: return strconv_attribute_impl<opt1_to_type<1> >::parse_wconv;
|
||||||
case 6: return strconv_attribute_impl<opt4_to_type<0, 1, 1, 0> >::parse;
|
case 6: return strconv_attribute_impl<opt1_to_type<0> >::parse_wconv;
|
||||||
case 7: return strconv_attribute_impl<opt4_to_type<0, 1, 1, 1> >::parse;
|
case 7: return strconv_attribute_impl<opt1_to_type<1> >::parse_wconv;
|
||||||
case 8: return strconv_attribute_impl<opt4_to_type<1, 0, 0, 0> >::parse;
|
case 8: return strconv_attribute_impl<opt1_to_type<0> >::parse_wnorm;
|
||||||
case 9: return strconv_attribute_impl<opt4_to_type<1, 0, 0, 1> >::parse;
|
case 9: return strconv_attribute_impl<opt1_to_type<1> >::parse_wnorm;
|
||||||
case 10: return strconv_attribute_impl<opt4_to_type<1, 0, 1, 0> >::parse;
|
case 10: return strconv_attribute_impl<opt1_to_type<0> >::parse_wnorm;
|
||||||
case 11: return strconv_attribute_impl<opt4_to_type<1, 0, 1, 1> >::parse;
|
case 11: return strconv_attribute_impl<opt1_to_type<1> >::parse_wnorm;
|
||||||
case 12: return strconv_attribute_impl<opt4_to_type<1, 1, 0, 0> >::parse;
|
case 12: return strconv_attribute_impl<opt1_to_type<0> >::parse_wnorm;
|
||||||
case 13: return strconv_attribute_impl<opt4_to_type<1, 1, 0, 1> >::parse;
|
case 13: return strconv_attribute_impl<opt1_to_type<1> >::parse_wnorm;
|
||||||
case 14: return strconv_attribute_impl<opt4_to_type<1, 1, 1, 0> >::parse;
|
case 14: return strconv_attribute_impl<opt1_to_type<0> >::parse_wnorm;
|
||||||
case 15: return strconv_attribute_impl<opt4_to_type<1, 1, 1, 1> >::parse;
|
case 15: return strconv_attribute_impl<opt1_to_type<1> >::parse_wnorm;
|
||||||
default: return 0; // should not get here
|
default: return 0; // should not get here
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -235,17 +235,18 @@ namespace pugi
|
|||||||
#if !defined(__INTEL_COMPILER) || __INTEL_COMPILER > 800
|
#if !defined(__INTEL_COMPILER) || __INTEL_COMPILER > 800
|
||||||
PUGIXML_DEPRECATED
|
PUGIXML_DEPRECATED
|
||||||
#endif
|
#endif
|
||||||
const unsigned int parse_wnorm_attribute = 0x0040;
|
const unsigned int parse_wnorm_attribute = 0x0080;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This flag determines if attribute value normalization should be performed for all attributes.
|
* This flag determines if attribute value normalization should be performed for all attributes.
|
||||||
* This means, that whitespace characters (new line, tab and space) are replaced with space (' ').
|
* This means, that whitespace characters (new line, tab and space) are replaced with space (' ').
|
||||||
* Note, that the actions performed while this flag is on are also performed if parse_wnorm_attribute
|
* Note, that the actions performed while this flag is on are also performed if parse_wnorm_attribute
|
||||||
* is on, so this flag has no effect if parse_wnorm_attribute flag is set.
|
* is on, so this flag has no effect if parse_wnorm_attribute flag is set.
|
||||||
|
* New line characters are always treated as if parse_eol is set, i.e. \r\n is converted to a single space.
|
||||||
*
|
*
|
||||||
* This flag is on by default.
|
* This flag is on by default.
|
||||||
*/
|
*/
|
||||||
const unsigned int parse_wconv_attribute = 0x0080;
|
const unsigned int parse_wconv_attribute = 0x0040;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This flag determines if XML document declaration (this node has the form of <?xml ... ?> in XML)
|
* This flag determines if XML document declaration (this node has the form of <?xml ... ?> in XML)
|
||||||
|
|||||||
@ -351,7 +351,7 @@ TEST(parse_attribute_no_eol_wconv)
|
|||||||
{
|
{
|
||||||
xml_document doc;
|
xml_document doc;
|
||||||
CHECK(doc.load(STR("<node id=' \t\r\rval1 \rval2\r\nval3\nval4\r\r'/>"), parse_minimal | parse_wconv_attribute));
|
CHECK(doc.load(STR("<node id=' \t\r\rval1 \rval2\r\nval3\nval4\r\r'/>"), parse_minimal | parse_wconv_attribute));
|
||||||
CHECK_STRING(doc.child(STR("node")).attribute(STR("id")).value(), STR(" val1 val2 val3 val4 "));
|
CHECK_STRING(doc.child(STR("node")).attribute(STR("id")).value(), STR(" val1 val2 val3 val4 "));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(parse_attribute_eol_wconv)
|
TEST(parse_attribute_eol_wconv)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user