diff --git a/README.md b/README.md index 6cf57bd52..674c74b24 100644 --- a/README.md +++ b/README.md @@ -1733,6 +1733,7 @@ I deeply appreciate the help of the following people. 314. [Berkus Decker](https://github.com/berkus) fixed a typo in the README. 315. [Illia Polishchuk](https://github.com/effolkronium) improved the CMake testing. 316. [Ikko Ashimine](https://github.com/eltociear) fixed a typo. +317. [Raphael Grimm](https://github.com/barcode) Added custom base classes as customization point and parser location information for sax parsers. Thanks a lot for helping out! Please [let me know](mailto:mail@nlohmann.me) if I forgot someone. diff --git a/cmake/ci.cmake b/cmake/ci.cmake index bbb2d4cb9..29d58d83b 100644 --- a/cmake/ci.cmake +++ b/cmake/ci.cmake @@ -636,7 +636,7 @@ add_custom_target(ci_test_valgrind -DJSON_BuildTests=ON -DJSON_Valgrind=ON -S${PROJECT_SOURCE_DIR} -B${PROJECT_BINARY_DIR}/build_valgrind COMMAND ${CMAKE_COMMAND} --build ${PROJECT_BINARY_DIR}/build_valgrind - COMMAND cd ${PROJECT_BINARY_DIR}/build_valgrind && ${CMAKE_CTEST_COMMAND} -L valgrind --parallel ${N} --output-on-failure + COMMAND cd ${PROJECT_BINARY_DIR}/build_valgrind && ${CMAKE_CTEST_COMMAND} -L valgrind --parallel ${N} --output-on-failure --timeout 10000 COMMENT "Compile and test with Valgrind" ) diff --git a/docs/examples/sax_parse_with_src_location.cpp b/docs/examples/sax_parse_with_src_location.cpp new file mode 100644 index 000000000..85457772f --- /dev/null +++ b/docs/examples/sax_parse_with_src_location.cpp @@ -0,0 +1,149 @@ +#include +#include +#include +#include + +using json = nlohmann::json; + +// a simple event consumer that collects string representations of the passed +// values and their source locations; +// note inheriting from json::json_sax_t is not required, but can +// help not to forget a required function +class sax_event_consumer : public json::json_sax_t +{ + public: + std::vector events; + std::size_t next_token_start_pos = 0; + std::size_t 
next_token_end_pos = 0; + + void next_token_start(std::size_t pos) + { + next_token_start_pos = pos; + } + + void next_token_end(std::size_t pos) + { + next_token_end_pos = pos; + } + + std::string location_as_str() const + { + return "at=[" + std::to_string(next_token_start_pos) + "," + std::to_string(next_token_end_pos) + ")"; + } + + bool null() override + { + events.push_back("null(" + location_as_str() + ")"); + return true; + } + + bool boolean(bool val) override + { + events.push_back("boolean(val=" + std::string(val ? "true" : "false") + ", " + location_as_str() + ")"); + return true; + } + + bool number_integer(number_integer_t val) override + { + events.push_back("number_integer(val=" + std::to_string(val) + ", " + location_as_str() + ")"); + return true; + } + + bool number_unsigned(number_unsigned_t val) override + { + events.push_back("number_unsigned(val=" + std::to_string(val) + ", " + location_as_str() + ")"); + return true; + } + + bool number_float(number_float_t val, const string_t& s) override + { + events.push_back("number_float(val=" + std::to_string(val) + ", s=" + s + ", " + location_as_str() + ")"); + return true; + } + + bool string(string_t& val) override + { + events.push_back("string(val=" + val + ", " + location_as_str() + ")"); + return true; + } + + bool start_object(std::size_t elements) override + { + events.push_back("start_object(elements=" + std::to_string(elements) + ", " + location_as_str() + ")"); + return true; + } + + bool end_object() override + { + events.push_back("end_object(" + location_as_str() + ")"); + return true; + } + + bool start_array(std::size_t elements) override + { + events.push_back("start_array(elements=" + std::to_string(elements) + ", " + location_as_str() + ")"); + return true; + } + + bool end_array() override + { + events.push_back("end_array(" + location_as_str() + ")"); + return true; + } + + bool key(string_t& val) override + { + events.push_back("key(val=" + val + ", " + location_as_str() + ")"); 
+ return true; + } + + bool binary(json::binary_t& val) override + { + events.push_back("binary(val=[...], " + location_as_str() + ")"); + return true; + } + + bool parse_error(std::size_t position, const std::string& last_token, const json::exception& ex) override + { + events.push_back("parse_error(position=" + std::to_string(position) + ", last_token=" + last_token + ",\n ex=" + std::string(ex.what()) + ")"); + return false; + } +}; + +int main() +{ + // a JSON text + auto text = R"( + { + "Image": { + "Width": 800, + "Height": 600, + "Title": "View from 15th Floor", + "Thumbnail": { + "Url": "http://www.example.com/image/481989943", + "Height": 125, + "Width": 100 + }, + "Animated" : false, + "IDs": [116, 943, 234, -38793], + "DeletionDate": null, + "Distance": 12.723374634 + } + }] + )"; + + // create a SAX event consumer object + sax_event_consumer sec; + + // parse JSON + bool result = json::sax_parse(text, &sec); + + // output the recorded events + for (auto& event : sec.events) + { + std::cout << event << "\n"; + } + + // output the result of sax_parse + std::cout << "\nresult: " << std::boolalpha << result << std::endl; +} diff --git a/docs/examples/sax_parse_with_src_location.output b/docs/examples/sax_parse_with_src_location.output new file mode 100644 index 000000000..dbc004110 --- /dev/null +++ b/docs/examples/sax_parse_with_src_location.output @@ -0,0 +1,37 @@ +start_object(elements=18446744073709551615, at=[5,6)) +key(val=Image, at=[15,22)) +start_object(elements=18446744073709551615, at=[24,25)) +key(val=Width, at=[38,45)) +number_unsigned(val=800, at=[48,51)) +key(val=Height, at=[65,73)) +number_unsigned(val=600, at=[75,78)) +key(val=Title, at=[92,99)) +string(val=View from 15th Floor, at=[102,124)) +key(val=Thumbnail, at=[138,149)) +start_object(elements=18446744073709551615, at=[151,152)) +key(val=Url, at=[169,174)) +string(val=http://www.example.com/image/481989943, at=[179,219)) +key(val=Height, at=[237,245)) +number_unsigned(val=125, 
at=[247,250)) +key(val=Width, at=[268,275)) +number_unsigned(val=100, at=[278,281)) +end_object(at=[294,295)) +key(val=Animated, at=[309,319)) +boolean(val=false, at=[322,327)) +key(val=IDs, at=[341,346)) +start_array(elements=18446744073709551615, at=[348,349)) +number_unsigned(val=116, at=[349,352)) +number_unsigned(val=943, at=[354,357)) +number_unsigned(val=234, at=[359,362)) +number_integer(val=-38793, at=[364,370)) +end_array(at=[370,371)) +key(val=DeletionDate, at=[385,399)) +null(at=[401,405)) +key(val=Distance, at=[419,429)) +number_float(val=12.723375, s=12.723374634, at=[431,443)) +end_object(at=[452,453)) +end_object(at=[458,459)) +parse_error(position=460, last_token=12.723374634 } }], + ex=[json.exception.parse_error.101] parse error at line 17, column 6: syntax error while parsing value - unexpected ']'; expected end of input) + +result: false diff --git a/docs/examples/sax_parse_with_src_location_in_json.cpp b/docs/examples/sax_parse_with_src_location_in_json.cpp new file mode 100644 index 000000000..8e7818352 --- /dev/null +++ b/docs/examples/sax_parse_with_src_location_in_json.cpp @@ -0,0 +1,336 @@ +#include +#include +#include +#include + +using json = nlohmann::json; + +// custom base class for the json node. 
+// allows us to store metadata and add custom methods to each node +struct token_start_stop +{ + nlohmann::position_t start{}; + nlohmann::position_t stop{}; + + std::string start_pos_str() const + { + return "{l=" + std::to_string(start.lines_read) + ":c=" + + std::to_string(start.chars_read_current_line) + "}"; + } + std::string stop_pos_str() const + { + return "{l=" + std::to_string(stop.lines_read) + ":c=" + std::to_string(stop.chars_read_current_line) + "}"; + } + std::string location_str() const + { + return "[" + start_pos_str() + ", " + stop_pos_str() + ")"; + } +}; + +//json type using token_start_stop as base class +using json_with_token_start_stop = + nlohmann::basic_json < + std::map, + std::vector, + std::string, + bool, + std::int64_t, + std::uint64_t, + double, + std::allocator, + nlohmann::adl_serializer, + std::vector, + token_start_stop >; + +// a parser storing the lexer information for each node +class sax_with_token_start_stop_metadata +{ + public: + using json = json_with_token_start_stop; + using number_integer_t = typename json::number_integer_t; + using number_unsigned_t = typename json::number_unsigned_t; + using number_float_t = typename json::number_float_t; + using string_t = typename json::string_t; + using binary_t = typename json::binary_t; + + /*! 
+ @param[in,out] r reference to a JSON value that is manipulated while + parsing + @param[in] allow_exceptions_ whether parse errors yield exceptions + */ + explicit sax_with_token_start_stop_metadata(json& r, const bool allow_exceptions_ = true) + : root(r) + , ref_stack{} + , object_element{nullptr} + , errored{false} + , allow_exceptions(allow_exceptions_) + , start_stop{} + {} + + void next_token_start(const nlohmann::position_t& p) + { + start_stop.start = p; + } + + void next_token_end(const nlohmann::position_t& p) + { + start_stop.stop = p; + } + + bool null() + { + handle_value(nullptr); + return true; + } + + bool boolean(bool val) + { + handle_value(val); + return true; + } + + bool number_integer(number_integer_t val) + { + handle_value(val); + return true; + } + + bool number_unsigned(number_unsigned_t val) + { + handle_value(val); + return true; + } + + bool number_float(number_float_t val, const string_t& /*unused*/) + { + handle_value(val); + return true; + } + + bool string(string_t& val) + { + handle_value(val); + return true; + } + + bool binary(binary_t& val) + { + handle_value(std::move(val)); + return true; + } + + bool start_object(std::size_t len) + { + ref_stack.push_back(handle_value(json::value_t::object)); + ref_stack.back()->start = start_stop.start; + + if (len != static_cast(-1) && len > ref_stack.back()->max_size()) + { + throw nlohmann::detail::out_of_range::create(408, nlohmann::detail::concat("excessive object size: ", std::to_string(len)), ref_stack.back()); + } + + return true; + } + + bool key(string_t& val) + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_object()); + + // add null at given key and store the reference for later + object_element = &(*ref_stack.back())[val]; + return true; + } + + bool end_object() + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_object()); + + ref_stack.back()->stop = start_stop.stop; + ref_stack.pop_back(); + return true; + } + + bool start_array(std::size_t len) 
+ { + ref_stack.push_back(handle_value(json::value_t::array)); + ref_stack.back()->start = start_stop.start; + + if (len != static_cast(-1) && len > ref_stack.back()->max_size()) + { + throw nlohmann::detail::out_of_range::create(408, nlohmann::detail::concat("excessive array size: ", std::to_string(len)), ref_stack.back()); + } + + return true; + } + + bool end_array() + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_array()); + + ref_stack.back()->stop = start_stop.stop; + ref_stack.pop_back(); + return true; + } + + template + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const Exception& ex) + { + errored = true; + static_cast(ex); + if (allow_exceptions) + { + throw ex; + } + return false; + } + + bool is_errored() const + { + return errored; + } + + private: + /*! + @invariant If the ref stack is empty, then the passed value will be the new + root. + @invariant If the ref stack contains a value, then it is an array or an + object to which we can add elements + */ + template + json* + handle_value(Value&& v) + { + if (ref_stack.empty()) + { + root = json(std::forward(v)); + root.start = start_stop.start; + root.stop = start_stop.stop; + return &root; + } + + assert(ref_stack.back()->is_array() || ref_stack.back()->is_object()); + + if (ref_stack.back()->is_array()) + { + auto& array_element = ref_stack.back()->emplace_back(std::forward(v)); + array_element.start = start_stop.start; + array_element.stop = start_stop.stop; + return &array_element; + } + + assert(ref_stack.back()->is_object()); + assert(object_element); + *object_element = json(std::forward(v)); + object_element->start = start_stop.start; + object_element->stop = start_stop.stop; + return object_element; + } + + /// the parsed JSON value + json& root; + /// stack to model hierarchy of values + std::vector ref_stack{}; + /// helper to hold the reference for the next object element + json* object_element = nullptr; + /// whether a syntax error occurred + 
bool errored = false; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; + /// start / stop information for the current token + token_start_stop start_stop{}; +}; + +void dump(const json_with_token_start_stop& j, std::size_t indentlvl = 0) +{ + const std::string indent(indentlvl * 4, ' '); + switch (j.type()) + { + case nlohmann::json::value_t::null: + { + std::cout << indent << "null(at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::object: + { + std::cout << indent << "object(size=" << j.size() << ", at=" << j.location_str() << ")\n"; + for (const auto& elem : j.items()) + { + dump(elem.value(), indentlvl + 1); + } + } + break; + case nlohmann::json::value_t::array: + { + std::cout << indent << "array(size=" << j.size() << ", at=" << j.location_str() << ")\n"; + for (const auto& elem : j) + { + dump(elem, indentlvl + 1); + } + } + break; + case nlohmann::json::value_t::string: + { + std::cout << indent << "string(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::boolean: + { + std::cout << indent << "boolean(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::number_integer: + { + std::cout << indent << "number_integer(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::number_unsigned: + { + std::cout << indent << "number_unsigned(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::number_float: + { + std::cout << indent << "number_float(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + default: + throw std::runtime_error{"unexpected input"}; + } +} + +int main() +{ + // a JSON text + auto text = R"({ + "Image": { + "Width": 800, + "Height": 600, + "Title": "View from 15th Floor", + "Thumbnail": { + "Url": "http://www.example.com/image/481989943", + "Height": 125, + 
"Width": 100 + }, + "Animated" : false, + "IDs": [116, 943, 234, -38793], + "DeletionDate": null, + "Distance": 12.723374634 + } +})"; + + // create a SAX parser object + json_with_token_start_stop parsed; + sax_with_token_start_stop_metadata sax{parsed}; + + // parse JSON + bool result = json::sax_parse(text, &sax); + + // output the json data + dump(parsed); + + // output the result of sax_parse + std::cout << "\nresult: " << std::boolalpha << result << std::endl; +} diff --git a/docs/examples/sax_parse_with_src_location_in_json.output b/docs/examples/sax_parse_with_src_location_in_json.output new file mode 100644 index 000000000..676682e6d --- /dev/null +++ b/docs/examples/sax_parse_with_src_location_in_json.output @@ -0,0 +1,19 @@ +object(size=1, at=[{l=0:c=0}, {l=15:c=1})) + object(size=8, at=[{l=1:c=17}, {l=14:c=9})) + boolean(val=0, at=[{l=10:c=25}, {l=10:c=30})) + null(at=[{l=12:c=28}, {l=12:c=32})) + number_float(val=12.7234, at=[{l=13:c=24}, {l=13:c=0})) + number_unsigned(val=600, at=[{l=3:c=22}, {l=3:c=25})) + array(size=4, at=[{l=11:c=19}, {l=11:c=42})) + number_unsigned(val=116, at=[{l=11:c=20}, {l=11:c=23})) + number_unsigned(val=943, at=[{l=11:c=25}, {l=11:c=28})) + number_unsigned(val=234, at=[{l=11:c=30}, {l=11:c=33})) + number_integer(val=-38793, at=[{l=11:c=35}, {l=11:c=41})) + object(size=3, at=[{l=5:c=25}, {l=9:c=13})) + number_unsigned(val=125, at=[{l=7:c=26}, {l=7:c=29})) + string(val=http://www.example.com/image/481989943, at=[{l=6:c=26}, {l=6:c=66})) + number_unsigned(val=100, at=[{l=8:c=26}, {l=8:c=0})) + string(val=View from 15th Floor, at=[{l=4:c=22}, {l=4:c=44})) + number_unsigned(val=800, at=[{l=2:c=22}, {l=2:c=25})) + +result: true diff --git a/docs/mkdocs/docs/api/json_sax/index.md b/docs/mkdocs/docs/api/json_sax/index.md index f63e85c9a..719c037f8 100644 --- a/docs/mkdocs/docs/api/json_sax/index.md +++ b/docs/mkdocs/docs/api/json_sax/index.md @@ -37,8 +37,11 @@ processing the input. 
- [**start_array**](start_array.md) (_virtual_) - the beginning of an array was read - [**start_object**](start_object.md) (_virtual_) - the beginning of an object was read - [**string**](string.md) (_virtual_) - a string value was read +- [**next_token_start**](next_token_start.md) - called to provide the start of the next element in the parsed input. +- [**next_token_end**](next_token_end.md) - called to provide the end (one-past-the-end convention) of the next element in the parsed input. ## Version history - Added in version 3.2.0. - Support for binary values (`binary_t`, `binary`) added in version 3.8.0. +- Support for parser location information (`next_token_start`, `next_token_end`) added in version ???.???.???. diff --git a/docs/mkdocs/docs/api/json_sax/next_token_end.md b/docs/mkdocs/docs/api/json_sax/next_token_end.md new file mode 100644 index 000000000..69e5da965 --- /dev/null +++ b/docs/mkdocs/docs/api/json_sax/next_token_end.md @@ -0,0 +1,66 @@ +# nlohmann::json_sax::next_token_end + +Informs the sax parser about the end of the next element. +There are two possible signatures for this method: + +1. +```cpp +void next_token_end(std::size_t pos); +``` +This version is called with the byte position after the next element ends. +This version also works when parsing binary formats such as [msgpack](../basic_json/input_format_t.md). + +2. +```cpp +void next_token_end(const nlohmann::position_t& p); +``` +This version is called with the [detailed parser position information](../position_t/index.md) after the last character of the next element was parsed. +This version is only available when calling `nlohmann::json::sax_parse` with `nlohmann::json::input_format_t::json` and takes precedence. + +## Parameters +1. +`pos` (in) +: Byte position one past the next element's last byte. +2. +`p` (in) +: [Detailed parser position information](../position_t/index.md) after the last char of the next element was parsed. 
+ +## Notes + +Implementing either version is optional, and no function is called if neither version of `next_token_end` is available in the sax parser. + +It is recommended, but not required, to also implement [next_token_start](next_token_start.md). + +## Examples + +??? example + + The example below shows a SAX parser using the first version of this method to log the location. + + ```cpp + --8<-- "examples/sax_parse_with_src_location.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location.output" + ``` + +??? example + + The example below shows a SAX parser using the second version of this method and + storing the location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` +## Version history + +- Added in version ???.???.???. diff --git a/docs/mkdocs/docs/api/json_sax/next_token_start.md b/docs/mkdocs/docs/api/json_sax/next_token_start.md new file mode 100644 index 000000000..49289b7ac --- /dev/null +++ b/docs/mkdocs/docs/api/json_sax/next_token_start.md @@ -0,0 +1,66 @@ +# nlohmann::json_sax::next_token_start + +Informs the sax parser about the start of the next element. +There are two possible signatures for this method: + +1. +```cpp +void next_token_start(std::size_t pos); +``` +This version is called with the byte position where the next element starts. +This version also works when parsing binary formats such as [msgpack](../basic_json/input_format_t.md). + +2. +```cpp +void next_token_start(const nlohmann::position_t& p); +``` +This version is called with [detailed parser position information](../position_t/index.md). +This version is only available when calling `nlohmann::json::sax_parse` with `nlohmann::json::input_format_t::json` and takes precedence. + +## Parameters +1. 
+`pos` (in) +: Byte position where the next element starts. +2. +`p` (in) +: [Detailed parser position information](../position_t/index.md) after the first char of the next element was parsed. + +## Notes + +Implementing either version is optional, and no function is called if neither version of `next_token_start` is available in the sax parser. + +It is recommended, but not required, to also implement [next_token_end](next_token_end.md). + +## Examples + +??? example + + The example below shows a SAX parser using the first version of this method to log the location. + + ```cpp + --8<-- "examples/sax_parse_with_src_location.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location.output" + ``` + +??? example + + The example below shows a SAX parser using the second version of this method and + storing the location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` +## Version history + +- Added in version ???.???.???. diff --git a/docs/mkdocs/docs/api/position_t/chars_read_current_line.md b/docs/mkdocs/docs/api/position_t/chars_read_current_line.md new file mode 100644 index 000000000..740a3875d --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/chars_read_current_line.md @@ -0,0 +1,28 @@ +# nlohmann::position_t::chars_read_current_line + +```cpp +std::size_t chars_read_current_line; +``` + +The number of characters read in the current line. + +## Examples + +??? example + + The example below shows a SAX receiving the element bounds as `nlohmann::position_t` and + storing this location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. 
+ + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. diff --git a/docs/mkdocs/docs/api/position_t/chars_read_total.md b/docs/mkdocs/docs/api/position_t/chars_read_total.md new file mode 100644 index 000000000..9f6e736cf --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/chars_read_total.md @@ -0,0 +1,28 @@ +# nlohmann::position_t::chars_read_total + +```cpp +std::size_t chars_read_total; +``` + +The total number of characters read. + +## Examples + +??? example + + The example below shows a SAX receiving the element bounds as `nlohmann::position_t` and + storing this location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. diff --git a/docs/mkdocs/docs/api/position_t/index.md b/docs/mkdocs/docs/api/position_t/index.md new file mode 100644 index 000000000..16c4fd431 --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/index.md @@ -0,0 +1,23 @@ +# nlohmann::position_t + +```cpp +struct position_t; +``` + +This type represents the parser's position when parsing a JSON string. +This position can be retrieved when using a [sax parser](../json_sax/index.md) with the format `nlohmann::json::input_format_t::json` +and implementing [next_token_start](../json_sax/next_token_start.md) or [next_token_end](../json_sax/next_token_end.md). + +## Member functions + +- [**operator size_t**](operator_size_t.md) - return the value of [chars_read_total](chars_read_total.md). 
+ +## Member variables + +- [**chars_read_total**](chars_read_total.md) - The total number of characters read. +- [**lines_read**](lines_read.md) - The number of lines read. +- [**chars_read_current_line**](chars_read_current_line.md) - The number of characters read in the current line. + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. diff --git a/docs/mkdocs/docs/api/position_t/lines_read.md b/docs/mkdocs/docs/api/position_t/lines_read.md new file mode 100644 index 000000000..e22ee1d45 --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/lines_read.md @@ -0,0 +1,28 @@ +# nlohmann::position_t::lines_read + +```cpp +std::size_t lines_read; +``` + +The number of lines read. + +## Examples + +??? example + + The example below shows a SAX receiving the element bounds as `nlohmann::position_t` and + storing this location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. diff --git a/docs/mkdocs/docs/api/position_t/operator_size_t.md b/docs/mkdocs/docs/api/position_t/operator_size_t.md new file mode 100644 index 000000000..bc0325fd4 --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/operator_size_t.md @@ -0,0 +1,28 @@ +# nlohmann::position_t::operator size_t + +```cpp +constexpr operator size_t() const; +``` + +Returns the value of [chars_read_total](chars_read_total.md). + +## Examples + +??? example + + The example below shows a SAX receiving the element bounds as `nlohmann::position_t` and + storing this location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. 
+ + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. diff --git a/docs/mkdocs/docs/features/parsing/sax_interface.md b/docs/mkdocs/docs/features/parsing/sax_interface.md index 0796a55f5..e925d07c0 100644 --- a/docs/mkdocs/docs/features/parsing/sax_interface.md +++ b/docs/mkdocs/docs/features/parsing/sax_interface.md @@ -67,6 +67,30 @@ To implement your own SAX handler, proceed as follows: Note the `sax_parse` function only returns a `#!cpp bool` indicating the result of the last executed SAX event. It does not return `json` value - it is up to you to decide what to do with the SAX events. Furthermore, no exceptions are thrown in case of a parse error - it is up to you what to do with the exception object passed to your `parse_error` implementation. Internally, the SAX interface is used for the DOM parser (class `json_sax_dom_parser`) as well as the acceptor (`json_sax_acceptor`), see file `json_sax.hpp`. +## Element position information + +The position of a parsed element can be retrieved by implementing the optional methods [next_token_start](../../api/json_sax/next_token_start.md) and [next_token_end](../../api/json_sax/next_token_end.md). +These methods will be called with the parser position before any of the other methods are called and can be used to retrieve the half-open bounds (`[start, end)`) of a parsed element. + +These methods come in two flavors: + +1. +```cpp +void next_token_start(std::size_t pos); +void next_token_end(std::size_t pos); +``` +This flavor is called with the byte positions of each element and is available for any `nlohmann::json::input_format_t` passed to `nlohmann::json::sax_parse`. + +2. 
+```cpp +void next_token_start(const nlohmann::position_t& p); +void next_token_end(const nlohmann::position_t& p); +``` +This flavor is called with the [detailed parser position information](../../api/position_t/index.md) of each element and is only available if `nlohmann::json::sax_parse` is called with `nlohmann::json::input_format_t::json`. +Furthermore, this flavor takes precedence over the first flavor. + +Depending on the required information it is possible for the SAX parser to implement all four or only one or none of these methods. + ## See also - [json_sax](../../api/json_sax/index.md) - documentation of the SAX interface diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index 8319354a4..f29960ecf 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -250,6 +250,8 @@ nav: - 'start_array': api/json_sax/start_array.md - 'start_object': api/json_sax/start_object.md - 'string': api/json_sax/string.md + - 'next_token_start' : api/json_sax/next_token_start.md + - 'next_token_end' : api/json_sax/next_token_end.md - 'operator<<(basic_json)': api/operator_ltlt.md - 'operator<<(json_pointer)': api/operator_ltlt.md - 'operator>>(basic_json)': api/operator_gtgt.md @@ -257,6 +259,12 @@ nav: - 'operator""_json_pointer': api/operator_literal_json_pointer.md - 'ordered_json': api/ordered_json.md - 'ordered_map': api/ordered_map.md + - position_t: + - 'Overview': api/position_t/index.md + - 'operator size_t': api/position_t/operator_size_t.md + - 'chars_read_total': api/position_t/chars_read_total.md + - 'lines_read': api/position_t/lines_read.md + - 'chars_read_current_line': api/position_t/chars_read_current_line.md - macros: - 'Overview': api/macros/index.md - 'JSON_ASSERT': api/macros/json_assert.md diff --git a/include/nlohmann/detail/input/binary_reader.hpp b/include/nlohmann/detail/input/binary_reader.hpp index 263fdb525..9413f92da 100644 --- a/include/nlohmann/detail/input/binary_reader.hpp +++ 
b/include/nlohmann/detail/input/binary_reader.hpp @@ -167,8 +167,9 @@ class binary_reader bool parse_bson_internal() { std::int32_t document_size{}; + detail::sax_call_next_token_start_pos(sax, chars_read); get_number(input_format_t::bson, document_size); - + detail::sax_call_next_token_end_pos(sax, chars_read); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast(-1)))) { return false; @@ -179,6 +180,7 @@ class binary_reader return false; } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_object(); } @@ -276,6 +278,7 @@ class binary_reader case 0x01: // double { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(number)); return get_number(input_format_t::bson, number) && sax->number_float(static_cast(number), ""); } @@ -283,7 +286,10 @@ class binary_reader { std::int32_t len{}; string_t value; - return get_number(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_number(input_format_t::bson, len) && get_bson_string(len, value); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(value); } case 0x03: // object @@ -300,28 +306,35 @@ class binary_reader { std::int32_t len{}; binary_t value; - return get_number(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_number(input_format_t::bson, len) && get_bson_binary(len, value); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(value); } case 0x08: // boolean { + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + 1); return sax->boolean(get() != 0); } case 0x0A: // null { + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->null(); } case 0x10: // int32 { std::int32_t 
value{}; - return get_number(input_format_t::bson, value) && sax->number_integer(value); + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(value)); + return get_number(input_format_t::bson, value) && sax->number_integer(value); } case 0x12: // int64 { std::int64_t value{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(value)); return get_number(input_format_t::bson, value) && sax->number_integer(value); } @@ -360,14 +373,22 @@ class binary_reader } const std::size_t element_type_parse_position = chars_read; + if (!is_array) + { + detail::sax_call_next_token_start_pos(sax, chars_read); + } if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key))) { return false; } - if (!is_array && !sax->key(key)) + if (!is_array) { - return false; + detail::sax_call_next_token_end_pos(sax, chars_read); + if (!sax->key(key)) + { + return false; + } } if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position))) @@ -389,6 +410,7 @@ class binary_reader bool parse_bson_array() { std::int32_t document_size{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(std::int32_t)); get_number(input_format_t::bson, document_size); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast(-1)))) @@ -401,6 +423,7 @@ class binary_reader return false; } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_array(); } @@ -450,29 +473,34 @@ class binary_reader case 0x15: case 0x16: case 0x17: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_unsigned(static_cast(current)); case 0x18: // Unsigned integer (one-byte uint8_t follows) { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x19: // Unsigned integer (two-byte uint16_t follows) { 
std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x1A: // Unsigned integer (four-byte uint32_t follows) { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x1B: // Unsigned integer (eight-byte uint64_t follows) { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } @@ -501,29 +529,34 @@ class binary_reader case 0x35: case 0x36: case 0x37: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_integer(static_cast(0x20 - 1 - current)); case 0x38: // Negative integer (one-byte uint8_t follows) { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x39: // Negative integer -1-n (two-byte uint16_t follows) { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x3A: // Negative integer -1-n (four-byte uint32_t follows) { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows) { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return 
get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - static_cast(number)); } @@ -560,7 +593,10 @@ class binary_reader case 0x5F: // Binary data (indefinite length) { binary_t b; - return get_cbor_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } // UTF-8 string (0x00..0x17 bytes follow) @@ -595,7 +631,10 @@ class binary_reader case 0x7F: // UTF-8 string (indefinite length) { string_t s; - return get_cbor_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } // array (0x00..0x17 data items follow) @@ -623,35 +662,51 @@ class binary_reader case 0x95: case 0x96: case 0x97: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_array( conditional_static_cast(static_cast(current) & 0x1Fu), tag_handler); case 0x98: // array (one-byte uint8_t for n follows) { std::uint8_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(static_cast(len), tag_handler); } case 0x99: // array (two-byte uint16_t for n follow) { std::uint16_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(static_cast(len), tag_handler); } case 
0x9A: // array (four-byte uint32_t for n follow) { std::uint32_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(conditional_static_cast(len), tag_handler); } case 0x9B: // array (eight-byte uint64_t for n follow) { std::uint64_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(conditional_static_cast(len), tag_handler); } case 0x9F: // array (indefinite length) + { + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_array(static_cast(-1), tag_handler); + } // map (0x00..0x17 pairs of data items follow) case 0xA0: @@ -678,33 +733,47 @@ class binary_reader case 0xB5: case 0xB6: case 0xB7: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_object(conditional_static_cast(static_cast(current) & 0x1Fu), tag_handler); case 0xB8: // map (one-byte uint8_t for n follows) { std::uint8_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(static_cast(len), tag_handler); } case 0xB9: // map (two-byte uint16_t for n follow) { std::uint16_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + 
detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(static_cast(len), tag_handler); } case 0xBA: // map (four-byte uint32_t for n follow) { std::uint32_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(conditional_static_cast(len), tag_handler); } case 0xBB: // map (eight-byte uint64_t for n follow) { std::uint64_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(conditional_static_cast(len), tag_handler); } case 0xBF: // map (indefinite length) + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_object(static_cast(-1), tag_handler); case 0xC6: // tagged item @@ -809,7 +878,10 @@ class binary_reader return parse_cbor_internal(true, tag_handler); } get(); - return get_cbor_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_cbor_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } default: // LCOV_EXCL_LINE @@ -819,16 +891,20 @@ class binary_reader } case 0xF4: // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 0xF5: // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); 
return sax->boolean(true); case 0xF6: // null + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 0xF9: // Half-Precision Float (two-byte IEEE 754) { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); const auto byte1_raw = get(); if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number"))) { @@ -870,6 +946,7 @@ class binary_reader return std::ldexp(mant + 1024, exp - 25); } }(); + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->number_float((half & 0x8000u) != 0 ? static_cast(-val) : static_cast(val), ""); @@ -878,12 +955,14 @@ class binary_reader case 0xFA: // Single-Precision Float (four-byte IEEE 754) { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); } case 0xFB: // Double-Precision Float (eight-byte IEEE 754) { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); } @@ -1127,6 +1206,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_array(); } @@ -1152,7 +1232,10 @@ class binary_reader for (std::size_t i = 0; i < len; ++i) { get(); - if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -1168,7 +1251,10 @@ class binary_reader { while (get() != 0xFF) { - if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(key); + 
detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -1182,6 +1268,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_object(); } @@ -1329,6 +1416,7 @@ class binary_reader case 0x7D: case 0x7E: case 0x7F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_unsigned(static_cast(current)); // fixmap @@ -1348,6 +1436,7 @@ class binary_reader case 0x8D: case 0x8E: case 0x8F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_msgpack_object(conditional_static_cast(static_cast(current) & 0x0Fu)); // fixarray @@ -1367,6 +1456,7 @@ class binary_reader case 0x9D: case 0x9E: case 0x9F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_msgpack_array(conditional_static_cast(static_cast(current) & 0x0Fu)); // fixstr @@ -1407,16 +1497,22 @@ class binary_reader case 0xDB: // str 32 { string_t s; - return get_msgpack_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } case 0xC0: // nil + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 0xC2: // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 0xC3: // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 0xC4: // bin 8 @@ -1432,90 +1528,107 @@ class binary_reader case 0xD8: // fixext 16 { binary_t b; - return get_msgpack_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + 
return result_get && sax->binary(b); } case 0xCA: // float 32 { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); } case 0xCB: // float 64 { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); } case 0xCC: // uint 8 { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCD: // uint 16 { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCE: // uint 32 { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCF: // uint 64 { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xD0: // int 8 { std::int8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD1: // int 16 { std::int16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD2: // int 32 { std::int32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + 
sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD3: // int 64 { std::int64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xDC: // array 16 { std::uint16_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast(len)); } case 0xDD: // array 32 { std::uint32_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_array(conditional_static_cast(len)); } case 0xDE: // map 16 { std::uint16_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast(len)); } case 0xDF: // map 32 { std::uint32_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_object(conditional_static_cast(len)); } @@ -1552,6 +1665,7 @@ class binary_reader case 0xFD: case 0xFE: case 0xFF: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_integer(static_cast(current)); default: // anything else @@ -1782,6 +1896,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_array(); } @@ -1800,7 +1915,10 @@ class binary_reader for (std::size_t i = 0; i < len; ++i) { get(); - if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || 
!sax->key(key))) { return false; } @@ -1812,6 +1930,7 @@ class binary_reader key.clear(); } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_object(); } @@ -2174,7 +2293,6 @@ class binary_reader return true; } } - string_t key = "_ArraySize_"; if (JSON_HEDLEY_UNLIKELY(!sax->start_object(3) || !sax->key(key) || !sax->start_array(dim.size()))) { @@ -2235,7 +2353,6 @@ class binary_reader bool is_ndarray = false; get_ignore_noop(); - if (current == '$') { result.second = get(); // must not ignore 'N', because 'N' maybe the type @@ -2264,7 +2381,9 @@ class binary_reader exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr)); } + // detail::sax_call_next_token_start_pos(sax, chars_read - 1); const bool is_error = get_ubjson_size_value(result.first, is_ndarray); + //detail::sax_call_next_token_end_pos(sax, chars_read); if (input_format == input_format_t::bjdata && is_ndarray) { if (inside_ndarray) @@ -2279,7 +2398,9 @@ class binary_reader if (current == '#') { + // detail::sax_call_next_token_start_pos(sax, chars_read - 1); const bool is_error = get_ubjson_size_value(result.first, is_ndarray); + // detail::sax_call_next_token_end_pos(sax, chars_read); if (input_format == input_format_t::bjdata && is_ndarray) { return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read, @@ -2288,6 +2409,7 @@ class binary_reader return is_error; } + // detail::sax_call_next_token_start_end_pos(sax, chars_read - 2, chars_read - 1); return true; } @@ -2303,40 +2425,47 @@ class binary_reader return unexpect_eof(input_format, "value"); case 'T': // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 'F': // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 'Z': // null + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, 
chars_read); return sax->null(); case 'U': { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } case 'i': { std::int8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } case 'I': { std::int16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } case 'l': { std::int32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } - case 'L': { std::int64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } @@ -2347,6 +2476,7 @@ class binary_reader break; } std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } @@ -2357,6 +2487,7 @@ class binary_reader break; } std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } @@ -2367,11 +2498,13 @@ class binary_reader break; } std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } case 'h': { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); if (input_format != input_format_t::bjdata) { break; @@ -2417,25 +2550,30 @@ class binary_reader return std::ldexp(mant + 1024, exp - 
25); } }(); + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->number_float((half & 0x8000u) != 0 ? static_cast(-val) - : static_cast(val), ""); + : static_cast(val), + ""); } case 'd': { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_float(static_cast(number), ""); } case 'D': { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_float(static_cast(number), ""); } case 'H': { + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_high_precision_number(); } @@ -2453,19 +2591,25 @@ class binary_reader exception_message(input_format, concat("byte after 'C' must be in range 0x00..0x7F; last byte: 0x", last_token), "char"), nullptr)); } string_t s(1, static_cast(current)); + detail::sax_call_next_token_start_end_pos(sax, chars_read - 2, chars_read); return sax->string(s); } case 'S': // string { string_t s; - return get_ubjson_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } case '[': // array + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_array(); case '{': // object + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_object(); default: // anything else @@ -2480,6 +2624,7 @@ class binary_reader */ bool get_ubjson_array() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); std::pair size_and_type; if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) { @@ -2504,6 +2649,7 @@ class binary_reader exception_message(input_format, "invalid byte: 0x" + last_token, "type"), nullptr)); } + 
detail::sax_call_next_token_end_pos(sax, chars_read); string_t type = it->second; // sax->string() takes a reference if (JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->string(type))) { @@ -2515,6 +2661,7 @@ class binary_reader size_and_type.second = 'U'; } + detail::sax_call_next_token_start_end_pos(sax, chars_read); key = "_ArrayData_"; if (JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->start_array(size_and_type.first) )) { @@ -2523,17 +2670,20 @@ class binary_reader for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) { return false; } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return (sax->end_array() && sax->end_object()); } if (size_and_type.first != npos) { + detail::sax_call_next_token_end_pos(sax, chars_read); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first))) { return false; @@ -2545,6 +2695,7 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) { return false; @@ -2556,6 +2707,7 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) { return false; @@ -2565,6 +2717,7 @@ class binary_reader } else { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast(-1)))) { return false; @@ -2580,6 +2733,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_array(); } @@ -2588,6 +2742,7 @@ class binary_reader */ bool get_ubjson_object() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); std::pair size_and_type; if 
(JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) { @@ -2605,6 +2760,7 @@ class binary_reader string_t key; if (size_and_type.first != npos) { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first))) { return false; @@ -2614,7 +2770,10 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -2629,7 +2788,10 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -2643,6 +2805,7 @@ class binary_reader } else { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast(-1)))) { return false; @@ -2650,7 +2813,10 @@ class binary_reader while (current != '}') { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key, false); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -2663,6 +2829,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_object(); } @@ -2671,6 +2838,7 @@ class binary_reader bool get_ubjson_high_precision_number() { + detail::sax_call_next_token_start_pos(sax, 
chars_read - 1); // get size of following number string std::size_t size{}; bool no_ndarray = true; @@ -2691,6 +2859,7 @@ class binary_reader } number_vector.push_back(static_cast(current)); } + detail::sax_call_next_token_end_pos(sax, chars_read); // parse number string using ia_type = decltype(detail::input_adapter(number_vector)); @@ -2888,6 +3057,7 @@ class binary_reader { if (JSON_HEDLEY_UNLIKELY(current == std::char_traits::eof())) { + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->parse_error(chars_read, "", parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), nullptr)); } diff --git a/include/nlohmann/detail/input/lexer.hpp b/include/nlohmann/detail/input/lexer.hpp index 50fc9df59..5eab2ee71 100644 --- a/include/nlohmann/detail/input/lexer.hpp +++ b/include/nlohmann/detail/input/lexer.hpp @@ -1506,13 +1506,13 @@ scan_number_done: while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); } - token_type scan() + bool scan_start() { // initially, skip the BOM if (position.chars_read_total == 0 && !skip_bom()) { error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; - return token_type::parse_error; + return false; } // read next character and ignore whitespace @@ -1523,13 +1523,17 @@ scan_number_done: { if (!scan_comment()) { - return token_type::parse_error; + return false; } // skip following whitespace skip_whitespace(); } + return true; + } + token_type scan_end() + { switch (current) { // structural characters @@ -1593,6 +1597,10 @@ scan_number_done: return token_type::parse_error; } } + token_type scan() + { + return !scan_start() ? 
token_type::parse_error : scan_end(); + } private: /// input adapter diff --git a/include/nlohmann/detail/input/parser.hpp b/include/nlohmann/detail/input/parser.hpp index 8acbd4fca..af20e3167 100644 --- a/include/nlohmann/detail/input/parser.hpp +++ b/include/nlohmann/detail/input/parser.hpp @@ -76,8 +76,6 @@ class parser , m_lexer(std::move(adapter), skip_comments) , allow_exceptions(allow_exceptions_) { - // read first token - get_token(); } /*! @@ -98,7 +96,7 @@ class parser sax_parse_internal(&sdp); // in strict mode, input must be completely read - if (strict && (get_token() != token_type::end_of_input)) + if (strict && (get_token(&sdp) != token_type::end_of_input)) { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -126,7 +124,7 @@ class parser sax_parse_internal(&sdp); // in strict mode, input must be completely read - if (strict && (get_token() != token_type::end_of_input)) + if (strict && (get_token(&sdp) != token_type::end_of_input)) { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -164,7 +162,7 @@ class parser const bool result = sax_parse_internal(sax); // strict mode: next byte must be EOF - if (result && strict && (get_token() != token_type::end_of_input)) + if (result && strict && (get_token(sax) != token_type::end_of_input)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -185,6 +183,8 @@ class parser // value to avoid a goto (see comment where set to true) bool skip_to_state_evaluation = false; + // read first token + get_token(sax); while (true) { if (!skip_to_state_evaluation) @@ -200,7 +200,7 @@ class parser } // closing } -> we are done - if (get_token() == token_type::end_object) + if (get_token(sax) == token_type::end_object) { if (JSON_HEDLEY_UNLIKELY(!sax->end_object())) { @@ -222,7 +222,7 @@ class parser } // parse separator (:) - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != 
token_type::name_separator)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -233,7 +233,7 @@ class parser states.push_back(false); // parse values - get_token(); + get_token(sax); continue; } @@ -245,7 +245,7 @@ class parser } // closing ] -> we are done - if (get_token() == token_type::end_array) + if (get_token(sax) == token_type::end_array) { if (JSON_HEDLEY_UNLIKELY(!sax->end_array())) { @@ -372,10 +372,10 @@ class parser if (states.back()) // array { // comma -> next value - if (get_token() == token_type::value_separator) + if (get_token(sax) == token_type::value_separator) { // parse a new value - get_token(); + get_token(sax); continue; } @@ -405,10 +405,10 @@ class parser // states.back() is false -> object // comma -> next value - if (get_token() == token_type::value_separator) + if (get_token(sax) == token_type::value_separator) { // parse key - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::value_string)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -421,7 +421,7 @@ class parser } // parse separator (:) - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -429,7 +429,7 @@ class parser } // parse values - get_token(); + get_token(sax); continue; } @@ -457,10 +457,19 @@ class parser } } - /// get next token from lexer - token_type get_token() + /// get next token from lexer and pass position info to sax (if it is accepted) + template + token_type get_token(SAX* sax) { - return last_token = m_lexer.scan(); + if (!m_lexer.scan_start()) + { + last_token = token_type::parse_error; + return token_type::parse_error; + } + detail::sax_call_next_token_start_pos(sax, m_lexer); + last_token = m_lexer.scan_end(); + 
detail::sax_call_next_token_end_pos(sax, m_lexer); + return last_token; } std::string exception_message(const token_type expected, const std::string& context) diff --git a/include/nlohmann/detail/input/position_t.hpp b/include/nlohmann/detail/input/position_t.hpp index 396db0e16..5450ee961 100644 --- a/include/nlohmann/detail/input/position_t.hpp +++ b/include/nlohmann/detail/input/position_t.hpp @@ -13,9 +13,6 @@ #include NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - /// struct to capture the start position of the current token struct position_t { @@ -32,6 +29,4 @@ struct position_t return chars_read_total; } }; - -} // namespace detail NLOHMANN_JSON_NAMESPACE_END diff --git a/include/nlohmann/detail/meta/is_sax.hpp b/include/nlohmann/detail/meta/is_sax.hpp index 215008963..38831e56c 100644 --- a/include/nlohmann/detail/meta/is_sax.hpp +++ b/include/nlohmann/detail/meta/is_sax.hpp @@ -15,10 +15,180 @@ #include #include #include +#include NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail { +// helper struct to call sax->next_token_start +//(we want this functionality as a type to ease passing it as template argument) +struct sax_call_next_token_start_pos_direct +{ + template + static auto call(SAX* sax, Ts&& ...ts) + -> decltype(sax->next_token_start(std::forward(ts)...)) + { + sax->next_token_start(std::forward(ts)...); + } +}; +// helper struct to call sax->next_token_end +// (we want this functionality as a type to ease passing it as template argument) +struct sax_call_next_token_end_pos_direct +{ + template + static auto call(SAX* sax, Ts&& ...ts) + -> decltype(sax->next_token_end(std::forward(ts)...)) + { + sax->next_token_end(std::forward(ts)...); + } +}; + +// dispatch the calls to next_token_start next_token_end +// and drop the calls if the sax parser does not support these methods. 
+// +// DirectCaller can be set to one of sax_call_next_token_{start,end}_pos_direct to +// determine which method is called +template +struct sax_call_function +{ + // is the parameter a lexer or a byte position + static constexpr bool called_with_byte_pos = std::is_same::value; + + template + using call_t = decltype(DirectCaller::call(std::declval(), std::declval()...)); + + //the sax parser supports calls with a position + static constexpr bool detected_call_with_byte_pos = + is_detected_exact::value; + + //the sax parser supports calls with a lexer + static constexpr bool detected_call_with_lex_pos = + !called_with_byte_pos && + is_detected_exact::value; + + //there either has to be a version accepting a lexer or a position + static constexpr bool valid = detected_call_with_byte_pos || detected_call_with_lex_pos; + + //called with byte pos and byte pos is method supported -> pass data on + template + static typename std::enable_if < + std::is_same::value && + valid && + detected_call_with_byte_pos + >::type + call(SaxT* sax, std::size_t pos) + { + DirectCaller::call(sax, pos); + } + + //the sax parser has no version of the method -> drop call + template + static typename std::enable_if < + std::is_same::value && + !valid + >::type + call(SaxT* /*unused*/, const LexOrPos& /*unused*/) {} + + //called with lex and lex pos method is supported -> call with position from lexer + // the start pos in the lexer is last read char -> chars_read_total-1 + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + JSON_ASSERT(lex.get_position().chars_read_total > 0); + JSON_ASSERT(lex.get_position().chars_read_current_line > 0); + //the lexer has already read the first char of the current element -> fix this + auto pos_copy = lex.get_position(); + --pos_copy.chars_read_total; + --pos_copy.chars_read_current_line; + 
DirectCaller::call(sax, pos_copy); + } + + //called with lex and lex pos method is supported -> pass data on + // the one past end pos in the lexer is the current index -> chars_read_total + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position()); + } + + // called with lex and only byte pos method is supported -> call with byte position from lexer + // the start pos in the lexer is last read char -> chars_read_total-1 + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + !detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + JSON_ASSERT(lex.get_position().chars_read_total > 0); + DirectCaller::call(sax, lex.get_position().chars_read_total - 1); + } + + // called with lex and only byte pos method is supported -> call with byte position from lexer + // the one past end pos in the lexer is the current index -> chars_read_total + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + !detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position().chars_read_total); + } +}; + +//set the element start pos of a sax parser by calling any version of sax->next_token_start (if available) +template +void sax_call_next_token_start_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + using call_t = sax_call_function; + call_t::call(sax, lexOrPos); +} +//set the element end pos of a sax parser by calling any version of sax->next_token_end (if available) +template +void sax_call_next_token_end_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + using call_t = sax_call_function; + call_t::call(sax, lexOrPos); +} +//set the element start end pos of a 
sax parser by calling any version of +// sax->next_token_start and sax->next_token_end (if available) +template +void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos1& lexOrPos1, const LexOrPos2& lexOrPos2) +{ + sax_call_next_token_start_pos(sax, lexOrPos1); + sax_call_next_token_end_pos(sax, lexOrPos2); +} +//set the element start end pos of a sax parser by calling any version of +// sax->next_token_start and sax->next_token_end (if available) +template +void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + sax_call_next_token_start_pos(sax, lexOrPos); + sax_call_next_token_end_pos(sax, lexOrPos); +} + + template using null_function_t = decltype(std::declval().null()); diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 14bc07d5f..695b7e601 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -3022,9 +3022,6 @@ NLOHMANN_JSON_NAMESPACE_END NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - /// struct to capture the start position of the current token struct position_t { @@ -3041,8 +3038,6 @@ struct position_t return chars_read_total; } }; - -} // namespace detail NLOHMANN_JSON_NAMESPACE_END // #include @@ -8810,13 +8805,13 @@ scan_number_done: while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); } - token_type scan() + bool scan_start() { // initially, skip the BOM if (position.chars_read_total == 0 && !skip_bom()) { error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; - return token_type::parse_error; + return false; } // read next character and ignore whitespace @@ -8827,13 +8822,17 @@ scan_number_done: { if (!scan_comment()) { - return token_type::parse_error; + return false; } // skip following whitespace skip_whitespace(); } + return true; + } + token_type scan_end() + { switch (current) { // structural characters @@ -8897,6 +8896,10 @@ scan_number_done: return token_type::parse_error; } } + token_type scan() + { + 
return !scan_start() ? token_type::parse_error : scan_end(); + } private: /// input adapter @@ -8958,10 +8961,181 @@ NLOHMANN_JSON_NAMESPACE_END // #include +// #include + NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail { +// helper struct to call sax->next_token_start +//(we want this functionality as a type to ease passing it as template argument) +struct sax_call_next_token_start_pos_direct +{ + template + static auto call(SAX* sax, Ts&& ...ts) + -> decltype(sax->next_token_start(std::forward(ts)...)) + { + sax->next_token_start(std::forward(ts)...); + } +}; +// helper struct to call sax->next_token_end +// (we want this functionality as a type to ease passing it as template argument) +struct sax_call_next_token_end_pos_direct +{ + template + static auto call(SAX* sax, Ts&& ...ts) + -> decltype(sax->next_token_end(std::forward(ts)...)) + { + sax->next_token_end(std::forward(ts)...); + } +}; + +// dispatch the calls to next_token_start next_token_end +// and drop the calls if the sax parser does not support these methods. 
+// +// DirectCaller can be set to one of sax_call_next_token_{start,end}_pos_direct to +// determine which method is called +template +struct sax_call_function +{ + // is the parameter a lexer or a byte position + static constexpr bool called_with_byte_pos = std::is_same::value; + + template + using call_t = decltype(DirectCaller::call(std::declval(), std::declval()...)); + + //the sax parser supports calls with a position + static constexpr bool detected_call_with_byte_pos = + is_detected_exact::value; + + //the sax parser supports calls with a lexer + static constexpr bool detected_call_with_lex_pos = + !called_with_byte_pos && + is_detected_exact::value; + + //there either has to be a version accepting a lexer or a position + static constexpr bool valid = detected_call_with_byte_pos || detected_call_with_lex_pos; + + //called with byte pos and byte pos is method supported -> pass data on + template + static typename std::enable_if < + std::is_same::value && + valid && + detected_call_with_byte_pos + >::type + call(SaxT* sax, std::size_t pos) + { + DirectCaller::call(sax, pos); + } + + //the sax parser has no version of the method -> drop call + template + static typename std::enable_if < + std::is_same::value && + !valid + >::type + call(SaxT* /*unused*/, const LexOrPos& /*unused*/) {} + + //called with lex and lex pos method is supported -> call with position from lexer + // the start pos in the lexer is last read char -> chars_read_total-1 + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + JSON_ASSERT(lex.get_position().chars_read_total > 0); + JSON_ASSERT(lex.get_position().chars_read_current_line > 0); + //the lexer has already read the first char of the current element -> fix this + auto pos_copy = lex.get_position(); + --pos_copy.chars_read_total; + --pos_copy.chars_read_current_line; + 
DirectCaller::call(sax, pos_copy); + } + + //called with lex and lex pos method is supported -> pass data on + // the one past end pos in the lexer is the current index -> chars_read_total + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position()); + } + + // called with lex and only byte pos method is supported -> call with byte position from lexer + // the start pos in the lexer is last read char -> chars_read_total-1 + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + !detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + JSON_ASSERT(lex.get_position().chars_read_total > 0); + DirectCaller::call(sax, lex.get_position().chars_read_total - 1); + } + + // called with lex and only byte pos method is supported -> call with byte position from lexer + // the one past end pos in the lexer is the current index -> chars_read_total + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + !detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position().chars_read_total); + } +}; + +//set the element start pos of a sax parser by calling any version of sax->next_token_start (if available) +template +void sax_call_next_token_start_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + using call_t = sax_call_function; + call_t::call(sax, lexOrPos); +} +//set the element end pos of a sax parser by calling any version of sax->next_token_end (if available) +template +void sax_call_next_token_end_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + using call_t = sax_call_function; + call_t::call(sax, lexOrPos); +} +//set the element start end pos of a 
sax parser by calling any version of +// sax->next_token_start and sax->next_token_end (if available) +template +void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos1& lexOrPos1, const LexOrPos2& lexOrPos2) +{ + sax_call_next_token_start_pos(sax, lexOrPos1); + sax_call_next_token_end_pos(sax, lexOrPos2); +} +//set the element start end pos of a sax parser by calling any version of +// sax->next_token_start and sax->next_token_end (if available) +template +void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + sax_call_next_token_start_pos(sax, lexOrPos); + sax_call_next_token_end_pos(sax, lexOrPos); +} + + template using null_function_t = decltype(std::declval().null()); @@ -9244,8 +9418,9 @@ class binary_reader bool parse_bson_internal() { std::int32_t document_size{}; + detail::sax_call_next_token_start_pos(sax, chars_read); get_number(input_format_t::bson, document_size); - + detail::sax_call_next_token_end_pos(sax, chars_read); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast(-1)))) { return false; @@ -9256,6 +9431,7 @@ class binary_reader return false; } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_object(); } @@ -9353,6 +9529,7 @@ class binary_reader case 0x01: // double { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(number)); return get_number(input_format_t::bson, number) && sax->number_float(static_cast(number), ""); } @@ -9360,7 +9537,10 @@ class binary_reader { std::int32_t len{}; string_t value; - return get_number(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_number(input_format_t::bson, len) && get_bson_string(len, value); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(value); } case 0x03: // object @@ -9377,28 +9557,35 @@ class binary_reader { 
std::int32_t len{}; binary_t value; - return get_number(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_number(input_format_t::bson, len) && get_bson_binary(len, value); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(value); } case 0x08: // boolean { + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + 1); return sax->boolean(get() != 0); } case 0x0A: // null { + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->null(); } case 0x10: // int32 { std::int32_t value{}; - return get_number(input_format_t::bson, value) && sax->number_integer(value); + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(value)); + return get_number(input_format_t::bson, value) && sax->number_integer(value); } case 0x12: // int64 { std::int64_t value{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(value)); return get_number(input_format_t::bson, value) && sax->number_integer(value); } @@ -9437,14 +9624,22 @@ class binary_reader } const std::size_t element_type_parse_position = chars_read; + if (!is_array) + { + detail::sax_call_next_token_start_pos(sax, chars_read); + } if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key))) { return false; } - if (!is_array && !sax->key(key)) + if (!is_array) { - return false; + detail::sax_call_next_token_end_pos(sax, chars_read); + if (!sax->key(key)) + { + return false; + } } if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position))) @@ -9466,6 +9661,7 @@ class binary_reader bool parse_bson_array() { std::int32_t document_size{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(std::int32_t)); get_number(input_format_t::bson, document_size); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast(-1)))) @@ -9478,6 +9674,7 @@ 
class binary_reader return false; } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_array(); } @@ -9527,29 +9724,34 @@ class binary_reader case 0x15: case 0x16: case 0x17: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_unsigned(static_cast(current)); case 0x18: // Unsigned integer (one-byte uint8_t follows) { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x19: // Unsigned integer (two-byte uint16_t follows) { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x1A: // Unsigned integer (four-byte uint32_t follows) { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x1B: // Unsigned integer (eight-byte uint64_t follows) { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } @@ -9578,29 +9780,34 @@ class binary_reader case 0x35: case 0x36: case 0x37: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_integer(static_cast(0x20 - 1 - current)); case 0x38: // Negative integer (one-byte uint8_t follows) { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x39: // Negative integer -1-n (two-byte uint16_t follows) { std::uint16_t number{}; + 
detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x3A: // Negative integer -1-n (four-byte uint32_t follows) { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows) { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - static_cast(number)); } @@ -9637,7 +9844,10 @@ class binary_reader case 0x5F: // Binary data (indefinite length) { binary_t b; - return get_cbor_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } // UTF-8 string (0x00..0x17 bytes follow) @@ -9672,7 +9882,10 @@ class binary_reader case 0x7F: // UTF-8 string (indefinite length) { string_t s; - return get_cbor_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } // array (0x00..0x17 data items follow) @@ -9700,35 +9913,51 @@ class binary_reader case 0x95: case 0x96: case 0x97: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_array( conditional_static_cast(static_cast(current) & 0x1Fu), tag_handler); case 0x98: // array (one-byte uint8_t for n follows) { std::uint8_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + 
detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(static_cast(len), tag_handler); } case 0x99: // array (two-byte uint16_t for n follow) { std::uint16_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(static_cast(len), tag_handler); } case 0x9A: // array (four-byte uint32_t for n follow) { std::uint32_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(conditional_static_cast(len), tag_handler); } case 0x9B: // array (eight-byte uint64_t for n follow) { std::uint64_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(conditional_static_cast(len), tag_handler); } case 0x9F: // array (indefinite length) + { + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_array(static_cast(-1), tag_handler); + } // map (0x00..0x17 pairs of data items follow) case 0xA0: @@ -9755,33 +9984,47 @@ class binary_reader case 0xB5: case 0xB6: case 0xB7: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return 
get_cbor_object(conditional_static_cast(static_cast(current) & 0x1Fu), tag_handler); case 0xB8: // map (one-byte uint8_t for n follows) { std::uint8_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(static_cast(len), tag_handler); } case 0xB9: // map (two-byte uint16_t for n follow) { std::uint16_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(static_cast(len), tag_handler); } case 0xBA: // map (four-byte uint32_t for n follow) { std::uint32_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(conditional_static_cast(len), tag_handler); } case 0xBB: // map (eight-byte uint64_t for n follow) { std::uint64_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(conditional_static_cast(len), tag_handler); } case 0xBF: // map (indefinite length) + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return 
get_cbor_object(static_cast(-1), tag_handler); case 0xC6: // tagged item @@ -9886,7 +10129,10 @@ class binary_reader return parse_cbor_internal(true, tag_handler); } get(); - return get_cbor_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_cbor_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } default: // LCOV_EXCL_LINE @@ -9896,16 +10142,20 @@ class binary_reader } case 0xF4: // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 0xF5: // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 0xF6: // null + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 0xF9: // Half-Precision Float (two-byte IEEE 754) { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); const auto byte1_raw = get(); if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number"))) { @@ -9947,6 +10197,7 @@ class binary_reader return std::ldexp(mant + 1024, exp - 25); } }(); + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->number_float((half & 0x8000u) != 0 ? 
static_cast(-val) : static_cast(val), ""); @@ -9955,12 +10206,14 @@ class binary_reader case 0xFA: // Single-Precision Float (four-byte IEEE 754) { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); } case 0xFB: // Double-Precision Float (eight-byte IEEE 754) { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); } @@ -10204,6 +10457,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_array(); } @@ -10229,7 +10483,10 @@ class binary_reader for (std::size_t i = 0; i < len; ++i) { get(); - if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -10245,7 +10502,10 @@ class binary_reader { while (get() != 0xFF) { - if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -10259,6 +10519,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_object(); } @@ -10406,6 +10667,7 @@ class binary_reader case 0x7D: case 0x7E: case 0x7F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_unsigned(static_cast(current)); // fixmap @@ -10425,6 +10687,7 @@ class binary_reader case 0x8D: case 0x8E: case 0x8F: + 
detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_msgpack_object(conditional_static_cast(static_cast(current) & 0x0Fu)); // fixarray @@ -10444,6 +10707,7 @@ class binary_reader case 0x9D: case 0x9E: case 0x9F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_msgpack_array(conditional_static_cast(static_cast(current) & 0x0Fu)); // fixstr @@ -10484,16 +10748,22 @@ class binary_reader case 0xDB: // str 32 { string_t s; - return get_msgpack_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } case 0xC0: // nil + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 0xC2: // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 0xC3: // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 0xC4: // bin 8 @@ -10509,90 +10779,107 @@ class binary_reader case 0xD8: // fixext 16 { binary_t b; - return get_msgpack_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } case 0xCA: // float 32 { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); } case 0xCB: // float 64 { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); } case 0xCC: // uint 8 { std::uint8_t 
number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCD: // uint 16 { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCE: // uint 32 { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCF: // uint 64 { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xD0: // int 8 { std::int8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD1: // int 16 { std::int16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD2: // int 32 { std::int32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD3: // int 64 { std::int64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xDC: // array 16 { std::uint16_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && 
get_msgpack_array(static_cast(len)); } case 0xDD: // array 32 { std::uint32_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_array(conditional_static_cast(len)); } case 0xDE: // map 16 { std::uint16_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast(len)); } case 0xDF: // map 32 { std::uint32_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_object(conditional_static_cast(len)); } @@ -10629,6 +10916,7 @@ class binary_reader case 0xFD: case 0xFE: case 0xFF: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_integer(static_cast(current)); default: // anything else @@ -10859,6 +11147,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_array(); } @@ -10877,7 +11166,10 @@ class binary_reader for (std::size_t i = 0; i < len; ++i) { get(); - if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -10889,6 +11181,7 @@ class binary_reader key.clear(); } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_object(); } @@ -11251,7 +11544,6 @@ class binary_reader return true; } } - string_t key = "_ArraySize_"; if (JSON_HEDLEY_UNLIKELY(!sax->start_object(3) || !sax->key(key) || !sax->start_array(dim.size()))) { @@ -11312,7 +11604,6 @@ class binary_reader bool is_ndarray = false; get_ignore_noop(); - if (current == '$') { result.second = 
get(); // must not ignore 'N', because 'N' maybe the type @@ -11341,7 +11632,9 @@ class binary_reader exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr)); } + // detail::sax_call_next_token_start_pos(sax, chars_read - 1); const bool is_error = get_ubjson_size_value(result.first, is_ndarray); + //detail::sax_call_next_token_end_pos(sax, chars_read); if (input_format == input_format_t::bjdata && is_ndarray) { if (inside_ndarray) @@ -11356,7 +11649,9 @@ class binary_reader if (current == '#') { + // detail::sax_call_next_token_start_pos(sax, chars_read - 1); const bool is_error = get_ubjson_size_value(result.first, is_ndarray); + // detail::sax_call_next_token_end_pos(sax, chars_read); if (input_format == input_format_t::bjdata && is_ndarray) { return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read, @@ -11365,6 +11660,7 @@ class binary_reader return is_error; } + // detail::sax_call_next_token_start_end_pos(sax, chars_read - 2, chars_read - 1); return true; } @@ -11380,40 +11676,47 @@ class binary_reader return unexpect_eof(input_format, "value"); case 'T': // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 'F': // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 'Z': // null + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 'U': { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } case 'i': { std::int8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } case 'I': { std::int16_t number{}; + 
detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } case 'l': { std::int32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } - case 'L': { std::int64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } @@ -11424,6 +11727,7 @@ class binary_reader break; } std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } @@ -11434,6 +11738,7 @@ class binary_reader break; } std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } @@ -11444,11 +11749,13 @@ class binary_reader break; } std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } case 'h': { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); if (input_format != input_format_t::bjdata) { break; @@ -11494,25 +11801,30 @@ class binary_reader return std::ldexp(mant + 1024, exp - 25); } }(); + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->number_float((half & 0x8000u) != 0 ? 
static_cast(-val) - : static_cast(val), ""); + : static_cast(val), + ""); } case 'd': { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_float(static_cast(number), ""); } case 'D': { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_float(static_cast(number), ""); } case 'H': { + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_high_precision_number(); } @@ -11530,19 +11842,25 @@ class binary_reader exception_message(input_format, concat("byte after 'C' must be in range 0x00..0x7F; last byte: 0x", last_token), "char"), nullptr)); } string_t s(1, static_cast(current)); + detail::sax_call_next_token_start_end_pos(sax, chars_read - 2, chars_read); return sax->string(s); } case 'S': // string { string_t s; - return get_ubjson_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } case '[': // array + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_array(); case '{': // object + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_object(); default: // anything else @@ -11557,6 +11875,7 @@ class binary_reader */ bool get_ubjson_array() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); std::pair size_and_type; if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) { @@ -11581,6 +11900,7 @@ class binary_reader exception_message(input_format, "invalid byte: 0x" + last_token, "type"), nullptr)); } + detail::sax_call_next_token_end_pos(sax, chars_read); string_t type = it->second; // sax->string() takes a reference if 
(JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->string(type))) { @@ -11592,6 +11912,7 @@ class binary_reader size_and_type.second = 'U'; } + detail::sax_call_next_token_start_end_pos(sax, chars_read); key = "_ArrayData_"; if (JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->start_array(size_and_type.first) )) { @@ -11600,17 +11921,20 @@ class binary_reader for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) { return false; } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return (sax->end_array() && sax->end_object()); } if (size_and_type.first != npos) { + detail::sax_call_next_token_end_pos(sax, chars_read); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first))) { return false; @@ -11622,6 +11946,7 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) { return false; @@ -11633,6 +11958,7 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) { return false; @@ -11642,6 +11968,7 @@ class binary_reader } else { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast(-1)))) { return false; @@ -11657,6 +11984,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_array(); } @@ -11665,6 +11993,7 @@ class binary_reader */ bool get_ubjson_object() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); std::pair size_and_type; if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) { @@ -11682,6 +12011,7 @@ class binary_reader string_t key; if 
(size_and_type.first != npos) { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first))) { return false; @@ -11691,7 +12021,10 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -11706,7 +12039,10 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -11720,6 +12056,7 @@ class binary_reader } else { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast(-1)))) { return false; @@ -11727,7 +12064,10 @@ class binary_reader while (current != '}') { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key, false); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -11740,6 +12080,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_object(); } @@ -11748,6 +12089,7 @@ class binary_reader bool get_ubjson_high_precision_number() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); // get size of following number string std::size_t size{}; bool no_ndarray = true; @@ 
-11768,6 +12110,7 @@ class binary_reader } number_vector.push_back(static_cast(current)); } + detail::sax_call_next_token_end_pos(sax, chars_read); // parse number string using ia_type = decltype(detail::input_adapter(number_vector)); @@ -11965,6 +12308,7 @@ class binary_reader { if (JSON_HEDLEY_UNLIKELY(current == std::char_traits::eof())) { + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->parse_error(chars_read, "", parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), nullptr)); } @@ -12176,8 +12520,6 @@ class parser , m_lexer(std::move(adapter), skip_comments) , allow_exceptions(allow_exceptions_) { - // read first token - get_token(); } /*! @@ -12198,7 +12540,7 @@ class parser sax_parse_internal(&sdp); // in strict mode, input must be completely read - if (strict && (get_token() != token_type::end_of_input)) + if (strict && (get_token(&sdp) != token_type::end_of_input)) { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12226,7 +12568,7 @@ class parser sax_parse_internal(&sdp); // in strict mode, input must be completely read - if (strict && (get_token() != token_type::end_of_input)) + if (strict && (get_token(&sdp) != token_type::end_of_input)) { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12264,7 +12606,7 @@ class parser const bool result = sax_parse_internal(sax); // strict mode: next byte must be EOF - if (result && strict && (get_token() != token_type::end_of_input)) + if (result && strict && (get_token(sax) != token_type::end_of_input)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12285,6 +12627,8 @@ class parser // value to avoid a goto (see comment where set to true) bool skip_to_state_evaluation = false; + // read first token + get_token(sax); while (true) { if (!skip_to_state_evaluation) @@ -12300,7 +12644,7 @@ class parser } // closing } -> we are done - if (get_token() == token_type::end_object) + 
if (get_token(sax) == token_type::end_object) { if (JSON_HEDLEY_UNLIKELY(!sax->end_object())) { @@ -12322,7 +12666,7 @@ class parser } // parse separator (:) - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12333,7 +12677,7 @@ class parser states.push_back(false); // parse values - get_token(); + get_token(sax); continue; } @@ -12345,7 +12689,7 @@ class parser } // closing ] -> we are done - if (get_token() == token_type::end_array) + if (get_token(sax) == token_type::end_array) { if (JSON_HEDLEY_UNLIKELY(!sax->end_array())) { @@ -12472,10 +12816,10 @@ class parser if (states.back()) // array { // comma -> next value - if (get_token() == token_type::value_separator) + if (get_token(sax) == token_type::value_separator) { // parse a new value - get_token(); + get_token(sax); continue; } @@ -12505,10 +12849,10 @@ class parser // states.back() is false -> object // comma -> next value - if (get_token() == token_type::value_separator) + if (get_token(sax) == token_type::value_separator) { // parse key - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::value_string)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12521,7 +12865,7 @@ class parser } // parse separator (:) - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12529,7 +12873,7 @@ class parser } // parse values - get_token(); + get_token(sax); continue; } @@ -12557,10 +12901,19 @@ class parser } } - /// get next token from lexer - token_type get_token() + /// get next token from lexer and pass position info to sax (if it is accepted) + template + token_type 
get_token(SAX* sax) { - return last_token = m_lexer.scan(); + if (!m_lexer.scan_start()) + { + last_token = token_type::parse_error; + return token_type::parse_error; + } + detail::sax_call_next_token_start_pos(sax, m_lexer); + last_token = m_lexer.scan_end(); + detail::sax_call_next_token_end_pos(sax, m_lexer); + return last_token; } std::string exception_message(const token_type expected, const std::string& context) diff --git a/tests/src/unit-sax-parser-extended.cpp b/tests/src/unit-sax-parser-extended.cpp new file mode 100644 index 000000000..e81107fba --- /dev/null +++ b/tests/src/unit-sax-parser-extended.cpp @@ -0,0 +1,1821 @@ +/* + __ _____ _____ _____ + __| | __| | | | JSON for Modern C++ (test suite) +| | |__ | | | | | | version 3.10.2 +|_____|_____|_____|_|___| https://github.com/nlohmann/json + +Licensed under the MIT License . +SPDX-License-Identifier: MIT +Copyright (c) 2013-2019 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include + +#include "doctest_compatibility.h" + +#include + +// ignore warning to replace if with if constexpr since there are +// several in the file, just deactivate it here to prevent repeated ifdefs +DOCTEST_MSVC_SUPPRESS_WARNING(4127) + +//option to make this test more verbose +#define verbose_out \ + if (0) \ + std::cout + +//prototype to make -Wmissing-prototypes happy +struct element_info_t; +bool operator<(const element_info_t& l, const element_info_t& r); +std::ostream& operator<<(std::ostream& out, const element_info_t& v); +std::ostream& operator<<(std::ostream& out, const std::set& v); +template +void fill_expected_sax_pos_json(SAX& sax, + const FN& element, + const nlohmann::json& part, + std::size_t& offset); +template +void fill_expected_sax_pos_bson(SAX& sax, + const FN& element, + const nlohmann::json& part, + std::size_t& offset); +template +void fill_expected_sax_pos_cbor(SAX& sax, const FN& element, const nlohmann::json& part); +template +void fill_expected_sax_pos_msgpack(SAX& sax, const FN& element, const nlohmann::json& part); +template +void fill_expected_sax_pos_ubjson(SAX& sax, const FN& element, const nlohmann::json& part); +void test_json(nlohmann::json& json); + +//implementation + +struct element_info_t +{ + element_info_t(std::size_t idx, std::size_t first, std::size_t last) + : index{idx} + , start{first} + , end{last} + {} + std::size_t index = 0; + std::size_t start = 0; + std::size_t end = 0; +}; +bool operator<(const element_info_t& l, const element_info_t& r) +{ + return std::tie(l.index, l.start, l.end) < std::tie(r.index, r.start, r.end); +} +std::ostream& operator<<(std::ostream& out, const element_info_t& v) +{ + return (out << v.index << ':' << v.start << '-' << v.end + << '(' << v.end - v.start << ')'); +} +std::ostream& operator<<(std::ostream& out, const std::set& v) +{ + out << "{"; + if (v.size() > 32) + { + out << ">32 elements..."; + } + else + { + for (const auto& e : 
v) + { + out << ' ' << e; + } + } + out << " }"; + return out; +} + +template +struct Sax +{ + static constexpr bool has_callback = WithBytePos || (WithLexPos && !LexCallImpossible); + using json = nlohmann::json; + + enum class last_call_t + { + element, + start_pos, + end_pos + }; + + last_call_t last_call = last_call_t::element; + + element_info_t se{0, 0, 0}; + + std::set pos_null{}; + std::set pos_boolean{}; + std::set pos_number_integer{}; + std::set pos_number_unsigned{}; + std::set pos_number_float{}; + std::set pos_string{}; + std::set pos_binary{}; + std::set pos_start_object{}; + std::set pos_key{}; + std::set pos_end_object{}; + std::set pos_start_array{}; + std::set pos_end_array{}; + + void check_call(std::set& set, const char* fnname) + { + INFO("function " << fnname << ": " << se + << " (options = " << set << ')'); + if (has_callback) + { + CHECK(set.count(se) == 1); + CHECK(last_call == last_call_t::end_pos); + } + last_call = last_call_t::element; + set.erase(se); + ++se.index; + } + void check_start(std::size_t pos) + { + INFO("set start pos " << pos); + CHECK((last_call == last_call_t::element || last_call == last_call_t::end_pos)); + se.start = pos; + last_call = last_call_t::start_pos; + } + void check_end(std::size_t pos) + { + INFO("set end pos " << pos); + CHECK(last_call == last_call_t::start_pos); + se.end = pos; + last_call = last_call_t::end_pos; + } + + template + typename std::enable_if::type next_token_start(std::size_t pos) + { + check_start(pos); + CHECK((!WithLexPos || LexCallImpossible)); + } + + template < bool Act = WithLexPos > + typename std::enable_if::type next_token_start(const nlohmann::position_t& p) + { + check_start(p.chars_read_total); + CHECK(WithLexPos); + } + + template + typename std::enable_if::type next_token_end(std::size_t pos) + { + check_end(pos); + CHECK((!WithLexPos || LexCallImpossible)); + } + + template < bool Act = WithLexPos > + typename std::enable_if::type next_token_end(const nlohmann::position_t& 
p) + { + check_end(p.chars_read_total); + CHECK(WithLexPos); + } + + bool null() + { + check_call(pos_null, __func__); + verbose_out << "got null\n"; + return true; + } + bool boolean(bool val) + { + check_call(pos_boolean, __func__); + verbose_out << "got boolean " << val << "\n"; + return true; + } + bool number_integer(json::number_integer_t val) + { + check_call(pos_number_integer, __func__); + verbose_out << "got number_integer " << val << "\n"; + return true; + } + bool number_unsigned(json::number_unsigned_t val) + { + check_call(pos_number_unsigned, __func__); + verbose_out << "got number_unsigned " << val << "\n"; + return true; + } + bool number_float(json::number_float_t val, const std::string& str) + { + check_call(pos_number_float, __func__); + verbose_out << "got float " << val << " (" << str << ")" + << "\n"; + return true; + } + bool string(std::string& val) + { + check_call(pos_string, __func__); + verbose_out << "got string " << val << "\n"; + return true; + } + bool binary(std::vector& val) + { + check_call(pos_binary, __func__); + verbose_out << "got binary: size " << val.size() << "\n"; + return true; + } + bool start_object(std::size_t val) + { + check_call(pos_start_object, __func__); + verbose_out << "got start_object: size " << val << "\n"; + return true; + } + bool key(std::string& val) + { + check_call(pos_key, __func__); + verbose_out << "got key " << val << "\n"; + return true; + } + bool end_object() + { + check_call(pos_end_object, __func__); + verbose_out << "got end_object\n"; + return true; + } + bool start_array(std::size_t val) + { + check_call(pos_start_array, __func__); + verbose_out << "got start_array: size " << val << "\n"; + return true; + } + bool end_array() + { + check_call(pos_end_array, __func__); + verbose_out << "got end_array\n"; + return true; + } + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const json::exception& /*unused*/) // NOLINT(readability-convert-member-functions-to-static) + { 
+ std::cout << "got parse_error\n"; + CHECK(false); // should not happen + return false; + } + void check_all_pos_found() + { + INFO("check all null were found (elements left: " << pos_null << ')'); + CHECK(pos_null.empty()); + INFO("check all boolean were found (elements left: " << pos_boolean << ')'); + CHECK(pos_boolean.empty()); + INFO("check all number_integer were found (elements left: " << pos_number_integer << ')'); + CHECK(pos_number_integer.empty()); + INFO("check all number_unsigned were found (elements left: " << pos_number_unsigned << ')'); + CHECK(pos_number_unsigned.empty()); + INFO("check all number_float were found (elements left: " << pos_number_float << ')'); + CHECK(pos_number_float.empty()); + INFO("check all string were found (elements left: " << pos_string << ')'); + CHECK(pos_string.empty()); + INFO("check all binary were found (elements left: " << pos_binary << ')'); + CHECK(pos_binary.empty()); + INFO("check all start_object were found (elements left: " << pos_start_object << ')'); + CHECK(pos_start_object.empty()); + INFO("check all key were found (elements left: " << pos_key << ')'); + CHECK(pos_key.empty()); + INFO("check all end_object were found (elements left: " << pos_end_object << ')'); + CHECK(pos_end_object.empty()); + INFO("check all start_array were found (elements left: " << pos_start_array << ')'); + CHECK(pos_start_array.empty()); + INFO("check all end_array were found (elements left: " << pos_end_array << ')'); + CHECK(pos_end_array.empty()); + } +}; + +template +struct Opt +{ + static constexpr bool WithBytePos = WithBytePosV; + static constexpr bool WithLexPos = WithLexPosV; +}; + +using OptNone = Opt; +using OptLex = Opt; +using OptPos = Opt; +using OptBoth = Opt; + +//test basic functionality +TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) +{ + const bool with_pos = T::WithBytePos; + const bool with_lex = T::WithLexPos; + + INFO("WithBytePos " << with_pos << ", WithLexPos " << with_lex); + 
//element count 0 1 2 3 4 5 6 7 8 9 10 + //index 10s place 0 1 2 3 4 5 + //index 1s place 012345678901234567890123456789012345678901234567890123 + const std::string str = R"({ "array" : [14294967296,-1,true,4.2,null,"str" ] })"; + std::size_t elem_idx = 0; + std::size_t char_idx = 0; + const auto element = [&](std::size_t bytes) + { + const auto start = char_idx; + char_idx += bytes; + return element_info_t{elem_idx++, start, char_idx}; + }; + const auto skip = [&](std::size_t bytes) + { + char_idx += bytes; + }; + SECTION("json") + { + std::string reconstructed; + const auto elementFromStr = [&](const std::string & s) + { + reconstructed += s; + return element(s.size()); + }; + const auto skipFromStr = [&](const std::string & s) + { + reconstructed += s; + skip(s.size()); + }; + Sax sax; + sax.pos_start_object.emplace(elementFromStr("{")); + skipFromStr(" "); + sax.pos_key.emplace(elementFromStr(R"("array")")); + skipFromStr(" : "); + sax.pos_start_array.emplace(elementFromStr("[")); + sax.pos_number_unsigned.emplace(elementFromStr("14294967296")); + skipFromStr(","); + sax.pos_number_integer.emplace(elementFromStr("-1")); + skipFromStr(","); + sax.pos_boolean.emplace(elementFromStr("true")); + skipFromStr(","); + sax.pos_number_float.emplace(elementFromStr("4.2")); + skipFromStr(","); + sax.pos_null.emplace(elementFromStr("null")); + skipFromStr(","); + sax.pos_string.emplace(elementFromStr(R"("str")")); + skipFromStr(" "); + sax.pos_end_array.emplace(elementFromStr("]")); + skipFromStr(" "); + sax.pos_end_object.emplace(elementFromStr("}")); + CHECK(nlohmann::json::sax_parse(str, &sax, nlohmann::json::input_format_t::json)); + if (with_pos || with_lex) + { + sax.check_all_pos_found(); + } + CHECK(char_idx == str.size()); + CHECK(str == reconstructed); + } + SECTION("bson") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_bson(j); + Sax sax; + sax.pos_start_object.emplace(element(4)); //4 bytes size + skip(1); //one byte type 
array + sax.pos_key.emplace(element(6)); //6 key (array\0) + sax.pos_start_array.emplace(element(4)); //4 bytes size + skip(3); //one byte type + key 0\0 + sax.pos_number_integer.emplace(element(8)); //8 bytes int64 + skip(3); //one byte type + key 1\0 + sax.pos_number_integer.emplace(element(4)); //4 bytes int32 + skip(3); //one byte type + key 2\0 + sax.pos_boolean.emplace(element(1)); //1 byte bool + skip(3); //one byte type + key 3\0 + sax.pos_number_float.emplace(element(8)); //8 bytes double + skip(3); //one byte type + key 4\0 + sax.pos_null.emplace(element((0))); //0 bytes + skip(3); //one byte type + key 4\0 + sax.pos_string.emplace(element(8)); //4 bytes size + (str\0) + sax.pos_end_array.emplace(element(1)); //1 byte \0 end of array + sax.pos_end_object.emplace(element(1)); //1 byte \0 end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::bson)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } + SECTION("cbor") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_cbor(j); + Sax sax; + sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_key.emplace(element(6)); //1 byte type + 5 bytes string (array) (size implicit) + sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_number_unsigned.emplace(element(9)); //1 byte type + 8 bytes uint64 + sax.pos_number_integer.emplace(element(1)); //1 byte type + 0 bytes int -> implicit value since small + sax.pos_boolean.emplace(element(1)); //1 byte type + 0 byte bool (value in type) + sax.pos_number_float.emplace(element(9)); //1 byte type + 8 bytes double + sax.pos_null.emplace(element((1))); //1 byte type + 0 bytes + sax.pos_string.emplace(element(4)); //1 byte type + 3 bytes string (str) (size implicit) + sax.pos_end_array.emplace(element(0)); //0 byte end of array + sax.pos_end_object.emplace(element(0)); //0 
byte end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::cbor)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } + SECTION("msgpack") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_msgpack(j); + Sax sax; + sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size + sax.pos_key.emplace(element(6)); //1 byte type + 5 bytes string (array) (size implicit) + sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_number_unsigned.emplace(element(9)); //1 byte type + 8 bytes uint64 + sax.pos_number_integer.emplace(element(1)); //1 byte type + 0 bytes int -> implicit value since small + sax.pos_boolean.emplace(element(1)); //1 byte type + 0 byte bool (value in type) + sax.pos_number_float.emplace(element(9)); //1 byte type + 8 bytes double + sax.pos_null.emplace(element((1))); //1 byte type + 0 bytes + sax.pos_string.emplace(element(4)); //1 byte type + 3 bytes string (str) (size implicit) + sax.pos_end_array.emplace(element(0)); //0 byte end of array + sax.pos_end_object.emplace(element(0)); //0 byte end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::msgpack)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } + SECTION("ubjson") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_ubjson(j); + Sax sax; + sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size + sax.pos_key.emplace(element(7)); //1 byte type + 6 bytes string (array\0) + sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_number_integer.emplace(element(9)); //1 byte type + 8 bytes uint64 + sax.pos_number_integer.emplace(element(2)); //1 byte type + 1 bytes int8 + sax.pos_boolean.emplace(element(1)); //1 byte type + 0 byte bool (value in type) + 
sax.pos_number_float.emplace(element(9)); //1 byte type + 8 bytes double + sax.pos_null.emplace(element((1))); //1 byte type + 0 bytes + sax.pos_string.emplace(element(6)); //1 type + 1 type of len + 1 len +3 string (str) + sax.pos_end_array.emplace(element(1)); //1 byte type + 0 byte end of array + sax.pos_end_object.emplace(element(1)); //1 byte type + 0 byte end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::ubjson)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } + SECTION("bjdata") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_bjdata(j); + Sax sax; + sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size + sax.pos_key.emplace(element(7)); //1 byte type + 6 bytes string (array\0) + sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_number_integer.emplace(element(9)); //1 byte type + 8 bytes uint64 + sax.pos_number_integer.emplace(element(2)); //1 byte type + 1 bytes int8 + sax.pos_boolean.emplace(element(1)); //1 byte type + 0 byte bool (value in type) + sax.pos_number_float.emplace(element(9)); //1 byte type + 8 bytes double + sax.pos_null.emplace(element((1))); //1 byte type + 0 bytes + sax.pos_string.emplace(element(6)); //1 type + 1 type of len + 1 len +3 string (str) + sax.pos_end_array.emplace(element(1)); //1 byte type + 0 byte end of array + sax.pos_end_object.emplace(element(1)); //1 byte type + 0 byte end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::bjdata)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } +} + +//cover more advanced cases (e.g. 
msgpack fixint) (but only use one templated version) +template +void fill_expected_sax_pos_json(SAX& sax, + const FN& element, + const nlohmann::json& part, + std::size_t& offset) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + sax.pos_null.emplace(element(4)); //null + } + break; + case nlohmann::json::value_t::object: + { + sax.pos_start_object.emplace(element(1)); // { + for (const auto& el : part.items()) + { + sax.pos_key.emplace(element(el.key().size() + 2)); //'"' + str + '"' + offset += 1; // separator ':' between key and value + fill_expected_sax_pos_json(sax, element, el.value(), offset); + offset += 1; // add , + } + if (!part.empty()) + { + offset -= 1; // remove last , + } + sax.pos_end_object.emplace(element(1)); // } + } + break; + case nlohmann::json::value_t::array: + { + sax.pos_start_array.emplace(element(1)); // [ + for (const auto& el : part.items()) + { + fill_expected_sax_pos_json(sax, element, el.value(), offset); + offset += 1; // add , + } + if (!part.empty()) + { + offset -= 1; // remove last , + } + sax.pos_end_array.emplace(element(1)); // ] + } + break; + case nlohmann::json::value_t::string: + { + const auto val = part.get(); + const std::size_t nbytes = val.size() + 2; //'"' + value + '"' + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + const auto val = part.get(); + if (val) + { + sax.pos_boolean.emplace(element(4)); // true + } + else + { + sax.pos_boolean.emplace(element(5)); // false + } + } + break; + case nlohmann::json::value_t::number_integer: + { + const auto val = part.get(); + const std::size_t nbytes = std::to_string(val).size(); + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + const auto val = part.get(); + const std::size_t nbytes = std::to_string(val).size(); + sax.pos_number_unsigned.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { 
+ const auto val = part.get(); + const std::size_t nbytes = std::to_string(val).size(); + sax.pos_number_float.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::binary: + { + //stored as object with array and subtype + nlohmann::json sub; + sub["bytes"] = nlohmann::json::array(); + for (const auto e : part.get_binary()) + { + sub["bytes"].emplace_back(e); + } + sub["subtype"]; + fill_expected_sax_pos_json(sax, element, sub, offset); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! default"}; + } +} + +template +void fill_expected_sax_pos_bson(SAX& sax, + const FN& element, + const nlohmann::json& part, + std::size_t& offset) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + //type is before the key -> not included + sax.pos_null.emplace(element(0)); + } + break; + case nlohmann::json::value_t::object: + { + sax.pos_start_object.emplace(element(4)); //32 bit size + for (const auto& el : part.items()) + { + offset += 1; // type of item + sax.pos_key.emplace(element(el.key().size() + 1)); // str + terminator + fill_expected_sax_pos_bson(sax, element, el.value(), offset); + } + sax.pos_end_object.emplace(element(1)); // \0 terminator + } + break; + case nlohmann::json::value_t::array: + { + sax.pos_start_array.emplace(element(4)); //32 bit size + std::size_t i = 0; + for (const auto& el : part.items()) + { + offset += 1; // type of item + offset += 1 + std::to_string(i).size(); // dummy key + terminator + fill_expected_sax_pos_bson(sax, element, el.value(), offset); + ++i; + } + sax.pos_end_array.emplace(element(1)); // \0 terminator + } + break; + case nlohmann::json::value_t::string: + { + //type is before the key -> not included + std::size_t nbytes = 4; //size + const auto val = part.get(); + nbytes += val.size() + 1; //value + \0 terminate 
+ sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + //type is before the key -> not included + sax.pos_boolean.emplace(element(1)); //value + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 0; //type is before the key -> not included + const auto val = part.get(); + //for <-24 : -n-1 + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + std::size_t nbytes = 0; //type is before the key -> not included + const auto val = part.get(); + if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + std::size_t nbytes = 0; //type is before the key -> not included + nbytes += 8; //value + sax.pos_number_float.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::binary: + { + std::size_t nbytes = 0; //type is before the key -> not included + nbytes += 4; // length of bin (32 bit) + nbytes += 1; // subtype + nbytes += part.get_binary().size(); + sax.pos_binary.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! 
default"}; + } +} + +template +void fill_expected_sax_pos_cbor(SAX& sax, const FN& element, const nlohmann::json& part) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + sax.pos_null.emplace(element(1)); //type + } + break; + case nlohmann::json::value_t::object: + { + std::size_t nbytes = 1; //type + if (part.size() <= 0x17) + { + //size implicit in type + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 1; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 2; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_start_object.emplace(element(nbytes)); + //key follows same rules as string + for (const auto& el : part.items()) + { + std::size_t nbyteskey = 1; //type + nbyteskey += el.key().size(); + if (el.key().size() <= 0x17) + { + //size implicit in type + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 1; + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 2; + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 4; + } + else + { + nbyteskey += 8; + } + sax.pos_key.emplace(element(nbyteskey)); + fill_expected_sax_pos_cbor(sax, element, el.value()); + } + sax.pos_end_object.emplace(element(0)); + } + break; + case nlohmann::json::value_t::array: + { + std::size_t nbytes = 1; //type + if (part.size() <= 0x17) + { + //size implicit in type + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 1; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 2; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_start_array.emplace(element(nbytes)); + //add elements + for (const auto& elem : part) + { + fill_expected_sax_pos_cbor(sax, element, elem); + } + sax.pos_end_array.emplace(element(0)); + } + break; + case nlohmann::json::value_t::string: + 
{ + std::size_t nbytes = 1; //type + const auto val = part.get(); + nbytes += val.size(); + if (val.size() <= static_cast(0x17)) + { + //size implicit in type + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + sax.pos_boolean.emplace(element(1)); //type + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //for <-24 : -n-1 + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= -24) + { + //value implicit in type + } + else if (-(val + 1) <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (-(val + 1) <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (-(val + 1) <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val <= static_cast(0x17)) + { + //value implicit in type + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_unsigned.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //really depends on the input type + if (val < 0) + { + std::cout << 
"unexpected float <0\n"; + throw std::logic_error{"unexpected float <0"}; + } + if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; //float + } + else + { + nbytes += 8; //double float + } + sax.pos_number_float.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::binary: + { + std::size_t nbytes = 1; //type + const auto& val = part.get_binary(); + nbytes += val.size(); + if (val.size() <= static_cast(0x17)) + { + //size implicit in type + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_binary.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! 
default"}; + } +} + +template +void fill_expected_sax_pos_msgpack(SAX& sax, const FN& element, const nlohmann::json& part) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + sax.pos_null.emplace(element(1)); //type + } + break; + case nlohmann::json::value_t::object: + { + std::size_t nbytes = 1; //type + if (part.size() <= 0x0F) + { + //size implicit in type + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 2; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_start_object.emplace(element(nbytes)); + //key follows same rules as string + for (const auto& el : part.items()) + { + std::size_t nbyteskey = 1; //type + nbyteskey += el.key().size(); + if (el.key().size() <= 0x1F) + { + //size implicit in type + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 1; + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 2; + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 4; + } + else + { + nbyteskey += 8; + } + sax.pos_key.emplace(element(nbyteskey)); + fill_expected_sax_pos_msgpack(sax, element, el.value()); + } + sax.pos_end_object.emplace(element(0)); + } + break; + case nlohmann::json::value_t::array: + { + std::size_t nbytes = 1; //type + if (part.size() <= 0x0F) + { + //size implicit in type + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 2; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_start_array.emplace(element(nbytes)); + //add elements + for (const auto& elem : part) + { + fill_expected_sax_pos_msgpack(sax, element, elem); + } + sax.pos_end_array.emplace(element(0)); + } + break; + case nlohmann::json::value_t::string: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + nbytes += val.size(); + if (val.size() <= static_cast(0x1F)) + { + //size 
implicit in type + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + sax.pos_boolean.emplace(element(1)); //type + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //for <-24 : -n-1 + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= -32) + { + //value implicit in type + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 1; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 2; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val <= static_cast(0x7F)) + { + //value implicit in type + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_unsigned.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //really depends on the input type + if (val < 0) + { + std::cout << "unexpected float <0\n"; + throw std::logic_error{"unexpected float <0"}; + } + if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; //float + } + else 
+ { + nbytes += 8; //double float + } + sax.pos_number_float.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::binary: + { + std::size_t nbytes = 1; //type + const auto& val = part.get_binary(); + nbytes += val.size(); + if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_binary.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! default"}; + } +} + +template +void fill_expected_sax_pos_ubjson(SAX& sax, const FN& element, const nlohmann::json& part) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + sax.pos_null.emplace(element(1)); //type + } + break; + case nlohmann::json::value_t::object: + { + sax.pos_start_object.emplace(element(1)); + //key follows same rules as string + for (const auto& el : part.items()) + { + std::size_t nbyteskey = 1; //type of len + nbyteskey += el.key().size(); + if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 1; // size of len + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 2; // size of len + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 4; // size of len + } + else + { + nbyteskey += 8; // size of len + } + sax.pos_key.emplace(element(nbyteskey)); + fill_expected_sax_pos_ubjson(sax, element, el.value()); + } + sax.pos_end_object.emplace(element(1)); + } + break; + case nlohmann::json::value_t::array: + { + sax.pos_start_array.emplace(element(1)); + //add elements + for (const auto& elem : part) + { + fill_expected_sax_pos_ubjson(sax, element, elem); + } + 
sax.pos_end_array.emplace(element(1)); + } + break; + case nlohmann::json::value_t::string: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + nbytes += val.size(); + nbytes += 1; // type of length + if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + sax.pos_boolean.emplace(element(1)); //type + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //for <-24 : -n-1 + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 1; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 2; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + //supported integer types : + // uint8 + // int8/16/32/64/High precision + // --> only 128-255 are stored as uint + high precision > max int64 + bool use_uint = false; + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val < 128) + { + ++nbytes; + } + else if (val <= 255) + { + use_uint = true; + ++nbytes; + } + else + { + //sorted as signed int! 
+ if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 8; + } + else + { + //High precision + //more complex calculation of size is not done here + //the size includes + // type (high precision) + // type of size of value length + // size of value length + // value as array of chars + //in this case + nbytes = 22; + if (val > std::numeric_limits::max() - 128) + { + //in this test case the value needs one more char + nbytes += 1; + } + if (val > static_cast(std::numeric_limits::max())) + { + use_uint = true; + } + } + } + if (use_uint) + { + sax.pos_number_unsigned.emplace(element(nbytes)); + } + else + { + sax.pos_number_integer.emplace(element(nbytes)); + } + } + break; + case nlohmann::json::value_t::number_float: + { + //everything is serialized as double (type+double value) + sax.pos_number_float.emplace(element(8 + 1)); + } + break; + case nlohmann::json::value_t::binary: + { + // Note, no reader for UBJSON binary types is implemented because they do + auto sub = nlohmann::json::array(); + for (const auto i : part.get_binary()) + { + sub.emplace_back(i); + } + fill_expected_sax_pos_ubjson(sax, element, sub); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! 
default"}; + } +} + +template +void fill_expected_sax_pos_bjdata(SAX& sax, const FN& element, const nlohmann::json& part) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + sax.pos_null.emplace(element(1)); //type + } + break; + case nlohmann::json::value_t::object: + { + sax.pos_start_object.emplace(element(1)); + //key follows same rules as string + for (const auto& el : part.items()) + { + std::size_t nbyteskey = 1; //type of len + nbyteskey += el.key().size(); + if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 1; // size of len + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 2; // size of len + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 4; // size of len + } + else + { + nbyteskey += 8; // size of len + } + sax.pos_key.emplace(element(nbyteskey)); + fill_expected_sax_pos_bjdata(sax, element, el.value()); + } + sax.pos_end_object.emplace(element(1)); + } + break; + case nlohmann::json::value_t::array: + { + sax.pos_start_array.emplace(element(1)); + //add elements + for (const auto& elem : part) + { + fill_expected_sax_pos_bjdata(sax, element, elem); + } + sax.pos_end_array.emplace(element(1)); + } + break; + case nlohmann::json::value_t::string: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + nbytes += val.size(); + nbytes += 1; // type of length + if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + sax.pos_boolean.emplace(element(1)); //type + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val >= 0) + { + 
std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 1; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 2; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + auto* category = &sax.pos_number_unsigned; + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val <= static_cast(std::numeric_limits::max())) + { + //the serializer uses int8 for these values + category = &sax.pos_number_integer; + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + //the serializer uses int6 for these values + category = &sax.pos_number_integer; + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + //the serializer uses int32 for these values + category = &sax.pos_number_integer; + nbytes += 4; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + //the serializer uses int64 for these values + category = &sax.pos_number_integer; + nbytes += 8; + } + else + { + nbytes += 8; + } + category->emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + //everything is serialized as double (type+double value) + sax.pos_number_float.emplace(element(8 + 1)); + } + break; + case nlohmann::json::value_t::binary: + { + // Note, no reader for UBJSON binary types is implemented because they do + auto sub = nlohmann::json::array(); + for (const auto i : part.get_binary()) + { + sub.emplace_back(i); + } + 
fill_expected_sax_pos_ubjson(sax, element, sub); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! default"}; + } +} + +void test_json(nlohmann::json& json) +{ + Sax sax; + std::size_t elem_idx = 0; + std::size_t char_idx = 0; + const auto element = [&](std::size_t bytes) + { + const auto start = char_idx; + char_idx += bytes; + return element_info_t{elem_idx++, start, char_idx}; + }; + SECTION("json") + { + const auto bin = json.dump(); + std::cout << "json has size of " << bin.size() << '\n'; + fill_expected_sax_pos_json(sax, element, json, char_idx); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::json)); + sax.check_all_pos_found(); + } + SECTION("bson") + { + //since bson can't deal with values > int64 max we need to remove some + if (json.contains("uints")) + { + auto& ar = json["uints"]; + const std::uint64_t limit = std::numeric_limits::max(); + while (ar.back() > limit) + { + ar.erase(ar.size() - 1); + } + } + const auto bin = nlohmann::json::to_bson(json); + std::cout << "bson has size of " << bin.size() << '\n'; + fill_expected_sax_pos_bson(sax, element, json, char_idx); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::bson)); + sax.check_all_pos_found(); + } + SECTION("cbor") + { + const auto bin = nlohmann::json::to_cbor(json); + std::cout << "cbor has size of " << bin.size() << '\n'; + fill_expected_sax_pos_cbor(sax, element, json); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::cbor)); + sax.check_all_pos_found(); + } + SECTION("msgpack") + { + const auto bin = nlohmann::json::to_msgpack(json); + std::cout << "msgpack has size of " << bin.size() << '\n'; + 
fill_expected_sax_pos_msgpack(sax, element, json); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::msgpack)); + sax.check_all_pos_found(); + } + SECTION("ubjson") + { + const auto bin = nlohmann::json::to_ubjson(json); + std::cout << "ubjson has size of " << bin.size() << '\n'; + fill_expected_sax_pos_ubjson(sax, element, json); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::ubjson)); + sax.check_all_pos_found(); + } + SECTION("bjdata") + { + const auto bin = nlohmann::json::to_bjdata(json); + std::cout << "bjdata has size of " << bin.size() << '\n'; + fill_expected_sax_pos_bjdata(sax, element, json); + //CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::bjdata)); + sax.check_all_pos_found(); + } +} + +TEST_CASE("extended parser generated (uint)") +{ + std::cout << "extended parser generated (uint) "; + nlohmann::json json; + auto& array = json["uints"]; + for (std::uint64_t i = 0; i < 512; ++i) + { + array.emplace_back(i); + } + //check area around key points + const auto add_area = [&](std::uint64_t mid, std::uint64_t lower, std::uint64_t higher) + { + for (std::uint64_t i = mid - lower; i < mid + higher; ++i) + { + array.emplace_back(i); + } + array.emplace_back(mid + higher); + }; + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max(), 32, 32); + + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max(), 32, 32); + + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max(), 32, 0); + test_json(json); +} +TEST_CASE("extended parser generated (int)") +{ + std::cout << "extended parser generated (int) "; + nlohmann::json json; + auto& 
array = json["ints"]; + for (std::int64_t i = -512; i <= -1; ++i) + { + array.emplace_back(i); + } + //check area around key points + const auto add_area = [&](std::int64_t mid, std::int64_t lower, std::int64_t higher) + { + for (std::int64_t i = mid - lower; i <= mid + higher; ++i) + { + array.emplace_back(i); + } + }; + add_area(std::numeric_limits::min(), 32, 32); + add_area(std::numeric_limits::min(), 32, 32); + add_area(std::numeric_limits::min(), 32, 32); + add_area(std::numeric_limits::min(), 0, 32); + test_json(json); +} +TEST_CASE("extended parser generated (array / bool)") +{ + std::cout << "extended parser generated (array / bool) "; + nlohmann::json json; + auto& array = json["arrays"]; + array = nlohmann::json::array(); + for (std::uint64_t i = 0; i < 512; ++i) + { + auto sub = nlohmann::json::array(); + for (std::uint64_t j = 0; j < i; ++j) + { + sub.emplace_back((j % 2 == 0)); + } + array.emplace_back(std::move(sub)); + } + //add large aray + auto sub = nlohmann::json::array(); + for (std::uint64_t j = 0; j < std::numeric_limits::max() + 1; ++j) + { + sub.emplace_back((j % 2 == 0)); + } + array.emplace_back(std::move(sub)); + test_json(json); +} +TEST_CASE("extended parser generated (object / null)") +{ + std::cout << "extended parser generated (object / null) "; + nlohmann::json json; + auto& array = json["objects"]; + array = nlohmann::json::array(); + for (std::uint64_t i = 0; i < 512; ++i) + { + auto sub = nlohmann::json::object(); + for (std::uint64_t j = 0; j < i; ++j) + { + sub[std::string(static_cast(j), 'k')]; + + } + array.emplace_back(std::move(sub)); + } + //add object with long keý + auto sub = nlohmann::json::object(); + sub[std::string(std::numeric_limits::max() + 1, 'k')]; + array.emplace_back(std::move(sub)); + test_json(json); +} +TEST_CASE("extended parser generated (string)") +{ + std::cout << "extended parser generated (string) "; + nlohmann::json json; + auto& array = json["strings"]; + array = nlohmann::json::array(); + for 
(std::uint64_t i = 0; i < 512; ++i) + { + array.emplace_back(std::string(static_cast(i), '|')); + } + array.emplace_back(std::string(std::numeric_limits::max() + 1, '|')); + //test with large strings (e.g. requiring uint64 as size type) are not done + test_json(json); +} +TEST_CASE("extended parser generated (binary)") +{ + std::cout << "extended parser generated (binary) "; + nlohmann::json json; + auto& array = json["binary"]; + array = nlohmann::json::array(); + for (std::uint64_t i = 0; i < 512; ++i) + { + array.emplace_back(nlohmann::json::binary(std::vector(static_cast(i), 255))); + } + //add large binary + std::vector data(std::numeric_limits::max() + 1, 255); + array.emplace_back(nlohmann::json::binary(std::move(data))); + test_json(json); +} diff --git a/tests/src/unit-sax-parser-store-source-location.cpp b/tests/src/unit-sax-parser-store-source-location.cpp new file mode 100644 index 000000000..68805f40c --- /dev/null +++ b/tests/src/unit-sax-parser-store-source-location.cpp @@ -0,0 +1,340 @@ +/* + __ _____ _____ _____ + __| | __| | | | JSON for Modern C++ (test suite) +| | |__ | | | | | | version 3.10.2 +|_____|_____|_____|_|___| https://github.com/nlohmann/json + +Licensed under the MIT License . +SPDX-License-Identifier: MIT +Copyright (c) 2013-2019 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include +#include + +#include "doctest_compatibility.h" + +#include + +//prototype to make -Wmissing-prototypes happy +std::ostream& operator<<(std::ostream& out, const nlohmann::position_t& p); + +//test json parser with detailed line / col information as metadata + +struct token_start_stop +{ + nlohmann::position_t start{}; + nlohmann::position_t stop{}; +}; + +std::ostream& operator<<(std::ostream& out, const nlohmann::position_t& p) +{ + out << p.chars_read_total << '(' << p.lines_read << ':' << p.chars_read_current_line << ')'; + return out; +} + +using json_with_token_start_stop = + nlohmann::basic_json < + std::map, + std::vector, + std::string, + bool, + std::int64_t, + std::uint64_t, + double, + std::allocator, + nlohmann::adl_serializer, + std::vector, + token_start_stop >; + +//adapted from detail::json_sax_dom_parser +class sax_with_token_start_stop_metadata +{ + public: + using json = json_with_token_start_stop; + using number_integer_t = typename json::number_integer_t; + using number_unsigned_t = typename json::number_unsigned_t; + using number_float_t = typename json::number_float_t; + using string_t = typename json::string_t; + using binary_t = typename json::binary_t; + + /*! 
+ @param[in,out] r reference to a JSON value that is manipulated while + parsing + @param[in] allow_exceptions_ whether parse errors yield exceptions + */ + explicit sax_with_token_start_stop_metadata(json& r, const bool allow_exceptions_ = true) + : root(r) + , object_element{nullptr} // NOLINT(modernize-use-default-member-init) + , errored{false} // NOLINT(modernize-use-default-member-init) + , allow_exceptions(allow_exceptions_) + {} + + sax_with_token_start_stop_metadata(sax_with_token_start_stop_metadata&&) = delete; + sax_with_token_start_stop_metadata(const sax_with_token_start_stop_metadata&) = delete; + sax_with_token_start_stop_metadata& operator=(sax_with_token_start_stop_metadata&&) = delete; + sax_with_token_start_stop_metadata& operator=(const sax_with_token_start_stop_metadata&) = delete; + + ~sax_with_token_start_stop_metadata() = default; + + void next_token_start(const nlohmann::position_t& p) + { + start_stop.start = p; + } + + void next_token_end(const nlohmann::position_t& p) + { + start_stop.stop = p; + } + + bool null() + { + handle_value(nullptr); + return true; + } + + bool boolean(bool val) + { + handle_value(val); + return true; + } + + bool number_integer(number_integer_t val) + { + handle_value(val); + return true; + } + + bool number_unsigned(number_unsigned_t val) + { + handle_value(val); + return true; + } + + bool number_float(number_float_t val, const string_t& /*unused*/) + { + handle_value(val); + return true; + } + + bool string(string_t& val) + { + handle_value(val); + return true; + } + + bool binary(binary_t& val) + { + handle_value(std::move(val)); + return true; + } + + bool start_object(std::size_t len) + { + ref_stack.push_back(handle_value(json::value_t::object)); + ref_stack.back()->start = start_stop.start; + + if (len != static_cast(-1) && len > ref_stack.back()->max_size()) + { + throw nlohmann::detail::out_of_range::create(408, nlohmann::detail::concat("excessive object size: ", std::to_string(len)), 
ref_stack.back()); + } + + return true; + } + + bool key(string_t& val) + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_object()); + + // add null at given key and store the reference for later + object_element = &(*ref_stack.back())[val]; + return true; + } + + bool end_object() + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_object()); + + ref_stack.back()->stop = start_stop.stop; + ref_stack.pop_back(); + return true; + } + + bool start_array(std::size_t len) + { + ref_stack.push_back(handle_value(json::value_t::array)); + ref_stack.back()->start = start_stop.start; + + if (len != static_cast(-1) && len > ref_stack.back()->max_size()) + { + throw nlohmann::detail::out_of_range::create(408, nlohmann::detail::concat("excessive array size: ", std::to_string(len)), ref_stack.back()); + } + + return true; + } + + bool end_array() + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_array()); + + ref_stack.back()->stop = start_stop.stop; + ref_stack.pop_back(); + return true; + } + + template + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const Exception& ex) + { + errored = true; + static_cast(ex); + if (allow_exceptions) + { + throw ex; + } + return false; + } + + bool is_errored() const + { + return errored; + } + + private: + /*! + @invariant If the ref stack is empty, then the passed value will be the new + root. 
+ @invariant If the ref stack contains a value, then it is an array or an + object to which we can add elements + */ + template + json* + handle_value(Value&& v) + { + if (ref_stack.empty()) + { + root = json(std::forward(v)); + root.start = start_stop.start; + root.stop = start_stop.stop; + return &root; + } + + assert(ref_stack.back()->is_array() || ref_stack.back()->is_object()); + + if (ref_stack.back()->is_array()) + { + auto& array_element = ref_stack.back()->emplace_back(std::forward(v)); + array_element.start = start_stop.start; + array_element.stop = start_stop.stop; + return &array_element; + } + + assert(ref_stack.back()->is_object()); + assert(object_element); + *object_element = json(std::forward(v)); + object_element->start = start_stop.start; + object_element->stop = start_stop.stop; + return object_element; + } + + /// the parsed JSON value + json& root; + /// stack to model hierarchy of values + std::vector ref_stack{}; + /// helper to hold the reference for the next object element + json* object_element = nullptr; + /// whether a syntax error occurred + bool errored = false; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; + /// start / stop information for the current token + token_start_stop start_stop {}; +}; + +TEST_CASE("parse-json-with-position-info") +{ + const std::string str = + /*line 0*/ R"({)" + "\n" + /*line 1*/ R"( "array" : [)" + "\n" + /*line 2*/ R"( 14294967296,)" + "\n" + /*line 3*/ R"( -1,)" + "\n" + /*line 4*/ R"( true,)" + "\n" + /*line 5*/ R"( 4.2,)" + "\n" + /*line 6*/ R"( null,)" + "\n" + /*line 7*/ R"( "str")" + "\n" + /*line 8*/ R"( ])" + "\n" + /*line 9*/ R"(})"; + json_with_token_start_stop j; + sax_with_token_start_stop_metadata sax{j}; + CHECK(nlohmann::json::sax_parse(str, &sax, nlohmann::json::input_format_t::json)); + CHECK(j.start.lines_read == 0); + CHECK(j.start.chars_read_current_line == 0); + + CHECK(j["array"].start.lines_read == 1); + 
CHECK(j["array"].start.chars_read_current_line == 12); + + CHECK(j["array"][0].start.lines_read == 2); + CHECK(j["array"][0].start.chars_read_current_line == 4); + CHECK(j["array"][0].stop.lines_read == 2); + CHECK(j["array"][0].stop.chars_read_current_line == 15); + + CHECK(j["array"][1].start.lines_read == 3); + CHECK(j["array"][1].start.chars_read_current_line == 4); + CHECK(j["array"][1].stop.lines_read == 3); + CHECK(j["array"][1].stop.chars_read_current_line == 6); + + CHECK(j["array"][2].start.lines_read == 4); + CHECK(j["array"][2].start.chars_read_current_line == 4); + CHECK(j["array"][2].stop.lines_read == 4); + CHECK(j["array"][2].stop.chars_read_current_line == 8); + + CHECK(j["array"][3].start.lines_read == 5); + CHECK(j["array"][3].start.chars_read_current_line == 4); + CHECK(j["array"][3].stop.lines_read == 5); + CHECK(j["array"][3].stop.chars_read_current_line == 7); + + CHECK(j["array"][4].start.lines_read == 6); //starts directly after last value.... + CHECK(j["array"][4].start.chars_read_current_line == 4); + CHECK(j["array"][4].stop.lines_read == 6); + CHECK(j["array"][4].stop.chars_read_current_line == 8); + + CHECK(j["array"][5].start.lines_read == 7); + CHECK(j["array"][5].start.chars_read_current_line == 4); + CHECK(j["array"][5].stop.lines_read == 7); + CHECK(j["array"][5].stop.chars_read_current_line == 9); + + CHECK(j["array"].stop.lines_read == 8); + CHECK(j["array"].stop.chars_read_current_line == 3); + + CHECK(j.stop.lines_read == 9); + CHECK(j.stop.chars_read_current_line == 1); +}