From bd9cdcd99cc71710c4befe55d6d6560762e9ae7a Mon Sep 17 00:00:00 2001 From: Raphael Grimm Date: Sat, 20 Nov 2021 14:22:56 +0100 Subject: [PATCH 1/7] Extend sax parser to optionally accept position information for parsed tokens --- .../nlohmann/detail/input/binary_reader.hpp | 230 ++- include/nlohmann/detail/input/lexer.hpp | 14 +- include/nlohmann/detail/input/parser.hpp | 45 +- include/nlohmann/detail/meta/is_sax.hpp | 145 ++ single_include/nlohmann/json.hpp | 434 +++- tests/src/unit-sax-parser-extended.cpp | 1830 +++++++++++++++++ .../unit-sax-parser-store-source-location.cpp | 337 +++ 7 files changed, 2933 insertions(+), 102 deletions(-) create mode 100644 tests/src/unit-sax-parser-extended.cpp create mode 100644 tests/src/unit-sax-parser-store-source-location.cpp diff --git a/include/nlohmann/detail/input/binary_reader.hpp b/include/nlohmann/detail/input/binary_reader.hpp index 832c36ddf..ea9032c3c 100644 --- a/include/nlohmann/detail/input/binary_reader.hpp +++ b/include/nlohmann/detail/input/binary_reader.hpp @@ -168,8 +168,9 @@ class binary_reader bool parse_bson_internal() { std::int32_t document_size{}; + detail::sax_call_next_token_start_pos(sax, chars_read); get_number(input_format_t::bson, document_size); - + detail::sax_call_next_token_end_pos(sax, chars_read); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast(-1)))) { return false; @@ -180,6 +181,7 @@ class binary_reader return false; } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_object(); } @@ -277,6 +279,7 @@ class binary_reader case 0x01: // double { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(number)); return get_number(input_format_t::bson, number) && sax->number_float(static_cast(number), ""); } @@ -284,7 +287,10 @@ class binary_reader { std::int32_t len{}; string_t value; - return get_number(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value); + 
detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_number(input_format_t::bson, len) && get_bson_string(len, value); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(value); } case 0x03: // object @@ -301,28 +307,35 @@ class binary_reader { std::int32_t len{}; binary_t value; - return get_number(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_number(input_format_t::bson, len) && get_bson_binary(len, value); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(value); } case 0x08: // boolean { + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + 1); return sax->boolean(get() != 0); } case 0x0A: // null { + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->null(); } case 0x10: // int32 { std::int32_t value{}; - return get_number(input_format_t::bson, value) && sax->number_integer(value); + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(value)); + return get_number(input_format_t::bson, value) && sax->number_integer(value); } case 0x12: // int64 { std::int64_t value{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(value)); return get_number(input_format_t::bson, value) && sax->number_integer(value); } @@ -361,14 +374,22 @@ class binary_reader } const std::size_t element_type_parse_position = chars_read; + if (!is_array) + { + detail::sax_call_next_token_start_pos(sax, chars_read); + } if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key))) { return false; } - if (!is_array && !sax->key(key)) + if (!is_array) { - return false; + detail::sax_call_next_token_end_pos(sax, chars_read); + if (!sax->key(key)) + { + return false; + } } if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position))) @@ -390,6 
+411,7 @@ class binary_reader bool parse_bson_array() { std::int32_t document_size{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(std::int32_t)); get_number(input_format_t::bson, document_size); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast(-1)))) @@ -402,6 +424,7 @@ class binary_reader return false; } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_array(); } @@ -451,29 +474,34 @@ class binary_reader case 0x15: case 0x16: case 0x17: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_unsigned(static_cast(current)); case 0x18: // Unsigned integer (one-byte uint8_t follows) { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x19: // Unsigned integer (two-byte uint16_t follows) { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x1A: // Unsigned integer (four-byte uint32_t follows) { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x1B: // Unsigned integer (eight-byte uint64_t follows) { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } @@ -502,29 +530,34 @@ class binary_reader case 0x35: case 0x36: case 0x37: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_integer(static_cast(0x20 - 1 - current)); case 0x38: // Negative integer (one-byte uint8_t follows) { std::uint8_t 
number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x39: // Negative integer -1-n (two-byte uint16_t follows) { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x3A: // Negative integer -1-n (four-byte uint32_t follows) { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows) { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - static_cast(number)); } @@ -561,7 +594,10 @@ class binary_reader case 0x5F: // Binary data (indefinite length) { binary_t b; - return get_cbor_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } // UTF-8 string (0x00..0x17 bytes follow) @@ -596,7 +632,10 @@ class binary_reader case 0x7F: // UTF-8 string (indefinite length) { string_t s; - return get_cbor_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } // array (0x00..0x17 data items follow) @@ -624,35 +663,51 @@ class binary_reader case 0x95: case 0x96: case 0x97: + detail::sax_call_next_token_start_end_pos(sax, chars_read 
- 1, chars_read); return get_cbor_array( conditional_static_cast(static_cast(current) & 0x1Fu), tag_handler); case 0x98: // array (one-byte uint8_t for n follows) { std::uint8_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(static_cast(len), tag_handler); } case 0x99: // array (two-byte uint16_t for n follow) { std::uint16_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(static_cast(len), tag_handler); } case 0x9A: // array (four-byte uint32_t for n follow) { std::uint32_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(conditional_static_cast(len), tag_handler); } case 0x9B: // array (eight-byte uint64_t for n follow) { std::uint64_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(conditional_static_cast(len), tag_handler); } case 0x9F: // array (indefinite length) + { + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); 
return get_cbor_array(static_cast(-1), tag_handler); + } // map (0x00..0x17 pairs of data items follow) case 0xA0: @@ -679,33 +734,47 @@ class binary_reader case 0xB5: case 0xB6: case 0xB7: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_object(conditional_static_cast(static_cast(current) & 0x1Fu), tag_handler); case 0xB8: // map (one-byte uint8_t for n follows) { std::uint8_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(static_cast(len), tag_handler); } case 0xB9: // map (two-byte uint16_t for n follow) { std::uint16_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(static_cast(len), tag_handler); } case 0xBA: // map (four-byte uint32_t for n follow) { std::uint32_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(conditional_static_cast(len), tag_handler); } case 0xBB: // map (eight-byte uint64_t for n follow) { std::uint64_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + 
detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(conditional_static_cast(len), tag_handler); } case 0xBF: // map (indefinite length) + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_object(static_cast(-1), tag_handler); case 0xC6: // tagged item @@ -810,7 +879,10 @@ class binary_reader return parse_cbor_internal(true, tag_handler); } get(); - return get_cbor_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_cbor_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } default: // LCOV_EXCL_LINE @@ -820,16 +892,20 @@ class binary_reader } case 0xF4: // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 0xF5: // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 0xF6: // null + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 0xF9: // Half-Precision Float (two-byte IEEE 754) { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); const auto byte1_raw = get(); if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number"))) { @@ -871,6 +947,7 @@ class binary_reader return std::ldexp(mant + 1024, exp - 25); } }(); + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->number_float((half & 0x8000u) != 0 ? 
static_cast(-val) : static_cast(val), ""); @@ -879,12 +956,14 @@ class binary_reader case 0xFA: // Single-Precision Float (four-byte IEEE 754) { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); } case 0xFB: // Double-Precision Float (eight-byte IEEE 754) { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); } @@ -1128,6 +1207,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_array(); } @@ -1153,7 +1233,10 @@ class binary_reader for (std::size_t i = 0; i < len; ++i) { get(); - if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -1169,7 +1252,10 @@ class binary_reader { while (get() != 0xFF) { - if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -1183,6 +1269,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_object(); } @@ -1330,6 +1417,7 @@ class binary_reader case 0x7D: case 0x7E: case 0x7F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_unsigned(static_cast(current)); // fixmap @@ -1349,6 +1437,7 @@ class binary_reader case 0x8D: case 0x8E: case 0x8F: + 
detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_msgpack_object(conditional_static_cast(static_cast(current) & 0x0Fu)); // fixarray @@ -1368,6 +1457,7 @@ class binary_reader case 0x9D: case 0x9E: case 0x9F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_msgpack_array(conditional_static_cast(static_cast(current) & 0x0Fu)); // fixstr @@ -1408,16 +1498,22 @@ class binary_reader case 0xDB: // str 32 { string_t s; - return get_msgpack_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } case 0xC0: // nil + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 0xC2: // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 0xC3: // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 0xC4: // bin 8 @@ -1433,90 +1529,107 @@ class binary_reader case 0xD8: // fixext 16 { binary_t b; - return get_msgpack_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } case 0xCA: // float 32 { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); } case 0xCB: // float 64 { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); } case 0xCC: // uint 8 { std::uint8_t number{}; + 
detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCD: // uint 16 { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCE: // uint 32 { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCF: // uint 64 { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xD0: // int 8 { std::int8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD1: // int 16 { std::int16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD2: // int 32 { std::int32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD3: // int 64 { std::int64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xDC: // array 16 { std::uint16_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast(len)); } 
case 0xDD: // array 32 { std::uint32_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_array(conditional_static_cast(len)); } case 0xDE: // map 16 { std::uint16_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast(len)); } case 0xDF: // map 32 { std::uint32_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_object(conditional_static_cast(len)); } @@ -1553,6 +1666,7 @@ class binary_reader case 0xFD: case 0xFE: case 0xFF: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_integer(static_cast(current)); default: // anything else @@ -1783,6 +1897,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_array(); } @@ -1801,7 +1916,10 @@ class binary_reader for (std::size_t i = 0; i < len; ++i) { get(); - if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -1813,6 +1931,7 @@ class binary_reader key.clear(); } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_object(); } @@ -2175,7 +2294,6 @@ class binary_reader return true; } } - string_t key = "_ArraySize_"; if (JSON_HEDLEY_UNLIKELY(!sax->start_object(3) || !sax->key(key) || !sax->start_array(dim.size()))) { @@ -2236,7 +2354,6 @@ class binary_reader bool is_ndarray = false; get_ignore_noop(); - if (current == '$') { result.second = get(); // must not ignore 'N', because 'N' maybe the 
type @@ -2265,7 +2382,9 @@ class binary_reader exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr)); } + // detail::sax_call_next_token_start_pos(sax, chars_read - 1); const bool is_error = get_ubjson_size_value(result.first, is_ndarray); + //detail::sax_call_next_token_end_pos(sax, chars_read); if (input_format == input_format_t::bjdata && is_ndarray) { if (inside_ndarray) @@ -2280,7 +2399,9 @@ class binary_reader if (current == '#') { + // detail::sax_call_next_token_start_pos(sax, chars_read - 1); const bool is_error = get_ubjson_size_value(result.first, is_ndarray); + // detail::sax_call_next_token_end_pos(sax, chars_read); if (input_format == input_format_t::bjdata && is_ndarray) { return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read, @@ -2289,6 +2410,7 @@ class binary_reader return is_error; } + // detail::sax_call_next_token_start_end_pos(sax, chars_read - 2, chars_read - 1); return true; } @@ -2304,40 +2426,47 @@ class binary_reader return unexpect_eof(input_format, "value"); case 'T': // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 'F': // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 'Z': // null + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 'U': { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } case 'i': { std::int8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } case 'I': { std::int16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + 
sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } case 'l': { std::int32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } - case 'L': { std::int64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } @@ -2348,6 +2477,7 @@ class binary_reader break; } std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } @@ -2358,6 +2488,7 @@ class binary_reader break; } std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } @@ -2368,11 +2499,13 @@ class binary_reader break; } std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } case 'h': { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); if (input_format != input_format_t::bjdata) { break; @@ -2418,25 +2551,30 @@ class binary_reader return std::ldexp(mant + 1024, exp - 25); } }(); + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->number_float((half & 0x8000u) != 0 ? 
static_cast(-val) - : static_cast(val), ""); + : static_cast(val), + ""); } case 'd': { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_float(static_cast(number), ""); } case 'D': { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_float(static_cast(number), ""); } case 'H': { + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_high_precision_number(); } @@ -2454,19 +2592,25 @@ class binary_reader exception_message(input_format, concat("byte after 'C' must be in range 0x00..0x7F; last byte: 0x", last_token), "char"), nullptr)); } string_t s(1, static_cast(current)); + detail::sax_call_next_token_start_end_pos(sax, chars_read - 2, chars_read); return sax->string(s); } case 'S': // string { string_t s; - return get_ubjson_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } case '[': // array + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_array(); case '{': // object + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_object(); default: // anything else @@ -2481,6 +2625,7 @@ class binary_reader */ bool get_ubjson_array() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); std::pair size_and_type; if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) { @@ -2505,6 +2650,7 @@ class binary_reader exception_message(input_format, "invalid byte: 0x" + last_token, "type"), nullptr)); } + detail::sax_call_next_token_end_pos(sax, chars_read); string_t type = it->second; // sax->string() takes a reference if 
(JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->string(type))) { @@ -2516,6 +2662,7 @@ class binary_reader size_and_type.second = 'U'; } + detail::sax_call_next_token_start_end_pos(sax, chars_read); key = "_ArrayData_"; if (JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->start_array(size_and_type.first) )) { @@ -2524,17 +2671,20 @@ class binary_reader for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) { return false; } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return (sax->end_array() && sax->end_object()); } if (size_and_type.first != npos) { + detail::sax_call_next_token_end_pos(sax, chars_read); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first))) { return false; @@ -2546,6 +2696,7 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) { return false; @@ -2557,6 +2708,7 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) { return false; @@ -2566,6 +2718,7 @@ class binary_reader } else { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast(-1)))) { return false; @@ -2581,6 +2734,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_array(); } @@ -2589,6 +2743,7 @@ class binary_reader */ bool get_ubjson_object() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); std::pair size_and_type; if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) { @@ -2606,6 +2761,7 @@ class binary_reader string_t key; if 
(size_and_type.first != npos) { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first))) { return false; @@ -2615,7 +2771,10 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -2630,7 +2789,10 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -2644,6 +2806,7 @@ class binary_reader } else { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast(-1)))) { return false; @@ -2651,7 +2814,10 @@ class binary_reader while (current != '}') { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key, false); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -2664,6 +2830,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_object(); } @@ -2672,6 +2839,7 @@ class binary_reader bool get_ubjson_high_precision_number() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); // get size of following number string std::size_t size{}; bool no_ndarray = true; @@ -2692,6 +2860,7 @@ 
class binary_reader } number_vector.push_back(static_cast(current)); } + detail::sax_call_next_token_end_pos(sax, chars_read); // parse number string using ia_type = decltype(detail::input_adapter(number_vector)); @@ -2889,6 +3058,7 @@ class binary_reader { if (JSON_HEDLEY_UNLIKELY(current == std::char_traits::eof())) { + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->parse_error(chars_read, "", parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), nullptr)); } diff --git a/include/nlohmann/detail/input/lexer.hpp b/include/nlohmann/detail/input/lexer.hpp index 72e995108..174092b0d 100644 --- a/include/nlohmann/detail/input/lexer.hpp +++ b/include/nlohmann/detail/input/lexer.hpp @@ -1506,13 +1506,13 @@ scan_number_done: while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); } - token_type scan() + bool scan_start() { // initially, skip the BOM if (position.chars_read_total == 0 && !skip_bom()) { error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; - return token_type::parse_error; + return false; } // read next character and ignore whitespace @@ -1523,13 +1523,17 @@ scan_number_done: { if (!scan_comment()) { - return token_type::parse_error; + return false; } // skip following whitespace skip_whitespace(); } + return true; + } + token_type scan_end() + { switch (current) { // structural characters @@ -1593,6 +1597,10 @@ scan_number_done: return token_type::parse_error; } } + token_type scan() + { + return !scan_start() ? token_type::parse_error : scan_end(); + } private: /// input adapter diff --git a/include/nlohmann/detail/input/parser.hpp b/include/nlohmann/detail/input/parser.hpp index 8acbd4fca..af20e3167 100644 --- a/include/nlohmann/detail/input/parser.hpp +++ b/include/nlohmann/detail/input/parser.hpp @@ -76,8 +76,6 @@ class parser , m_lexer(std::move(adapter), skip_comments) , allow_exceptions(allow_exceptions_) { - // read first token - get_token(); } /*! 
@@ -98,7 +96,7 @@ class parser sax_parse_internal(&sdp); // in strict mode, input must be completely read - if (strict && (get_token() != token_type::end_of_input)) + if (strict && (get_token(&sdp) != token_type::end_of_input)) { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -126,7 +124,7 @@ class parser sax_parse_internal(&sdp); // in strict mode, input must be completely read - if (strict && (get_token() != token_type::end_of_input)) + if (strict && (get_token(&sdp) != token_type::end_of_input)) { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -164,7 +162,7 @@ class parser const bool result = sax_parse_internal(sax); // strict mode: next byte must be EOF - if (result && strict && (get_token() != token_type::end_of_input)) + if (result && strict && (get_token(sax) != token_type::end_of_input)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -185,6 +183,8 @@ class parser // value to avoid a goto (see comment where set to true) bool skip_to_state_evaluation = false; + // read first token + get_token(sax); while (true) { if (!skip_to_state_evaluation) @@ -200,7 +200,7 @@ class parser } // closing } -> we are done - if (get_token() == token_type::end_object) + if (get_token(sax) == token_type::end_object) { if (JSON_HEDLEY_UNLIKELY(!sax->end_object())) { @@ -222,7 +222,7 @@ class parser } // parse separator (:) - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -233,7 +233,7 @@ class parser states.push_back(false); // parse values - get_token(); + get_token(sax); continue; } @@ -245,7 +245,7 @@ class parser } // closing ] -> we are done - if (get_token() == token_type::end_array) + if (get_token(sax) == token_type::end_array) { if (JSON_HEDLEY_UNLIKELY(!sax->end_array())) { @@ -372,10 +372,10 @@ class parser if 
(states.back()) // array { // comma -> next value - if (get_token() == token_type::value_separator) + if (get_token(sax) == token_type::value_separator) { // parse a new value - get_token(); + get_token(sax); continue; } @@ -405,10 +405,10 @@ class parser // states.back() is false -> object // comma -> next value - if (get_token() == token_type::value_separator) + if (get_token(sax) == token_type::value_separator) { // parse key - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::value_string)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -421,7 +421,7 @@ class parser } // parse separator (:) - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -429,7 +429,7 @@ class parser } // parse values - get_token(); + get_token(sax); continue; } @@ -457,10 +457,19 @@ class parser } } - /// get next token from lexer - token_type get_token() + /// get next token from lexer and pass position info to sax (if it is accepted) + template + token_type get_token(SAX* sax) { - return last_token = m_lexer.scan(); + if (!m_lexer.scan_start()) + { + last_token = token_type::parse_error; + return token_type::parse_error; + } + detail::sax_call_next_token_start_pos(sax, m_lexer); + last_token = m_lexer.scan_end(); + detail::sax_call_next_token_end_pos(sax, m_lexer); + return last_token; } std::string exception_message(const token_type expected, const std::string& context) diff --git a/include/nlohmann/detail/meta/is_sax.hpp b/include/nlohmann/detail/meta/is_sax.hpp index 215008963..fd0586434 100644 --- a/include/nlohmann/detail/meta/is_sax.hpp +++ b/include/nlohmann/detail/meta/is_sax.hpp @@ -19,6 +19,151 @@ NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail { +// helper struct to call sax->next_token_start +//(we 
want this functionality as a type to ease passing it as template argument) +struct sax_call_next_token_start_pos_direct +{ + template + static auto call(SAX* sax, Ts&& ...ts) + -> decltype(sax->next_token_start(std::forward(ts)...)) + { + sax->next_token_start(std::forward(ts)...); + } +}; +// helper struct to call sax->next_token_end +// (we want this functionality as a type to ease passing it as template argument) +struct sax_call_next_token_end_pos_direct +{ + template + static auto call(SAX* sax, Ts&& ...ts) + -> decltype(sax->next_token_end(std::forward(ts)...)) + { + sax->next_token_end(std::forward(ts)...); + } +}; + +// dispatch the calls to next_token_start next_token_end +// and drop the calls if the sax parser does not support these methods. +// +// DirectCaller can be set to one of sax_call_next_token_{start,end}_pos_direct to +// determine which method is called +template +struct sax_call_function +{ + // is the parameter a lexer or a position + static constexpr bool no_lexer = std::is_same::value; + + template + using call_t = decltype(DirectCaller::call(std::declval(), std::declval()...)); + + //the sax parser supports calls with a position + static constexpr bool detected_call_with_pos = + is_detected_exact::value; + + //the sax parser supports calls with a lexer + static constexpr bool detected_call_with_lex = + !no_lexer && + is_detected_exact::value; + + //there either has to be a version accepting a lexer or a position + static constexpr bool valid = detected_call_with_pos || detected_call_with_lex; + + //called with pos and pos is method supported -> pass data on + template + static typename std::enable_if < + sax_call_function::valid && + std::is_same::value && + sax_call_function::detected_call_with_pos + >::type + call(SaxT* sax, std::size_t pos) + { + DirectCaller::call(sax, pos); + } + + //the sax parser has no version of the method -> drop call + template + static typename std::enable_if < + std::is_same::value && + 
!sax_call_function::valid + >::type + call(SaxT* /*unused*/, const LexOrPos& /*unused*/) {} + + //called with lex and lex method is supported -> pass data on + template + static typename std::enable_if < + sax_call_function::valid && + std::is_same::value && + !sax_call_function::no_lexer && + sax_call_function::detected_call_with_lex + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex); + } + + // called with lex and only pos method is supported -> call with position from lexer + // the start pos in the lexer is last read char -> chars_read_total-1 + template + static typename std::enable_if < + sax_call_function::valid && + std::is_same::value && + !sax_call_function::no_lexer && + !sax_call_function::detected_call_with_lex && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position().chars_read_total - 1); + } + + // called with lex and only pos method is supported -> call with position from lexer + // the one past end pos in the lexer is the current index -> chars_read_total + template + static typename std::enable_if < + sax_call_function::valid && + std::is_same::value && + !sax_call_function::no_lexer && + !sax_call_function::detected_call_with_lex && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position().chars_read_total); + } +}; + +//set the element start pos of a sax parser by calling any version of sax->next_token_start (if available) +template +void sax_call_next_token_start_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + using call_t = sax_call_function; + call_t::call(sax, lexOrPos); +} +//set the element end pos of a sax parser by calling any version of sax->next_token_end (if available) +template +void sax_call_next_token_end_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + using call_t = sax_call_function; + call_t::call(sax, lexOrPos); +} +//set the element start end pos of a sax parser by calling 
any version of +// sax->next_token_start and sax->next_token_end (if available) +template +void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos1& lexOrPos1, const LexOrPos2& lexOrPos2) +{ + sax_call_next_token_start_pos(sax, lexOrPos1); + sax_call_next_token_end_pos(sax, lexOrPos2); +} +//set the element start end pos of a sax parser by calling any version of +// sax->next_token_start and sax->next_token_end (if available) +template +void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + sax_call_next_token_start_pos(sax, lexOrPos); + sax_call_next_token_end_pos(sax, lexOrPos); +} + + template using null_function_t = decltype(std::declval().null()); diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index ddd3131dc..f4852d864 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -8811,13 +8811,13 @@ scan_number_done: while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); } - token_type scan() + bool scan_start() { // initially, skip the BOM if (position.chars_read_total == 0 && !skip_bom()) { error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; - return token_type::parse_error; + return false; } // read next character and ignore whitespace @@ -8828,13 +8828,17 @@ scan_number_done: { if (!scan_comment()) { - return token_type::parse_error; + return false; } // skip following whitespace skip_whitespace(); } + return true; + } + token_type scan_end() + { switch (current) { // structural characters @@ -8898,6 +8902,10 @@ scan_number_done: return token_type::parse_error; } } + token_type scan() + { + return !scan_start() ? 
token_type::parse_error : scan_end(); + } private: /// input adapter @@ -8963,6 +8971,151 @@ NLOHMANN_JSON_NAMESPACE_END NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail { +// helper struct to call sax->next_token_start +//(we want this functionality as a type to ease passing it as template argument) +struct sax_call_next_token_start_pos_direct +{ + template + static auto call(SAX* sax, Ts&& ...ts) + -> decltype(sax->next_token_start(std::forward(ts)...)) + { + sax->next_token_start(std::forward(ts)...); + } +}; +// helper struct to call sax->next_token_end +// (we want this functionality as a type to ease passing it as template argument) +struct sax_call_next_token_end_pos_direct +{ + template + static auto call(SAX* sax, Ts&& ...ts) + -> decltype(sax->next_token_end(std::forward(ts)...)) + { + sax->next_token_end(std::forward(ts)...); + } +}; + +// dispatch the calls to next_token_start next_token_end +// and drop the calls if the sax parser does not support these methods. +// +// DirectCaller can be set to one of sax_call_next_token_{start,end}_pos_direct to +// determine which method is called +template +struct sax_call_function +{ + // is the parameter a lexer or a position + static constexpr bool no_lexer = std::is_same::value; + + template + using call_t = decltype(DirectCaller::call(std::declval(), std::declval()...)); + + //the sax parser supports calls with a position + static constexpr bool detected_call_with_pos = + is_detected_exact::value; + + //the sax parser supports calls with a lexer + static constexpr bool detected_call_with_lex = + !no_lexer && + is_detected_exact::value; + + //there either has to be a version accepting a lexer or a position + static constexpr bool valid = detected_call_with_pos || detected_call_with_lex; + + //called with pos and pos is method supported -> pass data on + template + static typename std::enable_if < + sax_call_function::valid && + std::is_same::value && + sax_call_function::detected_call_with_pos + >::type + 
call(SaxT* sax, std::size_t pos) + { + DirectCaller::call(sax, pos); + } + + //the sax parser has no version of the method -> drop call + template + static typename std::enable_if < + std::is_same::value && + !sax_call_function::valid + >::type + call(SaxT* /*unused*/, const LexOrPos& /*unused*/) {} + + //called with lex and lex method is supported -> pass data on + template + static typename std::enable_if < + sax_call_function::valid && + std::is_same::value && + !sax_call_function::no_lexer && + sax_call_function::detected_call_with_lex + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex); + } + + // called with lex and only pos method is supported -> call with position from lexer + // the start pos in the lexer is last read char -> chars_read_total-1 + template + static typename std::enable_if < + sax_call_function::valid && + std::is_same::value && + !sax_call_function::no_lexer && + !sax_call_function::detected_call_with_lex && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position().chars_read_total - 1); + } + + // called with lex and only pos method is supported -> call with position from lexer + // the one past end pos in the lexer is the current index -> chars_read_total + template + static typename std::enable_if < + sax_call_function::valid && + std::is_same::value && + !sax_call_function::no_lexer && + !sax_call_function::detected_call_with_lex && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position().chars_read_total); + } +}; + +//set the element start pos of a sax parser by calling any version of sax->next_token_start (if available) +template +void sax_call_next_token_start_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + using call_t = sax_call_function; + call_t::call(sax, lexOrPos); +} +//set the element end pos of a sax parser by calling any version of sax->next_token_end (if available) 
+template +void sax_call_next_token_end_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + using call_t = sax_call_function; + call_t::call(sax, lexOrPos); +} +//set the element start end pos of a sax parser by calling any version of +// sax->next_token_start and sax->next_token_end (if available) +template +void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos1& lexOrPos1, const LexOrPos2& lexOrPos2) +{ + sax_call_next_token_start_pos(sax, lexOrPos1); + sax_call_next_token_end_pos(sax, lexOrPos2); +} +//set the element start end pos of a sax parser by calling any version of +// sax->next_token_start and sax->next_token_end (if available) +template +void sax_call_next_token_start_end_pos(SAX* sax, const LexOrPos& lexOrPos) +{ + sax_call_next_token_start_pos(sax, lexOrPos); + sax_call_next_token_end_pos(sax, lexOrPos); +} + + template using null_function_t = decltype(std::declval().null()); @@ -9246,8 +9399,9 @@ class binary_reader bool parse_bson_internal() { std::int32_t document_size{}; + detail::sax_call_next_token_start_pos(sax, chars_read); get_number(input_format_t::bson, document_size); - + detail::sax_call_next_token_end_pos(sax, chars_read); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast(-1)))) { return false; @@ -9258,6 +9412,7 @@ class binary_reader return false; } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_object(); } @@ -9355,6 +9510,7 @@ class binary_reader case 0x01: // double { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(number)); return get_number(input_format_t::bson, number) && sax->number_float(static_cast(number), ""); } @@ -9362,7 +9518,10 @@ class binary_reader { std::int32_t len{}; string_t value; - return get_number(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_number(input_format_t::bson, len) && 
get_bson_string(len, value); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(value); } case 0x03: // object @@ -9379,28 +9538,35 @@ class binary_reader { std::int32_t len{}; binary_t value; - return get_number(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_number(input_format_t::bson, len) && get_bson_binary(len, value); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(value); } case 0x08: // boolean { + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + 1); return sax->boolean(get() != 0); } case 0x0A: // null { + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->null(); } case 0x10: // int32 { std::int32_t value{}; - return get_number(input_format_t::bson, value) && sax->number_integer(value); + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(value)); + return get_number(input_format_t::bson, value) && sax->number_integer(value); } case 0x12: // int64 { std::int64_t value{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(value)); return get_number(input_format_t::bson, value) && sax->number_integer(value); } @@ -9439,14 +9605,22 @@ class binary_reader } const std::size_t element_type_parse_position = chars_read; + if (!is_array) + { + detail::sax_call_next_token_start_pos(sax, chars_read); + } if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key))) { return false; } - if (!is_array && !sax->key(key)) + if (!is_array) { - return false; + detail::sax_call_next_token_end_pos(sax, chars_read); + if (!sax->key(key)) + { + return false; + } } if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position))) @@ -9468,6 +9642,7 @@ class binary_reader bool parse_bson_array() { std::int32_t document_size{}; + 
detail::sax_call_next_token_start_end_pos(sax, chars_read, chars_read + sizeof(std::int32_t)); get_number(input_format_t::bson, document_size); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast(-1)))) @@ -9480,6 +9655,7 @@ class binary_reader return false; } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_array(); } @@ -9529,29 +9705,34 @@ class binary_reader case 0x15: case 0x16: case 0x17: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_unsigned(static_cast(current)); case 0x18: // Unsigned integer (one-byte uint8_t follows) { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x19: // Unsigned integer (two-byte uint16_t follows) { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x1A: // Unsigned integer (four-byte uint32_t follows) { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } case 0x1B: // Unsigned integer (eight-byte uint64_t follows) { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_unsigned(number); } @@ -9580,29 +9761,34 @@ class binary_reader case 0x35: case 0x36: case 0x37: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_integer(static_cast(0x20 - 1 - current)); case 0x38: // Negative integer (one-byte uint8_t follows) { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, 
chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x39: // Negative integer -1-n (two-byte uint16_t follows) { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x3A: // Negative integer -1-n (four-byte uint32_t follows) { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - number); } case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows) { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast(-1) - static_cast(number)); } @@ -9639,7 +9825,10 @@ class binary_reader case 0x5F: // Binary data (indefinite length) { binary_t b; - return get_cbor_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } // UTF-8 string (0x00..0x17 bytes follow) @@ -9674,7 +9863,10 @@ class binary_reader case 0x7F: // UTF-8 string (indefinite length) { string_t s; - return get_cbor_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } // array (0x00..0x17 data items follow) @@ -9702,35 +9894,51 @@ class binary_reader case 0x95: case 0x96: case 0x97: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_array( 
conditional_static_cast(static_cast(current) & 0x1Fu), tag_handler); case 0x98: // array (one-byte uint8_t for n follows) { std::uint8_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(static_cast(len), tag_handler); } case 0x99: // array (two-byte uint16_t for n follow) { std::uint16_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(static_cast(len), tag_handler); } case 0x9A: // array (four-byte uint32_t for n follow) { std::uint32_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(conditional_static_cast(len), tag_handler); } case 0x9B: // array (eight-byte uint64_t for n follow) { std::uint64_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_array(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_array(conditional_static_cast(len), tag_handler); } case 0x9F: // array (indefinite length) + { + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_array(static_cast(-1), 
tag_handler); + } // map (0x00..0x17 pairs of data items follow) case 0xA0: @@ -9757,33 +9965,47 @@ class binary_reader case 0xB5: case 0xB6: case 0xB7: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_object(conditional_static_cast(static_cast(current) & 0x1Fu), tag_handler); case 0xB8: // map (one-byte uint8_t for n follows) { std::uint8_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(static_cast(len), tag_handler); } case 0xB9: // map (two-byte uint16_t for n follow) { std::uint16_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(static_cast(len), tag_handler); } case 0xBA: // map (four-byte uint32_t for n follow) { std::uint32_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && get_cbor_object(conditional_static_cast(len), tag_handler); } case 0xBB: // map (eight-byte uint64_t for n follow) { std::uint64_t len{}; - return get_number(input_format_t::cbor, len) && get_cbor_object(conditional_static_cast(len), tag_handler); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_number(input_format_t::cbor, len); + detail::sax_call_next_token_end_pos(sax, chars_read); 
+ return result_get && get_cbor_object(conditional_static_cast(len), tag_handler); } case 0xBF: // map (indefinite length) + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_cbor_object(static_cast(-1), tag_handler); case 0xC6: // tagged item @@ -9888,7 +10110,10 @@ class binary_reader return parse_cbor_internal(true, tag_handler); } get(); - return get_cbor_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read); + const bool result_get = get_cbor_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } default: // LCOV_EXCL_LINE @@ -9898,16 +10123,20 @@ class binary_reader } case 0xF4: // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 0xF5: // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 0xF6: // null + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 0xF9: // Half-Precision Float (two-byte IEEE 754) { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); const auto byte1_raw = get(); if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number"))) { @@ -9949,6 +10178,7 @@ class binary_reader return std::ldexp(mant + 1024, exp - 25); } }(); + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->number_float((half & 0x8000u) != 0 ? 
static_cast(-val) : static_cast(val), ""); @@ -9957,12 +10187,14 @@ class binary_reader case 0xFA: // Single-Precision Float (four-byte IEEE 754) { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); } case 0xFB: // Double-Precision Float (eight-byte IEEE 754) { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::cbor, number) && sax->number_float(static_cast(number), ""); } @@ -10206,6 +10438,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_array(); } @@ -10231,7 +10464,10 @@ class binary_reader for (std::size_t i = 0; i < len; ++i) { get(); - if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -10247,7 +10483,10 @@ class binary_reader { while (get() != 0xFF) { - if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_cbor_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -10261,6 +10500,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_object(); } @@ -10408,6 +10648,7 @@ class binary_reader case 0x7D: case 0x7E: case 0x7F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_unsigned(static_cast(current)); // fixmap @@ -10427,6 +10668,7 @@ class binary_reader case 0x8D: case 0x8E: case 0x8F: + 
detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_msgpack_object(conditional_static_cast(static_cast(current) & 0x0Fu)); // fixarray @@ -10446,6 +10688,7 @@ class binary_reader case 0x9D: case 0x9E: case 0x9F: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return get_msgpack_array(conditional_static_cast(static_cast(current) & 0x0Fu)); // fixstr @@ -10486,16 +10729,22 @@ class binary_reader case 0xDB: // str 32 { string_t s; - return get_msgpack_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } case 0xC0: // nil + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 0xC2: // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 0xC3: // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 0xC4: // bin 8 @@ -10511,90 +10760,107 @@ class binary_reader case 0xD8: // fixext 16 { binary_t b; - return get_msgpack_binary(b) && sax->binary(b); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_binary(b); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->binary(b); } case 0xCA: // float 32 { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); } case 0xCB: // float 64 { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast(number), ""); } case 0xCC: // uint 8 { std::uint8_t 
number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCD: // uint 16 { std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCE: // uint 32 { std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xCF: // uint 64 { std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number); } case 0xD0: // int 8 { std::int8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD1: // int 16 { std::int16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD2: // int 32 { std::int32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xD3: // int 64 { std::int64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format_t::msgpack, number) && sax->number_integer(number); } case 0xDC: // array 16 { std::uint16_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && 
get_msgpack_array(static_cast(len)); } case 0xDD: // array 32 { std::uint32_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_array(conditional_static_cast(len)); } case 0xDE: // map 16 { std::uint16_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast(len)); } case 0xDF: // map 32 { std::uint32_t len{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(len)); return get_number(input_format_t::msgpack, len) && get_msgpack_object(conditional_static_cast(len)); } @@ -10631,6 +10897,7 @@ class binary_reader case 0xFD: case 0xFE: case 0xFF: + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->number_integer(static_cast(current)); default: // anything else @@ -10861,6 +11128,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_array(); } @@ -10879,7 +11147,10 @@ class binary_reader for (std::size_t i = 0; i < len; ++i) { get(); - if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_msgpack_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -10891,6 +11162,7 @@ class binary_reader key.clear(); } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return sax->end_object(); } @@ -11253,7 +11525,6 @@ class binary_reader return true; } } - string_t key = "_ArraySize_"; if (JSON_HEDLEY_UNLIKELY(!sax->start_object(3) || !sax->key(key) || !sax->start_array(dim.size()))) { @@ -11314,7 +11585,6 @@ class binary_reader bool is_ndarray = false; get_ignore_noop(); - if (current == '$') { result.second = 
get(); // must not ignore 'N', because 'N' maybe the type @@ -11343,7 +11613,9 @@ class binary_reader exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr)); } + // detail::sax_call_next_token_start_pos(sax, chars_read - 1); const bool is_error = get_ubjson_size_value(result.first, is_ndarray); + //detail::sax_call_next_token_end_pos(sax, chars_read); if (input_format == input_format_t::bjdata && is_ndarray) { if (inside_ndarray) @@ -11358,7 +11630,9 @@ class binary_reader if (current == '#') { + // detail::sax_call_next_token_start_pos(sax, chars_read - 1); const bool is_error = get_ubjson_size_value(result.first, is_ndarray); + // detail::sax_call_next_token_end_pos(sax, chars_read); if (input_format == input_format_t::bjdata && is_ndarray) { return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read, @@ -11367,6 +11641,7 @@ class binary_reader return is_error; } + // detail::sax_call_next_token_start_end_pos(sax, chars_read - 2, chars_read - 1); return true; } @@ -11382,40 +11657,47 @@ class binary_reader return unexpect_eof(input_format, "value"); case 'T': // true + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(true); case 'F': // false + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->boolean(false); case 'Z': // null + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->null(); case 'U': { std::uint8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } case 'i': { std::int8_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } case 'I': { std::int16_t number{}; + 
detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } case 'l': { std::int32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } - case 'L': { std::int64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_integer(number); } @@ -11426,6 +11708,7 @@ class binary_reader break; } std::uint16_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } @@ -11436,6 +11719,7 @@ class binary_reader break; } std::uint32_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } @@ -11446,11 +11730,13 @@ class binary_reader break; } std::uint64_t number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_unsigned(number); } case 'h': { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); if (input_format != input_format_t::bjdata) { break; @@ -11496,25 +11782,30 @@ class binary_reader return std::ldexp(mant + 1024, exp - 25); } }(); + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->number_float((half & 0x8000u) != 0 ? 
static_cast(-val) - : static_cast(val), ""); + : static_cast(val), + ""); } case 'd': { float number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_float(static_cast(number), ""); } case 'D': { double number{}; + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read + sizeof(number)); return get_number(input_format, number) && sax->number_float(static_cast(number), ""); } case 'H': { + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_high_precision_number(); } @@ -11532,19 +11823,25 @@ class binary_reader exception_message(input_format, concat("byte after 'C' must be in range 0x00..0x7F; last byte: 0x", last_token), "char"), nullptr)); } string_t s(1, static_cast(current)); + detail::sax_call_next_token_start_end_pos(sax, chars_read - 2, chars_read); return sax->string(s); } case 'S': // string { string_t s; - return get_ubjson_string(s) && sax->string(s); + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(s); + detail::sax_call_next_token_end_pos(sax, chars_read); + return result_get && sax->string(s); } case '[': // array + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_array(); case '{': // object + // call to detail::sax_call_next_token_start_end_pos inside of the method return get_ubjson_object(); default: // anything else @@ -11559,6 +11856,7 @@ class binary_reader */ bool get_ubjson_array() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); std::pair size_and_type; if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) { @@ -11583,6 +11881,7 @@ class binary_reader exception_message(input_format, "invalid byte: 0x" + last_token, "type"), nullptr)); } + detail::sax_call_next_token_end_pos(sax, chars_read); string_t type = it->second; // sax->string() takes a reference if 
(JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->string(type))) { @@ -11594,6 +11893,7 @@ class binary_reader size_and_type.second = 'U'; } + detail::sax_call_next_token_start_end_pos(sax, chars_read); key = "_ArrayData_"; if (JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->start_array(size_and_type.first) )) { @@ -11602,17 +11902,20 @@ class binary_reader for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) { return false; } } + detail::sax_call_next_token_start_end_pos(sax, chars_read); return (sax->end_array() && sax->end_object()); } if (size_and_type.first != npos) { + detail::sax_call_next_token_end_pos(sax, chars_read); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first))) { return false; @@ -11624,6 +11927,7 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second))) { return false; @@ -11635,6 +11939,7 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { + // call to detail::sax_call_next_token_start_end_pos inside of the method if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal())) { return false; @@ -11644,6 +11949,7 @@ class binary_reader } else { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_array(static_cast(-1)))) { return false; @@ -11659,6 +11965,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_array(); } @@ -11667,6 +11974,7 @@ class binary_reader */ bool get_ubjson_object() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); std::pair size_and_type; if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type))) { @@ -11684,6 +11992,7 @@ class binary_reader string_t key; if 
(size_and_type.first != npos) { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first))) { return false; @@ -11693,7 +12002,10 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -11708,7 +12020,10 @@ class binary_reader { for (std::size_t i = 0; i < size_and_type.first; ++i) { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -11722,6 +12037,7 @@ class binary_reader } else { + detail::sax_call_next_token_end_pos(sax, chars_read - 1); if (JSON_HEDLEY_UNLIKELY(!sax->start_object(static_cast(-1)))) { return false; @@ -11729,7 +12045,10 @@ class binary_reader while (current != '}') { - if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key))) + detail::sax_call_next_token_start_pos(sax, chars_read - 1); + const bool result_get = get_ubjson_string(key, false); + detail::sax_call_next_token_end_pos(sax, chars_read); + if (JSON_HEDLEY_UNLIKELY(!result_get || !sax->key(key))) { return false; } @@ -11742,6 +12061,7 @@ class binary_reader } } + detail::sax_call_next_token_start_end_pos(sax, chars_read - 1, chars_read); return sax->end_object(); } @@ -11750,6 +12070,7 @@ class binary_reader bool get_ubjson_high_precision_number() { + detail::sax_call_next_token_start_pos(sax, chars_read - 1); // get size of following number string std::size_t size{}; bool no_ndarray = true; @@ 
-11770,6 +12091,7 @@ class binary_reader } number_vector.push_back(static_cast(current)); } + detail::sax_call_next_token_end_pos(sax, chars_read); // parse number string using ia_type = decltype(detail::input_adapter(number_vector)); @@ -11967,6 +12289,7 @@ class binary_reader { if (JSON_HEDLEY_UNLIKELY(current == std::char_traits::eof())) { + detail::sax_call_next_token_end_pos(sax, chars_read); return sax->parse_error(chars_read, "", parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), nullptr)); } @@ -12178,8 +12501,6 @@ class parser , m_lexer(std::move(adapter), skip_comments) , allow_exceptions(allow_exceptions_) { - // read first token - get_token(); } /*! @@ -12200,7 +12521,7 @@ class parser sax_parse_internal(&sdp); // in strict mode, input must be completely read - if (strict && (get_token() != token_type::end_of_input)) + if (strict && (get_token(&sdp) != token_type::end_of_input)) { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12228,7 +12549,7 @@ class parser sax_parse_internal(&sdp); // in strict mode, input must be completely read - if (strict && (get_token() != token_type::end_of_input)) + if (strict && (get_token(&sdp) != token_type::end_of_input)) { sdp.parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12266,7 +12587,7 @@ class parser const bool result = sax_parse_internal(sax); // strict mode: next byte must be EOF - if (result && strict && (get_token() != token_type::end_of_input)) + if (result && strict && (get_token(sax) != token_type::end_of_input)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12287,6 +12608,8 @@ class parser // value to avoid a goto (see comment where set to true) bool skip_to_state_evaluation = false; + // read first token + get_token(sax); while (true) { if (!skip_to_state_evaluation) @@ -12302,7 +12625,7 @@ class parser } // closing } -> we are done - if (get_token() == token_type::end_object) + 
if (get_token(sax) == token_type::end_object) { if (JSON_HEDLEY_UNLIKELY(!sax->end_object())) { @@ -12324,7 +12647,7 @@ class parser } // parse separator (:) - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12335,7 +12658,7 @@ class parser states.push_back(false); // parse values - get_token(); + get_token(sax); continue; } @@ -12347,7 +12670,7 @@ class parser } // closing ] -> we are done - if (get_token() == token_type::end_array) + if (get_token(sax) == token_type::end_array) { if (JSON_HEDLEY_UNLIKELY(!sax->end_array())) { @@ -12474,10 +12797,10 @@ class parser if (states.back()) // array { // comma -> next value - if (get_token() == token_type::value_separator) + if (get_token(sax) == token_type::value_separator) { // parse a new value - get_token(); + get_token(sax); continue; } @@ -12507,10 +12830,10 @@ class parser // states.back() is false -> object // comma -> next value - if (get_token() == token_type::value_separator) + if (get_token(sax) == token_type::value_separator) { // parse key - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::value_string)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12523,7 +12846,7 @@ class parser } // parse separator (:) - if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator)) + if (JSON_HEDLEY_UNLIKELY(get_token(sax) != token_type::name_separator)) { return sax->parse_error(m_lexer.get_position(), m_lexer.get_token_string(), @@ -12531,7 +12854,7 @@ class parser } // parse values - get_token(); + get_token(sax); continue; } @@ -12559,10 +12882,19 @@ class parser } } - /// get next token from lexer - token_type get_token() + /// get next token from lexer and pass position info to sax (if it is accepted) + template + token_type 
get_token(SAX* sax) { - return last_token = m_lexer.scan(); + if (!m_lexer.scan_start()) + { + last_token = token_type::parse_error; + return token_type::parse_error; + } + detail::sax_call_next_token_start_pos(sax, m_lexer); + last_token = m_lexer.scan_end(); + detail::sax_call_next_token_end_pos(sax, m_lexer); + return last_token; } std::string exception_message(const token_type expected, const std::string& context) diff --git a/tests/src/unit-sax-parser-extended.cpp b/tests/src/unit-sax-parser-extended.cpp new file mode 100644 index 000000000..769e19859 --- /dev/null +++ b/tests/src/unit-sax-parser-extended.cpp @@ -0,0 +1,1830 @@ +/* + __ _____ _____ _____ + __| | __| | | | JSON for Modern C++ (test suite) +| | |__ | | | | | | version 3.10.2 +|_____|_____|_____|_|___| https://github.com/nlohmann/json + +Licensed under the MIT License . +SPDX-License-Identifier: MIT +Copyright (c) 2013-2019 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include + +#include "doctest_compatibility.h" + +#include + +// ignore warning to replace if with if constexpr since there are +// several in the file, just deactivate it here to prevent repeated ifdefs +DOCTEST_MSVC_SUPPRESS_WARNING(4127) + +//option to make this test more verbose +#define verbose_out \ + if (0) \ + std::cout + +//prototype to make -Wmissing-prototypes happy +struct element_info_t; +bool operator<(const element_info_t& l, const element_info_t& r); +std::ostream& operator<<(std::ostream& out, const element_info_t& v); +std::ostream& operator<<(std::ostream& out, const std::set& v); +template +void fill_expected_sax_pos_json(SAX& sax, + const FN& element, + const nlohmann::json& part, + std::size_t& offset); +template +void fill_expected_sax_pos_bson(SAX& sax, + const FN& element, + const nlohmann::json& part, + std::size_t& offset); +template +void fill_expected_sax_pos_cbor(SAX& sax, const FN& element, const nlohmann::json& part); +template +void fill_expected_sax_pos_msgpack(SAX& sax, const FN& element, const nlohmann::json& part); +template +void fill_expected_sax_pos_ubjson(SAX& sax, const FN& element, const nlohmann::json& part); +void test_json(nlohmann::json& json); + +//implementation + +struct element_info_t +{ + element_info_t(std::size_t idx, std::size_t first, std::size_t last) + : index{idx} + , start{first} + , end{last} + {} + std::size_t index = 0; + std::size_t start = 0; + std::size_t end = 0; +}; +bool operator<(const element_info_t& l, const element_info_t& r) +{ + return std::tie(l.index, l.start, l.end) < std::tie(r.index, r.start, r.end); +} +std::ostream& operator<<(std::ostream& out, const element_info_t& v) +{ + return (out << v.index << ':' << v.start << '-' << v.end + << '(' << v.end - v.start << ')'); +} +std::ostream& operator<<(std::ostream& out, const std::set& v) +{ + out << "{"; + if (v.size() > 32) + { + out << ">32 elements..."; + } + else + { + for (const auto& e : 
v) + { + out << ' ' << e; + } + } + out << " }"; + return out; +} + +template +struct Sax +{ + static constexpr bool has_callback = WithPos || (WithLex && !LexCallImpossible); + using json = nlohmann::json; + + enum class last_call_t + { + element, + start_pos, + end_pos + }; + + last_call_t last_call = last_call_t::element; + + element_info_t se{0, 0, 0}; + + std::set pos_null{}; + std::set pos_boolean{}; + std::set pos_number_integer{}; + std::set pos_number_unsigned{}; + std::set pos_number_float{}; + std::set pos_string{}; + std::set pos_binary{}; + std::set pos_start_object{}; + std::set pos_key{}; + std::set pos_end_object{}; + std::set pos_start_array{}; + std::set pos_end_array{}; + + void check_call(std::set& set, const char* fnname) + { + INFO("function " << fnname << ": " << se + << " (options = " << set << ')'); + if (has_callback) + { + CHECK(set.count(se) == 1); + CHECK(last_call == last_call_t::end_pos); + } + last_call = last_call_t::element; + set.erase(se); + ++se.index; + } + void check_start(std::size_t pos) + { + INFO("set start pos " << pos); + CHECK((last_call == last_call_t::element || last_call == last_call_t::end_pos)); + se.start = pos; + last_call = last_call_t::start_pos; + } + void check_end(std::size_t pos) + { + INFO("set end pos " << pos); + CHECK(last_call == last_call_t::start_pos); + se.end = pos; + last_call = last_call_t::end_pos; + } + + template + typename std::enable_if::type next_token_start(std::size_t pos) + { + check_start(pos); + CHECK((!WithLex || LexCallImpossible)); + } + + template < class LexT, bool Act = WithLex && !std::is_same::value > + typename std::enable_if::type next_token_start(const LexT& lex) + { + check_start(lex.get_position().chars_read_total - 1); + CHECK(WithLex); + } + + template + typename std::enable_if::type next_token_end(std::size_t pos) + { + check_end(pos); + CHECK((!WithLex || LexCallImpossible)); + } + + template < class LexT, bool Act = WithLex && !std::is_same::value > + typename 
std::enable_if::type next_token_end(const LexT& lex) + { + check_end(lex.get_position().chars_read_total); + CHECK(WithLex); + } + + bool null() + { + check_call(pos_null, __func__); + verbose_out << "got null\n"; + return true; + } + bool boolean(bool val) + { + check_call(pos_boolean, __func__); + verbose_out << "got boolean " << val << "\n"; + return true; + } + bool number_integer(json::number_integer_t val) + { + check_call(pos_number_integer, __func__); + verbose_out << "got number_integer " << val << "\n"; + return true; + } + bool number_unsigned(json::number_unsigned_t val) + { + check_call(pos_number_unsigned, __func__); + verbose_out << "got number_unsigned " << val << "\n"; + return true; + } + bool number_float(json::number_float_t val, const std::string& str) + { + check_call(pos_number_float, __func__); + verbose_out << "got float " << val << " (" << str << ")" + << "\n"; + return true; + } + bool string(std::string& val) + { + check_call(pos_string, __func__); + verbose_out << "got string " << val << "\n"; + return true; + } + bool binary(std::vector& val) + { + check_call(pos_binary, __func__); + verbose_out << "got binary: size " << val.size() << "\n"; + return true; + } + bool start_object(std::size_t val) + { + check_call(pos_start_object, __func__); + verbose_out << "got start_object: size " << val << "\n"; + return true; + } + bool key(std::string& val) + { + check_call(pos_key, __func__); + verbose_out << "got key " << val << "\n"; + return true; + } + bool end_object() + { + check_call(pos_end_object, __func__); + verbose_out << "got end_object\n"; + return true; + } + bool start_array(std::size_t val) + { + check_call(pos_start_array, __func__); + verbose_out << "got start_array: size " << val << "\n"; + return true; + } + bool end_array() + { + check_call(pos_end_array, __func__); + verbose_out << "got end_array\n"; + return true; + } + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const json::exception& 
/*unused*/) // NOLINT(readability-convert-member-functions-to-static) + { + std::cout << "got parse_error\n"; + CHECK(false); // should not happen + return false; + } + void check_all_pos_found() + { + INFO("check all null were found (elements left: " << pos_null << ')'); + CHECK(pos_null.empty()); + INFO("check all boolean were found (elements left: " << pos_boolean << ')'); + CHECK(pos_boolean.empty()); + INFO("check all number_integer were found (elements left: " << pos_number_integer << ')'); + CHECK(pos_number_integer.empty()); + INFO("check all number_unsigned were found (elements left: " << pos_number_unsigned << ')'); + CHECK(pos_number_unsigned.empty()); + INFO("check all number_float were found (elements left: " << pos_number_float << ')'); + CHECK(pos_number_float.empty()); + INFO("check all string were found (elements left: " << pos_string << ')'); + CHECK(pos_string.empty()); + INFO("check all binary were found (elements left: " << pos_binary << ')'); + CHECK(pos_binary.empty()); + INFO("check all start_object were found (elements left: " << pos_start_object << ')'); + CHECK(pos_start_object.empty()); + INFO("check all key were found (elements left: " << pos_key << ')'); + CHECK(pos_key.empty()); + INFO("check all end_object were found (elements left: " << pos_end_object << ')'); + CHECK(pos_end_object.empty()); + INFO("check all start_array were found (elements left: " << pos_start_array << ')'); + CHECK(pos_start_array.empty()); + INFO("check all end_array were found (elements left: " << pos_end_array << ')'); + CHECK(pos_end_array.empty()); + } +}; + +template +struct Opt +{ + static constexpr bool WithPos = WithPosV; + static constexpr bool WithLex = WithLexV; +}; + +using OptNone = Opt; +using OptLex = Opt; +using OptPos = Opt; +using OptBoth = Opt; + +//test basic functionality +TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) +{ + const bool with_pos = T::WithPos; + const bool with_lex = T::WithLex; + + INFO("WithPos " 
<< with_pos << ", WithLex " << with_lex); + //element count 0 1 2 3 4 5 6 7 8 9 10 + //index 10s place 0 1 2 3 4 5 + //index 1s place 012345678901234567890123456789012345678901234567890123 + const std::string str = R"({ "array" : [14294967296,-1,true,4.2,null,"str" ] })"; + std::size_t elem_idx = 0; + std::size_t char_idx = 0; + const auto element = [&](std::size_t bytes) + { + const auto start = char_idx; + char_idx += bytes; + return element_info_t{elem_idx++, start, char_idx}; + }; + const auto skip = [&](std::size_t bytes) + { + char_idx += bytes; + }; + SECTION("json") + { + std::string reconstructed; + const auto elementFromStr = [&](const std::string & s) + { + reconstructed += s; + return element(s.size()); + }; + const auto skipFromStr = [&](const std::string & s) + { + reconstructed += s; + skip(s.size()); + }; + Sax sax; + sax.pos_start_object.emplace(elementFromStr("{")); + skipFromStr(" "); + sax.pos_key.emplace(elementFromStr(R"("array")")); + skipFromStr(" : "); + sax.pos_start_array.emplace(elementFromStr("[")); + sax.pos_number_unsigned.emplace(elementFromStr("14294967296")); + skipFromStr(","); + sax.pos_number_integer.emplace(elementFromStr("-1")); + skipFromStr(","); + sax.pos_boolean.emplace(elementFromStr("true")); + skipFromStr(","); + sax.pos_number_float.emplace(elementFromStr("4.2")); + skipFromStr(","); + sax.pos_null.emplace(elementFromStr("null")); + skipFromStr(","); + sax.pos_string.emplace(elementFromStr(R"("str")")); + skipFromStr(" "); + sax.pos_end_array.emplace(elementFromStr("]")); + skipFromStr(" "); + sax.pos_end_object.emplace(elementFromStr("}")); + CHECK(nlohmann::json::sax_parse(str, &sax, nlohmann::json::input_format_t::json)); + if (with_pos || with_lex) + { + sax.check_all_pos_found(); + } + CHECK(char_idx == str.size()); + CHECK(str == reconstructed); + } + SECTION("bson") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_bson(j); + Sax sax; + 
sax.pos_start_object.emplace(element(4)); //4 bytes size + skip(1); //one byte type array + sax.pos_key.emplace(element(6)); //6 key (array\0) + sax.pos_start_array.emplace(element(4)); //4 bytes size + skip(3); //one byte type + key 0\0 + sax.pos_number_integer.emplace(element(8)); //8 bytes int64 + skip(3); //one byte type + key 1\0 + sax.pos_number_integer.emplace(element(4)); //4 bytes int32 + skip(3); //one byte type + key 2\0 + sax.pos_boolean.emplace(element(1)); //1 byte bool + skip(3); //one byte type + key 3\0 + sax.pos_number_float.emplace(element(8)); //8 bytes double + skip(3); //one byte type + key 4\0 + sax.pos_null.emplace(element((0))); //0 bytes + skip(3); //one byte type + key 5\0 + sax.pos_string.emplace(element(8)); //4 bytes size + (str\0) + sax.pos_end_array.emplace(element(1)); //1 byte \0 end of array + sax.pos_end_object.emplace(element(1)); //1 byte \0 end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::bson)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } + SECTION("cbor") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_cbor(j); + Sax sax; + sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_key.emplace(element(6)); //1 byte type + 5 bytes string (array) (size implicit) + sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_number_unsigned.emplace(element(9)); //1 byte type + 8 bytes uint64 + sax.pos_number_integer.emplace(element(1)); //1 byte type + 0 bytes int -> implicit value since small + sax.pos_boolean.emplace(element(1)); //1 byte type + 0 byte bool (value in type) + sax.pos_number_float.emplace(element(9)); //1 byte type + 8 bytes double + sax.pos_null.emplace(element((1))); //1 byte type + 0 bytes + sax.pos_string.emplace(element(4)); //1 byte type + 3 bytes string (str) (size implicit) + 
sax.pos_end_array.emplace(element(0)); //0 byte end of array + sax.pos_end_object.emplace(element(0)); //0 byte end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::cbor)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } + SECTION("msgpack") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_msgpack(j); + Sax sax; + sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size + sax.pos_key.emplace(element(6)); //1 byte type + 5 bytes string (array) (size implicit) + sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_number_unsigned.emplace(element(9)); //1 byte type + 8 bytes uint64 + sax.pos_number_integer.emplace(element(1)); //1 byte type + 0 bytes int -> implicit value since small + sax.pos_boolean.emplace(element(1)); //1 byte type + 0 byte bool (value in type) + sax.pos_number_float.emplace(element(9)); //1 byte type + 8 bytes double + sax.pos_null.emplace(element((1))); //1 byte type + 0 bytes + sax.pos_string.emplace(element(4)); //1 byte type + 3 bytes string (str) (size implicit) + sax.pos_end_array.emplace(element(0)); //0 byte end of array + sax.pos_end_object.emplace(element(0)); //0 byte end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::msgpack)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } + SECTION("ubjson") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_ubjson(j); + Sax sax; + sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size + sax.pos_key.emplace(element(7)); //1 byte length type + 1 byte length + 5 bytes string (array), no terminator + sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_number_integer.emplace(element(9)); //1 byte type + 8 bytes int64 (reported via number_integer) + sax.pos_number_integer.emplace(element(2)); //1 byte type + 1 
bytes int8 + sax.pos_boolean.emplace(element(1)); //1 byte type + 0 byte bool (value in type) + sax.pos_number_float.emplace(element(9)); //1 byte type + 8 bytes double + sax.pos_null.emplace(element((1))); //1 byte type + 0 bytes + sax.pos_string.emplace(element(6)); //1 type + 1 type of len + 1 len +3 string (str) + sax.pos_end_array.emplace(element(1)); //1 byte type + 0 byte end of array + sax.pos_end_object.emplace(element(1)); //1 byte type + 0 byte end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::ubjson)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } + SECTION("bjdata") + { + const auto j = nlohmann::json::parse(str); + const auto bin = nlohmann::json::to_bjdata(j); + Sax sax; + sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size + sax.pos_key.emplace(element(7)); //1 byte length type + 1 byte length + 5 bytes string (array), no terminator + sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) + sax.pos_number_integer.emplace(element(9)); //1 byte type + 8 bytes int64 (reported via number_integer) + sax.pos_number_integer.emplace(element(2)); //1 byte type + 1 byte int8 + sax.pos_boolean.emplace(element(1)); //1 byte type + 0 byte bool (value in type) + sax.pos_number_float.emplace(element(9)); //1 byte type + 8 bytes double + sax.pos_null.emplace(element((1))); //1 byte type + 0 bytes + sax.pos_string.emplace(element(6)); //1 type + 1 type of len + 1 len +3 string (str) + sax.pos_end_array.emplace(element(1)); //1 byte type + 0 byte end of array + sax.pos_end_object.emplace(element(1)); //1 byte type + 0 byte end of object + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::bjdata)); + if (with_pos) + { + sax.check_all_pos_found(); + } + } +} + +//cover more advanced cases (e.g. 
msgpack fixint) (but only use one templated version) +template +void fill_expected_sax_pos_json(SAX& sax, + const FN& element, + const nlohmann::json& part, + std::size_t& offset) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + sax.pos_null.emplace(element(4)); //null + } + break; + case nlohmann::json::value_t::object: + { + sax.pos_start_object.emplace(element(1)); // { + for (auto& el : part.items()) + { + sax.pos_key.emplace(element(el.key().size() + 2)); //'"' + str + '"' + offset += 1; // separator ':' between key and value + fill_expected_sax_pos_json(sax, element, el.value(), offset); + offset += 1; // add , + } + if (!part.empty()) + { + offset -= 1; // remove last , + } + sax.pos_end_object.emplace(element(1)); // } + } + break; + case nlohmann::json::value_t::array: + { + sax.pos_start_array.emplace(element(1)); // [ + for (auto& el : part.items()) + { + fill_expected_sax_pos_json(sax, element, el.value(), offset); + offset += 1; // add , + } + if (!part.empty()) + { + offset -= 1; // remove last , + } + sax.pos_end_array.emplace(element(1)); // ] + } + break; + case nlohmann::json::value_t::string: + { + const auto val = part.get(); + std::size_t nbytes = val.size() + 2; //'"' + value + '"' + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + const auto val = part.get(); + if (val) + { + sax.pos_boolean.emplace(element(4)); // true + } + else + { + sax.pos_boolean.emplace(element(5)); // false + } + } + break; + case nlohmann::json::value_t::number_integer: + { + const auto val = part.get(); + std::size_t nbytes = std::to_string(val).size(); + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + const auto val = part.get(); + std::size_t nbytes = std::to_string(val).size(); + sax.pos_number_unsigned.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + const auto val = 
part.get(); + std::size_t nbytes = std::to_string(val).size(); + sax.pos_number_float.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::binary: + { + //stored as object with array and subtype + nlohmann::json sub; + sub["bytes"] = nlohmann::json::array(); + for (const auto e : part.get_binary()) + { + sub["bytes"].emplace_back(e); + } + sub["subtype"]; + fill_expected_sax_pos_json(sax, element, sub, offset); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! default"}; + } +} + +template +void fill_expected_sax_pos_bson(SAX& sax, + const FN& element, + const nlohmann::json& part, + std::size_t& offset) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + //type is before the key -> not included + sax.pos_null.emplace(element(0)); + } + break; + case nlohmann::json::value_t::object: + { + sax.pos_start_object.emplace(element(4)); //32 bit size + for (auto& el : part.items()) + { + offset += 1; // type of item + sax.pos_key.emplace(element(el.key().size() + 1)); // str + terminator + fill_expected_sax_pos_bson(sax, element, el.value(), offset); + } + sax.pos_end_object.emplace(element(1)); // \0 terminator + } + break; + case nlohmann::json::value_t::array: + { + sax.pos_start_array.emplace(element(4)); //32 bit size + std::size_t i = 0; + for (auto& el : part.items()) + { + offset += 1; // type of item + offset += 1 + std::to_string(i).size(); // dummy key + terminator + fill_expected_sax_pos_bson(sax, element, el.value(), offset); + ++i; + } + sax.pos_end_array.emplace(element(1)); // \0 terminator + } + break; + case nlohmann::json::value_t::string: + { + //type is before the key -> not included + std::size_t nbytes = 4; //size + const auto val = part.get(); + nbytes += val.size() + 1; //value + \0 terminate + 
sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + //type is before the key -> not included + std::size_t nbytes = 1; //value + sax.pos_boolean.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 0; //type is before the key -> not included + const auto val = part.get(); + //for <-24 : -n-1 + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + std::size_t nbytes = 0; //type is before the key -> not included + const auto val = part.get(); + if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + std::size_t nbytes = 0; //type is before the key -> not included + nbytes += 8; //value + sax.pos_number_float.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::binary: + { + std::size_t nbytes = 0; //type is before the key -> not included + nbytes += 4; // length of bin (32 bit) + nbytes += 1; // subtype + nbytes += part.get_binary().size(); + sax.pos_binary.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! 
default"}; + } +} + +template +void fill_expected_sax_pos_cbor(SAX& sax, const FN& element, const nlohmann::json& part) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + std::size_t nbytes = 1; //type + sax.pos_null.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::object: + { + std::size_t nbytes = 1; //type + if (part.size() <= 0x17) + { + //size implicit in type + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 1; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 2; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_start_object.emplace(element(nbytes)); + //key follows same rules as string + for (auto& el : part.items()) + { + std::size_t nbyteskey = 1; //type + nbyteskey += el.key().size(); + if (el.key().size() <= 0x17) + { + //size implicit in type + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 1; + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 2; + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 4; + } + else + { + nbyteskey += 8; + } + sax.pos_key.emplace(element(nbyteskey)); + fill_expected_sax_pos_cbor(sax, element, el.value()); + } + sax.pos_end_object.emplace(element(0)); + } + break; + case nlohmann::json::value_t::array: + { + std::size_t nbytes = 1; //type + if (part.size() <= 0x17) + { + //size implicit in type + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 1; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 2; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_start_array.emplace(element(nbytes)); + //add elements + for (const auto& elem : part) + { + fill_expected_sax_pos_cbor(sax, element, elem); + } + sax.pos_end_array.emplace(element(0)); + } + break; + case 
nlohmann::json::value_t::string: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + nbytes += val.size(); + if (val.size() <= static_cast(0x17)) + { + //size implicit in type + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + std::size_t nbytes = 1; //type + sax.pos_boolean.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //for <-24 : -n-1 + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= -24) + { + //value implicit in type + } + else if (-val - 1 <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (-val - 1 <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (-val - 1 <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val <= static_cast(0x17)) + { + //value implicit in type + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_unsigned.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //really 
depends on the input type + if (val < 0) + { + std::cout << "unexpected float <0\n"; + throw std::logic_error{"unexpected float <0"}; + } + if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; //float + } + else + { + nbytes += 8; //double float + } + sax.pos_number_float.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::binary: + { + std::size_t nbytes = 1; //type + const auto& val = part.get_binary(); + nbytes += val.size(); + if (val.size() <= static_cast(0x17)) + { + //size implicit in type + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_binary.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! 
default"}; + } +} + +template +void fill_expected_sax_pos_msgpack(SAX& sax, const FN& element, const nlohmann::json& part) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + std::size_t nbytes = 1; //type + sax.pos_null.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::object: + { + std::size_t nbytes = 1; //type + if (part.size() <= 0x0F) + { + //size implicit in type + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 2; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_start_object.emplace(element(nbytes)); + //key follows same rules as string + for (auto& el : part.items()) + { + std::size_t nbyteskey = 1; //type + nbyteskey += el.key().size(); + if (el.key().size() <= 0x1F) + { + //size implicit in type + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 1; + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 2; + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 4; + } + else + { + nbyteskey += 8; + } + sax.pos_key.emplace(element(nbyteskey)); + fill_expected_sax_pos_msgpack(sax, element, el.value()); + } + sax.pos_end_object.emplace(element(0)); + } + break; + case nlohmann::json::value_t::array: + { + std::size_t nbytes = 1; //type + if (part.size() <= 0x0F) + { + //size implicit in type + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 2; + } + else if (part.size() <= std::numeric_limits::max()) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_start_array.emplace(element(nbytes)); + //add elements + for (const auto& elem : part) + { + fill_expected_sax_pos_msgpack(sax, element, elem); + } + sax.pos_end_array.emplace(element(0)); + } + break; + case nlohmann::json::value_t::string: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + nbytes += val.size(); + if (val.size() <= 
static_cast(0x1F)) + { + //size implicit in type + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + std::size_t nbytes = 1; //type + sax.pos_boolean.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //for <-24 : -n-1 + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= -32) + { + //value implicit in type + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 1; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 2; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val <= static_cast(0x7F)) + { + //value implicit in type + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_unsigned.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //really depends on the input type + if (val < 0) + { + std::cout << "unexpected float <0\n"; + throw std::logic_error{"unexpected float <0"}; + } + if (val <= 
static_cast(std::numeric_limits::max())) + { + nbytes += 4; //float + } + else + { + nbytes += 8; //double float + } + sax.pos_number_float.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::binary: + { + std::size_t nbytes = 1; //type + const auto& val = part.get_binary(); + nbytes += val.size(); + if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_binary.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! default"}; + } +} + +template +void fill_expected_sax_pos_ubjson(SAX& sax, const FN& element, const nlohmann::json& part) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + std::size_t nbytes = 1; //type + sax.pos_null.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::object: + { + sax.pos_start_object.emplace(element(1)); + //key follows same rules as string + for (auto& el : part.items()) + { + std::size_t nbyteskey = 1; //type of len + nbyteskey += el.key().size(); + if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 1; // size of len + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 2; // size of len + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 4; // size of len + } + else + { + nbyteskey += 8; // size of len + } + sax.pos_key.emplace(element(nbyteskey)); + fill_expected_sax_pos_ubjson(sax, element, el.value()); + } + sax.pos_end_object.emplace(element(1)); + } + break; + case nlohmann::json::value_t::array: + { + sax.pos_start_array.emplace(element(1)); + //add 
elements + for (const auto& elem : part) + { + fill_expected_sax_pos_ubjson(sax, element, elem); + } + sax.pos_end_array.emplace(element(1)); + } + break; + case nlohmann::json::value_t::string: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + nbytes += val.size(); + nbytes += 1; // type of length + if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + std::size_t nbytes = 1; //type + sax.pos_boolean.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + //for <-24 : -n-1 + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 1; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 2; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + //supported integer types : + // uint8 + // int8/16/32/64/High precision + // --> only 128-255 are stored as uint + high precision > max int64 + bool use_uint = false; + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val < 128) + { + ++nbytes; + } + else if (val >= 128 && val <= 255) + { + use_uint = true; + ++nbytes; + } + else + { + //stored as signed int! 
+ if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 8; + } + else + { + //High precision + //more complex calculation of size is not done here + //the size includes + // type (high precision) + // type of size of value length + // size of value length + // value as array of chars + //in this case + nbytes = 22; + if (val > std::numeric_limits::max() - 128) + { + //in this test case the value needs one more char + nbytes += 1; + } + if (val > static_cast(std::numeric_limits::max())) + { + use_uint = true; + } + } + } + if (use_uint) + { + sax.pos_number_unsigned.emplace(element(nbytes)); + } + else + { + sax.pos_number_integer.emplace(element(nbytes)); + } + } + break; + case nlohmann::json::value_t::number_float: + { + //everything is serialized as double (type+double value) + sax.pos_number_float.emplace(element(8 + 1)); + } + break; + case nlohmann::json::value_t::binary: + { + // Note, no reader for UBJSON binary types is implemented because they do + auto sub = nlohmann::json::array(); + for (const auto i : part.get_binary()) + { + sub.emplace_back(i); + } + fill_expected_sax_pos_ubjson(sax, element, sub); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! 
default"}; + } +} + +template +void fill_expected_sax_pos_bjdata(SAX& sax, const FN& element, const nlohmann::json& part) +{ + switch (part.type()) + { + case nlohmann::json::value_t::null: + { + std::size_t nbytes = 1; //type + sax.pos_null.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::object: + { + sax.pos_start_object.emplace(element(1)); + //key follows same rules as string + for (auto& el : part.items()) + { + std::size_t nbyteskey = 1; //type of len + nbyteskey += el.key().size(); + if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 1; // size of len + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 2; // size of len + } + else if (el.key().size() <= std::numeric_limits::max()) + { + nbyteskey += 4; // size of len + } + else + { + nbyteskey += 8; // size of len + } + sax.pos_key.emplace(element(nbyteskey)); + fill_expected_sax_pos_bjdata(sax, element, el.value()); + } + sax.pos_end_object.emplace(element(1)); + } + break; + case nlohmann::json::value_t::array: + { + sax.pos_start_array.emplace(element(1)); + //add elements + for (const auto& elem : part) + { + fill_expected_sax_pos_bjdata(sax, element, elem); + } + sax.pos_end_array.emplace(element(1)); + } + break; + case nlohmann::json::value_t::string: + { + std::size_t nbytes = 1; //type + const auto val = part.get(); + nbytes += val.size(); + nbytes += 1; // type of length + if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val.size() <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_string.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::boolean: + { + std::size_t nbytes = 1; //type + sax.pos_boolean.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_integer: + { + std::size_t nbytes = 1; //type 
+ const auto val = part.get(); + if (val >= 0) + { + std::cout << "unexpected int >= 0\n"; + throw std::logic_error{"unexpected int >= 0"}; + } + if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 1; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 2; + } + else if (val >= static_cast(std::numeric_limits::min())) + { + nbytes += 4; + } + else + { + nbytes += 8; + } + sax.pos_number_integer.emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_unsigned: + { + auto* category = &sax.pos_number_unsigned; + std::size_t nbytes = 1; //type + const auto val = part.get(); + if (val <= static_cast(std::numeric_limits::max())) + { + //the serializer uses int8 for these values + category = &sax.pos_number_integer; + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 1; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + //the serializer uses int16 for these values + category = &sax.pos_number_integer; + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 2; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + //the serializer uses int32 for these values + category = &sax.pos_number_integer; + nbytes += 4; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + nbytes += 4; + } + else if (val <= static_cast(std::numeric_limits::max())) + { + //the serializer uses int64 for these values + category = &sax.pos_number_integer; + nbytes += 8; + } + else + { + nbytes += 8; + } + category->emplace(element(nbytes)); + } + break; + case nlohmann::json::value_t::number_float: + { + //everything is serialized as double (type+double value) + sax.pos_number_float.emplace(element(8 + 1)); + } + break; + case nlohmann::json::value_t::binary: + { + // Note, no reader for UBJSON binary types is implemented because they do not exist + auto sub = nlohmann::json::array(); + for (const auto i : part.get_binary()) + 
{ + sub.emplace_back(i); + } + fill_expected_sax_pos_ubjson(sax, element, sub); + } + break; + case nlohmann::json::value_t::discarded: + { + std::cout << "unexpected! value_t::discarded\n"; + throw std::logic_error{"unexpected! value_t::discarded"}; + } + break; + default: + throw std::logic_error{"unexpected! default"}; + } +} + +void test_json(nlohmann::json& json) +{ + Sax sax; + std::size_t elem_idx = 0; + std::size_t char_idx = 0; + const auto element = [&](std::size_t bytes) + { + const auto start = char_idx; + char_idx += bytes; + return element_info_t{elem_idx++, start, char_idx}; + }; + SECTION("json") + { + const auto bin = json.dump(); + std::cout << "json has size of " << bin.size() << '\n'; + fill_expected_sax_pos_json(sax, element, json, char_idx); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::json)); + sax.check_all_pos_found(); + } + SECTION("bson") + { + //since bson can't deal with values > int64 max we need to remove some + if (json.contains("uints")) + { + auto& ar = json["uints"]; + const std::uint64_t limit = std::numeric_limits::max(); + while (ar.back() > limit) + { + ar.erase(ar.size() - 1); + } + } + const auto bin = nlohmann::json::to_bson(json); + std::cout << "bson has size of " << bin.size() << '\n'; + fill_expected_sax_pos_bson(sax, element, json, char_idx); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::bson)); + sax.check_all_pos_found(); + } + SECTION("cbor") + { + const auto bin = nlohmann::json::to_cbor(json); + std::cout << "cbor has size of " << bin.size() << '\n'; + fill_expected_sax_pos_cbor(sax, element, json); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::cbor)); + sax.check_all_pos_found(); + } + SECTION("msgpack") + { + const auto bin = nlohmann::json::to_msgpack(json); + std::cout << "msgpack has size of " << bin.size() << '\n'; + 
fill_expected_sax_pos_msgpack(sax, element, json); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::msgpack)); + sax.check_all_pos_found(); + } + SECTION("ubjson") + { + const auto bin = nlohmann::json::to_ubjson(json); + std::cout << "ubjson has size of " << bin.size() << '\n'; + fill_expected_sax_pos_ubjson(sax, element, json); + CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::ubjson)); + sax.check_all_pos_found(); + } + SECTION("bjdata") + { + const auto bin = nlohmann::json::to_bjdata(json); + std::cout << "bjdata has size of " << bin.size() << '\n'; + fill_expected_sax_pos_bjdata(sax, element, json); + //CHECK(char_idx == bin.size()); + CHECK(nlohmann::json::sax_parse(bin, &sax, nlohmann::json::input_format_t::bjdata)); + sax.check_all_pos_found(); + } +} + +TEST_CASE("extended parser generated (uint)") +{ + std::cout << "extended parser generated (uint) "; + nlohmann::json json; + auto& array = json["uints"]; + for (std::uint64_t i = 0; i < 512; ++i) + { + array.emplace_back(i); + } + //check area around key points + const auto add_area = [&](std::uint64_t mid, std::uint64_t lower, std::uint64_t higher) + { + for (std::uint64_t i = mid - lower; i < mid + higher; ++i) + { + array.emplace_back(i); + } + array.emplace_back(mid + higher); + }; + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max(), 32, 32); + + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max(), 32, 32); + + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max() / 2, 32, 32); + add_area(std::numeric_limits::max(), 32, 0); + test_json(json); +} +TEST_CASE("extended parser generated (int)") +{ + std::cout << "extended parser generated (int) "; + nlohmann::json json; + auto& 
array = json["ints"]; + for (std::int64_t i = -512; i <= -1; ++i) + { + array.emplace_back(i); + } + //check area around key points + const auto add_area = [&](std::int64_t mid, std::int64_t lower, std::int64_t higher) + { + for (std::int64_t i = mid - lower; i <= mid + higher; ++i) + { + array.emplace_back(i); + } + }; + add_area(std::numeric_limits::min(), 32, 32); + add_area(std::numeric_limits::min(), 32, 32); + add_area(std::numeric_limits::min(), 32, 32); + add_area(std::numeric_limits::min(), 0, 32); + test_json(json); +} +TEST_CASE("extended parser generated (array / bool)") +{ + std::cout << "extended parser generated (array / bool) "; + nlohmann::json json; + auto& array = json["arrays"]; + array = nlohmann::json::array(); + for (std::uint64_t i = 0; i < 512; ++i) + { + auto sub = nlohmann::json::array(); + for (std::uint64_t j = 0; j < i; ++j) + { + sub.emplace_back((j % 2 == 0)); + } + array.emplace_back(std::move(sub)); + } + //add large array + auto sub = nlohmann::json::array(); + for (std::uint64_t j = 0; j < std::numeric_limits::max() + 1; ++j) + { + sub.emplace_back((j % 2 == 0)); + } + array.emplace_back(std::move(sub)); + test_json(json); +} +TEST_CASE("extended parser generated (object / null)") +{ + std::cout << "extended parser generated (object / null) "; + nlohmann::json json; + auto& array = json["objects"]; + array = nlohmann::json::array(); + for (std::uint64_t i = 0; i < 512; ++i) + { + auto sub = nlohmann::json::object(); + for (std::uint64_t j = 0; j < i; ++j) + { + sub[std::string(static_cast(j), 'k')]; + + } + array.emplace_back(std::move(sub)); + } + //add object with long key + auto sub = nlohmann::json::object(); + sub[std::string(std::numeric_limits::max() + 1, 'k')]; + array.emplace_back(std::move(sub)); + test_json(json); +} +TEST_CASE("extended parser generated (string)") +{ + std::cout << "extended parser generated (string) "; + nlohmann::json json; + auto& array = json["strings"]; + array = nlohmann::json::array(); + for 
(std::uint64_t i = 0; i < 512; ++i) + { + array.emplace_back(std::string(static_cast(i), '|')); + } + array.emplace_back(std::string(std::numeric_limits::max() + 1, '|')); + //test with large strings (e.g. requiring uint64 as size type) are not done + test_json(json); +} +TEST_CASE("extended parser generated (binary)") +{ + std::cout << "extended parser generated (binary) "; + nlohmann::json json; + auto& array = json["binary"]; + array = nlohmann::json::array(); + for (std::uint64_t i = 0; i < 512; ++i) + { + array.emplace_back(nlohmann::json::binary(std::vector(static_cast(i), 255))); + } + //add large binary + std::vector data(std::numeric_limits::max() + 1, 255); + array.emplace_back(nlohmann::json::binary(std::move(data))); + test_json(json); +} diff --git a/tests/src/unit-sax-parser-store-source-location.cpp b/tests/src/unit-sax-parser-store-source-location.cpp new file mode 100644 index 000000000..0820a81ce --- /dev/null +++ b/tests/src/unit-sax-parser-store-source-location.cpp @@ -0,0 +1,337 @@ +/* + __ _____ _____ _____ + __| | __| | | | JSON for Modern C++ (test suite) +| | |__ | | | | | | version 3.10.2 +|_____|_____|_____|_|___| https://github.com/nlohmann/json + +Licensed under the MIT License . +SPDX-License-Identifier: MIT +Copyright (c) 2013-2019 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include +#include + +#include "doctest_compatibility.h" + +#include + +//prototype to make -Wmissing-prototypes happy +std::ostream& operator<<(std::ostream& out, const nlohmann::detail::position_t& p); + +//test json parser with detailed line / col information as metadata + +struct token_start_stop +{ + nlohmann::detail::position_t start{}; + nlohmann::detail::position_t stop{}; +}; + +std::ostream& operator<<(std::ostream& out, const nlohmann::detail::position_t& p) +{ + out << p.chars_read_total << '(' << p.lines_read << ':' << p.chars_read_current_line << ')'; + return out; +} + +using json_with_token_start_stop = + nlohmann::basic_json < + std::map, + std::vector, + std::string, + bool, + std::int64_t, + std::uint64_t, + double, + std::allocator, + nlohmann::adl_serializer, + std::vector, + token_start_stop >; + +//adapted from detail::json_sax_dom_parser +class sax_with_token_start_stop_metadata +{ + public: + using json = json_with_token_start_stop; + using number_integer_t = typename json::number_integer_t; + using number_unsigned_t = typename json::number_unsigned_t; + using number_float_t = typename json::number_float_t; + using string_t = typename json::string_t; + using binary_t = typename json::binary_t; + + /*! 
+ @param[in,out] r reference to a JSON value that is manipulated while + parsing + @param[in] allow_exceptions_ whether parse errors yield exceptions + */ + explicit sax_with_token_start_stop_metadata(json& r, const bool allow_exceptions_ = true) + : root(r) + , ref_stack{} + , object_element{nullptr} + , errored{false} + , allow_exceptions(allow_exceptions_) + , start_stop{} + {} + + template + void next_token_start(const nlohmann::detail::lexer& lex) + { + start_stop.start = lex.get_position(); + } + + template + void next_token_end(const nlohmann::detail::lexer& lex) + { + start_stop.stop = lex.get_position(); + } + + bool null() + { + handle_value(nullptr); + return true; + } + + bool boolean(bool val) + { + handle_value(val); + return true; + } + + bool number_integer(number_integer_t val) + { + handle_value(val); + return true; + } + + bool number_unsigned(number_unsigned_t val) + { + handle_value(val); + return true; + } + + bool number_float(number_float_t val, const string_t& /*unused*/) + { + handle_value(val); + return true; + } + + bool string(string_t& val) + { + handle_value(val); + return true; + } + + bool binary(binary_t& val) + { + handle_value(std::move(val)); + return true; + } + + bool start_object(std::size_t len) + { + ref_stack.push_back(handle_value(json::value_t::object)); + ref_stack.back()->start = start_stop.start; + + if (len != static_cast(-1) && len > ref_stack.back()->max_size()) + { + throw nlohmann::detail::out_of_range::create(408, nlohmann::detail::concat("excessive object size: ", std::to_string(len)), ref_stack.back()); + } + + return true; + } + + bool key(string_t& val) + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_object()); + + // add null at given key and store the reference for later + object_element = &(*ref_stack.back())[val]; + return true; + } + + bool end_object() + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_object()); + + ref_stack.back()->stop = start_stop.stop; + 
ref_stack.pop_back(); + return true; + } + + bool start_array(std::size_t len) + { + ref_stack.push_back(handle_value(json::value_t::array)); + ref_stack.back()->start = start_stop.start; + + if (len != static_cast(-1) && len > ref_stack.back()->max_size()) + { + throw nlohmann::detail::out_of_range::create(408, nlohmann::detail::concat("excessive array size: ", std::to_string(len)), ref_stack.back()); + } + + return true; + } + + bool end_array() + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_array()); + + ref_stack.back()->stop = start_stop.stop; + ref_stack.pop_back(); + return true; + } + + template + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const Exception& ex) + { + errored = true; + static_cast(ex); + if (allow_exceptions) + { + throw ex; + } + return false; + } + + constexpr bool is_errored() const + { + return errored; + } + + private: + /*! + @invariant If the ref stack is empty, then the passed value will be the new + root. + @invariant If the ref stack contains a value, then it is an array or an + object to which we can add elements + */ + template + json* + handle_value(Value&& v) + { + if (ref_stack.empty()) + { + root = json(std::forward(v)); + root.start = start_stop.start; + root.stop = start_stop.stop; + return &root; + } + + assert(ref_stack.back()->is_array() || ref_stack.back()->is_object()); + + if (ref_stack.back()->is_array()) + { + auto& array_element = ref_stack.back()->emplace_back(std::forward(v)); + array_element.start = start_stop.start; + array_element.stop = start_stop.stop; + return &array_element; + } + + assert(ref_stack.back()->is_object()); + assert(object_element); + *object_element = json(std::forward(v)); + object_element->start = start_stop.start; + object_element->stop = start_stop.stop; + return object_element; + } + + /// the parsed JSON value + json& root; + /// stack to model hierarchy of values + std::vector ref_stack{}; + /// helper to hold the reference for the next 
object element + json* object_element = nullptr; + /// whether a syntax error occurred + bool errored = false; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; + /// start / stop information for the current token + token_start_stop start_stop{}; +}; + +TEST_CASE("parse-json-with-position-info") +{ + const std::string str = + /*line 0*/ R"({)" + "\n" + /*line 1*/ R"( "array" : [)" + "\n" + /*line 2*/ R"( 14294967296,)" + "\n" + /*line 3*/ R"( -1,)" + "\n" + /*line 4*/ R"( true,)" + "\n" + /*line 5*/ R"( 4.2,)" + "\n" + /*line 6*/ R"( null,)" + "\n" + /*line 7*/ R"( "str")" + "\n" + /*line 8*/ R"( ])" + "\n" + /*line 9*/ R"(})"; + json_with_token_start_stop j; + sax_with_token_start_stop_metadata sax{j}; + CHECK(nlohmann::json::sax_parse(str, &sax, nlohmann::json::input_format_t::json)); + CHECK(j.start.lines_read == 0); + CHECK(j.start.chars_read_current_line == 1); + + CHECK(j["array"].start.lines_read == 1); + CHECK(j["array"].start.chars_read_current_line == 13); + + CHECK(j["array"][0].start.lines_read == 2); + CHECK(j["array"][0].start.chars_read_current_line == 5); + CHECK(j["array"][0].stop.lines_read == 2); + CHECK(j["array"][0].stop.chars_read_current_line == 15); + + CHECK(j["array"][1].start.lines_read == 3); + CHECK(j["array"][1].start.chars_read_current_line == 5); + CHECK(j["array"][1].stop.lines_read == 3); + CHECK(j["array"][1].stop.chars_read_current_line == 6); + + CHECK(j["array"][2].start.lines_read == 4); + CHECK(j["array"][2].start.chars_read_current_line == 5); + CHECK(j["array"][2].stop.lines_read == 4); + CHECK(j["array"][2].stop.chars_read_current_line == 8); + + CHECK(j["array"][3].start.lines_read == 5); + CHECK(j["array"][3].start.chars_read_current_line == 5); + CHECK(j["array"][3].stop.lines_read == 5); + CHECK(j["array"][3].stop.chars_read_current_line == 7); + + CHECK(j["array"][4].start.lines_read == 6); //starts directly after last value.... 
+ CHECK(j["array"][4].start.chars_read_current_line == 5); + CHECK(j["array"][4].stop.lines_read == 6); + CHECK(j["array"][4].stop.chars_read_current_line == 8); + + CHECK(j["array"][5].start.lines_read == 7); + CHECK(j["array"][5].start.chars_read_current_line == 5); + CHECK(j["array"][5].stop.lines_read == 7); + CHECK(j["array"][5].stop.chars_read_current_line == 9); + + CHECK(j["array"].stop.lines_read == 8); + CHECK(j["array"].stop.chars_read_current_line == 3); + + CHECK(j.stop.lines_read == 9); + CHECK(j.stop.chars_read_current_line == 1); +} From 52a1d542cac56d846486810b79357f82f65a3327 Mon Sep 17 00:00:00 2001 From: Raphael Grimm Date: Mon, 19 Dec 2022 16:20:47 +0100 Subject: [PATCH 2/7] Add examples for extend sax parser --- docs/examples/sax_parse_with_src_location.cpp | 149 ++++++++ .../sax_parse_with_src_location.output | 37 ++ .../sax_parse_with_src_location_in_json.cpp | 339 ++++++++++++++++++ ...sax_parse_with_src_location_in_json.output | 19 + 4 files changed, 544 insertions(+) create mode 100644 docs/examples/sax_parse_with_src_location.cpp create mode 100644 docs/examples/sax_parse_with_src_location.output create mode 100644 docs/examples/sax_parse_with_src_location_in_json.cpp create mode 100644 docs/examples/sax_parse_with_src_location_in_json.output diff --git a/docs/examples/sax_parse_with_src_location.cpp b/docs/examples/sax_parse_with_src_location.cpp new file mode 100644 index 000000000..85457772f --- /dev/null +++ b/docs/examples/sax_parse_with_src_location.cpp @@ -0,0 +1,149 @@ +#include +#include +#include +#include + +using json = nlohmann::json; + +// a simple event consumer that collects string representations of the passed +// values and their source locations; +// note inheriting from json::json_sax_t is not required, but can +// help not to forget a required function +class sax_event_consumer : public json::json_sax_t +{ + public: + std::vector events; + std::size_t next_token_start_pos = 0; + std::size_t next_token_end_pos = 0; + 
+ void next_token_start(std::size_t pos) + { + next_token_start_pos = pos; + } + + void next_token_end(std::size_t pos) + { + next_token_end_pos = pos; + } + + std::string location_as_str() const + { + return "at=[" + std::to_string(next_token_start_pos) + "," + std::to_string(next_token_end_pos) + ")"; + } + + bool null() override + { + events.push_back("null(" + location_as_str() + ")"); + return true; + } + + bool boolean(bool val) override + { + events.push_back("boolean(val=" + std::string(val ? "true" : "false") + ", " + location_as_str() + ")"); + return true; + } + + bool number_integer(number_integer_t val) override + { + events.push_back("number_integer(val=" + std::to_string(val) + ", " + location_as_str() + ")"); + return true; + } + + bool number_unsigned(number_unsigned_t val) override + { + events.push_back("number_unsigned(val=" + std::to_string(val) + ", " + location_as_str() + ")"); + return true; + } + + bool number_float(number_float_t val, const string_t& s) override + { + events.push_back("number_float(val=" + std::to_string(val) + ", s=" + s + ", " + location_as_str() + ")"); + return true; + } + + bool string(string_t& val) override + { + events.push_back("string(val=" + val + ", " + location_as_str() + ")"); + return true; + } + + bool start_object(std::size_t elements) override + { + events.push_back("start_object(elements=" + std::to_string(elements) + ", " + location_as_str() + ")"); + return true; + } + + bool end_object() override + { + events.push_back("end_object(" + location_as_str() + ")"); + return true; + } + + bool start_array(std::size_t elements) override + { + events.push_back("start_array(elements=" + std::to_string(elements) + ", " + location_as_str() + ")"); + return true; + } + + bool end_array() override + { + events.push_back("end_array(" + location_as_str() + ")"); + return true; + } + + bool key(string_t& val) override + { + events.push_back("key(val=" + val + ", " + location_as_str() + ")"); + return true; + } + + 
bool binary(json::binary_t& val) override + { + events.push_back("binary(val=[...], " + location_as_str() + ")"); + return true; + } + + bool parse_error(std::size_t position, const std::string& last_token, const json::exception& ex) override + { + events.push_back("parse_error(position=" + std::to_string(position) + ", last_token=" + last_token + ",\n ex=" + std::string(ex.what()) + ")"); + return false; + } +}; + +int main() +{ + // a JSON text + auto text = R"( + { + "Image": { + "Width": 800, + "Height": 600, + "Title": "View from 15th Floor", + "Thumbnail": { + "Url": "http://www.example.com/image/481989943", + "Height": 125, + "Width": 100 + }, + "Animated" : false, + "IDs": [116, 943, 234, -38793], + "DeletionDate": null, + "Distance": 12.723374634 + } + }] + )"; + + // create a SAX event consumer object + sax_event_consumer sec; + + // parse JSON + bool result = json::sax_parse(text, &sec); + + // output the recorded events + for (auto& event : sec.events) + { + std::cout << event << "\n"; + } + + // output the result of sax_parse + std::cout << "\nresult: " << std::boolalpha << result << std::endl; +} diff --git a/docs/examples/sax_parse_with_src_location.output b/docs/examples/sax_parse_with_src_location.output new file mode 100644 index 000000000..dbc004110 --- /dev/null +++ b/docs/examples/sax_parse_with_src_location.output @@ -0,0 +1,37 @@ +start_object(elements=18446744073709551615, at=[5,6)) +key(val=Image, at=[15,22)) +start_object(elements=18446744073709551615, at=[24,25)) +key(val=Width, at=[38,45)) +number_unsigned(val=800, at=[48,51)) +key(val=Height, at=[65,73)) +number_unsigned(val=600, at=[75,78)) +key(val=Title, at=[92,99)) +string(val=View from 15th Floor, at=[102,124)) +key(val=Thumbnail, at=[138,149)) +start_object(elements=18446744073709551615, at=[151,152)) +key(val=Url, at=[169,174)) +string(val=http://www.example.com/image/481989943, at=[179,219)) +key(val=Height, at=[237,245)) +number_unsigned(val=125, at=[247,250)) +key(val=Width, 
at=[268,275)) +number_unsigned(val=100, at=[278,281)) +end_object(at=[294,295)) +key(val=Animated, at=[309,319)) +boolean(val=false, at=[322,327)) +key(val=IDs, at=[341,346)) +start_array(elements=18446744073709551615, at=[348,349)) +number_unsigned(val=116, at=[349,352)) +number_unsigned(val=943, at=[354,357)) +number_unsigned(val=234, at=[359,362)) +number_integer(val=-38793, at=[364,370)) +end_array(at=[370,371)) +key(val=DeletionDate, at=[385,399)) +null(at=[401,405)) +key(val=Distance, at=[419,429)) +number_float(val=12.723375, s=12.723374634, at=[431,443)) +end_object(at=[452,453)) +end_object(at=[458,459)) +parse_error(position=460, last_token=12.723374634 } }], + ex=[json.exception.parse_error.101] parse error at line 17, column 6: syntax error while parsing value - unexpected ']'; expected end of input) + +result: false diff --git a/docs/examples/sax_parse_with_src_location_in_json.cpp b/docs/examples/sax_parse_with_src_location_in_json.cpp new file mode 100644 index 000000000..ab9b30cc5 --- /dev/null +++ b/docs/examples/sax_parse_with_src_location_in_json.cpp @@ -0,0 +1,339 @@ +#include +#include +#include +#include + +using json = nlohmann::json; + +// custom base class for the json node. +// allows us to store metadata and add custom methods to each node +struct token_start_stop +{ + nlohmann::detail::position_t start{}; + nlohmann::detail::position_t stop{}; + + std::string start_pos_str() const + { + return "{l=" + std::to_string(start.lines_read) + ":c=" + //the lexer is already one char ahead (e.g. 
the opening { of an object ) + + std::to_string(start.chars_read_current_line - 1) + "}"; + } + std::string stop_pos_str() const + { + return "{l=" + std::to_string(stop.lines_read) + ":c=" + std::to_string(stop.chars_read_current_line) + "}"; + } + std::string location_str() const + { + return "[" + start_pos_str() + ", " + stop_pos_str() + ")"; + } +}; + +//json type using token_start_stop as base class +using json_with_token_start_stop = + nlohmann::basic_json < + std::map, + std::vector, + std::string, + bool, + std::int64_t, + std::uint64_t, + double, + std::allocator, + nlohmann::adl_serializer, + std::vector, + token_start_stop >; + +// a parser storing the lexer information for each node +class sax_with_token_start_stop_metadata +{ + public: + using json = json_with_token_start_stop; + using number_integer_t = typename json::number_integer_t; + using number_unsigned_t = typename json::number_unsigned_t; + using number_float_t = typename json::number_float_t; + using string_t = typename json::string_t; + using binary_t = typename json::binary_t; + + /*! 
+ @param[in,out] r reference to a JSON value that is manipulated while + parsing + @param[in] allow_exceptions_ whether parse errors yield exceptions + */ + explicit sax_with_token_start_stop_metadata(json& r, const bool allow_exceptions_ = true) + : root(r) + , ref_stack{} + , object_element{nullptr} + , errored{false} + , allow_exceptions(allow_exceptions_) + , start_stop{} + {} + + template + void next_token_start(const nlohmann::detail::lexer& lex) + { + start_stop.start = lex.get_position(); + } + + template + void next_token_end(const nlohmann::detail::lexer& lex) + { + start_stop.stop = lex.get_position(); + } + + bool null() + { + handle_value(nullptr); + return true; + } + + bool boolean(bool val) + { + handle_value(val); + return true; + } + + bool number_integer(number_integer_t val) + { + handle_value(val); + return true; + } + + bool number_unsigned(number_unsigned_t val) + { + handle_value(val); + return true; + } + + bool number_float(number_float_t val, const string_t& /*unused*/) + { + handle_value(val); + return true; + } + + bool string(string_t& val) + { + handle_value(val); + return true; + } + + bool binary(binary_t& val) + { + handle_value(std::move(val)); + return true; + } + + bool start_object(std::size_t len) + { + ref_stack.push_back(handle_value(json::value_t::object)); + ref_stack.back()->start = start_stop.start; + + if (len != static_cast(-1) && len > ref_stack.back()->max_size()) + { + throw nlohmann::detail::out_of_range::create(408, nlohmann::detail::concat("excessive object size: ", std::to_string(len)), ref_stack.back()); + } + + return true; + } + + bool key(string_t& val) + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_object()); + + // add null at given key and store the reference for later + object_element = &(*ref_stack.back())[val]; + return true; + } + + bool end_object() + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_object()); + + ref_stack.back()->stop = start_stop.stop; + 
ref_stack.pop_back(); + return true; + } + + bool start_array(std::size_t len) + { + ref_stack.push_back(handle_value(json::value_t::array)); + ref_stack.back()->start = start_stop.start; + + if (len != static_cast(-1) && len > ref_stack.back()->max_size()) + { + throw nlohmann::detail::out_of_range::create(408, nlohmann::detail::concat("excessive array size: ", std::to_string(len)), ref_stack.back()); + } + + return true; + } + + bool end_array() + { + assert(!ref_stack.empty()); + assert(ref_stack.back()->is_array()); + + ref_stack.back()->stop = start_stop.stop; + ref_stack.pop_back(); + return true; + } + + template + bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const Exception& ex) + { + errored = true; + static_cast(ex); + if (allow_exceptions) + { + throw ex; + } + return false; + } + + constexpr bool is_errored() const + { + return errored; + } + + private: + /*! + @invariant If the ref stack is empty, then the passed value will be the new + root. + @invariant If the ref stack contains a value, then it is an array or an + object to which we can add elements + */ + template + json* + handle_value(Value&& v) + { + if (ref_stack.empty()) + { + root = json(std::forward(v)); + root.start = start_stop.start; + root.stop = start_stop.stop; + return &root; + } + + assert(ref_stack.back()->is_array() || ref_stack.back()->is_object()); + + if (ref_stack.back()->is_array()) + { + auto& array_element = ref_stack.back()->emplace_back(std::forward(v)); + array_element.start = start_stop.start; + array_element.stop = start_stop.stop; + return &array_element; + } + + assert(ref_stack.back()->is_object()); + assert(object_element); + *object_element = json(std::forward(v)); + object_element->start = start_stop.start; + object_element->stop = start_stop.stop; + return object_element; + } + + /// the parsed JSON value + json& root; + /// stack to model hierarchy of values + std::vector ref_stack{}; + /// helper to hold the reference for the next 
object element + json* object_element = nullptr; + /// whether a syntax error occurred + bool errored = false; + /// whether to throw exceptions in case of errors + const bool allow_exceptions = true; + /// start / stop information for the current token + token_start_stop start_stop{}; +}; + +void dump(const json_with_token_start_stop& j, std::size_t indentlvl = 0) +{ + const std::string indent(indentlvl * 4, ' '); + switch (j.type()) + { + case nlohmann::json::value_t::null: + { + std::cout << indent << "null(at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::object: + { + std::cout << indent << "object(size=" << j.size() << ", at=" << j.location_str() << ")\n"; + for (const auto& elem : j.items()) + { + dump(elem.value(), indentlvl + 1); + } + } + break; + case nlohmann::json::value_t::array: + { + std::cout << indent << "array(size=" << j.size() << ", at=" << j.location_str() << ")\n"; + for (const auto& elem : j) + { + dump(elem, indentlvl + 1); + } + } + break; + case nlohmann::json::value_t::string: + { + std::cout << indent << "string(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::boolean: + { + std::cout << indent << "boolean(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::number_integer: + { + std::cout << indent << "number_integer(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::number_unsigned: + { + std::cout << indent << "number_unsigned(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + case nlohmann::json::value_t::number_float: + { + std::cout << indent << "number_float(val=" << j.get() << ", at=" << j.location_str() << ")\n"; + } + break; + default: + throw std::runtime_error{"unexpected input"}; + } +} + +int main() +{ + // a JSON text + auto text = R"({ + "Image": { + "Width": 800, + "Height": 600, + "Title": "View from 15th Floor", + 
"Thumbnail": { + "Url": "http://www.example.com/image/481989943", + "Height": 125, + "Width": 100 + }, + "Animated" : false, + "IDs": [116, 943, 234, -38793], + "DeletionDate": null, + "Distance": 12.723374634 + } +})"; + + // create a SAX parser object + json_with_token_start_stop parsed; + sax_with_token_start_stop_metadata sax{parsed}; + + // parse JSON + bool result = json::sax_parse(text, &sax); + + // output the json data + dump(parsed); + + // output the result of sax_parse + std::cout << "\nresult: " << std::boolalpha << result << std::endl; +} diff --git a/docs/examples/sax_parse_with_src_location_in_json.output b/docs/examples/sax_parse_with_src_location_in_json.output new file mode 100644 index 000000000..676682e6d --- /dev/null +++ b/docs/examples/sax_parse_with_src_location_in_json.output @@ -0,0 +1,19 @@ +object(size=1, at=[{l=0:c=0}, {l=15:c=1})) + object(size=8, at=[{l=1:c=17}, {l=14:c=9})) + boolean(val=0, at=[{l=10:c=25}, {l=10:c=30})) + null(at=[{l=12:c=28}, {l=12:c=32})) + number_float(val=12.7234, at=[{l=13:c=24}, {l=13:c=0})) + number_unsigned(val=600, at=[{l=3:c=22}, {l=3:c=25})) + array(size=4, at=[{l=11:c=19}, {l=11:c=42})) + number_unsigned(val=116, at=[{l=11:c=20}, {l=11:c=23})) + number_unsigned(val=943, at=[{l=11:c=25}, {l=11:c=28})) + number_unsigned(val=234, at=[{l=11:c=30}, {l=11:c=33})) + number_integer(val=-38793, at=[{l=11:c=35}, {l=11:c=41})) + object(size=3, at=[{l=5:c=25}, {l=9:c=13})) + number_unsigned(val=125, at=[{l=7:c=26}, {l=7:c=29})) + string(val=http://www.example.com/image/481989943, at=[{l=6:c=26}, {l=6:c=66})) + number_unsigned(val=100, at=[{l=8:c=26}, {l=8:c=0})) + string(val=View from 15th Floor, at=[{l=4:c=22}, {l=4:c=44})) + number_unsigned(val=800, at=[{l=2:c=22}, {l=2:c=25})) + +result: true From baef8ad137623687cdfd45c0c6e72cbea442cbff Mon Sep 17 00:00:00 2001 From: Raphael Grimm Date: Mon, 19 Dec 2022 18:16:18 +0100 Subject: [PATCH 3/7] Add documentation for extend sax parser --- 
docs/mkdocs/docs/api/json_sax/index.md | 3 + .../docs/api/json_sax/next_token_end.md | 73 +++++++++++++++++++ .../docs/api/json_sax/next_token_start.md | 73 +++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 docs/mkdocs/docs/api/json_sax/next_token_end.md create mode 100644 docs/mkdocs/docs/api/json_sax/next_token_start.md diff --git a/docs/mkdocs/docs/api/json_sax/index.md b/docs/mkdocs/docs/api/json_sax/index.md index f63e85c9a..719c037f8 100644 --- a/docs/mkdocs/docs/api/json_sax/index.md +++ b/docs/mkdocs/docs/api/json_sax/index.md @@ -37,8 +37,11 @@ processing the input. - [**start_array**](start_array.md) (_virtual_) - the beginning of an array was read - [**start_object**](start_object.md) (_virtual_) - the beginning of an object was read - [**string**](string.md) (_virtual_) - a string value was read +- [**next_token_start**](next_token_start.md) - called to provide the start of the next element in the parsed input. +- [**next_token_end**](next_token_end.md) - called to provide the end (one past convention) of the next element in the parsed input. ## Version history - Added in version 3.2.0. - Support for binary values (`binary_t`, `binary`) added in version 3.8.0. +- Support for parser location information (`next_token_start`, `next_token_end`) added in version ???.???.???. diff --git a/docs/mkdocs/docs/api/json_sax/next_token_end.md b/docs/mkdocs/docs/api/json_sax/next_token_end.md new file mode 100644 index 000000000..25f9fd4bd --- /dev/null +++ b/docs/mkdocs/docs/api/json_sax/next_token_end.md @@ -0,0 +1,73 @@ +# nlohmann::json_sax::next_token_end + +Informs the sax parser about the end of the next element. +There are two possible signatures for this method: + +1. +```cpp +void next_token_end(std::size_t pos); +``` +This version is called with the byte position after the next element ends. This version also works when parsing binary formats such as [msgpack](../basic_json/input_format_t.md). + +2. 
+```cpp +template +void next_token_end(const nlohmann::detail::lexer& lex) +``` +This version is called with the lexer after the last character of the next element was parsed. The lexer can provide additional information about the current parse context. This version only available when calling `nlohmann::json::sax_parse` with `nlohmann::json::input_format_t::json` and takes precedence. + +## Template parameters +1. +(none) +2. +`BasicJsonType` +: a specialization of `basic_json` used by the lexer. (Leave this as a template parameter) +`InputAdapterType` +: The input adapter used by the lexer. (Leave this as a template parameter) + +## Parameters +1. +`pos` (in) +: Byte position one after the next elements last byte. +2. +`lex` (in) +: Lexer after the last char of the next element was parsed. + +## Notes + +Implementing either version is optional, and no function is called if neither version of `next_token_end` is available in the sax parser. + +It is recommended, but not required, to also implement [next_token_start](next_token_start.md). + +## Examples + +??? example + + The example below shows a SAX parser using the first version of this method to log the location. + + ```cpp + --8<-- "examples/sax_parse_with_src_location.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location.output" + ``` + +??? example + + The example below shows a SAX parser using the second version of this method and storing the location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` +## Version history + +- Added in version ???.???.???. 
diff --git a/docs/mkdocs/docs/api/json_sax/next_token_start.md b/docs/mkdocs/docs/api/json_sax/next_token_start.md new file mode 100644 index 000000000..9543f53ad --- /dev/null +++ b/docs/mkdocs/docs/api/json_sax/next_token_start.md @@ -0,0 +1,73 @@ +# nlohmann::json_sax::next_token_start + +Informs the sax parser about the start of the next element. +There are two possible signatures for this method: + +1. +```cpp +void next_token_start(std::size_t pos); +``` +This version is called with the byte position where the next element starts. This version also works when parsing binary formats such as [msgpack](../basic_json/input_format_t.md). + +2. +```cpp +template +void next_token_start(const nlohmann::detail::lexer& lex) +``` +This version is called with the lexer after the first character of the next element was parsed. The lexer can provide additional information about the current parse context. This version only available when calling `nlohmann::json::sax_parse` with `nlohmann::json::input_format_t::json` and takes precedence. + +## Template parameters +1. +(none) +2. +`BasicJsonType` +: a specialization of `basic_json` used by the lexer. (Leave this as a template parameter) +`InputAdapterType` +: The input adapter used by the lexer. (Leave this as a template parameter) + +## Parameters +1. +`pos` (in) +: Byte position where the next element starts. +2. +`lex` (in) +: Lexer after the first char of the next element was parsed. + +## Notes + +Implementing either version is optional, and no function is called if neither version of `next_token_start` is available in the sax parser. + +It is recommended, but not required, to also implement [next_token_end](next_token_end.md). + +## Examples + +??? example + + The example below shows a SAX parser using the first version of this method to log the location. + + ```cpp + --8<-- "examples/sax_parse_with_src_location.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location.output" + ``` + +??? 
example + + The example below shows a SAX parser using the second version of this method and storing the location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` +## Version history + +- Added in version ???.???.???. From 8ba93931ca65b69cab37f35de583107575ccb1c7 Mon Sep 17 00:00:00 2001 From: Raphael Grimm Date: Mon, 19 Dec 2022 18:16:54 +0100 Subject: [PATCH 4/7] Add contributor mention --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 43aacf4d5..2c88b0cae 100644 --- a/README.md +++ b/README.md @@ -1731,6 +1731,7 @@ I deeply appreciate the help of the following people. 314. [Berkus Decker](https://github.com/berkus) fixed a typo in the README. 315. [Illia Polishchuk](https://github.com/effolkronium) improved the CMake testing. 316. [Ikko Ashimine](https://github.com/eltociear) fixed a typo. +317. [Raphael Grimm](https://github.com/barcode) Added custom base classes as customization point and parser location information for sax parsers. Thanks a lot for helping out! Please [let me know](mailto:mail@nlohmann.me) if I forgot someone. 
From 22f56995fbfa7871397e9c905341942892aff3f8 Mon Sep 17 00:00:00 2001 From: Raphael Grimm Date: Mon, 19 Dec 2022 18:38:39 +0100 Subject: [PATCH 5/7] Fix ci issue --- tests/src/unit-sax-parser-extended.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/src/unit-sax-parser-extended.cpp b/tests/src/unit-sax-parser-extended.cpp index 769e19859..08e33941b 100644 --- a/tests/src/unit-sax-parser-extended.cpp +++ b/tests/src/unit-sax-parser-extended.cpp @@ -1351,7 +1351,7 @@ void fill_expected_sax_pos_ubjson(SAX& sax, const FN& element, const nlohmann::j { ++nbytes; } - else if (val >= 128 && val <= 255) + else if (val <= 255) { use_uint = true; ++nbytes; From bda3e4b7bc152fc65799da87cbfce6cbd40d01f0 Mon Sep 17 00:00:00 2001 From: barcode Date: Fri, 23 Dec 2022 14:32:04 +0100 Subject: [PATCH 6/7] Use nlohmann::position_t instead of lexer for detailed position information when using a sax parser --- .../sax_parse_with_src_location_in_json.cpp | 17 ++-- .../docs/api/json_sax/next_token_end.md | 25 ++--- .../docs/api/json_sax/next_token_start.md | 25 ++--- .../api/position_t/chars_read_current_line.md | 28 ++++++ .../docs/api/position_t/chars_read_total.md | 28 ++++++ docs/mkdocs/docs/api/position_t/index.md | 23 +++++ docs/mkdocs/docs/api/position_t/lines_read.md | 28 ++++++ .../docs/api/position_t/operator_size_t.md | 28 ++++++ .../docs/features/parsing/sax_interface.md | 24 +++++ docs/mkdocs/mkdocs.yml | 8 ++ include/nlohmann/detail/input/position_t.hpp | 5 - include/nlohmann/detail/meta/is_sax.hpp | 90 +++++++++++------- single_include/nlohmann/json.hpp | 95 +++++++++++-------- tests/src/unit-sax-parser-extended.cpp | 52 +++++----- .../unit-sax-parser-store-source-location.cpp | 34 ++++--- 15 files changed, 348 insertions(+), 162 deletions(-) create mode 100644 docs/mkdocs/docs/api/position_t/chars_read_current_line.md create mode 100644 docs/mkdocs/docs/api/position_t/chars_read_total.md create mode 100644 
docs/mkdocs/docs/api/position_t/index.md create mode 100644 docs/mkdocs/docs/api/position_t/lines_read.md create mode 100644 docs/mkdocs/docs/api/position_t/operator_size_t.md diff --git a/docs/examples/sax_parse_with_src_location_in_json.cpp b/docs/examples/sax_parse_with_src_location_in_json.cpp index ab9b30cc5..cf7adc1fb 100644 --- a/docs/examples/sax_parse_with_src_location_in_json.cpp +++ b/docs/examples/sax_parse_with_src_location_in_json.cpp @@ -9,14 +9,13 @@ using json = nlohmann::json; // allows us to store metadata and add custom methods to each node struct token_start_stop { - nlohmann::detail::position_t start{}; - nlohmann::detail::position_t stop{}; + nlohmann::position_t start{}; + nlohmann::position_t stop{}; std::string start_pos_str() const { return "{l=" + std::to_string(start.lines_read) + ":c=" - //the lexer is already one char ahead (e.g. the opening { of an object ) - + std::to_string(start.chars_read_current_line - 1) + "}"; + + std::to_string(start.chars_read_current_line) + "}"; } std::string stop_pos_str() const { @@ -68,16 +67,14 @@ class sax_with_token_start_stop_metadata , start_stop{} {} - template - void next_token_start(const nlohmann::detail::lexer& lex) + void next_token_start(const nlohmann::position_t& p) { - start_stop.start = lex.get_position(); + start_stop.start = p; } - template - void next_token_end(const nlohmann::detail::lexer& lex) + void next_token_end(const nlohmann::position_t& p) { - start_stop.stop = lex.get_position(); + start_stop.stop = p; } bool null() diff --git a/docs/mkdocs/docs/api/json_sax/next_token_end.md b/docs/mkdocs/docs/api/json_sax/next_token_end.md index 25f9fd4bd..69e5da965 100644 --- a/docs/mkdocs/docs/api/json_sax/next_token_end.md +++ b/docs/mkdocs/docs/api/json_sax/next_token_end.md @@ -7,31 +7,23 @@ There are two possible signatures for this method: ```cpp void next_token_end(std::size_t pos); ``` -This version is called with the byte position after the next element ends. 
This version also works when parsing binary formats such as [msgpack](../basic_json/input_format_t.md). +This version is called with the byte position after the next element ends. +This version also works when parsing binary formats such as [msgpack](../basic_json/input_format_t.md). 2. ```cpp -template -void next_token_end(const nlohmann::detail::lexer& lex) +void next_token_end(const nlohmann::position_t& p) ``` -This version is called with the lexer after the last character of the next element was parsed. The lexer can provide additional information about the current parse context. This version only available when calling `nlohmann::json::sax_parse` with `nlohmann::json::input_format_t::json` and takes precedence. - -## Template parameters -1. -(none) -2. -`BasicJsonType` -: a specialization of `basic_json` used by the lexer. (Leave this as a template parameter) -`InputAdapterType` -: The input adapter used by the lexer. (Leave this as a template parameter) +This version is called with the [detailed parser position information](../position_t/index.md) after the last character of the next element was parsed. +This version is only available when calling `nlohmann::json::sax_parse` with `nlohmann::json::input_format_t::json` and takes precedence. ## Parameters 1. `pos` (in) : Byte position one after the next elements last byte. 2. -`lex` (in) -: Lexer after the last char of the next element was parsed. +`p` (in) +: [Detailed parser position information](../position_t/index.md) after the last char of the next element was parsed. ## Notes @@ -57,7 +49,8 @@ It is recommended, but not required, to also implement [next_token_start](next_t ??? example - The example below shows a SAX parser using the second version of this method and storing the location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point.
+ The example below shows a SAX parser using the second version of this method and + storing the location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. ```cpp --8<-- "examples/sax_parse_with_src_location_in_json.cpp" diff --git a/docs/mkdocs/docs/api/json_sax/next_token_start.md b/docs/mkdocs/docs/api/json_sax/next_token_start.md index 9543f53ad..49289b7ac 100644 --- a/docs/mkdocs/docs/api/json_sax/next_token_start.md +++ b/docs/mkdocs/docs/api/json_sax/next_token_start.md @@ -7,31 +7,23 @@ There are two possible signatures for this method: ```cpp void next_token_start(std::size_t pos); ``` -This version is called with the byte position where the next element starts. This version also works when parsing binary formats such as [msgpack](../basic_json/input_format_t.md). +This version is called with the byte position where the next element starts. +This version also works when parsing binary formats such as [msgpack](../basic_json/input_format_t.md). 2. ```cpp -template -void next_token_start(const nlohmann::detail::lexer& lex) +void next_token_start(const nlohmann::position_t& p) ``` -This version is called with the lexer after the first character of the next element was parsed. The lexer can provide additional information about the current parse context. This version only available when calling `nlohmann::json::sax_parse` with `nlohmann::json::input_format_t::json` and takes precedence. - -## Template parameters -1. -(none) -2. -`BasicJsonType` -: a specialization of `basic_json` used by the lexer. (Leave this as a template parameter) -`InputAdapterType` -: The input adapter used by the lexer. (Leave this as a template parameter) +This version is called with [detailed parser position information](../position_t/index.md). +This version is only available when calling `nlohmann::json::sax_parse` with `nlohmann::json::input_format_t::json` and takes precedence. ## Parameters 1.
`pos` (in) : Byte position where the next element starts. 2. -`lex` (in) -: Lexer after the first char of the next element was parsed. +`p` (in) +: [Detailed parser position information](../position_t/index.md) after the first char of the next element was parsed. ## Notes @@ -57,7 +49,8 @@ It is recommended, but not required, to also implement [next_token_end](next_tok ??? example - The example below shows a SAX parser using the second version of this method and storing the location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + The example below shows a SAX parser using the second version of this method and + storing the location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. ```cpp --8<-- "examples/sax_parse_with_src_location_in_json.cpp" diff --git a/docs/mkdocs/docs/api/position_t/chars_read_current_line.md b/docs/mkdocs/docs/api/position_t/chars_read_current_line.md new file mode 100644 index 000000000..740a3875d --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/chars_read_current_line.md @@ -0,0 +1,28 @@ +# nlohmann::position_t::chars_read_current_line + +```cpp +std::size_t chars_read_current_line; +``` + +The number of characters read in the current line. + +## Examples + +??? example + + The example below shows a SAX receiving the element bounds as `nlohmann::position_t` and + storing this location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. 
diff --git a/docs/mkdocs/docs/api/position_t/chars_read_total.md b/docs/mkdocs/docs/api/position_t/chars_read_total.md new file mode 100644 index 000000000..9f6e736cf --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/chars_read_total.md @@ -0,0 +1,28 @@ +# nlohmann::position_t::chars_read_total + +```cpp +std::size_t chars_read_total; +``` + +The total number of characters read. + +## Examples + +??? example + + The example below shows a SAX receiving the element bounds as `nlohmann::position_t` and + storing this location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. diff --git a/docs/mkdocs/docs/api/position_t/index.md b/docs/mkdocs/docs/api/position_t/index.md new file mode 100644 index 000000000..16c4fd431 --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/index.md @@ -0,0 +1,23 @@ +# nlohmann::position_t + +```cpp +struct position_t; +``` + +This type represents the parser's position when parsing a JSON string. +This position can be retrieved when using a [sax parser](../json_sax/index.md) with the format `nlohmann::json::input_format_t::json` +and implementing [next_token_start](../json_sax/next_token_start.md) or [next_token_end](../json_sax/next_token_end.md). + +## Member functions + +- [**operator size_t**](operator_size_t.md) - return the value of [chars_read_total](chars_read_total.md). + +## Member variables + +- [**chars_read_total**](chars_read_total.md) - The total number of characters read. +- [**lines_read**](lines_read.md) - The number of lines read. +- [**chars_read_current_line**](chars_read_current_line.md) - The number of characters read in the current line.
+ +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. diff --git a/docs/mkdocs/docs/api/position_t/lines_read.md b/docs/mkdocs/docs/api/position_t/lines_read.md new file mode 100644 index 000000000..e22ee1d45 --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/lines_read.md @@ -0,0 +1,28 @@ +# nlohmann::position_t::lines_read + +```cpp +std::size_t lines_read; +``` + +The number of lines read. + +## Examples + +??? example + + The example below shows a SAX receiving the element bounds as `nlohmann::position_t` and + storing this location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???. diff --git a/docs/mkdocs/docs/api/position_t/operator_size_t.md b/docs/mkdocs/docs/api/position_t/operator_size_t.md new file mode 100644 index 000000000..bc0325fd4 --- /dev/null +++ b/docs/mkdocs/docs/api/position_t/operator_size_t.md @@ -0,0 +1,28 @@ +# nlohmann::position_t::operator size_t + +```cpp +constexpr operator size_t() const; +``` + +Returns the value of [chars_read_total](chars_read_total.md). + +## Examples + +??? example + + The example below shows a SAX receiving the element bounds as `nlohmann::position_t` and + storing this location information in each json node using a [base class](../basic_json/json_base_class_t.md) for `nlohmann::json` as customization point. + + ```cpp + --8<-- "examples/sax_parse_with_src_location_in_json.cpp" + ``` + + Output: + + ```json + --8<-- "examples/sax_parse_with_src_location_in_json.output" + ``` + +## Version history + +- Moved from namespace `nlohmann::detail` to `nlohmann` in version ???.???.???.
diff --git a/docs/mkdocs/docs/features/parsing/sax_interface.md b/docs/mkdocs/docs/features/parsing/sax_interface.md index 0796a55f5..e925d07c0 100644 --- a/docs/mkdocs/docs/features/parsing/sax_interface.md +++ b/docs/mkdocs/docs/features/parsing/sax_interface.md @@ -67,6 +67,30 @@ To implement your own SAX handler, proceed as follows: Note the `sax_parse` function only returns a `#!cpp bool` indicating the result of the last executed SAX event. It does not return `json` value - it is up to you to decide what to do with the SAX events. Furthermore, no exceptions are thrown in case of a parse error - it is up to you what to do with the exception object passed to your `parse_error` implementation. Internally, the SAX interface is used for the DOM parser (class `json_sax_dom_parser`) as well as the acceptor (`json_sax_acceptor`), see file `json_sax.hpp`. +## Element position information + +The position of a parsed element can be retrieved by implementing the optional methods [next_token_start](../../api/json_sax/next_token_start.md) and [next_token_end](../../api/json_sax/next_token_end.md). +These methods will be called with the parser position before any of the other methods are called and can be used to retrieve the half-open bounds (`[start, end)`) of a parsed element. + +These methods come in two flavors: + +1. +```cpp +void next_token_start(std::size_t pos); +void next_token_end(std::size_t pos); +``` +This flavor is called with the byte positions of each element and is available for any `nlohmann::json::input_format_t` passed to `nlohmann::json::sax_parse`. + +2. +```cpp +void next_token_start(const nlohmann::position_t& p); +void next_token_end(const nlohmann::position_t& p); +``` +This flavor is called with the [detailed parser position information](../../api/position_t/index.md) of each element and is only available if `nlohmann::json::sax_parse` is called with `nlohmann::json::input_format_t::json`.
+Furthermore this flavor takes precedence over the first flavor. + +Depending on the required information it is possible for the SAX parser to implement all four or only one or none of these methods. + ## See also - [json_sax](../../api/json_sax/index.md) - documentation of the SAX interface diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index 8319354a4..f29960ecf 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -250,6 +250,8 @@ nav: - 'start_array': api/json_sax/start_array.md - 'start_object': api/json_sax/start_object.md - 'string': api/json_sax/string.md + - 'next_token_start' : api/json_sax/next_token_start.md + - 'next_token_end' : api/json_sax/next_token_end.md - 'operator<<(basic_json)': api/operator_ltlt.md - 'operator<<(json_pointer)': api/operator_ltlt.md - 'operator>>(basic_json)': api/operator_gtgt.md @@ -257,6 +259,12 @@ nav: - 'operator""_json_pointer': api/operator_literal_json_pointer.md - 'ordered_json': api/ordered_json.md - 'ordered_map': api/ordered_map.md + - position_t: + - 'Overview': api/position_t/index.md + - 'operator size_t': api/position_t/operator_size_t.md + - 'chars_read_total': api/position_t/chars_read_total.md + - 'lines_read': api/position_t/lines_read.md + - 'chars_read_current_line': api/position_t/chars_read_current_line.md - macros: - 'Overview': api/macros/index.md - 'JSON_ASSERT': api/macros/json_assert.md diff --git a/include/nlohmann/detail/input/position_t.hpp b/include/nlohmann/detail/input/position_t.hpp index 396db0e16..5450ee961 100644 --- a/include/nlohmann/detail/input/position_t.hpp +++ b/include/nlohmann/detail/input/position_t.hpp @@ -13,9 +13,6 @@ #include NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - /// struct to capture the start position of the current token struct position_t { @@ -32,6 +29,4 @@ struct position_t return chars_read_total; } }; - -} // namespace detail NLOHMANN_JSON_NAMESPACE_END diff --git a/include/nlohmann/detail/meta/is_sax.hpp 
b/include/nlohmann/detail/meta/is_sax.hpp index fd0586434..6e8266f3f 100644 --- a/include/nlohmann/detail/meta/is_sax.hpp +++ b/include/nlohmann/detail/meta/is_sax.hpp @@ -50,30 +50,30 @@ struct sax_call_next_token_end_pos_direct template struct sax_call_function { - // is the parameter a lexer or a position - static constexpr bool no_lexer = std::is_same::value; + // is the parameter a lexer or a byte position + static constexpr bool called_with_byte_pos = std::is_same::value; template using call_t = decltype(DirectCaller::call(std::declval(), std::declval()...)); //the sax parser supports calls with a position - static constexpr bool detected_call_with_pos = + static constexpr bool detected_call_with_byte_pos = is_detected_exact::value; //the sax parser supports calls with a lexer - static constexpr bool detected_call_with_lex = - !no_lexer && - is_detected_exact::value; + static constexpr bool detected_call_with_lex_pos = + !called_with_byte_pos && + is_detected_exact::value; //there either has to be a version accepting a lexer or a position - static constexpr bool valid = detected_call_with_pos || detected_call_with_lex; + static constexpr bool valid = detected_call_with_byte_pos || detected_call_with_lex_pos; - //called with pos and pos is method supported -> pass data on + //called with byte pos and byte pos is method supported -> pass data on template static typename std::enable_if < - sax_call_function::valid && std::is_same::value && - sax_call_function::detected_call_with_pos + valid && + detected_call_with_byte_pos >::type call(SaxT* sax, std::size_t pos) { @@ -84,46 +84,70 @@ struct sax_call_function template static typename std::enable_if < std::is_same::value && - !sax_call_function::valid + !valid >::type call(SaxT* /*unused*/, const LexOrPos& /*unused*/) {} - //called with lex and lex method is supported -> pass data on - template - static typename std::enable_if < - sax_call_function::valid && - std::is_same::value && - !sax_call_function::no_lexer 
&& - sax_call_function::detected_call_with_lex - >::type - call(SaxT* sax, const LexOrPos& lex) - { - DirectCaller::call(sax, lex); - } - - // called with lex and only pos method is supported -> call with position from lexer + //called with lex and lex pos method is supported -> call with position from lexer // the start pos in the lexer is last read char -> chars_read_total-1 template static typename std::enable_if < - sax_call_function::valid && std::is_same::value && - !sax_call_function::no_lexer && - !sax_call_function::detected_call_with_lex && + valid && + !called_with_byte_pos && + detected_call_with_lex_pos && std::is_same::value >::type call(SaxT* sax, const LexOrPos& lex) { - DirectCaller::call(sax, lex.get_position().chars_read_total - 1); + JSON_ASSERT(lex.get_position().chars_read_total > 0); + JSON_ASSERT(lex.get_position().chars_read_current_line > 0); + //the lexer has already read the first char of the current element -> fix this + auto pos_copy = lex.get_position(); + --pos_copy.chars_read_total; + --pos_copy.chars_read_current_line; + DirectCaller::call(sax, pos_copy); } - // called with lex and only pos method is supported -> call with position from lexer + //called with lex and lex pos method is supported -> pass data on // the one past end pos in the lexer is the current index -> chars_read_total template static typename std::enable_if < - sax_call_function::valid && std::is_same::value && - !sax_call_function::no_lexer && - !sax_call_function::detected_call_with_lex && + valid && + !called_with_byte_pos && + detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position()); + } + + // called with lex and only byte pos method is supported -> call with byte position from lexer + // the start pos in the lexer is last read char -> chars_read_total-1 + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + 
!detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + JSON_ASSERT(lex.get_position().chars_read_total > 0); + DirectCaller::call(sax, lex.get_position().chars_read_total - 1); + } + + // called with lex and only byte pos method is supported -> call with byte position from lexer + // the one past end pos in the lexer is the current index -> chars_read_total + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + !detected_call_with_lex_pos && std::is_same::value >::type call(SaxT* sax, const LexOrPos& lex) diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index f4852d864..aabe9fbbe 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -3015,9 +3015,6 @@ NLOHMANN_JSON_NAMESPACE_END NLOHMANN_JSON_NAMESPACE_BEGIN -namespace detail -{ - /// struct to capture the start position of the current token struct position_t { @@ -3034,8 +3031,6 @@ struct position_t return chars_read_total; } }; - -} // namespace detail NLOHMANN_JSON_NAMESPACE_END // #include @@ -9002,30 +8997,30 @@ struct sax_call_next_token_end_pos_direct template struct sax_call_function { - // is the parameter a lexer or a position - static constexpr bool no_lexer = std::is_same::value; + // is the parameter a lexer or a byte position + static constexpr bool called_with_byte_pos = std::is_same::value; template using call_t = decltype(DirectCaller::call(std::declval(), std::declval()...)); //the sax parser supports calls with a position - static constexpr bool detected_call_with_pos = + static constexpr bool detected_call_with_byte_pos = is_detected_exact::value; //the sax parser supports calls with a lexer - static constexpr bool detected_call_with_lex = - !no_lexer && - is_detected_exact::value; + static constexpr bool detected_call_with_lex_pos = + !called_with_byte_pos && + is_detected_exact::value; //there either has to be a version 
accepting a lexer or a position - static constexpr bool valid = detected_call_with_pos || detected_call_with_lex; + static constexpr bool valid = detected_call_with_byte_pos || detected_call_with_lex_pos; - //called with pos and pos is method supported -> pass data on + //called with byte pos and byte pos is method supported -> pass data on template static typename std::enable_if < - sax_call_function::valid && std::is_same::value && - sax_call_function::detected_call_with_pos + valid && + detected_call_with_byte_pos >::type call(SaxT* sax, std::size_t pos) { @@ -9036,46 +9031,70 @@ struct sax_call_function template static typename std::enable_if < std::is_same::value && - !sax_call_function::valid + !valid >::type call(SaxT* /*unused*/, const LexOrPos& /*unused*/) {} - //called with lex and lex method is supported -> pass data on - template - static typename std::enable_if < - sax_call_function::valid && - std::is_same::value && - !sax_call_function::no_lexer && - sax_call_function::detected_call_with_lex - >::type - call(SaxT* sax, const LexOrPos& lex) - { - DirectCaller::call(sax, lex); - } - - // called with lex and only pos method is supported -> call with position from lexer + //called with lex and lex pos method is supported -> call with position from lexer // the start pos in the lexer is last read char -> chars_read_total-1 template static typename std::enable_if < - sax_call_function::valid && std::is_same::value && - !sax_call_function::no_lexer && - !sax_call_function::detected_call_with_lex && + valid && + !called_with_byte_pos && + detected_call_with_lex_pos && std::is_same::value >::type call(SaxT* sax, const LexOrPos& lex) { - DirectCaller::call(sax, lex.get_position().chars_read_total - 1); + JSON_ASSERT(lex.get_position().chars_read_total > 0); + JSON_ASSERT(lex.get_position().chars_read_current_line > 0); + //the lexer has already read the first char of the current element -> fix this + auto pos_copy = lex.get_position(); + 
--pos_copy.chars_read_total; + --pos_copy.chars_read_current_line; + DirectCaller::call(sax, pos_copy); } - // called with lex and only pos method is supported -> call with position from lexer + //called with lex and lex pos method is supported -> pass data on // the one past end pos in the lexer is the current index -> chars_read_total template static typename std::enable_if < - sax_call_function::valid && std::is_same::value && - !sax_call_function::no_lexer && - !sax_call_function::detected_call_with_lex && + valid && + !called_with_byte_pos && + detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + DirectCaller::call(sax, lex.get_position()); + } + + // called with lex and only byte pos method is supported -> call with byte position from lexer + // the start pos in the lexer is last read char -> chars_read_total-1 + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + !detected_call_with_lex_pos && + std::is_same::value + >::type + call(SaxT* sax, const LexOrPos& lex) + { + JSON_ASSERT(lex.get_position().chars_read_total > 0); + DirectCaller::call(sax, lex.get_position().chars_read_total - 1); + } + + // called with lex and only byte pos method is supported -> call with byte position from lexer + // the one past end pos in the lexer is the current index -> chars_read_total + template + static typename std::enable_if < + std::is_same::value && + valid && + !called_with_byte_pos && + !detected_call_with_lex_pos && std::is_same::value >::type call(SaxT* sax, const LexOrPos& lex) diff --git a/tests/src/unit-sax-parser-extended.cpp b/tests/src/unit-sax-parser-extended.cpp index 08e33941b..88342965d 100644 --- a/tests/src/unit-sax-parser-extended.cpp +++ b/tests/src/unit-sax-parser-extended.cpp @@ -109,10 +109,10 @@ std::ostream& operator<<(std::ostream& out, const std::set& v) return out; } -template +template struct Sax { - static constexpr bool has_callback 
= WithPos || (WithLex && !LexCallImpossible); + static constexpr bool has_callback = WithBytePos || (WithLexPos && !LexCallImpossible); using json = nlohmann::json; enum class last_call_t @@ -167,32 +167,32 @@ struct Sax last_call = last_call_t::end_pos; } - template + template typename std::enable_if::type next_token_start(std::size_t pos) { check_start(pos); - CHECK((!WithLex || LexCallImpossible)); + CHECK((!WithLexPos || LexCallImpossible)); } - template < class LexT, bool Act = WithLex && !std::is_same::value > - typename std::enable_if::type next_token_start(const LexT& lex) + template < bool Act = WithLexPos > + typename std::enable_if::type next_token_start(const nlohmann::position_t& p) { - check_start(lex.get_position().chars_read_total - 1); - CHECK(WithLex); + check_start(p.chars_read_total); + CHECK(WithLexPos); } - template + template typename std::enable_if::type next_token_end(std::size_t pos) { check_end(pos); - CHECK((!WithLex || LexCallImpossible)); + CHECK((!WithLexPos || LexCallImpossible)); } - template < class LexT, bool Act = WithLex && !std::is_same::value > - typename std::enable_if::type next_token_end(const LexT& lex) + template < bool Act = WithLexPos > + typename std::enable_if::type next_token_end(const nlohmann::position_t& p) { - check_end(lex.get_position().chars_read_total); - CHECK(WithLex); + check_end(p.chars_read_total); + CHECK(WithLexPos); } bool null() @@ -303,11 +303,11 @@ struct Sax } }; -template +template struct Opt { - static constexpr bool WithPos = WithPosV; - static constexpr bool WithLex = WithLexV; + static constexpr bool WithBytePos = WithBytePosV; + static constexpr bool WithLexPos = WithLexPosV; }; using OptNone = Opt; @@ -318,10 +318,10 @@ using OptBoth = Opt; //test basic functionality TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) { - const bool with_pos = T::WithPos; - const bool with_lex = T::WithLex; + const bool with_pos = T::WithBytePos; + const bool with_lex = T::WithLexPos; 
- INFO("WithPos " << with_pos << ", WithLex " << with_lex); + INFO("WithBytePos " << with_pos << ", WithLexPos " << with_lex); //element count 0 1 2 3 4 5 6 7 8 9 10 //index 10s place 0 1 2 3 4 5 //index 1s place 012345678901234567890123456789012345678901234567890123 @@ -351,7 +351,7 @@ TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) reconstructed += s; skip(s.size()); }; - Sax sax; + Sax sax; sax.pos_start_object.emplace(elementFromStr("{")); skipFromStr(" "); sax.pos_key.emplace(elementFromStr(R"("array")")); @@ -384,7 +384,7 @@ TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) { const auto j = nlohmann::json::parse(str); const auto bin = nlohmann::json::to_bson(j); - Sax sax; + Sax sax; sax.pos_start_object.emplace(element(4)); //4 bytes size skip(1); //one byte type array sax.pos_key.emplace(element(6)); //6 key (array\0) @@ -414,7 +414,7 @@ TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) { const auto j = nlohmann::json::parse(str); const auto bin = nlohmann::json::to_cbor(j); - Sax sax; + Sax sax; sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) sax.pos_key.emplace(element(6)); //1 byte type + 5 bytes string (array) (size implicit) sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) @@ -437,7 +437,7 @@ TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) { const auto j = nlohmann::json::parse(str); const auto bin = nlohmann::json::to_msgpack(j); - Sax sax; + Sax sax; sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size sax.pos_key.emplace(element(6)); //1 byte type + 5 bytes string (array) (size implicit) sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) @@ -460,7 +460,7 @@ TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) { const auto j = nlohmann::json::parse(str); const auto bin = 
nlohmann::json::to_ubjson(j); - Sax sax; + Sax sax; sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size sax.pos_key.emplace(element(7)); //1 byte type + 6 bytes string (array\0) sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) @@ -483,7 +483,7 @@ TEST_CASE_TEMPLATE("extended parser", T, OptNone, OptLex, OptPos, OptBoth) { const auto j = nlohmann::json::parse(str); const auto bin = nlohmann::json::to_bjdata(j); - Sax sax; + Sax sax; sax.pos_start_object.emplace(element(1)); //1 byte type + 0 bytes size sax.pos_key.emplace(element(7)); //1 byte type + 6 bytes string (array\0) sax.pos_start_array.emplace(element(1)); //1 byte type + 0 bytes size (implicit in type) diff --git a/tests/src/unit-sax-parser-store-source-location.cpp b/tests/src/unit-sax-parser-store-source-location.cpp index 0820a81ce..4a069c38b 100644 --- a/tests/src/unit-sax-parser-store-source-location.cpp +++ b/tests/src/unit-sax-parser-store-source-location.cpp @@ -35,17 +35,17 @@ SOFTWARE. 
#include //prototype to make -Wmissing-prototypes happy -std::ostream& operator<<(std::ostream& out, const nlohmann::detail::position_t& p); +std::ostream& operator<<(std::ostream& out, const nlohmann::position_t& p); //test json parser with detailed line / col information as metadata struct token_start_stop { - nlohmann::detail::position_t start{}; - nlohmann::detail::position_t stop{}; + nlohmann::position_t start{}; + nlohmann::position_t stop{}; }; -std::ostream& operator<<(std::ostream& out, const nlohmann::detail::position_t& p) +std::ostream& operator<<(std::ostream& out, const nlohmann::position_t& p) { out << p.chars_read_total << '(' << p.lines_read << ':' << p.chars_read_current_line << ')'; return out; @@ -90,16 +90,14 @@ class sax_with_token_start_stop_metadata , start_stop{} {} - template - void next_token_start(const nlohmann::detail::lexer& lex) + void next_token_start(const nlohmann::position_t& p) { - start_stop.start = lex.get_position(); + start_stop.start = p; } - template - void next_token_end(const nlohmann::detail::lexer& lex) + void next_token_end(const nlohmann::position_t& p) { - start_stop.stop = lex.get_position(); + start_stop.stop = p; } bool null() @@ -294,38 +292,38 @@ TEST_CASE("parse-json-with-position-info") sax_with_token_start_stop_metadata sax{j}; CHECK(nlohmann::json::sax_parse(str, &sax, nlohmann::json::input_format_t::json)); CHECK(j.start.lines_read == 0); - CHECK(j.start.chars_read_current_line == 1); + CHECK(j.start.chars_read_current_line == 0); CHECK(j["array"].start.lines_read == 1); - CHECK(j["array"].start.chars_read_current_line == 13); + CHECK(j["array"].start.chars_read_current_line == 12); CHECK(j["array"][0].start.lines_read == 2); - CHECK(j["array"][0].start.chars_read_current_line == 5); + CHECK(j["array"][0].start.chars_read_current_line == 4); CHECK(j["array"][0].stop.lines_read == 2); CHECK(j["array"][0].stop.chars_read_current_line == 15); CHECK(j["array"][1].start.lines_read == 3); - 
CHECK(j["array"][1].start.chars_read_current_line == 5); + CHECK(j["array"][1].start.chars_read_current_line == 4); CHECK(j["array"][1].stop.lines_read == 3); CHECK(j["array"][1].stop.chars_read_current_line == 6); CHECK(j["array"][2].start.lines_read == 4); - CHECK(j["array"][2].start.chars_read_current_line == 5); + CHECK(j["array"][2].start.chars_read_current_line == 4); CHECK(j["array"][2].stop.lines_read == 4); CHECK(j["array"][2].stop.chars_read_current_line == 8); CHECK(j["array"][3].start.lines_read == 5); - CHECK(j["array"][3].start.chars_read_current_line == 5); + CHECK(j["array"][3].start.chars_read_current_line == 4); CHECK(j["array"][3].stop.lines_read == 5); CHECK(j["array"][3].stop.chars_read_current_line == 7); CHECK(j["array"][4].start.lines_read == 6); //starts directly after last value.... - CHECK(j["array"][4].start.chars_read_current_line == 5); + CHECK(j["array"][4].start.chars_read_current_line == 4); CHECK(j["array"][4].stop.lines_read == 6); CHECK(j["array"][4].stop.chars_read_current_line == 8); CHECK(j["array"][5].start.lines_read == 7); - CHECK(j["array"][5].start.chars_read_current_line == 5); + CHECK(j["array"][5].start.chars_read_current_line == 4); CHECK(j["array"][5].stop.lines_read == 7); CHECK(j["array"][5].stop.chars_read_current_line == 9); From a41686881881e8c8cb2924eac60d72235efe2bc4 Mon Sep 17 00:00:00 2001 From: barcode Date: Fri, 23 Dec 2022 21:12:27 +0100 Subject: [PATCH 7/7] Fix ci issues --- cmake/ci.cmake | 2 +- .../sax_parse_with_src_location_in_json.cpp | 2 +- include/nlohmann/detail/meta/is_sax.hpp | 3 +- single_include/nlohmann/json.hpp | 4 +- tests/src/unit-sax-parser-extended.cpp | 57 ++++++++----------- .../unit-sax-parser-store-source-location.cpp | 17 ++++-- 6 files changed, 42 insertions(+), 43 deletions(-) diff --git a/cmake/ci.cmake b/cmake/ci.cmake index bbb2d4cb9..29d58d83b 100644 --- a/cmake/ci.cmake +++ b/cmake/ci.cmake @@ -636,7 +636,7 @@ add_custom_target(ci_test_valgrind -DJSON_BuildTests=ON 
-DJSON_Valgrind=ON -S${PROJECT_SOURCE_DIR} -B${PROJECT_BINARY_DIR}/build_valgrind COMMAND ${CMAKE_COMMAND} --build ${PROJECT_BINARY_DIR}/build_valgrind - COMMAND cd ${PROJECT_BINARY_DIR}/build_valgrind && ${CMAKE_CTEST_COMMAND} -L valgrind --parallel ${N} --output-on-failure + COMMAND cd ${PROJECT_BINARY_DIR}/build_valgrind && ${CMAKE_CTEST_COMMAND} -L valgrind --parallel ${N} --output-on-failure --timeout 10000 COMMENT "Compile and test with Valgrind" ) diff --git a/docs/examples/sax_parse_with_src_location_in_json.cpp b/docs/examples/sax_parse_with_src_location_in_json.cpp index cf7adc1fb..8e7818352 100644 --- a/docs/examples/sax_parse_with_src_location_in_json.cpp +++ b/docs/examples/sax_parse_with_src_location_in_json.cpp @@ -187,7 +187,7 @@ class sax_with_token_start_stop_metadata return false; } - constexpr bool is_errored() const + bool is_errored() const { return errored; } diff --git a/include/nlohmann/detail/meta/is_sax.hpp b/include/nlohmann/detail/meta/is_sax.hpp index 6e8266f3f..38831e56c 100644 --- a/include/nlohmann/detail/meta/is_sax.hpp +++ b/include/nlohmann/detail/meta/is_sax.hpp @@ -15,6 +15,7 @@ #include #include #include +#include NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail @@ -63,7 +64,7 @@ struct sax_call_function //the sax parser supports calls with a lexer static constexpr bool detected_call_with_lex_pos = !called_with_byte_pos && - is_detected_exact::value; + is_detected_exact::value; //there either has to be a version accepting a lexer or a position static constexpr bool valid = detected_call_with_byte_pos || detected_call_with_lex_pos; diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index aabe9fbbe..8cc45335c 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -8962,6 +8962,8 @@ NLOHMANN_JSON_NAMESPACE_END // #include +// #include + NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail @@ -9010,7 +9012,7 @@ struct sax_call_function //the sax parser supports calls with a 
lexer static constexpr bool detected_call_with_lex_pos = !called_with_byte_pos && - is_detected_exact::value; + is_detected_exact::value; //there either has to be a version accepting a lexer or a position static constexpr bool valid = detected_call_with_byte_pos || detected_call_with_lex_pos; diff --git a/tests/src/unit-sax-parser-extended.cpp b/tests/src/unit-sax-parser-extended.cpp index 88342965d..e81107fba 100644 --- a/tests/src/unit-sax-parser-extended.cpp +++ b/tests/src/unit-sax-parser-extended.cpp @@ -521,7 +521,7 @@ void fill_expected_sax_pos_json(SAX& sax, case nlohmann::json::value_t::object: { sax.pos_start_object.emplace(element(1)); // { - for (auto& el : part.items()) + for (const auto& el : part.items()) { sax.pos_key.emplace(element(el.key().size() + 2)); //'"' + str + '"' offset += 1; // separator ':' between key and value @@ -538,7 +538,7 @@ void fill_expected_sax_pos_json(SAX& sax, case nlohmann::json::value_t::array: { sax.pos_start_array.emplace(element(1)); // [ - for (auto& el : part.items()) + for (const auto& el : part.items()) { fill_expected_sax_pos_json(sax, element, el.value(), offset); offset += 1; // add , @@ -553,7 +553,7 @@ void fill_expected_sax_pos_json(SAX& sax, case nlohmann::json::value_t::string: { const auto val = part.get(); - std::size_t nbytes = val.size() + 2; //'"' + value + '"' + const std::size_t nbytes = val.size() + 2; //'"' + value + '"' sax.pos_string.emplace(element(nbytes)); } break; @@ -573,21 +573,21 @@ void fill_expected_sax_pos_json(SAX& sax, case nlohmann::json::value_t::number_integer: { const auto val = part.get(); - std::size_t nbytes = std::to_string(val).size(); + const std::size_t nbytes = std::to_string(val).size(); sax.pos_number_integer.emplace(element(nbytes)); } break; case nlohmann::json::value_t::number_unsigned: { const auto val = part.get(); - std::size_t nbytes = std::to_string(val).size(); + const std::size_t nbytes = std::to_string(val).size(); 
sax.pos_number_unsigned.emplace(element(nbytes)); } break; case nlohmann::json::value_t::number_float: { const auto val = part.get(); - std::size_t nbytes = std::to_string(val).size(); + const std::size_t nbytes = std::to_string(val).size(); sax.pos_number_float.emplace(element(nbytes)); } break; @@ -632,7 +632,7 @@ void fill_expected_sax_pos_bson(SAX& sax, case nlohmann::json::value_t::object: { sax.pos_start_object.emplace(element(4)); //32 bit size - for (auto& el : part.items()) + for (const auto& el : part.items()) { offset += 1; // type of item sax.pos_key.emplace(element(el.key().size() + 1)); // str + terminator @@ -645,7 +645,7 @@ void fill_expected_sax_pos_bson(SAX& sax, { sax.pos_start_array.emplace(element(4)); //32 bit size std::size_t i = 0; - for (auto& el : part.items()) + for (const auto& el : part.items()) { offset += 1; // type of item offset += 1 + std::to_string(i).size(); // dummy key + terminator @@ -667,8 +667,7 @@ void fill_expected_sax_pos_bson(SAX& sax, case nlohmann::json::value_t::boolean: { //type is before the key -> not included - std::size_t nbytes = 1; //value - sax.pos_boolean.emplace(element(nbytes)); + sax.pos_boolean.emplace(element(1)); //value } break; case nlohmann::json::value_t::number_integer: @@ -741,8 +740,7 @@ void fill_expected_sax_pos_cbor(SAX& sax, const FN& element, const nlohmann::jso { case nlohmann::json::value_t::null: { - std::size_t nbytes = 1; //type - sax.pos_null.emplace(element(nbytes)); + sax.pos_null.emplace(element(1)); //type } break; case nlohmann::json::value_t::object: @@ -770,7 +768,7 @@ void fill_expected_sax_pos_cbor(SAX& sax, const FN& element, const nlohmann::jso } sax.pos_start_object.emplace(element(nbytes)); //key follows same rules as string - for (auto& el : part.items()) + for (const auto& el : part.items()) { std::size_t nbyteskey = 1; //type nbyteskey += el.key().size(); @@ -862,8 +860,7 @@ void fill_expected_sax_pos_cbor(SAX& sax, const FN& element, const nlohmann::jso break; case 
nlohmann::json::value_t::boolean: { - std::size_t nbytes = 1; //type - sax.pos_boolean.emplace(element(nbytes)); + sax.pos_boolean.emplace(element(1)); //type } break; case nlohmann::json::value_t::number_integer: @@ -880,15 +877,15 @@ void fill_expected_sax_pos_cbor(SAX& sax, const FN& element, const nlohmann::jso { //value implicit in type } - else if (-val - 1 <= static_cast(std::numeric_limits::max())) + else if (-(val + 1) <= static_cast(std::numeric_limits::max())) { nbytes += 1; } - else if (-val - 1 <= static_cast(std::numeric_limits::max())) + else if (-(val + 1) <= static_cast(std::numeric_limits::max())) { nbytes += 2; } - else if (-val - 1 <= static_cast(std::numeric_limits::max())) + else if (-(val + 1) <= static_cast(std::numeric_limits::max())) { nbytes += 4; } @@ -993,8 +990,7 @@ void fill_expected_sax_pos_msgpack(SAX& sax, const FN& element, const nlohmann:: { case nlohmann::json::value_t::null: { - std::size_t nbytes = 1; //type - sax.pos_null.emplace(element(nbytes)); + sax.pos_null.emplace(element(1)); //type } break; case nlohmann::json::value_t::object: @@ -1018,7 +1014,7 @@ void fill_expected_sax_pos_msgpack(SAX& sax, const FN& element, const nlohmann:: } sax.pos_start_object.emplace(element(nbytes)); //key follows same rules as string - for (auto& el : part.items()) + for (const auto& el : part.items()) { std::size_t nbyteskey = 1; //type nbyteskey += el.key().size(); @@ -1106,8 +1102,7 @@ void fill_expected_sax_pos_msgpack(SAX& sax, const FN& element, const nlohmann:: break; case nlohmann::json::value_t::boolean: { - std::size_t nbytes = 1; //type - sax.pos_boolean.emplace(element(nbytes)); + sax.pos_boolean.emplace(element(1)); //type } break; case nlohmann::json::value_t::number_integer: @@ -1233,15 +1228,14 @@ void fill_expected_sax_pos_ubjson(SAX& sax, const FN& element, const nlohmann::j { case nlohmann::json::value_t::null: { - std::size_t nbytes = 1; //type - sax.pos_null.emplace(element(nbytes)); + sax.pos_null.emplace(element(1)); 
//type } break; case nlohmann::json::value_t::object: { sax.pos_start_object.emplace(element(1)); //key follows same rules as string - for (auto& el : part.items()) + for (const auto& el : part.items()) { std::size_t nbyteskey = 1; //type of len nbyteskey += el.key().size(); @@ -1305,8 +1299,7 @@ void fill_expected_sax_pos_ubjson(SAX& sax, const FN& element, const nlohmann::j break; case nlohmann::json::value_t::boolean: { - std::size_t nbytes = 1; //type - sax.pos_boolean.emplace(element(nbytes)); + sax.pos_boolean.emplace(element(1)); //type } break; case nlohmann::json::value_t::number_integer: @@ -1442,15 +1435,14 @@ void fill_expected_sax_pos_bjdata(SAX& sax, const FN& element, const nlohmann::j { case nlohmann::json::value_t::null: { - std::size_t nbytes = 1; //type - sax.pos_null.emplace(element(nbytes)); + sax.pos_null.emplace(element(1)); //type } break; case nlohmann::json::value_t::object: { sax.pos_start_object.emplace(element(1)); //key follows same rules as string - for (auto& el : part.items()) + for (const auto& el : part.items()) { std::size_t nbyteskey = 1; //type of len nbyteskey += el.key().size(); @@ -1514,8 +1506,7 @@ void fill_expected_sax_pos_bjdata(SAX& sax, const FN& element, const nlohmann::j break; case nlohmann::json::value_t::boolean: { - std::size_t nbytes = 1; //type - sax.pos_boolean.emplace(element(nbytes)); + sax.pos_boolean.emplace(element(1)); //type } break; case nlohmann::json::value_t::number_integer: diff --git a/tests/src/unit-sax-parser-store-source-location.cpp b/tests/src/unit-sax-parser-store-source-location.cpp index 4a069c38b..68805f40c 100644 --- a/tests/src/unit-sax-parser-store-source-location.cpp +++ b/tests/src/unit-sax-parser-store-source-location.cpp @@ -83,13 +83,18 @@ class sax_with_token_start_stop_metadata */ explicit sax_with_token_start_stop_metadata(json& r, const bool allow_exceptions_ = true) : root(r) - , ref_stack{} - , object_element{nullptr} - , errored{false} + , object_element{nullptr} // 
NOLINT(modernize-use-default-member-init) + , errored{false} // NOLINT(modernize-use-default-member-init) , allow_exceptions(allow_exceptions_) - , start_stop{} {} + sax_with_token_start_stop_metadata(sax_with_token_start_stop_metadata&&) = delete; + sax_with_token_start_stop_metadata(const sax_with_token_start_stop_metadata&) = delete; + sax_with_token_start_stop_metadata& operator=(sax_with_token_start_stop_metadata&&) = delete; + sax_with_token_start_stop_metadata& operator=(const sax_with_token_start_stop_metadata&) = delete; + + ~sax_with_token_start_stop_metadata() = default; + void next_token_start(const nlohmann::position_t& p) { start_stop.start = p; @@ -210,7 +215,7 @@ class sax_with_token_start_stop_metadata return false; } - constexpr bool is_errored() const + bool is_errored() const { return errored; } @@ -263,7 +268,7 @@ class sax_with_token_start_stop_metadata /// whether to throw exceptions in case of errors const bool allow_exceptions = true; /// start / stop information for the current token - token_start_stop start_stop{}; + token_start_stop start_stop {}; }; TEST_CASE("parse-json-with-position-info")