From 3e9d74df56060eab4c0335780ba68bd914130e4b Mon Sep 17 00:00:00 2001 From: Isaac Nickaein Date: Sun, 29 Dec 2019 16:07:27 +0330 Subject: [PATCH] Split test-unicode to avoid timeouts --- test/CMakeLists.txt | 3 +- ...nit-unicode.cpp => unit-unicode-part1.cpp} | 132 +------ test/src/unit-unicode-part2.cpp | 325 ++++++++++++++++++ 3 files changed, 328 insertions(+), 132 deletions(-) rename test/src/{unit-unicode.cpp => unit-unicode-part1.cpp} (92%) create mode 100644 test/src/unit-unicode-part2.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8769c6b9d..047ec08c5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -140,7 +140,8 @@ set(files src/unit-to_chars.cpp src/unit-ubjson.cpp src/unit-udt.cpp - src/unit-unicode.cpp + src/unit-unicode-part1.cpp + src/unit-unicode-part2.cpp src/unit-wstring.cpp) foreach(file ${files}) diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode-part1.cpp similarity index 92% rename from test/src/unit-unicode.cpp rename to test/src/unit-unicode-part1.cpp index a3d997631..fb8c26bf8 100644 --- a/test/src/unit-unicode.cpp +++ b/test/src/unit-unicode-part1.cpp @@ -44,7 +44,6 @@ using nlohmann::json; namespace { -extern size_t calls; size_t calls = 0; void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4); @@ -128,7 +127,7 @@ void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte { if (++calls % 100000 == 0) { - std::cout << calls << " of 8860608 UTF-8 strings checked" << std::endl; + std::cout << calls << " of 3300000 UTF-8 strings checked (part 1)" << std::endl; } std::string json_string = "\""; @@ -738,135 +737,6 @@ TEST_CASE("Unicode" * doctest::skip()) } } - SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)") - { - SECTION("well-formed") - { - for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1) - { - for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) - { - for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) - { - for (int byte4 = 0x80; byte4 <= 0xBF; 
++byte4) - { - check_utf8string(true, byte1, byte2, byte3, byte4); - check_utf8dump(true, byte1, byte2, byte3, byte4); - } - } - } - } - } - - SECTION("ill-formed: missing second byte") - { - for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1) - { - check_utf8string(false, byte1); - check_utf8dump(false, byte1); - } - } - - SECTION("ill-formed: missing third byte") - { - for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1) - { - for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) - { - check_utf8string(false, byte1, byte2); - check_utf8dump(false, byte1, byte2); - } - } - } - - SECTION("ill-formed: missing fourth byte") - { - for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1) - { - for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) - { - for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) - { - check_utf8string(false, byte1, byte2, byte3); - check_utf8dump(false, byte1, byte2, byte3); - } - } - } - } - - SECTION("ill-formed: wrong second byte") - { - for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1) - { - for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2) - { - // skip correct second byte - if (0x80 <= byte2 and byte2 <= 0xBF) - { - continue; - } - - for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) - { - for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) - { - check_utf8string(false, byte1, byte2, byte3, byte4); - check_utf8dump(false, byte1, byte2, byte3, byte4); - } - } - } - } - } - - SECTION("ill-formed: wrong third byte") - { - for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1) - { - for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) - { - for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3) - { - // skip correct third byte - if (0x80 <= byte3 and byte3 <= 0xBF) - { - continue; - } - - for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) - { - check_utf8string(false, byte1, byte2, byte3, byte4); - check_utf8dump(false, byte1, byte2, byte3, byte4); - } - } - } - } - } - - SECTION("ill-formed: wrong fourth byte") - { - for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1) - { - for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) - { - 
for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) - { - for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4) - { - // skip correct fourth byte - if (0x80 <= byte3 and byte3 <= 0xBF) - { - continue; - } - - check_utf8string(false, byte1, byte2, byte3, byte4); - check_utf8dump(false, byte1, byte2, byte3, byte4); - } - } - } - } - } - } - SECTION("UTF8-4 (xF4 x80-8F UTF8-tail UTF8-tail)") { SECTION("well-formed") diff --git a/test/src/unit-unicode-part2.cpp b/test/src/unit-unicode-part2.cpp new file mode 100644 index 000000000..2cbe469c2 --- /dev/null +++ b/test/src/unit-unicode-part2.cpp @@ -0,0 +1,325 @@ +/* + __ _____ _____ _____ + __| | __| | | | JSON for Modern C++ (test suite) +| | |__ | | | | | | version 3.7.3 +|_____|_____|_____|_|___| https://github.com/nlohmann/json + +Licensed under the MIT License . +SPDX-License-Identifier: MIT +Copyright (c) 2013-2019 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/
+
+#include "doctest_compatibility.h"
+
+// NOTE(review): in this copy of the patch the <...> header names and the
+// <char> template arguments of static_cast had been stripped. They are
+// restored below (headers as in the upstream unit-unicode.cpp) -- please
+// confirm against the original commit.
+
+// for some reason including this after the json header leads to linker errors with VS 2017...
+#include <locale>
+
+#define private public
+#include <nlohmann/json.hpp>
+using nlohmann::json;
+#undef private
+
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+
+namespace
+{
+// number of check_utf8string() invocations so far; only used for progress output
+size_t calls = 0;
+
+// forward declaration; the default arguments are supplied on the definition below
+void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4);
+
+// Build a raw string from up to four bytes and check how json::dump() treats
+// it under the strict, ignore and replace error handlers.
+//
+// success_expected: true if the byte sequence is expected to be well-formed
+//                   UTF-8 (strict dump must succeed), false if the strict
+//                   dump must throw json::type_error
+// byte1..byte4:     byte values to append; -1 means "byte not present"
+void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
+{
+    std::string json_string;
+
+    CAPTURE(byte1)
+    CAPTURE(byte2)
+    CAPTURE(byte3)
+    CAPTURE(byte4)
+
+    json_string += std::string(1, static_cast<char>(byte1));
+
+    if (byte2 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte2));
+    }
+
+    if (byte3 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte3));
+    }
+
+    if (byte4 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte4));
+    }
+
+    CAPTURE(json_string)
+
+    // store the string in a JSON value; j2 additionally wraps it in a known
+    // prefix/suffix so we can verify below that surrounding text survives
+    json j = json_string;
+    json j2 = "abc" + json_string + "xyz";
+
+    // dumping with ignore/replace must not throw in any case
+    auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
+    auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore);
+    auto s_ignored_ascii = j.dump(-1, ' ', true, json::error_handler_t::ignore);
+    auto s_ignored2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::ignore);
+    auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace);
+    auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
+    auto s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
+    auto s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
+
+    if (success_expected)
+    {
+        // strict mode must not throw if success is expected
+        auto s_strict = j.dump();
+        // all dumps should agree on the string
+        CHECK(s_strict == s_ignored);
+        CHECK(s_strict == s_replaced);
+    }
+    else
+    {
+        // strict mode must throw if success is not expected
+        CHECK_THROWS_AS(j.dump(), json::type_error&);
+        // ignore and replace must create different dumps
+        CHECK(s_ignored != s_replaced);
+
+        // check that replace string contains a replacement character
+        // (EF BF BD is the UTF-8 encoding of U+FFFD)
+        CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
+    }
+
+    // check that prefix and suffix are preserved; offset 1 skips the opening
+    // quote of the dump and size() - 4 addresses the three characters that
+    // precede the closing quote
+    CHECK(s_ignored2.substr(1, 3) == "abc");
+    CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
+    CHECK(s_ignored2_ascii.substr(1, 3) == "abc");
+    CHECK(s_ignored2_ascii.substr(s_ignored2_ascii.size() - 4, 3) == "xyz");
+    CHECK(s_replaced2.substr(1, 3) == "abc");
+    CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz");
+    CHECK(s_replaced2_ascii.substr(1, 3) == "abc");
+    CHECK(s_replaced2_ascii.substr(s_replaced2_ascii.size() - 4, 3) == "xyz");
+}
+
+// forward declaration; the default arguments are supplied on the definition below
+void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4);
+
+// create and check a JSON string with up to four UTF-8 bytes
+//
+// The bytes are wrapped in double quotes and handed to json::parse().
+//
+// success_expected: true if parsing must succeed, false if it must throw
+//                   json::parse_error
+// byte1..byte4:     byte values to append; -1 means "byte not present"
+void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
+{
+    // Progress output every 100000 calls. The total is the rounded number of
+    // calls made by the test case below:
+    //   well-formed                786432
+    //   missing 2nd/3rd/4th byte   3 + 192 + 12288
+    //   wrong 2nd/3rd/4th byte     3 * 2359296
+    //   total                      7876803
+    if (++calls % 100000 == 0)
+    {
+        std::cout << calls << " of 7900000 UTF-8 strings checked (part 2)" << std::endl;
+    }
+
+    std::string json_string = "\"";
+
+    CAPTURE(byte1)
+    json_string += std::string(1, static_cast<char>(byte1));
+
+    if (byte2 != -1)
+    {
+        CAPTURE(byte2)
+        json_string += std::string(1, static_cast<char>(byte2));
+    }
+
+    if (byte3 != -1)
+    {
+        CAPTURE(byte3)
+        json_string += std::string(1, static_cast<char>(byte3));
+    }
+
+    if (byte4 != -1)
+    {
+        CAPTURE(byte4)
+        json_string += std::string(1, static_cast<char>(byte4));
+    }
+
+    json_string += "\"";
+
+    CAPTURE(json_string)
+
+    // the parse result itself is irrelevant; only success/failure is checked
+    json _;
+    if (success_expected)
+    {
+        CHECK_NOTHROW(_ = json::parse(json_string));
+    }
+    else
+    {
+        CHECK_THROWS_AS(_ = json::parse(json_string), json::parse_error&);
+    }
+}
+}
+
+// Exhaustive check of the four-byte UTF-8 sequences whose lead byte is
+// F1..F3: every well-formed sequence, plus truncated sequences and sequences
+// with exactly one continuation byte outside 80..BF. The test case carries
+// the doctest::skip() decorator.
+TEST_CASE("Unicode-Part2" * doctest::skip())
+{
+    SECTION("RFC 3629")
+    {
+        /*
+        RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
+        follows:
+
+            A UTF-8 string is a sequence of octets representing a sequence of UCS
+            characters.  An octet sequence is valid UTF-8 only if it matches the
+            following syntax, which is derived from the rules for encoding UTF-8
+            and is expressed in the ABNF of [RFC2234].
+
+            UTF8-octets = *( UTF8-char )
+            UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+            UTF8-1      = %x00-7F
+            UTF8-2      = %xC2-DF UTF8-tail
+            UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
+                          %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
+            UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
+                          %xF4 %x80-8F 2( UTF8-tail )
+            UTF8-tail   = %x80-BF
+        */
+
+        SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)")
+        {
+            SECTION("well-formed")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(true, byte1, byte2, byte3, byte4);
+                                check_utf8dump(true, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: missing third byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing fourth byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0x80 <= byte2 and byte2 <= 0xBF)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong fourth byte")
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
+                            {
+                                // skip correct fourth byte; the guard must test
+                                // byte4 -- testing byte3 (always 80..BF in this
+                                // loop) would skip every iteration and leave
+                                // this section without a single check
+                                if (0x80 <= byte4 and byte4 <= 0xBF)
+                                {
+                                    continue;
+                                }
+
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}