Split test-unicode to avoid timeouts

2019-12-29 16:07:27 +03:30 · 2019-12-29 16:07:27 +03:30 · 3e9d74df56
commit 3e9d74df56
parent bde5712418
3 changed files with 328 additions and 132 deletions
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@ -140,7 +140,8 @@ set(files
    src/unit-to_chars.cpp
    src/unit-ubjson.cpp
    src/unit-udt.cpp
-    src/unit-unicode.cpp
+    src/unit-unicode-part1.cpp
+    src/unit-unicode-part2.cpp
    src/unit-wstring.cpp)

 foreach(file ${files})
--- a/test/src/unit-unicode-part1.cpp
+++ b/test/src/unit-unicode-part1.cpp
@ -44,7 +44,6 @@ using nlohmann::json;

 namespace
 {
-extern size_t calls;
 size_t calls = 0;

 void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4);
@ -128,7 +127,7 @@ void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte
 {
    if (++calls % 100000 == 0)
    {
-        std::cout << calls << " of 8860608 UTF-8 strings checked" << std::endl;
+        std::cout << calls << " of 3300000 UTF-8 strings checked (part 1)" << std::endl;
    }

    std::string json_string = "\"";
@ -738,135 +737,6 @@ TEST_CASE("Unicode" * doctest::skip())
            }
        }

-        SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)")
-        {
-            SECTION("well-formed")
-            {
-                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
-                {
-                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
-                    {
-                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
-                        {
-                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
-                            {
-                                check_utf8string(true, byte1, byte2, byte3, byte4);
-                                check_utf8dump(true, byte1, byte2, byte3, byte4);
-                            }
-                        }
-                    }
-                }
-            }
-
-            SECTION("ill-formed: missing second byte")
-            {
-                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
-                {
-                    check_utf8string(false, byte1);
-                    check_utf8dump(false, byte1);
-                }
-            }
-
-            SECTION("ill-formed: missing third byte")
-            {
-                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
-                {
-                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
-                    {
-                        check_utf8string(false, byte1, byte2);
-                        check_utf8dump(false, byte1, byte2);
-                    }
-                }
-            }
-
-            SECTION("ill-formed: missing fourth byte")
-            {
-                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
-                {
-                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
-                    {
-                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
-                        {
-                            check_utf8string(false, byte1, byte2, byte3);
-                            check_utf8dump(false, byte1, byte2, byte3);
-                        }
-                    }
-                }
-            }
-
-            SECTION("ill-formed: wrong second byte")
-            {
-                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
-                {
-                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
-                    {
-                        // skip correct second byte
-                        if (0x80 <= byte2 and byte2 <= 0xBF)
-                        {
-                            continue;
-                        }
-
-                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
-                        {
-                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
-                            {
-                                check_utf8string(false, byte1, byte2, byte3, byte4);
-                                check_utf8dump(false, byte1, byte2, byte3, byte4);
-                            }
-                        }
-                    }
-                }
-            }
-
-            SECTION("ill-formed: wrong third byte")
-            {
-                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
-                {
-                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
-                    {
-                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
-                        {
-                            // skip correct third byte
-                            if (0x80 <= byte3 and byte3 <= 0xBF)
-                            {
-                                continue;
-                            }
-
-                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
-                            {
-                                check_utf8string(false, byte1, byte2, byte3, byte4);
-                                check_utf8dump(false, byte1, byte2, byte3, byte4);
-                            }
-                        }
-                    }
-                }
-            }
-
-            SECTION("ill-formed: wrong fourth byte")
-            {
-                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
-                {
-                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
-                    {
-                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
-                        {
-                            for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
-                            {
-                                // skip correct fourth byte
-                                if (0x80 <= byte3 and byte3 <= 0xBF)
-                                {
-                                    continue;
-                                }
-
-                                check_utf8string(false, byte1, byte2, byte3, byte4);
-                                check_utf8dump(false, byte1, byte2, byte3, byte4);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
        SECTION("UTF8-4 (xF4 x80-8F UTF8-tail UTF8-tail)")
        {
            SECTION("well-formed")
--- a/test/src/unit-unicode-part2.cpp
+++ b/test/src/unit-unicode-part2.cpp
@ -0,0 +1,325 @@
+/*
+    __ _____ _____ _____
+ __|  |   __|     |   | |  JSON for Modern C++ (test suite)
+|  |  |__   |  |  | | | |  version 3.7.3
+|_____|_____|_____|_|___|  https://github.com/nlohmann/json
+
+Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+SPDX-License-Identifier: MIT
+Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
+
+Permission is hereby  granted, free of charge, to any  person obtaining a copy
+of this software and associated  documentation files (the "Software"), to deal
+in the Software  without restriction, including without  limitation the rights
+to  use, copy,  modify, merge,  publish, distribute,  sublicense, and/or  sell
+copies  of  the Software,  and  to  permit persons  to  whom  the Software  is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE  IS PROVIDED "AS  IS", WITHOUT WARRANTY  OF ANY KIND,  EXPRESS OR
+IMPLIED,  INCLUDING BUT  NOT  LIMITED TO  THE  WARRANTIES OF  MERCHANTABILITY,
+FITNESS FOR  A PARTICULAR PURPOSE AND  NONINFRINGEMENT. IN NO EVENT  SHALL THE
+AUTHORS  OR COPYRIGHT  HOLDERS  BE  LIABLE FOR  ANY  CLAIM,  DAMAGES OR  OTHER
+LIABILITY, WHETHER IN AN ACTION OF  CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE  OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "doctest_compatibility.h"
+
+// for some reason including this after the json header leads to linker errors with VS 2017...
+#include <locale>
+
+#define private public
+#include <nlohmann/json.hpp>
+using nlohmann::json;
+#undef private
+
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+
+namespace
+{
+size_t calls = 0; // number of check_utf8string() calls so far; only used for the progress output
+
+void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4);
+// dump a string of up to four raw bytes with every error handler and check the results (-1 = no byte)
+void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
+{
+    std::string json_string;
+
+    CAPTURE(byte1)
+    CAPTURE(byte2)
+    CAPTURE(byte3)
+    CAPTURE(byte4)
+
+    json_string += std::string(1, static_cast<char>(byte1));
+
+    if (byte2 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte2));
+    }
+
+    if (byte3 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte3));
+    }
+
+    if (byte4 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte4));
+    }
+
+    CAPTURE(json_string)
+
+    // store the raw bytes in a JSON value: once alone (j), once between an ASCII prefix and suffix (j2)
+    json j = json_string;
+    json j2 = "abc" + json_string + "xyz";
+
+    // dumping with ignore/replace must not throw in any case (the *_ascii variants pass true as third argument)
+    auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
+    auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore);
+    auto s_ignored_ascii = j.dump(-1, ' ', true, json::error_handler_t::ignore);
+    auto s_ignored2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::ignore);
+    auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace);
+    auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
+    auto s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
+    auto s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
+
+    if (success_expected)
+    {
+        // strict mode (dump() without arguments) must not throw if success is expected
+        auto s_strict = j.dump();
+        // all dumps should agree on the string
+        CHECK(s_strict == s_ignored);
+        CHECK(s_strict == s_replaced);
+    }
+    else
+    {
+        // strict mode must throw a type_error if success is not expected
+        CHECK_THROWS_AS(j.dump(), json::type_error&);
+        // ignore and replace must create different dumps
+        CHECK(s_ignored != s_replaced);
+
+        // check that replace string contains a replacement character (U+FFFD encoded as UTF-8)
+        CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
+    }
+
+    // check that prefix and suffix are preserved (offsets skip the first and the last character of each dump)
+    CHECK(s_ignored2.substr(1, 3) == "abc");
+    CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
+    CHECK(s_ignored2_ascii.substr(1, 3) == "abc");
+    CHECK(s_ignored2_ascii.substr(s_ignored2_ascii.size() - 4, 3) == "xyz");
+    CHECK(s_replaced2.substr(1, 3) == "abc");
+    CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz");
+    CHECK(s_replaced2_ascii.substr(1, 3) == "abc");
+    CHECK(s_replaced2_ascii.substr(s_replaced2_ascii.size() - 4, 3) == "xyz");
+}
+
+void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4);
+
+// wrap up to four raw bytes (-1 = no byte) in quotes and check that parsing succeeds or throws as expected
+void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
+{
+    if (++calls % 100000 == 0) // progress output every 100000 strings
+    {
+        std::cout << calls << " of 5500000 UTF-8 strings checked (part 2)" << std::endl;
+    }
+
+    std::string json_string = "\"";
+    // capture all bytes at function scope (as check_utf8dump does): a capture inside an if-block ends with that block
+    CAPTURE(byte1)
+    CAPTURE(byte2)
+    CAPTURE(byte3)
+    CAPTURE(byte4)
+    json_string += std::string(1, static_cast<char>(byte1));
+
+    if (byte2 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte2));
+    }
+
+    if (byte3 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte3));
+    }
+
+    if (byte4 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte4));
+    }
+
+    json_string += "\"";
+
+    CAPTURE(json_string)
+
+    json _; // receives the parse result; its value is never inspected
+    if (success_expected)
+    {
+        CHECK_NOTHROW(_ = json::parse(json_string));
+    }
+    else
+    {
+        CHECK_THROWS_AS(_ = json::parse(json_string), json::parse_error&);
+    }
+}
+}
+
+TEST_CASE("Unicode-Part2" * doctest::skip()) // exhaustive checks for the UTF8-4 lead bytes 0xF1-0xF3
+{
+    SECTION("RFC 3629")
+    {
+        /*
+        RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
+        follows:
+
+            A UTF-8 string is a sequence of octets representing a sequence of UCS
+            characters.  An octet sequence is valid UTF-8 only if it matches the
+            following syntax, which is derived from the rules for encoding UTF-8
+            and is expressed in the ABNF of [RFC2234].
+
+            UTF8-octets = *( UTF8-char )
+            UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+            UTF8-1      = %x00-7F
+            UTF8-2      = %xC2-DF UTF8-tail
+            UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
+                          %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
+            UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
+                          %xF4 %x80-8F 2( UTF8-tail )
+            UTF8-tail   = %x80-BF
+        */
+        // NOTE: the seven sections below call check_utf8string 7876803 times; its progress message still says 5500000
+        SECTION("UTF8-4 (xF1-F3 UTF8-tail UTF8-tail UTF8-tail)")
+        {
+            SECTION("well-formed") // 3 * 64 * 64 * 64 = 786432 sequences
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(true, byte1, byte2, byte3, byte4);
+                                check_utf8dump(true, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing second byte") // 3 sequences
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
+                }
+            }
+
+            SECTION("ill-formed: missing third byte") // 3 * 64 = 192 sequences
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
+                    }
+                }
+            }
+
+            SECTION("ill-formed: missing fourth byte") // 3 * 64 * 64 = 12288 sequences
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong second byte") // 3 * 192 * 64 * 64 = 2359296 sequences
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x00; byte2 <= 0xFF; ++byte2)
+                    {
+                        // skip correct second byte
+                        if (0x80 <= byte2 and byte2 <= 0xBF)
+                        {
+                            continue;
+                        }
+
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong third byte") // 3 * 64 * 192 * 64 = 2359296 sequences
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x00; byte3 <= 0xFF; ++byte3)
+                        {
+                            // skip correct third byte
+                            if (0x80 <= byte3 and byte3 <= 0xBF)
+                            {
+                                continue;
+                            }
+
+                            for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
+                            {
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+
+            SECTION("ill-formed: wrong fourth byte") // 3 * 64 * 64 * 192 = 2359296 sequences
+            {
+                for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
+                {
+                    for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
+                    {
+                        for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
+                        {
+                            for (int byte4 = 0x00; byte4 <= 0xFF; ++byte4)
+                            {
+                                // skip correct fourth byte (byte3 is always a valid tail here, so byte4 must be tested)
+                                if (0x80 <= byte4 and byte4 <= 0xBF)
+                                {
+                                    continue;
+                                }
+
+                                check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}