Implement wide->UTF-8 string conversion more correctly
This commit is contained in:
parent
c6e674dbb3
commit
0c5a66245b
@ -225,6 +225,12 @@
|
||||
#include <sys/mman.h>
|
||||
#endif // GTEST_HAS_STD_STRING && defined(GTEST_OS_LINUX)
|
||||
|
||||
// Determines whether the system compiler uses UTF-16 for encoding wide strings.
|
||||
#if defined(GTEST_OS_WINDOWS) || defined(GTEST_OS_CYGWIN) || \
|
||||
defined(__SYMBIAN32__)
|
||||
#define GTEST_WIDE_STRING_USES_UTF16_ 1
|
||||
#endif
|
||||
|
||||
// Defines some utility macros.
|
||||
|
||||
// The GNU compiler emits a warning if nested "if" statements are followed by
|
||||
|
@ -133,8 +133,30 @@ class GTestFlagSaver {
|
||||
internal::Int32 repeat_;
|
||||
} GTEST_ATTRIBUTE_UNUSED;
|
||||
|
||||
// Converts a Unicode code-point to its UTF-8 encoding.
|
||||
String ToUtf8String(wchar_t wchar);
|
||||
// Converts a Unicode code point to a narrow string in UTF-8 encoding.
|
||||
// code_point parameter is of type UInt32 because wchar_t may not be
|
||||
// wide enough to contain a code point.
|
||||
// The output buffer str must contain at least 32 characters.
|
||||
// The function returns the address of the output buffer.
|
||||
// If the code_point is not a valid Unicode code point
|
||||
// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output
|
||||
// as '(Invalid Unicode 0xXXXXXXXX)'.
|
||||
char* CodePointToUtf8(UInt32 code_point, char* str);
|
||||
|
||||
// Converts a wide string to a narrow string in UTF-8 encoding.
|
||||
// The wide string is assumed to have the following encoding:
|
||||
// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
|
||||
// UTF-32 if sizeof(wchar_t) == 4 (on Linux)
|
||||
// Parameter str points to a null-terminated wide string.
|
||||
// Parameter num_chars may additionally limit the number
|
||||
// of wchar_t characters processed. -1 is used when the entire string
|
||||
// should be processed.
|
||||
// If the string contains code points that are not valid Unicode code points
|
||||
// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
|
||||
// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF-16 encoding
|
||||
// and contains invalid UTF-16 surrogate pairs, values in those pairs
|
||||
// will be encoded as individual Unicode characters from Basic Multilingual Plane.
|
||||
String WideStringToUtf8(const wchar_t* str, int num_chars);
|
||||
|
||||
// Returns the number of active threads, or 0 when there is an error.
|
||||
size_t GetThreadCount();
|
||||
|
163
src/gtest.cc
163
src/gtest.cc
@ -784,16 +784,19 @@ bool String::CStringEquals(const char * lhs, const char * rhs) {
|
||||
// encoding, and streams the result to the given Message object.
|
||||
static void StreamWideCharsToMessage(const wchar_t* wstr, size_t len,
|
||||
Message* msg) {
|
||||
for (size_t i = 0; i != len; i++) {
|
||||
// TODO(wan): consider allowing a testing::String object to
|
||||
// contain '\0'. This will make it behave more like std::string,
|
||||
// and will allow ToUtf8String() to return the correct encoding
|
||||
// for '\0' s.t. we can get rid of the conditional here (and in
|
||||
// several other places).
|
||||
if (wstr[i]) {
|
||||
*msg << internal::ToUtf8String(wstr[i]);
|
||||
// TODO(wan): consider allowing a testing::String object to
|
||||
// contain '\0'. This will make it behave more like std::string,
|
||||
// and will allow ToUtf8String() to return the correct encoding
|
||||
// for '\0' s.t. we can get rid of the conditional here (and in
|
||||
// several other places).
|
||||
for (size_t i = 0; i != len; ) { // NOLINT
|
||||
if (wstr[i] != L'\0') {
|
||||
*msg << WideStringToUtf8(wstr + i, len - i);
|
||||
while (i != len && wstr[i] != L'\0')
|
||||
i++;
|
||||
} else {
|
||||
*msg << '\0';
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -852,8 +855,10 @@ String FormatForFailureMessage(wchar_t wchar) {
|
||||
Message msg;
|
||||
// A String object cannot contain '\0', so we print "\\0" when wchar is
|
||||
// L'\0'.
|
||||
msg << "L'" << (wchar ? ToUtf8String(wchar).c_str() : "\\0") << "' ("
|
||||
<< wchar_as_uint64 << ", 0x" << ::std::setbase(16)
|
||||
char buffer[32]; // CodePointToUtf8 requires a buffer that big.
|
||||
msg << "L'"
|
||||
<< (wchar ? CodePointToUtf8(static_cast<UInt32>(wchar), buffer) : "\\0")
|
||||
<< "' (" << wchar_as_uint64 << ", 0x" << ::std::setbase(16)
|
||||
<< wchar_as_uint64 << ")";
|
||||
return msg.GetString();
|
||||
}
|
||||
@ -1317,31 +1322,118 @@ inline UInt32 ChopLowBits(UInt32* bits, int n) {
|
||||
return low_bits;
|
||||
}
|
||||
|
||||
// Converts a Unicode code-point to its UTF-8 encoding.
|
||||
String ToUtf8String(wchar_t wchar) {
|
||||
char str[5] = {}; // Initializes str to all '\0' characters.
|
||||
|
||||
UInt32 code = static_cast<UInt32>(wchar);
|
||||
if (code <= kMaxCodePoint1) {
|
||||
str[0] = static_cast<char>(code); // 0xxxxxxx
|
||||
} else if (code <= kMaxCodePoint2) {
|
||||
str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6)); // 10xxxxxx
|
||||
str[0] = static_cast<char>(0xC0 | code); // 110xxxxx
|
||||
} else if (code <= kMaxCodePoint3) {
|
||||
str[2] = static_cast<char>(0x80 | ChopLowBits(&code, 6)); // 10xxxxxx
|
||||
str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6)); // 10xxxxxx
|
||||
str[0] = static_cast<char>(0xE0 | code); // 1110xxxx
|
||||
} else if (code <= kMaxCodePoint4) {
|
||||
str[3] = static_cast<char>(0x80 | ChopLowBits(&code, 6)); // 10xxxxxx
|
||||
str[2] = static_cast<char>(0x80 | ChopLowBits(&code, 6)); // 10xxxxxx
|
||||
str[1] = static_cast<char>(0x80 | ChopLowBits(&code, 6)); // 10xxxxxx
|
||||
str[0] = static_cast<char>(0xF0 | code); // 11110xxx
|
||||
// Converts a Unicode code point to a narrow string in UTF-8 encoding.
|
||||
// code_point parameter is of type UInt32 because wchar_t may not be
|
||||
// wide enough to contain a code point.
|
||||
// The output buffer str must contain at least 32 characters.
|
||||
// The function returns the address of the output buffer.
|
||||
// If the code_point is not a valid Unicode code point
|
||||
// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output
|
||||
// as '(Invalid Unicode 0xXXXXXXXX)'.
|
||||
char* CodePointToUtf8(UInt32 code_point, char* str) {
|
||||
if (code_point <= kMaxCodePoint1) {
|
||||
str[1] = '\0';
|
||||
str[0] = static_cast<char>(code_point); // 0xxxxxxx
|
||||
} else if (code_point <= kMaxCodePoint2) {
|
||||
str[2] = '\0';
|
||||
str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
|
||||
str[0] = static_cast<char>(0xC0 | code_point); // 110xxxxx
|
||||
} else if (code_point <= kMaxCodePoint3) {
|
||||
str[3] = '\0';
|
||||
str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
|
||||
str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
|
||||
str[0] = static_cast<char>(0xE0 | code_point); // 1110xxxx
|
||||
} else if (code_point <= kMaxCodePoint4) {
|
||||
str[4] = '\0';
|
||||
str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
|
||||
str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
|
||||
str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
|
||||
str[0] = static_cast<char>(0xF0 | code_point); // 11110xxx
|
||||
} else {
|
||||
return String::Format("(Invalid Unicode 0x%llX)",
|
||||
static_cast<UInt64>(wchar));
|
||||
// The longest string String::Format can produce when invoked
|
||||
// with these parameters is 28 character long (not including
|
||||
// the terminating nul character). We are asking for 32 character
|
||||
// buffer just in case. This is also enough for strncpy to
|
||||
// null-terminate the destination string.
|
||||
// MSVC 8 deprecates strncpy(), so we want to suppress warning
|
||||
// 4996 (deprecated function) there.
|
||||
#ifdef GTEST_OS_WINDOWS // We are on Windows.
|
||||
#pragma warning(push) // Saves the current warning state.
|
||||
#pragma warning(disable:4996) // Temporarily disables warning 4996.
|
||||
#endif
|
||||
strncpy(str, String::Format("(Invalid Unicode 0x%X)", code_point).c_str(),
|
||||
32);
|
||||
#ifdef GTEST_OS_WINDOWS // We are on Windows.
|
||||
#pragma warning(pop) // Restores the warning state.
|
||||
#endif
|
||||
str[31] = '\0'; // Makes sure no change in the format to strncpy leaves
|
||||
// the result unterminated.
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
return String(str);
|
||||
// The following two functions only make sense if the system
|
||||
// uses UTF-16 for wide string encoding. All supported systems
|
||||
// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
|
||||
|
||||
// Determines whether the two arguments constitute a UTF-16 surrogate pair
// and thus should be combined into a single Unicode code point using
// CreateCodePointFromUtf16SurrogatePair.
//
// A pair is recognized when the top six bits of 'first' match a leading
// surrogate (0xD800 pattern) and the top six bits of 'second' match a
// trailing surrogate (0xDC00 pattern).  The result is always false when
// wchar_t is not 2 bytes wide.
inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
  // The sizeof test is a compile-time constant; folding it into the
  // expression avoids an if/else whose condition never varies at run time.
  return sizeof(wchar_t) == 2 &&
         (first & 0xFC00) == 0xD800 &&
         (second & 0xFC00) == 0xDC00;
}
|
||||
|
||||
// Creates a Unicode code point from UTF16 surrogate pair.
|
||||
inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
|
||||
wchar_t second) {
|
||||
if (sizeof(wchar_t) == 2) {
|
||||
const UInt32 mask = (1 << 10) - 1;
|
||||
return (((first & mask) << 10) | (second & mask)) + 0x10000;
|
||||
} else {
|
||||
// This should not be called, but we provide a sensible default
|
||||
// in case it is.
|
||||
return static_cast<UInt32>(first);
|
||||
}
|
||||
}
|
||||
|
||||
// Converts a wide string to a narrow string in UTF-8 encoding.
|
||||
// The wide string is assumed to have the following encoding:
|
||||
// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
|
||||
// UTF-32 if sizeof(wchar_t) == 4 (on Linux)
|
||||
// Parameter str points to a null-terminated wide string.
|
||||
// Parameter num_chars may additionally limit the number
|
||||
// of wchar_t characters processed. -1 is used when the entire string
|
||||
// should be processed.
|
||||
// If the string contains code points that are not valid Unicode code points
|
||||
// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
|
||||
// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
|
||||
// and contains invalid UTF-16 surrogate pairs, values in those pairs
|
||||
// will be encoded as individual Unicode characters from Basic Normal Plane.
|
||||
String WideStringToUtf8(const wchar_t* str, int num_chars) {
|
||||
if (num_chars == -1)
|
||||
num_chars = wcslen(str);
|
||||
|
||||
StrStream stream;
|
||||
for (int i = 0; i < num_chars; ++i) {
|
||||
UInt32 unicode_code_point;
|
||||
|
||||
if (str[i] == L'\0') {
|
||||
break;
|
||||
} else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
|
||||
unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
|
||||
str[i + 1]);
|
||||
i++;
|
||||
} else {
|
||||
unicode_code_point = static_cast<UInt32>(str[i]);
|
||||
}
|
||||
|
||||
char buffer[32]; // CodePointToUtf8 requires a buffer this big.
|
||||
stream << CodePointToUtf8(unicode_code_point, buffer);
|
||||
}
|
||||
return StrStreamToString(&stream);
|
||||
}
|
||||
|
||||
// Converts a wide C string to a String using the UTF-8 encoding.
|
||||
@ -1349,12 +1441,7 @@ String ToUtf8String(wchar_t wchar) {
|
||||
String String::ShowWideCString(const wchar_t * wide_c_str) {
|
||||
if (wide_c_str == NULL) return String("(null)");
|
||||
|
||||
StrStream ss;
|
||||
while (*wide_c_str) {
|
||||
ss << internal::ToUtf8String(*wide_c_str++);
|
||||
}
|
||||
|
||||
return internal::StrStreamToString(&ss);
|
||||
return String(internal::WideStringToUtf8(wide_c_str, -1).c_str());
|
||||
}
|
||||
|
||||
// Similar to ShowWideCString(), except that this function encloses
|
||||
|
@ -101,6 +101,7 @@ using testing::TPRT_NONFATAL_FAILURE;
|
||||
using testing::TPRT_SUCCESS;
|
||||
using testing::UnitTest;
|
||||
using testing::internal::AppendUserMessage;
|
||||
using testing::internal::CodePointToUtf8;
|
||||
using testing::internal::EqFailure;
|
||||
using testing::internal::FloatingPoint;
|
||||
using testing::internal::GTestFlagSaver;
|
||||
@ -111,8 +112,8 @@ using testing::internal::StreamableToString;
|
||||
using testing::internal::String;
|
||||
using testing::internal::TestProperty;
|
||||
using testing::internal::TestResult;
|
||||
using testing::internal::ToUtf8String;
|
||||
using testing::internal::UnitTestImpl;
|
||||
using testing::internal::WideStringToUtf8;
|
||||
|
||||
// This line tests that we can define tests in an unnamed namespace.
|
||||
namespace {
|
||||
@ -142,65 +143,184 @@ TEST(NullLiteralTest, IsFalseForNonNullLiterals) {
|
||||
}
|
||||
|
||||
#endif // __SYMBIAN32__
|
||||
// Tests ToUtf8String().
|
||||
//
|
||||
// Tests CodePointToUtf8().
|
||||
|
||||
// Tests that the NUL character L'\0' is encoded correctly.
|
||||
TEST(ToUtf8StringTest, CanEncodeNul) {
|
||||
EXPECT_STREQ("", ToUtf8String(L'\0').c_str());
|
||||
TEST(CodePointToUtf8Test, CanEncodeNul) {
|
||||
char buffer[32];
|
||||
EXPECT_STREQ("", CodePointToUtf8(L'\0', buffer));
|
||||
}
|
||||
|
||||
// Tests that ASCII characters are encoded correctly.
|
||||
TEST(ToUtf8StringTest, CanEncodeAscii) {
|
||||
EXPECT_STREQ("a", ToUtf8String(L'a').c_str());
|
||||
EXPECT_STREQ("Z", ToUtf8String(L'Z').c_str());
|
||||
EXPECT_STREQ("&", ToUtf8String(L'&').c_str());
|
||||
EXPECT_STREQ("\x7F", ToUtf8String(L'\x7F').c_str());
|
||||
TEST(CodePointToUtf8Test, CanEncodeAscii) {
|
||||
char buffer[32];
|
||||
EXPECT_STREQ("a", CodePointToUtf8(L'a', buffer));
|
||||
EXPECT_STREQ("Z", CodePointToUtf8(L'Z', buffer));
|
||||
EXPECT_STREQ("&", CodePointToUtf8(L'&', buffer));
|
||||
EXPECT_STREQ("\x7F", CodePointToUtf8(L'\x7F', buffer));
|
||||
}
|
||||
|
||||
// Tests that Unicode code-points that have 8 to 11 bits are encoded
|
||||
// as 110xxxxx 10xxxxxx.
|
||||
TEST(ToUtf8StringTest, CanEncode8To11Bits) {
|
||||
TEST(CodePointToUtf8Test, CanEncode8To11Bits) {
|
||||
char buffer[32];
|
||||
// 000 1101 0011 => 110-00011 10-010011
|
||||
EXPECT_STREQ("\xC3\x93", ToUtf8String(L'\xD3').c_str());
|
||||
EXPECT_STREQ("\xC3\x93", CodePointToUtf8(L'\xD3', buffer));
|
||||
|
||||
// 101 0111 0110 => 110-10101 10-110110
|
||||
EXPECT_STREQ("\xD5\xB6", ToUtf8String(L'\x576').c_str());
|
||||
EXPECT_STREQ("\xD5\xB6", CodePointToUtf8(L'\x576', buffer));
|
||||
}
|
||||
|
||||
// Tests that Unicode code-points that have 12 to 16 bits are encoded
|
||||
// as 1110xxxx 10xxxxxx 10xxxxxx.
|
||||
TEST(ToUtf8StringTest, CanEncode12To16Bits) {
|
||||
TEST(CodePointToUtf8Test, CanEncode12To16Bits) {
|
||||
char buffer[32];
|
||||
// 0000 1000 1101 0011 => 1110-0000 10-100011 10-010011
|
||||
EXPECT_STREQ("\xE0\xA3\x93", ToUtf8String(L'\x8D3').c_str());
|
||||
EXPECT_STREQ("\xE0\xA3\x93", CodePointToUtf8(L'\x8D3', buffer));
|
||||
|
||||
// 1100 0111 0100 1101 => 1110-1100 10-011101 10-001101
|
||||
EXPECT_STREQ("\xEC\x9D\x8D", ToUtf8String(L'\xC74D').c_str());
|
||||
EXPECT_STREQ("\xEC\x9D\x8D", CodePointToUtf8(L'\xC74D', buffer));
|
||||
}
|
||||
|
||||
#if !defined(GTEST_OS_WINDOWS) && !defined(GTEST_OS_CYGWIN) && \
|
||||
!defined(__SYMBIAN32__)
|
||||
|
||||
#ifndef GTEST_WIDE_STRING_USES_UTF16_
|
||||
// Tests in this group require a wchar_t to hold > 16 bits, and thus
|
||||
// are skipped on Windows, Cygwin, and Symbian, where a wchar_t is
|
||||
// 16-bit wide.
|
||||
// 16-bit wide. This code may not compile on those systems.
|
||||
|
||||
// Tests that Unicode code-points that have 17 to 21 bits are encoded
|
||||
// as 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
|
||||
TEST(ToUtf8StringTest, CanEncode17To21Bits) {
|
||||
TEST(CodePointToUtf8Test, CanEncode17To21Bits) {
|
||||
char buffer[32];
|
||||
// 0 0001 0000 1000 1101 0011 => 11110-000 10-010000 10-100011 10-010011
|
||||
EXPECT_STREQ("\xF0\x90\xA3\x93", ToUtf8String(L'\x108D3').c_str());
|
||||
EXPECT_STREQ("\xF0\x90\xA3\x93", CodePointToUtf8(L'\x108D3', buffer));
|
||||
|
||||
// 1 0111 1000 0110 0011 0100 => 11110-101 10-111000 10-011000 10-110100
|
||||
EXPECT_STREQ("\xF5\xB8\x98\xB4", ToUtf8String(L'\x178634').c_str());
|
||||
// 0 0001 0000 0100 0000 0000 => 11110-000 10-010000 10-010000 10-000000
|
||||
EXPECT_STREQ("\xF0\x90\x90\x80", CodePointToUtf8(L'\x10400', buffer));
|
||||
|
||||
// 1 0000 1000 0110 0011 0100 => 11110-100 10-001000 10-011000 10-110100
|
||||
EXPECT_STREQ("\xF4\x88\x98\xB4", CodePointToUtf8(L'\x108634', buffer));
|
||||
}
|
||||
|
||||
// Tests that encoding an invalid code-point generates the expected result.
|
||||
TEST(ToUtf8StringTest, CanEncodeInvalidCodePoint) {
|
||||
TEST(CodePointToUtf8Test, CanEncodeInvalidCodePoint) {
|
||||
char buffer[32];
|
||||
EXPECT_STREQ("(Invalid Unicode 0x1234ABCD)",
|
||||
ToUtf8String(L'\x1234ABCD').c_str());
|
||||
CodePointToUtf8(L'\x1234ABCD', buffer));
|
||||
}
|
||||
|
||||
#endif // Windows, Cygwin, or Symbian
|
||||
#endif // GTEST_WIDE_STRING_USES_UTF16_
|
||||
|
||||
// Tests WideStringToUtf8().
|
||||
|
||||
// Tests that the NUL character L'\0' is encoded correctly.
|
||||
TEST(WideStringToUtf8Test, CanEncodeNul) {
|
||||
EXPECT_STREQ("", WideStringToUtf8(L"", 0).c_str());
|
||||
EXPECT_STREQ("", WideStringToUtf8(L"", -1).c_str());
|
||||
}
|
||||
|
||||
// Tests that ASCII strings are encoded correctly.
|
||||
TEST(WideStringToUtf8Test, CanEncodeAscii) {
|
||||
EXPECT_STREQ("a", WideStringToUtf8(L"a", 1).c_str());
|
||||
EXPECT_STREQ("ab", WideStringToUtf8(L"ab", 2).c_str());
|
||||
EXPECT_STREQ("a", WideStringToUtf8(L"a", -1).c_str());
|
||||
EXPECT_STREQ("ab", WideStringToUtf8(L"ab", -1).c_str());
|
||||
}
|
||||
|
||||
// Tests that Unicode code-points that have 8 to 11 bits are encoded
|
||||
// as 110xxxxx 10xxxxxx.
|
||||
TEST(WideStringToUtf8Test, CanEncode8To11Bits) {
|
||||
// 000 1101 0011 => 110-00011 10-010011
|
||||
EXPECT_STREQ("\xC3\x93", WideStringToUtf8(L"\xD3", 1).c_str());
|
||||
EXPECT_STREQ("\xC3\x93", WideStringToUtf8(L"\xD3", -1).c_str());
|
||||
|
||||
// 101 0111 0110 => 110-10101 10-110110
|
||||
EXPECT_STREQ("\xD5\xB6", WideStringToUtf8(L"\x576", 1).c_str());
|
||||
EXPECT_STREQ("\xD5\xB6", WideStringToUtf8(L"\x576", -1).c_str());
|
||||
}
|
||||
|
||||
// Tests that Unicode code-points that have 12 to 16 bits are encoded
|
||||
// as 1110xxxx 10xxxxxx 10xxxxxx.
|
||||
TEST(WideStringToUtf8Test, CanEncode12To16Bits) {
|
||||
// 0000 1000 1101 0011 => 1110-0000 10-100011 10-010011
|
||||
EXPECT_STREQ("\xE0\xA3\x93", WideStringToUtf8(L"\x8D3", 1).c_str());
|
||||
EXPECT_STREQ("\xE0\xA3\x93", WideStringToUtf8(L"\x8D3", -1).c_str());
|
||||
|
||||
// 1100 0111 0100 1101 => 1110-1100 10-011101 10-001101
|
||||
EXPECT_STREQ("\xEC\x9D\x8D", WideStringToUtf8(L"\xC74D", 1).c_str());
|
||||
EXPECT_STREQ("\xEC\x9D\x8D", WideStringToUtf8(L"\xC74D", -1).c_str());
|
||||
}
|
||||
|
||||
// Tests that the conversion stops when the function encounters \0 character.
|
||||
TEST(WideStringToUtf8Test, StopsOnNulCharacter) {
|
||||
EXPECT_STREQ("ABC", WideStringToUtf8(L"ABC\0XYZ", 100).c_str());
|
||||
}
|
||||
|
||||
// Tests that the conversion stops when the function reaches the limit
|
||||
// specified by the 'length' parameter.
|
||||
TEST(WideStringToUtf8Test, StopsWhenLengthLimitReached) {
|
||||
EXPECT_STREQ("ABC", WideStringToUtf8(L"ABCDEF", 3).c_str());
|
||||
}
|
||||
|
||||
|
||||
#ifndef GTEST_WIDE_STRING_USES_UTF16_
|
||||
// Tests that Unicode code-points that have 17 to 21 bits are encoded
|
||||
// as 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. This code may not compile
|
||||
// on the systems using UTF-16 encoding.
|
||||
TEST(WideStringToUtf8Test, CanEncode17To21Bits) {
|
||||
// 0 0001 0000 1000 1101 0011 => 11110-000 10-010000 10-100011 10-010011
|
||||
EXPECT_STREQ("\xF0\x90\xA3\x93", WideStringToUtf8(L"\x108D3", 1).c_str());
|
||||
EXPECT_STREQ("\xF0\x90\xA3\x93", WideStringToUtf8(L"\x108D3", -1).c_str());
|
||||
|
||||
// 1 0000 1000 0110 0011 0100 => 11110-100 10-001000 10-011000 10-110100
|
||||
EXPECT_STREQ("\xF4\x88\x98\xB4", WideStringToUtf8(L"\x108634", 1).c_str());
|
||||
EXPECT_STREQ("\xF4\x88\x98\xB4", WideStringToUtf8(L"\x108634", -1).c_str());
|
||||
}
|
||||
|
||||
// Tests that encoding an invalid code-point generates the expected result.
|
||||
TEST(WideStringToUtf8Test, CanEncodeInvalidCodePoint) {
|
||||
EXPECT_STREQ("(Invalid Unicode 0xABCDFF)",
|
||||
WideStringToUtf8(L"\xABCDFF", -1).c_str());
|
||||
}
|
||||
#else
|
||||
// Tests that surrogate pairs are encoded correctly on the systems using
|
||||
// UTF-16 encoding in the wide strings.
|
||||
TEST(WideStringToUtf8Test, CanEncodeValidUtf16SUrrogatePairs) {
|
||||
EXPECT_STREQ("\xF0\x90\x90\x80",
|
||||
WideStringToUtf8(L"\xD801\xDC00", -1).c_str());
|
||||
}
|
||||
|
||||
// Tests that encoding an invalid UTF-16 surrogate pair
|
||||
// generates the expected result.
|
||||
TEST(WideStringToUtf8Test, CanEncodeInvalidUtf16SurrogatePair) {
|
||||
// Leading surrogate is at the end of the string.
|
||||
EXPECT_STREQ("\xED\xA0\x80", WideStringToUtf8(L"\xD800", -1).c_str());
|
||||
// Leading surrogate is not followed by the trailing surrogate.
|
||||
EXPECT_STREQ("\xED\xA0\x80$", WideStringToUtf8(L"\xD800$", -1).c_str());
|
||||
// Trailing surrogate appears without a leading surrogate.
|
||||
EXPECT_STREQ("\xED\xB0\x80PQR", WideStringToUtf8(L"\xDC00PQR", -1).c_str());
|
||||
}
|
||||
#endif // GTEST_WIDE_STRING_USES_UTF16_
|
||||
|
||||
// Tests that codepoint concatenation works correctly.
|
||||
#ifndef GTEST_WIDE_STRING_USES_UTF16_
|
||||
TEST(WideStringToUtf8Test, ConcatenatesCodepointsCorrectly) {
|
||||
EXPECT_STREQ(
|
||||
"\xF4\x88\x98\xB4"
|
||||
"\xEC\x9D\x8D"
|
||||
"\n"
|
||||
"\xD5\xB6"
|
||||
"\xE0\xA3\x93"
|
||||
"\xF4\x88\x98\xB4",
|
||||
WideStringToUtf8(L"\x108634\xC74D\n\x576\x8D3\x108634", -1).c_str());
|
||||
}
|
||||
#else
|
||||
TEST(WideStringToUtf8Test, ConcatenatesCodepointsCorrectly) {
|
||||
EXPECT_STREQ(
|
||||
"\xEC\x9D\x8D" "\n" "\xD5\xB6" "\xE0\xA3\x93",
|
||||
WideStringToUtf8(L"\xC74D\n\x576\x8D3", -1).c_str());
|
||||
}
|
||||
#endif // GTEST_WIDE_STRING_USES_UTF16_
|
||||
|
||||
// Tests the List template class.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user