diff options
Diffstat (limited to 'src/zencore/string.cpp')
| -rw-r--r-- | src/zencore/string.cpp | 131 |
1 file changed, 122 insertions, 9 deletions
diff --git a/src/zencore/string.cpp b/src/zencore/string.cpp index a9aed6309..ab1c7de58 100644 --- a/src/zencore/string.cpp +++ b/src/zencore/string.cpp @@ -4,6 +4,7 @@ #include <zencore/memoryview.h> #include <zencore/string.h> #include <zencore/testing.h> +#include <zencore/testutils.h> #include <inttypes.h> #include <math.h> @@ -184,7 +185,21 @@ Utf8ToWide(const std::u8string_view& Str8, WideStringBuilderBase& OutString) if (!ByteCount) { +#if ZEN_SIZEOF_WCHAR_T == 2 + if (CurrentOutChar > 0xFFFF) + { + // Supplementary plane: emit a UTF-16 surrogate pair + uint32_t Adjusted = uint32_t(CurrentOutChar - 0x10000); + OutString.Append(wchar_t(0xD800 + (Adjusted >> 10))); + OutString.Append(wchar_t(0xDC00 + (Adjusted & 0x3FF))); + } + else + { + OutString.Append(wchar_t(CurrentOutChar)); + } +#else OutString.Append(wchar_t(CurrentOutChar)); +#endif CurrentOutChar = 0; } } @@ -967,33 +982,131 @@ TEST_CASE("ExtendableWideStringBuilder") TEST_CASE("utf8") { + using namespace utf8test; + SUBCASE("utf8towide") { - // TODO: add more extensive testing here - this covers a very small space - WideStringBuilder<32> wout; Utf8ToWide(u8"abcdefghi", wout); CHECK(StringEquals(L"abcdefghi", wout.c_str())); wout.Reset(); + Utf8ToWide(u8"abc\xC3\xA4\xC3\xB6\xC3\xBC", wout); + CHECK(StringEquals(L"abc\u00E4\u00F6\u00FC", wout.c_str())); + + wout.Reset(); + Utf8ToWide(std::string_view(kLatin), wout); + CHECK(StringEquals(kLatinW, wout.c_str())); + + wout.Reset(); + Utf8ToWide(std::string_view(kCyrillic), wout); + CHECK(StringEquals(kCyrillicW, wout.c_str())); + + wout.Reset(); + Utf8ToWide(std::string_view(kCJK), wout); + CHECK(StringEquals(kCJKW, wout.c_str())); + + wout.Reset(); + Utf8ToWide(std::string_view(kMixed), wout); + CHECK(StringEquals(kMixedW, wout.c_str())); - Utf8ToWide(u8"abc���", wout); - CHECK(StringEquals(L"abc���", wout.c_str())); + wout.Reset(); + Utf8ToWide(std::string_view(kEmoji), wout); + CHECK(StringEquals(kEmojiW, wout.c_str())); } SUBCASE("widetoutf8") { - // 
TODO: add more extensive testing here - this covers a very small space - - StringBuilder<32> out; + StringBuilder<64> out; WideToUtf8(L"abcdefghi", out); CHECK(StringEquals("abcdefghi", out.c_str())); out.Reset(); + WideToUtf8(kLatinW, out); + CHECK(StringEquals(kLatin, out.c_str())); - WideToUtf8(L"abc���", out); - CHECK(StringEquals(u8"abc���", out.c_str())); + out.Reset(); + WideToUtf8(kCyrillicW, out); + CHECK(StringEquals(kCyrillic, out.c_str())); + + out.Reset(); + WideToUtf8(kCJKW, out); + CHECK(StringEquals(kCJK, out.c_str())); + + out.Reset(); + WideToUtf8(kMixedW, out); + CHECK(StringEquals(kMixed, out.c_str())); + + out.Reset(); + WideToUtf8(kEmojiW, out); + CHECK(StringEquals(kEmoji, out.c_str())); + } + + SUBCASE("roundtrip") + { + // UTF-8 -> Wide -> UTF-8 identity + const char* Utf8Strings[] = {kLatin, kCyrillic, kCJK, kMixed, kEmoji}; + for (const char* Utf8Str : Utf8Strings) + { + ExtendableWideStringBuilder<64> Wide; + Utf8ToWide(std::string_view(Utf8Str), Wide); + + ExtendableStringBuilder<64> Back; + WideToUtf8(std::wstring_view(Wide.c_str()), Back); + CHECK(StringEquals(Utf8Str, Back.c_str())); + } + + // Wide -> UTF-8 -> Wide identity + const wchar_t* WideStrings[] = {kLatinW, kCyrillicW, kCJKW, kMixedW, kEmojiW}; + for (const wchar_t* WideStr : WideStrings) + { + ExtendableStringBuilder<64> Utf8; + WideToUtf8(std::wstring_view(WideStr), Utf8); + + ExtendableWideStringBuilder<64> Back; + Utf8ToWide(std::string_view(Utf8.c_str()), Back); + CHECK(StringEquals(WideStr, Back.c_str())); + } + + // Empty string round-trip + { + ExtendableWideStringBuilder<8> Wide; + Utf8ToWide(std::string_view(""), Wide); + CHECK(Wide.Size() == 0); + + ExtendableStringBuilder<8> Narrow; + WideToUtf8(std::wstring_view(L""), Narrow); + CHECK(Narrow.Size() == 0); + } + } + + SUBCASE("IsValidUtf8") + { + // Valid inputs + CHECK(IsValidUtf8("")); + CHECK(IsValidUtf8("hello world")); + CHECK(IsValidUtf8(kLatin)); + CHECK(IsValidUtf8(kCyrillic)); + 
CHECK(IsValidUtf8(kCJK)); + CHECK(IsValidUtf8(kMixed)); + CHECK(IsValidUtf8(kEmoji)); + + // Invalid: truncated 2-byte sequence + CHECK(!IsValidUtf8(std::string_view("\xC3", 1))); + + // Invalid: truncated 3-byte sequence + CHECK(!IsValidUtf8(std::string_view("\xE6\x97", 2))); + + // Invalid: truncated 4-byte sequence + CHECK(!IsValidUtf8(std::string_view("\xF0\x9F\x93", 3))); + + // Invalid: bad start byte + CHECK(!IsValidUtf8(std::string_view("\xFF", 1))); + CHECK(!IsValidUtf8(std::string_view("\xFE", 1))); + + // Invalid: overlong encoding of '/' (U+002F) + CHECK(!IsValidUtf8(std::string_view("\xC0\xAF", 2))); } } |