1 files changed, 122 insertions, 9 deletions
diff --git a/src/zencore/string.cpp b/src/zencore/string.cpp
index a9aed6309..ab1c7de58 100644
--- a/src/zencore/string.cpp
+++ b/src/zencore/string.cpp
@@ -4,6 +4,7 @@
 #include <zencore/memoryview.h>
 #include <zencore/string.h>
 #include <zencore/testing.h>
+#include <zencore/testutils.h>
 
 #include <inttypes.h>
 #include <math.h>
@@ -184,7 +185,21 @@ Utf8ToWide(const std::u8string_view& Str8, WideStringBuilderBase& OutString)
 
 			if (!ByteCount)
 			{
+#if ZEN_SIZEOF_WCHAR_T == 2
+				if (CurrentOutChar > 0xFFFF)
+				{
+					// Supplementary plane: emit a UTF-16 surrogate pair
+					uint32_t Adjusted = uint32_t(CurrentOutChar - 0x10000);
+					OutString.Append(wchar_t(0xD800 + (Adjusted >> 10)));
+					OutString.Append(wchar_t(0xDC00 + (Adjusted & 0x3FF)));
+				}
+				else
+				{
+					OutString.Append(wchar_t(CurrentOutChar));
+				}
+#else
 				OutString.Append(wchar_t(CurrentOutChar));
+#endif
 				CurrentOutChar = 0;
 			}
 		}
@@ -967,33 +982,131 @@ TEST_CASE("ExtendableWideStringBuilder")
 
 TEST_CASE("utf8")
 {
+	using namespace utf8test;  // NOTE(review): kLatin/kCyrillic/kCJK/kMixed/kEmoji and their *W counterparts are presumably declared in utf8test via zencore/testutils.h - confirm
+
 	SUBCASE("utf8towide")
 	{
-		// TODO: add more extensive testing here - this covers a very small space
-
 		WideStringBuilder<32> wout;
 		Utf8ToWide(u8"abcdefghi", wout);
 		CHECK(StringEquals(L"abcdefghi", wout.c_str()));
 
 		wout.Reset();
+		Utf8ToWide(u8"abc\xC3\xA4\xC3\xB6\xC3\xBC", wout);	// explicit UTF-8 bytes for U+00E4, U+00F6, U+00FC, independent of source-file encoding
+		CHECK(StringEquals(L"abc\u00E4\u00F6\u00FC", wout.c_str()));
+
+		wout.Reset();
+		Utf8ToWide(std::string_view(kLatin), wout);
+		CHECK(StringEquals(kLatinW, wout.c_str()));
+
+		wout.Reset();
+		Utf8ToWide(std::string_view(kCyrillic), wout);
+		CHECK(StringEquals(kCyrillicW, wout.c_str()));
+
+		wout.Reset();
+		Utf8ToWide(std::string_view(kCJK), wout);
+		CHECK(StringEquals(kCJKW, wout.c_str()));
+
+		wout.Reset();
+		Utf8ToWide(std::string_view(kMixed), wout);
+		CHECK(StringEquals(kMixedW, wout.c_str()));
 
-		Utf8ToWide(u8"abc���", wout);
-		CHECK(StringEquals(L"abc���", wout.c_str()));
+		wout.Reset();
+		Utf8ToWide(std::string_view(kEmoji), wout);	 // NOTE(review): presumably covers the surrogate-pair path when wchar_t is 16-bit; depends on kEmoji holding code points above U+FFFF - confirm
+		CHECK(StringEquals(kEmojiW, wout.c_str()));
 	}
 
 	SUBCASE("widetoutf8")
 	{
-		// TODO: add more extensive testing here - this covers a very small space
-
-		StringBuilder<32> out;
+		StringBuilder<64> out;	// NOTE(review): presumably sized to hold the longest UTF-8 fixture - confirm against the utf8test constants
 
 		WideToUtf8(L"abcdefghi", out);
 		CHECK(StringEquals("abcdefghi", out.c_str()));
 
 		out.Reset();
+		WideToUtf8(kLatinW, out);
+		CHECK(StringEquals(kLatin, out.c_str()));
 
-		WideToUtf8(L"abc���", out);
-		CHECK(StringEquals(u8"abc���", out.c_str()));
+		out.Reset();
+		WideToUtf8(kCyrillicW, out);
+		CHECK(StringEquals(kCyrillic, out.c_str()));
+
+		out.Reset();
+		WideToUtf8(kCJKW, out);
+		CHECK(StringEquals(kCJK, out.c_str()));
+
+		out.Reset();
+		WideToUtf8(kMixedW, out);
+		CHECK(StringEquals(kMixed, out.c_str()));
+
+		out.Reset();
+		WideToUtf8(kEmojiW, out);
+		CHECK(StringEquals(kEmoji, out.c_str()));
+	}
+
+	SUBCASE("roundtrip")
+	{
+		// UTF-8 -> wide -> UTF-8 must reproduce the original string for every fixture
+		const char* Utf8Strings[] = {kLatin, kCyrillic, kCJK, kMixed, kEmoji};
+		for (const char* Utf8Str : Utf8Strings)
+		{
+			ExtendableWideStringBuilder<64> Wide;
+			Utf8ToWide(std::string_view(Utf8Str), Wide);
+
+			ExtendableStringBuilder<64> Back;
+			WideToUtf8(std::wstring_view(Wide.c_str()), Back);
+			CHECK(StringEquals(Utf8Str, Back.c_str()));
+		}
+
+		// Wide -> UTF-8 -> wide must reproduce the original string for every fixture
+		const wchar_t* WideStrings[] = {kLatinW, kCyrillicW, kCJKW, kMixedW, kEmojiW};
+		for (const wchar_t* WideStr : WideStrings)
+		{
+			ExtendableStringBuilder<64> Utf8;
+			WideToUtf8(std::wstring_view(WideStr), Utf8);
+
+			ExtendableWideStringBuilder<64> Back;
+			Utf8ToWide(std::string_view(Utf8.c_str()), Back);
+			CHECK(StringEquals(WideStr, Back.c_str()));
+		}
+
+		// Empty input: each direction is converted on its own and must leave the builder empty
+		{
+			ExtendableWideStringBuilder<8> Wide;
+			Utf8ToWide(std::string_view(""), Wide);
+			CHECK(Wide.Size() == 0);
+
+			ExtendableStringBuilder<8> Narrow;
+			WideToUtf8(std::wstring_view(L""), Narrow);
+			CHECK(Narrow.Size() == 0);
+		}
+	}
+
+	SUBCASE("IsValidUtf8")
+	{
+		// Valid inputs: the empty string, plain ASCII, and each multi-byte fixture
+		CHECK(IsValidUtf8(""));
+		CHECK(IsValidUtf8("hello world"));
+		CHECK(IsValidUtf8(kLatin));
+		CHECK(IsValidUtf8(kCyrillic));
+		CHECK(IsValidUtf8(kCJK));
+		CHECK(IsValidUtf8(kMixed));
+		CHECK(IsValidUtf8(kEmoji));
+
+		// Invalid: truncated 2-byte sequence (lead byte 0xC3 with no continuation byte)
+		CHECK(!IsValidUtf8(std::string_view("\xC3", 1)));
+
+		// Invalid: truncated 3-byte sequence (lead byte 0xE6 with one of two continuation bytes)
+		CHECK(!IsValidUtf8(std::string_view("\xE6\x97", 2)));
+
+		// Invalid: truncated 4-byte sequence (lead byte 0xF0 with two of three continuation bytes)
+		CHECK(!IsValidUtf8(std::string_view("\xF0\x9F\x93", 3)));
+
+		// Invalid: bad start byte (0xFF and 0xFE never occur in well-formed UTF-8)
+		CHECK(!IsValidUtf8(std::string_view("\xFF", 1)));
+		CHECK(!IsValidUtf8(std::string_view("\xFE", 1)));
+
+		// Invalid: overlong encoding of '/' (U+002F) as the two bytes C0 AF instead of the single byte 2F
+		CHECK(!IsValidUtf8(std::string_view("\xC0\xAF", 2)));
 	}
 }