1 files changed, 190 insertions, 20 deletions
diff --git a/src/zencore/string.cpp b/src/zencore/string.cpp
index 0ee863b74..ed0ba6f46 100644
--- a/src/zencore/string.cpp
+++ b/src/zencore/string.cpp
@@ -4,6 +4,7 @@
 #include <zencore/memoryview.h>
 #include <zencore/string.h>
 #include <zencore/testing.h>
+#include <zencore/testutils.h>
 
 #include <inttypes.h>
 #include <math.h>
@@ -24,6 +25,10 @@ utf16to8_impl(u16bit_iterator StartIt, u16bit_iterator EndIt, ::zen::StringBuild
 		// Take care of surrogate pairs first
 		if (utf8::internal::is_lead_surrogate(cp))
 		{
+			if (StartIt == EndIt)
+			{
+				break;
+			}
 			uint32_t trail_surrogate = utf8::internal::mask16(*StartIt++);
 			cp						 = (cp << 10) + trail_surrogate + utf8::internal::SURROGATE_OFFSET;
 		}
@@ -180,7 +185,21 @@ Utf8ToWide(const std::u8string_view& Str8, WideStringBuilderBase& OutString)
 
 			if (!ByteCount)
 			{
+#if ZEN_SIZEOF_WCHAR_T == 2
+				if (CurrentOutChar > 0xFFFF)
+				{
+					// Supplementary plane: emit a UTF-16 surrogate pair
+					uint32_t Adjusted = uint32_t(CurrentOutChar - 0x10000);
+					OutString.Append(wchar_t(0xD800 + (Adjusted >> 10)));
+					OutString.Append(wchar_t(0xDC00 + (Adjusted & 0x3FF)));
+				}
+				else
+				{
+					OutString.Append(wchar_t(CurrentOutChar));
+				}
+#else
 				OutString.Append(wchar_t(CurrentOutChar));
+#endif
 				CurrentOutChar = 0;
 			}
 		}
@@ -249,6 +268,17 @@ namespace {
 		/* kNicenumTime  */ 1000};
 }  // namespace
 
+uint64_t
+IntPow(uint64_t Base, int Exp)	// exact integer power Base^Exp
+{
+	uint64_t Product = 1;  // result for Exp <= 0: the loop below never runs
+	for (; Exp > 0; --Exp)
+	{
+		Product *= Base;  // unsigned multiply, so an oversized power wraps modulo 2^64
+	}
+	return Product;
+}
+
 /*
  * Convert a number to an appropriately human-readable output.
  */
@@ -296,7 +326,7 @@ NiceNumGeneral(uint64_t Num, std::span<char> Buffer, NicenumFormat Format)
 
 	const char* u = UnitStrings[Format][Index];
 
-	if ((Index == 0) || ((Num % (uint64_t)powl((int)KiloUnit[Format], Index)) == 0))
+	if ((Index == 0) || ((Num % IntPow(KiloUnit[Format], Index)) == 0))
 	{
 		/*
 		 * If this is an even multiple of the base, always display
@@ -320,7 +350,7 @@ NiceNumGeneral(uint64_t Num, std::span<char> Buffer, NicenumFormat Format)
 
 		for (int i = 2; i >= 0; i--)
 		{
-			double Value = (double)Num / (uint64_t)powl((int)KiloUnit[Format], Index);
+			double Value = (double)Num / IntPow(KiloUnit[Format], Index);
 
 			/*
 			 * Don't print floating point values for time.  Note,
@@ -520,13 +550,38 @@ UrlDecode(std::string_view InUrl)
 	return std::string(Url.ToView());
 }
 
-//////////////////////////////////////////////////////////////////////////
-//
-// Unit tests
-//
+std::string
+HideSensitiveString(std::string_view String)  // masked form: optional 4-char prefix, up to four 'X', then "..." if anything was dropped
+{
+	const size_t	  Length	   = String.length();
+	const size_t	  SourceLength = Length > 16 ? 4 : 0;  // only inputs longer than 16 characters reveal their first 4
+	const size_t	  PadLength	   = Min(Length - SourceLength, 4u);  // number of 'X' placeholders, never more than 4
+	const bool		  AddEllipsis  = (SourceLength + PadLength) < Length;  // true when some characters are not represented
+	StringBuilder<16> SB;  // longest possible result is 4 + 4 + 3 = 11 characters
+	if (SourceLength > 0)
+	{
+		SB << String.substr(0, SourceLength);
+	}
+	if (PadLength > 0)
+	{
+		SB << std::string(PadLength, 'X');
+	}
+	if (AddEllipsis)
+	{
+		SB << "...";
+	}
+	return SB.ToString();
+}
+
+//////////////////////////////////////////////////////////////////////////
+//
+// Unit tests
+//
 
 #if ZEN_WITH_TESTS
 
+TEST_SUITE_BEGIN("core.string");
+
 TEST_CASE("url")
 {
 	using namespace std::literals;
@@ -793,11 +848,6 @@ TEST_CASE("niceNum")
 	}
 }
 
-void
-string_forcelink()
-{
-}
-
 TEST_CASE("StringBuilder")
 {
 	StringBuilder<64> sb;
@@ -963,33 +1013,131 @@ TEST_CASE("ExtendableWideStringBuilder")
 
 TEST_CASE("utf8")
 {
+	using namespace utf8test;
+
 	SUBCASE("utf8towide")
 	{
-		// TODO: add more extensive testing here - this covers a very small space
-
 		WideStringBuilder<32> wout;
 		Utf8ToWide(u8"abcdefghi", wout);
 		CHECK(StringEquals(L"abcdefghi", wout.c_str()));
 
 		wout.Reset();
+		Utf8ToWide(u8"abc\xC3\xA4\xC3\xB6\xC3\xBC", wout);  // C3 A4, C3 B6, C3 BC are the UTF-8 encodings of U+00E4, U+00F6, U+00FC
+		CHECK(StringEquals(L"abc\u00E4\u00F6\u00FC", wout.c_str()));
+
+		wout.Reset();
+		Utf8ToWide(std::string_view(kLatin), wout);  // NOTE(review): kLatin..kEmoji and their k*W twins are presumably the utf8test fixtures -- confirm
+		CHECK(StringEquals(kLatinW, wout.c_str()));
+
+		wout.Reset();
+		Utf8ToWide(std::string_view(kCyrillic), wout);
+		CHECK(StringEquals(kCyrillicW, wout.c_str()));
+
+		wout.Reset();
+		Utf8ToWide(std::string_view(kCJK), wout);
+		CHECK(StringEquals(kCJKW, wout.c_str()));
+
+		wout.Reset();
+		Utf8ToWide(std::string_view(kMixed), wout);
+		CHECK(StringEquals(kMixedW, wout.c_str()));
 
-		Utf8ToWide(u8"abc���", wout);
-		CHECK(StringEquals(L"abc���", wout.c_str()));
+		wout.Reset();
+		Utf8ToWide(std::string_view(kEmoji), wout);  // NOTE(review): presumably the fixture with code points above U+FFFF -- confirm
+		CHECK(StringEquals(kEmojiW, wout.c_str()));
 	}
 
 	SUBCASE("widetoutf8")
 	{
-		// TODO: add more extensive testing here - this covers a very small space
-
-		StringBuilder<32> out;
+		StringBuilder<64> out;  // NOTE(review): assumes every UTF-8 fixture used below fits in 64 chars -- confirm
 
 		WideToUtf8(L"abcdefghi", out);
 		CHECK(StringEquals("abcdefghi", out.c_str()));
 
 		out.Reset();
+		WideToUtf8(kLatinW, out);  // each wide fixture must encode to exactly its narrow counterpart
+		CHECK(StringEquals(kLatin, out.c_str()));
+
+		out.Reset();
+		WideToUtf8(kCyrillicW, out);
+		CHECK(StringEquals(kCyrillic, out.c_str()));
+
+		out.Reset();
+		WideToUtf8(kCJKW, out);
+		CHECK(StringEquals(kCJK, out.c_str()));
+
+		out.Reset();
+		WideToUtf8(kMixedW, out);
+		CHECK(StringEquals(kMixed, out.c_str()));
 
-		WideToUtf8(L"abc���", out);
-		CHECK(StringEquals(u8"abc���", out.c_str()));
+		out.Reset();
+		WideToUtf8(kEmojiW, out);
+		CHECK(StringEquals(kEmoji, out.c_str()));
+	}
+
+	SUBCASE("roundtrip")
+	{
+		// Narrow fixtures must survive UTF-8 -> wide -> UTF-8 unchanged
+		const char* NarrowFixtures[] = {kLatin, kCyrillic, kCJK, kMixed, kEmoji};
+		for (const char* Expected : NarrowFixtures)
+		{
+			ExtendableWideStringBuilder<64> Widened;
+			ExtendableStringBuilder<64>     Narrowed;
+
+			Utf8ToWide(std::string_view(Expected), Widened);
+			WideToUtf8(std::wstring_view(Widened.c_str()), Narrowed);
+			CHECK(StringEquals(Expected, Narrowed.c_str()));
+		}
+
+		// Wide fixtures must survive wide -> UTF-8 -> wide unchanged
+		const wchar_t* WideFixtures[] = {kLatinW, kCyrillicW, kCJKW, kMixedW, kEmojiW};
+		for (const wchar_t* Expected : WideFixtures)
+		{
+			ExtendableStringBuilder<64>     Narrowed;
+			ExtendableWideStringBuilder<64> Widened;
+
+			WideToUtf8(std::wstring_view(Expected), Narrowed);
+			Utf8ToWide(std::string_view(Narrowed.c_str()), Widened);
+			CHECK(StringEquals(Expected, Widened.c_str()));
+		}
+
+		// Converting an empty string must yield empty output in both directions
+		{
+			ExtendableWideStringBuilder<8> EmptyWide;
+			ExtendableStringBuilder<8>     EmptyNarrow;
+
+			Utf8ToWide(std::string_view(""), EmptyWide);
+			WideToUtf8(std::wstring_view(L""), EmptyNarrow);
+			CHECK(EmptyWide.Size() == 0);
+			CHECK(EmptyNarrow.Size() == 0);
+		}
+	}
+
+	SUBCASE("IsValidUtf8")
+	{
+		// Accepted: empty input, plain ASCII, and each of the multi-byte fixture strings
+		CHECK(IsValidUtf8(""));
+		CHECK(IsValidUtf8("hello world"));
+		CHECK(IsValidUtf8(kLatin));
+		CHECK(IsValidUtf8(kCyrillic));
+		CHECK(IsValidUtf8(kCJK));
+		CHECK(IsValidUtf8(kMixed));
+		CHECK(IsValidUtf8(kEmoji));
+
+		// Rejected: 0xC3 opens a 2-byte sequence but no continuation byte follows
+		CHECK(!IsValidUtf8(std::string_view("\xC3", 1)));
+
+		// Rejected: 0xE6 opens a 3-byte sequence but only one continuation byte follows
+		CHECK(!IsValidUtf8(std::string_view("\xE6\x97", 2)));
+
+		// Rejected: 0xF0 opens a 4-byte sequence but only two continuation bytes follow
+		CHECK(!IsValidUtf8(std::string_view("\xF0\x9F\x93", 3)));
+
+		// Rejected: 0xFF and 0xFE are never valid bytes in UTF-8
+		CHECK(!IsValidUtf8(std::string_view("\xFF", 1)));
+		CHECK(!IsValidUtf8(std::string_view("\xFE", 1)));
+
+		// Rejected: 0xC0 0xAF is an overlong 2-byte encoding of '/' (U+002F)
+		CHECK(!IsValidUtf8(std::string_view("\xC0\xAF", 2)));
 	}
 }
 
@@ -1105,6 +1253,28 @@ TEST_CASE("string")
 	}
 }
 
+TEST_CASE("hidesensitivestring")
+{
+	using namespace std::literals;
+	// Inputs of 16 characters or fewer are masked entirely; longer ones keep only their first four characters
+	CHECK(HideSensitiveString(""sv) == ""sv);
+	CHECK(HideSensitiveString("A"sv) == "X"sv);
+	CHECK(HideSensitiveString("ABCD"sv) == "XXXX"sv);
+	CHECK(HideSensitiveString("ABCDE"sv) == "XXXX..."sv);
+	CHECK(HideSensitiveString("ABCDEFGH"sv) == "XXXX..."sv);
+	CHECK(HideSensitiveString("ABCDEFGHIJKLMNOP"sv) == "XXXX..."sv);
+	CHECK(HideSensitiveString("ABCDEFGHIJKLMNOPQ"sv) == "ABCDXXXX..."sv);
+	CHECK(HideSensitiveString("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345"sv) == "ABCDXXXX..."sv);
+	CHECK(HideSensitiveString("1234567890123456789"sv) == "1234XXXX..."sv);
+}
+
+TEST_SUITE_END();
+
+void
+string_forcelink()	// NOTE(review): empty on purpose; presumably referenced from elsewhere so this file's tests get linked in -- confirm
+{
+}
+
 #endif
 
 }  // namespace zen