aboutsummaryrefslogtreecommitdiff
path: root/src/zencore/string.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/zencore/string.cpp')
-rw-r--r-- src/zencore/string.cpp 131
1 file changed, 122 insertions, 9 deletions
diff --git a/src/zencore/string.cpp b/src/zencore/string.cpp
index a9aed6309..ab1c7de58 100644
--- a/src/zencore/string.cpp
+++ b/src/zencore/string.cpp
@@ -4,6 +4,7 @@
#include <zencore/memoryview.h>
#include <zencore/string.h>
#include <zencore/testing.h>
+#include <zencore/testutils.h>
#include <inttypes.h>
#include <math.h>
@@ -184,7 +185,21 @@ Utf8ToWide(const std::u8string_view& Str8, WideStringBuilderBase& OutString)
if (!ByteCount)
{
+#if ZEN_SIZEOF_WCHAR_T == 2
+ if (CurrentOutChar > 0xFFFF)
+ {
+ // Supplementary plane: emit a UTF-16 surrogate pair
+ uint32_t Adjusted = uint32_t(CurrentOutChar - 0x10000);
+ OutString.Append(wchar_t(0xD800 + (Adjusted >> 10)));
+ OutString.Append(wchar_t(0xDC00 + (Adjusted & 0x3FF)));
+ }
+ else
+ {
+ OutString.Append(wchar_t(CurrentOutChar));
+ }
+#else
OutString.Append(wchar_t(CurrentOutChar));
+#endif
CurrentOutChar = 0;
}
}
@@ -967,33 +982,131 @@ TEST_CASE("ExtendableWideStringBuilder")
TEST_CASE("utf8")
{
+ using namespace utf8test;
+
SUBCASE("utf8towide")
{
- // TODO: add more extensive testing here - this covers a very small space
-
WideStringBuilder<32> wout;
Utf8ToWide(u8"abcdefghi", wout);
CHECK(StringEquals(L"abcdefghi", wout.c_str()));
wout.Reset();
+ Utf8ToWide(u8"abc\xC3\xA4\xC3\xB6\xC3\xBC", wout);
+ CHECK(StringEquals(L"abc\u00E4\u00F6\u00FC", wout.c_str()));
+
+ wout.Reset();
+ Utf8ToWide(std::string_view(kLatin), wout);
+ CHECK(StringEquals(kLatinW, wout.c_str()));
+
+ wout.Reset();
+ Utf8ToWide(std::string_view(kCyrillic), wout);
+ CHECK(StringEquals(kCyrillicW, wout.c_str()));
+
+ wout.Reset();
+ Utf8ToWide(std::string_view(kCJK), wout);
+ CHECK(StringEquals(kCJKW, wout.c_str()));
+
+ wout.Reset();
+ Utf8ToWide(std::string_view(kMixed), wout);
+ CHECK(StringEquals(kMixedW, wout.c_str()));
- Utf8ToWide(u8"abcäöü", wout);
- CHECK(StringEquals(L"abcäöü", wout.c_str()));
+ wout.Reset();
+ Utf8ToWide(std::string_view(kEmoji), wout);
+ CHECK(StringEquals(kEmojiW, wout.c_str()));
}
SUBCASE("widetoutf8")
{
- // TODO: add more extensive testing here - this covers a very small space
-
- StringBuilder<32> out;
+ StringBuilder<64> out;
WideToUtf8(L"abcdefghi", out);
CHECK(StringEquals("abcdefghi", out.c_str()));
out.Reset();
+ WideToUtf8(kLatinW, out);
+ CHECK(StringEquals(kLatin, out.c_str()));
- WideToUtf8(L"abcäöü", out);
- CHECK(StringEquals(u8"abcäöü", out.c_str()));
+ out.Reset();
+ WideToUtf8(kCyrillicW, out);
+ CHECK(StringEquals(kCyrillic, out.c_str()));
+
+ out.Reset();
+ WideToUtf8(kCJKW, out);
+ CHECK(StringEquals(kCJK, out.c_str()));
+
+ out.Reset();
+ WideToUtf8(kMixedW, out);
+ CHECK(StringEquals(kMixed, out.c_str()));
+
+ out.Reset();
+ WideToUtf8(kEmojiW, out);
+ CHECK(StringEquals(kEmoji, out.c_str()));
+ }
+
+ SUBCASE("roundtrip")
+ {
+ // UTF-8 -> Wide -> UTF-8 identity
+ const char* Utf8Strings[] = {kLatin, kCyrillic, kCJK, kMixed, kEmoji};
+ for (const char* Utf8Str : Utf8Strings)
+ {
+ ExtendableWideStringBuilder<64> Wide;
+ Utf8ToWide(std::string_view(Utf8Str), Wide);
+
+ ExtendableStringBuilder<64> Back;
+ WideToUtf8(std::wstring_view(Wide.c_str()), Back);
+ CHECK(StringEquals(Utf8Str, Back.c_str()));
+ }
+
+ // Wide -> UTF-8 -> Wide identity
+ const wchar_t* WideStrings[] = {kLatinW, kCyrillicW, kCJKW, kMixedW, kEmojiW};
+ for (const wchar_t* WideStr : WideStrings)
+ {
+ ExtendableStringBuilder<64> Utf8;
+ WideToUtf8(std::wstring_view(WideStr), Utf8);
+
+ ExtendableWideStringBuilder<64> Back;
+ Utf8ToWide(std::string_view(Utf8.c_str()), Back);
+ CHECK(StringEquals(WideStr, Back.c_str()));
+ }
+
+ // Empty string round-trip
+ {
+ ExtendableWideStringBuilder<8> Wide;
+ Utf8ToWide(std::string_view(""), Wide);
+ CHECK(Wide.Size() == 0);
+
+ ExtendableStringBuilder<8> Narrow;
+ WideToUtf8(std::wstring_view(L""), Narrow);
+ CHECK(Narrow.Size() == 0);
+ }
+ }
+
+ SUBCASE("IsValidUtf8")
+ {
+ // Valid inputs
+ CHECK(IsValidUtf8(""));
+ CHECK(IsValidUtf8("hello world"));
+ CHECK(IsValidUtf8(kLatin));
+ CHECK(IsValidUtf8(kCyrillic));
+ CHECK(IsValidUtf8(kCJK));
+ CHECK(IsValidUtf8(kMixed));
+ CHECK(IsValidUtf8(kEmoji));
+
+ // Invalid: truncated 2-byte sequence
+ CHECK(!IsValidUtf8(std::string_view("\xC3", 1)));
+
+ // Invalid: truncated 3-byte sequence
+ CHECK(!IsValidUtf8(std::string_view("\xE6\x97", 2)));
+
+ // Invalid: truncated 4-byte sequence
+ CHECK(!IsValidUtf8(std::string_view("\xF0\x9F\x93", 3)));
+
+ // Invalid: bad start byte
+ CHECK(!IsValidUtf8(std::string_view("\xFF", 1)));
+ CHECK(!IsValidUtf8(std::string_view("\xFE", 1)));
+
+ // Invalid: overlong encoding of '/' (U+002F)
+ CHECK(!IsValidUtf8(std::string_view("\xC0\xAF", 2)));
}
}