diff options
Diffstat (limited to 'scripts')
-rw-r--r--  scripts/bundle.lua                                                                          |   5
-rw-r--r--  scripts/test.lua                                                                            |   8
-rwxr-xr-x  scripts/test_linux/block-clone-test.sh (renamed from scripts/test_scripts/block-clone-test.sh) |   2
-rwxr-xr-x  scripts/test_linux/crashpad-test.sh                                                         | 184
-rwxr-xr-x  scripts/test_linux/service-test.sh                                                          | 380
-rwxr-xr-x  scripts/test_mac/block-clone-test.sh (renamed from scripts/test_scripts/block-clone-test-mac.sh) |   2
-rwxr-xr-x  scripts/test_mac/service-test.sh                                                            | 368
-rw-r--r--  scripts/test_scripts/hub_load_test.py                                                       | 969
-rw-r--r--  scripts/test_scripts/hub_provision_perf_test.py                                             | 501
-rw-r--r--  scripts/test_windows/block-clone-test.ps1 (renamed from scripts/test_scripts/block-clone-test-windows.ps1) |   2
-rw-r--r--  scripts/test_windows/service-test.ps1                                                       | 353
11 files changed, 2769 insertions, 5 deletions
diff --git a/scripts/bundle.lua b/scripts/bundle.lua index 6f4552890..1d94451e7 100644 --- a/scripts/bundle.lua +++ b/scripts/bundle.lua @@ -34,6 +34,9 @@ local function _build(arch, debug, config_args) "--mode="..variant, "--arch="..arch, "--zensentry=yes", + "--zencompute=no", + "--zenhorde=no", + "--zennomad=no", toolchain_arg, sdk_arg, config_args) @@ -289,7 +292,9 @@ local function main_linux() _zip(false, "build/zenserver-linux.zip", "build/linux/x86_64/release/zenserver", + "build/linux/x86_64/release/zenserver.sym", "build/linux/x86_64/release/zen", + "build/linux/x86_64/release/zen.sym", "build/linux/x86_64/release/crashpad_handler", "build/linux/x86_64/release/OidcToken") end diff --git a/scripts/test.lua b/scripts/test.lua index 3c18225fb..52495342a 100644 --- a/scripts/test.lua +++ b/scripts/test.lua @@ -231,8 +231,12 @@ function main() if use_noskip then cmd = string.format("%s --no-skip", cmd) end - if use_verbose and name == "integration" then - cmd = string.format("%s --verbose", cmd) + if name == "integration" then + cmd = string.format("%s --kill-stale-processes", cmd) + + if use_verbose then + cmd = string.format("%s --verbose", cmd) + end end for _, arg in ipairs(extra_args) do cmd = string.format("%s %s", cmd, arg) diff --git a/scripts/test_scripts/block-clone-test.sh b/scripts/test_linux/block-clone-test.sh index 7c6bf5605..0a74283f2 100755 --- a/scripts/test_scripts/block-clone-test.sh +++ b/scripts/test_linux/block-clone-test.sh @@ -4,7 +4,7 @@ # Requires: root/sudo, btrfs-progs (mkfs.btrfs), xfsprogs (mkfs.xfs) # # Usage: -# sudo ./scripts/test_scripts/block-clone-test.sh [path-to-zencore-test] +# sudo ./scripts/test_linux/block-clone-test.sh [path-to-zencore-test] # # If no path is given, defaults to build/linux/x86_64/debug/zencore-test # relative to the repository root. 
diff --git a/scripts/test_linux/crashpad-test.sh b/scripts/test_linux/crashpad-test.sh new file mode 100755 index 000000000..bd08cefc5 --- /dev/null +++ b/scripts/test_linux/crashpad-test.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash +# Verify that crashpad_handler is active when zenserver (release build) starts. +# +# This test: +# 1. Launches zenserver from the release build. +# 2. Waits for the HTTP health endpoint to become ready. +# 3. Checks that a crashpad_handler child process is running. +# 4. Checks that the startup log contains "sentry initialized" (not a failure). +# +# Usage: +# ./scripts/test_linux/crashpad-test.sh [path-to-zenserver] +# +# If no path is given, defaults to build/linux/x86_64/release/zenserver +# relative to the repository root. +# +# The test exits 0 on success, 1 on failure. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +ZENSERVER_BINARY="${1:-$REPO_ROOT/build/linux/x86_64/release/zenserver}" +CRASHPAD_HANDLER="$(dirname "$ZENSERVER_BINARY")/crashpad_handler" +PORT=18558 # Use a non-default port so we don't collide with a running zenserver +DATA_DIR="$(mktemp -d)" +STDOUT_FILE="$DATA_DIR/stdout.log" +ZENSERVER_PID="" + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +PASSED=0 +FAILED=0 + +pass() { + echo -e " ${GREEN}PASS${NC} $1" + (( PASSED++ )) || true +} + +fail() { + echo -e " ${RED}FAIL${NC} $1" + (( FAILED++ )) || true +} + +cleanup() { + set +e + if [ -n "$ZENSERVER_PID" ] && kill -0 "$ZENSERVER_PID" 2>/dev/null; then + kill "$ZENSERVER_PID" 2>/dev/null + wait "$ZENSERVER_PID" 2>/dev/null + fi + rm -rf "$DATA_DIR" +} +trap cleanup EXIT + +# ── Preflight ──────────────────────────────────────────────────────────────── + +echo "" +echo "==============================" +echo " Crashpad active check" +echo "==============================" +echo "" + +if [ ! 
-f "$ZENSERVER_BINARY" ]; then + echo -e "${RED}ERROR: zenserver binary not found: $ZENSERVER_BINARY${NC}" + echo " Build with: xmake config -m release && xmake build zenserver" + exit 1 +fi + +if [ ! -f "$CRASHPAD_HANDLER" ]; then + echo -e "${RED}ERROR: crashpad_handler not found alongside zenserver: $CRASHPAD_HANDLER${NC}" + echo " It should be copied there automatically by the build." + exit 1 +fi + +echo "zenserver: $ZENSERVER_BINARY" +echo "crashpad_handler: $CRASHPAD_HANDLER" +echo "port: $PORT" +echo "data dir: $DATA_DIR" +echo "" + +# ── Start zenserver ────────────────────────────────────────────────────────── + +# zenserver runs in the foreground until SIGINT/SIGTERM. Launch in background +# so we can poll its health endpoint and inspect child processes. +"$ZENSERVER_BINARY" \ + --port="$PORT" \ + --data-dir="$DATA_DIR" \ + > "$STDOUT_FILE" 2>&1 & +ZENSERVER_PID=$! + +echo "Started zenserver (pid $ZENSERVER_PID), waiting for health endpoint..." + +# ── Wait for health endpoint ───────────────────────────────────────────────── + +READY=false +for i in $(seq 1 40); do + if curl -sf "http://localhost:$PORT/health" > /dev/null 2>&1; then + READY=true + break + fi + sleep 0.5 +done + +if [ "$READY" = false ]; then + echo -e "${RED}ERROR: zenserver did not become ready within 20 seconds${NC}" + if [ -f "$STDOUT_FILE" ]; then + echo "" + echo "--- stdout ---" + cat "$STDOUT_FILE" + fi + exit 1 +fi + +echo "Server is ready." + +# Give the server a moment to finish startup logging (sentry init log +# message is emitted after the health endpoint comes up). +sleep 1 +echo "" + +# ── Test 1: crashpad_handler process is running for our data dir ───────────── +# +# sentry-native starts crashpad_handler with --database pointing at +# $DATA_DIR/.sentry-native. The process re-parents itself out of zenserver's +# process tree, so we look for it by its command-line arguments rather than +# by parent PID. 
+ +HANDLER_PID="$(pgrep -f "crashpad_handler.*${DATA_DIR}" 2>/dev/null | head -1 || true)" +if [ -n "$HANDLER_PID" ]; then + pass "crashpad_handler is running (pid $HANDLER_PID) with database in our data dir" +else + fail "crashpad_handler process not found — sentry_init may have failed or sentry is disabled" +fi + +# ── Test 2: No sentry_init failure in startup log ──────────────────────────── +# +# The "sentry initialized" success message is logged at INFO level under the +# sentry-sdk log category, which is filtered to Warn by default — so it won't +# appear in normal stdout. We check for the absence of the failure message +# instead (which IS at Warn level and would appear if sentry_init failed). + +if grep -q "sentry_init returned failure" "$STDOUT_FILE" 2>/dev/null; then + ERRMSG="$(grep "sentry_init returned failure" "$STDOUT_FILE" | head -1)" + fail "sentry_init reported failure: $ERRMSG" +else + pass "No sentry_init failure message in startup log" +fi + +# ── Test 3: ldd sanity — crashpad_handler must not need libc++.so.1 ────────── + +MISSING_LIBCXX="$(ldd "$CRASHPAD_HANDLER" 2>/dev/null | grep "libc++\.so\.1" | grep "not found" || true)" +if [ -n "$MISSING_LIBCXX" ]; then + fail "crashpad_handler has an unsatisfied libc++.so.1 dependency (static linking patch not applied)" +elif ldd "$CRASHPAD_HANDLER" 2>/dev/null | grep -q "libc++\.so\.1"; then + fail "crashpad_handler links libc++.so.1 dynamically — it should be statically linked" +else + pass "crashpad_handler has no dynamic libc++.so.1 dependency" +fi + +# ── Summary ────────────────────────────────────────────────────────────────── + +echo "" +echo "==============================" +printf " Passed: " +echo -e "${GREEN}${PASSED}${NC}" +printf " Failed: " +if [ "$FAILED" -gt 0 ]; then + echo -e "${RED}${FAILED}${NC}" +else + echo -e "${GREEN}${FAILED}${NC}" +fi +echo "==============================" +echo "" + +if [ "$FAILED" -gt 0 ]; then + if [ -f "$STDOUT_FILE" ]; then + echo "--- stdout ---" + cat 
"$STDOUT_FILE" + fi + exit 1 +fi diff --git a/scripts/test_linux/service-test.sh b/scripts/test_linux/service-test.sh new file mode 100755 index 000000000..f91339e1b --- /dev/null +++ b/scripts/test_linux/service-test.sh @@ -0,0 +1,380 @@ +#!/usr/bin/env bash +# Test zen service command lifecycle on Linux (systemd). +# +# Requires: root/sudo, systemd +# +# Usage: +# sudo ./scripts/test_linux/service-test.sh [path-to-zen] +# +# If no path is given, defaults to build/linux/x86_64/debug/zen +# relative to the repository root. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +ZEN_BINARY="${1:-$REPO_ROOT/build/linux/x86_64/debug/zen}" +ZENSERVER_BINARY="$(dirname "$ZEN_BINARY")/zenserver" +SERVICE_NAME="ZenServerTest-$$" +UNIT_NAME="com.epicgames.unreal.${SERVICE_NAME}" +UNIT_FILE="/etc/systemd/system/${UNIT_NAME}.service" + +PASSED=0 +FAILED=0 +TESTS_RUN=0 + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +cleanup() { + local exit_code=$? + set +e + + echo "" + echo "--- Cleanup ---" + + # Stop service if running + if systemctl is-active --quiet "$UNIT_NAME" 2>/dev/null; then + echo "Stopping test service..." + systemctl stop "$UNIT_NAME" 2>/dev/null + fi + + # Disable and remove unit file + if [ -f "$UNIT_FILE" ]; then + echo "Removing test unit file..." 
+ systemctl disable "$UNIT_NAME" 2>/dev/null || true + rm -f "$UNIT_FILE" + systemctl daemon-reload 2>/dev/null || true + fi + + echo "" + echo "==============================" + printf " Tests run: %d\n" "$TESTS_RUN" + printf " ${GREEN}Passed: %d${NC}\n" "$PASSED" + if [ "$FAILED" -gt 0 ]; then + printf " ${RED}Failed: %d${NC}\n" "$FAILED" + else + printf " Failed: %d\n" "$FAILED" + fi + echo "==============================" + + if [ "$FAILED" -gt 0 ]; then + exit 1 + fi + exit "$exit_code" +} + +trap cleanup EXIT + +pass() { + TESTS_RUN=$((TESTS_RUN + 1)) + PASSED=$((PASSED + 1)) + printf " ${GREEN}PASS${NC}: %s\n" "$1" +} + +fail() { + TESTS_RUN=$((TESTS_RUN + 1)) + FAILED=$((FAILED + 1)) + printf " ${RED}FAIL${NC}: %s\n" "$1" + if [ -n "${2:-}" ]; then + printf " %s\n" "$2" + fi +} + +# ── Preflight checks ────────────────────────────────────────────── + +if [ "$(id -u)" -ne 0 ]; then + echo "Error: this test must be run as root (sudo)." + exit 1 +fi + +if ! command -v systemctl &>/dev/null; then + echo "Error: systemctl not found — this test requires systemd." + exit 1 +fi + +if [ ! -x "$ZEN_BINARY" ]; then + echo "Error: zen binary not found at '$ZEN_BINARY'" + echo "Build with: xmake config -m debug && xmake build zen" + exit 1 +fi + +if [ ! 
-x "$ZENSERVER_BINARY" ]; then + echo "Error: zenserver binary not found at '$ZENSERVER_BINARY'" + echo "Build with: xmake config -m debug && xmake build zenserver" + exit 1 +fi + +echo "zen binary: $ZEN_BINARY" +echo "zenserver binary: $ZENSERVER_BINARY" +echo "service name: $SERVICE_NAME" +echo "unit name: $UNIT_NAME" +echo "" + +# Determine which user to run the service as (the user who invoked sudo) +SERVICE_USER="${SUDO_USER:-$(whoami)}" + +# ── Test: status before install (should fail) ───────────────────── + +echo "--- Test: status before install ---" + +OUTPUT=$("$ZEN_BINARY" service status "$SERVICE_NAME" 2>&1 || true) +if echo "$OUTPUT" | grep -qi "not installed"; then + pass "status reports 'not installed' for non-existent service" +else + fail "status should report 'not installed'" "got: $OUTPUT" +fi + +# ── Test: install ───────────────────────────────────────────────── + +echo "--- Test: install ---" + +OUTPUT=$("$ZEN_BINARY" service install "$ZENSERVER_BINARY" "$SERVICE_NAME" --user "$SERVICE_USER" 2>&1) +EXIT_CODE=$? 
+ +if [ "$EXIT_CODE" -eq 0 ]; then + pass "install exits with code 0" +else + fail "install exits with code 0" "got exit code: $EXIT_CODE, output: $OUTPUT" +fi + +if [ -f "$UNIT_FILE" ]; then + pass "unit file created at $UNIT_FILE" +else + fail "unit file created at $UNIT_FILE" +fi + +# Verify unit file contents +if [ -f "$UNIT_FILE" ]; then + if grep -q "ExecStart=.*zenserver" "$UNIT_FILE"; then + pass "unit file contains ExecStart with zenserver" + else + fail "unit file contains ExecStart with zenserver" "$(cat "$UNIT_FILE")" + fi + + if grep -q "User=$SERVICE_USER" "$UNIT_FILE"; then + pass "unit file contains correct User=$SERVICE_USER" + else + fail "unit file contains correct User=$SERVICE_USER" "$(grep User "$UNIT_FILE")" + fi + + if grep -q "Type=notify" "$UNIT_FILE"; then + pass "unit file uses Type=notify" + else + fail "unit file uses Type=notify" + fi + + if grep -q "WantedBy=multi-user.target" "$UNIT_FILE"; then + pass "unit file has WantedBy=multi-user.target" + else + fail "unit file has WantedBy=multi-user.target" + fi + + # Verify the service is enabled + if systemctl is-enabled --quiet "$UNIT_NAME" 2>/dev/null; then + pass "service is enabled after install" + else + fail "service is enabled after install" + fi +fi + +# ── Test: install again (already installed, no --full) ──────────── + +echo "--- Test: install again (idempotent) ---" + +OUTPUT=$("$ZEN_BINARY" service install "$ZENSERVER_BINARY" "$SERVICE_NAME" --user "$SERVICE_USER" 2>&1) +EXIT_CODE=$? 
+ +if [ "$EXIT_CODE" -eq 0 ]; then + pass "re-install exits with code 0" +else + fail "re-install exits with code 0" "got exit code: $EXIT_CODE" +fi + +if echo "$OUTPUT" | grep -qi "already installed"; then + pass "re-install reports service already installed" +else + fail "re-install reports service already installed" "got: $OUTPUT" +fi + +# ── Test: status after install (not yet started) ────────────────── + +echo "--- Test: status after install (stopped) ---" + +OUTPUT=$("$ZEN_BINARY" service status "$SERVICE_NAME" 2>&1 || true) +# The status command throws if not running, so we expect an error message +if echo "$OUTPUT" | grep -qi "not running"; then + pass "status reports 'not running' for stopped service" +else + fail "status reports 'not running' for stopped service" "got: $OUTPUT" +fi + +# ── Test: start ─────────────────────────────────────────────────── + +echo "--- Test: start ---" + +OUTPUT=$("$ZEN_BINARY" service start "$SERVICE_NAME" 2>&1) +EXIT_CODE=$? + +if [ "$EXIT_CODE" -eq 0 ]; then + pass "start exits with code 0" +else + # The TODO comments in the code indicate start may not work perfectly + fail "start exits with code 0" "got exit code: $EXIT_CODE, output: $OUTPUT" +fi + +# Give the service a moment to start +sleep 1 + +if systemctl is-active --quiet "$UNIT_NAME" 2>/dev/null; then + pass "service is active after start" + + # ── Test: status while running ──────────────────────────────── + + echo "--- Test: status (running) ---" + + OUTPUT=$("$ZEN_BINARY" service status "$SERVICE_NAME" 2>&1 || true) + if echo "$OUTPUT" | grep -qi "Running"; then + pass "status reports 'Running'" + else + fail "status reports 'Running'" "got: $OUTPUT" + fi + + if echo "$OUTPUT" | grep -qi "zenserver"; then + pass "status shows executable path" + else + fail "status shows executable path" "got: $OUTPUT" + fi + + # ── Test: start again (already running) ─────────────────────── + + echo "--- Test: start again (already running) ---" + + OUTPUT=$("$ZEN_BINARY" 
service start "$SERVICE_NAME" 2>&1) + if echo "$OUTPUT" | grep -qi "already running"; then + pass "start reports service already running" + else + fail "start reports service already running" "got: $OUTPUT" + fi + + # ── Test: stop ──────────────────────────────────────────────── + + echo "--- Test: stop ---" + + OUTPUT=$("$ZEN_BINARY" service stop "$SERVICE_NAME" 2>&1) + EXIT_CODE=$? + + if [ "$EXIT_CODE" -eq 0 ]; then + pass "stop exits with code 0" + else + fail "stop exits with code 0" "got exit code: $EXIT_CODE, output: $OUTPUT" + fi + + sleep 1 + + if ! systemctl is-active --quiet "$UNIT_NAME" 2>/dev/null; then + pass "service is inactive after stop" + else + fail "service is inactive after stop" + fi +else + fail "service is active after start" "(skipping start-dependent tests)" +fi + +# ── Test: stop when already stopped ─────────────────────────────── + +echo "--- Test: stop when already stopped ---" + +# Make sure it's stopped first +systemctl stop "$UNIT_NAME" 2>/dev/null || true +sleep 1 + +OUTPUT=$("$ZEN_BINARY" service stop "$SERVICE_NAME" 2>&1 || true) +if echo "$OUTPUT" | grep -qi "not running"; then + pass "stop reports 'not running' when already stopped" +else + fail "stop reports 'not running' when already stopped" "got: $OUTPUT" +fi + +# ── Test: uninstall while running (should fail) ─────────────────── + +echo "--- Test: uninstall while running (should fail) ---" + +# Start the service so we can test uninstall-while-running +"$ZEN_BINARY" service start "$SERVICE_NAME" 2>&1 || true +sleep 1 + +if systemctl is-active --quiet "$UNIT_NAME" 2>/dev/null; then + OUTPUT=$("$ZEN_BINARY" service uninstall "$SERVICE_NAME" 2>&1 || true) + EXIT_CODE=$? 
+ + if [ "$EXIT_CODE" -ne 0 ] || echo "$OUTPUT" | grep -qi "running.*stop"; then + pass "uninstall refuses while service is running" + else + fail "uninstall refuses while service is running" "got: exit=$EXIT_CODE, output: $OUTPUT" + fi + + # Stop it for the real uninstall test + "$ZEN_BINARY" service stop "$SERVICE_NAME" 2>&1 || true + systemctl stop "$UNIT_NAME" 2>/dev/null || true + sleep 1 +else + echo " (skipped: could not start service for this test)" +fi + +# ── Test: uninstall ─────────────────────────────────────────────── + +echo "--- Test: uninstall ---" + +# Ensure stopped +systemctl stop "$UNIT_NAME" 2>/dev/null || true +sleep 1 + +OUTPUT=$("$ZEN_BINARY" service uninstall "$SERVICE_NAME" 2>&1) +EXIT_CODE=$? + +if [ "$EXIT_CODE" -eq 0 ]; then + pass "uninstall exits with code 0" +else + fail "uninstall exits with code 0" "got exit code: $EXIT_CODE, output: $OUTPUT" +fi + +if [ ! -f "$UNIT_FILE" ]; then + pass "unit file removed after uninstall" +else + fail "unit file removed after uninstall" +fi + +# ── Test: status after uninstall ────────────────────────────────── + +echo "--- Test: status after uninstall ---" + +OUTPUT=$("$ZEN_BINARY" service status "$SERVICE_NAME" 2>&1 || true) +if echo "$OUTPUT" | grep -qi "not installed"; then + pass "status reports 'not installed' after uninstall" +else + fail "status reports 'not installed' after uninstall" "got: $OUTPUT" +fi + +# ── Test: uninstall when not installed (idempotent) ─────────────── + +echo "--- Test: uninstall when not installed ---" + +OUTPUT=$("$ZEN_BINARY" service uninstall "$SERVICE_NAME" 2>&1) +EXIT_CODE=$? 
+ +if [ "$EXIT_CODE" -eq 0 ]; then + pass "uninstall of non-existent service exits with code 0" +else + fail "uninstall of non-existent service exits with code 0" "got exit code: $EXIT_CODE" +fi + +if echo "$OUTPUT" | grep -qi "not installed"; then + pass "uninstall reports service not installed" +else + fail "uninstall reports service not installed" "got: $OUTPUT" +fi diff --git a/scripts/test_scripts/block-clone-test-mac.sh b/scripts/test_mac/block-clone-test.sh index a3d3ca4d3..1a9575dcf 100755 --- a/scripts/test_scripts/block-clone-test-mac.sh +++ b/scripts/test_mac/block-clone-test.sh @@ -5,7 +5,7 @@ # clonefile(), so no special setup is needed — just run the tests. # # Usage: -# ./scripts/test_scripts/block-clone-test-mac.sh [path-to-zencore-test] +# ./scripts/test_mac/block-clone-test.sh [path-to-zencore-test] # # If no path is given, defaults to build/macosx/<arch>/debug/zencore-test # relative to the repository root. diff --git a/scripts/test_mac/service-test.sh b/scripts/test_mac/service-test.sh new file mode 100755 index 000000000..da4a966e7 --- /dev/null +++ b/scripts/test_mac/service-test.sh @@ -0,0 +1,368 @@ +#!/usr/bin/env bash +# Test zen service command lifecycle on macOS (launchd). +# +# Does NOT require sudo — macOS service commands run as the current user +# via ~/Library/LaunchAgents. +# +# Usage: +# ./scripts/test_mac/service-test.sh [path-to-zen] +# +# If no path is given, defaults to build/macosx/<arch>/debug/zen +# relative to the repository root. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" + +ARCH="$(uname -m)" +ZEN_BINARY="${1:-$REPO_ROOT/build/macosx/$ARCH/debug/zen}" +ZENSERVER_BINARY="$(dirname "$ZEN_BINARY")/zenserver" +SERVICE_NAME="ZenServerTest-$$" +DAEMON_NAME="com.epicgames.unreal.${SERVICE_NAME}" +PLIST_FILE="$HOME/Library/LaunchAgents/${DAEMON_NAME}.plist" + +PASSED=0 +FAILED=0 +TESTS_RUN=0 + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +NC='\033[0m' + +cleanup() { + local exit_code=$? + set +e + + echo "" + echo "--- Cleanup ---" + + # Bootout (stop) the agent if loaded + if launchctl list "$DAEMON_NAME" &>/dev/null; then + echo "Stopping test service..." + launchctl bootout "gui/$(id -u)" "$PLIST_FILE" 2>/dev/null || true + fi + + # Remove plist file + if [ -f "$PLIST_FILE" ]; then + echo "Removing test plist file..." + rm -f "$PLIST_FILE" + fi + + echo "" + echo "==============================" + printf " Tests run: %d\n" "$TESTS_RUN" + printf " ${GREEN}Passed: %d${NC}\n" "$PASSED" + if [ "$FAILED" -gt 0 ]; then + printf " ${RED}Failed: %d${NC}\n" "$FAILED" + else + printf " Failed: %d\n" "$FAILED" + fi + echo "==============================" + + if [ "$FAILED" -gt 0 ]; then + exit 1 + fi + exit "$exit_code" +} + +trap cleanup EXIT + +pass() { + TESTS_RUN=$((TESTS_RUN + 1)) + PASSED=$((PASSED + 1)) + printf " ${GREEN}PASS${NC}: %s\n" "$1" +} + +fail() { + TESTS_RUN=$((TESTS_RUN + 1)) + FAILED=$((FAILED + 1)) + printf " ${RED}FAIL${NC}: %s\n" "$1" + if [ -n "${2:-}" ]; then + printf " %s\n" "$2" + fi +} + +# ── Preflight checks ────────────────────────────────────────────── + +if [ "$(uname)" != "Darwin" ]; then + echo "Error: this test is for macOS only." + exit 1 +fi + +if ! command -v launchctl &>/dev/null; then + echo "Error: launchctl not found." + exit 1 +fi + +if [ ! -x "$ZEN_BINARY" ]; then + echo "Error: zen binary not found at '$ZEN_BINARY'" + echo "Build with: xmake config -m debug && xmake build zen" + exit 1 +fi + +if [ ! 
-x "$ZENSERVER_BINARY" ]; then + echo "Error: zenserver binary not found at '$ZENSERVER_BINARY'" + echo "Build with: xmake config -m debug && xmake build zenserver" + exit 1 +fi + +echo "zen binary: $ZEN_BINARY" +echo "zenserver binary: $ZENSERVER_BINARY" +echo "service name: $SERVICE_NAME" +echo "daemon name: $DAEMON_NAME" +echo "plist path: $PLIST_FILE" +echo "" + +# ── Test: status before install (should fail) ───────────────────── + +echo "--- Test: status before install ---" + +OUTPUT=$("$ZEN_BINARY" service status "$SERVICE_NAME" 2>&1 || true) +if echo "$OUTPUT" | grep -qi "not installed"; then + pass "status reports 'not installed' for non-existent service" +else + fail "status should report 'not installed'" "got: $OUTPUT" +fi + +# ── Test: install ───────────────────────────────────────────────── + +echo "--- Test: install ---" + +OUTPUT=$("$ZEN_BINARY" service install "$ZENSERVER_BINARY" "$SERVICE_NAME" 2>&1) +EXIT_CODE=$? + +if [ "$EXIT_CODE" -eq 0 ]; then + pass "install exits with code 0" +else + fail "install exits with code 0" "got exit code: $EXIT_CODE, output: $OUTPUT" +fi + +if [ -f "$PLIST_FILE" ]; then + pass "plist file created at $PLIST_FILE" +else + fail "plist file created at $PLIST_FILE" +fi + +# Verify plist contents +if [ -f "$PLIST_FILE" ]; then + if grep -q "<string>$ZENSERVER_BINARY</string>" "$PLIST_FILE"; then + pass "plist contains zenserver executable path" + else + fail "plist contains zenserver executable path" "$(cat "$PLIST_FILE")" + fi + + if grep -q "<key>Label</key>" "$PLIST_FILE"; then + pass "plist has Label key" + else + fail "plist has Label key" + fi + + if grep -q "$DAEMON_NAME" "$PLIST_FILE"; then + pass "plist contains correct daemon name" + else + fail "plist contains correct daemon name" + fi + + if grep -q "<key>ProgramArguments</key>" "$PLIST_FILE"; then + pass "plist has ProgramArguments" + else + fail "plist has ProgramArguments" + fi +fi + +# ── Test: install again (already installed, no --full) ──────────── + 
+echo "--- Test: install again (idempotent) ---" + +OUTPUT=$("$ZEN_BINARY" service install "$ZENSERVER_BINARY" "$SERVICE_NAME" 2>&1) +EXIT_CODE=$? + +if [ "$EXIT_CODE" -eq 0 ]; then + pass "re-install exits with code 0" +else + fail "re-install exits with code 0" "got exit code: $EXIT_CODE" +fi + +if echo "$OUTPUT" | grep -qi "already installed"; then + pass "re-install reports service already installed" +else + fail "re-install reports service already installed" "got: $OUTPUT" +fi + +# ── Test: status after install (not yet started) ────────────────── + +echo "--- Test: status after install (stopped) ---" + +OUTPUT=$("$ZEN_BINARY" service status "$SERVICE_NAME" 2>&1 || true) +if echo "$OUTPUT" | grep -qi "not running"; then + pass "status reports 'not running' for stopped service" +else + fail "status reports 'not running' for stopped service" "got: $OUTPUT" +fi + +# ── Test: start ─────────────────────────────────────────────────── + +echo "--- Test: start ---" + +OUTPUT=$("$ZEN_BINARY" service start "$SERVICE_NAME" 2>&1) +EXIT_CODE=$? 
+ +if [ "$EXIT_CODE" -eq 0 ]; then + pass "start exits with code 0" +else + fail "start exits with code 0" "got exit code: $EXIT_CODE, output: $OUTPUT" +fi + +# Give the service a moment to start +sleep 2 + +if launchctl list "$DAEMON_NAME" &>/dev/null; then + pass "service is loaded after start" + + # ── Test: status while running ──────────────────────────────── + + echo "--- Test: status (running) ---" + + OUTPUT=$("$ZEN_BINARY" service status "$SERVICE_NAME" 2>&1 || true) + if echo "$OUTPUT" | grep -qi "Running"; then + pass "status reports 'Running'" + else + fail "status reports 'Running'" "got: $OUTPUT" + fi + + if echo "$OUTPUT" | grep -qi "zenserver"; then + pass "status shows executable path" + else + fail "status shows executable path" "got: $OUTPUT" + fi + + # ── Test: start again (already running) ─────────────────────── + + echo "--- Test: start again (already running) ---" + + OUTPUT=$("$ZEN_BINARY" service start "$SERVICE_NAME" 2>&1) + if echo "$OUTPUT" | grep -qi "already running"; then + pass "start reports service already running" + else + fail "start reports service already running" "got: $OUTPUT" + fi + + # ── Test: stop ──────────────────────────────────────────────── + + echo "--- Test: stop ---" + + OUTPUT=$("$ZEN_BINARY" service stop "$SERVICE_NAME" 2>&1) + EXIT_CODE=$? + + if [ "$EXIT_CODE" -eq 0 ]; then + pass "stop exits with code 0" + else + fail "stop exits with code 0" "got exit code: $EXIT_CODE, output: $OUTPUT" + fi + + sleep 1 + + if ! 
launchctl list "$DAEMON_NAME" &>/dev/null; then + pass "service is unloaded after stop" + else + fail "service is unloaded after stop" + fi +else + fail "service is loaded after start" "(skipping start-dependent tests)" +fi + +# ── Test: stop when already stopped ─────────────────────────────── + +echo "--- Test: stop when already stopped ---" + +# Make sure it's stopped first +launchctl bootout "gui/$(id -u)" "$PLIST_FILE" 2>/dev/null || true +sleep 1 + +OUTPUT=$("$ZEN_BINARY" service stop "$SERVICE_NAME" 2>&1 || true) +if echo "$OUTPUT" | grep -qi "not running"; then + pass "stop reports 'not running' when already stopped" +else + fail "stop reports 'not running' when already stopped" "got: $OUTPUT" +fi + +# ── Test: uninstall while running (should fail) ─────────────────── + +echo "--- Test: uninstall while running (should fail) ---" + +# Start the service so we can test uninstall-while-running +"$ZEN_BINARY" service start "$SERVICE_NAME" 2>&1 || true +sleep 2 + +if launchctl list "$DAEMON_NAME" &>/dev/null; then + OUTPUT=$("$ZEN_BINARY" service uninstall "$SERVICE_NAME" 2>&1 || true) + EXIT_CODE=$? + + if [ "$EXIT_CODE" -ne 0 ] || echo "$OUTPUT" | grep -qi "running.*stop"; then + pass "uninstall refuses while service is running" + else + fail "uninstall refuses while service is running" "got: exit=$EXIT_CODE, output: $OUTPUT" + fi + + # Stop it for the real uninstall test + "$ZEN_BINARY" service stop "$SERVICE_NAME" 2>&1 || true + launchctl bootout "gui/$(id -u)" "$PLIST_FILE" 2>/dev/null || true + sleep 1 +else + echo " (skipped: could not start service for this test)" +fi + +# ── Test: uninstall ─────────────────────────────────────────────── + +echo "--- Test: uninstall ---" + +# Ensure stopped +launchctl bootout "gui/$(id -u)" "$PLIST_FILE" 2>/dev/null || true +sleep 1 + +OUTPUT=$("$ZEN_BINARY" service uninstall "$SERVICE_NAME" 2>&1) +EXIT_CODE=$? 
+ +if [ "$EXIT_CODE" -eq 0 ]; then + pass "uninstall exits with code 0" +else + fail "uninstall exits with code 0" "got exit code: $EXIT_CODE, output: $OUTPUT" +fi + +if [ ! -f "$PLIST_FILE" ]; then + pass "plist file removed after uninstall" +else + fail "plist file removed after uninstall" +fi + +# ── Test: status after uninstall ────────────────────────────────── + +echo "--- Test: status after uninstall ---" + +OUTPUT=$("$ZEN_BINARY" service status "$SERVICE_NAME" 2>&1 || true) +if echo "$OUTPUT" | grep -qi "not installed"; then + pass "status reports 'not installed' after uninstall" +else + fail "status reports 'not installed' after uninstall" "got: $OUTPUT" +fi + +# ── Test: uninstall when not installed (idempotent) ─────────────── + +echo "--- Test: uninstall when not installed ---" + +OUTPUT=$("$ZEN_BINARY" service uninstall "$SERVICE_NAME" 2>&1) +EXIT_CODE=$? + +if [ "$EXIT_CODE" -eq 0 ]; then + pass "uninstall of non-existent service exits with code 0" +else + fail "uninstall of non-existent service exits with code 0" "got exit code: $EXIT_CODE" +fi + +if echo "$OUTPUT" | grep -qi "not installed"; then + pass "uninstall reports service not installed" +else + fail "uninstall reports service not installed" "got: $OUTPUT" +fi diff --git a/scripts/test_scripts/hub_load_test.py b/scripts/test_scripts/hub_load_test.py new file mode 100644 index 000000000..7bff1eb37 --- /dev/null +++ b/scripts/test_scripts/hub_load_test.py @@ -0,0 +1,969 @@ +#!/usr/bin/env python3 +"""Hub sustained load test. + +Keeps ~N modules concurrently provisioned from a pool of 1000 predefined +module names, writing and reading data to each instance, then either letting +the hub watchdog deprovision idle ones or explicitly deprovisioning them. +Runs indefinitely until Ctrl-C. + +Optional --s3: starts a local MinIO server and configures the hub to use it +as the de/hydration backend. 
+ +Requirements: + pip install boto3 (only needed with --s3) +""" + +from __future__ import annotations + +import argparse +import json +import os +import queue +import random +import subprocess +import sys +import threading +import time +import urllib.error +import urllib.request +import webbrowser +from concurrent.futures import Future, ThreadPoolExecutor, wait as futures_wait +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +_EXE_SUFFIX = ".exe" if sys.platform == "win32" else "" +_MINIO_USER = "minioadmin" +_MINIO_PASS = "minioadmin" +_NAMESPACE = "loadtest" +_BUCKET = "bucket" + +# Key sizes to use for activity writes (bytes) - biased toward smaller. +# Blobs are pre-generated at startup; one per size, shared across all requests. +_KEY_SIZES = [512, 512, 2048, 2048, 8192, 32768] +_BLOBS: list[bytes] = [] # populated by _init_blobs() + + +def _init_blobs() -> None: + seen: set[int] = set() + for size in _KEY_SIZES: + if size not in seen: + seen.add(size) + _BLOBS.append(os.urandom(size)) + else: + # reuse the already-generated blob for this size + _BLOBS.append(next(b for b in _BLOBS if len(b) == size)) + + +# --------------------------------------------------------------------------- +# Executable discovery +# --------------------------------------------------------------------------- + +def _find_zenserver(override: Optional[str]) -> Path: + if override: + p = Path(override) / f"zenserver{_EXE_SUFFIX}" + if not p.exists(): + sys.exit(f"zenserver not found at {p}") + return p + + script_dir = Path(__file__).resolve().parent + repo_root = script_dir.parent.parent + candidates = [ + repo_root / "build" / "windows" / "x64" / "release" / f"zenserver{_EXE_SUFFIX}", + repo_root / "build" / "linux" / "x86_64" / "release" / f"zenserver{_EXE_SUFFIX}", + repo_root / "build" / "macosx" / "x86_64" / "release" / f"zenserver{_EXE_SUFFIX}", + ] + for c in candidates: + if c.exists(): + return c + + matches = 
list(repo_root.glob(f"build/**/release/zenserver{_EXE_SUFFIX}")) + if matches: + return max(matches, key=lambda p: p.stat().st_mtime) + + sys.exit( + "zenserver executable not found in build/. " + "Run: xmake config -y -m release -a x64 && xmake -y\n" + "Or pass --zenserver-dir <dir>." + ) + + +def _find_minio(zenserver_path: Path) -> Path: + p = zenserver_path.parent / f"minio{_EXE_SUFFIX}" + if not p.exists(): + sys.exit( + f"minio executable not found at {p}. " + "Build with: xmake config -y -m release -a x64 && xmake -y" + ) + return p + + +# --------------------------------------------------------------------------- +# MinIO +# --------------------------------------------------------------------------- + +def _start_minio(minio_exe: Path, data_dir: Path, port: int, console_port: int) -> subprocess.Popen: + minio_data = data_dir / "minio" + minio_data.mkdir(parents=True, exist_ok=True) + env = os.environ.copy() + env["MINIO_ROOT_USER"] = _MINIO_USER + env["MINIO_ROOT_PASSWORD"] = _MINIO_PASS + popen_kwargs: dict = {} + if sys.platform == "win32": + popen_kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP + proc = subprocess.Popen( + [str(minio_exe), "server", str(minio_data), + "--address", f":{port}", + "--console-address", f":{console_port}", + "--quiet"], + env=env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + **popen_kwargs, + ) + print(f"[minio] started (pid {proc.pid}) on port {port}, console on port {console_port}") + return proc + + +def _wait_for_minio(port: int, timeout_s: float = 30.0) -> None: + deadline = time.monotonic() + timeout_s + url = f"http://localhost:{port}/minio/health/live" + while time.monotonic() < deadline: + try: + with urllib.request.urlopen(url, timeout=1): + print("[minio] ready") + return + except Exception: + time.sleep(0.1) + sys.exit(f"[minio] timed out waiting for readiness after {timeout_s}s") + + +def _create_minio_bucket(port: int, bucket: str) -> None: + try: + import boto3 + import 
botocore.config + import botocore.exceptions + except ImportError: + sys.exit( + "[minio] boto3 is required for --s3.\n" + "Install it with: pip install boto3" + ) + s3 = boto3.client( + "s3", + endpoint_url=f"http://localhost:{port}", + aws_access_key_id=_MINIO_USER, + aws_secret_access_key=_MINIO_PASS, + region_name="us-east-1", + config=botocore.config.Config(signature_version="s3v4"), + ) + try: + s3.create_bucket(Bucket=bucket) + print(f"[minio] created bucket '{bucket}'") + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("BucketAlreadyOwnedByYou", "BucketAlreadyExists"): + print(f"[minio] bucket '{bucket}' already exists") + else: + raise + + +# --------------------------------------------------------------------------- +# Hub lifecycle +# --------------------------------------------------------------------------- + +def _start_hub( + zenserver_exe: Path, + data_dir: Path, + port: int, + log_file: Path, + idle_timeout: int, + extra_args: list[str], + extra_env: Optional[dict[str, str]], +) -> tuple[subprocess.Popen, object]: + data_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + str(zenserver_exe), + "hub", + f"--data-dir={data_dir}", + f"--port={port}", + "--hub-instance-http-threads=8", + "--hub-instance-corelimit=4", + "--hub-provision-disk-limit-percent=99", + "--hub-provision-memory-limit-percent=80", + "--hub-instance-limit=500", + f"--hub-watchdog-provisioned-inactivity-timeout-seconds={idle_timeout}", + "--hub-watchdog-inactivity-check-margin-seconds=5", + "--hub-watchdog-cycle-interval-ms=2000", + "--hub-watchdog-cycle-processing-budget-ms=3000", + "--hub-watchdog-activity-check-connect-timeout-ms=20", + "--hub-watchdog-activity-check-request-timeout-ms=50", + ] + extra_args + + env = os.environ.copy() + if extra_env: + env.update(extra_env) + + popen_kwargs: dict = {} + if sys.platform == "win32": + popen_kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP + log_handle = log_file.open("wb") + try: + 
proc = subprocess.Popen( + cmd, env=env, stdout=log_handle, stderr=subprocess.STDOUT, + **popen_kwargs, + ) + except Exception: + log_handle.close() + raise + print(f"[hub] started (pid {proc.pid}), log: {log_file}") + return proc, log_handle + + +def _wait_for_hub(proc: subprocess.Popen, port: int, timeout_s: float = 100.0) -> None: + deadline = time.monotonic() + timeout_s + req = urllib.request.Request(f"http://localhost:{port}/hub/status", + headers={"Accept": "application/json"}) + while time.monotonic() < deadline: + if proc.poll() is not None: + sys.exit(f"[hub] process exited unexpectedly (rc={proc.returncode}) - " + f"is another zenserver already running on port {port}?") + try: + with urllib.request.urlopen(req, timeout=2): + print("[hub] ready") + return + except Exception: + time.sleep(0.2) + sys.exit(f"[hub] timed out waiting for readiness after {timeout_s}s") + + +def _stop_process(proc: subprocess.Popen, name: str, timeout_s: float = 10.0) -> None: + if proc.poll() is not None: + return + proc.terminate() + try: + proc.wait(timeout=timeout_s) + except subprocess.TimeoutExpired: + print(f"[{name}] did not exit after {timeout_s}s, killing") + proc.kill() + proc.wait() + + +# --------------------------------------------------------------------------- +# Module state +# --------------------------------------------------------------------------- + +@dataclass +class ModuleState: + state: str = "idle" # idle|provisioning|active|deprovisioning|watchdog-pending + base_uri: Optional[str] = None + written_keys: list[str] = field(default_factory=list) + activity_rounds_left: int = 0 + explicit_deprovision: bool = False + watchdog_pending_since: float = 0.0 + generation: int = 0 # incremented on each provision; queue entries carry this to discard stale entries + + +# --------------------------------------------------------------------------- +# Counters +# --------------------------------------------------------------------------- + +@dataclass +class Counters: 
+ provisions: int = 0 + deprovisions_explicit: int = 0 + deprovisions_watchdog: int = 0 + provision_rejected: int = 0 + activity_rounds: int = 0 + writes: int = 0 + reads: int = 0 + errors: int = 0 + last_reject_time: float = 0.0 + _lock: threading.Lock = field(default_factory=threading.Lock, repr=False, compare=False) + + def inc(self, name: str, n: int = 1) -> None: + with self._lock: + setattr(self, name, getattr(self, name) + n) + + def record_reject(self) -> None: + with self._lock: + self.provision_rejected += 1 + self.last_reject_time = time.monotonic() + + def snapshot(self) -> dict: + with self._lock: + return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} + + +# --------------------------------------------------------------------------- +# Hub API helpers (urllib, no external deps) +# --------------------------------------------------------------------------- + +def _hub_post(port: int, path: str, timeout_s: float = 30.0) -> tuple[int, dict]: + url = f"http://localhost:{port}{path}" + req = urllib.request.Request(url, data=b"{}", method="POST", + headers={"Content-Type": "application/json", + "Accept": "application/json"}) + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + body = json.loads(resp.read()) + return resp.status, body + except urllib.error.HTTPError as e: + try: + body = json.loads(e.read()) + except Exception: + body = {} + return e.code, body + except Exception: + return 0, {} + + +def _hub_status(port: int, timeout_s: float = 5.0) -> Optional[list[dict]]: + try: + req = urllib.request.Request(f"http://localhost:{port}/hub/status", + headers={"Accept": "application/json"}) + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + data = json.loads(resp.read()) + return data.get("modules", []) + except Exception: + return None + + +def _instance_put(base_uri: str, key: str, data: bytes, timeout_s: float = 10.0) -> int: + url = f"{base_uri}/z$/{_NAMESPACE}/{_BUCKET}/{key}" + req = 
urllib.request.Request(url, data=data, method="PUT", + headers={"Content-Type": "application/octet-stream", + "Accept": "application/json"}) + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + return resp.status + except urllib.error.HTTPError as e: + return e.code + except Exception: + return 0 + + +def _instance_get(base_uri: str, key: str, timeout_s: float = 10.0) -> int: + url = f"{base_uri}/z$/{_NAMESPACE}/{_BUCKET}/{key}" + req = urllib.request.Request(url, headers={"Accept": "application/octet-stream"}) + try: + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + resp.read() + return resp.status + except urllib.error.HTTPError as e: + return e.code + except Exception: + return 0 + + +# --------------------------------------------------------------------------- +# Worker tasks (run in thread pool - no sleeping) +# --------------------------------------------------------------------------- + +def _task_provision( + module_id: str, + port: int, + state_map: dict[str, ModuleState], + state_lock: threading.Lock, + activity_queue: queue.PriorityQueue, + counters: Counters, + explicit_deprovision_rate: float, + stop_event: threading.Event, +) -> None: + status, body = _hub_post(port, f"/hub/modules/{module_id}/provision") + + generation = 0 + schedule_burst = False + with state_lock: + mod = state_map[module_id] + if mod.state != "provisioning": + # Shutdown changed state while provision was in-flight; leave it alone + return + if status in (200, 202): + instance_port = body.get("port") + base_uri = f"http://localhost:{instance_port}" if instance_port else None + if status == 200: + # Instance is immediately ready + mod.generation += 1 + generation = mod.generation + mod.state = "active" + mod.base_uri = base_uri + mod.written_keys = [] + mod.activity_rounds_left = random.randint(5, 20) + mod.explicit_deprovision = random.random() < explicit_deprovision_rate + schedule_burst = not stop_event.is_set() + else: + # 202: async provision - 
instance is still starting up. + # Stay in "provisioning"; the hub status poll activates it when ready. + # _shutdown_deprovision_all also covers "provisioning", so shutdown is safe. + mod.state = "provisioning" + mod.base_uri = base_uri + elif status == 409: + mod.state = "idle" + counters.record_reject() + return + else: + mod.state = "idle" + counters.inc("errors") + return + + counters.inc("provisions") + if schedule_burst: + activity_queue.put((time.monotonic() + random.uniform(0.1, 0.3), generation, module_id)) + + +def _task_deprovision( + module_id: str, + port: int, + state_map: dict[str, ModuleState], + state_lock: threading.Lock, + counters: Counters, + retries: int = 5, +) -> None: + succeeded = False + for attempt in range(retries + 1): + status, _ = _hub_post(port, f"/hub/modules/{module_id}/deprovision") + if status in (200, 202, 404): + succeeded = True + break + if status == 409 and attempt < retries: + time.sleep(0.2) + continue + counters.inc("errors") + break + + with state_lock: + state_map[module_id].state = "idle" + state_map[module_id].base_uri = None + state_map[module_id].written_keys = [] + + if succeeded: + counters.inc("deprovisions_explicit") + + +def _task_activity_burst( + module_id: str, + generation: int, + state_map: dict[str, ModuleState], + state_lock: threading.Lock, + activity_queue: queue.PriorityQueue, + counters: Counters, + pool: ThreadPoolExecutor, + port: int, +) -> None: + with state_lock: + mod = state_map[module_id] + if mod.state != "active" or mod.generation != generation: + return + base_uri = mod.base_uri + existing_keys = list(mod.written_keys) + + if not base_uri: + with state_lock: + state_map[module_id].state = "idle" + return + + # Write 3-8 new keys per burst + num_writes = random.randint(3, 8) + new_keys: list[str] = [] + write_errors = 0 + for _ in range(num_writes): + key = f"{random.getrandbits(160):040x}" + data = random.choice(_BLOBS) + status = _instance_put(base_uri, key, data) + if status in (200, 
201, 204): + new_keys.append(key) + counters.inc("writes") + else: + write_errors += 1 + counters.inc("errors") + + if write_errors > 0 and not new_keys and not existing_keys: + # Instance unreachable - likely watchdog fired while we were scheduled + with state_lock: + mod = state_map[module_id] + if mod.generation == generation: + mod.state = "idle" + mod.base_uri = None + mod.written_keys = [] + return + + # Read back 1-3 random keys from all known keys + all_keys = existing_keys + new_keys + num_reads = min(random.randint(2, 5), len(all_keys)) + for key in random.sample(all_keys, num_reads): + status = _instance_get(base_uri, key) + if status == 200: + counters.inc("reads") + elif status in (404, 0): + # Key may not exist yet or instance gone; not fatal + pass + else: + counters.inc("errors") + + counters.inc("activity_rounds") + + next_generation = 0 + with state_lock: + mod = state_map[module_id] + if mod.state != "active" or mod.generation != generation: + return + mod.written_keys = (existing_keys + new_keys)[-200:] # cap list size + mod.activity_rounds_left -= 1 + + if mod.activity_rounds_left <= 0: + if mod.explicit_deprovision: + mod.state = "deprovisioning" + mod.base_uri = None + try: + pool.submit( + _task_deprovision, + module_id, port, state_map, state_lock, counters, + ) + except RuntimeError: + pass # pool shutting down; hub watchdog will clean up + else: + mod.state = "watchdog-pending" + mod.watchdog_pending_since = time.monotonic() + return + + next_generation = mod.generation # capture under lock before releasing + + # Schedule next burst soon for semi-continuous activity + next_time = time.monotonic() + random.uniform(0.2, 0.8) + activity_queue.put((next_time, next_generation, module_id)) + + +# --------------------------------------------------------------------------- +# Orchestrator +# --------------------------------------------------------------------------- + +def _orchestrate( + port: int, + state_map: dict[str, ModuleState], + 
state_lock: threading.Lock, + activity_queue: queue.PriorityQueue, + counters: Counters, + pool: ThreadPoolExecutor, + stop_event: threading.Event, + target_active: int, + explicit_deprovision_rate: float, + idle_timeout: int, +) -> None: + last_status_poll = 0.0 + provision_backoff = False + + while not stop_event.is_set(): + now = time.monotonic() + + # Drain activity queue for due bursts + while True: + try: + next_time, generation, module_id = activity_queue.get_nowait() + except queue.Empty: + break + if next_time <= now: + try: + pool.submit( + _task_activity_burst, + module_id, generation, state_map, state_lock, + activity_queue, counters, pool, port, + ) + except RuntimeError: + break # pool shutting down + else: + activity_queue.put((next_time, generation, module_id)) + break + + # Activate/deactivate backoff based on recent 409 rejections + last_reject = counters.last_reject_time + if last_reject > 0 and now - last_reject < 5.0: + provision_backoff = True + elif provision_backoff and now - last_reject >= 5.0: + provision_backoff = False + + # Count states and submit provision tasks if needed + if not provision_backoff: + with state_lock: + idle_ids = [mid for mid, m in state_map.items() if m.state == "idle"] + working_count = sum( + 1 for m in state_map.values() + if m.state in ("active", "provisioning", "deprovisioning") + ) + watchdog_count = sum( + 1 for m in state_map.values() + if m.state == "watchdog-pending" + ) + + # Provision enough to keep working_count at target, but cap total + # in-flight (working + watchdog-pending) at 2x target to prevent + # runaway accumulation when all modules cycle to watchdog-pending. 
+ inflight_cap = target_active * 2 + deficit = min( + target_active - working_count, + inflight_cap - working_count - watchdog_count, + ) + to_provision = idle_ids[:max(0, deficit)] + for mid in to_provision: + with state_lock: + if state_map[mid].state != "idle": + continue + state_map[mid].state = "provisioning" + try: + pool.submit( + _task_provision, + mid, port, state_map, state_lock, + activity_queue, counters, explicit_deprovision_rate, + stop_event, + ) + except RuntimeError: + with state_lock: + state_map[mid].state = "idle" + + # Poll hub status to detect async provision completions and watchdog-fired modules + if now - last_status_poll >= 5.0: + last_status_poll = now + modules_status = _hub_status(port) + if modules_status is not None: + hub_ids = {m["moduleId"]: m.get("state", "") for m in modules_status} + # Hub watchdog fires at ~idle_timeout from last activity. However, if the + # watchdog's previous visit predates the last burst, it sees a changed + # activity sum and resets LastActivityTime, delaying deprovision by ~173s. + # Explicitly deprovision after idle_timeout + 15s to avoid waiting for that. 
+ timeout_threshold = idle_timeout + 15 + to_deprovision_explicitly: list[str] = [] + with state_lock: + for mid, mod in state_map.items(): + if mod.state == "provisioning" and mod.base_uri is not None: + # Waiting for async (202) provision to complete + hub_state = hub_ids.get(mid, "") + if hub_state == "provisioned": + mod.generation += 1 + mod.state = "active" + mod.written_keys = [] + mod.activity_rounds_left = random.randint(5, 20) + mod.explicit_deprovision = random.random() < explicit_deprovision_rate + activity_queue.put( + (time.monotonic() + random.uniform(0.1, 0.3), mod.generation, mid) + ) + elif mid not in hub_ids: + # Provision failed or was rolled back + mod.state = "idle" + mod.base_uri = None + elif mod.state == "watchdog-pending": + hub_state = hub_ids.get(mid, "") + gone = mid not in hub_ids + timed_out = (now - mod.watchdog_pending_since) > timeout_threshold + if gone or hub_state in ("unprovisioned", "deprovisioning"): + mod.state = "idle" + mod.base_uri = None + mod.written_keys = [] + counters.inc("deprovisions_watchdog") + elif timed_out: + mod.state = "deprovisioning" + to_deprovision_explicitly.append(mid) + for mid in to_deprovision_explicitly: + try: + pool.submit(_task_deprovision, mid, port, state_map, state_lock, counters) + except RuntimeError: + with state_lock: + if state_map[mid].state == "deprovisioning": + state_map[mid].state = "idle" + + stop_event.wait(timeout=0.05) + + +# --------------------------------------------------------------------------- +# Stats display +# --------------------------------------------------------------------------- + +def _stats_thread( + state_map: dict[str, ModuleState], + state_lock: threading.Lock, + counters: Counters, + stop_event: threading.Event, + interval_s: float = 5.0, +) -> None: + is_tty = sys.stdout.isatty() + prev_lines = 0 + t0 = time.monotonic() + prev_snap: Optional[dict] = None + prev_t = t0 + + while not stop_event.is_set(): + stop_event.wait(timeout=interval_s) + now = 
time.monotonic() + elapsed = now - t0 + dt = now - prev_t + + snap = counters.snapshot() + with state_lock: + states: dict[str, int] = {} + for m in state_map.values(): + states[m.state] = states.get(m.state, 0) + 1 + + def rate(key: str) -> float: + if prev_snap is None or dt <= 0: + return 0.0 + return (snap[key] - prev_snap[key]) / dt * 60.0 + + lines = [ + f"[{time.strftime('%H:%M:%S')}] elapsed={elapsed:.0f}s", + f" modules: idle={states.get('idle', 0)} " + f"provisioning={states.get('provisioning', 0)} " + f"active={states.get('active', 0)} " + f"watchdog-pending={states.get('watchdog-pending', 0)} " + f"deprovisioning={states.get('deprovisioning', 0)}", + f" totals: provisions={snap['provisions']} " + f"deprov-explicit={snap['deprovisions_explicit']} " + f"deprov-watchdog={snap['deprovisions_watchdog']} " + f"rejected={snap['provision_rejected']} " + f"errors={snap['errors']}", + f" data: writes={snap['writes']} reads={snap['reads']} " + f"rounds={snap['activity_rounds']}", + f" rates/min: provisions={rate('provisions'):.1f} " + f"deprov={rate('deprovisions_explicit') + rate('deprovisions_watchdog'):.1f} " + f"writes={rate('writes'):.1f} reads={rate('reads'):.1f}", + ] + + if is_tty and prev_lines > 0: + # Move cursor up to overwrite previous block + sys.stdout.write(f"\033[{prev_lines}A\033[J") + + sys.stdout.write("\n".join(lines) + "\n") + sys.stdout.flush() + prev_lines = len(lines) + prev_snap = snap + prev_t = now + + +# --------------------------------------------------------------------------- +# Shutdown +# --------------------------------------------------------------------------- + +def _wait_for_hub_idle(port: int, hub_proc: subprocess.Popen, timeout_s: float = 120.0) -> None: + """Wait until the hub reports no transitioning instances (dehydration done).""" + _STABLE = {"provisioned", "hibernated", "crashed", "unprovisioned"} + print(f"[shutdown] waiting for hub dehydration (up to {timeout_s:.0f}s)...") + deadline = time.monotonic() + timeout_s + 
while time.monotonic() < deadline: + if hub_proc.poll() is not None: + print("[shutdown] hub process has exited") + return + modules = _hub_status(port, timeout_s=5.0) + if modules is None: + # Hub not responding. If it has exited, we're done. If it's still alive + # it may be saturated with S3 uploads - keep waiting rather than assuming done. + if hub_proc.poll() is not None: + print("[shutdown] hub process has exited") + return + time.sleep(1.0) + continue + transitioning = [m for m in modules if m.get("state") not in _STABLE] + remaining = len(modules) + if not transitioning: + if remaining: + print(f"[shutdown] hub idle ({remaining} instances in stable state)") + else: + print("[shutdown] hub idle (no instances remaining)") + return + print(f"[shutdown] {len(transitioning)} instance(s) still dehydrating...", end="\r") + time.sleep(1.0) + print(f"\n[shutdown] WARNING: hub did not become idle within {timeout_s:.0f}s") + + +def _shutdown_deprovision_all( + port: int, + state_map: dict[str, ModuleState], + state_lock: threading.Lock, + counters: Counters, + workers: int, + timeout_s: float = 60.0, +) -> None: + with state_lock: + to_deprovision = [ + mid for mid, m in state_map.items() + if m.state in ("active", "watchdog-pending", "provisioning") + ] + for mid in to_deprovision: + state_map[mid].state = "deprovisioning" + + if not to_deprovision: + return + + print(f"\n[shutdown] deprovisioning {len(to_deprovision)} active modules...") + pool = ThreadPoolExecutor(max_workers=min(workers, len(to_deprovision))) + futures: list[Future] = [ + pool.submit(_task_deprovision, mid, port, state_map, state_lock, counters) + for mid in to_deprovision + ] + pool.shutdown(wait=False) + + done_set, not_done_set = futures_wait(futures, timeout=timeout_s) + if not_done_set: + print(f"[shutdown] WARNING: {len(not_done_set)} deprovision tasks did not complete within {timeout_s}s") + else: + print(f"[shutdown] all modules deprovisioned") + + +# 
--------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--data-dir", default="E:/Dev/hub-loadtest", + help="Hub --data-dir (default: E:/Dev/hub-loadtest)") + parser.add_argument("--port", type=int, default=8558, + help="Hub HTTP port (default: 8558)") + parser.add_argument("--workers", type=int, default=50, + help="Thread pool size for HTTP calls (default: 50)") + parser.add_argument("--active-modules", type=int, default=100, + help="Target number of concurrently provisioned modules (default: 100)") + parser.add_argument("--module-count", type=int, default=1000, + help="Total predefined module name pool size (default: 1000)") + parser.add_argument("--idle-timeout", type=int, default=90, + help="Hub watchdog inactivity timeout in seconds (default: 90)") + parser.add_argument("--explicit-deprovision-rate", type=float, default=0.3, + help="Fraction of modules explicitly deprovisioned (default: 0.3)") + parser.add_argument("--zenserver-dir", + help="Directory containing zenserver executable (auto-detected by default)") + parser.add_argument("--s3", action="store_true", + help="Use local MinIO for hub de/hydration instead of filesystem") + parser.add_argument("--minio-port", type=int, default=9000, + help="MinIO S3 API port (default: 9000)") + parser.add_argument("--minio-console-port", type=int, default=9001, + help="MinIO web console port (default: 9001)") + parser.add_argument("--s3-bucket", default="zen-load-test", + help="S3 bucket name for MinIO hydration (default: zen-load-test)") + parser.add_argument("--no-browser", action="store_true", + help="Skip opening browser tabs") + args = parser.parse_args() + + if args.active_modules > args.module_count: + sys.exit( + f"--active-modules 
({args.active_modules}) must not exceed " + f"--module-count ({args.module_count})" + ) + + _init_blobs() + + data_dir = Path(args.data_dir) + hub_log = data_dir / "hub.log" + + zenserver_exe = _find_zenserver(args.zenserver_dir) + print(f"[setup] zenserver: {zenserver_exe}") + + module_names = [f"load-test-module-{i:04d}" for i in range(args.module_count)] + state_map: dict[str, ModuleState] = {mid: ModuleState() for mid in module_names} + state_lock = threading.Lock() + activity_queue: queue.PriorityQueue = queue.PriorityQueue() + counters = Counters() + stop_event = threading.Event() + + minio_proc: Optional[subprocess.Popen] = None + hub_proc: Optional[subprocess.Popen] = None + hub_log_handle = None + + hub_extra_args: list[str] = [] + hub_extra_env: Optional[dict[str, str]] = None + + try: + if args.s3: + minio_exe = _find_minio(zenserver_exe) + minio_proc = _start_minio(minio_exe, data_dir, args.minio_port, args.minio_console_port) + _wait_for_minio(args.minio_port) + _create_minio_bucket(args.minio_port, args.s3_bucket) + if not args.no_browser: + webbrowser.open(f"http://localhost:{args.minio_console_port}") + config_json = { + "type": "s3", + "settings": { + "uri": f"s3://{args.s3_bucket}", + "endpoint": f"http://localhost:{args.minio_port}", + "path-style": True, + "region": "us-east-1", + }, + } + data_dir.mkdir(parents=True, exist_ok=True) + config_path = data_dir / "hydration_config.json" + config_path.write_text(json.dumps(config_json), encoding="ascii") + hub_extra_args = [f"--hub-hydration-target-config={config_path}"] + hub_extra_env = { + "AWS_ACCESS_KEY_ID": _MINIO_USER, + "AWS_SECRET_ACCESS_KEY": _MINIO_PASS, + } + + hub_proc, hub_log_handle = _start_hub( + zenserver_exe, data_dir, args.port, + hub_log, args.idle_timeout, + hub_extra_args, hub_extra_env, + ) + _wait_for_hub(hub_proc, args.port) + if not args.no_browser: + webbrowser.open(f"http://localhost:{args.port}/dashboard") + + print(f"[load-test] starting: pool={args.module_count} 
modules, " + f"target={args.active_modules} active, " + f"idle-timeout={args.idle_timeout}s, " + f"explicit-deprovision={args.explicit_deprovision_rate:.0%}") + print("[load-test] press Ctrl-C to stop") + + with ThreadPoolExecutor(max_workers=args.workers) as pool: + stats_t = threading.Thread( + target=_stats_thread, + args=(state_map, state_lock, counters, stop_event), + daemon=True, + ) + stats_t.start() + + orch_t = threading.Thread( + target=_orchestrate, + args=( + args.port, state_map, state_lock, + activity_queue, counters, pool, + stop_event, args.active_modules, + args.explicit_deprovision_rate, args.idle_timeout, + ), + daemon=True, + ) + orch_t.start() + + try: + while not stop_event.is_set(): + time.sleep(0.5) + if hub_proc.poll() is not None: + print(f"\n[hub] process exited unexpectedly (rc={hub_proc.returncode})") + break + except KeyboardInterrupt: + print("\n[load-test] Ctrl-C received, shutting down...") + + stop_event.set() + orch_t.join(timeout=3.0) + stats_t.join(timeout=3.0) + + _shutdown_deprovision_all( + args.port, state_map, state_lock, counters, + args.workers, timeout_s=60.0, + ) + _wait_for_hub_idle(args.port, hub_proc, timeout_s=max(120.0, args.active_modules * 1.0)) + + _stop_process(hub_proc, "hub", timeout_s=120.0) + if hub_log_handle is not None: + hub_log_handle.close() + hub_log_handle = None + if minio_proc is not None: + _stop_process(minio_proc, "minio") + minio_proc = None + + finally: + # Safety net: only reached if an exception occurred before normal shutdown + if hub_proc is not None and hub_proc.poll() is None: + _stop_process(hub_proc, "hub") + if hub_log_handle is not None: + hub_log_handle.close() + if minio_proc is not None: + _stop_process(minio_proc, "minio") + + +if __name__ == "__main__": + main() diff --git a/scripts/test_scripts/hub_provision_perf_test.py b/scripts/test_scripts/hub_provision_perf_test.py new file mode 100644 index 000000000..5b264ad62 --- /dev/null +++ 
b/scripts/test_scripts/hub_provision_perf_test.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python3 +"""Hub provisioning performance test. + +Floods a zenserver hub with concurrent provision requests until the hub rejects +further provisioning (HTTP 409), then deprovisions all instances and exits. +A WPR ETW trace runs for the entire test and produces a .etl file for analysis +in WPA or PerfView (CPU sampling + context-switch events for lock contention). + +Optional --s3: starts a local MinIO server and configures the hub to use it +as the de/hydration backend instead of the default local filesystem. + +Requirements: + pip install boto3 (only needed with --s3) +WPR (wpr.exe) must be available (ships with Windows). Running as Administrator +is required for WPR to collect ETW traces. +""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +import time +import urllib.error +import urllib.request +import uuid +import webbrowser +from concurrent.futures import FIRST_COMPLETED, Future, ThreadPoolExecutor, as_completed, wait +from pathlib import Path +from typing import Optional + +_EXE_SUFFIX = ".exe" if sys.platform == "win32" else "" +_STABLE_STATES = {"provisioned", "hibernated", "crashed", "unprovisioned"} +_MINIO_USER = "minioadmin" +_MINIO_PASS = "minioadmin" + + +# --------------------------------------------------------------------------- +# Executable discovery +# --------------------------------------------------------------------------- + +def _find_zenserver(override: Optional[str]) -> Path: + if override: + p = Path(override) / f"zenserver{_EXE_SUFFIX}" + if not p.exists(): + sys.exit(f"zenserver not found at {p}") + return p + + script_dir = Path(__file__).resolve().parent + repo_root = script_dir.parent.parent + candidates = [ + repo_root / "build" / "windows" / "x64" / "release" / f"zenserver{_EXE_SUFFIX}", + repo_root / "build" / "linux" / "x86_64" / "release" / f"zenserver{_EXE_SUFFIX}", + repo_root / 
"build" / "macosx" / "x86_64" / "release" / f"zenserver{_EXE_SUFFIX}", + ] + for c in candidates: + if c.exists(): + return c + + matches = repo_root.glob(f"build/**/release/zenserver{_EXE_SUFFIX}") + for match in sorted(matches, key=lambda p: p.stat().st_mtime, reverse=True): + return match + + sys.exit( + "zenserver executable not found in build/. " + "Run: xmake config -y -m release -a x64 && xmake -y\n" + "Or pass --zenserver-dir <dir>." + ) + + +def _find_minio(zenserver_path: Path) -> Path: + p = zenserver_path.parent / f"minio{_EXE_SUFFIX}" + if not p.exists(): + sys.exit( + f"minio executable not found at {p}. " + "Build with: xmake config -y -m release -a x64 && xmake -y" + ) + return p + + +# --------------------------------------------------------------------------- +# WPR (Windows Performance Recorder) +# --------------------------------------------------------------------------- + +def _is_elevated() -> bool: + if sys.platform != "win32": + return False + try: + import ctypes + return bool(ctypes.windll.shell32.IsUserAnAdmin()) + except Exception: + return False + + +def _wpr_start(trace_file: Path) -> bool: + if sys.platform != "win32": + print("[wpr] WPR is Windows-only; skipping ETW trace.") + return False + if not _is_elevated(): + print("[wpr] skipping ETW trace - re-run from an elevated (Administrator) prompt to collect traces.") + return False + result = subprocess.run( + ["wpr.exe", "-start", "CPU", "-filemode"], + capture_output=True, text=True + ) + if result.returncode != 0: + print(f"[wpr] WARNING: wpr -start failed (code {result.returncode}):\n{result.stderr.strip()}") + return False + print(f"[wpr] ETW trace started (CPU profile, file mode)") + return True + + +def _wpr_stop(trace_file: Path) -> None: + if sys.platform != "win32": + return + result = subprocess.run( + ["wpr.exe", "-stop", str(trace_file)], + capture_output=True, text=True + ) + if result.returncode != 0: + print(f"[wpr] WARNING: wpr -stop failed (code 
{result.returncode}):\n{result.stderr.strip()}") + else: + print(f"[wpr] ETW trace saved: {trace_file}") + + +# --------------------------------------------------------------------------- +# MinIO +# --------------------------------------------------------------------------- + +def _start_minio(minio_exe: Path, data_dir: Path, port: int, console_port: int) -> subprocess.Popen: + minio_data = data_dir / "minio" + minio_data.mkdir(parents=True, exist_ok=True) + env = os.environ.copy() + env["MINIO_ROOT_USER"] = _MINIO_USER + env["MINIO_ROOT_PASSWORD"] = _MINIO_PASS + proc = subprocess.Popen( + [str(minio_exe), "server", str(minio_data), + "--address", f":{port}", + "--console-address", f":{console_port}", + "--quiet"], + env=env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + print(f"[minio] started (pid {proc.pid}) on port {port}, console on port {console_port}") + return proc + + +def _wait_for_minio(port: int, timeout_s: float = 30.0) -> None: + deadline = time.monotonic() + timeout_s + url = f"http://localhost:{port}/minio/health/live" + while time.monotonic() < deadline: + try: + with urllib.request.urlopen(url, timeout=1): + print("[minio] ready") + return + except Exception: + time.sleep(0.1) + sys.exit(f"[minio] timed out waiting for readiness after {timeout_s}s") + + +def _create_minio_bucket(port: int, bucket: str) -> None: + try: + import boto3 + import botocore.config + import botocore.exceptions + except ImportError: + sys.exit( + "[minio] boto3 is required for --s3.\n" + "Install it with: pip install boto3" + ) + s3 = boto3.client( + "s3", + endpoint_url=f"http://localhost:{port}", + aws_access_key_id=_MINIO_USER, + aws_secret_access_key=_MINIO_PASS, + region_name="us-east-1", + config=botocore.config.Config(signature_version="s3v4"), + ) + try: + s3.create_bucket(Bucket=bucket) + print(f"[minio] created bucket '{bucket}'") + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] in ("BucketAlreadyOwnedByYou", 
"BucketAlreadyExists"): + print(f"[minio] bucket '{bucket}' already exists") + else: + raise + + +# --------------------------------------------------------------------------- +# Hub lifecycle +# --------------------------------------------------------------------------- + +def _start_hub( + zenserver_exe: Path, + data_dir: Path, + port: int, + log_file: Path, + extra_args: list[str], + extra_env: Optional[dict[str, str]], +) -> tuple[subprocess.Popen, object]: + data_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + str(zenserver_exe), + "hub", + f"--data-dir={data_dir}", + f"--port={port}", + "--hub-instance-http-threads=8", + "--hub-instance-corelimit=4", + "--hub-provision-disk-limit-percent=99", + "--hub-provision-memory-limit-percent=80", + "--hub-instance-limit=100", + ] + extra_args + + env = os.environ.copy() + if extra_env: + env.update(extra_env) + + log_handle = log_file.open("wb") + try: + proc = subprocess.Popen( + cmd, env=env, stdout=log_handle, stderr=subprocess.STDOUT + ) + except Exception: + log_handle.close() + raise + print(f"[hub] started (pid {proc.pid}), log: {log_file}") + return proc, log_handle + + +def _wait_for_hub(proc: subprocess.Popen, port: int, timeout_s: float = 100.0) -> None: + deadline = time.monotonic() + timeout_s + req = urllib.request.Request(f"http://localhost:{port}/hub/status", + headers={"Accept": "application/json"}) + while time.monotonic() < deadline: + if proc.poll() is not None: + sys.exit(f"[hub] process exited unexpectedly (rc={proc.returncode}) - " + f"is another zenserver already running on port {port}?") + try: + with urllib.request.urlopen(req, timeout=2): + print("[hub] ready") + return + except Exception: + time.sleep(0.2) + sys.exit(f"[hub] timed out waiting for readiness after {timeout_s}s") + + +def _stop_process(proc: subprocess.Popen, name: str, timeout_s: float = 10.0) -> None: + if proc.poll() is not None: + return + proc.terminate() + try: + proc.wait(timeout=timeout_s) + except 
subprocess.TimeoutExpired: + print(f"[{name}] did not exit after {timeout_s}s, killing") + proc.kill() + proc.wait() + + +# --------------------------------------------------------------------------- +# Hub HTTP helpers +# --------------------------------------------------------------------------- + +def _hub_url(port: int, path: str) -> str: + return f"http://localhost:{port}{path}" + + +def _provision_one(port: int) -> tuple[str, int]: + module_id = str(uuid.uuid4()) + url = _hub_url(port, f"/hub/modules/{module_id}/provision") + req = urllib.request.Request(url, data=b"{}", method="POST", + headers={"Content-Type": "application/json", + "Accept": "application/json"}) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return module_id, resp.status + except urllib.error.HTTPError as e: + return module_id, e.code + except Exception: + return module_id, 0 + + +def _deprovision_one(port: int, module_id: str, retries: int = 5) -> int: + url = _hub_url(port, f"/hub/modules/{module_id}/deprovision") + req = urllib.request.Request(url, data=b"{}", method="POST", + headers={"Content-Type": "application/json", + "Accept": "application/json"}) + for attempt in range(retries + 1): + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return resp.status + except urllib.error.HTTPError as e: + if e.code == 409 and attempt < retries: + time.sleep(0.2) + continue + return e.code + except Exception: + return 0 + + +def _hub_status(port: int, timeout_s: float = 5.0) -> Optional[list[dict]]: + try: + req = urllib.request.Request(_hub_url(port, "/hub/status"), + headers={"Accept": "application/json"}) + with urllib.request.urlopen(req, timeout=timeout_s) as resp: + data = json.loads(resp.read()) + return data.get("modules", []) + except Exception: + return None + + +# --------------------------------------------------------------------------- +# Test phases +# --------------------------------------------------------------------------- + +def 
_flood_provision(port: int, workers: int) -> tuple[list[str], float, float]: + stopped = False + provisioned_ids: list[str] = [] + time_to_rejection: Optional[float] = None + t0 = time.monotonic() + + with ThreadPoolExecutor(max_workers=workers) as pool: + pending: set[Future] = {pool.submit(_provision_one, port) for _ in range(workers)} + + while pending: + done, pending = wait(pending, return_when=FIRST_COMPLETED) + + for f in done: + module_id, status = f.result() + if status in (200, 202): + provisioned_ids.append(module_id) + elif status == 409: + if not stopped: + time_to_rejection = time.monotonic() - t0 + stopped = True + print(f"\n[flood] hub rejected provisioning after " + f"{len(provisioned_ids)} instances " + f"({time_to_rejection:.2f}s)") + elif status == 0: + if not stopped: + stopped = True + print(f"\n[flood] hub unreachable - stopping flood " + f"({len(provisioned_ids)} instances so far)") + else: + print(f"[flood] unexpected status {status} for {module_id}") + + if not stopped: + pending.add(pool.submit(_provision_one, port)) + + wall_clock = time.monotonic() - t0 + return provisioned_ids, time_to_rejection or wall_clock, wall_clock + + +def _wait_stable(port: int, timeout_s: float = 20.0) -> None: + print("[hub] waiting for all instances to reach stable state...") + deadline = time.monotonic() + timeout_s + status_timeout_s = 5.0 + while time.monotonic() < deadline: + modules = _hub_status(port, timeout_s=status_timeout_s) + if modules is None: + time.sleep(0.5) + continue + transitioning = [m for m in modules if m.get("state") not in _STABLE_STATES] + elapsed = time.monotonic() - (deadline - timeout_s) + print(f"[hub] {elapsed:.1f}s: {len(modules) - len(transitioning)}/{len(modules)} stable", end="\r") + if not transitioning: + print(f"\n[hub] all {len(modules)} instances in stable state") + return + time.sleep(0.5) + print(f"\n[hub] WARNING: timed out waiting for stable states after {timeout_s}s") + + +def _deprovision_all(port: int, 
module_ids: list[str], workers: int) -> None: + raw_status = _hub_status(port, timeout_s=60.0) + if raw_status is None: + print("[deprovision] WARNING: could not reach hub to enumerate extra modules - " + "only deprovisioning tracked instances") + extra_ids = {m["moduleId"] for m in (raw_status or []) + if m.get("state") not in ("unprovisioned",)} - set(module_ids) + all_ids = list(module_ids) + list(extra_ids) + + print(f"[deprovision] deprovisioning {len(all_ids)} instances...") + t0 = time.monotonic() + errors = 0 + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = {pool.submit(_deprovision_one, port, mid): mid for mid in all_ids} + for f in as_completed(futures): + status = f.result() + if status not in (200, 202, 409): + errors += 1 + print(f"[deprovision] module {futures[f]}: unexpected status {status}") + elapsed = time.monotonic() - t0 + print(f"[deprovision] done in {elapsed:.2f}s ({errors} errors)") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--data-dir", default="E:/Dev/hub-perftest", + help="Hub --data-dir (default: E:/Dev/hub-perftest)") + parser.add_argument("--port", type=int, default=8558, + help="Hub HTTP port (default: 8558)") + parser.add_argument("--workers", type=int, default=20, + help="Concurrent provisioning threads (default: 20)") + parser.add_argument("--trace-file", default="hub_perf_trace.etl", + help="WPR output .etl path (default: hub_perf_trace.etl)") + parser.add_argument("--zenserver-dir", + help="Directory containing zenserver executable (auto-detected by default)") + parser.add_argument("--s3", action="store_true", + help="Use local MinIO for hub de/hydration instead of filesystem") + parser.add_argument("--minio-port", type=int, 
default=9000, + help="MinIO S3 API port when --s3 is used (default: 9000)") + parser.add_argument("--minio-console-port", type=int, default=9001, + help="MinIO web console port when --s3 is used (default: 9001)") + parser.add_argument("--s3-bucket", default="zen-hydration-test", + help="S3 bucket name for MinIO hydration (default: zen-hydration-test)") + args = parser.parse_args() + + data_dir = Path(args.data_dir) + trace_file = Path(args.trace_file).resolve() + hub_log = data_dir / "hub.log" + + zenserver_exe = _find_zenserver(args.zenserver_dir) + print(f"[setup] zenserver: {zenserver_exe}") + + minio_proc: Optional[subprocess.Popen] = None + hub_proc: Optional[subprocess.Popen] = None + hub_log_handle = None + wpr_started = False + + hub_extra_args: list[str] = [] + hub_extra_env: Optional[dict[str, str]] = None + + try: + if args.s3: + minio_exe = _find_minio(zenserver_exe) + minio_proc = _start_minio(minio_exe, data_dir, args.minio_port, args.minio_console_port) + _wait_for_minio(args.minio_port) + _create_minio_bucket(args.minio_port, args.s3_bucket) + webbrowser.open(f"http://localhost:{args.minio_console_port}") + config_json = { + "type": "s3", + "settings": { + "uri": f"s3://{args.s3_bucket}", + "endpoint": f"http://localhost:{args.minio_port}", + "path-style": True, + "region": "us-east-1", + }, + } + data_dir.mkdir(parents=True, exist_ok=True) + config_path = data_dir / "hydration_config.json" + config_path.write_text(json.dumps(config_json), encoding="ascii") + hub_extra_args = [ + f"--hub-hydration-target-config={config_path}", + ] + hub_extra_env = { + "AWS_ACCESS_KEY_ID": _MINIO_USER, + "AWS_SECRET_ACCESS_KEY": _MINIO_PASS, + } + + wpr_started = _wpr_start(trace_file) + + hub_proc, hub_log_handle = _start_hub( + zenserver_exe, data_dir, args.port, + hub_log, hub_extra_args, hub_extra_env + ) + _wait_for_hub(hub_proc, args.port) + webbrowser.open(f"http://localhost:{args.port}/dashboard") + + provisioned_ids, time_to_rejection, wall_clock = 
_flood_provision(args.port, args.workers) + + print(f"\n[results] provisioned : {len(provisioned_ids)}") + print(f"[results] time to 409 : {time_to_rejection:.3f}s") + print(f"[results] wall clock : {wall_clock:.3f}s") + if time_to_rejection > 0: + print(f"[results] rate : {len(provisioned_ids) / time_to_rejection:.1f} provisions/s") + + _wait_stable(args.port, timeout_s=120.0) + _deprovision_all(args.port, provisioned_ids, args.workers) + dehydration_timeout_s = max(60.0, len(provisioned_ids) * 0.5) + _wait_stable(args.port, timeout_s=dehydration_timeout_s) + + finally: + if wpr_started: + _wpr_stop(trace_file) + if hub_proc is not None: + _stop_process(hub_proc, "hub") + if hub_log_handle is not None: + hub_log_handle.close() + if minio_proc is not None: + _stop_process(minio_proc, "minio") + + +if __name__ == "__main__": + main() diff --git a/scripts/test_scripts/block-clone-test-windows.ps1 b/scripts/test_windows/block-clone-test.ps1 index df24831a4..aa6ec3a39 100644 --- a/scripts/test_scripts/block-clone-test-windows.ps1 +++ b/scripts/test_windows/block-clone-test.ps1 @@ -7,7 +7,7 @@ # # Usage: # # From an elevated PowerShell prompt: -# .\scripts\test_scripts\block-clone-test-windows.ps1 [-TestBinary <path>] +# .\scripts\test_windows\block-clone-test.ps1 [-TestBinary <path>] # # If -TestBinary is not given, defaults to build\windows\x64\debug\zencore-test.exe # relative to the repository root. diff --git a/scripts/test_windows/service-test.ps1 b/scripts/test_windows/service-test.ps1 new file mode 100644 index 000000000..4c484c63f --- /dev/null +++ b/scripts/test_windows/service-test.ps1 @@ -0,0 +1,353 @@ +# Test zen service command lifecycle on Windows (SCM). +# +# Requires: Administrator privileges +# +# Usage: +# # From an elevated PowerShell prompt: +# .\scripts\test_windows\service-test.ps1 [-ZenBinary <path>] +# +# If -ZenBinary is not given, defaults to build\windows\x64\debug\zen.exe +# relative to the repository root. 
+ +param( + [string]$ZenBinary +) + +$ErrorActionPreference = "Stop" + +$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path +$RepoRoot = (Resolve-Path "$ScriptDir\..\..").Path + +if (-not $ZenBinary) { + $ZenBinary = Join-Path $RepoRoot "build\windows\x64\debug\zen.exe" +} +$ZenServerBinary = Join-Path (Split-Path -Parent $ZenBinary) "zenserver.exe" +$ServiceName = "ZenServerTest-$PID" + +$Script:Passed = 0 +$Script:Failed = 0 +$Script:TestsRun = 0 + +function Pass($Message) { + $Script:TestsRun++ + $Script:Passed++ + Write-Host " PASS: $Message" -ForegroundColor Green +} + +function Fail($Message, $Detail) { + $Script:TestsRun++ + $Script:Failed++ + Write-Host " FAIL: $Message" -ForegroundColor Red + if ($Detail) { + Write-Host " $Detail" + } +} + +function Cleanup { + Write-Host "" + Write-Host "--- Cleanup ---" + + # Stop the service if running + $svc = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue + if ($svc -and $svc.Status -eq "Running") { + Write-Host "Stopping test service..." + Stop-Service -Name $ServiceName -Force -ErrorAction SilentlyContinue + Start-Sleep -Seconds 2 + } + + # Delete the service if it exists + if (Get-Service -Name $ServiceName -ErrorAction SilentlyContinue) { + Write-Host "Removing test service..." 
+ sc.exe delete $ServiceName 2>$null | Out-Null + } + + Write-Host "" + Write-Host "==============================" + Write-Host " Tests run: $Script:TestsRun" + Write-Host " Passed: $Script:Passed" -ForegroundColor Green + if ($Script:Failed -gt 0) { + Write-Host " Failed: $Script:Failed" -ForegroundColor Red + } else { + Write-Host " Failed: $Script:Failed" + } + Write-Host "==============================" +} + +# ── Preflight checks ────────────────────────────────────────────── + +$IsAdmin = ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole( + [Security.Principal.WindowsBuiltInRole]::Administrator) + +if (-not $IsAdmin) { + Write-Host "Error: this test must be run from an elevated (Administrator) prompt." -ForegroundColor Red + exit 1 +} + +if (-not (Test-Path $ZenBinary)) { + Write-Host "Error: zen binary not found at '$ZenBinary'" -ForegroundColor Red + Write-Host "Build with: xmake config -m debug && xmake build zen" + exit 1 +} + +if (-not (Test-Path $ZenServerBinary)) { + Write-Host "Error: zenserver binary not found at '$ZenServerBinary'" -ForegroundColor Red + Write-Host "Build with: xmake config -m debug && xmake build zenserver" + exit 1 +} + +Write-Host "zen binary: $ZenBinary" +Write-Host "zenserver binary: $ZenServerBinary" +Write-Host "service name: $ServiceName" +Write-Host "" + +try { + +# ── Test: status before install (should fail) ───────────────────── + +Write-Host "--- Test: status before install ---" + +$Output = & $ZenBinary service status $ServiceName 2>&1 | Out-String +if ($Output -match "not installed") { + Pass "status reports 'not installed' for non-existent service" +} else { + Fail "status should report 'not installed'" "got: $Output" +} + +# ── Test: install ───────────────────────────────────────────────── + +Write-Host "--- Test: install ---" + +$Output = & $ZenBinary service install $ZenServerBinary $ServiceName --allow-elevation 2>&1 | Out-String +$ExitCode = $LASTEXITCODE + +if 
($ExitCode -eq 0) { + Pass "install exits with code 0" +} else { + Fail "install exits with code 0" "got exit code: $ExitCode, output: $Output" +} + +$svc = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue +if ($svc) { + Pass "service registered in SCM" +} else { + Fail "service registered in SCM" +} + +# Verify service configuration +if ($svc) { + $svcWmi = Get-CimInstance -ClassName Win32_Service -Filter "Name='$ServiceName'" -ErrorAction SilentlyContinue + if ($svcWmi -and $svcWmi.PathName -match "zenserver") { + Pass "service binary path contains zenserver" + } else { + Fail "service binary path contains zenserver" "got: $($svcWmi.PathName)" + } + + if ($svcWmi -and $svcWmi.StartMode -eq "Auto") { + Pass "service is set to auto-start" + } else { + Fail "service is set to auto-start" "got: $($svcWmi.StartMode)" + } +} + +# ── Test: install again (already installed, no --full) ──────────── + +Write-Host "--- Test: install again (idempotent) ---" + +$Output = & $ZenBinary service install $ZenServerBinary $ServiceName --allow-elevation 2>&1 | Out-String +$ExitCode = $LASTEXITCODE + +if ($ExitCode -eq 0) { + Pass "re-install exits with code 0" +} else { + Fail "re-install exits with code 0" "got exit code: $ExitCode" +} + +if ($Output -match "already installed") { + Pass "re-install reports service already installed" +} else { + Fail "re-install reports service already installed" "got: $Output" +} + +# ── Test: status after install (not yet started) ────────────────── + +Write-Host "--- Test: status after install (stopped) ---" + +$Output = & $ZenBinary service status $ServiceName 2>&1 | Out-String +if ($Output -match "not running") { + Pass "status reports 'not running' for stopped service" +} else { + Fail "status reports 'not running' for stopped service" "got: $Output" +} + +# ── Test: start ─────────────────────────────────────────────────── + +Write-Host "--- Test: start ---" + +$Output = & $ZenBinary service start $ServiceName --allow-elevation 
2>&1 | Out-String +$ExitCode = $LASTEXITCODE + +if ($ExitCode -eq 0) { + Pass "start exits with code 0" +} else { + Fail "start exits with code 0" "got exit code: $ExitCode, output: $Output" +} + +Start-Sleep -Seconds 2 + +$svc = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue +if ($svc -and $svc.Status -eq "Running") { + Pass "service is running after start" + + # ── Test: status while running ──────────────────────────────── + + Write-Host "--- Test: status (running) ---" + + $Output = & $ZenBinary service status $ServiceName 2>&1 | Out-String + if ($Output -match "Running") { + Pass "status reports 'Running'" + } else { + Fail "status reports 'Running'" "got: $Output" + } + + if ($Output -match "zenserver") { + Pass "status shows executable path" + } else { + Fail "status shows executable path" "got: $Output" + } + + # ── Test: start again (already running) ─────────────────────── + + Write-Host "--- Test: start again (already running) ---" + + $Output = & $ZenBinary service start $ServiceName --allow-elevation 2>&1 | Out-String + if ($Output -match "already running") { + Pass "start reports service already running" + } else { + Fail "start reports service already running" "got: $Output" + } + + # ── Test: stop ──────────────────────────────────────────────── + + Write-Host "--- Test: stop ---" + + $Output = & $ZenBinary service stop $ServiceName --allow-elevation 2>&1 | Out-String + $ExitCode = $LASTEXITCODE + + if ($ExitCode -eq 0) { + Pass "stop exits with code 0" + } else { + Fail "stop exits with code 0" "got exit code: $ExitCode, output: $Output" + } + + Start-Sleep -Seconds 2 + + $svc = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue + if ($svc -and $svc.Status -ne "Running") { + Pass "service is not running after stop" + } else { + Fail "service is not running after stop" + } +} else { + Fail "service is running after start" "(skipping start-dependent tests)" +} + +# ── Test: stop when already stopped 
─────────────────────────────── + +Write-Host "--- Test: stop when already stopped ---" + +Stop-Service -Name $ServiceName -Force -ErrorAction SilentlyContinue +Start-Sleep -Seconds 1 + +$Output = & $ZenBinary service stop $ServiceName --allow-elevation 2>&1 | Out-String +if ($Output -match "not running") { + Pass "stop reports 'not running' when already stopped" +} else { + Fail "stop reports 'not running' when already stopped" "got: $Output" +} + +# ── Test: uninstall while running (should fail) ─────────────────── + +Write-Host "--- Test: uninstall while running (should fail) ---" + +& $ZenBinary service start $ServiceName --allow-elevation 2>&1 | Out-Null +Start-Sleep -Seconds 2 + +$svc = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue +if ($svc -and $svc.Status -eq "Running") { + $Output = & $ZenBinary service uninstall $ServiceName --allow-elevation 2>&1 | Out-String + $ExitCode = $LASTEXITCODE + + if ($ExitCode -ne 0 -or $Output -match "running.*stop") { + Pass "uninstall refuses while service is running" + } else { + Fail "uninstall refuses while service is running" "got: exit=$ExitCode, output: $Output" + } + + # Stop it for the real uninstall test + & $ZenBinary service stop $ServiceName --allow-elevation 2>&1 | Out-Null + Stop-Service -Name $ServiceName -Force -ErrorAction SilentlyContinue + Start-Sleep -Seconds 2 +} else { + Write-Host " (skipped: could not start service for this test)" +} + +# ── Test: uninstall ─────────────────────────────────────────────── + +Write-Host "--- Test: uninstall ---" + +Stop-Service -Name $ServiceName -Force -ErrorAction SilentlyContinue +Start-Sleep -Seconds 1 + +$Output = & $ZenBinary service uninstall $ServiceName --allow-elevation 2>&1 | Out-String +$ExitCode = $LASTEXITCODE + +if ($ExitCode -eq 0) { + Pass "uninstall exits with code 0" +} else { + Fail "uninstall exits with code 0" "got exit code: $ExitCode, output: $Output" +} + +$svc = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue +if 
(-not $svc) { + Pass "service removed from SCM after uninstall" +} else { + Fail "service removed from SCM after uninstall" +} + +# ── Test: status after uninstall ────────────────────────────────── + +Write-Host "--- Test: status after uninstall ---" + +$Output = & $ZenBinary service status $ServiceName 2>&1 | Out-String +if ($Output -match "not installed") { + Pass "status reports 'not installed' after uninstall" +} else { + Fail "status reports 'not installed' after uninstall" "got: $Output" +} + +# ── Test: uninstall when not installed (idempotent) ─────────────── + +Write-Host "--- Test: uninstall when not installed ---" + +$Output = & $ZenBinary service uninstall $ServiceName --allow-elevation 2>&1 | Out-String +$ExitCode = $LASTEXITCODE + +if ($ExitCode -eq 0) { + Pass "uninstall of non-existent service exits with code 0" +} else { + Fail "uninstall of non-existent service exits with code 0" "got exit code: $ExitCode" +} + +if ($Output -match "not installed") { + Pass "uninstall reports service not installed" +} else { + Fail "uninstall reports service not installed" "got: $Output" +} + +} finally { + Cleanup +} + +if ($Script:Failed -gt 0) { + exit 1 +} |