diff options
| author | Dan Engelbrecht <[email protected]> | 2026-05-05 14:59:21 +0200 |
|---|---|---|
| committer | GitHub Enterprise <[email protected]> | 2026-05-05 14:59:21 +0200 |
| commit | 46f456ffd4d0717a035253ff9076ca6ee664e536 (patch) | |
| tree | 69d7a9a43b9874fd3990c43aa5ff4135c35d53d9 /scripts/test_scripts/hub/parse_perf_log.py | |
| parent | watchdog ephemeral port exhaust (#1022) (diff) | |
| download | archived-zen-46f456ffd4d0717a035253ff9076ca6ee664e536.tar.xz archived-zen-46f456ffd4d0717a035253ff9076ca6ee664e536.zip | |
hub async s3 client (#1024)
- Feature: `AsyncHttpClient` adds cancellable request tokens, streaming GET to a file (`AsyncDownload`), zero-copy chunk-callback GET (`AsyncStream`), pull-mode body source for streaming `AsyncPut`, retry layer mirroring the synchronous client, and a submit-side in-flight cap (`HttpClientSettings::MaxConcurrentRequests`) so hub-scale fanout against a single host cannot stall queued handles into curl's connect-timeout window
- Feature: Hub hydration can route S3 transfers through a non-blocking `AsyncHttpClient` (curl_multi + asio) backed by a single io thread; hydrate and dehydrate now pipeline requests instead of blocking worker threads
- `--hub-hydration-async-enabled` (Lua: `hub.hydration.async.enabled`, default true)
- `--hub-hydration-async-max-concurrent-requests` (Lua: `hub.hydration.async.maxconcurrentrequests`, default `clamp(cpu*4, 128, 512)`)
- Feature: Hub provision/deprovision/obliterate now run as two phases on separate worker pools so per-module hydration cannot starve child-process spawn/despawn (and vice versa)
- New `--hub-instance-spawn-threads` (Lua: `hub.instance.spawnthreads`, default `clamp(cpu/8, 4, 16)`) drives child-process spawn/despawn
- `--hub-instance-provision-threads` (Lua: `hub.instance.provisionthreads`) now drives per-module hydrate/dehydrate scheduling only; default changed from `max(cpu/4, 2)` to `clamp(cpu/8, 4, 12)`
- `--hub-hydration-threads` (Lua: `hub.hydration.threads`) now controls per-file workers inside a single hydrate/dehydrate; default changed from `max(cpu/4, 2)` to `clamp(cpu/8, 4, 12)`
- Feature: `AsyncHttpClient` owns its `asio::io_context` and one io thread by default; the `(BaseUri, io_context&)` constructor is preserved for callers that want to share an externally-driven `io_context` across clients (caller MUST keep the loop running until the client destructs)
- Feature: `Hub::Configuration` C++ struct fields renamed (`OptionalProvisionWorkerPool`/`OptionalHydrationWorkerPool` -> `OptionalProvisionPool`/`OptionalSpawnPool`/`OptionalHydrationPool`). Embedders constructing `Hub` directly must update field names; provision and spawn pools must both be set or both null (asserted at construction).
- Bugfix: `S3Client` signing-key cache no longer returns stale signatures after IMDS-rotated credentials change `AccessKeyId`; cache is now keyed on `(DateStamp, AccessKeyId)`
Diffstat (limited to 'scripts/test_scripts/hub/parse_perf_log.py')
| -rw-r--r-- | scripts/test_scripts/hub/parse_perf_log.py | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/scripts/test_scripts/hub/parse_perf_log.py b/scripts/test_scripts/hub/parse_perf_log.py new file mode 100644 index 000000000..8833a5211 --- /dev/null +++ b/scripts/test_scripts/hub/parse_perf_log.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Parse hub perf log to extract hydration / spawn / deprovision timings.""" +from __future__ import annotations +import re +import statistics +import sys +from pathlib import Path + +HYDRATE_RE = re.compile(r"Hydration complete module '([^']+)': (\d+) files \(([^)]+)\) in ([\d.]+)(ms|s)") +SPAWN_RE = re.compile(r"module '([^']+)' started, listening on port \d+, spawn took ([\d.]+)(ms|s)") +PROVISION_START_RE = re.compile(r"\[(\d{2}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[inf\] Provisioning storage server instance for module '([^']+)'") +SPAWN_DONE_RE = re.compile(r"\[(\d{2}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[inf\] Storage server instance for module '([^']+)' started") +HYDRATE_DONE_RE = re.compile(r"\[(\d{2}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[inf\] Hydration complete module '([^']+)'") +DEPROV_START_RE = re.compile(r"\[(\d{2}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[inf\] Module '([^']+)' changed state from provisioned to deprovisioning") +DEPROV_REMOVE_RE = re.compile(r"\[(\d{2}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\] \[inf\] Module '([^']+)' (?:removed|changed state from deprovisioning to unprovisioned)") + + +def to_ms(num: str, unit: str) -> float: + v = float(num) + return v * 1000.0 if unit == "s" else v + + +def parse_ts(ts: str) -> float: + # 26-04-29 22:41:34.009 -> seconds since arbitrary epoch (use day-of-year * 86400 + h*3600 + m*60 + s) + import datetime + dt = datetime.datetime.strptime(ts, "%y-%m-%d %H:%M:%S.%f") + return dt.timestamp() + + +def stats(name: str, vals: list[float]) -> str: + if not vals: + return f"{name:14s} n=0" + vs = sorted(vals) + n = len(vs) + p50 = vs[n // 2] + p95 = vs[min(n - 1, int(n * 0.95))] + p99 = vs[min(n - 1, int(n * 0.99))] + return (f"{name:14s} n={n:5d} mean={statistics.mean(vs):8.0f}ms " + f"p50={p50:7.0f}ms p95={p95:7.0f}ms p99={p99:7.0f}ms max={max(vs):7.0f}ms " + f"sum={sum(vs)/1000.0:7.1f}s") + + +def analyse(log_path: Path) -> None: + print(f"=== {log_path}") + hydrate_durations: list[float] = [] + spawn_durations: list[float] = [] + + prov_start: dict[str, float] = {} + hydrate_done: dict[str, float] = {} + spawn_done: dict[str, float] = {} + deprov_start: dict[str, float] = {} + deprov_done: dict[str, float] = {} + + with log_path.open("r", encoding="utf-8", errors="replace") as f: + for line in f: + m = HYDRATE_RE.search(line) + if m: + hydrate_durations.append(to_ms(m.group(4), m.group(5))) + m = SPAWN_RE.search(line) + if m: + spawn_durations.append(to_ms(m.group(2), m.group(3))) + m = PROVISION_START_RE.search(line) + if m: + prov_start[m.group(2)] = parse_ts(m.group(1)) + continue + m = HYDRATE_DONE_RE.search(line) + if m: + hydrate_done[m.group(2)] = parse_ts(m.group(1)) + continue + m = SPAWN_DONE_RE.search(line) + if m: + spawn_done[m.group(2)] = parse_ts(m.group(1)) + continue + m = DEPROV_START_RE.search(line) + if m: + deprov_start[m.group(2)] = parse_ts(m.group(1)) + continue + m = DEPROV_REMOVE_RE.search(line) + if m: + deprov_done[m.group(2)] = parse_ts(m.group(1)) + continue + + # Reconstruct end-to-end provision wall time per module: + e2e_prov_ms: list[float] = [] + queue_to_hydrate_ms: list[float] = [] # time in DataPool queue before hydrate begun (proxy: prov_start->hydrate_done minus hydrate ms) + spawn_minus_data_ms: list[float] = [] # time between hydrate_done and spawn_done (lifetime + lifetime queue) + for mid, t0 in prov_start.items(): + t_end = spawn_done.get(mid) + if t_end is None: + continue + e2e_prov_ms.append((t_end - t0) * 1000.0) + t_h = hydrate_done.get(mid) + if t_h is not None: + spawn_minus_data_ms.append((t_end - t_h) * 1000.0) + + e2e_deprov_ms: list[float] = [] + for mid, t0 in deprov_start.items(): + t_end = deprov_done.get(mid) + if t_end is None: + continue + e2e_deprov_ms.append((t_end - t0) * 1000.0) + + print(stats("hydrate_dur", hydrate_durations)) + print(stats("spawn_dur", spawn_durations)) + print(stats("e2e_prov", e2e_prov_ms)) + print(stats("after_hydrate", spawn_minus_data_ms)) + print(stats("e2e_deprov", e2e_deprov_ms)) + if hydrate_durations and spawn_durations: + h_sum = sum(hydrate_durations) / 1000.0 + s_sum = sum(spawn_durations) / 1000.0 + print(f" total hydrate work: {h_sum:.1f}s, total spawn work: {s_sum:.1f}s, ratio H/S: {h_sum/s_sum:.2f}") + print() + + +if __name__ == "__main__": + for arg in sys.argv[1:]: + analyse(Path(arg)) |