#!/usr/bin/env python3 """Parse two hub.log archives (packed vs unpacked) and produce a comparison.""" from __future__ import annotations import argparse import re import sys from pathlib import Path from statistics import mean, median _SIZE_UNITS = {"B": 1, "K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4} _TIME_UNITS = {"ns": 1e-9, "us": 1e-6, "ms": 1e-3, "s": 1.0} def parse_size(s: str) -> float: # Accept forms: "1.53K", "22.6K", "180M", "1.83G", "25" (bare bytes), "0B". m = re.fullmatch(r"(\d+(?:\.\d+)?)\s*([KMGT]?)B?", s.strip()) if not m: return 0.0 val, unit = m.group(1), m.group(2) or "B" return float(val) * _SIZE_UNITS.get(unit, 1) def parse_time_s(s: str) -> float: # Accept forms: "1s", "46ms", "712us", "0ns" m = re.fullmatch(r"(\d+(?:\.\d+)?)\s*(ns|us|ms|s)", s.strip()) if not m: return 0.0 val, unit = m.group(1), m.group(2) return float(val) * _TIME_UNITS[unit] def parse_rate_bits(s: str) -> float: # "30.7Mbits/s" -> bits/sec m = re.fullmatch(r"(\d+(?:\.\d+)?)\s*([KMGT]?)bits/s", s.strip()) if not m: return 0.0 val, unit = m.group(1), m.group(2) or "" mult = {"": 1, "K": 1e3, "M": 1e6, "G": 1e9, "T": 1e12}[unit] return float(val) * mult _RE_COMPLETE = re.compile(r"Hydration complete module '([0-9a-f-]+)': (\d+) files \(([^)]+)\) in (\S+)$") _RE_PACK = re.compile(r"Pack unpack:\s+(\d+) packs,\s+(\d+) files,\s+(\S+),\s+download (\S+),\s+unpack (\S+)") _RE_DLPHASE = re.compile(r"Download phase:\s+(\S+)\s+(\d+)/(\d+)\s+\(([^)]+)\) downloaded,\s+(\S+),\s+(\d+) threads") _RE_REQS = re.compile(r"Requests:\s+(\d+) reqs,\s+avg (\S+)/req,\s+avg (\S+)/req,\s+peak in-flight (\d+),\s+saturation (\d+)%") _RE_SCHED = re.compile(r"Scheduling:\s+1st schedule \+(\S+),\s+1st start \+(\S+),\s+queue wait (\S+)") _RE_LOADMETA = re.compile(r"Load metadata:\s+(\S+)") _RE_RENAME = re.compile(r"Rename/copy:\s+(\S+)") def parse_log(path: Path) -> list[dict]: modules: list[dict] = [] text = path.read_text(encoding="utf-8", errors="replace").splitlines() i = 0 while i < len(text): line = text[i] m = _RE_COMPLETE.search(line) if not m: i += 1 continue mod = { "mid": m.group(1), "files": int(m.group(2)), "total_bytes": parse_size(m.group(3)), "wall_s": parse_time_s(m.group(4)), } # Read the block that follows (up to ~12 lines). block = text[i : min(i + 20, len(text))] joined = "\n".join(block) pm = _RE_PACK.search(joined) if pm: mod["packs"] = int(pm.group(1)) mod["packed_files"] = int(pm.group(2)) # Uncompressed format: single size, no integrity rehash. mod["pack_bytes"] = parse_size(pm.group(3)) mod["pack_download_s"] = parse_time_s(pm.group(4)) mod["pack_unpack_s"] = parse_time_s(pm.group(5)) else: mod["packs"] = 0 mod["packed_files"] = 0 mod["pack_bytes"] = 0.0 mod["pack_download_s"] = 0.0 mod["pack_unpack_s"] = 0.0 dm = _RE_DLPHASE.search(joined) if dm: mod["dl_phase_s"] = parse_time_s(dm.group(1)) mod["dl_files"] = int(dm.group(2)) mod["dl_total_files"] = int(dm.group(3)) mod["dl_bytes"] = parse_size(dm.group(4)) mod["dl_throughput_bps"] = parse_rate_bits(dm.group(5)) mod["dl_threads"] = int(dm.group(6)) rm = _RE_REQS.search(joined) if rm: mod["reqs"] = int(rm.group(1)) mod["req_avg_s"] = parse_time_s(rm.group(2)) mod["req_avg_throughput_bps"] = parse_rate_bits(rm.group(3)) mod["req_peak_inflight"] = int(rm.group(4)) mod["req_saturation_pct"] = int(rm.group(5)) sm = _RE_SCHED.search(joined) if sm: mod["sched_1st_schedule_s"] = parse_time_s(sm.group(1)) mod["sched_1st_start_s"] = parse_time_s(sm.group(2)) mod["queue_wait_s"] = parse_time_s(sm.group(3)) lm = _RE_LOADMETA.search(joined) if lm: mod["load_metadata_s"] = parse_time_s(lm.group(1)) rcm = _RE_RENAME.search(joined) if rcm: mod["rename_copy_s"] = parse_time_s(rcm.group(1)) modules.append(mod) i += 1 return modules def fmt_bytes(n: float) -> str: for u in ("B", "KB", "MB", "GB", "TB"): if n < 1024 or u == "TB": return f"{n:.1f} {u}" n /= 1024 return f"{n:.1f} TB" def fmt_s(s: float) -> str: if s < 1e-3: return f"{s*1e6:.0f} us" if s < 1.0: return f"{s*1e3:.1f} ms" return f"{s:.2f} s" def agg(modules: list[dict]) -> dict: a: dict = {} a["count"] = len(modules) a["total_files"] = sum(m["files"] for m in modules) a["total_bytes"] = sum(m["total_bytes"] for m in modules) a["total_reqs"] = sum(m.get("reqs", 0) for m in modules) a["total_packs"] = sum(m.get("packs", 0) for m in modules) a["total_packed_files"] = sum(m.get("packed_files", 0) for m in modules) a["total_pack_bytes"] = sum(m.get("pack_bytes", 0.0) for m in modules) a["total_dl_bytes"] = sum(m.get("dl_bytes", 0.0) for m in modules) a["total_dl_phase_s"] = sum(m.get("dl_phase_s", 0.0) for m in modules) a["total_pack_download_s"] = sum(m.get("pack_download_s", 0.0) for m in modules) a["total_pack_unpack_s"] = sum(m.get("pack_unpack_s", 0.0) for m in modules) a["total_rename_s"] = sum(m.get("rename_copy_s", 0.0) for m in modules) a["total_load_meta_s"] = sum(m.get("load_metadata_s", 0.0) for m in modules) a["total_wall_s"] = sum(m["wall_s"] for m in modules) # Per-module medians and percentiles of request-level metrics req_avgs = [m["req_avg_s"] for m in modules if m.get("reqs", 0) > 0] a["req_avg_latency_median_s"] = median(req_avgs) if req_avgs else 0.0 a["req_avg_latency_mean_s"] = mean(req_avgs) if req_avgs else 0.0 a["req_avg_latency_p95_s"] = ( sorted(req_avgs)[int(0.95 * len(req_avgs))] if req_avgs else 0.0 ) queue_waits = [m["queue_wait_s"] for m in modules if "queue_wait_s" in m] a["queue_wait_median_s"] = median(queue_waits) if queue_waits else 0.0 a["queue_wait_mean_s"] = mean(queue_waits) if queue_waits else 0.0 a["queue_wait_p95_s"] = ( sorted(queue_waits)[int(0.95 * len(queue_waits))] if queue_waits else 0.0 ) a["queue_wait_max_s"] = max(queue_waits) if queue_waits else 0.0 first_starts = [m["sched_1st_start_s"] for m in modules if "sched_1st_start_s" in m] a["first_start_median_s"] = median(first_starts) if first_starts else 0.0 a["first_start_p95_s"] = ( sorted(first_starts)[int(0.95 * len(first_starts))] if first_starts else 0.0 ) wall_times = [m["wall_s"] for m in modules] a["wall_median_s"] = median(wall_times) if wall_times else 0.0 a["wall_mean_s"] = mean(wall_times) if wall_times else 0.0 a["wall_p95_s"] = sorted(wall_times)[int(0.95 * len(wall_times))] if wall_times else 0.0 a["wall_max_s"] = max(wall_times) if wall_times else 0.0 saturations = [m["req_saturation_pct"] for m in modules if "req_saturation_pct" in m] a["saturation_median_pct"] = median(saturations) if saturations else 0 a["saturation_mean_pct"] = mean(saturations) if saturations else 0 peak_inflights = [m["req_peak_inflight"] for m in modules if "req_peak_inflight" in m] a["peak_inflight_median"] = median(peak_inflights) if peak_inflights else 0 a["peak_inflight_max"] = max(peak_inflights) if peak_inflights else 0 return a def print_row(label: str, packed_val: str, unpacked_val: str, delta: str = ""): print(f" {label:<42} {packed_val:>14} {unpacked_val:>14} {delta}") def delta_pct(packed: float, unpacked: float) -> str: if unpacked == 0: return "" d = packed - unpacked pct = (d / unpacked) * 100.0 sign = "+" if d >= 0 else "" return f"{sign}{pct:.1f}%" def main() -> int: p = argparse.ArgumentParser() p.add_argument("--packed-log", required=True) p.add_argument("--unpacked-log", required=True) args = p.parse_args() pm = parse_log(Path(args.packed_log)) um = parse_log(Path(args.unpacked_log)) pa = agg(pm) ua = agg(um) print(f"\nParsed packed: {pa['count']} modules from {args.packed_log}") print(f"Parsed unpacked: {ua['count']} modules from {args.unpacked_log}\n") print(f" {'metric':<42} {'packed':>14} {'unpacked':>14} delta(p vs u)") print(f" {'-'*42} {'-'*14} {'-'*14} {'-'*12}") print("\n-- workload --") print_row("modules", f"{pa['count']}", f"{ua['count']}") print_row("total files (entries)", f"{pa['total_files']}", f"{ua['total_files']}") print_row("total raw data", fmt_bytes(pa["total_bytes"]), fmt_bytes(ua["total_bytes"])) print("\n-- S3 requests --") print_row("total S3 GETs", f"{pa['total_reqs']}", f"{ua['total_reqs']}", delta_pct(pa["total_reqs"], ua["total_reqs"])) print_row("GETs per module (mean)", f"{pa['total_reqs']/pa['count']:.1f}", f"{ua['total_reqs']/ua['count']:.1f}", delta_pct(pa['total_reqs']/pa['count'], ua['total_reqs']/ua['count'])) print_row("requests saved by packing", f"{ua['total_reqs'] - pa['total_reqs']}", "-", f"{((ua['total_reqs'] - pa['total_reqs']) / ua['total_reqs']) * 100:.1f}%") print("\n-- payload --") print_row("total bytes downloaded", fmt_bytes(pa["total_dl_bytes"]), fmt_bytes(ua["total_dl_bytes"]), delta_pct(pa["total_dl_bytes"], ua["total_dl_bytes"])) print_row(" pack bytes (aggregate)", fmt_bytes(pa["total_pack_bytes"]), "-") diff = pa["total_dl_bytes"] - ua["total_dl_bytes"] direction = "saved by packing" if diff < 0 else "added by packing" print_row(f"bytes {direction}", fmt_bytes(abs(diff)), "-") print("\n-- per-request latency (per-module avg) --") print_row("median req avg latency", fmt_s(pa["req_avg_latency_median_s"]), fmt_s(ua["req_avg_latency_median_s"]), delta_pct(pa["req_avg_latency_median_s"], ua["req_avg_latency_median_s"])) print_row("mean req avg latency", fmt_s(pa["req_avg_latency_mean_s"]), fmt_s(ua["req_avg_latency_mean_s"]), delta_pct(pa["req_avg_latency_mean_s"], ua["req_avg_latency_mean_s"])) print_row("p95 req avg latency", fmt_s(pa["req_avg_latency_p95_s"]), fmt_s(ua["req_avg_latency_p95_s"]), delta_pct(pa["req_avg_latency_p95_s"], ua["req_avg_latency_p95_s"])) print("\n-- queue / scheduling --") print_row("median queue wait (per module)", fmt_s(pa["queue_wait_median_s"]), fmt_s(ua["queue_wait_median_s"]), delta_pct(pa["queue_wait_median_s"], ua["queue_wait_median_s"])) print_row("mean queue wait", fmt_s(pa["queue_wait_mean_s"]), fmt_s(ua["queue_wait_mean_s"]), delta_pct(pa["queue_wait_mean_s"], ua["queue_wait_mean_s"])) print_row("p95 queue wait", fmt_s(pa["queue_wait_p95_s"]), fmt_s(ua["queue_wait_p95_s"]), delta_pct(pa["queue_wait_p95_s"], ua["queue_wait_p95_s"])) print_row("max queue wait", fmt_s(pa["queue_wait_max_s"]), fmt_s(ua["queue_wait_max_s"]), delta_pct(pa["queue_wait_max_s"], ua["queue_wait_max_s"])) print_row("median 1st-start delay", fmt_s(pa["first_start_median_s"]), fmt_s(ua["first_start_median_s"]), delta_pct(pa["first_start_median_s"], ua["first_start_median_s"])) print("\n-- per-module wall time (hydration block) --") print_row("median", fmt_s(pa["wall_median_s"]), fmt_s(ua["wall_median_s"]), delta_pct(pa["wall_median_s"], ua["wall_median_s"])) print_row("mean", fmt_s(pa["wall_mean_s"]), fmt_s(ua["wall_mean_s"]), delta_pct(pa["wall_mean_s"], ua["wall_mean_s"])) print_row("p95", fmt_s(pa["wall_p95_s"]), fmt_s(ua["wall_p95_s"]), delta_pct(pa["wall_p95_s"], ua["wall_p95_s"])) print_row("max", fmt_s(pa["wall_max_s"]), fmt_s(ua["wall_max_s"]), delta_pct(pa["wall_max_s"], ua["wall_max_s"])) print("\n-- thread-pool saturation / concurrency --") print_row("saturation median %", f"{pa['saturation_median_pct']}", f"{ua['saturation_median_pct']}") print_row("saturation mean %", f"{pa['saturation_mean_pct']:.1f}", f"{ua['saturation_mean_pct']:.1f}") print_row("peak in-flight median", f"{pa['peak_inflight_median']}", f"{ua['peak_inflight_median']}") print_row("peak in-flight max", f"{pa['peak_inflight_max']}", f"{ua['peak_inflight_max']}") print("\n-- pack-specific costs (packed run only) --") print(f" pack GETs added: {pa['total_packs']} ({pa['total_packs']/pa['count']:.2f}/module)") print(f" small files folded: {pa['total_packed_files']} ({pa['total_packed_files']/pa['count']:.1f}/module)") print(f" pack download time total: {pa['total_pack_download_s']:.1f} s") print(f" pack unpack total: {pa['total_pack_unpack_s']:.2f} s") print("\n-- per-phase totals (sum across modules, wall-clock contribution) --") print_row("load metadata total", fmt_s(pa["total_load_meta_s"]), fmt_s(ua["total_load_meta_s"]), delta_pct(pa["total_load_meta_s"], ua["total_load_meta_s"])) print_row("download phase total", fmt_s(pa["total_dl_phase_s"]), fmt_s(ua["total_dl_phase_s"]), delta_pct(pa["total_dl_phase_s"], ua["total_dl_phase_s"])) print_row("rename/copy total", fmt_s(pa["total_rename_s"]), fmt_s(ua["total_rename_s"]), delta_pct(pa["total_rename_s"], ua["total_rename_s"])) print_row("hydration wall sum", fmt_s(pa["total_wall_s"]), fmt_s(ua["total_wall_s"]), delta_pct(pa["total_wall_s"], ua["total_wall_s"])) return 0 if __name__ == "__main__": sys.exit(main())