aboutsummaryrefslogtreecommitdiff
path: root/src/zencompute/cloudmetadata.cpp
diff options
context:
space:
mode:
author Liam Mitchell <[email protected]> 2026-03-09 19:06:36 -0700
committer Liam Mitchell <[email protected]> 2026-03-09 19:06:36 -0700
commit d1abc50ee9d4fb72efc646e17decafea741caa34 (patch)
tree e4288e00f2f7ca0391b83d986efcb69d3ba66a83 /src/zencompute/cloudmetadata.cpp
parent Allow requests with invalid content-types unless specified in command line or... (diff)
parent updated chunk–block analyser (#818) (diff)
download zen-d1abc50ee9d4fb72efc646e17decafea741caa34.tar.xz
zen-d1abc50ee9d4fb72efc646e17decafea741caa34.zip
Merge branch 'main' into lm/restrict-content-type
Diffstat (limited to 'src/zencompute/cloudmetadata.cpp')
-rw-r--r--src/zencompute/cloudmetadata.cpp1014
1 files changed, 1014 insertions, 0 deletions
diff --git a/src/zencompute/cloudmetadata.cpp b/src/zencompute/cloudmetadata.cpp
new file mode 100644
index 000000000..65bac895f
--- /dev/null
+++ b/src/zencompute/cloudmetadata.cpp
@@ -0,0 +1,1014 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#include <zencompute/cloudmetadata.h>
+
+#include <zencore/basicfile.h>
+#include <zencore/filesystem.h>
+#include <zencore/string.h>
+#include <zencore/trace.h>
+#include <zenhttp/httpclient.h>
+
+ZEN_THIRD_PARTY_INCLUDES_START
+#include <json11.hpp>
+ZEN_THIRD_PARTY_INCLUDES_END
+
+namespace zen::compute {
+
+// Link-local address at which AWS, Azure and GCP all serve instance metadata.
+// The address is only routable from inside a cloud VM, so a failed TCP connect
+// is the signal used to tell bare-metal hosts apart from cloud instances.
+static constexpr std::string_view kImdsEndpoint{"http://169.254.169.254"};
+
+// Connect timeout for IMDS probes. Kept short so that hosts without a metadata
+// service finish detection quickly; the service lives on the hypervisor, so
+// 200ms leaves plenty of headroom on real cloud VMs.
+static constexpr std::chrono::milliseconds kImdsTimeout{200};
+
+// Maps a CloudProvider value to its display name. Any value without a
+// dedicated name (including CloudProvider::None) yields "None".
+std::string_view
+ToString(CloudProvider Provider)
+{
+    if (Provider == CloudProvider::AWS)
+    {
+        return "AWS";
+    }
+
+    if (Provider == CloudProvider::Azure)
+    {
+        return "Azure";
+    }
+
+    if (Provider == CloudProvider::GCP)
+    {
+        return "GCP";
+    }
+
+    return "None";
+}
+
+// Convenience constructor: delegates to the two-argument constructor using the
+// default link-local IMDS endpoint.
+CloudMetadata::CloudMetadata(std::filesystem::path DataDir)
+: CloudMetadata(std::move(DataDir), std::string(kImdsEndpoint))
+{
+}
+
+// Creates the data directory (where the negative-detection sentinel files are
+// stored), runs provider detection against ImdsEndpoint and, when a provider
+// was detected, starts the background termination monitor.
+CloudMetadata::CloudMetadata(std::filesystem::path DataDir, std::string ImdsEndpoint)
+: m_Log(logging::Get("cloud"))
+, m_DataDir(std::move(DataDir))
+, m_ImdsEndpoint(std::move(ImdsEndpoint))
+{
+    ZEN_TRACE_CPU("CloudMetadata::CloudMetadata");
+
+    std::error_code Ec;
+    std::filesystem::create_directories(m_DataDir, Ec);
+    if (Ec)
+    {
+        // Not fatal: detection itself does not need the directory, but sentinel
+        // files cannot be persisted without it, so make the failure visible.
+        ZEN_WARN("failed to create cloud metadata directory '{}': {}", m_DataDir.string(), Ec.message());
+    }
+
+    DetectProvider();
+
+    if (m_Info.Provider != CloudProvider::None)
+    {
+        StartTerminationMonitor();
+    }
+}
+
+// Shuts down the termination monitor: clears the run flag, signals the event
+// the monitor thread waits on, and joins the thread if one was started.
+CloudMetadata::~CloudMetadata()
+{
+    ZEN_TRACE_CPU("CloudMetadata::~CloudMetadata");
+
+    m_MonitorEnabled = false;
+    m_MonitorEvent.Set();
+
+    if (!m_MonitorThread.joinable())
+    {
+        return;
+    }
+
+    m_MonitorThread.join();
+}
+
+// Returns the detected provider, read under the shared info lock.
+CloudProvider
+CloudMetadata::GetProvider() const
+{
+    return m_InfoLock.WithSharedLock([&]() -> CloudProvider { return m_Info.Provider; });
+}
+
+// Returns a copy of the instance info, taken under the shared info lock.
+CloudInstanceInfo
+CloudMetadata::GetInstanceInfo() const
+{
+    return m_InfoLock.WithSharedLock([&]() -> CloudInstanceInfo { return m_Info; });
+}
+
+// Returns the current value of the termination-pending flag (relaxed load).
+bool
+CloudMetadata::IsTerminationPending() const
+{
+    const bool Pending = m_TerminationPending.load(std::memory_order_relaxed);
+    return Pending;
+}
+
+// Returns a copy of the termination reason, read under the shared reason lock.
+std::string
+CloudMetadata::GetTerminationReason() const
+{
+    return m_ReasonLock.WithSharedLock([&]() -> std::string { return m_TerminationReason; });
+}
+
+// Writes a "cloud" object describing the detected instance into Writer.
+// Nothing is written when no cloud provider was detected. The
+// "termination_reason" field is only emitted while termination is pending.
+void
+CloudMetadata::Describe(CbWriter& Writer) const
+{
+    ZEN_TRACE_CPU("CloudMetadata::Describe");
+    CloudInstanceInfo Info = GetInstanceInfo();
+
+    if (Info.Provider == CloudProvider::None)
+    {
+        return;
+    }
+
+    // Sample the flag exactly once so that "termination_pending" and the
+    // presence of "termination_reason" always agree, even if the flag flips
+    // while the object is being written.
+    bool TerminationPending = IsTerminationPending();
+
+    Writer.BeginObject("cloud");
+    Writer << "provider" << ToString(Info.Provider);
+    Writer << "instance_id" << Info.InstanceId;
+    Writer << "availability_zone" << Info.AvailabilityZone;
+    Writer << "is_spot" << Info.IsSpot;
+    Writer << "is_autoscaling" << Info.IsAutoscaling;
+    Writer << "termination_pending" << TerminationPending;
+
+    if (TerminationPending)
+    {
+        Writer << "termination_reason" << GetTerminationReason();
+    }
+
+    Writer.EndObject();
+}
+
+// Probes the providers in a fixed order (AWS, Azure, GCP) and stops at the
+// first one that reports success.
+void
+CloudMetadata::DetectProvider()
+{
+    ZEN_TRACE_CPU("CloudMetadata::DetectProvider");
+
+    // Short-circuit evaluation keeps the probe order and skips later probes
+    // once a provider has been detected.
+    const bool Detected = TryDetectAWS() || TryDetectAzure() || TryDetectGCP();
+
+    if (!Detected)
+    {
+        ZEN_DEBUG("no cloud provider detected");
+    }
+}
+
+// AWS detection speaks IMDSv2: a session token has to be obtained with a PUT
+// before any metadata GET is accepted. The token is requested with a
+// 300-second TTL and kept in m_AwsToken so termination polling can reuse it.
+//
+// Returns true when the token request succeeded (the instance is then treated
+// as AWS even if individual metadata fields could not be read). On a failed
+// token request or an exception the ".isNotAWS" sentinel is written and false
+// is returned.
+bool
+CloudMetadata::TryDetectAWS()
+{
+    ZEN_TRACE_CPU("CloudMetadata::TryDetectAWS");
+
+    const std::filesystem::path NegativeCache = m_DataDir / ".isNotAWS";
+
+    if (HasSentinelFile(NegativeCache))
+    {
+        ZEN_DEBUG("skipping AWS detection - negative cache hit");
+        return false;
+    }
+
+    ZEN_DEBUG("probing AWS IMDS");
+
+    try
+    {
+        HttpClient Client(m_ImdsEndpoint,
+                          {.LogCategory = "cloud-aws", .ConnectTimeout = kImdsTimeout, .Timeout = std::chrono::milliseconds{1000}});
+
+        // Step 1: session token. The TTL header is mandatory for the token endpoint.
+        HttpClient::KeyValueMap TtlHeaders(std::pair<std::string_view, std::string_view>{"X-aws-ec2-metadata-token-ttl-seconds", "300"});
+        HttpClient::Response    TokenReply = Client.Put("/latest/api/token", IoBuffer{}, TtlHeaders);
+
+        if (!TokenReply.IsSuccess())
+        {
+            ZEN_DEBUG("AWS IMDS token request failed ({}), not on AWS", static_cast<int>(TokenReply.StatusCode));
+            WriteSentinelFile(NegativeCache);
+            return false;
+        }
+
+        m_AwsToken = std::string(TokenReply.AsText());
+
+        // Step 2: authenticated metadata reads. Each field is optional; a failed
+        // read simply leaves the corresponding m_Info member untouched.
+        HttpClient::KeyValueMap TokenHeaders(std::pair<std::string_view, std::string_view>{"X-aws-ec2-metadata-token", m_AwsToken});
+
+        if (HttpClient::Response Reply = Client.Get("/latest/meta-data/instance-id", TokenHeaders); Reply.IsSuccess())
+        {
+            m_Info.InstanceId = std::string(Reply.AsText());
+        }
+
+        if (HttpClient::Response Reply = Client.Get("/latest/meta-data/placement/availability-zone", TokenHeaders); Reply.IsSuccess())
+        {
+            m_Info.AvailabilityZone = std::string(Reply.AsText());
+        }
+
+        // "spot" vs "on-demand": spot instances can be reclaimed by AWS with a
+        // 2-minute warning
+        if (HttpClient::Response Reply = Client.Get("/latest/meta-data/instance-life-cycle", TokenHeaders); Reply.IsSuccess())
+        {
+            m_Info.IsSpot = (Reply.AsText() == "spot");
+        }
+
+        // The endpoint only exists for instances managed by an Auto Scaling
+        // Group, so any successful response (whatever its value) means autoscaling.
+        if (HttpClient::Response Reply = Client.Get("/latest/meta-data/autoscaling/target-lifecycle-state", TokenHeaders);
+            Reply.IsSuccess())
+        {
+            m_Info.IsAutoscaling = true;
+        }
+
+        m_Info.Provider = CloudProvider::AWS;
+
+        ZEN_INFO("detected AWS instance: id={}, az={}, spot={}, autoscaling={}",
+                 m_Info.InstanceId,
+                 m_Info.AvailabilityZone,
+                 m_Info.IsSpot,
+                 m_Info.IsAutoscaling);
+
+        return true;
+    }
+    catch (const std::exception& Ex)
+    {
+        ZEN_DEBUG("AWS IMDS probe failed: {}", Ex.what());
+        WriteSentinelFile(NegativeCache);
+        return false;
+    }
+}
+
+// Azure IMDS serves the whole instance description as one JSON document
+// (AWS and GCP use one plain-text endpoint per field). Requests must carry the
+// "Metadata: true" header, and the mandatory api-version query parameter pins
+// the response schema.
+//
+// Returns true when the document was fetched and parsed. On a failed request,
+// invalid JSON or an exception the ".isNotAzure" sentinel is written and false
+// is returned.
+bool
+CloudMetadata::TryDetectAzure()
+{
+    ZEN_TRACE_CPU("CloudMetadata::TryDetectAzure");
+
+    const std::filesystem::path NegativeCache = m_DataDir / ".isNotAzure";
+
+    if (HasSentinelFile(NegativeCache))
+    {
+        ZEN_DEBUG("skipping Azure detection - negative cache hit");
+        return false;
+    }
+
+    ZEN_DEBUG("probing Azure IMDS");
+
+    try
+    {
+        HttpClient Client(m_ImdsEndpoint,
+                          {.LogCategory = "cloud-azure", .ConnectTimeout = kImdsTimeout, .Timeout = std::chrono::milliseconds{1000}});
+
+        HttpClient::KeyValueMap RequiredHeaders({
+            std::pair<std::string_view, std::string_view>{"Metadata", "true"},
+        });
+
+        HttpClient::Response Reply = Client.Get("/metadata/instance?api-version=2021-02-01", RequiredHeaders);
+
+        if (!Reply.IsSuccess())
+        {
+            ZEN_DEBUG("Azure IMDS request failed ({}), not on Azure", static_cast<int>(Reply.StatusCode));
+            WriteSentinelFile(NegativeCache);
+            return false;
+        }
+
+        std::string        ParseError;
+        const json11::Json Document = json11::Json::parse(std::string(Reply.AsText()), ParseError);
+
+        if (!ParseError.empty())
+        {
+            ZEN_DEBUG("Azure IMDS returned invalid JSON: {}", ParseError);
+            WriteSentinelFile(NegativeCache);
+            return false;
+        }
+
+        const json11::Json& Compute = Document["compute"];
+
+        m_Info.InstanceId       = Compute["vmId"].string_value();
+        m_Info.AvailabilityZone = Compute["location"].string_value();
+
+        // Spot VMs report priority "Spot"; regular VMs report "Regular"
+        m_Info.IsSpot = (Compute["priority"].string_value() == "Spot");
+
+        // A non-empty scale set name means the VM belongs to a VMSS (Virtual
+        // Machine Scale Set), which is treated as autoscaling
+        m_Info.IsAutoscaling = !Compute["vmScaleSetName"].string_value().empty();
+
+        m_Info.Provider = CloudProvider::Azure;
+
+        ZEN_INFO("detected Azure instance: id={}, location={}, spot={}, vmss={}",
+                 m_Info.InstanceId,
+                 m_Info.AvailabilityZone,
+                 m_Info.IsSpot,
+                 m_Info.IsAutoscaling);
+
+        return true;
+    }
+    catch (const std::exception& Ex)
+    {
+        ZEN_DEBUG("Azure IMDS probe failed: {}", Ex.what());
+        WriteSentinelFile(NegativeCache);
+        return false;
+    }
+}
+
+// GCP needs the "Metadata-Flavor: Google" header on every metadata request.
+// There is no session token as on AWS; the header itself is the guard (it
+// blocks SSRF, since browsers will not send custom headers to the metadata
+// endpoint). Every field lives at its own URL.
+//
+// Returns true when the instance id could be fetched. On a failed id request
+// or an exception the ".isNotGCP" sentinel is written and false is returned.
+// A maintenance event that is already active marks termination as pending
+// right away.
+bool
+CloudMetadata::TryDetectGCP()
+{
+    ZEN_TRACE_CPU("CloudMetadata::TryDetectGCP");
+
+    const std::filesystem::path NegativeCache = m_DataDir / ".isNotGCP";
+
+    if (HasSentinelFile(NegativeCache))
+    {
+        ZEN_DEBUG("skipping GCP detection - negative cache hit");
+        return false;
+    }
+
+    ZEN_DEBUG("probing GCP metadata service");
+
+    try
+    {
+        HttpClient Client(m_ImdsEndpoint,
+                          {.LogCategory = "cloud-gcp", .ConnectTimeout = kImdsTimeout, .Timeout = std::chrono::milliseconds{1000}});
+
+        HttpClient::KeyValueMap FlavorHeaders(std::pair<std::string_view, std::string_view>{"Metadata-Flavor", "Google"});
+
+        // The instance id doubles as the detection probe
+        HttpClient::Response IdReply = Client.Get("/computeMetadata/v1/instance/id", FlavorHeaders);
+
+        if (!IdReply.IsSuccess())
+        {
+            ZEN_DEBUG("GCP metadata request failed ({}), not on GCP", static_cast<int>(IdReply.StatusCode));
+            WriteSentinelFile(NegativeCache);
+            return false;
+        }
+
+        m_Info.InstanceId = std::string(IdReply.AsText());
+
+        // The zone arrives fully qualified as "projects/<num>/zones/<zone>";
+        // keep only the part after the last slash (e.g. "us-central1-a").
+        if (HttpClient::Response ZoneReply = Client.Get("/computeMetadata/v1/instance/zone", FlavorHeaders); ZoneReply.IsSuccess())
+        {
+            std::string_view ZoneName = ZoneReply.AsText();
+
+            const std::string_view::size_type LastSlash = ZoneName.find_last_of('/');
+            if (LastSlash != std::string_view::npos)
+            {
+                ZoneName.remove_prefix(LastSlash + 1);
+            }
+
+            m_Info.AvailabilityZone = std::string(ZoneName);
+        }
+
+        // scheduling/preemptible answers "TRUE" or "FALSE"
+        if (HttpClient::Response PreemptReply = Client.Get("/computeMetadata/v1/instance/scheduling/preemptible", FlavorHeaders);
+            PreemptReply.IsSuccess())
+        {
+            m_Info.IsSpot = (PreemptReply.AsText() == "TRUE");
+        }
+
+        // An already-active maintenance event is reported as pending termination
+        if (HttpClient::Response EventReply = Client.Get("/computeMetadata/v1/instance/maintenance-event", FlavorHeaders);
+            EventReply.IsSuccess())
+        {
+            const std::string_view ActiveEvent = EventReply.AsText();
+            const bool             HasEvent    = !ActiveEvent.empty() && ActiveEvent != "NONE";
+
+            if (HasEvent)
+            {
+                m_TerminationPending = true;
+                m_ReasonLock.WithExclusiveLock([&] { m_TerminationReason = fmt::format("GCP maintenance event: {}", ActiveEvent); });
+            }
+        }
+
+        m_Info.Provider = CloudProvider::GCP;
+
+        ZEN_INFO("detected GCP instance: id={}, az={}, spot={}", m_Info.InstanceId, m_Info.AvailabilityZone, m_Info.IsSpot);
+
+        return true;
+    }
+    catch (const std::exception& Ex)
+    {
+        ZEN_DEBUG("GCP metadata probe failed: {}", Ex.what());
+        WriteSentinelFile(NegativeCache);
+        return false;
+    }
+}
+
+// A sentinel is an empty marker file: its existence alone records that an
+// earlier detection attempt for one provider failed, so later startups can
+// skip the probe (and its connect timeout) for providers known to be absent.
+// Sentinels survive process restarts; delete them by hand (or remove the
+// DataDir) to force re-detection.
+//
+// Failure to create the file is logged as a warning and otherwise ignored.
+void
+CloudMetadata::WriteSentinelFile(const std::filesystem::path& Path)
+{
+    try
+    {
+        // Opening in truncate mode is all that is needed; the file stays empty
+        BasicFile Marker;
+        Marker.Open(Path, BasicFile::Mode::kTruncate);
+    }
+    catch (const std::exception& Ex)
+    {
+        ZEN_WARN("failed to write sentinel file '{}': {}", Path.string(), Ex.what());
+    }
+}
+
+// Returns whether a sentinel file exists at Path.
+bool
+CloudMetadata::HasSentinelFile(const std::filesystem::path& Path) const
+{
+    const bool Exists = zen::IsFile(Path);
+    return Exists;
+}
+
+// Removes the sentinel files of all three providers from the data directory.
+// Errors (including a missing file) are ignored.
+void
+CloudMetadata::ClearSentinelFiles()
+{
+    for (const char* SentinelName : {".isNotAWS", ".isNotAzure", ".isNotGCP"})
+    {
+        std::error_code Ignored;
+        std::filesystem::remove(m_DataDir / SentinelName, Ignored);
+    }
+}
+
+// Logs the detected instance and launches the thread that runs
+// TerminationMonitorThread().
+void
+CloudMetadata::StartTerminationMonitor()
+{
+    ZEN_INFO("starting cloud termination monitor for {} instance {}", ToString(m_Info.Provider), m_Info.InstanceId);
+
+    m_MonitorThread = std::thread([this] { TerminationMonitorThread(); });
+}
+
+// Body of the monitor thread: polls for termination notices until the
+// monitor is disabled.
+void
+CloudMetadata::TerminationMonitorThread()
+{
+    SetCurrentThreadName("cloud_term_mon");
+
+    // One poll every 5 seconds. Waiting on the event instead of sleeping lets
+    // the destructor interrupt the wait for a prompt, clean shutdown.
+    while (m_MonitorEnabled)
+    {
+        m_MonitorEvent.Wait(5000);
+        m_MonitorEvent.Reset();
+
+        // Woken up for shutdown rather than by the timeout: skip the poll
+        if (!m_MonitorEnabled)
+        {
+            break;
+        }
+
+        PollTermination();
+    }
+}
+
+// Runs one termination poll for the detected provider. Exceptions raised by
+// the provider-specific poll are logged at debug level and swallowed.
+void
+CloudMetadata::PollTermination()
+{
+    try
+    {
+        switch (m_InfoLock.WithSharedLock([&] { return m_Info.Provider; }))
+        {
+            case CloudProvider::AWS:
+                PollAWSTermination();
+                break;
+            case CloudProvider::Azure:
+                PollAzureTermination();
+                break;
+            case CloudProvider::GCP:
+                PollGCPTermination();
+                break;
+            default:
+                // No provider detected: nothing to poll
+                break;
+        }
+    }
+    catch (const std::exception& Ex)
+    {
+        ZEN_DEBUG("termination poll error: {}", Ex.what());
+    }
+}
+
+// AWS termination signals:
+//   - /spot/instance-action: returns 200 with a JSON body ~2 minutes before
+//     a spot instance is reclaimed. Returns 404 when no action is pending.
+//   - /autoscaling/target-lifecycle-state: returns the ASG lifecycle state.
+//     "InService" is normal; anything else (e.g. "Terminated:Wait") means
+//     the instance is being cycled out.
+//
+// The IMDSv2 session token obtained during detection has a 300-second TTL.
+// Once it expires IMDS answers 401, so on a 401 a fresh token is requested
+// and the poll is retried once; without this, termination detection would
+// silently stop working a few minutes after startup.
+//
+// NOTE(review): m_AwsToken is rewritten here without a lock. That is safe as
+// long as polls are not issued concurrently from several threads - confirm.
+void
+CloudMetadata::PollAWSTermination()
+{
+    ZEN_TRACE_CPU("CloudMetadata::PollAWSTermination");
+
+    HttpClient ImdsClient(m_ImdsEndpoint,
+                          {.LogCategory = "cloud-aws", .ConnectTimeout = kImdsTimeout, .Timeout = std::chrono::milliseconds{2000}});
+
+    // Publishes the termination state at most once. The reason is stored
+    // before the flag is raised (both under the reason lock) so a reader that
+    // observes the flag never sees an empty reason. Returns true when this
+    // call was the one that published.
+    auto PublishTermination = [&](std::string Reason) -> bool {
+        bool Published = false;
+        m_ReasonLock.WithExclusiveLock([&] {
+            if (!m_TerminationPending.load())
+            {
+                m_TerminationReason = std::move(Reason);
+                m_TerminationPending.store(true);
+                Published = true;
+            }
+        });
+        return Published;
+    };
+
+    constexpr int kHttpUnauthorized = 401;
+
+    // At most two passes: the second one only happens after a token refresh.
+    for (int Attempt = 0; Attempt < 2; ++Attempt)
+    {
+        HttpClient::KeyValueMap AuthHeaders(std::pair<std::string_view, std::string_view>{"X-aws-ec2-metadata-token", m_AwsToken});
+
+        HttpClient::Response SpotResponse = ImdsClient.Get("/latest/meta-data/spot/instance-action", AuthHeaders);
+
+        if (Attempt == 0 && static_cast<int>(SpotResponse.StatusCode) == kHttpUnauthorized)
+        {
+            HttpClient::KeyValueMap TokenHeaders(
+                std::pair<std::string_view, std::string_view>{"X-aws-ec2-metadata-token-ttl-seconds", "300"});
+            HttpClient::Response TokenResponse = ImdsClient.Put("/latest/api/token", IoBuffer{}, TokenHeaders);
+
+            if (!TokenResponse.IsSuccess())
+            {
+                ZEN_DEBUG("AWS IMDS token refresh failed ({})", static_cast<int>(TokenResponse.StatusCode));
+                return;
+            }
+
+            m_AwsToken = std::string(TokenResponse.AsText());
+            continue;
+        }
+
+        if (SpotResponse.IsSuccess())
+        {
+            if (PublishTermination(fmt::format("AWS spot interruption: {}", SpotResponse.AsText())))
+            {
+                ZEN_WARN("AWS spot interruption detected: {}", SpotResponse.AsText());
+            }
+            return;
+        }
+
+        HttpClient::Response AutoscaleResponse = ImdsClient.Get("/latest/meta-data/autoscaling/target-lifecycle-state", AuthHeaders);
+        if (AutoscaleResponse.IsSuccess())
+        {
+            std::string_view State = AutoscaleResponse.AsText();
+            if (State.find("InService") == std::string_view::npos)
+            {
+                if (PublishTermination(fmt::format("AWS autoscaling lifecycle: {}", State)))
+                {
+                    ZEN_WARN("AWS autoscaling termination detected: {}", State);
+                }
+            }
+        }
+
+        return;
+    }
+}
+
+// Azure Scheduled Events API returns a JSON array of upcoming platform events.
+// We care about "Preempt" (spot eviction), "Terminate", and "Reboot" events.
+// Other event types like "Freeze" (live migration) are non-destructive and
+// ignored. The Events array is empty when nothing is pending.
+//
+// A failed request or unparsable JSON is treated as "nothing pending". Only
+// the first matching event is reported.
+void
+CloudMetadata::PollAzureTermination()
+{
+    ZEN_TRACE_CPU("CloudMetadata::PollAzureTermination");
+
+    HttpClient ImdsClient(m_ImdsEndpoint,
+                          {.LogCategory = "cloud-azure", .ConnectTimeout = kImdsTimeout, .Timeout = std::chrono::milliseconds{2000}});
+
+    HttpClient::KeyValueMap MetadataHeaders({
+        std::pair<std::string_view, std::string_view>{"Metadata", "true"},
+    });
+
+    HttpClient::Response EventsResponse = ImdsClient.Get("/metadata/scheduledevents?api-version=2020-07-01", MetadataHeaders);
+
+    if (!EventsResponse.IsSuccess())
+    {
+        return;
+    }
+
+    std::string        JsonError;
+    const json11::Json Json = json11::Json::parse(std::string(EventsResponse.AsText()), JsonError);
+
+    if (!JsonError.empty())
+    {
+        return;
+    }
+
+    for (const json11::Json& Evt : Json["Events"].array_items())
+    {
+        std::string EventType = Evt["EventType"].string_value();
+        if (EventType != "Preempt" && EventType != "Terminate" && EventType != "Reboot")
+        {
+            continue;
+        }
+
+        std::string EventStatus = Evt["EventStatus"].string_value();
+
+        // Store the reason before raising the flag (both under the reason
+        // lock) so a reader that observes the flag never sees an empty reason.
+        bool Published = false;
+        m_ReasonLock.WithExclusiveLock([&] {
+            if (!m_TerminationPending.load())
+            {
+                m_TerminationReason = fmt::format("Azure scheduled event: {} ({})", EventType, EventStatus);
+                m_TerminationPending.store(true);
+                Published = true;
+            }
+        });
+
+        if (Published)
+        {
+            ZEN_WARN("Azure termination event detected: {} ({})", EventType, EventStatus);
+        }
+        return;
+    }
+}
+
+// GCP maintenance-event returns "NONE" when nothing is pending, and a
+// descriptive string like "TERMINATE_ON_HOST_MAINTENANCE" when the VM is
+// about to be live-migrated or terminated. Preemptible/spot VMs get a
+// 30-second warning before termination.
+//
+// A failed request or an empty body is treated as "nothing pending".
+void
+CloudMetadata::PollGCPTermination()
+{
+    ZEN_TRACE_CPU("CloudMetadata::PollGCPTermination");
+
+    HttpClient ImdsClient(m_ImdsEndpoint,
+                          {.LogCategory = "cloud-gcp", .ConnectTimeout = kImdsTimeout, .Timeout = std::chrono::milliseconds{2000}});
+
+    HttpClient::KeyValueMap MetadataHeaders(std::pair<std::string_view, std::string_view>{"Metadata-Flavor", "Google"});
+
+    HttpClient::Response MaintenanceResponse = ImdsClient.Get("/computeMetadata/v1/instance/maintenance-event", MetadataHeaders);
+    if (!MaintenanceResponse.IsSuccess())
+    {
+        return;
+    }
+
+    std::string_view Event = MaintenanceResponse.AsText();
+    if (Event.empty() || Event == "NONE")
+    {
+        return;
+    }
+
+    // Store the reason before raising the flag (both under the reason lock)
+    // so a reader that observes the flag never sees an empty reason.
+    bool Published = false;
+    m_ReasonLock.WithExclusiveLock([&] {
+        if (!m_TerminationPending.load())
+        {
+            m_TerminationReason = fmt::format("GCP maintenance event: {}", Event);
+            m_TerminationPending.store(true);
+            Published = true;
+        }
+    });
+
+    if (Published)
+    {
+        ZEN_WARN("GCP maintenance event detected: {}", Event);
+    }
+}
+
+} // namespace zen::compute
+
+//////////////////////////////////////////////////////////////////////////
+
+#if ZEN_WITH_TESTS
+
+# include <zencompute/mockimds.h>
+
+# include <zencore/filesystem.h>
+# include <zencore/testing.h>
+# include <zencore/testutils.h>
+# include <zenhttp/httpserver.h>
+
+# include <memory>
+# include <thread>
+
+namespace zen::compute {
+
+TEST_SUITE_BEGIN("compute.cloudmetadata");
+
+// ---------------------------------------------------------------------------
+// Test helper — spins up a local ASIO HTTP server hosting a MockImdsService
+// ---------------------------------------------------------------------------
+
+// Test fixture: owns a MockImdsService, a temporary directory and a local
+// HTTP server that serves the mock. Configure Mock before or after Start();
+// CreateCloud() builds a CloudMetadata pointed at the local server.
+struct TestImdsServer
+{
+    MockImdsService Mock;
+
+    // Creates the temp directory, brings up the "asio" HTTP server, registers
+    // the mock service and runs the server on a background thread.
+    void Start()
+    {
+        m_ScratchDir.emplace();
+
+        m_HttpServer = CreateHttpServer(HttpServerConfig{.ServerClass = "asio"});
+        m_BoundPort  = m_HttpServer->Initialize(7575, m_ScratchDir->Path() / "http");
+        REQUIRE(m_BoundPort != -1);
+
+        m_HttpServer->RegisterService(Mock);
+        m_RunThread = std::thread([this]() { m_HttpServer->Run(false); });
+    }
+
+    // Base URL of the local mock server
+    std::string Endpoint() const { return fmt::format("http://127.0.0.1:{}", m_BoundPort); }
+
+    // Directory handed to CloudMetadata for its sentinel files
+    std::filesystem::path DataDir() const { return m_ScratchDir->Path() / "cloud"; }
+
+    std::unique_ptr<CloudMetadata> CreateCloud() { return std::make_unique<CloudMetadata>(DataDir(), Endpoint()); }
+
+    // Shutdown order: request exit, wait for the run thread, then close
+    ~TestImdsServer()
+    {
+        if (m_HttpServer)
+        {
+            m_HttpServer->RequestExit();
+        }
+
+        if (m_RunThread.joinable())
+        {
+            m_RunThread.join();
+        }
+
+        if (m_HttpServer)
+        {
+            m_HttpServer->Close();
+        }
+    }
+
+private:
+    std::optional<ScopedTemporaryDirectory> m_ScratchDir;
+    Ref<HttpServer>                         m_HttpServer;
+    std::thread                             m_RunThread;
+    int                                     m_BoundPort = -1;
+};
+
+// ---------------------------------------------------------------------------
+// AWS
+// ---------------------------------------------------------------------------
+
+// Detection and termination polling against a mock AWS IMDS.
+TEST_CASE("cloudmetadata.aws")
+{
+    TestImdsServer Imds;
+    Imds.Mock.ActiveProvider = CloudProvider::AWS;
+
+    SUBCASE("detection basics")
+    {
+        Imds.Mock.Aws.InstanceId       = "i-abc123";
+        Imds.Mock.Aws.AvailabilityZone = "us-west-2b";
+        Imds.Mock.Aws.LifeCycle        = "on-demand";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+
+        CHECK(Metadata->GetProvider() == CloudProvider::AWS);
+
+        const CloudInstanceInfo Detected = Metadata->GetInstanceInfo();
+        CHECK(Detected.InstanceId == "i-abc123");
+        CHECK(Detected.AvailabilityZone == "us-west-2b");
+        CHECK_FALSE(Detected.IsSpot);
+        CHECK_FALSE(Detected.IsAutoscaling);
+        CHECK_FALSE(Metadata->IsTerminationPending());
+    }
+
+    SUBCASE("spot instance")
+    {
+        Imds.Mock.Aws.LifeCycle = "spot";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK(Metadata->GetInstanceInfo().IsSpot);
+    }
+
+    SUBCASE("autoscaling instance")
+    {
+        Imds.Mock.Aws.AutoscalingState = "InService";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK(Metadata->GetInstanceInfo().IsAutoscaling);
+    }
+
+    SUBCASE("spot termination")
+    {
+        Imds.Mock.Aws.LifeCycle = "spot";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK_FALSE(Metadata->IsTerminationPending());
+
+        // A spot interruption notice shows up after detection
+        Imds.Mock.Aws.SpotAction = R"({"action":"terminate","time":"2025-01-01T00:00:00Z"})";
+        Metadata->PollTermination();
+
+        CHECK(Metadata->IsTerminationPending());
+        const std::string Reason = Metadata->GetTerminationReason();
+        CHECK(Reason.find("spot interruption") != std::string::npos);
+    }
+
+    SUBCASE("autoscaling termination")
+    {
+        Imds.Mock.Aws.AutoscalingState = "InService";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK_FALSE(Metadata->IsTerminationPending());
+
+        // The ASG starts cycling the instance out
+        Imds.Mock.Aws.AutoscalingState = "Terminated:Wait";
+        Metadata->PollTermination();
+
+        CHECK(Metadata->IsTerminationPending());
+        const std::string Reason = Metadata->GetTerminationReason();
+        CHECK(Reason.find("autoscaling") != std::string::npos);
+    }
+
+    SUBCASE("no termination when InService")
+    {
+        Imds.Mock.Aws.AutoscalingState = "InService";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        Metadata->PollTermination();
+
+        CHECK_FALSE(Metadata->IsTerminationPending());
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Azure
+// ---------------------------------------------------------------------------
+
+// Detection and scheduled-event polling against a mock Azure IMDS.
+TEST_CASE("cloudmetadata.azure")
+{
+    TestImdsServer Imds;
+    Imds.Mock.ActiveProvider = CloudProvider::Azure;
+
+    SUBCASE("detection basics")
+    {
+        Imds.Mock.Azure.VmId     = "vm-test-1234";
+        Imds.Mock.Azure.Location = "westeurope";
+        Imds.Mock.Azure.Priority = "Regular";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+
+        CHECK(Metadata->GetProvider() == CloudProvider::Azure);
+
+        const CloudInstanceInfo Detected = Metadata->GetInstanceInfo();
+        CHECK(Detected.InstanceId == "vm-test-1234");
+        CHECK(Detected.AvailabilityZone == "westeurope");
+        CHECK_FALSE(Detected.IsSpot);
+        CHECK_FALSE(Detected.IsAutoscaling);
+        CHECK_FALSE(Metadata->IsTerminationPending());
+    }
+
+    SUBCASE("spot instance")
+    {
+        Imds.Mock.Azure.Priority = "Spot";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK(Metadata->GetInstanceInfo().IsSpot);
+    }
+
+    SUBCASE("vmss instance")
+    {
+        Imds.Mock.Azure.VmScaleSetName = "my-vmss";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK(Metadata->GetInstanceInfo().IsAutoscaling);
+    }
+
+    SUBCASE("preempt termination")
+    {
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK_FALSE(Metadata->IsTerminationPending());
+
+        // A Preempt event is scheduled after detection
+        Imds.Mock.Azure.ScheduledEventType   = "Preempt";
+        Imds.Mock.Azure.ScheduledEventStatus = "Scheduled";
+        Metadata->PollTermination();
+
+        CHECK(Metadata->IsTerminationPending());
+        const std::string Reason = Metadata->GetTerminationReason();
+        CHECK(Reason.find("Preempt") != std::string::npos);
+    }
+
+    SUBCASE("terminate event")
+    {
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK_FALSE(Metadata->IsTerminationPending());
+
+        Imds.Mock.Azure.ScheduledEventType = "Terminate";
+        Metadata->PollTermination();
+
+        CHECK(Metadata->IsTerminationPending());
+        const std::string Reason = Metadata->GetTerminationReason();
+        CHECK(Reason.find("Terminate") != std::string::npos);
+    }
+
+    SUBCASE("no termination when events empty")
+    {
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        Metadata->PollTermination();
+
+        CHECK_FALSE(Metadata->IsTerminationPending());
+    }
+}
+
+// ---------------------------------------------------------------------------
+// GCP
+// ---------------------------------------------------------------------------
+
+// Detection and maintenance-event polling against a mock GCP metadata server.
+TEST_CASE("cloudmetadata.gcp")
+{
+    TestImdsServer Imds;
+    Imds.Mock.ActiveProvider = CloudProvider::GCP;
+
+    SUBCASE("detection basics")
+    {
+        Imds.Mock.Gcp.InstanceId       = "9876543210";
+        Imds.Mock.Gcp.Zone             = "projects/123/zones/europe-west1-b";
+        Imds.Mock.Gcp.Preemptible      = "FALSE";
+        Imds.Mock.Gcp.MaintenanceEvent = "NONE";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+
+        CHECK(Metadata->GetProvider() == CloudProvider::GCP);
+
+        const CloudInstanceInfo Detected = Metadata->GetInstanceInfo();
+        CHECK(Detected.InstanceId == "9876543210");
+        CHECK(Detected.AvailabilityZone == "europe-west1-b");  // only the last path segment is kept
+        CHECK_FALSE(Detected.IsSpot);
+        CHECK_FALSE(Metadata->IsTerminationPending());
+    }
+
+    SUBCASE("preemptible instance")
+    {
+        Imds.Mock.Gcp.Preemptible = "TRUE";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK(Metadata->GetInstanceInfo().IsSpot);
+    }
+
+    SUBCASE("maintenance event during detection")
+    {
+        Imds.Mock.Gcp.MaintenanceEvent = "TERMINATE_ON_HOST_MAINTENANCE";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+
+        // An event that is already active at detection time marks termination
+        // as pending without waiting for a poll
+        CHECK(Metadata->IsTerminationPending());
+        const std::string Reason = Metadata->GetTerminationReason();
+        CHECK(Reason.find("maintenance") != std::string::npos);
+    }
+
+    SUBCASE("maintenance event during polling")
+    {
+        Imds.Mock.Gcp.MaintenanceEvent = "NONE";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        CHECK_FALSE(Metadata->IsTerminationPending());
+
+        Imds.Mock.Gcp.MaintenanceEvent = "TERMINATE_ON_HOST_MAINTENANCE";
+        Metadata->PollTermination();
+
+        CHECK(Metadata->IsTerminationPending());
+        const std::string Reason = Metadata->GetTerminationReason();
+        CHECK(Reason.find("maintenance") != std::string::npos);
+    }
+
+    SUBCASE("no termination when NONE")
+    {
+        Imds.Mock.Gcp.MaintenanceEvent = "NONE";
+        Imds.Start();
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+        Metadata->PollTermination();
+
+        CHECK_FALSE(Metadata->IsTerminationPending());
+    }
+}
+
+// ---------------------------------------------------------------------------
+// No provider
+// ---------------------------------------------------------------------------
+
+// With the mock answering for no provider, detection must come back empty.
+TEST_CASE("cloudmetadata.no_provider")
+{
+    TestImdsServer Imds;
+    Imds.Mock.ActiveProvider = CloudProvider::None;
+    Imds.Start();
+
+    const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+
+    CHECK(Metadata->GetProvider() == CloudProvider::None);
+
+    const CloudInstanceInfo Detected = Metadata->GetInstanceInfo();
+    CHECK(Detected.InstanceId.empty());
+    CHECK(Detected.AvailabilityZone.empty());
+    CHECK_FALSE(Detected.IsSpot);
+    CHECK_FALSE(Detected.IsAutoscaling);
+    CHECK_FALSE(Metadata->IsTerminationPending());
+}
+
+// ---------------------------------------------------------------------------
+// Sentinel file management
+// ---------------------------------------------------------------------------
+
+// Negative-cache sentinel files: written on failed probes, removable through
+// ClearSentinelFiles(), and absent for providers that were never probed.
+TEST_CASE("cloudmetadata.sentinel_files")
+{
+    TestImdsServer Imds;
+    Imds.Mock.ActiveProvider = CloudProvider::None;
+    Imds.Start();
+
+    const std::filesystem::path CloudDir = Imds.DataDir();
+
+    SUBCASE("sentinels are written on failed detection")
+    {
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+
+        CHECK(Metadata->GetProvider() == CloudProvider::None);
+        CHECK(zen::IsFile(CloudDir / ".isNotAWS"));
+        CHECK(zen::IsFile(CloudDir / ".isNotAzure"));
+        CHECK(zen::IsFile(CloudDir / ".isNotGCP"));
+    }
+
+    SUBCASE("ClearSentinelFiles removes sentinels")
+    {
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+
+        CHECK(zen::IsFile(CloudDir / ".isNotAWS"));
+        CHECK(zen::IsFile(CloudDir / ".isNotAzure"));
+        CHECK(zen::IsFile(CloudDir / ".isNotGCP"));
+
+        Metadata->ClearSentinelFiles();
+
+        CHECK_FALSE(zen::IsFile(CloudDir / ".isNotAWS"));
+        CHECK_FALSE(zen::IsFile(CloudDir / ".isNotAzure"));
+        CHECK_FALSE(zen::IsFile(CloudDir / ".isNotGCP"));
+    }
+
+    SUBCASE("only failed providers get sentinels")
+    {
+        // With AWS active the first probe succeeds, so Azure and GCP are never
+        // probed and no sentinel is written for any provider
+        Imds.Mock.ActiveProvider = CloudProvider::AWS;
+
+        const std::unique_ptr<CloudMetadata> Metadata = Imds.CreateCloud();
+
+        CHECK(Metadata->GetProvider() == CloudProvider::AWS);
+        CHECK_FALSE(zen::IsFile(CloudDir / ".isNotAWS"));
+        CHECK_FALSE(zen::IsFile(CloudDir / ".isNotAzure"));
+        CHECK_FALSE(zen::IsFile(CloudDir / ".isNotGCP"));
+    }
+}
+
+TEST_SUITE_END();
+
+// Intentionally empty.
+// NOTE(review): presumably exists so other code can reference a symbol from
+// this translation unit and keep its tests from being dropped at link time -
+// confirm against the caller.
+void
+cloudmetadata_forcelink()
+{
+}
+
+} // namespace zen::compute
+
+#endif // ZEN_WITH_TESTS