aboutsummaryrefslogtreecommitdiff
path: root/src/zencompute/include/zencompute/cloudmetadata.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/zencompute/include/zencompute/cloudmetadata.h')
-rw-r--r-- src/zencompute/include/zencompute/cloudmetadata.h 151
1 files changed, 151 insertions, 0 deletions
diff --git a/src/zencompute/include/zencompute/cloudmetadata.h b/src/zencompute/include/zencompute/cloudmetadata.h
new file mode 100644
index 000000000..a5bc5a34d
--- /dev/null
+++ b/src/zencompute/include/zencompute/cloudmetadata.h
@@ -0,0 +1,151 @@
+// Copyright Epic Games, Inc. All Rights Reserved.
+
+#pragma once
+
+#include <zencore/compactbinarybuilder.h>
+#include <zencore/logging.h>
+#include <zencore/thread.h>
+
+#include <atomic>
+#include <filesystem>
+#include <string>
+#include <thread>
+
+namespace zen::compute {
+
+enum class CloudProvider // cloud platforms distinguished by this header; enumerator values are implicit
+{
+ None, // no provider; also the default value of CloudInstanceInfo::Provider
+ AWS, // Amazon Web Services
+ Azure, // Microsoft Azure
+ GCP // Google Cloud Platform
+};
+
+std::string_view ToString(CloudProvider Provider); // defined elsewhere; presumably a display name for Provider. NOTE(review): std::string_view is used but <string_view> is not included directly by this header
+
+/** Snapshot of detected cloud instance properties. Default state: Provider is None, both strings empty, both flags false. */
+struct CloudInstanceInfo
+{
+ CloudProvider Provider = CloudProvider::None; // defaults to None
+ std::string InstanceId; // presumably the provider-assigned instance identifier - confirm against the implementation
+ std::string AvailabilityZone; // presumably the zone reported by the metadata service - confirm
+ bool IsSpot = false; // presumably true for spot/preemptible capacity - confirm
+ bool IsAutoscaling = false; // presumably true when the instance is managed by an autoscaling group - confirm
+};
+
+/**
+ * Detects whether the process is running on a cloud VM (AWS, Azure, or GCP)
+ * and monitors for impending termination signals.
+ *
+ * Detection works by querying the Instance Metadata Service (IMDS) at the
+ * well-known link-local address 169.254.169.254, which is only routable from
+ * within a cloud VM. Each provider is probed in sequence (AWS -> Azure -> GCP);
+ * the first successful response wins.
+ *
+ * To avoid a ~200ms connect timeout penalty on every startup when running on
+ * bare-metal or non-cloud machines, failed probes write sentinel files
+ * (e.g. ".isNotAWS") to DataDir. Subsequent startups skip providers that have
+ * a sentinel present. Delete the sentinel files to force re-detection.
+ *
+ * When a provider is detected, a background thread polls for termination
+ * signals every 5 seconds (spot interruption, autoscaling lifecycle changes,
+ * scheduled maintenance). The termination state is exposed as an atomic bool
+ * so the compute server can include it in coordinator announcements and react
+ * to imminent shutdown.
+ *
+ * Thread safety: GetInstanceInfo() and GetTerminationReason() each take a
+ * shared lock (presumably m_InfoLock and m_ReasonLock respectively); the
+ * monitor thread takes the exclusive lock only when writing the termination
+ * reason (a one-time transition). The pending flag is a std::atomic<bool>.
+ *
+ * Usage:
+ * auto Cloud = std::make_unique<CloudMetadata>(DataDir / "cloud");
+ * if (Cloud->IsTerminationPending()) { ... }
+ * Cloud->Describe(AnnounceBody); // writes "cloud" sub-object into CB
+ */
+class CloudMetadata
+{
+public:
+ /** Synchronously probes cloud providers and starts the termination monitor
+ * if a provider is detected. Creates DataDir if it does not exist.
+ */
+ explicit CloudMetadata(std::filesystem::path DataDir);
+
+ /** Synchronously probes cloud providers at the given IMDS endpoint.
+ * Intended for testing — allows redirecting all IMDS queries to a local
+ * mock HTTP server instead of the real 169.254.169.254 endpoint.
+ */
+ CloudMetadata(std::filesystem::path DataDir, std::string ImdsEndpoint);
+
+ /** Stops the termination monitor thread and joins it. */
+ ~CloudMetadata();
+
+ CloudMetadata(const CloudMetadata&) = delete; // non-copyable; declaring the copy operations also suppresses the implicit moves
+ CloudMetadata& operator=(const CloudMetadata&) = delete;
+
+ CloudProvider GetProvider() const; // presumably the Provider field of the instance snapshot - confirm
+ CloudInstanceInfo GetInstanceInfo() const; // returns the snapshot by value
+ bool IsTerminationPending() const; // reports the termination state described in the class comment
+ std::string GetTerminationReason() const; // returns the reason string by value
+
+ /** Writes a "cloud" sub-object into the compact binary writer if a provider
+ * was detected. No-op when running on bare metal.
+ */
+ void Describe(CbWriter& Writer) const;
+
+ /** Executes a single termination-poll cycle for the detected provider.
+ * Public so tests can drive poll cycles synchronously without relying on
+ * the background thread's 5-second timer.
+ */
+ void PollTermination();
+
+ /** Removes the negative-cache sentinel files (.isNotAWS, .isNotAzure,
+ * .isNotGCP) from DataDir so subsequent detection probes are not skipped.
+ * Primarily intended for tests that need to reset state between sub-cases.
+ */
+ void ClearSentinelFiles();
+
+private:
+ /** Tries each provider in order, stops on first successful detection. */
+ void DetectProvider();
+ bool TryDetectAWS(); // each TryDetect* presumably returns true when that provider answered - confirm
+ bool TryDetectAzure();
+ bool TryDetectGCP();
+
+ void WriteSentinelFile(const std::filesystem::path& Path); // negative-cache marker helpers (see class comment)
+ bool HasSentinelFile(const std::filesystem::path& Path) const;
+
+ void StartTerminationMonitor(); // presumably launches m_MonitorThread - confirm
+ void TerminationMonitorThread(); // presumably the body run by m_MonitorThread - confirm
+ void PollAWSTermination(); // per-provider poll steps, presumably dispatched from PollTermination()
+ void PollAzureTermination();
+ void PollGCPTermination();
+
+ LoggerRef Log() const { return m_Log; } // const so that const members (e.g. Describe) can obtain the logger as well
+
+ LoggerRef m_Log;
+ std::filesystem::path m_DataDir; // directory that holds the sentinel files
+ std::string m_ImdsEndpoint; // IMDS endpoint; overridable through the two-argument constructor
+
+ mutable RwLock m_InfoLock; // presumably guards m_Info - confirm in the implementation
+ CloudInstanceInfo m_Info;
+
+ std::atomic<bool> m_TerminationPending{false}; // starts false; the class comment describes it as a one-way signal
+
+ mutable RwLock m_ReasonLock; // presumably guards m_TerminationReason - confirm in the implementation
+ std::string m_TerminationReason;
+
+ // IMDSv2 session token, acquired during AWS detection and reused for
+ // subsequent termination polling. Has a 300s TTL on the AWS side; if it
+ // expires mid-run the poll requests will get 401s which we treat as
+ // non-terminal (the monitor simply retries next cycle).
+ std::string m_AwsToken; // NOTE(review): nothing in this header shows the token being refreshed after expiry - confirm polls recover
+
+ std::thread m_MonitorThread; // joined by the destructor (see the ~CloudMetadata comment)
+ std::atomic<bool> m_MonitorEnabled{true}; // presumably cleared to ask the monitor thread to exit - confirm
+ Event m_MonitorEvent; // presumably signalled to wake the monitor early on shutdown - confirm
+};
+
+void cloudmetadata_forcelink(); // internal; NOTE(review): presumably a force-link anchor for this module's translation unit - confirm
+
+} // namespace zen::compute