Browse Source

feat(fortuna): add `request_failovers_triggered` metric (#3110)

* Add request_failovers_triggered metric to Fortuna

This metric tracks when backup replicas perform failover to fulfill
requests that the primary replica failed to handle. It will be used
for alerting when primary instances are not working correctly.

Co-Authored-By: Tejas Badadare <tejas@dourolabs.xyz>

* chore: doc, version bump

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Tejas Badadare 1 month ago
parent
commit
ddf2d875bc

+ 1 - 1
Cargo.lock

@@ -3094,7 +3094,7 @@ dependencies = [
 
 [[package]]
 name = "fortuna"
-version = "9.2.1"
+version = "9.2.2"
 dependencies = [
  "anyhow",
  "axum 0.6.20",

+ 1 - 1
apps/fortuna/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "fortuna"
-version = "9.2.1"
+version = "9.2.2"
 edition = "2021"
 
 [lib]

+ 11 - 0
apps/fortuna/src/keeper/keeper_metrics.rs

@@ -35,6 +35,7 @@ pub struct KeeperMetrics {
     pub requests_processed_success: Family<AccountLabel, Counter>,
     pub requests_processed_failure: Family<AccountLabel, Counter>,
     pub requests_reprocessed: Family<AccountLabel, Counter>,
+    pub request_failovers_triggered: Family<AccountLabel, Counter>,
     pub reveals: Family<AccountLabel, Counter>,
     pub request_duration_ms: Family<AccountLabel, Histogram>,
     pub retry_count: Family<AccountLabel, Histogram>,
@@ -66,6 +67,7 @@ impl Default for KeeperMetrics {
             requests_processed_success: Family::default(),
             requests_processed_failure: Family::default(),
             requests_reprocessed: Family::default(),
+            request_failovers_triggered: Family::default(),
             reveals: Family::default(),
             request_duration_ms: Family::new_with_constructor(|| {
                 Histogram::new(vec![
@@ -186,6 +188,12 @@ impl KeeperMetrics {
             keeper_metrics.requests_reprocessed.clone(),
         );
 
+        writable_registry.register(
+            "request_failovers_triggered",
+            "Number of requests where backup replica attempted to fulfill the request",
+            keeper_metrics.request_failovers_triggered.clone(),
+        );
+
         writable_registry.register(
             "request_duration_ms",
             "Time taken to process each successful callback request in milliseconds",
@@ -297,6 +305,9 @@ impl KeeperMetrics {
             .requests_processed_failure
             .get_or_create(&account_label);
         let _ = self.requests_reprocessed.get_or_create(&account_label);
+        let _ = self
+            .request_failovers_triggered
+            .get_or_create(&account_label);
         let _ = self.reveals.get_or_create(&account_label);
         let _ = self.request_duration_ms.get_or_create(&account_label);
         let _ = self.retry_count.get_or_create(&account_label);

+ 9 - 0
apps/fortuna/src/keeper/process_event.rs

@@ -91,6 +91,15 @@ pub async fn process_event_with_backoff(
                     );
                 }
             }
+
+            let account_label = AccountLabel {
+                chain_id: chain_state.id.clone(),
+                address: chain_state.provider_address.to_string(),
+            };
+            metrics
+                .request_failovers_triggered
+                .get_or_create(&account_label)
+                .inc();
         }
     }