Quellcode durchsuchen

Merge pull request #2761 from pyth-network/cprussin/measure-timings-for-history-queries

feat(fortuna): add prometheus metrics for history query latency
Connor Prussin vor 5 Monaten
Ursprung
Commit
a1d0e22ea8

+ 2 - 2
apps/fortuna/Cargo.lock

@@ -3108,9 +3108,9 @@ dependencies = [
 
 [[package]]
 name = "prometheus-client"
-version = "0.21.2"
+version = "0.23.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c99afa9a01501019ac3a14d71d9f94050346f55ca471ce90c799a15c58f61e2"
+checksum = "cf41c1a7c32ed72abe5082fb19505b969095c12da9f5732a4bc9878757fd087c"
 dependencies = [
  "dtoa",
  "itoa",

+ 1 - 1
apps/fortuna/Cargo.toml

@@ -19,7 +19,7 @@ ethabi = "18.0.0"
 ethers = { version = "2.0.14", features = ["ws"] }
 futures = { version = "0.3.28" }
 hex = "0.4.3"
-prometheus-client = { version = "0.21.2" }
+prometheus-client = { version = "0.23.1" }
 pythnet-sdk = { path = "../../pythnet/pythnet_sdk", features = ["strum"] }
 rand = "0.8.5"
 reqwest = { version = "0.11.22", features = ["json", "blocking"] }

+ 5 - 0
apps/fortuna/src/api.rs

@@ -61,6 +61,8 @@ pub struct ApiState {
 
     /// Prometheus metrics
     pub metrics: Arc<ApiMetrics>,
+
+    pub explorer_metrics: Arc<ExplorerMetrics>,
 }
 
 impl ApiState {
@@ -73,6 +75,8 @@ impl ApiState {
             http_requests: Family::default(),
         };
 
+        let explorer_metrics = Arc::new(ExplorerMetrics::new(metrics_registry.clone()).await);
+
         let http_requests = metrics.http_requests.clone();
         metrics_registry.write().await.register(
             "http_requests",
@@ -83,6 +87,7 @@ impl ApiState {
         ApiState {
             chains,
             metrics: Arc::new(metrics),
+            explorer_metrics,
             history,
             metrics_registry,
         }

+ 97 - 3
apps/fortuna/src/api/explorer.rs

@@ -1,17 +1,89 @@
 use {
     crate::{
         api::{ApiBlockChainState, NetworkId, RestError, StateTag},
-        history::RequestStatus,
+        config::LATENCY_BUCKETS,
+        history::{RequestQueryBuilder, RequestStatus, SearchField},
     },
     axum::{
         extract::{Query, State},
         Json,
     },
     chrono::{DateTime, Utc},
+    prometheus_client::{
+        encoding::{EncodeLabelSet, EncodeLabelValue},
+        metrics::{family::Family, histogram::Histogram},
+        registry::Registry,
+    },
+    std::sync::Arc,
+    tokio::{sync::RwLock, time::Instant},
     utoipa::IntoParams,
 };
 
-#[derive(Debug, serde::Serialize, serde::Deserialize, IntoParams)]
+#[derive(Debug)]
+pub struct ExplorerMetrics {
+    results_latency: Family<QueryTags, Histogram>,
+    count_latency: Family<QueryTags, Histogram>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash, EncodeLabelSet)]
+pub struct QueryTags {
+    search_type: Option<SearchType>,
+    has_network_id_filter: bool,
+    has_state_filter: bool,
+}
+
+impl<'a> From<RequestQueryBuilder<'a>> for QueryTags {
+    fn from(builder: RequestQueryBuilder<'a>) -> Self {
+        QueryTags {
+            search_type: builder.search.map(|val| match val {
+                SearchField::TxHash(_) => SearchType::TxHash,
+                SearchField::Sender(_) => SearchType::Sender,
+                SearchField::SequenceNumber(_) => SearchType::SequenceNumber,
+            }),
+            has_network_id_filter: builder.network_id.is_some(),
+            has_state_filter: builder.state.is_some(),
+        }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash, EncodeLabelValue)]
+enum SearchType {
+    TxHash,
+    Sender,
+    SequenceNumber,
+}
+
+impl ExplorerMetrics {
+    pub async fn new(metrics_registry: Arc<RwLock<Registry>>) -> Self {
+        let mut guard = metrics_registry.write().await;
+        let sub_registry = guard.sub_registry_with_prefix("explorer");
+
+        let results_latency = Family::<QueryTags, Histogram>::new_with_constructor(|| {
+            Histogram::new(LATENCY_BUCKETS.into_iter())
+        });
+        sub_registry.register(
+            "results_latency",
+            "The latency of requests to the database to collect the limited results.",
+            results_latency.clone(),
+        );
+
+        let count_latency = Family::<QueryTags, Histogram>::new_with_constructor(|| {
+            Histogram::new(LATENCY_BUCKETS.into_iter())
+        });
+        sub_registry.register(
+            "count_latency",
+            "The latency of requests to the database to collect the total matching result count.",
+            count_latency.clone(),
+        );
+
+        Self {
+            results_latency,
+            count_latency,
+        }
+    }
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, IntoParams)]
 #[into_params(parameter_in=Query)]
 pub struct ExplorerQueryParams {
     /// Only return logs that are newer or equal to this timestamp. Timestamp is in ISO 8601 format with UTC timezone.
@@ -96,7 +168,13 @@ pub async fn explorer(
         query = query.max_timestamp(max_timestamp);
     }
 
-    let (requests, total_results) = tokio::join!(query.execute(), query.count_results());
+    let results_latency = &state.explorer_metrics.results_latency;
+    let count_latency = &state.explorer_metrics.count_latency;
+    let query_tags = &query.clone().into();
+    let (requests, total_results) = tokio::join!(
+        measure_latency(results_latency, query_tags, query.execute()),
+        measure_latency(count_latency, query_tags, query.count_results())
+    );
     let requests = requests.map_err(|_| RestError::TemporarilyUnavailable)?;
     let total_results = total_results.map_err(|_| RestError::TemporarilyUnavailable)?;
 
@@ -105,3 +183,19 @@ pub async fn explorer(
         total_results,
     }))
 }
+
+async fn measure_latency<T, F>(
+    metric: &Family<QueryTags, Histogram>,
+    query_tags: &QueryTags,
+    function: F,
+) -> T
+where
+    F: std::future::Future<Output = T>,
+{
+    let start = Instant::now();
+    let return_value = function.await;
+    metric
+        .get_or_create(query_tags)
+        .observe(start.elapsed().as_secs_f64());
+    return_value
+}

+ 9 - 2
apps/fortuna/src/config.rs

@@ -11,8 +11,9 @@ use {
 };
 pub use {
     generate::GenerateOptions, get_request::GetRequestOptions, inspect::InspectOptions,
-    register_provider::RegisterProviderOptions, request_randomness::RequestRandomnessOptions,
-    run::RunOptions, setup_provider::SetupProviderOptions, withdraw_fees::WithdrawFeesOptions,
+    prometheus_client::metrics::histogram::Histogram, register_provider::RegisterProviderOptions,
+    request_randomness::RequestRandomnessOptions, run::RunOptions,
+    setup_provider::SetupProviderOptions, withdraw_fees::WithdrawFeesOptions,
 };
 
 mod generate;
@@ -367,3 +368,9 @@ impl SecretString {
         Ok(None)
     }
 }
+
+/// This is a histogram with a bucket configuration appropriate for most things
+/// which measure latency to external services.
+pub const LATENCY_BUCKETS: [f64; 11] = [
+    0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0,
+];

+ 2 - 7
apps/fortuna/src/eth_utils/traced_client.rs

@@ -1,5 +1,5 @@
 use {
-    crate::api::ChainId,
+    crate::{api::ChainId, config::LATENCY_BUCKETS},
     anyhow::Result,
     axum::async_trait,
     ethers::{
@@ -42,12 +42,7 @@ impl RpcMetrics {
         );
 
         let latency = Family::<RpcLabel, Histogram>::new_with_constructor(|| {
-            Histogram::new(
-                [
-                    0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0,
-                ]
-                .into_iter(),
-            )
+            Histogram::new(LATENCY_BUCKETS.into_iter())
         });
         sub_registry.register(
             "latency",

+ 8 - 8
apps/fortuna/src/history.rs

@@ -345,13 +345,13 @@ impl History {
 #[derive(Debug, Clone)]
 pub struct RequestQueryBuilder<'a> {
     pool: &'a Pool<Sqlite>,
-    search: Option<SearchField>,
-    network_id: Option<i64>,
-    state: Option<StateTag>,
-    limit: i64,
-    offset: i64,
-    min_timestamp: DateTime<chrono::Utc>,
-    max_timestamp: DateTime<chrono::Utc>,
+    pub search: Option<SearchField>,
+    pub network_id: Option<i64>,
+    pub state: Option<StateTag>,
+    pub limit: i64,
+    pub offset: i64,
+    pub min_timestamp: DateTime<chrono::Utc>,
+    pub max_timestamp: DateTime<chrono::Utc>,
 }
 
 impl<'a> RequestQueryBuilder<'a> {
@@ -503,7 +503,7 @@ pub enum RequestQueryBuilderError {
 }
 
 #[derive(Debug, Clone)]
-enum SearchField {
+pub enum SearchField {
     TxHash(TxHash),
     Sender(Address),
     SequenceNumber(i64),

+ 7 - 12
apps/fortuna/src/keeper/keeper_metrics.rs

@@ -69,24 +69,19 @@ impl Default for KeeperMetrics {
             requests_reprocessed: Family::default(),
             reveals: Family::default(),
             request_duration_ms: Family::new_with_constructor(|| {
-                Histogram::new(
-                    vec![
-                        1000.0, 2500.0, 5000.0, 7500.0, 10000.0, 20000.0, 30000.0, 40000.0,
-                        50000.0, 60000.0, 120000.0, 180000.0, 240000.0, 300000.0, 600000.0,
-                    ]
-                    .into_iter(),
-                )
+                Histogram::new(vec![
+                    1000.0, 2500.0, 5000.0, 7500.0, 10000.0, 20000.0, 30000.0, 40000.0, 50000.0,
+                    60000.0, 120000.0, 180000.0, 240000.0, 300000.0, 600000.0,
+                ])
             }),
             retry_count: Family::new_with_constructor(|| {
-                Histogram::new(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0].into_iter())
+                Histogram::new(vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0])
             }),
             final_gas_multiplier: Family::new_with_constructor(|| {
-                Histogram::new(
-                    vec![100.0, 125.0, 150.0, 200.0, 300.0, 400.0, 500.0, 600.0].into_iter(),
-                )
+                Histogram::new(vec![100.0, 125.0, 150.0, 200.0, 300.0, 400.0, 500.0, 600.0])
             }),
             final_fee_multiplier: Family::new_with_constructor(|| {
-                Histogram::new(vec![100.0, 110.0, 120.0, 140.0, 160.0, 180.0, 200.0].into_iter())
+                Histogram::new(vec![100.0, 110.0, 120.0, 140.0, 160.0, 180.0, 200.0])
             }),
             gas_price_estimate: Family::default(),
             highest_revealed_sequence_number: Family::default(),