瀏覽代碼

drozdziak1/pyth attester prometheus metrics (#396)

* Add prometheus to Tilt with attester metrics

* Dockerfile.prometheus: re-add after faulty merge

* p2w-attest: Clarify port number, remove subpage config

* p2w-attest/util.rs: fix warnings

* p2w-attest: typo
Stanisław Drozd 3 年之前
父節點
當前提交
bbe35df8cf

+ 3 - 0
Dockerfile.prometheus

@@ -0,0 +1,3 @@
+FROM prom/prometheus
+
+ADD --chown=nobody:nobody ./prometheus_config.yaml .

+ 16 - 1
Tiltfile

@@ -198,7 +198,7 @@ k8s_yaml_with_ns("devnet/p2w-attest.yaml")
 k8s_resource(
     "p2w-attest",
     resource_deps = ["solana-devnet", "pyth", "guardian"],
-    port_forwards = [],
+    port_forwards = [port_forward(3000, name = "metrics", host = webHost)],
     labels = ["pyth"],
     trigger_mode = trigger_mode,
 )
@@ -320,3 +320,18 @@ k8s_resource(
     labels = ["terra"],
     trigger_mode = trigger_mode,
 )
+
+docker_build(
+    ref = "prometheus",
+    context = ".",
+    dockerfile = "Dockerfile.prometheus",
+)
+
+k8s_yaml_with_ns("devnet/prometheus.yaml")
+
+k8s_resource(
+    "prometheus",
+    port_forwards = [port_forward(9090, name = "Prometheus dashboard", host = webHost)],
+    labels = ["prometheus"],
+    trigger_mode = trigger_mode,
+)

+ 5 - 0
devnet/p2w-attest.yaml

@@ -10,6 +10,8 @@ spec:
     - port: 4343
       name: p2w-attest
       protocol: TCP
+    - port: 3000
+      name: metrics
   clusterIP: None
   selector:
     app: p2w-attest
@@ -52,3 +54,6 @@ spec:
             - containerPort: 4343
               name: p2w-attest
               protocol: TCP
+            - containerPort: 3000
+              name: metrics
+              protocol: TCP

+ 48 - 0
devnet/prometheus.yaml

@@ -0,0 +1,48 @@
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  labels:
+    app: prometheus
+spec:
+  clusterIP: None
+  selector:
+    app: prometheus
+  ports:
+    - port: 9090
+      name: dashboard
+      protocol: TCP
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: prometheus
+spec:
+  selector:
+    matchLabels:
+      app: prometheus
+  serviceName: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      restartPolicy: Always
+      terminationGracePeriodSeconds: 0
+      containers:
+        - name: prometheus
+          image: prometheus
+          readinessProbe:
+            tcpSocket:
+              port: 9090
+            periodSeconds: 1
+            failureThreshold: 300
+          ports:
+            - containerPort: 9090
+              name: dashboard
+              protocol: TCP
+          command:
+            - "prometheus"
+            - "--config.file=prometheus_config.yaml"
+            - "--web.external-url=http://[::]:9090"

+ 5 - 0
prometheus_config.yaml

@@ -0,0 +1,5 @@
+scrape_configs:
+  - job_name: p2w_attest
+    scrape_interval: 5s
+    static_configs:
+      - targets: ["p2w-attest:3000"]

+ 169 - 3
solana/pyth2wormhole/Cargo.lock

@@ -561,6 +561,16 @@ dependencies = [
  "regex-automata",
 ]
 
+[[package]]
+name = "buf_redux"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f"
+dependencies = [
+ "memchr",
+ "safemem",
+]
+
 [[package]]
 name = "bumpalo"
 version = "3.9.1"
@@ -1508,6 +1518,31 @@ dependencies = [
  "ahash",
 ]
 
+[[package]]
+name = "headers"
+version = "0.3.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3e372db8e5c0d213e0cd0b9be18be2aca3d44cf2fe30a9d46a65581cd454584"
+dependencies = [
+ "base64 0.13.0",
+ "bitflags",
+ "bytes",
+ "headers-core",
+ "http",
+ "httpdate",
+ "mime",
+ "sha1",
+]
+
+[[package]]
+name = "headers-core"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429"
+dependencies = [
+ "http",
+]
+
 [[package]]
 name = "heck"
 version = "0.3.3"
@@ -1585,9 +1620,9 @@ dependencies = [
 
 [[package]]
 name = "http"
-version = "0.2.7"
+version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff8670570af52249509a86f5e3e18a08c60b177071826898fde8997cf5f6bfbb"
+checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399"
 dependencies = [
  "bytes",
  "fnv",
@@ -1964,6 +1999,16 @@ version = "0.3.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
 
+[[package]]
+name = "mime_guess"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
+dependencies = [
+ "mime",
+ "unicase",
+]
+
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
@@ -2022,6 +2067,24 @@ dependencies = [
  "syn 1.0.94",
 ]
 
+[[package]]
+name = "multipart"
+version = "0.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00dec633863867f29cb39df64a397cdf4a6354708ddd7759f70c7fb51c5f9182"
+dependencies = [
+ "buf_redux",
+ "httparse",
+ "log",
+ "mime",
+ "mime_guess",
+ "quick-error",
+ "rand 0.8.5",
+ "safemem",
+ "tempfile",
+ "twoway",
+]
+
 [[package]]
 name = "nix"
 version = "0.23.1"
@@ -2542,6 +2605,27 @@ dependencies = [
  "yansi",
 ]
 
+[[package]]
+name = "prometheus"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c"
+dependencies = [
+ "cfg-if",
+ "fnv",
+ "lazy_static",
+ "memchr",
+ "parking_lot 0.12.0",
+ "protobuf",
+ "thiserror",
+]
+
+[[package]]
+name = "protobuf"
+version = "2.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
+
 [[package]]
 name = "pyth-client"
 version = "0.2.2"
@@ -2642,15 +2726,18 @@ dependencies = [
 
 [[package]]
 name = "pyth2wormhole-client"
-version = "1.0.0"
+version = "1.1.0"
 dependencies = [
  "borsh",
  "clap 3.1.18",
  "env_logger 0.8.4",
  "futures",
  "generic-array",
+ "http",
+ "lazy_static",
  "log",
  "p2w-sdk",
+ "prometheus",
  "pyth-client 0.5.1",
  "pyth-sdk-solana 0.6.1",
  "pyth2wormhole",
@@ -2665,6 +2752,7 @@ dependencies = [
  "solana-transaction-status",
  "solitaire",
  "tokio",
+ "warp",
  "wormhole-bridge-solana",
 ]
 
@@ -2690,6 +2778,12 @@ dependencies = [
  "percent-encoding",
 ]
 
+[[package]]
+name = "quick-error"
+version = "1.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
+
 [[package]]
 name = "quinn"
 version = "0.8.4"
@@ -3123,6 +3217,12 @@ version = "1.0.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695"
 
+[[package]]
+name = "safemem"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072"
+
 [[package]]
 name = "same-file"
 version = "1.0.6"
@@ -3166,6 +3266,12 @@ dependencies = [
  "syn 1.0.94",
 ]
 
+[[package]]
+name = "scoped-tls"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
+
 [[package]]
 name = "scopeguard"
 version = "1.1.0"
@@ -3317,6 +3423,17 @@ dependencies = [
  "digest 0.10.5",
 ]
 
+[[package]]
+name = "sha1"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest 0.10.5",
+]
+
 [[package]]
 name = "sha2"
 version = "0.9.9"
@@ -4852,6 +4969,15 @@ dependencies = [
  "webpki-roots",
 ]
 
+[[package]]
+name = "twoway"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "typenum"
 version = "1.15.0"
@@ -4870,6 +4996,15 @@ dependencies = [
  "static_assertions",
 ]
 
+[[package]]
+name = "unicase"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+dependencies = [
+ "version_check",
+]
+
 [[package]]
 name = "unicode-bidi"
 version = "0.3.8"
@@ -5007,6 +5142,37 @@ dependencies = [
  "try-lock",
 ]
 
+[[package]]
+name = "warp"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed7b8be92646fc3d18b06147664ebc5f48d222686cb11a8755e561a735aacc6d"
+dependencies = [
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "headers",
+ "http",
+ "hyper",
+ "log",
+ "mime",
+ "mime_guess",
+ "multipart",
+ "percent-encoding",
+ "pin-project",
+ "rustls-pemfile 0.2.1",
+ "scoped-tls",
+ "serde",
+ "serde_json",
+ "serde_urlencoded",
+ "tokio",
+ "tokio-stream",
+ "tokio-tungstenite",
+ "tokio-util 0.7.2",
+ "tower-service",
+ "tracing",
+]
+
 [[package]]
 name = "wasi"
 version = "0.9.0+wasi-snapshot-preview1"

+ 5 - 1
solana/pyth2wormhole/client/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "pyth2wormhole-client"
-version = "1.0.0"
+version = "1.1.0"
 edition = "2018"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -33,6 +33,10 @@ tokio = {version = "1", features = ["sync", "rt-multi-thread", "time"]}
 futures = "0.3.21"
 sha3 = "0.10.6"
 generic-array = "0.14.6"
+lazy_static = "1.4.0"
+prometheus = "0.13.3"
+warp = "0.3.3"
+http = "0.2.8"
 
 [dev-dependencies]
 pyth-client = "0.5.0"

+ 11 - 1
solana/pyth2wormhole/client/src/cli.rs

@@ -7,7 +7,10 @@ use {
     },
     solana_program::pubkey::Pubkey,
     solana_sdk::commitment_config::CommitmentConfig,
-    std::path::PathBuf,
+    std::{
+        net::SocketAddr,
+        path::PathBuf,
+    },
 };
 
 #[derive(Parser)]
@@ -84,6 +87,13 @@ pub enum Action {
             default_value = "20"
         )]
         confirmation_timeout_secs: u64,
+        #[clap(
+            short = 'm',
+            long,
+            help = "Address to use for serving Prometheus metrics.",
+            default_value = "[::]:3000"
+        )]
+        metrics_bind_addr:         SocketAddr,
     },
     #[clap(about = "Retrieve a pyth2wormhole program's current settings")]
     GetConfig,

+ 1 - 0
solana/pyth2wormhole/client/src/lib.rs

@@ -13,6 +13,7 @@ pub use {
     message::P2WMessageQueue,
     pyth2wormhole::Pyth2WormholeConfig,
     util::{
+        start_metrics_server,
         RLMutex,
         RLMutexGuard,
     },

+ 46 - 4
solana/pyth2wormhole/client/src/main.rs

@@ -11,6 +11,7 @@ use {
         TryFutureExt,
     },
     generic_array::GenericArray,
+    lazy_static::lazy_static,
     log::{
         debug,
         error,
@@ -19,6 +20,12 @@ use {
         LevelFilter,
     },
     p2w_sdk::P2WEmitter,
+    prometheus::{
+        register_int_counter,
+        register_int_gauge,
+        IntCounter,
+        IntGauge,
+    },
     pyth2wormhole::{
         attest::P2W_MAX_BATCH_SIZE,
         Pyth2WormholeConfig,
@@ -45,6 +52,7 @@ use {
     },
     std::{
         fs::File,
+        net::SocketAddr,
         sync::Arc,
         time::{
             Duration,
@@ -62,6 +70,18 @@ use {
 
 pub const SEQNO_PREFIX: &'static str = "Program log: Sequence: ";
 
+lazy_static! { // Prometheus metrics updated by the attester
+    static ref ATTESTATIONS_OK_CNT: IntCounter = // incremented in attestation_job after a batch logs OK
+        register_int_counter!("attestations_ok", "Number of successful attestations").unwrap();
+    static ref ATTESTATIONS_ERR_CNT: IntCounter = // NOTE(review): no increment visible in this diff - confirm it is wired up
+        register_int_counter!("attestations_err", "Number of failed attestations").unwrap();
+    static ref LAST_SEQNO_GAUGE: IntGauge = register_int_gauge!( // set in attestation_job from the parsed seqno
+        "last_seqno",
+        "Latest sequence number produced by this attester"
+    )
+    .unwrap();
+}
+
 #[tokio::main(flavor = "multi_thread")]
 async fn main() -> Result<(), ErrBox> {
     let cli = Cli::parse();
@@ -178,6 +198,7 @@ async fn main() -> Result<(), ErrBox> {
             n_retries,
             retry_interval_secs,
             confirmation_timeout_secs,
+            metrics_bind_addr,
             daemon,
         } => {
             // Load the attestation config yaml
@@ -201,7 +222,14 @@ async fn main() -> Result<(), ErrBox> {
             ));
 
             if daemon {
-                handle_attest_daemon_mode(rpc_cfg, payer, p2w_addr, attestation_cfg).await?;
+                handle_attest_daemon_mode(
+                    rpc_cfg,
+                    payer,
+                    p2w_addr,
+                    attestation_cfg,
+                    metrics_bind_addr,
+                )
+                .await?;
             } else {
                 handle_attest_non_daemon_mode(
                     attestation_cfg,
@@ -245,7 +273,12 @@ async fn handle_attest_daemon_mode(
     payer: Keypair,
     p2w_addr: Pubkey,
     mut attestation_cfg: AttestationConfig,
+    metrics_bind_addr: SocketAddr,
 ) -> Result<(), ErrBox> {
+    tokio::spawn(start_metrics_server(metrics_bind_addr));
+
+    info!("Started serving metrics on {}", metrics_bind_addr);
+
     info!(
         "Crawling mapping {:?} every {} minutes",
         attestation_cfg.mapping_addr, attestation_cfg.mapping_reload_interval_mins
@@ -273,7 +306,16 @@ async fn handle_attest_daemon_mode(
     loop {
         let start_time = Instant::now(); // Helps timekeep mapping lookups accurately
 
-        let config = get_config_account(&lock_and_make_rpc(&rpc_cfg).await, &p2w_addr).await?;
+        let config = match get_config_account(&lock_and_make_rpc(&rpc_cfg).await, &p2w_addr).await {
+            Ok(c) => c,
+            Err(e) => {
+                error!(
+                    "Could not look up latest on-chain config in top-level loop: {:?}",
+                    e
+                );
+                continue;
+            }
+        };
 
         // Use the mapping if specified
         if let Some(mapping_addr) = attestation_cfg.mapping_addr.as_ref() {
@@ -366,8 +408,6 @@ async fn handle_attest_daemon_mode(
 
         tokio::time::sleep(remaining).await;
     }
-
-    Ok(())
 }
 
 #[derive(Clone)]
@@ -767,8 +807,10 @@ async fn attestation_job(args: AttestationJobArgs) -> Result<(), ErrBoxSend> {
         "Batch {}/{}, group {:?} OK",
         batch_no, batch_count, group_name
     );
+    ATTESTATIONS_OK_CNT.inc();
     // NOTE(2022-03-09): p2w_autoattest.py relies on parsing this println!{}
     println!("Sequence number: {}", seqno);
+    LAST_SEQNO_GAUGE.set(seqno.parse::<i64>()?);
     Result::<(), ErrBoxSend>::Ok(())
 }
 

+ 35 - 1
solana/pyth2wormhole/client/src/util.rs

@@ -1,6 +1,12 @@
 use {
-    log::trace,
+    http::status::StatusCode,
+    log::{
+        error,
+        trace,
+    },
+    prometheus::TextEncoder,
     std::{
+        net::SocketAddr,
         ops::{
             Deref,
             DerefMut,
@@ -14,6 +20,12 @@ use {
         Mutex,
         MutexGuard,
     },
+    warp::{
+        reply,
+        Filter,
+        Rejection,
+        Reply,
+    },
 };
 
 /// Rate-limited mutex. Ensures there's a period of minimum rl_interval between lock acquisitions
@@ -96,3 +108,25 @@ impl<T> RLMutex<T> {
         RLMutexGuard { guard }
     }
 }
+
+/// Warp handler for the metrics route: gathers the metrics known to the default `prometheus`
+/// registry and replies with them in text form, or with an empty 500 body if encoding fails.
+async fn metrics_handler() -> Result<impl Reply, Rejection> {
+    let text_encoder = TextEncoder::new();
+    let (body, status) = match text_encoder.encode_to_string(&prometheus::gather()) {
+        Ok(encoded_metrics) => (encoded_metrics, StatusCode::OK),
+        Err(e) => {
+            error!("Could not serve metrics: {}", e.to_string());
+            (String::new(), StatusCode::INTERNAL_SERVER_ERROR)
+        }
+    };
+    Ok(reply::with_status(body, status))
+}
+
+/// Serves the metrics endpoint at `/metrics` on `addr` and awaits the resulting server future.
+pub async fn start_metrics_server(addr: impl Into<SocketAddr> + 'static) {
+    // The metrics subpage is standardized to always be /metrics
+    let exact_metrics_path = warp::path("metrics").and(warp::path::end());
+    let server = warp::serve(exact_metrics_path.and_then(metrics_handler));
+    server.bind(addr).await;
+}