Browse Source

Add solana-watchtower program

Michael Vines 6 năm trước cách đây
mục cha
commit
2db28cae41
6 tập tin đã thay đổi với 171 bổ sung và 0 xóa
  1. 13 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 2 0
      watchtower/.gitignore
  4. 23 0
      watchtower/Cargo.toml
  5. 16 0
      watchtower/README.md
  6. 116 0
      watchtower/src/main.rs

+ 13 - 0
Cargo.lock

@@ -4090,6 +4090,19 @@ dependencies = [
  "solana-sdk 0.22.0",
 ]
 
+[[package]]
+name = "solana-watchtower"
+version = "0.22.0"
+dependencies = [
+ "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
+ "solana-clap-utils 0.22.0",
+ "solana-client 0.22.0",
+ "solana-logger 0.22.0",
+ "solana-metrics 0.22.0",
+ "solana-sdk 0.22.0",
+]
+
 [[package]]
 name = "solana_libra_bytecode_verifier"
 version = "0.0.1-sol4"

+ 1 - 0
Cargo.toml

@@ -49,6 +49,7 @@ members = [
     "vote-signer",
     "cli",
     "rayon-threadlimit",
+    "watchtower",
 ]
 
 exclude = [

+ 2 - 0
watchtower/.gitignore

@@ -0,0 +1,2 @@
+/target/
+/farf/

+ 23 - 0
watchtower/Cargo.toml

@@ -0,0 +1,23 @@
+[package]
+authors = ["Solana Maintainers <maintainers@solana.com>"]
+edition = "2018"
+name = "solana-watchtower"
+description = "Blockchain, Rebuilt for Scale"
+version = "0.22.0"
+repository = "https://github.com/solana-labs/solana"
+license = "Apache-2.0"
+homepage = "https://solana.com/"
+
+[dependencies]
+clap = "2.33.0"
+log = "0.4.8"
+solana-clap-utils = { path = "../clap-utils", version = "0.22.0" }
+solana-client = { path = "../client", version = "0.22.0" }
+solana-logger = { path = "../logger", version = "0.22.0" }
+solana-metrics = { path = "../metrics", version = "0.22.0" }
+solana-sdk = { path = "../sdk", version = "0.22.0" }
+
+[[bin]]
+name = "solana-watchtower"
+path = "src/main.rs"
+

+ 16 - 0
watchtower/README.md

@@ -0,0 +1,16 @@
+The `solana-watchtower` program is used to monitor the health of a cluster.  It
+periodically polls the cluster over an RPC API to confirm that the transaction
+count is advancing, new blockhashes are available, and no validators are
+delinquent.  Results are reported as InfluxDB metrics.
+
+### Metrics
+#### `watchtower-sanity`
+On every iteration this data point will be emitted indicating the overall result
+using a boolean `ok` field.
+
+#### `watchtower-sanity-failure`
+On failure this data point contains details about the specific test that failed via
+the following fields:
+* `test`: name of the sanity test that failed
+* `err`: exact sanity failure message
+

+ 116 - 0
watchtower/src/main.rs

@@ -0,0 +1,116 @@
+//! A command-line executable for monitoring the health of a cluster
+
+use clap::{crate_description, crate_name, value_t_or_exit, App, Arg};
+use log::*;
+use solana_clap_utils::input_validators::is_url;
+use solana_client::rpc_client::RpcClient;
+use solana_metrics::{datapoint_error, datapoint_info};
+use std::{error, io, thread::sleep, time::Duration};
+
+fn main() -> Result<(), Box<dyn error::Error>> { // parse CLI options, then run sanity rounds against the cluster forever
+    let matches = App::new(crate_name!())
+        .about(crate_description!())
+        .version(solana_clap_utils::version!())
+        .arg(
+            Arg::with_name("json_rpc_url")
+                .long("url")
+                .value_name("URL")
+                .takes_value(true)
+                .required(true)
+                .validator(is_url) // clap rejects the argument when is_url reports an error
+                .help("JSON RPC URL for the cluster"),
+        )
+        .arg(
+            Arg::with_name("interval")
+                .long("interval")
+                .value_name("SECONDS")
+                .takes_value(true)
+                .default_value("60") // used when --interval is not given
+                .help("Wait interval seconds between checking the cluster"),
+        )
+        .get_matches();
+    // value_t_or_exit! terminates the process with a clap error if the value cannot be parsed as the requested type.
+    let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64));
+    let json_rpc_url = value_t_or_exit!(matches, "json_rpc_url", String);
+
+    solana_logger::setup_with_filter("solana=info"); // NOTE(review): presumably enables info-level logging for solana targets — confirm in solana-logger
+    solana_metrics::set_panic_hook("watchtower"); // NOTE(review): presumably reports panics under the "watchtower" name — confirm in solana-metrics
+
+    let rpc_client = RpcClient::new(json_rpc_url.to_string()); // json_rpc_url is already a String, so to_string() makes a copy
+
+    let mut last_transaction_count = 0; // last accepted count; starting at 0 means the first round passes only if the count is > 0
+    loop { // one iteration is one sanity round; there is no break, so this function never returns
+        let ok = rpc_client // the three checks are joined with &&, so once one yields false the later ones are not run
+            .get_transaction_count()
+            .and_then(|transaction_count| {
+                info!("Current transaction count: {}", transaction_count);
+
+                if transaction_count > last_transaction_count { // check 1: the count must strictly increase between rounds
+                    last_transaction_count = transaction_count; // only updated on success, so a stalled count keeps failing
+                    Ok(true)
+                } else {
+                    Err(io::Error::new(
+                        io::ErrorKind::Other,
+                        format!(
+                            "Transaction count is not advancing: {} <= {}",
+                            transaction_count, last_transaction_count
+                        ),
+                    ))
+                }
+            })
+            .unwrap_or_else(|err| { // RPC error or stalled count: emit the failure datapoint and fail the round
+                datapoint_error!(
+                    "watchtower-sanity-failure",
+                    ("test", "transaction-count", String),
+                    ("err", err.to_string(), String)
+                );
+                false
+            })
+            && rpc_client // check 2: a recent blockhash can be fetched and a follow-up blockhash is obtained
+                .get_recent_blockhash()
+                .and_then(|(blockhash, _fee_calculator)| {
+                    info!("Current blockhash: {}", blockhash);
+                    rpc_client.get_new_blockhash(&blockhash) // NOTE(review): presumably yields a blockhash different from this one — confirm in RpcClient
+                })
+                .and_then(|(blockhash, _fee_calculator)| {
+                    info!("New blockhash: {}", blockhash);
+                    Ok(true)
+                })
+                .unwrap_or_else(|err| { // any error from either blockhash call fails the round
+                    datapoint_error!(
+                        "watchtower-sanity-failure",
+                        ("test", "blockhash", String),
+                        ("err", err.to_string(), String)
+                    );
+                    false
+                })
+            && rpc_client // check 3: the delinquent list returned by get_vote_accounts is empty
+                .get_vote_accounts()
+                .and_then(|vote_accounts| {
+                    info!("Current validator count: {}", vote_accounts.current.len());
+                    info!(
+                        "Delinquent validator count: {}",
+                        vote_accounts.delinquent.len()
+                    );
+                    if vote_accounts.delinquent.is_empty() {
+                        Ok(true)
+                    } else {
+                        Err(io::Error::new( // a non-empty delinquent list is reported through the same error path as an RPC failure
+                            io::ErrorKind::Other,
+                            format!("{} delinquent validators", vote_accounts.delinquent.len()),
+                        ))
+                    }
+                })
+                .unwrap_or_else(|err| {
+                    datapoint_error!(
+                        "watchtower-sanity-failure",
+                        ("test", "delinquent-validators", String),
+                        ("err", err.to_string(), String)
+                    );
+                    false
+                });
+
+        datapoint_info!("watchtower-sanity", ("ok", ok, bool)); // emitted every round, whether the checks passed or not
+        sleep(interval); // blocks this thread for the configured interval before the next round
+    }
+}