4 月之前 · 6bcd5baf84
--- a/validator/Cargo.toml
+++ b/validator/Cargo.toml
@@ -27,6 +27,7 @@ jsonrpc-core = { workspace = true }
 
															 jsonrpc-core-client = { workspace = true, features = ["ipc"] }
														
 
															 jsonrpc-derive = { workspace = true }
														
 
															 jsonrpc-ipc-server = { workspace = true }
														
 
															+libc = { workspace = true }
														
 
															 libloading = { workspace = true }
														
 
															 log = { workspace = true }
														
 
															 num_cpus = { workspace = true }
														
--- a/validator/src/admin_rpc_service.rs
+++ b/validator/src/admin_rpc_service.rs
@@ -152,9 +152,15 @@ impl solana_cli_output::QuietDisplay for AdminRpcRepairWhitelist {}
 
															 pub trait AdminRpc {
														
 
															     type Metadata;
														
 
															+    /// Initiates validator exit; exit is asynchronous so the validator
														
 
															+    /// will almost certainly still be running when this method returns
														
 
															     #[rpc(meta, name = "exit")]
														
 
															     fn exit(&self, meta: Self::Metadata) -> Result<()>;
														
 
															+    /// Return the process id (pid)
														
 
															+    #[rpc(meta, name = "pid")]
														
 
															+    fn pid(&self, meta: Self::Metadata) -> Result<u32>;
														
 
															+
														
 
															     #[rpc(meta, name = "reloadPlugin")]
														
 
															     fn reload_plugin(
														
 
															         &self,
														
@@ -270,7 +276,7 @@ impl AdminRpc for AdminRpcImpl {
 
															                 // receive a confusing error as the validator shuts down before a response is sent back.
														
 
															                 thread::sleep(Duration::from_millis(100));
														
 
															-                warn!("validator exit requested");
														
 
															+                info!("validator exit requested");
														
 
															                 meta.validator_exit.write().unwrap().exit();
														
 
															                 if !meta.validator_exit_backpressure.is_empty() {
														
@@ -315,6 +321,10 @@ impl AdminRpc for AdminRpcImpl {
 
															         Ok(())
														
 
															     }
														
 
															+    fn pid(&self, _meta: Self::Metadata) -> Result<u32> {
														
 
															+        Ok(std::process::id())
														
 
															+    }
														
 
															+
														
 
															     fn reload_plugin(
														
 
															         &self,
														
 
															         meta: Self::Metadata,
														
--- a/validator/src/commands/exit/mod.rs
+++ b/validator/src/commands/exit/mod.rs
@@ -1,7 +1,9 @@
 
															+#[cfg(target_os = "linux")]
														
 
															+use std::{io, thread, time::Duration};
														
 
															 use {
														
 
															     crate::{
														
 
															         admin_rpc_service,
														
 
															-        commands::{monitor, wait_for_restart_window, FromClapArgMatches, Result},
														
 
															+        commands::{monitor, wait_for_restart_window, Error, FromClapArgMatches, Result},
														
 
															     },
														
 
															     clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
														
 
															     solana_clap_utils::input_validators::{is_parsable, is_valid_percentage},
														
@@ -13,10 +15,18 @@ const COMMAND: &str = "exit";
 
															 const DEFAULT_MIN_IDLE_TIME: &str = "10";
														
 
															 const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
														
 
															+#[derive(Clone, Debug, PartialEq)]
														
 
															+pub enum PostExitAction {
														
 
															+    // Run the agave-validator monitor command indefinitely
														
 
															+    Monitor,
														
 
															+    // Block until the exiting validator process has terminated
														
 
															+    Wait,
														
 
															+}
														
 
															+
														
 
															 #[derive(Debug, PartialEq)]
														
 
															 pub struct ExitArgs {
														
 
															     pub force: bool,
														
 
															-    pub monitor: bool,
														
 
															+    pub post_exit_action: Option<PostExitAction>,
														
 
															     pub min_idle_time: usize,
														
 
															     pub max_delinquent_stake: u8,
														
 
															     pub skip_new_snapshot_check: bool,
														
@@ -25,9 +35,17 @@ pub struct ExitArgs {
 
															 impl FromClapArgMatches for ExitArgs {
														
 
															     fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
														
 
															+        let post_exit_action = if matches.is_present("monitor") {
														
 
															+            Some(PostExitAction::Monitor)
														
 
															+        } else if matches.is_present("wait_for_exit") {
														
 
															+            Some(PostExitAction::Wait)
														
 
															+        } else {
														
 
															+            None
														
 
															+        };
														
 
															+
														
 
															         Ok(ExitArgs {
														
 
															             force: matches.is_present("force"),
														
 
															-            monitor: matches.is_present("monitor"),
														
 
															+            post_exit_action,
														
 
															             min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
														
 
															             max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
														
 
															             skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
														
@@ -55,6 +73,12 @@ pub fn command<'a>() -> App<'a, 'a> {
 
															                 .takes_value(false)
														
 
															                 .help("Monitor the validator after sending the exit request"),
														
 
															         )
														
 
															+        .arg(
														
 
															+            Arg::with_name("wait_for_exit")
														
 
															+                .long("wait-for-exit")
														
 
															+                .conflicts_with("monitor")
														
 
															+                .help("Wait for the validator to terminate after sending the exit request"),
														
 
															+        )
														
 
															         .arg(
														
 
															             Arg::with_name("min_idle_time")
														
 
															                 .long("min-idle-time")
														
@@ -101,17 +125,99 @@ pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
 
															         )?;
														
 
															     }
														
 
															-    let admin_client = admin_rpc_service::connect(ledger_path);
														
 
															-    admin_rpc_service::runtime().block_on(async move { admin_client.await?.exit().await })?;
														
 
															+    // Grab the pid from the process before initiating exit as the running
														
 
															+    // validator will be unable to respond after exit has returned.
														
 
															+    //
														
 
															+    // Additionally, only check the pid() RPC call result if it will be used.
														
 
															+    // In an upgrade scenario, it is possible that a binary that calls pid()
														
 
															+    // will be initiating exit against a process that doesn't support pid().
														
 
															+    // Since the PostExitAction::Wait case is opt-in (via --wait-for-exit), the
														
 
															+    // result is checked ONLY in that case to provide a friendlier upgrade
														
 
															+    // path for users who are NOT using --wait-for-exit
														
 
															+    const WAIT_FOR_EXIT_UNSUPPORTED_ERROR: &str =
														
 
															+        "remote process exit cannot be waited on. `--wait-for-exit` is not supported by the remote process";
														
 
															+    let post_exit_action = exit_args.post_exit_action.clone();
														
 
															+    let validator_pid = admin_rpc_service::runtime().block_on(async move {
														
 
															+        let admin_client = admin_rpc_service::connect(ledger_path).await?;
														
 
															+        let validator_pid = match post_exit_action {
														
 
															+            Some(PostExitAction::Wait) => admin_client
														
 
															+                .pid()
														
 
															+                .await
														
 
															+                .map_err(|_err| Error::Dynamic(WAIT_FOR_EXIT_UNSUPPORTED_ERROR.into()))?,
														
 
															+            _ => 0,
														
 
															+        };
														
 
															+        admin_client.exit().await?;
														
 
															+
														
 
															+        Ok::<u32, Error>(validator_pid)
														
 
															+    })?;
														
 
															+
														
 
															     println!("Exit request sent");
														
 
															-    if exit_args.monitor {
														
 
															-        monitor::execute(matches, ledger_path)?;
														
 
															+    match exit_args.post_exit_action {
														
 
															+        None => Ok(()),
														
 
															+        Some(PostExitAction::Monitor) => monitor::execute(matches, ledger_path),
														
 
															+        Some(PostExitAction::Wait) => poll_until_pid_terminates(validator_pid),
														
 
															+    }?;
														
 
															+
														
 
															+    Ok(())
														
 
															+}
														
 
															+
														
 
															+#[cfg(target_os = "linux")]
														
 
															+fn poll_until_pid_terminates(pid: u32) -> Result<()> {
														
 
															+    let pid = i32::try_from(pid)?;
														
 
															+
														
 
															+    println!("Waiting for agave-validator process {pid} to terminate");
														
 
															+    loop {
														
 
															+        // From man kill(2)
														
 
															+        //
														
 
															+        // If sig is 0, then no signal is sent, but existence and permission
														
 
															+        // checks are still performed; this can be used to check for the
														
 
															+        // existence of a process ID or process group ID that the caller is
														
 
															+        // permitted to signal.
														
 
															+        let result = unsafe {
														
 
															+            libc::kill(pid, /*sig:*/ 0)
														
 
															+        };
														
 
															+        if result >= 0 {
														
 
															+            // Give the process some time to exit before checking again
														
 
															+            thread::sleep(Duration::from_millis(500));
														
 
															+        } else {
														
 
															+            let errno = io::Error::last_os_error()
														
 
															+                .raw_os_error()
														
 
															+                .ok_or(Error::Dynamic("unable to read raw os error".into()))?;
														
 
															+            match errno {
														
 
															+                libc::ESRCH => {
														
 
															+                    println!("Done, agave-validator process {pid} has terminated");
														
 
															+                    break;
														
 
															+                }
														
 
															+                libc::EINVAL => {
														
 
															+                    // An invalid signal was specified, we only pass sig=0 so
														
 
															+                    // this should not be possible
														
 
															+                    Err(Error::Dynamic(
														
 
															+                        format!("unexpected invalid signal error for kill({pid}, 0)").into(),
														
 
															+                    ))?;
														
 
															+                }
														
 
															+                libc::EPERM => {
														
 
															+                    Err(io::Error::from(io::ErrorKind::PermissionDenied))?;
														
 
															+                }
														
 
															+                unknown => {
														
 
															+                    Err(Error::Dynamic(
														
 
															+                        format!("unexpected errno for kill({pid}, 0): {unknown}").into(),
														
 
															+                    ))?;
														
 
															+                }
														
 
															+            }
														
 
															+        }
														
 
															     }
														
 
															     Ok(())
														
 
															 }
														
 
															+#[cfg(not(target_os = "linux"))]
														
 
															+fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
														
 
															+    Err(Error::Dynamic(
														
 
															+        "Unable to wait for agave-validator process termination on this platform".into(),
														
 
															+    ))
														
 
															+}
														
 
															+
														
 
															 #[cfg(test)]
														
 
															 mod tests {
														
 
															     use {super::*, crate::commands::tests::verify_args_struct_by_command};
														
@@ -126,7 +232,7 @@ mod tests {
 
															                     .parse()
														
 
															                     .expect("invalid DEFAULT_MAX_DELINQUENT_STAKE"),
														
 
															                 force: false,
														
 
															-                monitor: false,
														
 
															+                post_exit_action: None,
														
 
															                 skip_new_snapshot_check: false,
														
 
															                 skip_health_check: false,
														
 
															             }
														
@@ -151,12 +257,21 @@ mod tests {
 
															     }
														
 
															     #[test]
														
 
															-    fn verify_args_struct_by_command_exit_with_monitor() {
														
 
															+    fn verify_args_struct_by_command_exit_with_post_exit_action() {
														
 
															         verify_args_struct_by_command(
														
 
															             command(),
														
 
															             vec![COMMAND, "--monitor"],
														
 
															             ExitArgs {
														
 
															-                monitor: true,
														
 
															+                post_exit_action: Some(PostExitAction::Monitor),
														
 
															+                ..ExitArgs::default()
														
 
															+            },
														
 
															+        );
														
 
															+
														
 
															+        verify_args_struct_by_command(
														
 
															+            command(),
														
 
															+            vec![COMMAND, "--wait-for-exit"],
														
 
															+            ExitArgs {
														
 
															+                post_exit_action: Some(PostExitAction::Wait),
														
 
															                 ..ExitArgs::default()
														
 
															             },
														
 
															         );
														
--- a/validator/src/commands/mod.rs
+++ b/validator/src/commands/mod.rs
@@ -27,6 +27,9 @@ pub enum Error {
 
															     #[error(transparent)]
														
 
															     Io(#[from] std::io::Error),
														
 
															+
														
 
															+    #[error(transparent)]
														
 
															+    TryFromInt(#[from] std::num::TryFromIntError),
														
 
															 }
														
 
															 pub type Result<T> = std::result::Result<T, Error>;