4 months ago · 6bcd5baf84
--- a/validator/Cargo.toml
+++ b/validator/Cargo.toml
@@ -27,6 +27,7 @@ jsonrpc-core = { workspace = true }
 
				 jsonrpc-core-client = { workspace = true, features = ["ipc"] }
			
 
				 jsonrpc-derive = { workspace = true }
			
 
				 jsonrpc-ipc-server = { workspace = true }
			
 
				+libc = { workspace = true }
			
 
				 libloading = { workspace = true }
			
 
				 log = { workspace = true }
			
 
				 num_cpus = { workspace = true }
			
--- a/validator/src/admin_rpc_service.rs
+++ b/validator/src/admin_rpc_service.rs
@@ -152,9 +152,15 @@ impl solana_cli_output::QuietDisplay for AdminRpcRepairWhitelist {}
 
				 pub trait AdminRpc {
			
 
				     type Metadata;
			
 
				 
			
 
				+    /// Initiates validator exit; exit is asynchronous so the validator
			
 
				+    /// will almost certainly still be running when this method returns
			
 
				     #[rpc(meta, name = "exit")]
			
 
				     fn exit(&self, meta: Self::Metadata) -> Result<()>;
			
 
				 
			
 
				+    /// Return the process id (pid)
			
 
				+    #[rpc(meta, name = "pid")]
			
 
				+    fn pid(&self, meta: Self::Metadata) -> Result<u32>;
			
 
				+
			
 
				     #[rpc(meta, name = "reloadPlugin")]
			
 
				     fn reload_plugin(
			
 
				         &self,
			
@@ -270,7 +276,7 @@ impl AdminRpc for AdminRpcImpl {
 
				                 // receive a confusing error as the validator shuts down before a response is sent back.
			
 
				                 thread::sleep(Duration::from_millis(100));
			
 
				 
			
 
				-                warn!("validator exit requested");
			
 
				+                info!("validator exit requested");
			
 
				                 meta.validator_exit.write().unwrap().exit();
			
 
				 
			
 
				                 if !meta.validator_exit_backpressure.is_empty() {
			
@@ -315,6 +321,10 @@ impl AdminRpc for AdminRpcImpl {
 
				         Ok(())
			
 
				     }
			
 
				 
			
 
				+    fn pid(&self, _meta: Self::Metadata) -> Result<u32> {
			
 
				+        Ok(std::process::id())
			
 
				+    }
			
 
				+
			
 
				     fn reload_plugin(
			
 
				         &self,
			
 
				         meta: Self::Metadata,
			
--- a/validator/src/commands/exit/mod.rs
+++ b/validator/src/commands/exit/mod.rs
@@ -1,7 +1,9 @@
 
				+#[cfg(target_os = "linux")]
			
 
				+use std::{io, thread, time::Duration};
			
 
				 use {
			
 
				     crate::{
			
 
				         admin_rpc_service,
			
 
				-        commands::{monitor, wait_for_restart_window, FromClapArgMatches, Result},
			
 
				+        commands::{monitor, wait_for_restart_window, Error, FromClapArgMatches, Result},
			
 
				     },
			
 
				     clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
			
 
				     solana_clap_utils::input_validators::{is_parsable, is_valid_percentage},
			
@@ -13,10 +15,18 @@ const COMMAND: &str = "exit";
 
				 const DEFAULT_MIN_IDLE_TIME: &str = "10";
			
 
				 const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
			
 
				 
			
 
				+#[derive(Clone, Debug, PartialEq)]
			
 
				+pub enum PostExitAction {
			
 
				+    // Run the agave-validator monitor command indefinitely
			
 
				+    Monitor,
			
 
				+    // Block until the exiting validator process has terminated
			
 
				+    Wait,
			
 
				+}
			
 
				+
			
 
				 #[derive(Debug, PartialEq)]
			
 
				 pub struct ExitArgs {
			
 
				     pub force: bool,
			
 
				-    pub monitor: bool,
			
 
				+    pub post_exit_action: Option<PostExitAction>,
			
 
				     pub min_idle_time: usize,
			
 
				     pub max_delinquent_stake: u8,
			
 
				     pub skip_new_snapshot_check: bool,
			
@@ -25,9 +35,17 @@ pub struct ExitArgs {
 
				 
			
 
				 impl FromClapArgMatches for ExitArgs {
			
 
				     fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
			
 
				+        let post_exit_action = if matches.is_present("monitor") {
			
 
				+            Some(PostExitAction::Monitor)
			
 
				+        } else if matches.is_present("wait_for_exit") {
			
 
				+            Some(PostExitAction::Wait)
			
 
				+        } else {
			
 
				+            None
			
 
				+        };
			
 
				+
			
 
				         Ok(ExitArgs {
			
 
				             force: matches.is_present("force"),
			
 
				-            monitor: matches.is_present("monitor"),
			
 
				+            post_exit_action,
			
 
				             min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
			
 
				             max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
			
 
				             skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
			
@@ -55,6 +73,12 @@ pub fn command<'a>() -> App<'a, 'a> {
 
				                 .takes_value(false)
			
 
				                 .help("Monitor the validator after sending the exit request"),
			
 
				         )
			
 
				+        .arg(
			
 
				+            Arg::with_name("wait_for_exit")
			
 
				+                .long("wait-for-exit")
			
 
				+                .conflicts_with("monitor")
			
 
				+                .help("Wait for the validator to terminate after sending the exit request"),
			
 
				+        )
			
 
				         .arg(
			
 
				             Arg::with_name("min_idle_time")
			
 
				                 .long("min-idle-time")
			
@@ -101,17 +125,99 @@ pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
 
				         )?;
			
 
				     }
			
 
				 
			
 
				-    let admin_client = admin_rpc_service::connect(ledger_path);
			
 
				-    admin_rpc_service::runtime().block_on(async move { admin_client.await?.exit().await })?;
			
 
				+    // Grab the pid from the process before initiating exit as the running
			
 
				+    // validator will be unable to respond after exit has returned.
			
 
				+    //
			
 
				+    // Additionally, only check the pid() RPC call result if it will be used.
			
 
				+    // In an upgrade scenario, it is possible that a binary that calls pid()
			
 
				+    // will be initiating exit against a process that doesn't support pid().
			
 
				+    // Since PostExitAction::Wait case is opt-in (via --wait-for-exit), the
			
 
				+    // result is checked ONLY in that case to provide a friendlier upgrade
			
 
				+    // path for users who are NOT using --wait-for-exit
			
 
				+    const WAIT_FOR_EXIT_UNSUPPORTED_ERROR: &str =
			
 
				+        "remote process exit cannot be waited on. `--wait-for-exit` is not supported by the remote process";
			
 
				+    let post_exit_action = exit_args.post_exit_action.clone();
			
 
				+    let validator_pid = admin_rpc_service::runtime().block_on(async move {
			
 
				+        let admin_client = admin_rpc_service::connect(ledger_path).await?;
			
 
				+        let validator_pid = match post_exit_action {
			
 
				+            Some(PostExitAction::Wait) => admin_client
			
 
				+                .pid()
			
 
				+                .await
			
 
				+                .map_err(|_err| Error::Dynamic(WAIT_FOR_EXIT_UNSUPPORTED_ERROR.into()))?,
			
 
				+            _ => 0,
			
 
				+        };
			
 
				+        admin_client.exit().await?;
			
 
				+
			
 
				+        Ok::<u32, Error>(validator_pid)
			
 
				+    })?;
			
 
				+
			
 
				     println!("Exit request sent");
			
 
				 
			
 
				-    if exit_args.monitor {
			
 
				-        monitor::execute(matches, ledger_path)?;
			
 
				+    match exit_args.post_exit_action {
			
 
				+        None => Ok(()),
			
 
				+        Some(PostExitAction::Monitor) => monitor::execute(matches, ledger_path),
			
 
				+        Some(PostExitAction::Wait) => poll_until_pid_terminates(validator_pid),
			
 
				+    }?;
			
 
				+
			
 
				+    Ok(())
			
 
				+}
			
 
				+
			
 
				+#[cfg(target_os = "linux")]
			
 
				+fn poll_until_pid_terminates(pid: u32) -> Result<()> {
			
 
				+    let pid = i32::try_from(pid)?;
			
 
				+
			
 
				+    println!("Waiting for agave-validator process {pid} to terminate");
			
 
				+    loop {
			
 
				+        // From man kill(2)
			
 
				+        //
			
 
				+        // If sig is 0, then no signal is sent, but existence and permission
			
 
				+        // checks are still performed; this can be used to check for the
			
 
				+        // existence of a process ID or process group ID that the caller is
			
 
				+        // permitted to signal.
			
 
				+        let result = unsafe {
			
 
				+            libc::kill(pid, /*sig:*/ 0)
			
 
				+        };
			
 
				+        if result >= 0 {
			
 
				+            // Give the process some time to exit before checking again
			
 
				+            thread::sleep(Duration::from_millis(500));
			
 
				+        } else {
			
 
				+            let errno = io::Error::last_os_error()
			
 
				+                .raw_os_error()
			
 
				+                .ok_or(Error::Dynamic("unable to read raw os error".into()))?;
			
 
				+            match errno {
			
 
				+                libc::ESRCH => {
			
 
				+                    println!("Done, agave-validator process {pid} has terminated");
			
 
				+                    break;
			
 
				+                }
			
 
				+                libc::EINVAL => {
			
 
				+                    // An invalid signal was specified, we only pass sig=0 so
			
 
				+                    // this should not be possible
			
 
				+                    Err(Error::Dynamic(
			
 
				+                        format!("unexpected invalid signal error for kill({pid}, 0)").into(),
			
 
				+                    ))?;
			
 
				+                }
			
 
				+                libc::EPERM => {
			
 
				+                    Err(io::Error::from(io::ErrorKind::PermissionDenied))?;
			
 
				+                }
			
 
				+                unknown => {
			
 
				+                    Err(Error::Dynamic(
			
 
				+                        format!("unexpected errno for kill({pid}, 0): {unknown}").into(),
			
 
				+                    ))?;
			
 
				+                }
			
 
				+            }
			
 
				+        }
			
 
				     }
			
 
				 
			
 
				     Ok(())
			
 
				 }
			
 
				 
			
 
				+#[cfg(not(target_os = "linux"))]
			
 
				+fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
			
 
				+    Err(Error::Dynamic(
			
 
				+        "Unable to wait for agave-validator process termination on this platform".into(),
			
 
				+    ))
			
 
				+}
			
 
				+
			
 
				 #[cfg(test)]
			
 
				 mod tests {
			
 
				     use {super::*, crate::commands::tests::verify_args_struct_by_command};
			
@@ -126,7 +232,7 @@ mod tests {
 
				                     .parse()
			
 
				                     .expect("invalid DEFAULT_MAX_DELINQUENT_STAKE"),
			
 
				                 force: false,
			
 
				-                monitor: false,
			
 
				+                post_exit_action: None,
			
 
				                 skip_new_snapshot_check: false,
			
 
				                 skip_health_check: false,
			
 
				             }
			
@@ -151,12 +257,21 @@ mod tests {
 
				     }
			
 
				 
			
 
				     #[test]
			
 
				-    fn verify_args_struct_by_command_exit_with_monitor() {
			
 
				+    fn verify_args_struct_by_command_exit_with_post_exit_action() {
			
 
				         verify_args_struct_by_command(
			
 
				             command(),
			
 
				             vec![COMMAND, "--monitor"],
			
 
				             ExitArgs {
			
 
				-                monitor: true,
			
 
				+                post_exit_action: Some(PostExitAction::Monitor),
			
 
				+                ..ExitArgs::default()
			
 
				+            },
			
 
				+        );
			
 
				+
			
 
				+        verify_args_struct_by_command(
			
 
				+            command(),
			
 
				+            vec![COMMAND, "--wait-for-exit"],
			
 
				+            ExitArgs {
			
 
				+                post_exit_action: Some(PostExitAction::Wait),
			
 
				                 ..ExitArgs::default()
			
 
				             },
			
 
				         );
			
--- a/validator/src/commands/mod.rs
+++ b/validator/src/commands/mod.rs
@@ -27,6 +27,9 @@ pub enum Error {
 
				 
			
 
				     #[error(transparent)]
			
 
				     Io(#[from] std::io::Error),
			
 
				+
			
 
				+    #[error(transparent)]
			
 
				+    TryFromInt(#[from] std::num::TryFromIntError),
			
 
				 }
			
 
				 pub type Result<T> = std::result::Result<T, Error>;