Browse Source

validator: Add --wait-for-exit flag to exit subcommand (#6780)

agave-validator exit currents returns immediately after the AdminRpc
call returns. However, the running validator has not exited at this
point and may continue to tear itself down for multiple seconds

The exit subcommand now has an optional flag, --wait-for-exit, that
queries the PID from the validator and loops until that PID has fully
terminated. Use of this flag means that a caller can be sure the
running validator is dead when agave-validator exit returns
steviez 4 months ago
parent
commit
6bcd5baf84

+ 1 - 0
validator/Cargo.toml

@@ -27,6 +27,7 @@ jsonrpc-core = { workspace = true }
 jsonrpc-core-client = { workspace = true, features = ["ipc"] }
 jsonrpc-derive = { workspace = true }
 jsonrpc-ipc-server = { workspace = true }
+libc = { workspace = true }
 libloading = { workspace = true }
 log = { workspace = true }
 num_cpus = { workspace = true }

+ 11 - 1
validator/src/admin_rpc_service.rs

@@ -152,9 +152,15 @@ impl solana_cli_output::QuietDisplay for AdminRpcRepairWhitelist {}
 pub trait AdminRpc {
     type Metadata;
 
+    /// Initiates validator exit; exit is asynchronous so the validator
+    /// will almost certainly still be running when this method returns
     #[rpc(meta, name = "exit")]
     fn exit(&self, meta: Self::Metadata) -> Result<()>;
 
+    /// Return the process id (pid)
+    #[rpc(meta, name = "pid")]
+    fn pid(&self, meta: Self::Metadata) -> Result<u32>;
+
     #[rpc(meta, name = "reloadPlugin")]
     fn reload_plugin(
         &self,
@@ -270,7 +276,7 @@ impl AdminRpc for AdminRpcImpl {
                 // receive a confusing error as the validator shuts down before a response is sent back.
                 thread::sleep(Duration::from_millis(100));
 
-                warn!("validator exit requested");
+                info!("validator exit requested");
                 meta.validator_exit.write().unwrap().exit();
 
                 if !meta.validator_exit_backpressure.is_empty() {
@@ -315,6 +321,10 @@ impl AdminRpc for AdminRpcImpl {
         Ok(())
     }
 
+    fn pid(&self, _meta: Self::Metadata) -> Result<u32> {
+        Ok(std::process::id())
+    }
+
     fn reload_plugin(
         &self,
         meta: Self::Metadata,

+ 125 - 10
validator/src/commands/exit/mod.rs

@@ -1,7 +1,9 @@
+#[cfg(target_os = "linux")]
+use std::{io, thread, time::Duration};
 use {
     crate::{
         admin_rpc_service,
-        commands::{monitor, wait_for_restart_window, FromClapArgMatches, Result},
+        commands::{monitor, wait_for_restart_window, Error, FromClapArgMatches, Result},
     },
     clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
     solana_clap_utils::input_validators::{is_parsable, is_valid_percentage},
@@ -13,10 +15,18 @@ const COMMAND: &str = "exit";
 const DEFAULT_MIN_IDLE_TIME: &str = "10";
 const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
 
+#[derive(Clone, Debug, PartialEq)]
+pub enum PostExitAction {
+    // Run the agave-validator monitor command indefinitely
+    Monitor,
+    // Block until the exiting validator process has terminated
+    Wait,
+}
+
 #[derive(Debug, PartialEq)]
 pub struct ExitArgs {
     pub force: bool,
-    pub monitor: bool,
+    pub post_exit_action: Option<PostExitAction>,
     pub min_idle_time: usize,
     pub max_delinquent_stake: u8,
     pub skip_new_snapshot_check: bool,
@@ -25,9 +35,17 @@ pub struct ExitArgs {
 
 impl FromClapArgMatches for ExitArgs {
     fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
+        let post_exit_action = if matches.is_present("monitor") {
+            Some(PostExitAction::Monitor)
+        } else if matches.is_present("wait_for_exit") {
+            Some(PostExitAction::Wait)
+        } else {
+            None
+        };
+
         Ok(ExitArgs {
             force: matches.is_present("force"),
-            monitor: matches.is_present("monitor"),
+            post_exit_action,
             min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
             max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
             skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
@@ -55,6 +73,12 @@ pub fn command<'a>() -> App<'a, 'a> {
                 .takes_value(false)
                 .help("Monitor the validator after sending the exit request"),
         )
+        .arg(
+            Arg::with_name("wait_for_exit")
+                .long("wait-for-exit")
+                .conflicts_with("monitor")
+                .help("Wait for the validator to terminate after sending the exit request"),
+        )
         .arg(
             Arg::with_name("min_idle_time")
                 .long("min-idle-time")
@@ -101,17 +125,99 @@ pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
         )?;
     }
 
-    let admin_client = admin_rpc_service::connect(ledger_path);
-    admin_rpc_service::runtime().block_on(async move { admin_client.await?.exit().await })?;
+    // Grab the pid from the process before initiating exit as the running
+    // validator will be unable to respond after exit has returned.
+    //
+    // Additionally, only check the pid() RPC call result if it will be used.
+    // In an upgrade scenario, it is possible that a binary that calls pid()
+    // will be initating exit against a process that doesn't support pid().
+    // Since PostExitAction::Wait case is opt-in (via --wait-for-exit), the
+    // result is checked ONLY in that case to provide a friendlier upgrade
+    // path for users who are NOT using --wait-for-exit
+    const WAIT_FOR_EXIT_UNSUPPORTED_ERROR: &str =
+        "remote process exit cannot be waited on. `--wait-for-exit` is not supported by the remote process";
+    let post_exit_action = exit_args.post_exit_action.clone();
+    let validator_pid = admin_rpc_service::runtime().block_on(async move {
+        let admin_client = admin_rpc_service::connect(ledger_path).await?;
+        let validator_pid = match post_exit_action {
+            Some(PostExitAction::Wait) => admin_client
+                .pid()
+                .await
+                .map_err(|_err| Error::Dynamic(WAIT_FOR_EXIT_UNSUPPORTED_ERROR.into()))?,
+            _ => 0,
+        };
+        admin_client.exit().await?;
+
+        Ok::<u32, Error>(validator_pid)
+    })?;
+
     println!("Exit request sent");
 
-    if exit_args.monitor {
-        monitor::execute(matches, ledger_path)?;
+    match exit_args.post_exit_action {
+        None => Ok(()),
+        Some(PostExitAction::Monitor) => monitor::execute(matches, ledger_path),
+        Some(PostExitAction::Wait) => poll_until_pid_terminates(validator_pid),
+    }?;
+
+    Ok(())
+}
+
+#[cfg(target_os = "linux")]
+fn poll_until_pid_terminates(pid: u32) -> Result<()> {
+    let pid = i32::try_from(pid)?;
+
+    println!("Waiting for agave-validator process {pid} to terminate");
+    loop {
+        // From man kill(2)
+        //
+        // If sig is 0, then no signal is sent, but existence and permission
+        // checks are still performed; this can be used to check for the
+        // existence of a process ID or process group ID that the caller is
+        // permitted to signal.
+        let result = unsafe {
+            libc::kill(pid, /*sig:*/ 0)
+        };
+        if result >= 0 {
+            // Give the process some time to exit before checking again
+            thread::sleep(Duration::from_millis(500));
+        } else {
+            let errno = io::Error::last_os_error()
+                .raw_os_error()
+                .ok_or(Error::Dynamic("unable to read raw os error".into()))?;
+            match errno {
+                libc::ESRCH => {
+                    println!("Done, agave-validator process {pid} has terminated");
+                    break;
+                }
+                libc::EINVAL => {
+                    // An invalid signal was specified, we only pass sig=0 so
+                    // this should not be possible
+                    Err(Error::Dynamic(
+                        format!("unexpected invalid signal error for kill({pid}, 0)").into(),
+                    ))?;
+                }
+                libc::EPERM => {
+                    Err(io::Error::from(io::ErrorKind::PermissionDenied))?;
+                }
+                unknown => {
+                    Err(Error::Dynamic(
+                        format!("unexpected errno for kill({pid}, 0): {unknown}").into(),
+                    ))?;
+                }
+            }
+        }
     }
 
     Ok(())
 }
 
+#[cfg(not(target_os = "linux"))]
+fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
+    Err(Error::Dynamic(
+        "Unable to wait for agave-validator process termination on this platform".into(),
+    ))
+}
+
 #[cfg(test)]
 mod tests {
     use {super::*, crate::commands::tests::verify_args_struct_by_command};
@@ -126,7 +232,7 @@ mod tests {
                     .parse()
                     .expect("invalid DEFAULT_MAX_DELINQUENT_STAKE"),
                 force: false,
-                monitor: false,
+                post_exit_action: None,
                 skip_new_snapshot_check: false,
                 skip_health_check: false,
             }
@@ -151,12 +257,21 @@ mod tests {
     }
 
     #[test]
-    fn verify_args_struct_by_command_exit_with_monitor() {
+    fn verify_args_struct_by_command_exit_with_post_exit_action() {
         verify_args_struct_by_command(
             command(),
             vec![COMMAND, "--monitor"],
             ExitArgs {
-                monitor: true,
+                post_exit_action: Some(PostExitAction::Monitor),
+                ..ExitArgs::default()
+            },
+        );
+
+        verify_args_struct_by_command(
+            command(),
+            vec![COMMAND, "--wait-for-exit"],
+            ExitArgs {
+                post_exit_action: Some(PostExitAction::Wait),
                 ..ExitArgs::default()
             },
         );

+ 3 - 0
validator/src/commands/mod.rs

@@ -27,6 +27,9 @@ pub enum Error {
 
     #[error(transparent)]
     Io(#[from] std::io::Error),
+
+    #[error(transparent)]
+    TryFromInt(#[from] std::num::TryFromIntError),
 }
 pub type Result<T> = std::result::Result<T, Error>;