Forráskód Böngészése

validator: Remove extra thread for signal handling (#8634)

The agave-logger currently spins up an extra thread to act upon SIGUSR1
for logrotate. We already have the main thread blocking on join, so just
make the main thread listen for both SIGUSR1 and changes to the exit
flag
steviez 1 hete
szülő
commit
35c5c1d95e

+ 1 - 1
Cargo.lock

@@ -438,7 +438,6 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_yaml 0.9.34+deprecated",
- "signal-hook",
  "solana-account",
  "solana-account-decoder",
  "solana-accounts-db",
@@ -8182,6 +8181,7 @@ dependencies = [
  "serde_json",
  "serial_test",
  "shaq",
+ "signal-hook",
  "slab",
  "solana-account",
  "solana-accounts-db",

+ 2 - 1
core/Cargo.toml

@@ -43,6 +43,7 @@ frozen-abi = [
 [dependencies]
 agave-banking-stage-ingress-types = { workspace = true }
 agave-feature-set = { workspace = true }
+agave-logger = { workspace = true }
 agave-scheduler-bindings = { workspace = true }
 agave-scheduling-utils = { workspace = true }
 agave-snapshots = { workspace = true }
@@ -83,6 +84,7 @@ rolling-file = { workspace = true }
 rustls = { workspace = true }
 serde = { workspace = true }
 serde_bytes = { workspace = true }
+signal-hook = { workspace = true }
 slab = { workspace = true }
 solana-account = { workspace = true }
 solana-accounts-db = { workspace = true }
@@ -190,7 +192,6 @@ shaq = { workspace = true }
 sysctl = { workspace = true }
 
 [dev-dependencies]
-agave-logger = { workspace = true }
 agave-reserved-account-keys = { workspace = true }
 agave-scheduler-bindings = { workspace = true, features = ["dev-context-only-utils"] }
 bencher = { workspace = true }

+ 61 - 4
core/src/validator.rs

@@ -164,7 +164,7 @@ use {
             atomic::{AtomicBool, AtomicU64, Ordering},
             Arc, Mutex, RwLock,
         },
-        thread::{sleep, Builder, JoinHandle},
+        thread::{self, Builder, JoinHandle},
         time::{Duration, Instant},
     },
     strum::VariantNames,
@@ -308,6 +308,8 @@ pub struct GeneratorConfig {
 }
 
 pub struct ValidatorConfig {
+    /// The destination file for validator logs; `stderr` is used if `None`
+    pub logfile: Option<PathBuf>,
     pub halt_at_slot: Option<Slot>,
     pub expected_genesis_hash: Option<Hash>,
     pub expected_bank_hash: Option<Hash>,
@@ -391,6 +393,7 @@ impl ValidatorConfig {
             NonZeroUsize::new(num_cpus::get()).expect("thread count is non-zero");
 
         Self {
+            logfile: None,
             halt_at_slot: None,
             expected_genesis_hash: None,
             expected_bank_hash: None,
@@ -615,6 +618,11 @@ impl ValidatorTpuConfig {
 }
 
 pub struct Validator {
+    /// The destination file for validator logs; `stderr` is used if `None`
+    #[cfg_attr(not(unix), allow(dead_code))]
+    logfile: Option<PathBuf>,
+    /// A global flag to indicate communicate shutdown between threads
+    exit: Arc<AtomicBool>,
     validator_exit: Arc<RwLock<Exit>>,
     json_rpc_service: Option<JsonRpcService>,
     pubsub_service: Option<PubSubService>,
@@ -1728,7 +1736,7 @@ impl Validator {
             blockstore.clone(),
             &config.broadcast_stage_type,
             xdp_sender,
-            exit,
+            exit.clone(),
             node.info.shred_version(),
             vote_tracker,
             bank_forks.clone(),
@@ -1803,6 +1811,8 @@ impl Validator {
         });
 
         Ok(Self {
+            logfile: config.logfile.clone(),
+            exit,
             stats_reporter_service,
             gossip_service,
             serve_repair_service,
@@ -1839,6 +1849,53 @@ impl Validator {
         })
     }
 
+    /// Register and listen for signals that the validator will act on. Also,
+    /// monitor the validator's exit flag incase a shutdown has been initated
+    /// by one of the validator threads
+    pub fn listen_for_signals(&self) -> Result<()> {
+        // Reopen the logfile when the SIGUSR1 signal is received; this provides
+        // a hook for working with logrotate
+        let sigusr1_flag = Arc::new(AtomicBool::new(false));
+        #[cfg(unix)]
+        {
+            if self.logfile.is_some() {
+                signal_hook::flag::register(libc::SIGUSR1, sigusr1_flag.clone())?;
+            }
+        }
+
+        info!("Validator::listen_for_signals() has started");
+        loop {
+            if self.exit.load(Ordering::Relaxed) {
+                break;
+            }
+
+            if sigusr1_flag.load(Ordering::Relaxed) {
+                #[cfg(unix)]
+                {
+                    if let Some(logfile) = self.logfile.as_ref() {
+                        info!("Received SIGUSR1, reopening {}", logfile.display());
+                        agave_logger::redirect_stderr(logfile);
+                        // Reset the flag to `false` to allow detection of the
+                        // signal again and to avoid hitting this case every
+                        // iteration
+                        sigusr1_flag.store(false, Ordering::Relaxed);
+                    }
+                }
+                #[cfg(not(unix))]
+                {
+                    unreachable!("The SIGUSR1 signal is only handled on unix systems");
+                }
+            }
+
+            // One second is a reasonable response time for these signals to
+            // avoid this thread from being overly active
+            thread::sleep(Duration::from_secs(1));
+        }
+        info!("Validator::listen_for_signals() has stopped");
+
+        Ok(())
+    }
+
     // Used for notifying many nodes in parallel to exit
     pub fn exit(&mut self) {
         self.validator_exit.write().unwrap().exit();
@@ -2340,7 +2397,7 @@ impl<'a> ProcessBlockStore<'a> {
                             let slot = bank_forks.read().unwrap().working_bank().slot();
                             *start_progress.write().unwrap() =
                                 ValidatorStartProgress::ProcessingLedger { slot, max_slot };
-                            sleep(Duration::from_secs(2));
+                            thread::sleep(Duration::from_secs(2));
                         }
                     })
                     .unwrap();
@@ -2777,7 +2834,7 @@ fn wait_for_supermajority(
                 // prevent load balancers from removing the node from their list of candidates during a
                 // manual restart.
                 rpc_override_health_check.store(true, Ordering::Relaxed);
-                sleep(Duration::new(1, 0));
+                thread::sleep(Duration::new(1, 0));
             }
             rpc_override_health_check.store(false, Ordering::Relaxed);
             Ok(true)

+ 2 - 0
dev-bins/Cargo.lock

@@ -6801,6 +6801,7 @@ version = "4.0.0-alpha.0"
 dependencies = [
  "agave-banking-stage-ingress-types",
  "agave-feature-set",
+ "agave-logger",
  "agave-scheduler-bindings",
  "agave-scheduling-utils",
  "agave-snapshots",
@@ -6843,6 +6844,7 @@ dependencies = [
  "serde",
  "serde_bytes",
  "shaq",
+ "signal-hook",
  "slab",
  "solana-account",
  "solana-accounts-db",

+ 1 - 0
local-cluster/src/validator_configs.rs

@@ -6,6 +6,7 @@ use {
 
 pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
     ValidatorConfig {
+        logfile: config.logfile.clone(),
         halt_at_slot: config.halt_at_slot,
         expected_genesis_hash: config.expected_genesis_hash,
         expected_bank_hash: config.expected_bank_hash,

+ 23 - 1
logger/src/lib.rs

@@ -77,7 +77,7 @@ pub fn setup_file_with_default(logfile: &Path, filter: &str) {
 }
 
 #[cfg(unix)]
-fn redirect_stderr(filename: &Path) {
+pub fn redirect_stderr(filename: &Path) {
     use std::{fs::OpenOptions, os::unix::io::AsRawFd};
     match OpenOptions::new().create(true).append(true).open(filename) {
         Ok(file) => unsafe {
@@ -87,6 +87,28 @@ fn redirect_stderr(filename: &Path) {
     }
 }
 
+pub fn initialize_logging(logfile: Option<PathBuf>) {
+    // Debugging panics is easier with a backtrace
+    if env::var_os("RUST_BACKTRACE").is_none() {
+        env::set_var("RUST_BACKTRACE", "1")
+    }
+
+    let Some(logfile) = logfile else {
+        setup_with_default_filter();
+        return;
+    };
+
+    #[cfg(unix)]
+    {
+        setup_with_default_filter();
+        redirect_stderr(&logfile);
+    }
+    #[cfg(not(unix))]
+    {
+        setup_file_with_default(&logfile, DEFAULT_FILTER);
+    }
+}
+
 // Redirect stderr to a file with support for logrotate by sending a SIGUSR1 to the process.
 //
 // Upon success, future `log` macros and `eprintln!()` can be found in the specified log file.

+ 2 - 1
programs/sbf/Cargo.lock

@@ -282,7 +282,6 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_yaml",
- "signal-hook",
  "solana-account",
  "solana-accounts-db",
  "solana-clap-utils",
@@ -6639,6 +6638,7 @@ version = "4.0.0-alpha.0"
 dependencies = [
  "agave-banking-stage-ingress-types",
  "agave-feature-set",
+ "agave-logger",
  "agave-scheduler-bindings",
  "agave-scheduling-utils",
  "agave-snapshots",
@@ -6681,6 +6681,7 @@ dependencies = [
  "serde",
  "serde_bytes",
  "shaq",
+ "signal-hook",
  "slab",
  "solana-account",
  "solana-accounts-db",

+ 0 - 1
validator/Cargo.toml

@@ -97,7 +97,6 @@ jemallocator = { workspace = true }
 
 [target."cfg(unix)".dependencies]
 libc = { workspace = true }
-signal-hook = { workspace = true }
 
 [dev-dependencies]
 assert_cmd = { workspace = true }

+ 3 - 2
validator/src/commands/run/execute.rs

@@ -6,7 +6,6 @@ use {
         commands::{run::args::RunArgs, FromClapArgMatches},
         ledger_lockfile, lock_ledger,
     },
-    agave_logger::redirect_stderr_to_file,
     agave_snapshots::{
         paths::BANK_SNAPSHOTS_DIR,
         snapshot_config::{SnapshotConfig, SnapshotUsage},
@@ -120,7 +119,7 @@ pub fn execute(
         println!("log file: {}", logfile.display());
     }
     let use_progress_bar = logfile.is_none();
-    let _logger_thread = redirect_stderr_to_file(logfile);
+    agave_logger::initialize_logging(logfile.clone());
 
     info!("{} {}", crate_name!(), solana_version);
     info!("Starting validator with: {:#?}", std::env::args_os());
@@ -520,6 +519,7 @@ pub fn execute(
     );
 
     let mut validator_config = ValidatorConfig {
+        logfile,
         require_tower: matches.is_present("require_tower"),
         tower_storage,
         halt_at_slot: value_t!(matches, "dev_halt_at_slot", Slot).ok(),
@@ -1055,6 +1055,7 @@ pub fn execute(
         File::create(filename).map_err(|err| format!("unable to create {filename}: {err}"))?;
     }
     info!("Validator initialized");
+    validator.listen_for_signals()?;
     validator.join();
     info!("Validator exiting..");