@@ -3,6 +3,7 @@
 use {
     crate::{
         commitment::{update_commitment_cache, CommitmentType},
+        consensus_metrics::{ConsensusMetricsEvent, ConsensusMetricsEventSender},
         event::{CompletedBlock, VotorEvent, VotorEventReceiver},
         event_handler::stats::EventHandlerStats,
         root_utils::{self, RootContext, SetRootError},
@@ -13,7 +14,7 @@ use {
         votor::{SharedContext, Votor},
     },
     agave_votor_messages::{consensus_message::Block, vote::Vote},
-    crossbeam_channel::{RecvTimeoutError, SendError},
+    crossbeam_channel::{RecvTimeoutError, TrySendError},
     parking_lot::RwLock,
     solana_clock::Slot,
     solana_hash::Hash,
@@ -31,7 +32,7 @@ use {
             Arc, Condvar, Mutex,
         },
         thread::{self, Builder, JoinHandle},
-        time::Duration,
+        time::{Duration, Instant},
     },
     thiserror::Error,
 };
@@ -62,7 +63,7 @@ enum EventLoopError {
     ReceiverDisconnected(#[from] RecvTimeoutError),
 
     #[error("Sender is disconnected")]
-    SenderDisconnected(#[from] SendError<()>),
+    SenderDisconnected,
 
     #[error("Error generating and inserting vote")]
     VotingError(#[from] VoteError),
@@ -164,7 +165,9 @@ impl EventHandler {
             let mut send_votes_batch_time = Measure::start("send_votes_batch");
             for vote in votes {
                 local_context.stats.incr_vote(&vote);
-                vctx.bls_sender.send(vote).map_err(|_| SendError(()))?;
+                vctx.bls_sender
+                    .send(vote)
+                    .map_err(|_| EventLoopError::SenderDisconnected)?;
             }
             send_votes_batch_time.stop();
             local_context.stats.send_votes_batch_time_us = local_context
@@ -208,6 +211,24 @@ impl EventHandler {
         Ok(())
     }
 
+    fn send_to_metrics(
+        consensus_metrics_sender: &ConsensusMetricsEventSender,
+        consensus_metrics_events: Vec<ConsensusMetricsEvent>,
+        stats: &mut EventHandlerStats,
+    ) -> Result<(), EventLoopError> {
+        // Do not kill or block event handler threads just because metrics
+        // send failed (maybe because the queue is full).
+        match consensus_metrics_sender.try_send((Instant::now(), consensus_metrics_events)) {
+            Ok(()) => Ok(()),
+            Err(TrySendError::Disconnected(_)) => Err(EventLoopError::SenderDisconnected),
+            Err(TrySendError::Full(_)) => {
+                warn!("send_to_metrics failed: queue is full");
+                stats.metrics_queue_became_full = true;
+                Ok(())
+            }
+        }
+    }
+
     fn handle_event(
         event: VotorEvent,
         timer_manager: &RwLock<TimerManager>,
@@ -228,18 +249,25 @@ impl EventHandler {
             // Block has completed replay
             VotorEvent::Block(CompletedBlock { slot, bank }) => {
                 debug_assert!(bank.is_frozen());
-                {
-                    let mut metrics_guard = vctx.consensus_metrics.write();
-                    match metrics_guard.record_block_hash_seen(*bank.collector_id(), slot) {
-                        Ok(()) => (),
-                        Err(err) => {
-                            error!(
-                                "{my_pubkey}: recording block on slot {slot} failed with {err:?}"
-                            );
-                        }
-                    }
-                    metrics_guard.maybe_new_epoch(bank.epoch());
+                let mut consensus_metrics_events =
+                    vec![ConsensusMetricsEvent::StartOfSlot { slot }];
+                if slot == first_of_consecutive_leader_slots(slot) {
+                    // All slots except the first in the window typically start when the
+                    // block is seen, so the recording would essentially measure 0;
+                    // hence we skip it for those slots.
+                    consensus_metrics_events.push(ConsensusMetricsEvent::BlockHashSeen {
+                        leader: *bank.collector_id(),
+                        slot,
+                    });
                 }
+                consensus_metrics_events.push(ConsensusMetricsEvent::MaybeNewEpoch {
+                    epoch: bank.epoch(),
+                });
+                Self::send_to_metrics(
+                    &vctx.consensus_metrics_sender,
+                    consensus_metrics_events,
+                    stats,
+                )?;
                 let (block, parent_block) = Self::get_block_parent_block(&bank);
                 info!("{my_pubkey}: Block {block:?} parent {parent_block:?}");
                 if Self::try_notar(
@@ -296,6 +324,11 @@ impl EventHandler {
 
             // Received a parent ready notification for `slot`
             VotorEvent::ParentReady { slot, parent_block } => {
+                Self::send_to_metrics(
+                    &vctx.consensus_metrics_sender,
+                    vec![ConsensusMetricsEvent::StartOfSlot { slot }],
+                    stats,
+                )?;
                 Self::handle_parent_ready_event(
                     slot,
                     parent_block,
@@ -318,6 +351,15 @@ impl EventHandler {
             // Skip timer for the slot has fired
             VotorEvent::Timeout(slot) => {
                 info!("{my_pubkey}: Timeout {slot}");
+                if slot != last_of_consecutive_leader_slots(slot) {
+                    Self::send_to_metrics(
+                        &vctx.consensus_metrics_sender,
+                        vec![ConsensusMetricsEvent::StartOfSlot {
+                            slot: slot.saturating_add(1),
+                        }],
+                        stats,
+                    )?;
+                }
                 if vctx.vote_history.voted(slot) {
                     return Ok(votes);
                 }
@@ -776,7 +818,7 @@ mod tests {
         super::*,
         crate::{
             commitment::CommitmentAggregationData,
-            consensus_metrics::ConsensusMetrics,
+            consensus_metrics::ConsensusMetricsEventReceiver,
            event::{LeaderWindowInfo, VotorEventSender},
            vote_history_storage::{
                FileVoteHistoryStorage, SavedVoteHistory, SavedVoteHistoryVersions,
@@ -789,7 +831,7 @@ mod tests {
             consensus_message::{ConsensusMessage, VoteMessage, BLS_KEYPAIR_DERIVE_SEED},
             vote::Vote,
         },
-        crossbeam_channel::{unbounded, Receiver, RecvTimeoutError},
+        crossbeam_channel::{bounded, Receiver, RecvTimeoutError},
         parking_lot::RwLock as PlRwLock,
         solana_bls_signatures::{
             keypair::Keypair as BLSKeypair, signature::Signature as BLSSignature,
@@ -838,22 +880,23 @@ mod tests {
         leader_window_notifier: Arc<LeaderWindowNotifier>,
         drop_bank_receiver: Receiver<Vec<BankWithScheduler>>,
         cluster_info: Arc<ClusterInfo>,
+        consensus_metrics_receiver: ConsensusMetricsEventReceiver,
     }
 
     impl EventHandlerTestContext {
         fn setup() -> EventHandlerTestContext {
-            let (bls_sender, bls_receiver) = unbounded();
-            let (commitment_sender, commitment_receiver) = unbounded();
-            let (own_vote_sender, own_vote_receiver) = unbounded();
-            let (drop_bank_sender, drop_bank_receiver) = unbounded();
+            // For tests, we just bound each queue at 100, which should be enough.
+            let (bls_sender, bls_receiver) = bounded(100);
+            let (commitment_sender, commitment_receiver) = bounded(100);
+            let (own_vote_sender, own_vote_receiver) = bounded(100);
+            let (drop_bank_sender, drop_bank_receiver) = bounded(100);
             let exit = Arc::new(AtomicBool::new(false));
             let start = Arc::new((Mutex::new(true), Condvar::new()));
-            let (event_sender, event_receiver) = unbounded();
-            let consensus_metrics = Arc::new(PlRwLock::new(ConsensusMetrics::new(0)));
+            let (event_sender, event_receiver) = bounded(100);
+            let (consensus_metrics_sender, consensus_metrics_receiver) = bounded(100);
             let timer_manager = Arc::new(PlRwLock::new(TimerManager::new(
                 event_sender.clone(),
                 exit.clone(),
-                consensus_metrics.clone(),
             )));
 
             // Create 10 node validatorvotekeypairs vec
@@ -913,7 +956,7 @@ mod tests {
                 derived_bls_keypairs: HashMap::new(),
                 has_new_vote_been_rooted: false,
                 own_vote_sender,
-                consensus_metrics,
+                consensus_metrics_sender,
             };
 
             let root_context = RootContext {
@@ -949,6 +992,7 @@ mod tests {
                 leader_window_notifier,
                 drop_bank_receiver,
                 cluster_info,
+                consensus_metrics_receiver,
             }
         }
 
@@ -1121,6 +1165,14 @@ mod tests {
             assert!(self.timer_manager.read().is_timeout_set(expected_slot));
         }
 
+        fn check_for_metrics_event(&self, expected: ConsensusMetricsEvent) {
+            let event = self
+                .consensus_metrics_receiver
+                .recv_timeout(TEST_SHORT_TIMEOUT)
+                .expect("Should receive metrics event");
+            assert!(event.1.contains(&expected));
+        }
+
         fn crate_vote_history_storage_and_switch_identity(
             &self,
             new_identity: &Keypair,
@@ -1174,6 +1226,8 @@ mod tests {
         test_context.wait_for_event_to_be_processed();
         let block_id_1 = bank1.block_id().unwrap();
 
+        test_context.check_for_metrics_event(ConsensusMetricsEvent::StartOfSlot { slot });
+
         // We should receive Notarize Vote for block 1
         test_context.check_for_vote(&Vote::new_notarization_vote(slot, block_id_1));
         test_context.check_for_commitment(CommitmentType::Notarize, slot);
@@ -1635,32 +1689,37 @@ mod tests {
     #[test_case("bls_receiver")]
     #[test_case("commitment_receiver")]
     #[test_case("own_vote_receiver")]
+    #[test_case("consensus_metrics_receiver")]
     fn test_channel_disconnection(channel_name: &str) {
         agave_logger::setup();
         let mut test_context = EventHandlerTestContext::setup();
         match channel_name {
             "bls_receiver" => {
                 let bls_receiver = test_context.bls_receiver.clone();
-                test_context.bls_receiver = unbounded().1;
+                test_context.bls_receiver = bounded(100).1;
                 drop(bls_receiver);
             }
             "commitment_receiver" => {
                 let commitment_receiver = test_context.commitment_receiver.clone();
-                test_context.commitment_receiver = unbounded().1;
+                test_context.commitment_receiver = bounded(100).1;
                 drop(commitment_receiver);
             }
             "own_vote_receiver" => {
                 let own_vote_receiver = test_context.own_vote_receiver.clone();
-                test_context.own_vote_receiver = unbounded().1;
+                test_context.own_vote_receiver = bounded(100).1;
                 drop(own_vote_receiver);
             }
+            "consensus_metrics_receiver" => {
+                let consensus_metrics_receiver = test_context.consensus_metrics_receiver.clone();
+                test_context.consensus_metrics_receiver = bounded(100).1;
+                drop(consensus_metrics_receiver);
+            }
             _ => panic!("Unknown channel name"),
         }
         // We normally need some event hitting all the senders to trigger exit
         let root_bank = test_context.bank_forks.read().unwrap().root_bank();
         let _ = test_context.create_block_and_send_block_event(1, root_bank);
         test_context.send_parent_ready_event(1, (0, Hash::default()));
-        test_context.wait_for_event_to_be_processed();
         // Verify that the event_handler exits within 5 seconds
         let start = Instant::now();
         while !test_context.exit.load(Ordering::Relaxed) && start.elapsed() < Duration::from_secs(5)