wen_restart.rs 152 KB


  1. //! The `wen-restart` module handles automatic repair during a cluster restart
  2. use {
  3. crate::{
  4. heaviest_fork_aggregate::{HeaviestForkAggregate, HeaviestForkAggregateResult},
  5. last_voted_fork_slots_aggregate::{
  6. LastVotedForkSlotsAggregate, LastVotedForkSlotsAggregateResult,
  7. LastVotedForkSlotsEpochInfo, LastVotedForkSlotsFinalResult,
  8. },
  9. solana::wen_restart_proto::{
  10. self, ConflictMessage, GenerateSnapshotRecord, HeaviestForkAggregateRecord,
  11. HeaviestForkRecord, LastVotedForkSlotsAggregateFinal,
  12. LastVotedForkSlotsAggregateRecord, LastVotedForkSlotsEpochInfoRecord,
  13. LastVotedForkSlotsRecord, State as RestartState, WenRestartProgress,
  14. },
  15. },
  16. agave_snapshots::{
  17. paths::{
  18. get_highest_full_snapshot_archive_slot, get_highest_incremental_snapshot_archive_slot,
  19. },
  20. snapshot_archive_info::SnapshotArchiveInfoGetter,
  21. },
  22. anyhow::Result,
  23. log::*,
  24. prost::Message,
  25. solana_clock::{Epoch, Slot},
  26. solana_entry::entry::VerifyRecyclers,
  27. solana_gossip::{
  28. cluster_info::{ClusterInfo, GOSSIP_SLEEP_MILLIS},
  29. restart_crds_values::RestartLastVotedForkSlots,
  30. },
  31. solana_hash::Hash,
  32. solana_ledger::{
  33. ancestor_iterator::AncestorIterator,
  34. blockstore::Blockstore,
  35. blockstore_processor::{process_single_slot, ConfirmationProgress, ProcessOptions},
  36. leader_schedule_cache::LeaderScheduleCache,
  37. },
  38. solana_pubkey::Pubkey,
  39. solana_runtime::{
  40. accounts_background_service::AbsStatus,
  41. bank::Bank,
  42. bank_forks::BankForks,
  43. snapshot_bank_utils::{
  44. bank_to_full_snapshot_archive, bank_to_incremental_snapshot_archive,
  45. },
  46. snapshot_controller::SnapshotController,
  47. snapshot_utils::purge_all_bank_snapshots,
  48. },
  49. solana_shred_version::compute_shred_version,
  50. solana_svm_timings::ExecuteTimings,
  51. solana_time_utils::timestamp,
  52. solana_vote::vote_transaction::VoteTransaction,
  53. std::{
  54. collections::{HashMap, HashSet},
  55. fs::{read, File},
  56. io::{Cursor, Write},
  57. path::{Path, PathBuf},
  58. str::FromStr,
  59. sync::{
  60. atomic::{AtomicBool, Ordering},
  61. Arc, RwLock,
  62. },
  63. thread::sleep,
  64. time::{Duration, Instant},
  65. },
  66. };
// If >42% of the validators have this block, repair this block locally.
const REPAIR_THRESHOLD: f64 = 0.42;
// When counting the heaviest fork, only count blocks with no less than
// 67% - 5% - (100% - active_stake) = active_stake - 38% stake.
// 67% is the supermajority threshold (2/3), 5% is the assumption we
// made regarding how many non-conforming/offline validators the
// algorithm can tolerate.
const HEAVIEST_FORK_THRESHOLD_DELTA: f64 = 0.38;
// The coordinator prints new stats every 10 seconds.
const COORDINATOR_STAT_PRINT_INTERVAL_SECONDS: u64 = 10;
/// Errors that can be raised by the wen-restart procedure.
///
/// The field meanings below follow the names used by the `Display` implementation.
#[derive(Debug, PartialEq)]
pub enum WenRestartError {
    /// Bank hash mismatch: (slot, expected hash, actual hash).
    BankHashMismatch(Slot, Hash, Hash),
    /// The block for the given slot could not be found.
    BlockNotFound(Slot),
    /// The block for the given slot is not full.
    BlockNotFull(Slot),
    /// The block was not frozen after replay: (slot, optional error description).
    BlockNotFrozenAfterReplay(Slot, Option<String>),
    /// Block is linked to an unexpected parent: (slot, actual parent, expected parent).
    BlockNotLinkedToExpectedParent(Slot, Option<Slot>, Slot),
    /// A block has more stake than its parent: (slot, child stake, parent slot, parent stake).
    ChildStakeLargerThanParent(Slot, u64, Slot, u64),
    /// The exit flag was observed while the procedure was running.
    Exiting,
    /// A snapshot newer than the intended one exists: (intended slot, highest slot, directory).
    FutureSnapshotExists(Slot, Slot, String),
    /// A snapshot already exists for the intended slot: (slot, directory).
    GenerateSnapshotWhenOneExists(Slot, String),
    /// Snapshot generation was requested while snapshots are disabled.
    GenerateSnapshotWhenDisabled,
    /// The coordinator's heaviest fork does not include a slot it should:
    /// (coordinator heaviest slot, slot that should be included).
    HeaviestForkOnLeaderOnDifferentFork(Slot, Slot),
    /// The last-voted-fork-slots protobuf record is malformed.
    MalformedLastVotedForkSlotsProtobuf(Option<LastVotedForkSlotsRecord>),
    /// The progress record is malformed: (state, name of the missing piece).
    MalformedProgress(RestartState, String),
    /// The last voted fork slots are missing.
    MissingLastVotedForkSlots,
    /// The snapshot record is missing from the protobuf.
    MissingSnapshotInProtobuf,
    /// Not enough stake agrees with our choice: (our slot, our hash, block -> stake map).
    NotEnoughStakeAgreeingWithUs(Slot, Hash, HashMap<(Slot, Hash), u64>),
    /// The progress record is in a state that was not expected.
    UnexpectedState(wen_restart_proto::State),
}
  97. impl std::fmt::Display for WenRestartError {
  98. fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
  99. match self {
  100. WenRestartError::BankHashMismatch(slot, expected, actual) => {
  101. write!(
  102. f,
  103. "Bank hash mismatch for slot: {slot} expected: {expected} actual: {actual}"
  104. )
  105. }
  106. WenRestartError::BlockNotFound(slot) => {
  107. write!(f, "Block not found: {slot}")
  108. }
  109. WenRestartError::BlockNotFull(slot) => {
  110. write!(f, "Block not full: {slot}")
  111. }
  112. WenRestartError::BlockNotFrozenAfterReplay(slot, err) => {
  113. write!(f, "Block not frozen after replay: {slot} {err:?}")
  114. }
  115. WenRestartError::BlockNotLinkedToExpectedParent(slot, parent, expected_parent) => {
  116. write!(
  117. f,
  118. "Block {slot} is not linked to expected parent {expected_parent} but to \
  119. {parent:?}"
  120. )
  121. }
  122. WenRestartError::ChildStakeLargerThanParent(
  123. slot,
  124. child_stake,
  125. parent,
  126. parent_stake,
  127. ) => {
  128. write!(
  129. f,
  130. "Block {slot} has more stake {child_stake} than its parent {parent} with \
  131. stake {parent_stake}"
  132. )
  133. }
  134. WenRestartError::Exiting => write!(f, "Exiting"),
  135. WenRestartError::FutureSnapshotExists(slot, highest_slot, directory) => {
  136. write!(
  137. f,
  138. "Future snapshot exists for slot: {slot} highest slot: {highest_slot} in \
  139. directory: {directory}",
  140. )
  141. }
  142. WenRestartError::GenerateSnapshotWhenOneExists(slot, directory) => {
  143. write!(
  144. f,
  145. "Generate snapshot when one exists for slot: {slot} in directory: {directory}",
  146. )
  147. }
  148. WenRestartError::GenerateSnapshotWhenDisabled => {
  149. write!(f, "Generate snapshot when snapshots are disabled")
  150. }
  151. WenRestartError::HeaviestForkOnLeaderOnDifferentFork(
  152. coordinator_heaviest_slot,
  153. should_include_slot,
  154. ) => {
  155. write!(
  156. f,
  157. "Heaviest fork on coordinator on different fork: heaviest: \
  158. {coordinator_heaviest_slot} does not include: {should_include_slot}",
  159. )
  160. }
  161. WenRestartError::MalformedLastVotedForkSlotsProtobuf(record) => {
  162. write!(f, "Malformed last voted fork slots protobuf: {record:?}")
  163. }
  164. WenRestartError::MalformedProgress(state, missing) => {
  165. write!(f, "Malformed progress: {state:?} missing {missing}")
  166. }
  167. WenRestartError::MissingLastVotedForkSlots => {
  168. write!(f, "Missing last voted fork slots")
  169. }
  170. WenRestartError::MissingSnapshotInProtobuf => {
  171. write!(f, "Missing snapshot in protobuf")
  172. }
  173. WenRestartError::NotEnoughStakeAgreeingWithUs(slot, hash, block_stake_map) => {
  174. write!(
  175. f,
  176. "Not enough stake agreeing with our slot: {slot} hash: {hash}\n \
  177. {block_stake_map:?}",
  178. )
  179. }
  180. WenRestartError::UnexpectedState(state) => {
  181. write!(f, "Unexpected state: {state:?}")
  182. }
  183. }
  184. }
  185. }
  186. impl std::error::Error for WenRestartError {}
// We need a WenRestartProgressInternalState so we can convert the protobuf written in file
// into internal data structure in the initialize function. It should be easily
// convertible to and from WenRestartProgress protobuf.
#[derive(Debug, PartialEq)]
pub(crate) enum WenRestartProgressInternalState {
    // Starting state: carries our own last voted fork slots and the bank hash of our last vote.
    Init {
        last_voted_fork_slots: Vec<Slot>,
        last_vote_bankhash: Hash,
    },
    // Last voted fork slots are being aggregated; the final result is optional here.
    LastVotedForkSlots {
        last_voted_fork_slots: Vec<Slot>,
        aggregate_final_result: Option<LastVotedForkSlotsFinalResult>,
    },
    // The aggregate result is available; our own heaviest fork record is optional here.
    FindHeaviestFork {
        aggregate_final_result: LastVotedForkSlotsFinalResult,
        my_heaviest_fork: Option<HeaviestForkRecord>,
    },
    // Our heaviest fork (slot and hash) is known.
    HeaviestFork {
        my_heaviest_fork_slot: Slot,
        my_heaviest_fork_hash: Hash,
    },
    // Snapshot generation for our heaviest fork slot; the snapshot record is optional here.
    GenerateSnapshot {
        my_heaviest_fork_slot: Slot,
        my_snapshot: Option<GenerateSnapshotRecord>,
    },
    // Final state: carries a slot, a hash and a shred version.
    Done {
        slot: Slot,
        hash: Hash,
        shred_version: u16,
    },
}
  218. pub(crate) fn send_restart_last_voted_fork_slots(
  219. cluster_info: Arc<ClusterInfo>,
  220. last_voted_fork_slots: &[Slot],
  221. last_vote_bankhash: Hash,
  222. ) -> Result<LastVotedForkSlotsRecord> {
  223. cluster_info.push_restart_last_voted_fork_slots(last_voted_fork_slots, last_vote_bankhash)?;
  224. Ok(LastVotedForkSlotsRecord {
  225. last_voted_fork_slots: last_voted_fork_slots.to_vec(),
  226. last_vote_bankhash: last_vote_bankhash.to_string(),
  227. shred_version: cluster_info.my_shred_version() as u32,
  228. wallclock: timestamp(),
  229. })
  230. }
  231. pub(crate) fn aggregate_restart_last_voted_fork_slots(
  232. wen_restart_path: &PathBuf,
  233. wait_for_supermajority_threshold_percent: u64,
  234. cluster_info: Arc<ClusterInfo>,
  235. last_voted_fork_slots: &Vec<Slot>,
  236. bank_forks: Arc<RwLock<BankForks>>,
  237. blockstore: Arc<Blockstore>,
  238. wen_restart_repair_slots: Arc<RwLock<Vec<Slot>>>,
  239. exit: Arc<AtomicBool>,
  240. progress: &mut WenRestartProgress,
  241. ) -> Result<LastVotedForkSlotsFinalResult> {
  242. let root_bank = bank_forks.read().unwrap().root_bank();
  243. let root_slot = root_bank.slot();
  244. let mut last_voted_fork_slots_aggregate = LastVotedForkSlotsAggregate::new(
  245. root_bank.clone(),
  246. REPAIR_THRESHOLD,
  247. last_voted_fork_slots,
  248. &cluster_info.id(),
  249. );
  250. if let Some(aggregate_record) = &progress.last_voted_fork_slots_aggregate {
  251. for (key_string, message) in &aggregate_record.received {
  252. if let Err(e) =
  253. last_voted_fork_slots_aggregate.aggregate_from_record(key_string, message)
  254. {
  255. error!("Failed to aggregate from record: {e:?}");
  256. }
  257. }
  258. } else {
  259. progress.last_voted_fork_slots_aggregate = Some(LastVotedForkSlotsAggregateRecord {
  260. received: HashMap::new(),
  261. final_result: None,
  262. });
  263. }
  264. let mut cursor = solana_gossip::crds::Cursor::default();
  265. let mut is_full_slots = HashSet::new();
  266. let mut old_progress = WenRestartProgress::default();
  267. loop {
  268. if exit.load(Ordering::Relaxed) {
  269. return Err(WenRestartError::Exiting.into());
  270. }
  271. let start = timestamp();
  272. for new_last_voted_fork_slots in cluster_info.get_restart_last_voted_fork_slots(&mut cursor)
  273. {
  274. let from = new_last_voted_fork_slots.from.to_string();
  275. match last_voted_fork_slots_aggregate.aggregate(new_last_voted_fork_slots) {
  276. LastVotedForkSlotsAggregateResult::Inserted(record) => {
  277. progress
  278. .last_voted_fork_slots_aggregate
  279. .as_mut()
  280. .unwrap()
  281. .received
  282. .insert(from, record);
  283. }
  284. LastVotedForkSlotsAggregateResult::DifferentVersionExists(
  285. old_record,
  286. new_record,
  287. ) => {
  288. info!(
  289. "Different LastVotedForkSlots message exists from {from}: {old_record:#?} \
  290. vs {new_record:#?}"
  291. );
  292. progress.conflict_message.insert(
  293. from,
  294. ConflictMessage {
  295. old_message: format!("{old_record:?}"),
  296. new_message: format!("{new_record:?}"),
  297. },
  298. );
  299. }
  300. LastVotedForkSlotsAggregateResult::AlreadyExists => (),
  301. }
  302. }
  303. // Because all operations on the aggregate are called from this single thread, we can
  304. // fetch all results separately without worrying about them being out of sync. We can
  305. // also use returned iterator without the vector changing underneath us.
  306. let active_percent = last_voted_fork_slots_aggregate.min_active_percent();
  307. let mut filtered_slots: Vec<Slot>;
  308. {
  309. filtered_slots = last_voted_fork_slots_aggregate
  310. .slots_to_repair_iter()
  311. .filter(|slot| {
  312. if *slot <= &root_slot || is_full_slots.contains(*slot) {
  313. return false;
  314. }
  315. if blockstore.is_full(**slot) {
  316. is_full_slots.insert(**slot);
  317. false
  318. } else {
  319. true
  320. }
  321. })
  322. .cloned()
  323. .collect();
  324. }
  325. filtered_slots.sort();
  326. if progress != &old_progress {
  327. info!(
  328. "Active peers: {} Slots to repair: {:?}",
  329. active_percent, &filtered_slots
  330. );
  331. write_wen_restart_records(wen_restart_path, progress)?;
  332. old_progress = progress.clone();
  333. }
  334. if filtered_slots.is_empty()
  335. && active_percent >= wait_for_supermajority_threshold_percent as f64
  336. {
  337. *wen_restart_repair_slots.write().unwrap() = vec![];
  338. break;
  339. }
  340. {
  341. *wen_restart_repair_slots.write().unwrap() = filtered_slots;
  342. }
  343. let elapsed = timestamp().saturating_sub(start);
  344. let time_left = GOSSIP_SLEEP_MILLIS.saturating_sub(elapsed);
  345. if time_left > 0 {
  346. sleep(Duration::from_millis(time_left));
  347. }
  348. }
  349. Ok(last_voted_fork_slots_aggregate.get_final_result())
  350. }
  351. fn is_over_stake_threshold(
  352. epoch_info_vec: &[LastVotedForkSlotsEpochInfo],
  353. epoch: Epoch,
  354. stake: &u64,
  355. ) -> bool {
  356. epoch_info_vec
  357. .iter()
  358. .find(|info| info.epoch == epoch)
  359. .is_some_and(|info| {
  360. let threshold = info
  361. .actively_voting_stake
  362. .checked_sub((info.total_stake as f64 * HEAVIEST_FORK_THRESHOLD_DELTA) as u64)
  363. .unwrap();
  364. stake >= &threshold
  365. })
  366. }
  367. // Verify that all blocks with at least (active_stake_percnet - 38%) of the stake form a
  368. // single chain from the root, and use the highest slot in the blocks as the heaviest fork.
  369. // Please see SIMD 46 "gossip current heaviest fork" for correctness proof.
  370. pub(crate) fn find_heaviest_fork(
  371. aggregate_final_result: LastVotedForkSlotsFinalResult,
  372. bank_forks: Arc<RwLock<BankForks>>,
  373. blockstore: Arc<Blockstore>,
  374. exit: Arc<AtomicBool>,
  375. ) -> Result<(Slot, Hash)> {
  376. let root_bank = bank_forks.read().unwrap().root_bank();
  377. let root_slot = root_bank.slot();
  378. let mut slots = aggregate_final_result
  379. .slots_stake_map
  380. .iter()
  381. .filter(|(slot, stake)| {
  382. **slot > root_slot
  383. && is_over_stake_threshold(
  384. &aggregate_final_result.epoch_info_vec,
  385. root_bank.epoch_schedule().get_epoch(**slot),
  386. stake,
  387. )
  388. })
  389. .map(|(slot, _)| *slot)
  390. .collect::<Vec<Slot>>();
  391. slots.sort();
  392. // The heaviest slot we selected will always be the last of the slots list, or root if the list is empty.
  393. let heaviest_fork_slot = slots.last().map_or(root_slot, |x| *x);
  394. let mut expected_parent = root_slot;
  395. for slot in &slots {
  396. if exit.load(Ordering::Relaxed) {
  397. return Err(WenRestartError::Exiting.into());
  398. }
  399. if let Ok(Some(block_meta)) = blockstore.meta(*slot) {
  400. if block_meta.parent_slot != Some(expected_parent) {
  401. if expected_parent == root_slot {
  402. error!(
  403. "First block {slot} in repair list not linked to local root {root_slot}, \
  404. this could mean our root is too old"
  405. );
  406. } else {
  407. error!(
  408. "Block {slot} in blockstore is not linked to expected parent from Wen \
  409. Restart {expected_parent} but to Block {:?}",
  410. block_meta.parent_slot
  411. );
  412. }
  413. return Err(WenRestartError::BlockNotLinkedToExpectedParent(
  414. *slot,
  415. block_meta.parent_slot,
  416. expected_parent,
  417. )
  418. .into());
  419. }
  420. if !block_meta.is_full() {
  421. return Err(WenRestartError::BlockNotFull(*slot).into());
  422. }
  423. expected_parent = *slot;
  424. } else {
  425. return Err(WenRestartError::BlockNotFound(*slot).into());
  426. }
  427. }
  428. let heaviest_fork_bankhash = find_bankhash_of_heaviest_fork(
  429. heaviest_fork_slot,
  430. slots,
  431. blockstore.clone(),
  432. bank_forks.clone(),
  433. &exit,
  434. )?;
  435. info!("Heaviest fork found: slot: {heaviest_fork_slot}, bankhash: {heaviest_fork_bankhash:?}");
  436. Ok((heaviest_fork_slot, heaviest_fork_bankhash))
  437. }
  438. fn check_slot_smaller_than_intended_snapshot_slot(
  439. slot: Slot,
  440. intended_snapshot_slot: Slot,
  441. directory: &Path,
  442. ) -> Result<()> {
  443. match slot.cmp(&intended_snapshot_slot) {
  444. std::cmp::Ordering::Greater => Err(WenRestartError::FutureSnapshotExists(
  445. intended_snapshot_slot,
  446. slot,
  447. directory.to_string_lossy().to_string(),
  448. )
  449. .into()),
  450. std::cmp::Ordering::Equal => Err(WenRestartError::GenerateSnapshotWhenOneExists(
  451. slot,
  452. directory.to_string_lossy().to_string(),
  453. )
  454. .into()),
  455. std::cmp::Ordering::Less => Ok(()),
  456. }
  457. }
  458. // Given the agreed upon slot, add hard fork and rehash the corresponding bank, then
  459. // generate new snapshot. Generate incremental snapshot if possible, but generate full
  460. // snapshot if there is no full snapshot or snapshot generation is turned off (in this
  461. // case the incremental snasphot based on the full snapshot is incorrect).
  462. //
  463. // We don't use set_root() explicitly, because it may kick off snapshot requests, we
  464. // can't have multiple snapshot requests in progress. In bank_to_snapshot_archive()
  465. // everything set_root() does will be done (without bank_forks setting root). So
  466. // when we restart from the snapshot bank on my_heaviest_fork_slot will become root.
  467. pub(crate) fn generate_snapshot(
  468. bank_forks: Arc<RwLock<BankForks>>,
  469. snapshot_controller: &SnapshotController,
  470. abs_status: &AbsStatus,
  471. genesis_config_hash: Hash,
  472. my_heaviest_fork_slot: Slot,
  473. ) -> Result<GenerateSnapshotRecord> {
  474. let new_root_bank;
  475. {
  476. let my_bank_forks = bank_forks.read().unwrap();
  477. let old_root_bank = my_bank_forks.root_bank();
  478. if !old_root_bank
  479. .hard_forks()
  480. .iter()
  481. .any(|(slot, _)| slot == &my_heaviest_fork_slot)
  482. {
  483. old_root_bank.register_hard_fork(my_heaviest_fork_slot);
  484. }
  485. // my_heaviest_fork_slot is guaranteed to have a bank in bank_forks, it's checked in
  486. // find_bankhash_of_heaviest_fork().
  487. match my_bank_forks.get(my_heaviest_fork_slot) {
  488. Some(bank) => new_root_bank = bank.clone(),
  489. None => {
  490. return Err(WenRestartError::BlockNotFound(my_heaviest_fork_slot).into());
  491. }
  492. }
  493. let mut banks = vec![&new_root_bank];
  494. let parents = new_root_bank.parents();
  495. banks.extend(parents.iter());
  496. }
  497. // Snapshot generation calls AccountsDb background tasks (flush/clean/shrink).
  498. // These cannot run conncurrent with each other, so we must shutdown
  499. // AccountsBackgroundService before proceeding.
  500. abs_status.stop();
  501. info!("Waiting for AccountsBackgroundService to stop");
  502. while abs_status.is_running() {
  503. std::thread::yield_now();
  504. }
  505. let snapshot_config = snapshot_controller.snapshot_config();
  506. let mut directory = &snapshot_config.full_snapshot_archives_dir;
  507. // Calculate the full_snapshot_slot an incremental snapshot should depend on. If the
  508. // validator is configured not the generate snapshot, it will only have the initial
  509. // snapshot on disk, which might be too old to generate an incremental snapshot from.
  510. // In this case we also set full_snapshot_slot to None.
  511. let full_snapshot_slot = if snapshot_config.should_generate_snapshots() {
  512. get_highest_full_snapshot_archive_slot(directory)
  513. } else {
  514. None
  515. };
  516. // In very rare cases it's possible that the local root is not on the heaviest fork, so the
  517. // validator generated snapshot for slots > local root. If the cluster agreed upon restart
  518. // slot my_heaviest_fork_slot is less than the current highest full_snapshot_slot, that means the
  519. // locally rooted full_snapshot_slot will be rolled back. this requires human inspection。
  520. //
  521. // In even rarer cases, the selected slot might be the latest full snapshot slot. We could
  522. // just re-generate a new snapshot to make sure the snapshot is up to date after hard fork,
  523. // but for now we just return an error to keep the code simple.
  524. let new_snapshot_path = if let Some(full_snapshot_slot) = full_snapshot_slot {
  525. check_slot_smaller_than_intended_snapshot_slot(
  526. full_snapshot_slot,
  527. my_heaviest_fork_slot,
  528. directory,
  529. )?;
  530. directory = &snapshot_config.incremental_snapshot_archives_dir;
  531. if let Some(incremental_snapshot_slot) =
  532. get_highest_incremental_snapshot_archive_slot(directory, full_snapshot_slot)
  533. {
  534. check_slot_smaller_than_intended_snapshot_slot(
  535. incremental_snapshot_slot,
  536. my_heaviest_fork_slot,
  537. directory,
  538. )?;
  539. }
  540. bank_to_incremental_snapshot_archive(
  541. &snapshot_config.bank_snapshots_dir,
  542. &new_root_bank,
  543. full_snapshot_slot,
  544. Some(snapshot_config.snapshot_version),
  545. &snapshot_config.full_snapshot_archives_dir,
  546. &snapshot_config.incremental_snapshot_archives_dir,
  547. snapshot_config.archive_format,
  548. )?
  549. .path()
  550. .display()
  551. .to_string()
  552. } else {
  553. info!(
  554. "Can't find full snapshot, generating full snapshot for slot: {my_heaviest_fork_slot}"
  555. );
  556. bank_to_full_snapshot_archive(
  557. &snapshot_config.bank_snapshots_dir,
  558. &new_root_bank,
  559. Some(snapshot_config.snapshot_version),
  560. &snapshot_config.full_snapshot_archives_dir,
  561. &snapshot_config.incremental_snapshot_archives_dir,
  562. snapshot_config.archive_format,
  563. )?
  564. .path()
  565. .display()
  566. .to_string()
  567. };
  568. let new_shred_version =
  569. compute_shred_version(&genesis_config_hash, Some(&new_root_bank.hard_forks()));
  570. info!("wen_restart snapshot generated on {new_snapshot_path} base slot {full_snapshot_slot:?}");
  571. // We might have bank snapshots past the my_heaviest_fork_slot, we need to purge them.
  572. purge_all_bank_snapshots(&snapshot_config.bank_snapshots_dir);
  573. Ok(GenerateSnapshotRecord {
  574. path: new_snapshot_path,
  575. slot: my_heaviest_fork_slot,
  576. bankhash: new_root_bank.hash().to_string(),
  577. shred_version: new_shred_version as u32,
  578. })
  579. }
  580. // Find the hash of the heaviest fork, if block hasn't been replayed, replay to get the hash.
  581. pub(crate) fn find_bankhash_of_heaviest_fork(
  582. heaviest_fork_slot: Slot,
  583. slots: Vec<Slot>,
  584. blockstore: Arc<Blockstore>,
  585. bank_forks: Arc<RwLock<BankForks>>,
  586. exit: &AtomicBool,
  587. ) -> Result<Hash> {
  588. if let Some(hash) = bank_forks
  589. .read()
  590. .unwrap()
  591. .get(heaviest_fork_slot)
  592. .map(|bank| bank.hash())
  593. {
  594. return Ok(hash);
  595. }
  596. let root_bank = bank_forks.read().unwrap().root_bank();
  597. let leader_schedule_cache = LeaderScheduleCache::new_from_bank(&root_bank);
  598. let replay_tx_thread_pool = rayon::ThreadPoolBuilder::new()
  599. .thread_name(|i| format!("solReplayTx{i:02}"))
  600. .build()
  601. .expect("new rayon threadpool");
  602. let recyclers = VerifyRecyclers::default();
  603. let mut timing = ExecuteTimings::default();
  604. let opts = ProcessOptions::default();
  605. // Now replay all the missing blocks.
  606. let mut parent_bank = root_bank;
  607. for slot in slots {
  608. if exit.load(Ordering::Relaxed) {
  609. return Err(WenRestartError::Exiting.into());
  610. }
  611. let saved_bank = bank_forks.read().unwrap().get_with_scheduler(slot);
  612. let bank_with_scheduler = saved_bank.unwrap_or_else(|| {
  613. let new_bank = Bank::new_from_parent(
  614. parent_bank.clone(),
  615. &leader_schedule_cache
  616. .slot_leader_at(slot, Some(&parent_bank))
  617. .unwrap(),
  618. slot,
  619. );
  620. bank_forks.write().unwrap().insert_from_ledger(new_bank)
  621. });
  622. let bank = if bank_with_scheduler.is_frozen() {
  623. bank_with_scheduler.clone_without_scheduler()
  624. } else {
  625. let mut progress = ConfirmationProgress::new(parent_bank.last_blockhash());
  626. if let Err(e) = process_single_slot(
  627. &blockstore,
  628. &bank_with_scheduler,
  629. &replay_tx_thread_pool,
  630. &opts,
  631. &recyclers,
  632. &mut progress,
  633. None,
  634. None,
  635. None,
  636. &mut timing,
  637. ) {
  638. return Err(
  639. WenRestartError::BlockNotFrozenAfterReplay(slot, Some(e.to_string())).into(),
  640. );
  641. }
  642. let cur_bank;
  643. {
  644. cur_bank = bank_forks
  645. .read()
  646. .unwrap()
  647. .get(slot)
  648. .expect("bank should have been just inserted");
  649. }
  650. cur_bank
  651. };
  652. parent_bank = bank;
  653. }
  654. Ok(parent_bank.hash())
  655. }
  656. // Aggregate the heaviest fork at the coordinator.
  657. pub(crate) fn aggregate_restart_heaviest_fork(
  658. wen_restart_path: &PathBuf,
  659. cluster_info: Arc<ClusterInfo>,
  660. bank_forks: Arc<RwLock<BankForks>>,
  661. exit: Arc<AtomicBool>,
  662. progress: &mut WenRestartProgress,
  663. ) -> Result<()> {
  664. let root_bank = bank_forks.read().unwrap().root_bank();
  665. if progress.my_heaviest_fork.is_none() {
  666. return Err(WenRestartError::MalformedProgress(
  667. RestartState::HeaviestFork,
  668. "my_heaviest_fork".to_string(),
  669. )
  670. .into());
  671. }
  672. let my_heaviest_fork = progress.my_heaviest_fork.clone().unwrap();
  673. let heaviest_fork_slot = my_heaviest_fork.slot;
  674. let heaviest_fork_hash = Hash::from_str(&my_heaviest_fork.bankhash)?;
  675. // Use the epoch_stakes associated with the heaviest fork slot we picked.
  676. let epoch_stakes = root_bank
  677. .epoch_stakes(root_bank.epoch_schedule().get_epoch(heaviest_fork_slot))
  678. .unwrap();
  679. let total_stake = epoch_stakes.total_stake();
  680. let mut heaviest_fork_aggregate = HeaviestForkAggregate::new(
  681. cluster_info.my_shred_version(),
  682. epoch_stakes,
  683. heaviest_fork_slot,
  684. heaviest_fork_hash,
  685. &cluster_info.id(),
  686. );
  687. if let Some(aggregate_record) = &progress.heaviest_fork_aggregate {
  688. for message in &aggregate_record.received {
  689. if let Err(e) = heaviest_fork_aggregate.aggregate_from_record(message) {
  690. // Do not abort wen_restart if we got one malformed message.
  691. error!("Failed to aggregate from record: {e:?}");
  692. }
  693. }
  694. } else {
  695. progress.heaviest_fork_aggregate = Some(HeaviestForkAggregateRecord {
  696. received: Vec::new(),
  697. total_active_stake: 0,
  698. });
  699. }
  700. let mut cursor = solana_gossip::crds::Cursor::default();
  701. let mut total_active_stake = 0;
  702. let mut stat_printed_at = Instant::now();
  703. let mut old_progress = WenRestartProgress::default();
  704. loop {
  705. if exit.load(Ordering::Relaxed) {
  706. return Ok(());
  707. }
  708. let start = timestamp();
  709. for new_heaviest_fork in cluster_info.get_restart_heaviest_fork(&mut cursor) {
  710. info!("Received new heaviest fork: {new_heaviest_fork:?}");
  711. let from = new_heaviest_fork.from.to_string();
  712. match heaviest_fork_aggregate.aggregate(new_heaviest_fork) {
  713. HeaviestForkAggregateResult::Inserted(record) => {
  714. info!("Successfully aggregated new heaviest fork: {record:?}");
  715. progress
  716. .heaviest_fork_aggregate
  717. .as_mut()
  718. .unwrap()
  719. .received
  720. .push(record);
  721. }
  722. HeaviestForkAggregateResult::DifferentVersionExists(old_record, new_record) => {
  723. warn!(
  724. "Different version from {from} exists old {old_record:#?} vs new \
  725. {new_record:#?}"
  726. );
  727. progress.conflict_message.insert(
  728. from,
  729. ConflictMessage {
  730. old_message: format!("{old_record:?}"),
  731. new_message: format!("{new_record:?}"),
  732. },
  733. );
  734. }
  735. HeaviestForkAggregateResult::ZeroStakeIgnored => (),
  736. HeaviestForkAggregateResult::AlreadyExists => (),
  737. HeaviestForkAggregateResult::Malformed => (),
  738. }
  739. }
  740. let current_total_active_stake = heaviest_fork_aggregate.total_active_stake();
  741. if current_total_active_stake > total_active_stake {
  742. total_active_stake = current_total_active_stake;
  743. progress
  744. .heaviest_fork_aggregate
  745. .as_mut()
  746. .unwrap()
  747. .total_active_stake = current_total_active_stake;
  748. }
  749. if old_progress != *progress {
  750. info!(
  751. "Total active stake: {} Total stake {}",
  752. heaviest_fork_aggregate.total_active_stake(),
  753. total_stake
  754. );
  755. write_wen_restart_records(wen_restart_path, progress)?;
  756. old_progress = progress.clone();
  757. }
  758. let elapsed = timestamp().saturating_sub(start);
  759. let time_left = GOSSIP_SLEEP_MILLIS.saturating_sub(elapsed);
  760. if time_left > 0 {
  761. sleep(Duration::from_millis(time_left));
  762. }
  763. // Print the block stake map after a while.
  764. if stat_printed_at.elapsed() > Duration::from_secs(COORDINATOR_STAT_PRINT_INTERVAL_SECONDS)
  765. {
  766. heaviest_fork_aggregate.print_block_stake_map();
  767. stat_printed_at = Instant::now();
  768. }
  769. }
  770. }
  771. pub(crate) fn repair_heaviest_fork(
  772. my_heaviest_fork_slot: Slot,
  773. heaviest_slot: Slot,
  774. exit: Arc<AtomicBool>,
  775. blockstore: Arc<Blockstore>,
  776. wen_restart_repair_slots: Arc<RwLock<Vec<Slot>>>,
  777. ) -> Result<()> {
  778. loop {
  779. if exit.load(Ordering::Relaxed) {
  780. return Err(WenRestartError::Exiting.into());
  781. }
  782. // Repair all ancestors of heaviest_slot (including itself) which are larger than
  783. // my_heaviest_fork_slot.
  784. let to_repair = if blockstore.meta(heaviest_slot).is_ok_and(|x| x.is_some()) {
  785. AncestorIterator::new_inclusive(heaviest_slot, &blockstore)
  786. .take_while(|slot| *slot > my_heaviest_fork_slot)
  787. .filter(|slot| !blockstore.is_full(*slot))
  788. .collect()
  789. } else {
  790. vec![heaviest_slot]
  791. };
  792. info!("wen_restart repair slots: {to_repair:?}");
  793. if to_repair.is_empty() {
  794. return Ok(()); // All blocks are full
  795. }
  796. *wen_restart_repair_slots.write().unwrap() = to_repair;
  797. sleep(Duration::from_millis(GOSSIP_SLEEP_MILLIS));
  798. }
  799. }
  800. pub(crate) fn verify_coordinator_heaviest_fork(
  801. my_heaviest_fork_slot: Slot,
  802. coordinator_heaviest_slot: Slot,
  803. coordinator_heaviest_hash: &Hash,
  804. bank_forks: Arc<RwLock<BankForks>>,
  805. blockstore: Arc<Blockstore>,
  806. exit: Arc<AtomicBool>,
  807. wen_restart_repair_slots: Arc<RwLock<Vec<Slot>>>,
  808. ) -> Result<()> {
  809. repair_heaviest_fork(
  810. my_heaviest_fork_slot,
  811. coordinator_heaviest_slot,
  812. exit.clone(),
  813. blockstore.clone(),
  814. wen_restart_repair_slots.clone(),
  815. )?;
  816. let root_slot = bank_forks.read().unwrap().root_bank().slot();
  817. let mut coordinator_heaviest_slot_ancestors: Vec<Slot> =
  818. AncestorIterator::new_inclusive(coordinator_heaviest_slot, &blockstore)
  819. .take_while(|slot| slot >= &root_slot)
  820. .collect();
  821. coordinator_heaviest_slot_ancestors.sort();
  822. if !coordinator_heaviest_slot_ancestors.contains(&root_slot) {
  823. return Err(WenRestartError::HeaviestForkOnLeaderOnDifferentFork(
  824. coordinator_heaviest_slot,
  825. root_slot,
  826. )
  827. .into());
  828. }
  829. if coordinator_heaviest_slot > my_heaviest_fork_slot
  830. && !coordinator_heaviest_slot_ancestors.contains(&my_heaviest_fork_slot)
  831. {
  832. return Err(WenRestartError::HeaviestForkOnLeaderOnDifferentFork(
  833. coordinator_heaviest_slot,
  834. my_heaviest_fork_slot,
  835. )
  836. .into());
  837. }
  838. if coordinator_heaviest_slot < my_heaviest_fork_slot
  839. && !AncestorIterator::new(my_heaviest_fork_slot, &blockstore)
  840. .any(|slot| slot == coordinator_heaviest_slot)
  841. {
  842. return Err(WenRestartError::HeaviestForkOnLeaderOnDifferentFork(
  843. coordinator_heaviest_slot,
  844. my_heaviest_fork_slot,
  845. )
  846. .into());
  847. }
  848. let my_bankhash = if !coordinator_heaviest_slot_ancestors.is_empty() {
  849. find_bankhash_of_heaviest_fork(
  850. coordinator_heaviest_slot,
  851. coordinator_heaviest_slot_ancestors,
  852. blockstore.clone(),
  853. bank_forks.clone(),
  854. &exit,
  855. )?
  856. } else {
  857. bank_forks
  858. .read()
  859. .unwrap()
  860. .get(coordinator_heaviest_slot)
  861. .unwrap()
  862. .hash()
  863. };
  864. if my_bankhash != *coordinator_heaviest_hash {
  865. return Err(WenRestartError::BankHashMismatch(
  866. coordinator_heaviest_slot,
  867. my_bankhash,
  868. *coordinator_heaviest_hash,
  869. )
  870. .into());
  871. }
  872. Ok(())
  873. }
  874. pub(crate) fn receive_restart_heaviest_fork(
  875. wen_restart_coordinator: Pubkey,
  876. cluster_info: Arc<ClusterInfo>,
  877. exit: Arc<AtomicBool>,
  878. progress: &mut WenRestartProgress,
  879. ) -> Result<(Slot, Hash)> {
  880. let mut cursor = solana_gossip::crds::Cursor::default();
  881. loop {
  882. if exit.load(Ordering::Relaxed) {
  883. return Err(WenRestartError::Exiting.into());
  884. }
  885. for new_heaviest_fork in cluster_info.get_restart_heaviest_fork(&mut cursor) {
  886. if new_heaviest_fork.from == wen_restart_coordinator {
  887. info!(
  888. "Received new heaviest fork from coordinator: {wen_restart_coordinator} \
  889. {new_heaviest_fork:?}"
  890. );
  891. let coordinator_heaviest_slot = new_heaviest_fork.last_slot;
  892. let coordinator_heaviest_hash = new_heaviest_fork.last_slot_hash;
  893. progress.coordinator_heaviest_fork = Some(HeaviestForkRecord {
  894. slot: coordinator_heaviest_slot,
  895. bankhash: coordinator_heaviest_hash.to_string(),
  896. total_active_stake: 0,
  897. wallclock: new_heaviest_fork.wallclock,
  898. shred_version: new_heaviest_fork.shred_version as u32,
  899. from: new_heaviest_fork.from.to_string(),
  900. });
  901. return Ok((coordinator_heaviest_slot, coordinator_heaviest_hash));
  902. }
  903. }
  904. sleep(Duration::from_millis(GOSSIP_SLEEP_MILLIS));
  905. }
  906. }
  907. pub(crate) fn send_and_receive_heaviest_fork(
  908. my_heaviest_fork_slot: Slot,
  909. my_heaviest_fork_hash: Hash,
  910. config: &WenRestartConfig,
  911. progress: &mut WenRestartProgress,
  912. pushfn: impl FnOnce(Slot, Hash),
  913. ) -> Result<(Slot, Hash)> {
  914. if config.cluster_info.id() == config.wen_restart_coordinator {
  915. pushfn(my_heaviest_fork_slot, my_heaviest_fork_hash);
  916. Ok((my_heaviest_fork_slot, my_heaviest_fork_hash))
  917. } else {
  918. let (coordinator_slot, coordinator_hash) = receive_restart_heaviest_fork(
  919. config.wen_restart_coordinator,
  920. config.cluster_info.clone(),
  921. config.exit.clone(),
  922. progress,
  923. )?;
  924. match verify_coordinator_heaviest_fork(
  925. my_heaviest_fork_slot,
  926. coordinator_slot,
  927. &coordinator_hash,
  928. config.bank_forks.clone(),
  929. config.blockstore.clone(),
  930. config.exit.clone(),
  931. config.wen_restart_repair_slots.clone().unwrap(),
  932. ) {
  933. Ok(()) => pushfn(coordinator_slot, coordinator_hash),
  934. Err(e) => {
  935. warn!("Failed to verify coordinator heaviest fork: {e:?}, exit soon");
  936. pushfn(my_heaviest_fork_slot, my_heaviest_fork_hash);
  937. // flush_push_queue only flushes the messages to crds, doesn't guarantee
  938. // sending them out, so we still need to wait for a while before exiting.
  939. config.cluster_info.flush_push_queue();
  940. sleep(Duration::from_millis(GOSSIP_SLEEP_MILLIS));
  941. return Err(e);
  942. }
  943. }
  944. Ok((coordinator_slot, coordinator_hash))
  945. }
  946. }
  947. #[derive(Clone)]
  948. pub struct WenRestartConfig {
  949. pub wen_restart_path: PathBuf,
  950. pub wen_restart_coordinator: Pubkey,
  951. pub last_vote: VoteTransaction,
  952. pub blockstore: Arc<Blockstore>,
  953. pub cluster_info: Arc<ClusterInfo>,
  954. pub bank_forks: Arc<RwLock<BankForks>>,
  955. pub wen_restart_repair_slots: Option<Arc<RwLock<Vec<Slot>>>>,
  956. pub wait_for_supermajority_threshold_percent: u64,
  957. pub snapshot_controller: Option<Arc<SnapshotController>>,
  958. pub abs_status: AbsStatus,
  959. pub genesis_config_hash: Hash,
  960. pub exit: Arc<AtomicBool>,
  961. }
  962. pub fn wait_for_wen_restart(config: WenRestartConfig) -> Result<()> {
  963. let (mut state, mut progress) = initialize(
  964. &config.wen_restart_path,
  965. config.last_vote.clone(),
  966. config.blockstore.clone(),
  967. )?;
  968. loop {
  969. state = match state {
  970. WenRestartProgressInternalState::Init {
  971. last_voted_fork_slots,
  972. last_vote_bankhash,
  973. } => {
  974. progress.my_last_voted_fork_slots = Some(send_restart_last_voted_fork_slots(
  975. config.cluster_info.clone(),
  976. &last_voted_fork_slots,
  977. last_vote_bankhash,
  978. )?);
  979. WenRestartProgressInternalState::Init {
  980. last_voted_fork_slots,
  981. last_vote_bankhash,
  982. }
  983. }
  984. WenRestartProgressInternalState::LastVotedForkSlots {
  985. last_voted_fork_slots,
  986. aggregate_final_result,
  987. } => {
  988. let final_result = match aggregate_final_result {
  989. Some(result) => result,
  990. None => aggregate_restart_last_voted_fork_slots(
  991. &config.wen_restart_path,
  992. config.wait_for_supermajority_threshold_percent,
  993. config.cluster_info.clone(),
  994. &last_voted_fork_slots,
  995. config.bank_forks.clone(),
  996. config.blockstore.clone(),
  997. config.wen_restart_repair_slots.clone().unwrap(),
  998. config.exit.clone(),
  999. &mut progress,
  1000. )?,
  1001. };
  1002. WenRestartProgressInternalState::LastVotedForkSlots {
  1003. last_voted_fork_slots,
  1004. aggregate_final_result: Some(final_result),
  1005. }
  1006. }
  1007. WenRestartProgressInternalState::FindHeaviestFork {
  1008. aggregate_final_result,
  1009. my_heaviest_fork,
  1010. } => {
  1011. let heaviest_fork = match my_heaviest_fork {
  1012. Some(heaviest_fork) => heaviest_fork,
  1013. None => {
  1014. let (slot, bankhash) = find_heaviest_fork(
  1015. aggregate_final_result.clone(),
  1016. config.bank_forks.clone(),
  1017. config.blockstore.clone(),
  1018. config.exit.clone(),
  1019. )?;
  1020. info!("Heaviest fork found: slot: {slot}, bankhash: {bankhash}");
  1021. HeaviestForkRecord {
  1022. slot,
  1023. bankhash: bankhash.to_string(),
  1024. total_active_stake: 0,
  1025. wallclock: 0,
  1026. shred_version: config.cluster_info.my_shred_version() as u32,
  1027. from: config.cluster_info.id().to_string(),
  1028. }
  1029. }
  1030. };
  1031. WenRestartProgressInternalState::FindHeaviestFork {
  1032. aggregate_final_result,
  1033. my_heaviest_fork: Some(heaviest_fork),
  1034. }
  1035. }
  1036. WenRestartProgressInternalState::HeaviestFork {
  1037. my_heaviest_fork_slot,
  1038. my_heaviest_fork_hash,
  1039. } => {
  1040. let (slot, hash) = send_and_receive_heaviest_fork(
  1041. my_heaviest_fork_slot,
  1042. my_heaviest_fork_hash,
  1043. &config,
  1044. &mut progress,
  1045. |slot, hash| {
  1046. config
  1047. .cluster_info
  1048. .push_restart_heaviest_fork(slot, hash, 0);
  1049. },
  1050. )?;
  1051. WenRestartProgressInternalState::HeaviestFork {
  1052. my_heaviest_fork_slot: slot,
  1053. my_heaviest_fork_hash: hash,
  1054. }
  1055. }
  1056. WenRestartProgressInternalState::GenerateSnapshot {
  1057. my_heaviest_fork_slot,
  1058. my_snapshot,
  1059. } => {
  1060. let snapshot_record = match my_snapshot {
  1061. Some(record) => record,
  1062. None => match &config.snapshot_controller {
  1063. Some(snapshot_controller) => generate_snapshot(
  1064. config.bank_forks.clone(),
  1065. snapshot_controller,
  1066. &config.abs_status,
  1067. config.genesis_config_hash,
  1068. my_heaviest_fork_slot,
  1069. ),
  1070. None => {
  1071. // Only tests don't have a snapshot controller
  1072. Err(WenRestartError::GenerateSnapshotWhenDisabled.into())
  1073. }
  1074. }?,
  1075. };
  1076. WenRestartProgressInternalState::GenerateSnapshot {
  1077. my_heaviest_fork_slot,
  1078. my_snapshot: Some(snapshot_record),
  1079. }
  1080. }
  1081. // Proceed to restart if we are ready to wait for supermajority.
  1082. WenRestartProgressInternalState::Done {
  1083. slot,
  1084. hash,
  1085. shred_version,
  1086. } => {
  1087. error!(
  1088. "Wen start finished, please remove --wen_restart and restart with \
  1089. --wait-for-supermajority {slot} --expected-bank-hash {hash} \
  1090. --expected-shred-version {shred_version} --no-snapshot-fetch",
  1091. );
  1092. if config.cluster_info.id() == config.wen_restart_coordinator {
  1093. aggregate_restart_heaviest_fork(
  1094. &config.wen_restart_path,
  1095. config.cluster_info.clone(),
  1096. config.bank_forks.clone(),
  1097. config.exit.clone(),
  1098. &mut progress,
  1099. )?;
  1100. }
  1101. return Ok(());
  1102. }
  1103. };
  1104. state = increment_and_write_wen_restart_records(
  1105. &config.wen_restart_path,
  1106. state,
  1107. &mut progress,
  1108. )?;
  1109. }
  1110. }
  1111. pub(crate) fn increment_and_write_wen_restart_records(
  1112. records_path: &PathBuf,
  1113. current_state: WenRestartProgressInternalState,
  1114. progress: &mut WenRestartProgress,
  1115. ) -> Result<WenRestartProgressInternalState> {
  1116. let new_state = match current_state {
  1117. WenRestartProgressInternalState::Init {
  1118. last_voted_fork_slots,
  1119. last_vote_bankhash: _,
  1120. } => {
  1121. progress.set_state(RestartState::LastVotedForkSlots);
  1122. WenRestartProgressInternalState::LastVotedForkSlots {
  1123. last_voted_fork_slots,
  1124. aggregate_final_result: None,
  1125. }
  1126. }
  1127. WenRestartProgressInternalState::LastVotedForkSlots {
  1128. last_voted_fork_slots: _,
  1129. aggregate_final_result,
  1130. } => {
  1131. if let Some(aggregate_final_result) = aggregate_final_result {
  1132. progress.set_state(RestartState::HeaviestFork);
  1133. if let Some(aggregate_record) = progress.last_voted_fork_slots_aggregate.as_mut() {
  1134. aggregate_record.final_result = Some(LastVotedForkSlotsAggregateFinal {
  1135. slots_stake_map: aggregate_final_result.slots_stake_map.clone(),
  1136. epoch_infos: aggregate_final_result
  1137. .epoch_info_vec
  1138. .iter()
  1139. .map(|info| LastVotedForkSlotsEpochInfoRecord {
  1140. epoch: info.epoch,
  1141. total_stake: info.total_stake,
  1142. actively_voting_stake: info.actively_voting_stake,
  1143. actively_voting_for_this_epoch_stake: info
  1144. .actively_voting_for_this_epoch_stake,
  1145. })
  1146. .collect(),
  1147. });
  1148. }
  1149. WenRestartProgressInternalState::FindHeaviestFork {
  1150. aggregate_final_result,
  1151. my_heaviest_fork: None,
  1152. }
  1153. } else {
  1154. return Err(
  1155. WenRestartError::UnexpectedState(RestartState::LastVotedForkSlots).into(),
  1156. );
  1157. }
  1158. }
  1159. WenRestartProgressInternalState::FindHeaviestFork {
  1160. aggregate_final_result: _,
  1161. my_heaviest_fork,
  1162. } => {
  1163. if let Some(my_heaviest_fork) = my_heaviest_fork {
  1164. progress.my_heaviest_fork = Some(my_heaviest_fork.clone());
  1165. WenRestartProgressInternalState::HeaviestFork {
  1166. my_heaviest_fork_slot: my_heaviest_fork.slot,
  1167. my_heaviest_fork_hash: Hash::from_str(&my_heaviest_fork.bankhash).unwrap(),
  1168. }
  1169. } else {
  1170. return Err(WenRestartError::UnexpectedState(RestartState::HeaviestFork).into());
  1171. }
  1172. }
  1173. WenRestartProgressInternalState::HeaviestFork {
  1174. my_heaviest_fork_slot,
  1175. ..
  1176. } => {
  1177. progress.set_state(RestartState::GenerateSnapshot);
  1178. WenRestartProgressInternalState::GenerateSnapshot {
  1179. my_heaviest_fork_slot,
  1180. my_snapshot: None,
  1181. }
  1182. }
  1183. WenRestartProgressInternalState::GenerateSnapshot {
  1184. my_heaviest_fork_slot: _,
  1185. my_snapshot,
  1186. } => {
  1187. if let Some(my_snapshot) = my_snapshot {
  1188. progress.set_state(RestartState::Done);
  1189. progress.my_snapshot = Some(my_snapshot.clone());
  1190. WenRestartProgressInternalState::Done {
  1191. slot: my_snapshot.slot,
  1192. hash: Hash::from_str(&my_snapshot.bankhash).unwrap(),
  1193. shred_version: my_snapshot.shred_version as u16,
  1194. }
  1195. } else {
  1196. return Err(WenRestartError::MissingSnapshotInProtobuf.into());
  1197. }
  1198. }
  1199. WenRestartProgressInternalState::Done { .. } => {
  1200. return Err(WenRestartError::UnexpectedState(RestartState::Done).into())
  1201. }
  1202. };
  1203. write_wen_restart_records(records_path, progress)?;
  1204. Ok(new_state)
  1205. }
  1206. pub(crate) fn initialize(
  1207. records_path: &PathBuf,
  1208. last_vote: VoteTransaction,
  1209. blockstore: Arc<Blockstore>,
  1210. ) -> Result<(WenRestartProgressInternalState, WenRestartProgress)> {
  1211. let progress = match read_wen_restart_records(records_path) {
  1212. Ok(progress) => progress,
  1213. Err(e) => {
  1214. let stdio_err = e.downcast_ref::<std::io::Error>();
  1215. if stdio_err.is_some_and(|e| e.kind() == std::io::ErrorKind::NotFound) {
  1216. info!("wen restart proto file not found at {records_path:?}, write init state");
  1217. let progress = WenRestartProgress {
  1218. state: RestartState::Init.into(),
  1219. ..Default::default()
  1220. };
  1221. write_wen_restart_records(records_path, &progress)?;
  1222. progress
  1223. } else {
  1224. return Err(e);
  1225. }
  1226. }
  1227. };
  1228. match progress.state() {
  1229. RestartState::Done => {
  1230. if let Some(my_snapshot) = progress.my_snapshot.as_ref() {
  1231. Ok((
  1232. WenRestartProgressInternalState::Done {
  1233. slot: my_snapshot.slot,
  1234. hash: Hash::from_str(&my_snapshot.bankhash).unwrap(),
  1235. shred_version: my_snapshot.shred_version as u16,
  1236. },
  1237. progress,
  1238. ))
  1239. } else {
  1240. Err(WenRestartError::MissingSnapshotInProtobuf.into())
  1241. }
  1242. }
  1243. RestartState::Init => {
  1244. let last_voted_fork_slots;
  1245. let last_vote_bankhash;
  1246. match &progress.my_last_voted_fork_slots {
  1247. Some(my_last_voted_fork_slots) => {
  1248. last_voted_fork_slots = my_last_voted_fork_slots.last_voted_fork_slots.clone();
  1249. last_vote_bankhash =
  1250. Hash::from_str(&my_last_voted_fork_slots.last_vote_bankhash).unwrap();
  1251. }
  1252. None => {
  1253. // repair and restart option does not work without last voted slot.
  1254. if let Some(last_vote_slot) = last_vote.last_voted_slot() {
  1255. last_vote_bankhash = last_vote.hash();
  1256. last_voted_fork_slots =
  1257. AncestorIterator::new_inclusive(last_vote_slot, &blockstore)
  1258. .take(RestartLastVotedForkSlots::MAX_SLOTS)
  1259. .collect();
  1260. } else {
  1261. error!(
  1262. "Cannot find last voted slot in the tower storage, it either means \
  1263. that this node has never voted or the tower storage is corrupted. \
  1264. Unfortunately, since WenRestart is a consensus protocol depending on \
  1265. each participant to send their last voted fork slots, your validator \
  1266. cannot participate.Please check discord for the conclusion of the \
  1267. WenRestart protocol, then generate a snapshot and use \
  1268. --wait-for-supermajority to restart the validator."
  1269. );
  1270. return Err(WenRestartError::MissingLastVotedForkSlots.into());
  1271. }
  1272. }
  1273. }
  1274. Ok((
  1275. WenRestartProgressInternalState::Init {
  1276. last_voted_fork_slots,
  1277. last_vote_bankhash,
  1278. },
  1279. progress,
  1280. ))
  1281. }
  1282. RestartState::LastVotedForkSlots => {
  1283. if let Some(record) = progress.my_last_voted_fork_slots.as_ref() {
  1284. Ok((
  1285. WenRestartProgressInternalState::LastVotedForkSlots {
  1286. last_voted_fork_slots: record.last_voted_fork_slots.clone(),
  1287. aggregate_final_result: progress
  1288. .last_voted_fork_slots_aggregate
  1289. .as_ref()
  1290. .and_then(|r| {
  1291. r.final_result.as_ref().map(|result| {
  1292. LastVotedForkSlotsFinalResult {
  1293. slots_stake_map: result.slots_stake_map.clone(),
  1294. epoch_info_vec: result
  1295. .epoch_infos
  1296. .iter()
  1297. .map(|info| LastVotedForkSlotsEpochInfo {
  1298. epoch: info.epoch,
  1299. total_stake: info.total_stake,
  1300. actively_voting_stake: info.actively_voting_stake,
  1301. actively_voting_for_this_epoch_stake: info
  1302. .actively_voting_for_this_epoch_stake,
  1303. })
  1304. .collect(),
  1305. }
  1306. })
  1307. }),
  1308. },
  1309. progress,
  1310. ))
  1311. } else {
  1312. Err(WenRestartError::MalformedLastVotedForkSlotsProtobuf(None).into())
  1313. }
  1314. }
  1315. RestartState::HeaviestFork => Ok((
  1316. WenRestartProgressInternalState::FindHeaviestFork {
  1317. aggregate_final_result: progress
  1318. .last_voted_fork_slots_aggregate
  1319. .as_ref()
  1320. .and_then(|r| {
  1321. r.final_result
  1322. .as_ref()
  1323. .map(|result| LastVotedForkSlotsFinalResult {
  1324. slots_stake_map: result.slots_stake_map.clone(),
  1325. epoch_info_vec: result
  1326. .epoch_infos
  1327. .iter()
  1328. .map(|info| LastVotedForkSlotsEpochInfo {
  1329. epoch: info.epoch,
  1330. total_stake: info.total_stake,
  1331. actively_voting_stake: info.actively_voting_stake,
  1332. actively_voting_for_this_epoch_stake: info
  1333. .actively_voting_for_this_epoch_stake,
  1334. })
  1335. .collect(),
  1336. })
  1337. })
  1338. .ok_or(WenRestartError::MalformedProgress(
  1339. RestartState::HeaviestFork,
  1340. "final_result in last_voted_fork_slots_aggregate".to_string(),
  1341. ))?,
  1342. my_heaviest_fork: progress.my_heaviest_fork.clone(),
  1343. },
  1344. progress,
  1345. )),
  1346. RestartState::GenerateSnapshot => Ok((
  1347. WenRestartProgressInternalState::GenerateSnapshot {
  1348. my_heaviest_fork_slot: progress
  1349. .my_heaviest_fork
  1350. .as_ref()
  1351. .ok_or(WenRestartError::MalformedProgress(
  1352. RestartState::GenerateSnapshot,
  1353. "my_heaviest_fork".to_string(),
  1354. ))?
  1355. .slot,
  1356. my_snapshot: progress.my_snapshot.clone(),
  1357. },
  1358. progress,
  1359. )),
  1360. }
  1361. }
  1362. fn read_wen_restart_records(records_path: &PathBuf) -> Result<WenRestartProgress> {
  1363. let buffer = read(records_path)?;
  1364. let progress = WenRestartProgress::decode(&mut Cursor::new(buffer))?;
  1365. info!("read record {progress:?}");
  1366. Ok(progress)
  1367. }
  1368. pub(crate) fn write_wen_restart_records(
  1369. records_path: &PathBuf,
  1370. new_progress: &WenRestartProgress,
  1371. ) -> Result<()> {
  1372. // overwrite anything if exists
  1373. let mut file = File::create(records_path)?;
  1374. info!("writing new record {new_progress:?}");
  1375. let mut buf = Vec::with_capacity(new_progress.encoded_len());
  1376. new_progress.encode(&mut buf)?;
  1377. file.write_all(&buf)?;
  1378. Ok(())
  1379. }
  1380. #[cfg(test)]
  1381. mod tests {
  1382. use {
  1383. crate::wen_restart::{tests::wen_restart_proto::LastVotedForkSlotsAggregateFinal, *},
  1384. agave_snapshots::{
  1385. paths::build_incremental_snapshot_archive_path,
  1386. snapshot_config::{SnapshotConfig, SnapshotUsage},
  1387. snapshot_hash::SnapshotHash,
  1388. },
  1389. crossbeam_channel::unbounded,
  1390. solana_entry::entry::create_ticks,
  1391. solana_genesis_utils::MAX_GENESIS_ARCHIVE_UNPACKED_SIZE,
  1392. solana_gossip::{
  1393. cluster_info::ClusterInfo,
  1394. contact_info::ContactInfo,
  1395. crds::GossipRoute,
  1396. crds_data::CrdsData,
  1397. crds_value::CrdsValue,
  1398. restart_crds_values::{RestartHeaviestFork, RestartLastVotedForkSlots},
  1399. },
  1400. solana_hash::Hash,
  1401. solana_keypair::Keypair,
  1402. solana_ledger::{
  1403. blockstore::{create_new_ledger, entries_to_test_shreds, Blockstore},
  1404. blockstore_options::LedgerColumnOptions,
  1405. blockstore_processor::{fill_blockstore_slot_with_ticks, test_process_blockstore},
  1406. get_tmp_ledger_path_auto_delete,
  1407. },
  1408. solana_net_utils::SocketAddrSpace,
  1409. solana_pubkey::Pubkey,
  1410. solana_runtime::{
  1411. epoch_stakes::VersionedEpochStakes,
  1412. genesis_utils::{
  1413. create_genesis_config_with_vote_accounts, GenesisConfigInfo, ValidatorVoteKeypairs,
  1414. },
  1415. snapshot_bank_utils::bank_to_full_snapshot_archive,
  1416. },
  1417. solana_signer::Signer,
  1418. solana_time_utils::timestamp,
  1419. solana_vote::vote_account::VoteAccount,
  1420. solana_vote_interface::state::{TowerSync, Vote},
  1421. solana_vote_program::vote_state::create_v4_account_with_authorized,
  1422. std::{fs::remove_file, sync::Arc, thread::Builder},
  1423. tempfile::TempDir,
  1424. };
  1425. const SHRED_VERSION: u16 = 2;
  1426. const EXPECTED_SLOTS: Slot = 40;
  1427. const TICKS_PER_SLOT: u64 = 2;
  1428. const TOTAL_VALIDATOR_COUNT: u16 = 20;
  1429. const MY_INDEX: usize = TOTAL_VALIDATOR_COUNT as usize - 1;
  1430. const COORDINATOR_INDEX: usize = 0;
  1431. const WAIT_FOR_THREAD_TIMEOUT: u64 = 10_000;
  1432. const WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT: u64 = 80;
  1433. const NON_CONFORMING_VALIDATOR_PERCENT: u64 = 5;
  1434. fn push_restart_last_voted_fork_slots(
  1435. cluster_info: Arc<ClusterInfo>,
  1436. node: &ContactInfo,
  1437. last_voted_fork_slots: &[Slot],
  1438. last_vote_hash: &Hash,
  1439. node_keypair: &Keypair,
  1440. wallclock: u64,
  1441. ) {
  1442. let slots = RestartLastVotedForkSlots::new(
  1443. *node.pubkey(),
  1444. wallclock,
  1445. last_voted_fork_slots,
  1446. *last_vote_hash,
  1447. SHRED_VERSION,
  1448. )
  1449. .unwrap();
  1450. let entries = vec![
  1451. CrdsValue::new(CrdsData::from(node), node_keypair),
  1452. CrdsValue::new(CrdsData::RestartLastVotedForkSlots(slots), node_keypair),
  1453. ];
  1454. {
  1455. let mut gossip_crds = cluster_info.gossip.crds.write().unwrap();
  1456. for entry in entries {
  1457. assert!(gossip_crds
  1458. .insert(entry, /*now=*/ 0, GossipRoute::LocalMessage)
  1459. .is_ok());
  1460. }
  1461. }
  1462. }
  1463. fn push_restart_heaviest_fork(
  1464. cluster_info: Arc<ClusterInfo>,
  1465. node: &ContactInfo,
  1466. heaviest_fork_slot: Slot,
  1467. heaviest_fork_hash: &Hash,
  1468. observed_stake: u64,
  1469. node_keypair: &Keypair,
  1470. wallclock: u64,
  1471. ) {
  1472. let heaviest_fork = RestartHeaviestFork {
  1473. from: *node.pubkey(),
  1474. wallclock,
  1475. last_slot: heaviest_fork_slot,
  1476. last_slot_hash: *heaviest_fork_hash,
  1477. observed_stake,
  1478. shred_version: SHRED_VERSION,
  1479. };
  1480. assert!(cluster_info
  1481. .gossip
  1482. .crds
  1483. .write()
  1484. .unwrap()
  1485. .insert(
  1486. CrdsValue::new(CrdsData::RestartHeaviestFork(heaviest_fork), node_keypair),
  1487. /*now=*/ 0,
  1488. GossipRoute::LocalMessage
  1489. )
  1490. .is_ok());
  1491. }
  1492. struct WenRestartTestInitResult {
  1493. pub validator_voting_keypairs: Vec<ValidatorVoteKeypairs>,
  1494. pub blockstore: Arc<Blockstore>,
  1495. pub cluster_info: Arc<ClusterInfo>,
  1496. pub bank_forks: Arc<RwLock<BankForks>>,
  1497. pub last_voted_fork_slots: Vec<Slot>,
  1498. pub wen_restart_proto_path: PathBuf,
  1499. pub wen_restart_coordinator: Pubkey,
  1500. pub last_blockhash: Hash,
  1501. pub genesis_config_hash: Hash,
  1502. }
  1503. fn insert_slots_into_blockstore(
  1504. blockstore: Arc<Blockstore>,
  1505. first_parent: Slot,
  1506. slots_to_insert: &[Slot],
  1507. entries_per_slot: u64,
  1508. start_blockhash: Hash,
  1509. ) -> Hash {
  1510. let mut last_hash = start_blockhash;
  1511. let mut last_parent = first_parent;
  1512. for i in slots_to_insert {
  1513. last_hash = fill_blockstore_slot_with_ticks(
  1514. &blockstore,
  1515. entries_per_slot,
  1516. *i,
  1517. last_parent,
  1518. last_hash,
  1519. );
  1520. last_parent = *i;
  1521. }
  1522. last_hash
  1523. }
  1524. fn wen_restart_test_init(ledger_path: &TempDir) -> WenRestartTestInitResult {
  1525. let validator_voting_keypairs: Vec<_> = (0..TOTAL_VALIDATOR_COUNT)
  1526. .map(|_| ValidatorVoteKeypairs::new_rand())
  1527. .collect();
  1528. let node_keypair = Arc::new(
  1529. validator_voting_keypairs[MY_INDEX]
  1530. .node_keypair
  1531. .insecure_clone(),
  1532. );
  1533. let wen_restart_coordinator = validator_voting_keypairs[COORDINATOR_INDEX]
  1534. .node_keypair
  1535. .pubkey();
  1536. let cluster_info = Arc::new(ClusterInfo::new(
  1537. {
  1538. let mut contact_info =
  1539. ContactInfo::new_localhost(&node_keypair.pubkey(), timestamp());
  1540. contact_info.set_shred_version(SHRED_VERSION);
  1541. contact_info
  1542. },
  1543. node_keypair.clone(),
  1544. SocketAddrSpace::Unspecified,
  1545. ));
  1546. let GenesisConfigInfo {
  1547. mut genesis_config, ..
  1548. } = create_genesis_config_with_vote_accounts(
  1549. 10_000,
  1550. &validator_voting_keypairs,
  1551. vec![100; validator_voting_keypairs.len()],
  1552. );
  1553. genesis_config.ticks_per_slot = TICKS_PER_SLOT;
  1554. let start_blockhash = create_new_ledger(
  1555. ledger_path.path(),
  1556. &genesis_config,
  1557. MAX_GENESIS_ARCHIVE_UNPACKED_SIZE,
  1558. LedgerColumnOptions::default(),
  1559. )
  1560. .unwrap();
  1561. let blockstore = Arc::new(Blockstore::open(ledger_path.path()).unwrap());
  1562. let (bank_forks, ..) = test_process_blockstore(
  1563. &genesis_config,
  1564. &blockstore,
  1565. &ProcessOptions {
  1566. run_verification: true,
  1567. ..ProcessOptions::default()
  1568. },
  1569. Arc::default(),
  1570. );
  1571. let mut last_blockhash = start_blockhash;
  1572. // Skip block 1, 2 links directly to 0.
  1573. let last_parent: Slot = 2;
  1574. let mut last_voted_fork_slots: Vec<Slot> = Vec::new();
  1575. last_voted_fork_slots
  1576. .extend(last_parent..last_parent.saturating_add(EXPECTED_SLOTS).saturating_add(1));
  1577. last_blockhash = insert_slots_into_blockstore(
  1578. blockstore.clone(),
  1579. 0,
  1580. &last_voted_fork_slots,
  1581. genesis_config.ticks_per_slot,
  1582. last_blockhash,
  1583. );
  1584. last_voted_fork_slots.insert(0, 0);
  1585. last_voted_fork_slots.reverse();
  1586. let mut wen_restart_proto_path = ledger_path.path().to_path_buf();
  1587. wen_restart_proto_path.push("wen_restart_status.proto");
  1588. let _ = remove_file(&wen_restart_proto_path);
  1589. WenRestartTestInitResult {
  1590. validator_voting_keypairs,
  1591. blockstore,
  1592. cluster_info,
  1593. bank_forks,
  1594. last_voted_fork_slots,
  1595. wen_restart_proto_path,
  1596. wen_restart_coordinator,
  1597. last_blockhash,
  1598. genesis_config_hash: genesis_config.hash(),
  1599. }
  1600. }
  1601. fn wait_on_expected_progress_with_timeout(
  1602. wen_restart_proto_path: PathBuf,
  1603. expected_progress: WenRestartProgress,
  1604. ) {
  1605. let start = timestamp();
  1606. let mut progress = WenRestartProgress {
  1607. state: RestartState::Init.into(),
  1608. ..Default::default()
  1609. };
  1610. loop {
  1611. if let Ok(new_progress) = read_wen_restart_records(&wen_restart_proto_path) {
  1612. progress = new_progress;
  1613. if let Some(my_last_voted_fork_slots) = &expected_progress.my_last_voted_fork_slots
  1614. {
  1615. if let Some(record) = progress.my_last_voted_fork_slots.as_mut() {
  1616. record.wallclock = my_last_voted_fork_slots.wallclock;
  1617. }
  1618. }
  1619. if progress == expected_progress {
  1620. return;
  1621. }
  1622. }
  1623. if timestamp().saturating_sub(start) > WAIT_FOR_THREAD_TIMEOUT {
  1624. assert_eq!(
  1625. progress.my_last_voted_fork_slots,
  1626. expected_progress.my_last_voted_fork_slots
  1627. );
  1628. assert_eq!(
  1629. progress.last_voted_fork_slots_aggregate,
  1630. expected_progress.last_voted_fork_slots_aggregate
  1631. );
  1632. panic!(
  1633. "wait_on_expected_progress_with_timeout failed to get expected progress {:?} \
  1634. expected {:?}",
  1635. &progress, expected_progress
  1636. );
  1637. }
  1638. sleep(Duration::from_millis(10));
  1639. }
  1640. }
  1641. fn wen_restart_test_succeed_after_failure(
  1642. test_state: WenRestartTestInitResult,
  1643. last_vote_bankhash: Hash,
  1644. expected_progress: WenRestartProgress,
  1645. ) {
  1646. // continue normally after the error, we should be good.
  1647. let exit = Arc::new(AtomicBool::new(false));
  1648. let last_vote_slot: Slot = test_state.last_voted_fork_slots[0];
  1649. let wen_restart_config = WenRestartConfig {
  1650. wen_restart_path: test_state.wen_restart_proto_path.clone(),
  1651. wen_restart_coordinator: test_state.wen_restart_coordinator,
  1652. last_vote: VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)),
  1653. blockstore: test_state.blockstore.clone(),
  1654. cluster_info: test_state.cluster_info.clone(),
  1655. bank_forks: test_state.bank_forks.clone(),
  1656. wen_restart_repair_slots: Some(Arc::new(RwLock::new(Vec::new()))),
  1657. wait_for_supermajority_threshold_percent: 80,
  1658. snapshot_controller: None,
  1659. abs_status: AbsStatus::new_for_tests(),
  1660. genesis_config_hash: test_state.genesis_config_hash,
  1661. exit: exit.clone(),
  1662. };
  1663. let wen_restart_thread_handle = Builder::new()
  1664. .name("solana-wen-restart".to_string())
  1665. .spawn(move || {
  1666. let _ = wait_for_wen_restart(wen_restart_config).is_ok();
  1667. })
  1668. .unwrap();
  1669. wait_on_expected_progress_with_timeout(
  1670. test_state.wen_restart_proto_path.clone(),
  1671. expected_progress,
  1672. );
  1673. exit.store(true, Ordering::Relaxed);
  1674. assert!(wen_restart_thread_handle.join().is_ok());
  1675. let _ = remove_file(&test_state.wen_restart_proto_path);
  1676. }
  1677. #[test]
  1678. fn test_wen_restart_normal_flow() {
  1679. let ledger_path = get_tmp_ledger_path_auto_delete!();
  1680. let wen_restart_repair_slots = Some(Arc::new(RwLock::new(Vec::new())));
  1681. let test_state = wen_restart_test_init(&ledger_path);
  1682. let last_vote_slot = test_state.last_voted_fork_slots[0];
  1683. let last_vote_bankhash = Hash::new_unique();
  1684. let expected_slots_to_repair: Vec<Slot> =
  1685. (last_vote_slot + 1..last_vote_slot + 3).collect();
  1686. let my_pubkey = &test_state.validator_voting_keypairs[MY_INDEX]
  1687. .node_keypair
  1688. .pubkey();
  1689. let bank_snapshots_dir = tempfile::TempDir::new().unwrap();
  1690. let full_snapshot_archives_dir = tempfile::TempDir::new().unwrap();
  1691. let incremental_snapshot_archives_dir = tempfile::TempDir::new().unwrap();
  1692. let snapshot_config = SnapshotConfig {
  1693. bank_snapshots_dir: bank_snapshots_dir.as_ref().to_path_buf(),
  1694. full_snapshot_archives_dir: full_snapshot_archives_dir.as_ref().to_path_buf(),
  1695. incremental_snapshot_archives_dir: incremental_snapshot_archives_dir
  1696. .as_ref()
  1697. .to_path_buf(),
  1698. ..Default::default()
  1699. };
  1700. let old_root_bank = test_state.bank_forks.read().unwrap().root_bank();
  1701. // Trigger full snapshot generation on the old root bank.
  1702. assert!(bank_to_full_snapshot_archive(
  1703. snapshot_config.bank_snapshots_dir.clone(),
  1704. &old_root_bank,
  1705. Some(snapshot_config.snapshot_version),
  1706. snapshot_config.full_snapshot_archives_dir.clone(),
  1707. snapshot_config.incremental_snapshot_archives_dir.clone(),
  1708. snapshot_config.archive_format,
  1709. )
  1710. .is_ok());
  1711. let exit = Arc::new(AtomicBool::new(false));
  1712. let (abs_request_sender, _abs_request_receiver) = unbounded();
  1713. let snapshot_controller =
  1714. SnapshotController::new(abs_request_sender, snapshot_config, last_vote_slot);
  1715. let wen_restart_config = WenRestartConfig {
  1716. wen_restart_path: test_state.wen_restart_proto_path.clone(),
  1717. wen_restart_coordinator: test_state.wen_restart_coordinator,
  1718. last_vote: VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash)),
  1719. blockstore: test_state.blockstore.clone(),
  1720. cluster_info: test_state.cluster_info.clone(),
  1721. bank_forks: test_state.bank_forks.clone(),
  1722. wen_restart_repair_slots: wen_restart_repair_slots.clone(),
  1723. wait_for_supermajority_threshold_percent: 80,
  1724. snapshot_controller: Some(Arc::new(snapshot_controller)),
  1725. abs_status: AbsStatus::new_for_tests(),
  1726. genesis_config_hash: test_state.genesis_config_hash,
  1727. exit: exit.clone(),
  1728. };
  1729. let wen_restart_thread_handle = Builder::new()
  1730. .name("solana-wen-restart".to_string())
  1731. .spawn(move || {
  1732. assert!(wait_for_wen_restart(wen_restart_config).is_ok());
  1733. })
  1734. .unwrap();
  1735. let mut rng = rand::thread_rng();
  1736. let mut expected_received_last_voted_fork_slots = HashMap::new();
  1737. let validators_to_take: usize =
  1738. (WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT * TOTAL_VALIDATOR_COUNT as u64 / 100 - 1)
  1739. .try_into()
  1740. .unwrap();
  1741. let mut last_voted_fork_slots_from_others = test_state.last_voted_fork_slots.clone();
  1742. last_voted_fork_slots_from_others.reverse();
  1743. last_voted_fork_slots_from_others.append(&mut expected_slots_to_repair.clone());
  1744. for keypairs in test_state
  1745. .validator_voting_keypairs
  1746. .iter()
  1747. .take(validators_to_take)
  1748. {
  1749. let node_pubkey = keypairs.node_keypair.pubkey();
  1750. let node = ContactInfo::new_rand(&mut rng, Some(node_pubkey));
  1751. let last_vote_hash = Hash::new_unique();
  1752. let now = timestamp();
  1753. push_restart_last_voted_fork_slots(
  1754. test_state.cluster_info.clone(),
  1755. &node,
  1756. &last_voted_fork_slots_from_others,
  1757. &last_vote_hash,
  1758. &keypairs.node_keypair,
  1759. now,
  1760. );
  1761. expected_received_last_voted_fork_slots.insert(
  1762. node_pubkey.to_string(),
  1763. LastVotedForkSlotsRecord {
  1764. last_voted_fork_slots: last_voted_fork_slots_from_others.clone(),
  1765. last_vote_bankhash: last_vote_hash.to_string(),
  1766. shred_version: SHRED_VERSION as u32,
  1767. wallclock: now,
  1768. },
  1769. );
  1770. }
  1771. // Simulating successful repair of missing blocks.
  1772. let _ = insert_slots_into_blockstore(
  1773. test_state.blockstore.clone(),
  1774. last_vote_slot,
  1775. &expected_slots_to_repair,
  1776. TICKS_PER_SLOT,
  1777. test_state.last_blockhash,
  1778. );
  1779. let my_heaviest_fork_slot = last_vote_slot + 2;
  1780. let my_heaviest_fork_bankhash;
  1781. loop {
  1782. if let Some(bank) = test_state
  1783. .bank_forks
  1784. .read()
  1785. .unwrap()
  1786. .get(my_heaviest_fork_slot)
  1787. {
  1788. // When deciding the local heaviest fork, we will freeze the bank.
  1789. if bank.is_frozen() {
  1790. my_heaviest_fork_bankhash = bank.hash();
  1791. break;
  1792. }
  1793. }
  1794. sleep(Duration::from_millis(100));
  1795. }
  1796. // Now simulate receiving HeaviestFork messages from coordinator.
  1797. let coordinator_heaviest_fork_slot = my_heaviest_fork_slot - 1;
  1798. let coordinator_heaviest_fork_bankhash = test_state
  1799. .bank_forks
  1800. .read()
  1801. .unwrap()
  1802. .get(coordinator_heaviest_fork_slot)
  1803. .unwrap()
  1804. .hash();
  1805. let coordinator_keypair =
  1806. &test_state.validator_voting_keypairs[COORDINATOR_INDEX].node_keypair;
  1807. let node = ContactInfo::new_rand(&mut rng, Some(coordinator_keypair.pubkey()));
  1808. let now = timestamp();
  1809. push_restart_heaviest_fork(
  1810. test_state.cluster_info.clone(),
  1811. &node,
  1812. coordinator_heaviest_fork_slot,
  1813. &coordinator_heaviest_fork_bankhash,
  1814. 0,
  1815. coordinator_keypair,
  1816. now,
  1817. );
  1818. assert!(wen_restart_thread_handle.join().is_ok());
  1819. exit.store(true, Ordering::Relaxed);
  1820. let progress = read_wen_restart_records(&test_state.wen_restart_proto_path).unwrap();
  1821. let progress_start_time = progress
  1822. .my_last_voted_fork_slots
  1823. .as_ref()
  1824. .unwrap()
  1825. .wallclock;
  1826. let mut expected_slots_stake_map: HashMap<Slot, u64> = test_state
  1827. .last_voted_fork_slots
  1828. .iter()
  1829. .map(|slot| {
  1830. (
  1831. *slot,
  1832. WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT * TOTAL_VALIDATOR_COUNT as u64,
  1833. )
  1834. })
  1835. .collect();
  1836. let stake_for_new_slots = validators_to_take as u64 * 100;
  1837. expected_slots_stake_map.extend(
  1838. expected_slots_to_repair
  1839. .iter()
  1840. .map(|slot| (*slot, stake_for_new_slots)),
  1841. );
  1842. let voted_stake = (validators_to_take + 1) as u64 * 100;
  1843. assert_eq!(
  1844. progress,
  1845. WenRestartProgress {
  1846. state: RestartState::Done.into(),
  1847. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  1848. last_voted_fork_slots: test_state.last_voted_fork_slots,
  1849. last_vote_bankhash: last_vote_bankhash.to_string(),
  1850. shred_version: SHRED_VERSION as u32,
  1851. wallclock: progress_start_time,
  1852. }),
  1853. last_voted_fork_slots_aggregate: Some(LastVotedForkSlotsAggregateRecord {
  1854. received: expected_received_last_voted_fork_slots,
  1855. final_result: Some(LastVotedForkSlotsAggregateFinal {
  1856. slots_stake_map: expected_slots_stake_map,
  1857. epoch_infos: vec![
  1858. LastVotedForkSlotsEpochInfoRecord {
  1859. epoch: 0,
  1860. total_stake: 2000,
  1861. actively_voting_stake: voted_stake,
  1862. actively_voting_for_this_epoch_stake: voted_stake,
  1863. },
  1864. LastVotedForkSlotsEpochInfoRecord {
  1865. epoch: 1,
  1866. total_stake: 2000,
  1867. actively_voting_stake: voted_stake,
  1868. actively_voting_for_this_epoch_stake: voted_stake,
  1869. },
  1870. ],
  1871. }),
  1872. }),
  1873. my_heaviest_fork: Some(HeaviestForkRecord {
  1874. slot: my_heaviest_fork_slot,
  1875. bankhash: my_heaviest_fork_bankhash.to_string(),
  1876. total_active_stake: 0,
  1877. shred_version: SHRED_VERSION as u32,
  1878. wallclock: 0,
  1879. from: my_pubkey.to_string(),
  1880. }),
  1881. heaviest_fork_aggregate: None,
  1882. my_snapshot: Some(GenerateSnapshotRecord {
  1883. slot: coordinator_heaviest_fork_slot,
  1884. bankhash: progress.my_snapshot.as_ref().unwrap().bankhash.clone(),
  1885. shred_version: progress.my_snapshot.as_ref().unwrap().shred_version,
  1886. path: progress.my_snapshot.as_ref().unwrap().path.clone(),
  1887. }),
  1888. coordinator_heaviest_fork: Some(HeaviestForkRecord {
  1889. slot: coordinator_heaviest_fork_slot,
  1890. bankhash: coordinator_heaviest_fork_bankhash.to_string(),
  1891. total_active_stake: 0,
  1892. shred_version: SHRED_VERSION as u32,
  1893. wallclock: progress
  1894. .coordinator_heaviest_fork
  1895. .as_ref()
  1896. .unwrap()
  1897. .wallclock,
  1898. from: coordinator_keypair.pubkey().to_string(),
  1899. }),
  1900. ..Default::default()
  1901. }
  1902. );
  1903. }
  1904. fn change_proto_file_readonly(wen_restart_proto_path: &PathBuf, readonly: bool) {
  1905. let mut perms = std::fs::metadata(wen_restart_proto_path)
  1906. .unwrap()
  1907. .permissions();
  1908. perms.set_readonly(readonly);
  1909. std::fs::set_permissions(wen_restart_proto_path, perms).unwrap();
  1910. }
  1911. #[test]
  1912. fn test_wen_restart_divergence_across_epoch_boundary() {
  1913. agave_logger::setup();
  1914. let ledger_path = get_tmp_ledger_path_auto_delete!();
  1915. let test_state = wen_restart_test_init(&ledger_path);
  1916. let last_vote_slot = test_state.last_voted_fork_slots[0];
  1917. let old_root_bank = test_state.bank_forks.read().unwrap().root_bank();
  1918. // Add bank last_vote + 1 linking directly to 0, tweak its epoch_stakes, and then add it to bank_forks.
  1919. let my_heaviest_fork_slot = last_vote_slot + 1;
  1920. let mut new_root_bank = Bank::new_from_parent(
  1921. old_root_bank.clone(),
  1922. &Pubkey::default(),
  1923. my_heaviest_fork_slot,
  1924. );
  1925. assert_eq!(new_root_bank.epoch(), 1);
  1926. // For epoch 2, make validator 0 have 90% of the stake.
  1927. let vote_accounts_hash_map = test_state
  1928. .validator_voting_keypairs
  1929. .iter()
  1930. .enumerate()
  1931. .map(|(i, keypairs)| {
  1932. let stake = if i == 0 {
  1933. 900 * (TOTAL_VALIDATOR_COUNT - 1) as u64
  1934. } else {
  1935. 100
  1936. };
  1937. let authorized_voter = keypairs.vote_keypair.pubkey();
  1938. let node_id = keypairs.node_keypair.pubkey();
  1939. (
  1940. authorized_voter,
  1941. (
  1942. stake,
  1943. VoteAccount::try_from(create_v4_account_with_authorized(
  1944. &node_id,
  1945. &authorized_voter,
  1946. &node_id,
  1947. None,
  1948. 0,
  1949. 100,
  1950. ))
  1951. .unwrap(),
  1952. ),
  1953. )
  1954. })
  1955. .collect();
  1956. let epoch2_epoch_stakes = VersionedEpochStakes::new_for_tests(vote_accounts_hash_map, 2);
  1957. new_root_bank.set_epoch_stakes_for_test(2, epoch2_epoch_stakes);
  1958. let _ = insert_slots_into_blockstore(
  1959. test_state.blockstore.clone(),
  1960. 0,
  1961. &[my_heaviest_fork_slot],
  1962. TICKS_PER_SLOT,
  1963. old_root_bank.last_blockhash(),
  1964. );
  1965. let replay_tx_thread_pool = rayon::ThreadPoolBuilder::new()
  1966. .thread_name(|i| format!("solReplayTx{i:02}"))
  1967. .build()
  1968. .expect("new rayon threadpool");
  1969. let recyclers = VerifyRecyclers::default();
  1970. let mut timing = ExecuteTimings::default();
  1971. let opts = ProcessOptions::default();
  1972. let mut progress = ConfirmationProgress::new(old_root_bank.last_blockhash());
  1973. let last_vote_bankhash = new_root_bank.hash();
  1974. let bank_with_scheduler = test_state
  1975. .bank_forks
  1976. .write()
  1977. .unwrap()
  1978. .insert_from_ledger(new_root_bank);
  1979. if let Err(e) = process_single_slot(
  1980. &test_state.blockstore,
  1981. &bank_with_scheduler,
  1982. &replay_tx_thread_pool,
  1983. &opts,
  1984. &recyclers,
  1985. &mut progress,
  1986. None,
  1987. None,
  1988. None,
  1989. &mut timing,
  1990. ) {
  1991. panic!("process_single_slot failed: {e:?}");
  1992. }
  1993. {
  1994. let mut bank_forks = test_state.bank_forks.write().unwrap();
  1995. let _ = bank_forks.set_root(last_vote_slot + 1, None, Some(last_vote_slot + 1));
  1996. }
  1997. let new_root_bank = test_state
  1998. .bank_forks
  1999. .read()
  2000. .unwrap()
  2001. .get(last_vote_slot + 1)
  2002. .unwrap();
  2003. // Add two more banks: old_epoch_bank (slot = last_vote_slot + 2) and
  2004. // new_epoch_bank (slot = first slot in epoch 2). They both link to last_vote_slot + 1.
  2005. // old_epoch_bank has everyone's votes except 0, so it has > 66% stake in the old epoch.
  2006. // new_epoch_bank has 0's vote, so it has > 66% stake in the new epoch.
  2007. let old_epoch_slot = my_heaviest_fork_slot + 1;
  2008. let _ = insert_slots_into_blockstore(
  2009. test_state.blockstore.clone(),
  2010. new_root_bank.slot(),
  2011. &[old_epoch_slot],
  2012. TICKS_PER_SLOT,
  2013. new_root_bank.last_blockhash(),
  2014. );
  2015. let new_epoch_slot = new_root_bank.epoch_schedule().get_first_slot_in_epoch(2);
  2016. let _ = insert_slots_into_blockstore(
  2017. test_state.blockstore.clone(),
  2018. my_heaviest_fork_slot,
  2019. &[new_epoch_slot],
  2020. TICKS_PER_SLOT,
  2021. new_root_bank.last_blockhash(),
  2022. );
  2023. let mut rng = rand::thread_rng();
  2024. // Everyone except 0 votes for old_epoch_bank.
  2025. for (index, keypairs) in test_state
  2026. .validator_voting_keypairs
  2027. .iter()
  2028. .take(TOTAL_VALIDATOR_COUNT as usize - 1)
  2029. .enumerate()
  2030. {
  2031. let node_pubkey = keypairs.node_keypair.pubkey();
  2032. let node = ContactInfo::new_rand(&mut rng, Some(node_pubkey));
  2033. let last_vote_hash = Hash::new_unique();
  2034. let now = timestamp();
  2035. // Validator 0 votes for the new_epoch_bank while everyone elese vote for old_epoch_bank.
  2036. let last_voted_fork_slots = if index == 0 {
  2037. vec![new_epoch_slot, my_heaviest_fork_slot, 0]
  2038. } else {
  2039. vec![old_epoch_slot, my_heaviest_fork_slot, 0]
  2040. };
  2041. push_restart_last_voted_fork_slots(
  2042. test_state.cluster_info.clone(),
  2043. &node,
  2044. &last_voted_fork_slots,
  2045. &last_vote_hash,
  2046. &keypairs.node_keypair,
  2047. now,
  2048. );
  2049. }
  2050. assert_eq!(
  2051. wait_for_wen_restart(WenRestartConfig {
  2052. wen_restart_path: test_state.wen_restart_proto_path,
  2053. wen_restart_coordinator: test_state.wen_restart_coordinator,
  2054. last_vote: VoteTransaction::from(Vote::new(
  2055. vec![my_heaviest_fork_slot],
  2056. last_vote_bankhash
  2057. )),
  2058. blockstore: test_state.blockstore,
  2059. cluster_info: test_state.cluster_info,
  2060. bank_forks: test_state.bank_forks,
  2061. wen_restart_repair_slots: Some(Arc::new(RwLock::new(Vec::new()))),
  2062. wait_for_supermajority_threshold_percent: 80,
  2063. snapshot_controller: None,
  2064. abs_status: AbsStatus::new_for_tests(),
  2065. genesis_config_hash: test_state.genesis_config_hash,
  2066. exit: Arc::new(AtomicBool::new(false)),
  2067. })
  2068. .unwrap_err()
  2069. .downcast::<WenRestartError>()
  2070. .unwrap(),
  2071. WenRestartError::BlockNotLinkedToExpectedParent(
  2072. new_epoch_slot,
  2073. Some(my_heaviest_fork_slot),
  2074. old_epoch_slot
  2075. )
  2076. );
  2077. }
  2078. #[test]
  2079. fn test_wen_restart_initialize() {
  2080. agave_logger::setup();
  2081. let ledger_path = get_tmp_ledger_path_auto_delete!();
  2082. let test_state = wen_restart_test_init(&ledger_path);
  2083. let last_vote_bankhash = Hash::new_unique();
  2084. let mut last_voted_fork_slots = test_state.last_voted_fork_slots.clone();
  2085. last_voted_fork_slots.reverse();
  2086. let mut file = File::create(&test_state.wen_restart_proto_path).unwrap();
  2087. file.write_all(b"garbage").unwrap();
  2088. assert_eq!(
  2089. initialize(
  2090. &test_state.wen_restart_proto_path,
  2091. VoteTransaction::from(Vote::new(last_voted_fork_slots.clone(), last_vote_bankhash)),
  2092. test_state.blockstore.clone()
  2093. )
  2094. .unwrap_err()
  2095. .downcast::<prost::DecodeError>()
  2096. .unwrap(),
  2097. prost::DecodeError::new("invalid wire type value: 7")
  2098. );
  2099. assert!(remove_file(&test_state.wen_restart_proto_path).is_ok());
  2100. let last_vote_bankhash = Hash::new_unique();
  2101. let empty_last_vote = VoteTransaction::from(Vote::new(vec![], last_vote_bankhash));
  2102. assert_eq!(
  2103. initialize(
  2104. &test_state.wen_restart_proto_path,
  2105. empty_last_vote.clone(),
  2106. test_state.blockstore.clone()
  2107. )
  2108. .unwrap_err()
  2109. .downcast::<WenRestartError>()
  2110. .unwrap(),
  2111. WenRestartError::MissingLastVotedForkSlots,
  2112. );
  2113. assert!(write_wen_restart_records(
  2114. &test_state.wen_restart_proto_path,
  2115. &WenRestartProgress {
  2116. state: RestartState::LastVotedForkSlots.into(),
  2117. ..Default::default()
  2118. },
  2119. )
  2120. .is_ok());
  2121. assert_eq!(
  2122. initialize(
  2123. &test_state.wen_restart_proto_path,
  2124. VoteTransaction::from(Vote::new(last_voted_fork_slots.clone(), last_vote_bankhash)),
  2125. test_state.blockstore.clone()
  2126. )
  2127. .err()
  2128. .unwrap()
  2129. .to_string(),
  2130. "Malformed last voted fork slots protobuf: None"
  2131. );
  2132. let progress_missing_heaviest_fork_aggregate = WenRestartProgress {
  2133. state: RestartState::HeaviestFork.into(),
  2134. my_heaviest_fork: Some(HeaviestForkRecord {
  2135. slot: 0,
  2136. bankhash: Hash::new_unique().to_string(),
  2137. total_active_stake: 0,
  2138. shred_version: SHRED_VERSION as u32,
  2139. wallclock: 0,
  2140. from: Pubkey::new_unique().to_string(),
  2141. }),
  2142. ..Default::default()
  2143. };
  2144. assert!(write_wen_restart_records(
  2145. &test_state.wen_restart_proto_path,
  2146. &progress_missing_heaviest_fork_aggregate,
  2147. )
  2148. .is_ok());
  2149. assert_eq!(
  2150. initialize(
  2151. &test_state.wen_restart_proto_path,
  2152. VoteTransaction::from(Vote::new(last_voted_fork_slots.clone(), last_vote_bankhash)),
  2153. test_state.blockstore.clone()
  2154. )
  2155. .err()
  2156. .unwrap()
  2157. .to_string(),
  2158. "Malformed progress: HeaviestFork missing final_result in \
  2159. last_voted_fork_slots_aggregate",
  2160. );
  2161. let progress_missing_my_heaviestfork = WenRestartProgress {
  2162. state: RestartState::GenerateSnapshot.into(),
  2163. my_snapshot: Some(GenerateSnapshotRecord {
  2164. slot: 0,
  2165. bankhash: Hash::new_unique().to_string(),
  2166. shred_version: SHRED_VERSION as u32,
  2167. path: "/path/to/snapshot".to_string(),
  2168. }),
  2169. ..Default::default()
  2170. };
  2171. assert!(write_wen_restart_records(
  2172. &test_state.wen_restart_proto_path,
  2173. &progress_missing_my_heaviestfork,
  2174. )
  2175. .is_ok());
  2176. assert_eq!(
  2177. initialize(
  2178. &test_state.wen_restart_proto_path,
  2179. VoteTransaction::from(Vote::new(last_voted_fork_slots.clone(), last_vote_bankhash)),
  2180. test_state.blockstore.clone()
  2181. )
  2182. .err()
  2183. .unwrap()
  2184. .to_string(),
  2185. "Malformed progress: GenerateSnapshot missing my_heaviest_fork",
  2186. );
  2187. // Now test successful initialization.
  2188. assert!(remove_file(&test_state.wen_restart_proto_path).is_ok());
  2189. // Test the case where the file is not found.
  2190. let mut vote = TowerSync::from(vec![(test_state.last_voted_fork_slots[0], 1)]);
  2191. vote.hash = last_vote_bankhash;
  2192. let last_vote = VoteTransaction::from(vote);
  2193. assert_eq!(
  2194. initialize(
  2195. &test_state.wen_restart_proto_path,
  2196. last_vote.clone(),
  2197. test_state.blockstore.clone()
  2198. )
  2199. .unwrap(),
  2200. (
  2201. WenRestartProgressInternalState::Init {
  2202. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2203. last_vote_bankhash
  2204. },
  2205. WenRestartProgress {
  2206. state: RestartState::Init.into(),
  2207. ..Default::default()
  2208. }
  2209. )
  2210. );
  2211. let progress = WenRestartProgress {
  2212. state: RestartState::Init.into(),
  2213. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2214. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2215. last_vote_bankhash: last_vote_bankhash.to_string(),
  2216. shred_version: SHRED_VERSION as u32,
  2217. wallclock: 0,
  2218. }),
  2219. ..Default::default()
  2220. };
  2221. assert!(write_wen_restart_records(&test_state.wen_restart_proto_path, &progress,).is_ok());
  2222. assert_eq!(
  2223. initialize(
  2224. &test_state.wen_restart_proto_path,
  2225. last_vote.clone(),
  2226. test_state.blockstore.clone()
  2227. )
  2228. .unwrap(),
  2229. (
  2230. WenRestartProgressInternalState::Init {
  2231. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2232. last_vote_bankhash,
  2233. },
  2234. progress
  2235. )
  2236. );
  2237. let progress = WenRestartProgress {
  2238. state: RestartState::LastVotedForkSlots.into(),
  2239. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2240. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2241. last_vote_bankhash: last_vote_bankhash.to_string(),
  2242. shred_version: SHRED_VERSION as u32,
  2243. wallclock: 0,
  2244. }),
  2245. ..Default::default()
  2246. };
  2247. assert!(write_wen_restart_records(&test_state.wen_restart_proto_path, &progress,).is_ok());
  2248. assert_eq!(
  2249. initialize(
  2250. &test_state.wen_restart_proto_path,
  2251. last_vote.clone(),
  2252. test_state.blockstore.clone()
  2253. )
  2254. .unwrap(),
  2255. (
  2256. WenRestartProgressInternalState::LastVotedForkSlots {
  2257. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2258. aggregate_final_result: None,
  2259. },
  2260. progress
  2261. )
  2262. );
  2263. let progress = WenRestartProgress {
  2264. state: RestartState::HeaviestFork.into(),
  2265. my_heaviest_fork: Some(HeaviestForkRecord {
  2266. slot: 0,
  2267. bankhash: Hash::new_unique().to_string(),
  2268. total_active_stake: 0,
  2269. shred_version: SHRED_VERSION as u32,
  2270. wallclock: 0,
  2271. from: Pubkey::new_unique().to_string(),
  2272. }),
  2273. last_voted_fork_slots_aggregate: Some(LastVotedForkSlotsAggregateRecord {
  2274. received: HashMap::new(),
  2275. final_result: Some(LastVotedForkSlotsAggregateFinal {
  2276. slots_stake_map: HashMap::new(),
  2277. epoch_infos: vec![
  2278. LastVotedForkSlotsEpochInfoRecord {
  2279. epoch: 1,
  2280. total_stake: 1000,
  2281. actively_voting_stake: 800,
  2282. actively_voting_for_this_epoch_stake: 800,
  2283. },
  2284. LastVotedForkSlotsEpochInfoRecord {
  2285. epoch: 2,
  2286. total_stake: 1000,
  2287. actively_voting_stake: 900,
  2288. actively_voting_for_this_epoch_stake: 900,
  2289. },
  2290. ],
  2291. }),
  2292. }),
  2293. ..Default::default()
  2294. };
  2295. assert!(write_wen_restart_records(&test_state.wen_restart_proto_path, &progress,).is_ok());
  2296. assert_eq!(
  2297. initialize(
  2298. &test_state.wen_restart_proto_path,
  2299. last_vote.clone(),
  2300. test_state.blockstore.clone()
  2301. )
  2302. .unwrap(),
  2303. (
  2304. WenRestartProgressInternalState::FindHeaviestFork {
  2305. aggregate_final_result: LastVotedForkSlotsFinalResult {
  2306. slots_stake_map: HashMap::new(),
  2307. epoch_info_vec: vec![
  2308. LastVotedForkSlotsEpochInfo {
  2309. epoch: 1,
  2310. total_stake: 1000,
  2311. actively_voting_stake: 800,
  2312. actively_voting_for_this_epoch_stake: 800,
  2313. },
  2314. LastVotedForkSlotsEpochInfo {
  2315. epoch: 2,
  2316. total_stake: 1000,
  2317. actively_voting_stake: 900,
  2318. actively_voting_for_this_epoch_stake: 900,
  2319. }
  2320. ],
  2321. },
  2322. my_heaviest_fork: progress.my_heaviest_fork.clone(),
  2323. },
  2324. progress
  2325. )
  2326. );
  2327. let progress = WenRestartProgress {
  2328. state: RestartState::GenerateSnapshot.into(),
  2329. my_heaviest_fork: Some(HeaviestForkRecord {
  2330. slot: 0,
  2331. bankhash: Hash::new_unique().to_string(),
  2332. total_active_stake: 0,
  2333. shred_version: SHRED_VERSION as u32,
  2334. wallclock: 0,
  2335. from: Pubkey::new_unique().to_string(),
  2336. }),
  2337. my_snapshot: Some(GenerateSnapshotRecord {
  2338. slot: 0,
  2339. bankhash: Hash::new_unique().to_string(),
  2340. shred_version: SHRED_VERSION as u32,
  2341. path: "/path/to/snapshot".to_string(),
  2342. }),
  2343. ..Default::default()
  2344. };
  2345. assert!(write_wen_restart_records(&test_state.wen_restart_proto_path, &progress,).is_ok());
  2346. assert_eq!(
  2347. initialize(
  2348. &test_state.wen_restart_proto_path,
  2349. VoteTransaction::from(Vote::new(last_voted_fork_slots.clone(), last_vote_bankhash)),
  2350. test_state.blockstore.clone()
  2351. )
  2352. .unwrap(),
  2353. (
  2354. WenRestartProgressInternalState::GenerateSnapshot {
  2355. my_heaviest_fork_slot: 0,
  2356. my_snapshot: progress.my_snapshot.clone(),
  2357. },
  2358. progress,
  2359. )
  2360. );
  2361. let last_vote_slot = test_state.last_voted_fork_slots[0];
  2362. let snapshot_slot_hash = Hash::new_unique();
  2363. let progress = WenRestartProgress {
  2364. state: RestartState::Done.into(),
  2365. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2366. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2367. last_vote_bankhash: last_vote_bankhash.to_string(),
  2368. shred_version: SHRED_VERSION as u32,
  2369. wallclock: 0,
  2370. }),
  2371. my_heaviest_fork: Some(HeaviestForkRecord {
  2372. slot: last_vote_slot,
  2373. bankhash: snapshot_slot_hash.to_string(),
  2374. total_active_stake: 0,
  2375. shred_version: SHRED_VERSION as u32,
  2376. wallclock: 0,
  2377. from: Pubkey::new_unique().to_string(),
  2378. }),
  2379. my_snapshot: Some(GenerateSnapshotRecord {
  2380. slot: last_vote_slot,
  2381. bankhash: snapshot_slot_hash.to_string(),
  2382. shred_version: SHRED_VERSION as u32,
  2383. path: "/path/to/snapshot".to_string(),
  2384. }),
  2385. ..Default::default()
  2386. };
  2387. assert!(write_wen_restart_records(&test_state.wen_restart_proto_path, &progress,).is_ok());
  2388. assert_eq!(
  2389. initialize(
  2390. &test_state.wen_restart_proto_path,
  2391. VoteTransaction::from(Vote::new(last_voted_fork_slots, last_vote_bankhash)),
  2392. test_state.blockstore.clone()
  2393. )
  2394. .unwrap(),
  2395. (
  2396. WenRestartProgressInternalState::Done {
  2397. slot: last_vote_slot,
  2398. hash: snapshot_slot_hash,
  2399. shred_version: SHRED_VERSION,
  2400. },
  2401. progress
  2402. )
  2403. );
  2404. }
  2405. #[test]
  2406. fn test_wen_restart_send_last_voted_fork_failures() {
  2407. let ledger_path = get_tmp_ledger_path_auto_delete!();
  2408. let test_state = wen_restart_test_init(&ledger_path);
  2409. let progress = wen_restart_proto::WenRestartProgress {
  2410. state: RestartState::Init.into(),
  2411. ..Default::default()
  2412. };
  2413. let original_progress = progress.clone();
  2414. assert_eq!(
  2415. send_restart_last_voted_fork_slots(
  2416. test_state.cluster_info.clone(),
  2417. &[],
  2418. Hash::new_unique(),
  2419. )
  2420. .err()
  2421. .unwrap()
  2422. .to_string(),
  2423. "Last voted fork cannot be empty"
  2424. );
  2425. assert_eq!(progress, original_progress);
  2426. let last_vote_bankhash = Hash::new_unique();
  2427. let last_voted_fork_slots = test_state.last_voted_fork_slots.clone();
  2428. wen_restart_test_succeed_after_failure(
  2429. test_state,
  2430. last_vote_bankhash,
  2431. WenRestartProgress {
  2432. state: RestartState::LastVotedForkSlots.into(),
  2433. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2434. last_voted_fork_slots,
  2435. last_vote_bankhash: last_vote_bankhash.to_string(),
  2436. shred_version: SHRED_VERSION as u32,
  2437. wallclock: 0,
  2438. }),
  2439. last_voted_fork_slots_aggregate: Some(LastVotedForkSlotsAggregateRecord {
  2440. received: HashMap::new(),
  2441. final_result: None,
  2442. }),
  2443. ..Default::default()
  2444. },
  2445. );
  2446. }
  2447. #[test]
  2448. fn test_write_wen_restart_records_failure() {
  2449. let ledger_path = get_tmp_ledger_path_auto_delete!();
  2450. let test_state = wen_restart_test_init(&ledger_path);
  2451. let progress = wen_restart_proto::WenRestartProgress {
  2452. state: RestartState::Init.into(),
  2453. ..Default::default()
  2454. };
  2455. assert!(write_wen_restart_records(&test_state.wen_restart_proto_path, &progress).is_ok());
  2456. change_proto_file_readonly(&test_state.wen_restart_proto_path, true);
  2457. assert_eq!(
  2458. write_wen_restart_records(&test_state.wen_restart_proto_path, &progress)
  2459. .unwrap_err()
  2460. .downcast::<std::io::Error>()
  2461. .unwrap()
  2462. .kind(),
  2463. std::io::ErrorKind::PermissionDenied,
  2464. );
  2465. change_proto_file_readonly(&test_state.wen_restart_proto_path, false);
  2466. assert!(write_wen_restart_records(&test_state.wen_restart_proto_path, &progress).is_ok());
  2467. let last_voted_fork_slots = test_state.last_voted_fork_slots.clone();
  2468. let last_vote_bankhash = Hash::new_unique();
  2469. wen_restart_test_succeed_after_failure(
  2470. test_state,
  2471. last_vote_bankhash,
  2472. WenRestartProgress {
  2473. state: RestartState::LastVotedForkSlots.into(),
  2474. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2475. last_voted_fork_slots,
  2476. last_vote_bankhash: last_vote_bankhash.to_string(),
  2477. shred_version: SHRED_VERSION as u32,
  2478. wallclock: 0,
  2479. }),
  2480. last_voted_fork_slots_aggregate: Some(LastVotedForkSlotsAggregateRecord {
  2481. received: HashMap::new(),
  2482. final_result: None,
  2483. }),
  2484. ..Default::default()
  2485. },
  2486. );
  2487. }
  2488. #[test]
  2489. fn test_wen_restart_aggregate_last_voted_fork_stop_and_restart() {
  2490. let ledger_path = get_tmp_ledger_path_auto_delete!();
  2491. let test_state = wen_restart_test_init(&ledger_path);
  2492. let last_vote_slot: Slot = test_state.last_voted_fork_slots[0];
  2493. let last_vote_bankhash = Hash::new_unique();
  2494. let start_time = timestamp();
  2495. assert!(write_wen_restart_records(
  2496. &test_state.wen_restart_proto_path,
  2497. &WenRestartProgress {
  2498. state: RestartState::LastVotedForkSlots.into(),
  2499. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2500. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2501. last_vote_bankhash: last_vote_bankhash.to_string(),
  2502. shred_version: SHRED_VERSION as u32,
  2503. wallclock: start_time,
  2504. }),
  2505. last_voted_fork_slots_aggregate: Some(LastVotedForkSlotsAggregateRecord {
  2506. received: HashMap::new(),
  2507. final_result: None,
  2508. }),
  2509. ..Default::default()
  2510. }
  2511. )
  2512. .is_ok());
  2513. let mut rng = rand::thread_rng();
  2514. let mut expected_messages = HashMap::new();
  2515. let expected_slots_to_repair: Vec<Slot> =
  2516. (last_vote_slot + 1..last_vote_slot + 3).collect();
  2517. let mut last_voted_fork_slots_from_others = test_state.last_voted_fork_slots.clone();
  2518. last_voted_fork_slots_from_others.reverse();
  2519. last_voted_fork_slots_from_others.append(&mut expected_slots_to_repair.clone());
  2520. let progress = WenRestartProgress {
  2521. state: RestartState::LastVotedForkSlots.into(),
  2522. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2523. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2524. last_vote_bankhash: last_vote_bankhash.to_string(),
  2525. shred_version: SHRED_VERSION as u32,
  2526. wallclock: start_time,
  2527. }),
  2528. ..Default::default()
  2529. };
  2530. let validators_to_take: usize =
  2531. (WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT * TOTAL_VALIDATOR_COUNT as u64 / 100 - 1)
  2532. .try_into()
  2533. .unwrap();
  2534. for keypairs in test_state
  2535. .validator_voting_keypairs
  2536. .iter()
  2537. .take(validators_to_take)
  2538. {
  2539. let wen_restart_proto_path_clone = test_state.wen_restart_proto_path.clone();
  2540. let cluster_info_clone = test_state.cluster_info.clone();
  2541. let bank_forks_clone = test_state.bank_forks.clone();
  2542. let blockstore_clone = test_state.blockstore.clone();
  2543. let exit = Arc::new(AtomicBool::new(false));
  2544. let exit_clone = exit.clone();
  2545. let mut progress_clone = progress.clone();
  2546. let last_voted_fork_slots = test_state.last_voted_fork_slots.clone();
  2547. let wen_restart_thread_handle = Builder::new()
  2548. .name("solana-wen-restart".to_string())
  2549. .spawn(move || {
  2550. assert!(aggregate_restart_last_voted_fork_slots(
  2551. &wen_restart_proto_path_clone,
  2552. WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT,
  2553. cluster_info_clone,
  2554. &last_voted_fork_slots,
  2555. bank_forks_clone,
  2556. blockstore_clone,
  2557. Arc::new(RwLock::new(Vec::new())),
  2558. exit_clone,
  2559. &mut progress_clone,
  2560. )
  2561. .is_ok());
  2562. })
  2563. .unwrap();
  2564. let node_pubkey = keypairs.node_keypair.pubkey();
  2565. let node = ContactInfo::new_rand(&mut rng, Some(node_pubkey));
  2566. let last_vote_hash = Hash::new_unique();
  2567. let now = timestamp();
  2568. push_restart_last_voted_fork_slots(
  2569. test_state.cluster_info.clone(),
  2570. &node,
  2571. &last_voted_fork_slots_from_others,
  2572. &last_vote_hash,
  2573. &keypairs.node_keypair,
  2574. now,
  2575. );
  2576. expected_messages.insert(
  2577. node_pubkey.to_string(),
  2578. LastVotedForkSlotsRecord {
  2579. last_voted_fork_slots: last_voted_fork_slots_from_others.clone(),
  2580. last_vote_bankhash: last_vote_hash.to_string(),
  2581. shred_version: SHRED_VERSION as u32,
  2582. wallclock: now,
  2583. },
  2584. );
  2585. wait_on_expected_progress_with_timeout(
  2586. test_state.wen_restart_proto_path.clone(),
  2587. WenRestartProgress {
  2588. state: RestartState::LastVotedForkSlots.into(),
  2589. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2590. last_voted_fork_slots: test_state.last_voted_fork_slots.clone(),
  2591. last_vote_bankhash: last_vote_bankhash.to_string(),
  2592. shred_version: SHRED_VERSION as u32,
  2593. wallclock: start_time,
  2594. }),
  2595. last_voted_fork_slots_aggregate: Some(LastVotedForkSlotsAggregateRecord {
  2596. received: expected_messages.clone(),
  2597. final_result: None,
  2598. }),
  2599. ..Default::default()
  2600. },
  2601. );
  2602. exit.store(true, Ordering::Relaxed);
  2603. let _ = wen_restart_thread_handle.join();
  2604. }
  2605. // Simulating successful repair of missing blocks.
  2606. let _ = insert_slots_into_blockstore(
  2607. test_state.blockstore.clone(),
  2608. last_vote_slot,
  2609. &expected_slots_to_repair,
  2610. TICKS_PER_SLOT,
  2611. test_state.last_blockhash,
  2612. );
  2613. let last_voted_fork_slots = test_state.last_voted_fork_slots.clone();
  2614. wen_restart_test_succeed_after_failure(
  2615. test_state,
  2616. last_vote_bankhash,
  2617. WenRestartProgress {
  2618. state: RestartState::LastVotedForkSlots.into(),
  2619. my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
  2620. last_voted_fork_slots,
  2621. last_vote_bankhash: last_vote_bankhash.to_string(),
  2622. shred_version: SHRED_VERSION as u32,
  2623. wallclock: start_time,
  2624. }),
  2625. last_voted_fork_slots_aggregate: Some(LastVotedForkSlotsAggregateRecord {
  2626. received: expected_messages,
  2627. final_result: None,
  2628. }),
  2629. ..Default::default()
  2630. },
  2631. );
  2632. }
  // Drives `increment_and_write_wen_restart_records` through each state
  // transition in turn, checking both the internal state it returns and the
  // progress record it leaves behind, then checks that advancing from `Done`
  // is rejected.
  #[test]
  fn test_increment_and_write_wen_restart_records() {
      agave_logger::setup();
      let my_dir = TempDir::new().unwrap();
      let wen_restart_proto_path = my_dir.path().join("wen_restart_status.proto");
      let last_vote_bankhash = Hash::new_unique();

      // Progress templates, each one adding a single field on top of the
      // previous. The `state` field is stamped on later by `in_state`.
      let voted_only = WenRestartProgress {
          my_last_voted_fork_slots: Some(LastVotedForkSlotsRecord {
              last_voted_fork_slots: vec![0, 1],
              last_vote_bankhash: last_vote_bankhash.to_string(),
              shred_version: 0,
              wallclock: 0,
          }),
          ..Default::default()
      };
      let with_aggregate = WenRestartProgress {
          last_voted_fork_slots_aggregate: Some(LastVotedForkSlotsAggregateRecord {
              received: HashMap::new(),
              final_result: Some(LastVotedForkSlotsAggregateFinal {
                  slots_stake_map: vec![(0, 900), (1, 800)].into_iter().collect(),
                  epoch_infos: vec![LastVotedForkSlotsEpochInfoRecord {
                      epoch: 0,
                      total_stake: 2000,
                      actively_voting_stake: 900,
                      actively_voting_for_this_epoch_stake: 900,
                  }],
              }),
          }),
          ..voted_only.clone()
      };
      let my_pubkey = Pubkey::new_unique();
      let with_my_fork = WenRestartProgress {
          my_heaviest_fork: Some(HeaviestForkRecord {
              slot: 1,
              bankhash: Hash::default().to_string(),
              total_active_stake: 900,
              shred_version: SHRED_VERSION as u32,
              wallclock: 0,
              from: my_pubkey.to_string(),
          }),
          ..with_aggregate.clone()
      };
      let with_coordinator_fork = WenRestartProgress {
          coordinator_heaviest_fork: Some(HeaviestForkRecord {
              slot: 2,
              bankhash: Hash::new_unique().to_string(),
              total_active_stake: 800,
              shred_version: SHRED_VERSION as u32,
              wallclock: 0,
              from: Pubkey::new_unique().to_string(),
          }),
          ..with_my_fork.clone()
      };
      let my_bankhash = Hash::new_unique();
      let new_shred_version = SHRED_VERSION + 57;
      let my_snapshot = Some(GenerateSnapshotRecord {
          slot: 1,
          bankhash: my_bankhash.to_string(),
          path: "snapshot_1".to_string(),
          shred_version: new_shred_version as u32,
      });
      let with_snapshot = WenRestartProgress {
          my_snapshot: my_snapshot.clone(),
          ..with_coordinator_fork.clone()
      };

      // Copies a template and sets its protobuf `state` field.
      let in_state = |state: RestartState, template: &WenRestartProgress| WenRestartProgress {
          state: state.into(),
          ..template.clone()
      };

      // The aggregate result carried by the internal states; it mirrors the
      // values stored in the `with_aggregate` template above.
      let expected_slots_stake_map: HashMap<Slot, u64> =
          vec![(0, 900), (1, 800)].into_iter().collect();
      let final_result = || LastVotedForkSlotsFinalResult {
          slots_stake_map: expected_slots_stake_map.clone(),
          epoch_info_vec: vec![LastVotedForkSlotsEpochInfo {
              epoch: 0,
              total_stake: 2000,
              actively_voting_stake: 900,
              actively_voting_for_this_epoch_stake: 900,
          }],
      };

      // Each entry: (state on entry, state expected on exit,
      //              progress on entry, progress expected on exit).
      let transitions = [
          (
              WenRestartProgressInternalState::Init {
                  last_voted_fork_slots: vec![0, 1],
                  last_vote_bankhash,
              },
              WenRestartProgressInternalState::LastVotedForkSlots {
                  last_voted_fork_slots: vec![0, 1],
                  aggregate_final_result: None,
              },
              in_state(RestartState::LastVotedForkSlots, &voted_only),
              in_state(RestartState::LastVotedForkSlots, &voted_only),
          ),
          (
              WenRestartProgressInternalState::LastVotedForkSlots {
                  last_voted_fork_slots: vec![0, 1],
                  aggregate_final_result: Some(final_result()),
              },
              WenRestartProgressInternalState::FindHeaviestFork {
                  aggregate_final_result: final_result(),
                  my_heaviest_fork: None,
              },
              in_state(RestartState::LastVotedForkSlots, &with_aggregate),
              in_state(RestartState::HeaviestFork, &with_aggregate),
          ),
          (
              WenRestartProgressInternalState::FindHeaviestFork {
                  aggregate_final_result: final_result(),
                  my_heaviest_fork: with_my_fork.my_heaviest_fork.clone(),
              },
              WenRestartProgressInternalState::HeaviestFork {
                  my_heaviest_fork_slot: 1,
                  my_heaviest_fork_hash: Hash::default(),
              },
              in_state(RestartState::HeaviestFork, &with_aggregate),
              in_state(RestartState::HeaviestFork, &with_my_fork),
          ),
          (
              WenRestartProgressInternalState::HeaviestFork {
                  my_heaviest_fork_slot: 1,
                  my_heaviest_fork_hash: Hash::default(),
              },
              WenRestartProgressInternalState::GenerateSnapshot {
                  my_heaviest_fork_slot: 1,
                  my_snapshot: None,
              },
              in_state(RestartState::HeaviestFork, &with_coordinator_fork),
              in_state(RestartState::GenerateSnapshot, &with_coordinator_fork),
          ),
          (
              WenRestartProgressInternalState::GenerateSnapshot {
                  my_heaviest_fork_slot: 1,
                  my_snapshot,
              },
              WenRestartProgressInternalState::Done {
                  slot: 1,
                  hash: my_bankhash,
                  shred_version: new_shred_version,
              },
              in_state(RestartState::HeaviestFork, &with_coordinator_fork),
              in_state(RestartState::Done, &with_snapshot),
          ),
      ];
      for (entrance_state, exit_state, mut progress, exit_progress) in transitions {
          let state = increment_and_write_wen_restart_records(
              &wen_restart_proto_path,
              entrance_state,
              &mut progress,
          )
          .unwrap();
          assert_eq!(state, exit_state);
          assert_eq!(progress, exit_progress);
      }

      // There is nothing after `Done`, so one more increment must fail.
      let mut progress = in_state(RestartState::Done, &with_aggregate);
      let error = increment_and_write_wen_restart_records(
          &wen_restart_proto_path,
          WenRestartProgressInternalState::Done {
              slot: 1,
              hash: my_bankhash,
              shred_version: new_shred_version,
          },
          &mut progress,
      )
      .unwrap_err()
      .downcast::<WenRestartError>()
      .unwrap();
      assert_eq!(error, WenRestartError::UnexpectedState(RestartState::Done));
  }
  // Feeds `find_heaviest_fork` a series of inconsistent aggregate results and
  // checks the specific `WenRestartError` reported for each.
  #[test]
  fn test_find_heaviest_fork_failures() {
      agave_logger::setup();
      let ledger_path = get_tmp_ledger_path_auto_delete!();
      let exit = Arc::new(AtomicBool::new(false));
      let test_state = wen_restart_test_init(&ledger_path);
      let last_vote_slot = test_state.last_voted_fork_slots[0];

      // Every scenario uses the same stake figures; only the epoch varies.
      let epoch_info = |epoch| LastVotedForkSlotsEpochInfo {
          epoch,
          total_stake: 1000,
          actively_voting_stake: 900,
          actively_voting_for_this_epoch_stake: 900,
      };
      // Runs `find_heaviest_fork` on the given aggregate and returns the
      // `WenRestartError` it is expected to fail with.
      let expect_failure = |slots_stake_map: HashMap<Slot, u64>,
                            epoch_info_vec: Vec<LastVotedForkSlotsEpochInfo>| {
          find_heaviest_fork(
              LastVotedForkSlotsFinalResult {
                  slots_stake_map,
                  epoch_info_vec,
              },
              test_state.bank_forks.clone(),
              test_state.blockstore.clone(),
              exit.clone(),
          )
          .unwrap_err()
          .downcast::<WenRestartError>()
          .unwrap()
      };
      // All slots on our last voted fork, each carrying 900 stake.
      let voted_fork_stakes = || {
          test_state
              .last_voted_fork_slots
              .iter()
              .map(|slot| (*slot, 900))
      };

      // Fails because the corresponding block is not found. That should not
      // happen in practice: every eligible block is supposed to have been
      // repaired by the time we leave the LastVotedForkSlots state.
      let slot_with_no_block = 1;
      assert_eq!(
          expect_failure(
              HashMap::from([(0, 900), (slot_with_no_block, 800)]),
              vec![epoch_info(0)],
          ),
          WenRestartError::BlockNotFound(slot_with_no_block),
      );

      // Fails because the first slot in slots_stake_map does not chain to the
      // local root.
      assert_eq!(
          expect_failure(HashMap::from([(3, 900)]), vec![epoch_info(0)]),
          WenRestartError::BlockNotLinkedToExpectedParent(3, Some(2), 0),
      );

      // Fails because a slot in slots_stake_map does not chain to the slot
      // before it.
      assert_eq!(
          expect_failure(HashMap::from([(2, 900), (5, 900)]), vec![epoch_info(0)]),
          WenRestartError::BlockNotLinkedToExpectedParent(5, Some(4), 2),
      );

      // Fails because the new slot is not full: its last entry is dropped
      // before the shreds are inserted.
      let not_full_slot = last_vote_slot + 5;
      let parent_slot = last_vote_slot;
      let num_slots = (not_full_slot - parent_slot).max(1);
      let mut entries = create_ticks(num_slots * TICKS_PER_SLOT, 0, test_state.last_blockhash);
      assert!(entries.len() > 1);
      entries.pop();
      let shreds = entries_to_test_shreds(&entries, not_full_slot, parent_slot, false, 0);
      test_state
          .blockstore
          .insert_shreds(shreds, None, false)
          .unwrap();
      let slots_stake_map: HashMap<Slot, u64> = voted_fork_stakes()
          .chain([(not_full_slot, 800)])
          .collect();
      assert_eq!(
          expect_failure(slots_stake_map, vec![epoch_info(0), epoch_info(1)]),
          WenRestartError::BlockNotFull(not_full_slot),
      );

      // Fails because of the two blocks appended to the end of the chain:
      // `missing_parent` is inserted with 1 where the other insertions pass
      // TICKS_PER_SLOT, so replay reports it as an incomplete block.
      let missing_parent = last_vote_slot.saturating_add(1);
      let new_slot = last_vote_slot.saturating_add(2);
      let new_hash = insert_slots_into_blockstore(
          test_state.blockstore.clone(),
          last_vote_slot,
          &[missing_parent],
          1,
          test_state.last_blockhash,
      );
      let _ = insert_slots_into_blockstore(
          test_state.blockstore.clone(),
          missing_parent,
          &[new_slot],
          TICKS_PER_SLOT,
          new_hash,
      );
      let slots_stake_map: HashMap<Slot, u64> = voted_fork_stakes()
          .chain([(missing_parent, 800), (new_slot, 800)])
          .collect();
      assert_eq!(
          expect_failure(slots_stake_map, vec![epoch_info(0), epoch_info(1)]),
          WenRestartError::BlockNotFrozenAfterReplay(
              missing_parent,
              Some("invalid block error: incomplete block".to_string())
          ),
      );
  }
  3047. fn start_aggregate_heaviest_fork_thread(
  3048. test_state: &WenRestartTestInitResult,
  3049. heaviest_fork_slot: Slot,
  3050. heaviest_fork_bankhash: Hash,
  3051. exit: Arc<AtomicBool>,
  3052. expected_error: Option<WenRestartError>,
  3053. ) -> std::thread::JoinHandle<()> {
  3054. let progress = wen_restart_proto::WenRestartProgress {
  3055. state: RestartState::HeaviestFork.into(),
  3056. my_heaviest_fork: Some(HeaviestForkRecord {
  3057. slot: heaviest_fork_slot,
  3058. bankhash: heaviest_fork_bankhash.to_string(),
  3059. total_active_stake: WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT
  3060. .saturating_mul(TOTAL_VALIDATOR_COUNT as u64),
  3061. shred_version: SHRED_VERSION as u32,
  3062. wallclock: 0,
  3063. from: test_state.cluster_info.id().to_string(),
  3064. }),
  3065. ..Default::default()
  3066. };
  3067. let wen_restart_path = test_state.wen_restart_proto_path.clone();
  3068. let cluster_info = test_state.cluster_info.clone();
  3069. let bank_forks = test_state.bank_forks.clone();
  3070. Builder::new()
  3071. .name("solana-wen-restart-aggregate-heaviest-fork".to_string())
  3072. .spawn(move || {
  3073. let result = aggregate_restart_heaviest_fork(
  3074. &wen_restart_path,
  3075. cluster_info,
  3076. bank_forks,
  3077. exit,
  3078. &mut progress.clone(),
  3079. );
  3080. if let Some(expected_error) = expected_error {
  3081. assert_eq!(
  3082. result.unwrap_err().downcast::<WenRestartError>().unwrap(),
  3083. expected_error
  3084. );
  3085. } else {
  3086. assert!(result.is_ok());
  3087. }
  3088. })
  3089. .unwrap()
  3090. }
  // Starts the heaviest-fork aggregation thread, has a subset of validators
  // gossip the same heaviest fork, then raises the exit flag and expects the
  // thread to join cleanly.
  #[test]
  fn test_aggregate_heaviest_fork() {
      let ledger_path = get_tmp_ledger_path_auto_delete!();
      let test_state = wen_restart_test_init(&ledger_path);
      let heaviest_fork_slot = test_state.last_voted_fork_slots[0] + 3;
      let heaviest_fork_bankhash = Hash::new_unique();
      let active_percent =
          WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT - NON_CONFORMING_VALIDATOR_PERCENT;
      let expected_active_stake = active_percent * TOTAL_VALIDATOR_COUNT as u64;
      let exit = Arc::new(AtomicBool::new(false));
      let aggregator = start_aggregate_heaviest_fork_thread(
          &test_state,
          heaviest_fork_slot,
          heaviest_fork_bankhash,
          exit.clone(),
          None,
      );

      // One validator fewer than the supermajority threshold takes part.
      let threshold_count =
          WAIT_FOR_SUPERMAJORITY_THRESHOLD_PERCENT * TOTAL_VALIDATOR_COUNT as u64 / 100;
      let validators_to_take = usize::try_from(threshold_count - 1).unwrap();
      let mut rng = rand::thread_rng();
      let participants = test_state
          .validator_voting_keypairs
          .iter()
          .take(validators_to_take);
      for voting_keypairs in participants {
          let node_keypair = &voting_keypairs.node_keypair;
          let node = ContactInfo::new_rand(&mut rng, Some(node_keypair.pubkey()));
          let wallclock = timestamp();
          push_restart_heaviest_fork(
              test_state.cluster_info.clone(),
              &node,
              heaviest_fork_slot,
              &heaviest_fork_bankhash,
              expected_active_stake,
              node_keypair,
              wallclock,
          );
      }
      exit.store(true, Ordering::Relaxed);
      assert!(aggregator.join().is_ok());
  }
  // Exercises `generate_snapshot`: the first full snapshot, a follow-up
  // incremental snapshot, the error cases (snapshot already exists, newer
  // snapshot exists, no block for the slot), and finally a `LoadOnly`
  // configuration.
  #[test]
  fn test_generate_snapshot() {
      agave_logger::setup();
      let ledger_path = get_tmp_ledger_path_auto_delete!();
      let test_state = wen_restart_test_init(&ledger_path);
      let bank_snapshots_dir = tempfile::TempDir::new().unwrap();
      let full_snapshot_archives_dir = tempfile::TempDir::new().unwrap();
      let incremental_snapshot_archives_dir = tempfile::TempDir::new().unwrap();
      // Both configurations in this test share directories and differ only in
      // `usage`.
      let config_with_usage = |usage: SnapshotUsage| SnapshotConfig {
          bank_snapshots_dir: bank_snapshots_dir.path().to_path_buf(),
          full_snapshot_archives_dir: full_snapshot_archives_dir.path().to_path_buf(),
          incremental_snapshot_archives_dir: incremental_snapshot_archives_dir
              .path()
              .to_path_buf(),
          usage,
          ..Default::default()
      };
      // Calls `generate_snapshot` for `slot` with the fixed test arguments.
      let snapshot_at = |controller: &SnapshotController, slot: Slot| {
          generate_snapshot(
              test_state.bank_forks.clone(),
              controller,
              &AbsStatus::new_for_tests(),
              test_state.genesis_config_hash,
              slot,
          )
      };

      let old_root_bank = test_state.bank_forks.read().unwrap().root_bank();
      let old_root_slot = old_root_bank.slot();
      let new_root_slot = test_state.last_voted_fork_slots[1];
      let exit = Arc::new(AtomicBool::new(false));
      let mut reversed_slots = test_state.last_voted_fork_slots.clone();
      reversed_slots.reverse();
      let old_last_vote_bankhash = find_bankhash_of_heaviest_fork(
          new_root_slot,
          reversed_slots,
          test_state.blockstore.clone(),
          test_state.bank_forks.clone(),
          &exit,
      )
      .unwrap();

      // No full snapshot exists yet, so calling generate_snapshot() on the
      // old root bank should produce a full snapshot.
      let (abs_request_sender, _abs_request_receiver) = unbounded();
      let snapshot_controller = SnapshotController::new(
          abs_request_sender.clone(),
          config_with_usage(SnapshotUsage::LoadAndGenerate),
          new_root_slot,
      );
      let snapshot_config = snapshot_controller.snapshot_config();
      let full_archives_dir = snapshot_config
          .full_snapshot_archives_dir
          .to_string_lossy()
          .to_string();
      let generated_record = snapshot_at(&snapshot_controller, old_root_slot).unwrap();
      assert!(Path::new(&generated_record.path).exists());
      assert!(generated_record.path.starts_with(full_archives_dir.as_str()));

      let generated_record = snapshot_at(&snapshot_controller, new_root_slot).unwrap();
      let new_root_bankhash = test_state
          .bank_forks
          .read()
          .unwrap()
          .get(new_root_slot)
          .unwrap()
          .hash();
      assert_ne!(old_last_vote_bankhash, new_root_bankhash);
      let new_shred_version = generated_record.shred_version;
      assert_ne!(new_shred_version, SHRED_VERSION as u32);
      // The snapshot hash is the text between the last '-' in the archive
      // path and the first '.' that follows it.
      let hash_text = generated_record
          .path
          .rsplit('-')
          .next()
          .unwrap()
          .split('.')
          .next()
          .unwrap();
      let snapshot_hash = Hash::from_str(hash_text).unwrap();
      let expected_path = build_incremental_snapshot_archive_path(
          &snapshot_config.incremental_snapshot_archives_dir,
          old_root_slot,
          new_root_slot,
          &SnapshotHash(snapshot_hash),
          snapshot_config.archive_format,
      )
      .display()
      .to_string();
      assert_eq!(
          generated_record,
          GenerateSnapshotRecord {
              slot: new_root_slot,
              bankhash: new_root_bankhash.to_string(),
              shred_version: new_shred_version,
              path: expected_path,
          },
      );

      // Now generate a snapshot for the older slot; it should fail because
      // we already have a full snapshot.
      let error = snapshot_at(&snapshot_controller, old_root_slot)
          .unwrap_err()
          .downcast::<WenRestartError>()
          .unwrap();
      assert_eq!(
          error,
          WenRestartError::GenerateSnapshotWhenOneExists(old_root_slot, full_archives_dir),
      );

      // Fails if we already have an incremental snapshot (we just generated
      // one at new_root_slot).
      let older_slot = new_root_slot - 1;
      let error = snapshot_at(&snapshot_controller, older_slot)
          .unwrap_err()
          .downcast::<WenRestartError>()
          .unwrap();
      assert_eq!(
          error,
          WenRestartError::FutureSnapshotExists(
              older_slot,
              new_root_slot,
              snapshot_config
                  .incremental_snapshot_archives_dir
                  .to_string_lossy()
                  .to_string()
          ),
      );

      // Generating a snapshot for a slot without any block should fail.
      let empty_slot = new_root_slot + 100;
      let error = snapshot_at(&snapshot_controller, empty_slot)
          .unwrap_err()
          .downcast::<WenRestartError>()
          .unwrap();
      assert_eq!(error, WenRestartError::BlockNotFound(empty_slot));

      // Now turn off snapshot generation; we should generate a full snapshot.
      let snapshot_controller = SnapshotController::new(
          abs_request_sender.clone(),
          config_with_usage(SnapshotUsage::LoadOnly),
          new_root_slot,
      );
      let snapshot_config = snapshot_controller.snapshot_config();
      let generated_record =
          snapshot_at(&snapshot_controller, test_state.last_voted_fork_slots[0]).unwrap();
      assert!(Path::new(&generated_record.path).exists());
      assert!(generated_record.path.starts_with(
          snapshot_config
              .full_snapshot_archives_dir
              .to_string_lossy()
              .as_ref()
      ));
  }
  // With the persisted progress already in the `Done` state,
  // `wait_for_wen_restart` must fail while no snapshot record is stored and
  // succeed once one is present.
  #[test]
  fn test_return_ok_after_wait_is_done() {
      let ledger_path = get_tmp_ledger_path_auto_delete!();
      let test_state = wen_restart_test_init(&ledger_path);
      let last_vote_slot = test_state.last_voted_fork_slots[0];
      let last_vote_bankhash = Hash::new_unique();
      let last_vote = VoteTransaction::from(Vote::new(vec![last_vote_slot], last_vote_bankhash));
      let config = WenRestartConfig {
          wen_restart_path: test_state.wen_restart_proto_path.clone(),
          wen_restart_coordinator: test_state.wen_restart_coordinator,
          last_vote,
          blockstore: test_state.blockstore.clone(),
          cluster_info: test_state.cluster_info.clone(),
          bank_forks: test_state.bank_forks.clone(),
          wen_restart_repair_slots: Some(Arc::new(RwLock::new(Vec::new()))),
          wait_for_supermajority_threshold_percent: 80,
          snapshot_controller: None,
          abs_status: AbsStatus::new_for_tests(),
          genesis_config_hash: test_state.genesis_config_hash,
          exit: Arc::new(AtomicBool::new(false)),
      };

      // `Done` without a snapshot record is reported as an error.
      let done_without_snapshot = WenRestartProgress {
          state: RestartState::Done.into(),
          ..Default::default()
      };
      let written =
          write_wen_restart_records(&test_state.wen_restart_proto_path, &done_without_snapshot);
      assert!(written.is_ok());
      let error = wait_for_wen_restart(config.clone())
          .unwrap_err()
          .downcast::<WenRestartError>()
          .unwrap();
      assert_eq!(error, WenRestartError::MissingSnapshotInProtobuf);

      // `Done` with a snapshot record returns Ok.
      let done_with_snapshot = WenRestartProgress {
          state: RestartState::Done.into(),
          my_snapshot: Some(GenerateSnapshotRecord {
              slot: 0,
              bankhash: Hash::new_unique().to_string(),
              shred_version: SHRED_VERSION as u32,
              path: "snapshot".to_string(),
          }),
          ..Default::default()
      };
      let written =
          write_wen_restart_records(&test_state.wen_restart_proto_path, &done_with_snapshot);
      assert!(written.is_ok());
      assert!(wait_for_wen_restart(config).is_ok());
  }
  // Pushes heaviest-fork messages from an unrelated node and from the
  // coordinator, then checks that `receive_restart_heaviest_fork` returns the
  // coordinator's slot and hash.
  #[test]
  fn test_receive_restart_heaviest_fork() {
      let mut rng = rand::thread_rng();
      let coordinator_keypair = Keypair::new();
      let node_keypair = Arc::new(Keypair::new());
      let mut my_contact_info = ContactInfo::new_localhost(&node_keypair.pubkey(), timestamp());
      my_contact_info.set_shred_version(SHRED_VERSION);
      let cluster_info = Arc::new(ClusterInfo::new(
          my_contact_info,
          node_keypair.clone(),
          SocketAddrSpace::Unspecified,
      ));
      let exit = Arc::new(AtomicBool::new(false));

      // Publishes a heaviest-fork message with zero observed stake on behalf
      // of `from`, using a freshly randomized contact info for that key.
      let mut gossip_heaviest_fork = |from: &Keypair, slot: Slot, bankhash: &Hash| {
          let node = ContactInfo::new_rand(&mut rng, Some(from.pubkey()));
          push_restart_heaviest_fork(
              cluster_info.clone(),
              &node,
              slot,
              bankhash,
              0,
              from,
              timestamp(),
          );
      };

      let random_keypair = Keypair::new();
      let random_slot = 3;
      let random_hash = Hash::new_unique();
      gossip_heaviest_fork(&random_keypair, random_slot, &random_hash);

      let coordinator_slot = 6;
      let coordinator_hash = Hash::new_unique();
      gossip_heaviest_fork(&coordinator_keypair, coordinator_slot, &coordinator_hash);

      let mut progress = WenRestartProgress {
          state: RestartState::HeaviestFork.into(),
          ..Default::default()
      };
      let received = receive_restart_heaviest_fork(
          coordinator_keypair.pubkey(),
          cluster_info,
          exit,
          &mut progress,
      )
      .unwrap();
      assert_eq!(received, (coordinator_slot, coordinator_hash));
  }
  // Runs `repair_heaviest_fork` on a background thread and checks which slots
  // it asks to repair as blocks are added to the blockstore one at a time.
  #[test]
  fn test_repair_heaviest_fork() {
      let ledger_path = get_tmp_ledger_path_auto_delete!();
      let my_heaviest_fork_slot = 1;
      let coordinator_heaviest_slot_parent = 2;
      let coordinator_heaviest_slot = 3;
      let exit = Arc::new(AtomicBool::new(false));
      let blockstore = Arc::new(Blockstore::open(ledger_path.path()).unwrap());
      let wen_restart_repair_slots = Arc::new(RwLock::new(Vec::new()));

      let repair_thread = {
          // Handles owned by the worker thread.
          let exit = exit.clone();
          let blockstore = blockstore.clone();
          let repair_slots = wen_restart_repair_slots.clone();
          Builder::new()
              .name("solana-repair-heaviest-fork".to_string())
              .spawn(move || {
                  let outcome = repair_heaviest_fork(
                      my_heaviest_fork_slot,
                      coordinator_heaviest_slot,
                      exit,
                      blockstore,
                      repair_slots,
                  );
                  assert!(outcome.is_ok());
              })
              .unwrap()
      };
      // Gives the worker thread time to publish its repair request.
      let wait_for_worker = || sleep(Duration::from_millis(GOSSIP_SLEEP_MILLIS));

      // When there is nothing in blockstore, should repair the heaviest slot.
      wait_for_worker();
      assert_eq!(
          *wen_restart_repair_slots.read().unwrap(),
          vec![coordinator_heaviest_slot]
      );

      // Now add block 3; 3's parent is 2, so 2 should be repaired next.
      let _ = insert_slots_into_blockstore(
          blockstore.clone(),
          coordinator_heaviest_slot_parent,
          &[coordinator_heaviest_slot],
          TICKS_PER_SLOT,
          Hash::default(),
      );
      wait_for_worker();
      assert_eq!(
          *wen_restart_repair_slots.read().unwrap(),
          vec![coordinator_heaviest_slot_parent]
      );

      // Insert 2, which links to 1; the worker should exit now.
      let _ = insert_slots_into_blockstore(
          blockstore.clone(),
          my_heaviest_fork_slot,
          &[coordinator_heaviest_slot_parent],
          TICKS_PER_SLOT,
          Hash::default(),
      );
      repair_thread.join().unwrap();
  }
  // Exercises the two error paths of `verify_coordinator_heaviest_fork` covered
  // here: the coordinator's slot sitting on a different fork than ours, and a
  // bank hash that differs from the coordinator's for the same slot.
  #[test]
  fn test_verify_coordinator_heaviest_fork() {
      let ledger_path = get_tmp_ledger_path_auto_delete!();
      let test_state = wen_restart_test_init(&ledger_path);
      let last_vote = test_state.last_voted_fork_slots[0];
      let exit = Arc::new(AtomicBool::new(false));
      let root_bank = test_state.bank_forks.read().unwrap().root_bank().clone();

      // Create two forks: last_vote -> last_vote+1 and last_vote -> last_vote+2
      let coordinator_slot = last_vote + 1;
      let my_slot = last_vote + 2;
      for fork_tip in [coordinator_slot, my_slot] {
          let _ = insert_slots_into_blockstore(
              test_state.blockstore.clone(),
              last_vote,
              &[fork_tip],
              TICKS_PER_SLOT,
              test_state.last_blockhash,
          );
      }
      let wen_restart_repair_slots = Arc::new(RwLock::new(Vec::new()));

      // Our slot and the coordinator's slot are siblings, so verification must fail.
      let different_fork_error = verify_coordinator_heaviest_fork(
          my_slot,
          coordinator_slot,
          &Hash::default(),
          test_state.bank_forks.clone(),
          test_state.blockstore.clone(),
          exit.clone(),
          wen_restart_repair_slots.clone(),
      )
      .unwrap_err()
      .downcast::<WenRestartError>()
      .unwrap();
      assert_eq!(
          different_fork_error,
          WenRestartError::HeaviestForkOnLeaderOnDifferentFork(coordinator_slot, my_slot)
      );

      // Same slot on both sides, but the coordinator reports a hash we do not have.
      let coordinator_hash = Hash::new_unique();
      let my_hash = root_bank.hash();
      let root_slot = root_bank.slot();
      let hash_mismatch_error = verify_coordinator_heaviest_fork(
          root_slot,
          root_slot,
          &coordinator_hash,
          test_state.bank_forks.clone(),
          test_state.blockstore.clone(),
          exit.clone(),
          wen_restart_repair_slots.clone(),
      )
      .unwrap_err()
      .downcast::<WenRestartError>()
      .unwrap();
      assert_eq!(
          hash_mismatch_error,
          WenRestartError::BankHashMismatch(root_slot, my_hash, coordinator_hash)
      );
  }
  // Walks `send_and_receive_heaviest_fork` through three scenarios: we are the
  // coordinator, another node is the coordinator and we agree with it, and
  // another node is the coordinator while our slot is on a different fork.
  #[test]
  fn test_send_and_receive_heaviest_fork() {
      let ledger_path = get_tmp_ledger_path_auto_delete!();
      let test_state = wen_restart_test_init(&ledger_path);
      let last_vote = test_state.last_voted_fork_slots[0];
      let exit = Arc::new(AtomicBool::new(false));
      // Written by the push callback so each scenario can check what was sent.
      let mut pushed_slot = 0;
      let mut pushed_hash = Hash::default();

      // The coordinator always sends its own choice.
      let coordinator_slot = last_vote;
      let reversed_slots: Vec<Slot> = test_state
          .last_voted_fork_slots
          .iter()
          .rev()
          .copied()
          .collect();
      let coordinator_hash = find_bankhash_of_heaviest_fork(
          coordinator_slot,
          reversed_slots,
          test_state.blockstore.clone(),
          test_state.bank_forks.clone(),
          &exit,
      )
      .unwrap();
      let mut progress = WenRestartProgress {
          state: RestartState::HeaviestFork.into(),
          ..Default::default()
      };

      // Set coordinator to myself, should return my choice.
      let mut config = WenRestartConfig {
          wen_restart_path: test_state.wen_restart_proto_path.clone(),
          wen_restart_coordinator: test_state.cluster_info.id(),
          last_vote: VoteTransaction::from(Vote::new(vec![last_vote], Hash::default())),
          blockstore: test_state.blockstore.clone(),
          cluster_info: test_state.cluster_info.clone(),
          bank_forks: test_state.bank_forks.clone(),
          wen_restart_repair_slots: Some(Arc::new(RwLock::new(Vec::new()))),
          wait_for_supermajority_threshold_percent: 80,
          snapshot_controller: None,
          abs_status: AbsStatus::new_for_tests(),
          genesis_config_hash: test_state.genesis_config_hash,
          exit: exit.clone(),
      };
      let as_coordinator = send_and_receive_heaviest_fork(
          coordinator_slot,
          coordinator_hash,
          &config,
          &mut progress,
          |slot, hash| {
              pushed_slot = slot;
              pushed_hash = hash;
          },
      )
      .unwrap();
      assert_eq!(as_coordinator, (coordinator_slot, coordinator_hash));
      assert_eq!(pushed_slot, coordinator_slot);
      assert_eq!(pushed_hash, coordinator_hash);

      // Now set the coordinator to someone else, need to return their choice.
      let coordinator_keypair =
          &test_state.validator_voting_keypairs[COORDINATOR_INDEX].node_keypair;
      config.wen_restart_coordinator = coordinator_keypair.pubkey();
      let mut rng = rand::thread_rng();
      let coordinator_node = ContactInfo::new_rand(&mut rng, Some(coordinator_keypair.pubkey()));
      let now = timestamp();
      push_restart_heaviest_fork(
          test_state.cluster_info.clone(),
          &coordinator_node,
          coordinator_slot,
          &coordinator_hash,
          0,
          coordinator_keypair,
          now,
      );
      let my_slot = test_state.last_voted_fork_slots[1];
      let my_hash = test_state
          .bank_forks
          .read()
          .unwrap()
          .get(my_slot)
          .unwrap()
          .hash();
      let as_follower = send_and_receive_heaviest_fork(
          my_slot,
          my_hash,
          &config,
          &mut progress,
          |slot, hash| {
              pushed_slot = slot;
              pushed_hash = hash;
          },
      )
      .unwrap();
      assert_eq!(as_follower, (coordinator_slot, coordinator_hash));
      assert_eq!(pushed_slot, coordinator_slot);
      assert_eq!(pushed_hash, coordinator_hash);

      // my slot on a different fork, should exit with error but still push heaviest fork.
      let my_slot = coordinator_slot + 1;
      let _ = insert_slots_into_blockstore(
          test_state.blockstore.clone(),
          0,
          &[coordinator_slot],
          TICKS_PER_SLOT,
          test_state.last_blockhash,
      );
      let my_hash = Hash::new_unique();
      let different_fork_error = send_and_receive_heaviest_fork(
          my_slot,
          my_hash,
          &config,
          &mut progress,
          |slot, hash| {
              pushed_slot = slot;
              pushed_hash = hash;
          },
      )
      .unwrap_err()
      .downcast::<WenRestartError>()
      .unwrap();
      assert_eq!(
          different_fork_error,
          WenRestartError::HeaviestForkOnLeaderOnDifferentFork(coordinator_slot, my_slot)
      );
      assert_eq!(pushed_slot, my_slot);
      assert_eq!(pushed_hash, my_hash);
  }
  3665. fn run_and_check_find_bankhash_of_heaviest_fork(
  3666. test_state: &WenRestartTestInitResult,
  3667. slots: &[Slot],
  3668. slot: Slot,
  3669. ) {
  3670. let exit = Arc::new(AtomicBool::new(false));
  3671. assert_eq!(
  3672. find_bankhash_of_heaviest_fork(
  3673. slot,
  3674. slots.to_vec(),
  3675. test_state.blockstore.clone(),
  3676. test_state.bank_forks.clone(),
  3677. &exit,
  3678. )
  3679. .unwrap(),
  3680. test_state
  3681. .bank_forks
  3682. .read()
  3683. .unwrap()
  3684. .get(slot)
  3685. .unwrap()
  3686. .hash()
  3687. );
  3688. }
  // Checks `find_bankhash_of_heaviest_fork` for three targets: the last voted
  // slot, a new child slot present only in the blockstore, and a slot that is in
  // the blockstore and already has a bank inserted into bank_forks by the test.
  #[test]
  fn test_find_bankhash_of_heaviest_fork() {
      let ledger_path = get_tmp_ledger_path_auto_delete!();
      let test_state = wen_restart_test_init(&ledger_path);
      let last_vote = test_state.last_voted_fork_slots[0];
      let mut slots: Vec<Slot> = test_state
          .last_voted_fork_slots
          .iter()
          .rev()
          .copied()
          .collect();
      run_and_check_find_bankhash_of_heaviest_fork(&test_state, &slots, last_vote);

      // Extend the fork by one slot that exists only in the blockstore.
      let new_slot = last_vote + 1;
      let _ = insert_slots_into_blockstore(
          test_state.blockstore.clone(),
          last_vote,
          &[new_slot],
          TICKS_PER_SLOT,
          test_state.last_blockhash,
      );
      slots.push(new_slot);
      run_and_check_find_bankhash_of_heaviest_fork(&test_state, &slots, new_slot);

      // Add a slot to the blockstore and insert a bank for it directly, on top of
      // new_slot's bank.
      let slot_full_but_not_replayed = last_vote + 2;
      let _ = insert_slots_into_blockstore(
          test_state.blockstore.clone(),
          last_vote,
          &[slot_full_but_not_replayed],
          TICKS_PER_SLOT,
          test_state.last_blockhash,
      );
      let parent_bank = test_state.bank_forks.read().unwrap().get(new_slot).unwrap();
      let new_bank = Bank::new_from_parent(
          parent_bank,
          &Pubkey::default(),
          slot_full_but_not_replayed,
      );
      let _ = test_state
          .bank_forks
          .write()
          .unwrap()
          .insert_from_ledger(new_bank);
      run_and_check_find_bankhash_of_heaviest_fork(
          &test_state,
          &slots,
          slot_full_but_not_replayed,
      );
  }
  3731. }