localnet-sanity.sh 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. #!/usr/bin/env bash
  2. set -e
  3. skipSetup=false
  4. iterations=1
  5. restartInterval=never
  6. rollingRestart=false
  7. extraNodes=0
  8. walletRpcPort=:8899
  9. usage() {
  10. exitcode=0
  11. if [[ -n "$1" ]]; then
  12. exitcode=1
  13. echo "Error: $*"
  14. fi
  15. cat <<EOF
  16. usage: $0 [options...]
  17. Start a local cluster and run sanity on it
  18. options:
  19. -i [number] - Number of times to run sanity (default: $iterations)
  20. -k [number] - Restart the cluster after this number of sanity iterations (default: $restartInterval)
  21. -R - Restart the cluster by incrementially stopping and restarting
  22. nodes (at the cadence specified by -k). When disabled all
  23. nodes will be first killed then restarted (default: $rollingRestart)
  24. -b - Disable leader rotation
  25. -x - Add an extra validator (may be supplied multiple times)
  26. -r - Select the RPC endpoint hosted by a node that starts as
  27. a validator node. If unspecified the RPC endpoint hosted by
  28. the bootstrap validator will be used.
  29. -c - Reuse existing node/ledger configuration from a previous sanity
  30. run
  31. EOF
  32. exit $exitcode
  33. }
  34. cd "$(dirname "$0")"/..
  35. while getopts "ch?i:k:brxR" opt; do
  36. case $opt in
  37. h | \?)
  38. usage
  39. ;;
  40. c)
  41. skipSetup=true
  42. ;;
  43. i)
  44. iterations=$OPTARG
  45. ;;
  46. k)
  47. restartInterval=$OPTARG
  48. ;;
  49. x)
  50. extraNodes=$((extraNodes + 1))
  51. ;;
  52. r)
  53. walletRpcPort=":18899"
  54. ;;
  55. R)
  56. rollingRestart=true
  57. ;;
  58. *)
  59. usage "Error: unhandled option: $opt"
  60. ;;
  61. esac
  62. done
  63. source ci/upload-ci-artifact.sh
  64. source scripts/configure-metrics.sh
  65. source multinode-demo/common.sh --prebuild
  66. nodes=(
  67. "multinode-demo/bootstrap-validator.sh \
  68. --no-restart \
  69. --init-complete-file init-complete-node0.log \
  70. --dynamic-port-range 8000-8200"
  71. "multinode-demo/validator.sh \
  72. --no-restart \
  73. --dynamic-port-range 8200-8400
  74. --init-complete-file init-complete-node1.log \
  75. --rpc-port 18899"
  76. )
  77. if [[ extraNodes -gt 0 ]]; then
  78. for i in $(seq 1 $extraNodes); do
  79. portStart=$((8400 + i * 200))
  80. portEnd=$((portStart + 200))
  81. nodes+=(
  82. "multinode-demo/validator.sh \
  83. --no-restart \
  84. --dynamic-port-range $portStart-$portEnd
  85. --label dyn$i \
  86. --init-complete-file init-complete-node$((1 + i)).log"
  87. )
  88. done
  89. fi
  90. numNodes=$((2 + extraNodes))
  91. pids=()
  92. logs=()
  93. getNodeLogFile() {
  94. declare nodeIndex=$1
  95. declare cmd=$2
  96. declare baseCmd
  97. baseCmd=$(basename "${cmd// */}" .sh)
  98. echo "log-$baseCmd-$nodeIndex.txt"
  99. }
  100. startNode() {
  101. declare nodeIndex=$1
  102. declare cmd=$2
  103. echo "--- Start $cmd"
  104. declare log
  105. log=$(getNodeLogFile "$nodeIndex" "$cmd")
  106. rm -f "$log"
  107. $cmd > "$log" 2>&1 &
  108. declare pid=$!
  109. pids+=("$pid")
  110. echo "pid: $pid"
  111. echo "log: $log"
  112. }
  113. waitForNodeToInit() {
  114. declare initCompleteFile=$1
  115. while [[ ! -r $initCompleteFile ]]; do
  116. if [[ $SECONDS -ge 300 ]]; then
  117. echo "^^^ +++"
  118. echo "Error: $initCompleteFile not found in $SECONDS seconds"
  119. exit 1
  120. fi
  121. echo "Waiting for $initCompleteFile ($SECONDS)..."
  122. sleep 2
  123. done
  124. echo "Found $initCompleteFile"
  125. }
  126. initCompleteFiles=()
  127. waitForAllNodesToInit() {
  128. echo "--- ${#initCompleteFiles[@]} nodes booting"
  129. SECONDS=
  130. for initCompleteFile in "${initCompleteFiles[@]}"; do
  131. waitForNodeToInit "$initCompleteFile"
  132. done
  133. echo "All nodes finished booting in $SECONDS seconds"
  134. }
  135. startNodes() {
  136. declare addLogs=false
  137. if [[ ${#logs[@]} -eq 0 ]]; then
  138. addLogs=true
  139. fi
  140. initCompleteFiles=()
  141. maybeExpectedGenesisHash=
  142. for i in $(seq 0 $((${#nodes[@]} - 1))); do
  143. # wait for bootstrap validator to boot before starting other validators
  144. if [[ "$i" -eq 1 ]]; then
  145. SECONDS=
  146. waitForNodeToInit "$initCompleteFile"
  147. (
  148. set -x
  149. $solana_cli --keypair config/bootstrap-validator/identity.json \
  150. --url http://127.0.0.1:8899 genesis-hash
  151. ) | tee genesis-hash.log
  152. maybeExpectedGenesisHash="--expected-genesis-hash $(tail -n1 genesis-hash.log)"
  153. fi
  154. declare cmd=${nodes[$i]}
  155. declare initCompleteFile="init-complete-node$i.log"
  156. rm -f "$initCompleteFile"
  157. initCompleteFiles+=("$initCompleteFile")
  158. startNode "$i" "$cmd $maybeExpectedGenesisHash"
  159. if $addLogs; then
  160. logs+=("$(getNodeLogFile "$i" "$cmd")")
  161. fi
  162. done
  163. waitForAllNodesToInit
  164. }
  165. killNode() {
  166. declare pid=$1
  167. set +e
  168. if kill "$pid"; then
  169. echo "Waiting for $pid to exit..."
  170. wait "$pid"
  171. echo "$pid exited with $?"
  172. fi
  173. set -e
  174. }
  175. killNodes() {
  176. [[ ${#pids[@]} -gt 0 ]] || return
  177. # Try to use the RPC exit API to cleanly exit the first two nodes
  178. # (dynamic nodes, -x, are just killed)
  179. echo "--- RPC exit"
  180. $agave_validator --ledger "$SOLANA_CONFIG_DIR"/bootstrap-validator exit --force || true
  181. $agave_validator --ledger "$SOLANA_CONFIG_DIR"/validator exit --force || true
  182. # Give the nodes a splash of time to cleanly exit before killing them
  183. sleep 2
  184. echo "--- Killing nodes: ${pids[*]}"
  185. for pid in "${pids[@]}"; do
  186. killNode "$pid"
  187. done
  188. echo "done killing nodes"
  189. pids=()
  190. }
  191. rollingNodeRestart() {
  192. if [[ ${#logs[@]} -ne ${#nodes[@]} ]]; then
  193. echo "^^^ +++"
  194. echo "Error: log/nodes array length mismatch"
  195. exit 1
  196. fi
  197. if [[ ${#pids[@]} -ne ${#nodes[@]} ]]; then
  198. echo "^^^ +++"
  199. echo "Error: pids/nodes array length mismatch"
  200. exit 1
  201. fi
  202. declare oldPids=("${pids[@]}")
  203. for i in $(seq 0 $((${#logs[@]} - 1))); do
  204. declare pid=${oldPids[$i]}
  205. declare cmd=${nodes[$i]}
  206. if [[ $i -eq 0 ]]; then
  207. # First cmd should be the faucet, don't restart it.
  208. [[ "$cmd" = "multinode-demo/faucet.sh" ]]
  209. pids+=("$pid")
  210. else
  211. echo "--- Restarting $pid: $cmd"
  212. killNode "$pid"
  213. # Delay 20 seconds to ensure the remaining cluster nodes will
  214. # hit CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS (currently 15 seconds) for the
  215. # node that was just stopped
  216. echo "(sleeping for 20 seconds)"
  217. sleep 20
  218. declare initCompleteFile="init-complete-node$i.log"
  219. rm -f "$initCompleteFile"
  220. initCompleteFiles+=("$initCompleteFile")
  221. startNode "$i" "$cmd"
  222. fi
  223. done
  224. # 'Atomically' remove the old pids from the pids array
  225. declare oldPidsList
  226. oldPidsList="$(printf ":%s" "${oldPids[@]}"):"
  227. declare newPids=("${pids[0]}") # 0 = faucet pid
  228. for pid in "${pids[@]}"; do
  229. [[ $oldPidsList =~ :$pid: ]] || {
  230. newPids+=("$pid")
  231. }
  232. done
  233. pids=("${newPids[@]}")
  234. waitForAllNodesToInit
  235. }
  236. verifyLedger() {
  237. for ledger in bootstrap-validator validator; do
  238. echo "--- $ledger ledger verification"
  239. (
  240. set -x
  241. $solana_ledger_tool --ledger "$SOLANA_CONFIG_DIR"/$ledger verify
  242. ) || flag_error
  243. done
  244. }
  245. shutdown() {
  246. exitcode=$?
  247. killNodes
  248. set +e
  249. echo "--- Upload artifacts"
  250. for log in "${logs[@]}"; do
  251. upload-ci-artifact "$log"
  252. tail "$log"
  253. done
  254. exit $exitcode
  255. }
  256. trap shutdown EXIT INT
  257. set -e
  258. declare iteration=1
  259. flag_error() {
  260. echo "Failed (iteration: $iteration/$iterations)"
  261. echo "^^^ +++"
  262. exit 1
  263. }
# --- Main driver -----------------------------------------------------------
# Prepare the cluster configuration: build a fresh one, or (when -c was given)
# keep the existing one and verify its ledgers before the nodes are started.
if ! $skipSetup; then
  clear_config_dir "$SOLANA_CONFIG_DIR"
  multinode-demo/setup.sh --hashes-per-tick sleep
else
  verifyLedger
fi
startNodes

# Transaction count reported in the previous iteration; empty until the first
# iteration has stored a value.
lastTransactionCount=
while [[ $iteration -le $iterations ]]; do
  # Check 1: gossip must see exactly $numNodes nodes. Each check runs in a
  # subshell so that `set -x` tracing stays local to it; a failing subshell
  # ends the run through flag_error.
  echo "--- Node count ($iteration)"
  (
    set -x
    client_keypair=/tmp/client-id.json-$$
    $solana_keygen new --no-passphrase -fso $client_keypair || exit $?
    # NOTE(review): when this command fails the subshell exits before the rm
    # below, so the keypair file stays behind in /tmp.
    $solana_gossip --allow-private-addr spy -n 127.0.0.1:8001 --num-nodes-exactly $numNodes || exit $?
    rm -rf $client_keypair
  ) || flag_error

  # Check 2: the endpoint on port 8899 answers getTransactionCount. The raw
  # response is saved to log-transactionCount.txt for the check further down.
  echo "--- RPC API: bootstrap-validator getTransactionCount ($iteration)"
  (
    set -x
    curl --retry 5 --retry-delay 2 --retry-connrefused \
      -X POST -H 'Content-Type: application/json' \
      -d '{"jsonrpc":"2.0","id":1, "method":"getTransactionCount"}' \
      -o log-transactionCount.txt \
      http://localhost:8899
    cat log-transactionCount.txt
  ) || flag_error

  # Check 3: the endpoint on port 18899 answers the same request (the
  # response is only printed, not stored).
  echo "--- RPC API: validator getTransactionCount ($iteration)"
  (
    set -x
    curl --retry 5 --retry-delay 2 --retry-connrefused \
      -X POST -H 'Content-Type: application/json' \
      -d '{"jsonrpc":"2.0","id":1, "method":"getTransactionCount"}' \
      http://localhost:18899
  ) || flag_error

  # Verify transaction count as reported by the bootstrap-validator node is advancing
  # The sed expression reduces a response of the exact form
  # {"jsonrpc":"2.0","result":N,"id":1} to N.
  # NOTE(review): any other response text passes through sed unchanged, and
  # the numeric comparison below is then not meaningful - confirm the
  # response format or validate the extracted value.
  transactionCount=$(sed -e 's/{"jsonrpc":"2.0","result":\([0-9]*\),"id":1}/\1/' log-transactionCount.txt)
  if [[ -n $lastTransactionCount ]]; then
    echo "--- Transaction count check: $lastTransactionCount < $transactionCount"
    if [[ $lastTransactionCount -ge $transactionCount ]]; then
      echo "Error: Transaction count is not advancing"
      echo "* lastTransactionCount: $lastTransactionCount"
      echo "* transactionCount: $transactionCount"
      flag_error
    fi
  fi
  lastTransactionCount=$transactionCount

  # Check 4: wallet sanity against the selected RPC endpoint, limited to 60
  # seconds by timeout.
  echo "--- Wallet sanity ($iteration)"
  (
    set -x
    timeout 60s scripts/wallet-sanity.sh --url http://127.0.0.1"$walletRpcPort"
  ) || flag_error

  iteration=$((iteration + 1))

  # Restart the cluster when the restart cadence is hit. The counter has
  # already been advanced, so the modulo is taken on the number of the
  # upcoming iteration.
  if [[ $restartInterval != never && $((iteration % restartInterval)) -eq 0 ]]; then
    if $rollingRestart; then
      rollingNodeRestart
    else
      # Full restart: stop everything, verify the ledgers while no node is
      # running, then boot the cluster again.
      killNodes
      verifyLedger
      startNodes
    fi
  fi
done

# All iterations passed: stop the nodes and verify the ledgers one last time.
killNodes
verifyLedger
echo +++
echo "Ok ($iterations iterations)"
exit 0