automation_utils.sh 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. #!/usr/bin/env bash
  2. # | source | this file
  3. # shellcheck disable=SC1090
  4. # shellcheck disable=SC1091
  5. # shellcheck disable=SC2034
  6. DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
  7. REPO_ROOT=${DIR}/..
  8. source "${REPO_ROOT}"/ci/upload-ci-artifact.sh
  9. function execution_step {
  10. # shellcheck disable=SC2124
  11. STEP="$@"
  12. echo --- "${STEP[@]}"
  13. }
  14. function collect_logs {
  15. execution_step "Collect logs from remote nodes"
  16. rm -rf "${REPO_ROOT}"/net/log
  17. "${REPO_ROOT}"/net/net.sh logs
  18. for logfile in "${REPO_ROOT}"/net/log/*; do
  19. (
  20. upload-ci-artifact "$logfile"
  21. )
  22. done
  23. }
  24. function analyze_packet_loss {
  25. (
  26. set -x
  27. # shellcheck disable=SC1091
  28. source "${REPO_ROOT}"/net/config/config
  29. mkdir -p iftop-logs
  30. execution_step "Map private -> public IP addresses in iftop logs"
  31. # shellcheck disable=SC2154
  32. for i in "${!validatorIpList[@]}"; do
  33. # shellcheck disable=SC2154
  34. # shellcheck disable=SC2086
  35. # shellcheck disable=SC2027
  36. echo "{\"private\": \""${validatorIpListPrivate[$i]}""\", \"public\": \""${validatorIpList[$i]}""\"},"
  37. done > ip_address_map.txt
  38. for ip in "${validatorIpList[@]}"; do
  39. "${REPO_ROOT}"/net/scp.sh ip_address_map.txt solana@"$ip":~/solana/
  40. done
  41. execution_step "Remotely post-process iftop logs"
  42. # shellcheck disable=SC2154
  43. for ip in "${validatorIpList[@]}"; do
  44. iftop_log=iftop-logs/$ip-iftop.log
  45. # shellcheck disable=SC2016
  46. "${REPO_ROOT}"/net/ssh.sh solana@"$ip" 'PATH=$PATH:~/.cargo/bin/ ~/solana/scripts/iftop-postprocess.sh ~/solana/iftop.log temp.log ~solana/solana/ip_address_map.txt' > "$iftop_log"
  47. upload-ci-artifact "$iftop_log"
  48. done
  49. execution_step "Analyzing Packet Loss"
  50. "${REPO_ROOT}"/solana-release/bin/solana-log-analyzer analyze -f ./iftop-logs/ | sort -k 2 -g
  51. )
  52. }
  53. function wait_for_max_stake {
  54. max_stake="$1"
  55. if [[ $max_stake -eq 100 ]]; then
  56. return
  57. fi
  58. source "${REPO_ROOT}"/net/common.sh
  59. loadConfigFile
  60. # shellcheck disable=SC2154
  61. # shellcheck disable=SC2029
  62. ssh "${sshOptions[@]}" "${validatorIpList[0]}" "RUST_LOG=info \$HOME/.cargo/bin/solana wait-for-max-stake $max_stake --url http://127.0.0.1:8899"
  63. }
  64. function wait_for_equal_stake {
  65. source "${REPO_ROOT}"/net/common.sh
  66. loadConfigFile
  67. max_stake=$((100 / ${#validatorIpList[@]} + 1))
  68. execution_step "Waiting for max stake to fall below ${max_stake}%"
  69. wait_for_max_stake $max_stake
  70. }
  71. function get_slot {
  72. source "${REPO_ROOT}"/net/common.sh
  73. loadConfigFile
  74. ssh "${sshOptions[@]}" "${validatorIpList[0]}" '$HOME/.cargo/bin/solana --url http://127.0.0.1:8899 slot'
  75. }
  76. function get_bootstrap_validator_ip_address {
  77. source "${REPO_ROOT}"/net/common.sh
  78. loadConfigFile
  79. echo "${validatorIpList[0]}"
  80. }
  81. function get_active_stake {
  82. source "${REPO_ROOT}"/net/common.sh
  83. loadConfigFile
  84. ssh "${sshOptions[@]}" "${validatorIpList[0]}" \
  85. '$HOME/.cargo/bin/solana --url http://127.0.0.1:8899 validators --output=json | grep -o "totalActiveStake\": [0-9]*" | cut -d: -f2'
  86. }
  87. function get_current_stake {
  88. source "${REPO_ROOT}"/net/common.sh
  89. loadConfigFile
  90. ssh "${sshOptions[@]}" "${validatorIpList[0]}" \
  91. '$HOME/.cargo/bin/solana --url http://127.0.0.1:8899 validators --output=json | grep -o "totalCurrentStake\": [0-9]*" | cut -d: -f2'
  92. }
  93. function get_validator_confirmation_time {
  94. SINCE=$1
  95. declare q_mean_confirmation='
  96. SELECT ROUND(MEAN("duration_ms")) as "mean_confirmation_ms"
  97. FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
  98. WHERE time > now() - '"$SINCE"'s'
  99. mean_confirmation_ms=$( \
  100. curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
  101. --data-urlencode "db=${TESTNET_TAG}" \
  102. --data-urlencode "q=$q_mean_confirmation" |
  103. python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py --empty_error |
  104. cut -d' ' -f2)
  105. }
  106. function collect_performance_statistics {
  107. execution_step "Collect performance statistics about run"
  108. # total_transactions will be 0 when the node is leader, so exclude those
  109. declare q_mean_tps='
  110. SELECT ROUND(MEAN("median_sum")) as "mean_tps" FROM (
  111. SELECT MEDIAN(sum_total_transactions) AS "median_sum" FROM (
  112. SELECT SUM("total_transactions") AS "sum_total_transactions"
  113. FROM "'$TESTNET_TAG'"."autogen"."replay-slot-stats"
  114. WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND total_transactions > 0
  115. GROUP BY time(1s), host_id)
  116. GROUP BY time(1s)
  117. )'
  118. declare q_max_tps='
  119. SELECT MAX("median_sum") as "max_tps" FROM (
  120. SELECT MEDIAN(sum_total_transactions) AS "median_sum" FROM (
  121. SELECT SUM("total_transactions") AS "sum_total_transactions"
  122. FROM "'$TESTNET_TAG'"."autogen"."replay-slot-stats"
  123. WHERE time > now() - '"$TEST_DURATION_SECONDS"'s AND total_transactions > 0
  124. GROUP BY time(1s), host_id)
  125. GROUP BY time(1s)
  126. )'
  127. declare q_mean_confirmation='
  128. SELECT round(mean("duration_ms")) as "mean_confirmation_ms"
  129. FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
  130. WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
  131. declare q_max_confirmation='
  132. SELECT round(max("duration_ms")) as "max_confirmation_ms"
  133. FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
  134. WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
  135. declare q_99th_confirmation='
  136. SELECT round(percentile("duration_ms", 99)) as "99th_percentile_confirmation_ms"
  137. FROM "'$TESTNET_TAG'"."autogen"."validator-confirmation"
  138. WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
  139. declare q_max_tower_distance_observed='
  140. SELECT MAX("tower_distance") as "max_tower_distance" FROM (
  141. SELECT last("slot") - last("root") as "tower_distance"
  142. FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
  143. WHERE time > now() - '"$TEST_DURATION_SECONDS"'s
  144. GROUP BY time(1s), host_id)'
  145. declare q_last_tower_distance_observed='
  146. SELECT MEAN("tower_distance") as "last_tower_distance" FROM (
  147. SELECT last("slot") - last("root") as "tower_distance"
  148. FROM "'$TESTNET_TAG'"."autogen"."tower-observed"
  149. GROUP BY host_id)'
  150. curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
  151. --data-urlencode "db=${TESTNET_TAG}" \
  152. --data-urlencode "q=$q_mean_tps;$q_max_tps;$q_mean_confirmation;$q_max_confirmation;$q_99th_confirmation;$q_max_tower_distance_observed;$q_last_tower_distance_observed" |
  153. python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser.py >>"$RESULT_FILE"
  154. declare q_dropped_vote_hash_count='
  155. SELECT sum("count") as "sum_dropped_vote_hash"
  156. FROM "'$TESTNET_TAG'"."autogen"."dropped-vote-hash"
  157. WHERE time > now() - '"$TEST_DURATION_SECONDS"'s'
  158. # store in variable to be returned
  159. dropped_vote_hash_count=$( \
  160. curl -G "${INFLUX_HOST}/query?u=ro&p=topsecret" \
  161. --data-urlencode "db=${TESTNET_TAG}" \
  162. --data-urlencode "q=$q_dropped_vote_hash_count" |
  163. python3 "${REPO_ROOT}"/system-test/testnet-automation-json-parser-missing.py)
  164. }
  165. function upload_results_to_slack() {
  166. echo --- Uploading results to Slack Performance Results App
  167. if [[ -z $SLACK_WEBHOOK_URL ]] ; then
  168. echo "SLACK_WEBHOOOK_URL undefined"
  169. exit 1
  170. fi
  171. [[ -n $BUILDKITE_MESSAGE ]] || BUILDKITE_MESSAGE="Message not defined"
  172. COMMIT=$(git rev-parse HEAD)
  173. COMMIT_BUTTON_TEXT="$(echo "$COMMIT" | head -c 8)"
  174. COMMIT_URL="https://github.com/solana-labs/solana/commit/${COMMIT}"
  175. if [[ -n $BUILDKITE_BUILD_URL ]] ; then
  176. BUILD_BUTTON_TEXT="Build Kite Job"
  177. else
  178. BUILD_BUTTON_TEXT="Build URL not defined"
  179. BUILDKITE_BUILD_URL="https://buildkite.com/solana-labs/"
  180. fi
  181. GRAFANA_URL="https://internal-metrics.solana.com:3000/d/monitor-${CHANNEL:-edge}/cluster-telemetry-${CHANNEL:-edge}?var-testnet=${TESTNET_TAG:-testnet-automation}&from=${TESTNET_START_UNIX_MSECS:-0}&to=${TESTNET_FINISH_UNIX_MSECS:-0}"
  182. [[ -n $RESULT_DETAILS ]] || RESULT_DETAILS="Undefined"
  183. [[ -n $TEST_CONFIGURATION ]] || TEST_CONFIGURATION="Undefined"
  184. payLoad="$(cat <<EOF
  185. {
  186. "blocks": [
  187. {
  188. "type": "section",
  189. "text": {
  190. "type": "mrkdwn",
  191. "text": "*$BUILDKITE_MESSAGE*"
  192. }
  193. },
  194. {
  195. "type": "actions",
  196. "elements": [
  197. {
  198. "type": "button",
  199. "text": {
  200. "type": "plain_text",
  201. "text": "$COMMIT_BUTTON_TEXT",
  202. "emoji": true
  203. },
  204. "url": "$COMMIT_URL"
  205. },
  206. {
  207. "type": "button",
  208. "text": {
  209. "type": "plain_text",
  210. "text": "$BUILD_BUTTON_TEXT",
  211. "emoji": true
  212. },
  213. "url": "$BUILDKITE_BUILD_URL"
  214. },
  215. {
  216. "type": "button",
  217. "text": {
  218. "type": "plain_text",
  219. "text": "Grafana",
  220. "emoji": true
  221. },
  222. "url": "$GRAFANA_URL"
  223. }
  224. ]
  225. },
  226. {
  227. "type": "divider"
  228. },
  229. {
  230. "type": "section",
  231. "text": {
  232. "type": "mrkdwn",
  233. "text": "Test Configuration: \n\`\`\`$TEST_CONFIGURATION\`\`\`"
  234. }
  235. },
  236. {
  237. "type": "divider"
  238. },
  239. {
  240. "type": "section",
  241. "text": {
  242. "type": "mrkdwn",
  243. "text": "Result Details: \n\`\`\`$RESULT_DETAILS\`\`\`"
  244. }
  245. }
  246. ]
  247. }
  248. EOF
  249. )"
  250. curl -X POST \
  251. -H 'Content-type: application/json' \
  252. --data "$payLoad" \
  253. "$SLACK_WEBHOOK_URL"
  254. }
  255. function upload_results_to_discord() {
  256. echo --- Uploading results to Discord Performance Results App
  257. if [[ -z $DISCORD_WEBHOOK_URL ]] ; then
  258. echo "DISCORD_WEBHOOK_URL undefined"
  259. exit 1
  260. fi
  261. [[ -n $BUILDKITE_MESSAGE ]] || BUILDKITE_MESSAGE="Message not defined"
  262. COMMIT=$(git rev-parse HEAD)
  263. COMMIT_BUTTON_TEXT="$(echo "$COMMIT" | head -c 8)"
  264. COMMIT_URL="https://github.com/solana-labs/solana/commit/${COMMIT}"
  265. if [[ -n $BUILDKITE_BUILD_URL ]] ; then
  266. BUILD_BUTTON_TEXT="Build Kite Job"
  267. else
  268. BUILD_BUTTON_TEXT="Build URL not defined"
  269. BUILDKITE_BUILD_URL="https://buildkite.com/solana-labs/"
  270. fi
  271. GRAFANA_URL="https://internal-metrics.solana.com:3000/d/monitor-${CHANNEL:-edge}/cluster-telemetry-${CHANNEL:-edge}?var-testnet=${TESTNET_TAG:-testnet-automation}&from=${TESTNET_START_UNIX_MSECS:-0}&to=${TESTNET_FINISH_UNIX_MSECS:-0}"
  272. [[ -n $RESULT_DETAILS ]] || RESULT_DETAILS="Undefined"
  273. SANITIZED_RESULT=${RESULT_DETAILS//$'\n'/"\n"}
  274. [[ -n $TEST_CONFIGURATION ]] || TEST_CONFIGURATION="Undefined"
  275. curl "$DISCORD_WEBHOOK_URL" \
  276. -X POST \
  277. -H "Content-Type: application/json" \
  278. -d @- <<EOF
  279. {
  280. "username": "System Performance Test",
  281. "content": "\
  282. **$BUILDKITE_MESSAGE**\n\
  283. [$COMMIT_BUTTON_TEXT](<$COMMIT_URL>) | [$BUILD_BUTTON_TEXT](<$BUILDKITE_BUILD_URL>) | [Grafana](<$GRAFANA_URL>)\n\
  284. Test Configuration:\n\
  285. \`\`\`$TEST_CONFIGURATION\`\`\`\n\
  286. Result Details:\n\
  287. \`\`\`$SANITIZED_RESULT\`\`\`\n\
  288. "
  289. }
  290. EOF
  291. }
  292. function get_net_launch_software_version_launch_args() {
  293. declare channel="${1?}"
  294. declare artifact_basename="${2?}"
  295. declare return_varname="${3:?}"
  296. if [[ -n $channel ]]; then
  297. eval "$return_varname=-t\ \$channel"
  298. else
  299. execution_step "Downloading tar from build artifacts (${artifact_basename})"
  300. buildkite-agent artifact download "${artifact_basename}*.tar.bz2" .
  301. eval "$return_varname=-T\ \${artifact_basename}*.tar.bz2"
  302. fi
  303. }