gce-provider.sh 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. # |source| this file
  2. #
  3. # Utilities for working with GCE instances
  4. #
  #
  # cloud_DefaultZone
  #
  # Print the zone name this provider uses as its default.
  #
  cloud_DefaultZone() {
    printf '%s\n' "us-west1-b"
  }
  #
  # cloud_DefaultCustomMemoryGB
  #
  # Print the default custom memory size (GB, per the function name).
  #
  cloud_DefaultCustomMemoryGB() {
    printf '%s\n' 64
  }
  #
  # cloud_RestartPreemptedInstances [filter]
  #
  # Start every instance selected by the filter whose status is TERMINATED.
  #
  # filter - forwarded unchanged as the `--filter` expression of
  #          `gcloud compute instances list` (the previous header called this
  #          argument namePrefix)
  #
  # Prints one "Starting ..." line per instance before starting it.
  #
  cloud_RestartPreemptedInstances() {
    declare filter="$1"

    declare name status zone
    while read -r name status zone; do
      echo "Starting $status instance: $name"
      (
        set -x
        gcloud compute instances start --zone "$zone" "$name"
      )
    # Compare the status column exactly.  A plain `grep TERMINATED` would also
    # select a running instance whose *name* happens to contain that word.
    done < <(gcloud compute instances list \
               --filter "$filter" \
               --format 'value(name,status,zone)' \
             | awk -F '\t' '$2 == "TERMINATED"')
  }
  #
  # __cloud_FindInstances [filter]
  #
  # Find instances matching the specified gcloud filter.
  #
  # The global `instances` array is reset and then receives one entry per
  # matching instance, RUNNING instances first and TERMINATED instances after:
  #   "name:public IP:private IP:zone"
  # For TERMINATED instances both IP fields hold the literal word TERMINATED.
  # A RUNNING instance without a public IP gets an empty public IP field.
  # Instances in any other state are not reported.  A summary line is printed
  # to stdout for every entry added.
  #
  # filter - The instances to filter on
  #
  # examples:
  #   $ __cloud_FindInstances "name=exact-machine-name"
  #   $ __cloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
  #
  __cloud_FindInstances() {
    declare filter="$1"
    instances=()

    declare name zone publicIp privateIp status

    # gcloud's `value` format is tab separated.  awk selects rows on the exact
    # status column (so a name containing "RUNNING" cannot match) and rewrites
    # the separator to '|': `read` collapses runs of tabs, which would shift
    # every field left when the public IP column is empty.
    while IFS='|' read -r name publicIp privateIp status zone; do
      printf "%-30s | publicIp=%-16s privateIp=%s status=%s zone=%s\n" "$name" "$publicIp" "$privateIp" "$status" "$zone"
      instances+=("$name:$publicIp:$privateIp:$zone")
    done < <(gcloud compute instances list \
               --filter "$filter" \
               --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status,zone)' \
             | awk -F '\t' -v OFS='|' '$4 == "RUNNING" { $1 = $1; print }')

    while IFS='|' read -r name status zone; do
      # Stopped instances are still listed so callers can see (and delete)
      # them; the IP slots carry a marker instead of an address.
      privateIp=TERMINATED
      publicIp=TERMINATED
      printf "%-30s | publicIp=%-16s privateIp=%s status=%s zone=%s\n" "$name" "$publicIp" "$privateIp" "$status" "$zone"
      instances+=("$name:$publicIp:$privateIp:$zone")
    done < <(gcloud compute instances list \
               --filter "$filter" \
               --format 'value(name,status,zone)' \
             | awk -F '\t' -v OFS='|' '$2 == "TERMINATED" { $1 = $1; print }')
  }
  #
  # cloud_FindInstances [namePrefix]
  #
  # Look up every instance whose name starts with the given prefix.
  #
  # The lookup is delegated to __cloud_FindInstances with the filter
  # "name~^<namePrefix>"; that function fills the `instances` array with
  # entries of the form:
  #   "name:public IP:private IP:zone"
  #
  # namePrefix - The instance name prefix to look for
  #
  # examples:
  #   $ cloud_FindInstances all-machines-with-a-common-machine-prefix
  #
  cloud_FindInstances() {
    __cloud_FindInstances "name~^${1}"
  }
  #
  # cloud_FindInstance [name]
  #
  # Look up the instance whose name equals the given name exactly.
  #
  # The lookup is delegated to __cloud_FindInstances with the filter
  # "name=<name>"; that function fills the `instances` array with entries of
  # the form:
  #   "name:public IP:private IP:zone"
  #
  # name - The instance name to look for
  #
  # examples:
  #   $ cloud_FindInstance exact-machine-name
  #
  cloud_FindInstance() {
    __cloud_FindInstances "name=${1}"
  }
  #
  # cloud_Initialize [networkName]
  #
  # One-time setup hook for a testnet; intended to run before
  # |cloud_CreateInstances|.  For GCE it currently only prints a reminder.
  #
  # networkName - unique name of this testnet
  #
  cloud_Initialize() {
    declare networkName="$1"

    # Nothing is provisioned here yet: the 'testnet' network tag is assumed to
    # exist already.  ec2-provider.sh creates its firewall rules
    # programmatically and this provider should eventually do the same.
    printf '%s\n' "Note: one day create $networkName firewall rules programmatically instead of assuming the 'testnet' tag exists"
  }
  #
  # cloud_CreateInstances [networkName] [namePrefix] [numNodes]
  #                       [enableGpu] [machineType] [zone]
  #                       [bootDiskSize] [startupScript] [address]
  #                       [bootDiskType] [additionalDiskSize] [preemptible]
  #
  # Creates one or more identical instances.
  #
  # networkName        - unique name of this testnet
  # namePrefix         - unique string to prefix all the instance names with.
  #                      A single node is named exactly |namePrefix|; multiple
  #                      nodes get a zero-padded index appended.
  # numNodes           - number of instances to create
  # enableGpu          - "true" requests a GPU image.  No such image is
  #                      currently available, so "true" aborts with an error;
  #                      any other value (including empty) selects the stock
  #                      Ubuntu image.
  # machineType        - GCE machine type arguments, split on whitespace.  May
  #                      include `--accelerator=` or other
  #                      |gcloud compute instances create| options; write a
  #                      space inside a single option value as '%20'.
  # zone               - cloud zone
  # bootDiskSize       - Optional size of the boot disk in GB
  # startupScript      - Optional startup script to execute when the instance
  #                      boots
  # address            - Optional name of the GCE static IP address to attach
  #                      to the instance.  Requires that |numNodes| = 1 and
  #                      that the address has been provisioned in the GCE
  #                      region that is hosting `$zone`
  # bootDiskType       - Optional boot disk type, defaults to pd-ssd
  # additionalDiskSize - Optional size of an additional pd-ssd volume that is
  #                      created and attached to every node
  # preemptible        - Optionally request a preemptible instance ("true")
  #
  # Note: argument errors terminate the calling shell with `exit 1`.
  #
  # Tip: use cloud_FindInstances to locate the instances once this function
  # returns
  cloud_CreateInstances() {
    declare networkName="$1"
    declare namePrefix="$2"
    declare numNodes="$3"
    declare enableGpu="$4"
    declare machineType="$5"
    declare zone="$6"
    declare optionalBootDiskSize="$7"
    declare optionalStartupScript="$8"
    declare optionalAddress="$9"
    declare optionalBootDiskType="${10:-pd-ssd}"
    declare optionalAdditionalDiskSize="${11}"
    declare optionalPreemptible="${12}"

    # Compare against the literal "true" rather than executing the argument as
    # a command: an empty argument would otherwise count as success and take
    # the GPU branch.
    declare -a imageArgs
    if [[ $enableGpu = true ]]; then
      # A custom Ubuntu 20.04 LTS image with CUDA 10.2 used to be selected
      # here, but it is not public and has no equivalent for the Ubuntu
      # release used below:
      #   ubuntu-2004-focal-v20201211-with-cuda-10-2 --image-project principal-lane-200702
      # If GPU support is needed again, use the stock image and install CUDA
      # programmatically after the instance boots.
      echo "Error: Not supported" >&2
      exit 1
    else
      imageArgs=(--image ubuntu-2404-noble-amd64-v20250709 --image-project ubuntu-os-cloud)
    fi

    declare -a nodes=()
    declare node
    if [[ $numNodes = 1 ]]; then
      nodes=("$namePrefix")
    else
      # Pad the index to the width of |numNodes| so the names sort naturally
      for node in $(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes"); do
        nodes+=("$node")
      done
    fi

    declare -a args
    args=(
      --zone "$zone"
      --tags testnet
      --metadata "testnet=$networkName"
      --maintenance-policy TERMINATE
      --restart-on-failure
      --scopes compute-rw
      "${imageArgs[@]}"
    )

    if [[ $optionalPreemptible = true ]]; then
      args+=(--preemptible)
    fi

    declare word
    # shellcheck disable=SC2086 # Do not want to quote $machineType as it may contain extra args
    for word in $machineType; do
      # Special handling for the "--min-cpu-platform" argument which may contain a
      # space (escaped as '%20')...
      args+=("${word//%20/ }")
    done

    if [[ -n $optionalBootDiskSize ]]; then
      args+=(--boot-disk-size "${optionalBootDiskSize}GB")
    fi
    if [[ -n $optionalStartupScript ]]; then
      args+=(--metadata-from-file "startup-script=$optionalStartupScript")
    fi
    if [[ -n $optionalBootDiskType ]]; then
      args+=(--boot-disk-type "${optionalBootDiskType}")
    fi

    if [[ -n $optionalAddress ]]; then
      [[ $numNodes = 1 ]] || {
        echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress" >&2
        exit 1
      }
      args+=(--address "$optionalAddress")
    fi

    (
      set -x
      gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
    )

    if [[ -n $optionalAdditionalDiskSize ]]; then
      # |nodes| already holds the exact instance names created above
      for node in "${nodes[@]}"; do
        (
          set -x
          cloud_CreateAndAttachPersistentDisk "$node" "$optionalAdditionalDiskSize" "pd-ssd" "$zone"
        )
      done
    fi
  }
  #
  # cloud_DeleteInstances
  #
  # Deletes all the instances listed in the `instances` array.
  #
  # Each entry is expected to start with the instance name (everything before
  # the first ':') and end with its zone (everything after the last ':').
  # Prints "No instances to delete" and returns when there is nothing to do.
  #
  cloud_DeleteInstances() {
    # ${#instances[0]} is the length of the first entry, so this is true for
    # an empty array as well as for a blank first entry.
    if [[ ${#instances[0]} -eq 0 ]]; then
      echo No instances to delete
      return
    fi

    declare names=("${instances[@]/:*/}")
    declare zones=("${instances[@]/*:/}")

    declare -a unique_zones=()
    declare zone
    while IFS= read -r zone; do
      [[ -n $zone ]] || continue
      unique_zones+=("$zone")
    done < <(printf '%s\n' "${zones[@]}" | sort -u)

    for zone in "${unique_zones[@]}"; do
      # Every name is offered to every zone, so failures for instances that
      # live in a different zone are expected and deliberately ignored.
      # The subshell keeps `set -x` from leaking into the caller.
      (
        set -x
        gcloud beta compute instances delete --zone "$zone" --quiet "${names[@]}"
      ) || true
    done
  }
  #
  # cloud_WaitForInstanceReady [instanceName] [instanceIp] [instanceZone] [timeout]
  #
  # Return once the newly created VM instance is responding. This function is
  # cloud-provider specific.
  #
  # instanceName - not used by this implementation
  # instanceIp   - address to ping; the literal value "TERMINATED" makes the
  #                function return 1 immediately
  # instanceZone - not used by this implementation
  # timeout      - maximum number of seconds to keep pinging
  #
  # Returns the exit status of `timeout`, or 1 for a TERMINATED instance.
  #
  cloud_WaitForInstanceReady() {
    declare instanceIp="$2"
    declare timeout="$4"

    if [[ $instanceIp = "TERMINATED" ]]; then
      return 1
    fi

    # The address is handed to the inner shell as a positional parameter
    # instead of being spliced into the command string, so it is always
    # treated as a single ping argument and never re-parsed as shell code.
    # NOTE(review): `tr - _` rewrites dashes in ping's output, presumably to
    # keep them out of the log -- confirm before removing.
    timeout "${timeout}"s bash -c \
      'set -o pipefail; until ping -c 3 "$1" | tr - _; do echo .; done' \
      _ "$instanceIp"
  }
  #
  # cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile] [zone]
  #
  # Copy a file from the given instance to the local machine using
  # `gcloud compute scp`.
  #
  # instanceName - instance to copy from
  # publicIp     - only inspected for the marker value "TERMINATED", in which
  #                case nothing is copied and 1 is returned
  # remoteFile   - path of the file on the instance
  # localFile    - destination path on the local machine
  # zone         - zone the instance lives in
  #
  cloud_FetchFile() {
    declare instanceName="$1" publicIp="$2" remoteFile="$3" localFile="$4" zone="$5"

    # A stopped instance cannot serve the file
    [[ $publicIp != "TERMINATED" ]] || return 1

    (
      set -x
      gcloud compute scp --zone "$zone" "$instanceName:$remoteFile" "$localFile"
    )
  }
  #
  # cloud_CreateAndAttachPersistentDisk [instanceName] [diskSize] [diskType] [zone]
  #
  # Create a persistent disk named "<instanceName>-pd" and attach it to a
  # pre-existing VM instance.  The disk is set to auto-delete upon instance
  # deletion.
  #
  # instanceName - instance to attach the new disk to
  # diskSize     - value passed to `gcloud ... disks create --size`
  # diskType     - value passed to `gcloud ... disks create --type`
  # zone         - zone of both the instance and the new disk
  #
  # Returns non-zero as soon as one of the gcloud steps fails; the remaining
  # steps are skipped.
  #
  cloud_CreateAndAttachPersistentDisk() {
    declare instanceName="$1"
    declare diskSize="$2"
    declare diskType="$3"
    declare zone="$4"
    declare diskName="${instanceName}-pd"

    # Each step depends on the previous one, so stop at the first failure
    # rather than trying to attach a disk that was never created.
    gcloud beta compute disks create "$diskName" \
      --size "$diskSize" \
      --type "$diskType" \
      --zone "$zone" || return

    gcloud compute instances attach-disk "$instanceName" \
      --disk "$diskName" \
      --zone "$zone" || return

    gcloud compute instances set-disk-auto-delete "$instanceName" \
      --disk "$diskName" \
      --zone "$zone" \
      --auto-delete
  }
  #
  # cloud_StatusAll
  #
  # Placeholder: status reporting has not been implemented for GCE, so this
  # only prints a message saying so.
  #
  cloud_StatusAll() {
    printf '%s\n' "ERROR: cloud_StatusAll is not yet implemented for GCE"
  }