Explorar o código

Add support for Azure instances in testnet creation (#3905)

* Add support for Azure instances in testnet creation

* Fixup

* Fix shellcheck errors

* More shellcheck and cleanup node creation and deletion

* More shellcheck and cleanup node creation and deletion

* Fixup instance wait API

* Fix review comments and add GPU installation extension
Dan Albert hai 6 anos
pai
achega
4e7e5ace9d
Modificáronse 7 ficheiros con 354 adicións e 6 borrados
  1. 1 1
      net/README.md
  2. 1 0
      net/azure.sh
  3. 19 4
      net/gce.sh
  4. 1 1
      net/net.sh
  5. 306 0
      net/scripts/azure-provider.sh
  6. 13 0
      net/scripts/ec2-provider.sh
  7. 13 0
      net/scripts/gce-provider.sh

+ 1 - 1
net/README.md

@@ -32,7 +32,7 @@ NOTE: This example uses GCE.  If you are using AWS EC2, replace `./gce.sh` with
 ```bash
 $ cd net/
 $ ./gce.sh create -n 5 -c 1     #<-- Create a GCE testnet with 5 additional nodes (beyond the bootstrap node) and 1 client (billing starts here)
-$ ./init-metrics.sh $(whoami)   #<-- Configure a metrics database for the testnet
+$ ./init-metrics.sh -c $(whoami)   #<-- Configure a metrics database for the testnet
 $ ./net.sh start                #<-- Deploy the network from the local workspace and start all clients with bench-tps
 $ ./ssh.sh                      #<-- Details on how to ssh into any testnet node to access logs/etc
 $ ./gce.sh delete               #<-- Dispose of the network (billing stops here)

+ 1 - 0
net/azure.sh

@@ -0,0 +1 @@
+gce.sh

+ 19 - 4
net/gce.sh

@@ -30,6 +30,18 @@ ec2)
   clientMachineType=m4.2xlarge
   blockstreamerMachineType=m4.2xlarge
   ;;
+azure)
+  # shellcheck source=net/scripts/azure-provider.sh
+  source "$here"/scripts/azure-provider.sh
+
+  # TODO: Dial in machine types for Azure
+  cpuBootstrapLeaderMachineType=Standard_D16s_v3
+  gpuBootstrapLeaderMachineType=Standard_NC12
+  bootstrapLeaderMachineType=$cpuBootstrapLeaderMachineType
+  fullNodeMachineType=$cpuBootstrapLeaderMachineType
+  clientMachineType=Standard_D16s_v3
+  blockstreamerMachineType=Standard_D16s_v3
+  ;;
 *)
   echo "Error: Unknown cloud provider: $cloudProvider"
   ;;
@@ -191,6 +203,8 @@ gce)
   ;;
 ec2)
   ;;
+azure)
+  ;;
 *)
   echo "Error: Unknown cloud provider: $cloudProvider"
   ;;
@@ -202,10 +216,10 @@ esac
 #
 #   cmd   - The command to execute on each instance
 #           The command will receive arguments followed by any
-#           additionl arguments supplied to cloud_ForEachInstance:
+#           additional arguments supplied to cloud_ForEachInstance:
 #               name     - name of the instance
 #               publicIp - The public IP address of this instance
-#               privateIp - The priate IP address of this instance
+#               privateIp - The private IP address of this instance
 #               count    - Monotonically increasing count for each
 #                          invocation of cmd, starting at 1
 #               ...      - Extra args to cmd..
@@ -293,8 +307,9 @@ EOF
       declare nodeZone
       IFS=: read -r nodeName nodeIp _ nodeZone < <(echo "${instances[0]}")
 
-      # Try to ping the machine first.
-      timeout 90s bash -c "set -o pipefail; until ping -c 3 $nodeIp | tr - _; do echo .; done"
+      # Make sure the machine is alive or pingable
+      timeout_sec=90
+      cloud_WaitForInstanceReady "$nodeName" "$nodeIp" "$nodeZone" "$timeout_sec"
 
       if [[ ! -r $sshPrivateKey ]]; then
         echo "Fetching $sshPrivateKey from $nodeName"

+ 1 - 1
net/net.sh

@@ -302,7 +302,7 @@ startNode() {
       "
   ) >> "$logFile" 2>&1 &
   declare pid=$!
-  ln -sf "fullnode-$ipAddress.log" "$netLogDir/fullnode-$pid.log"
+  ln -sfT "fullnode-$ipAddress.log" "$netLogDir/fullnode-$pid.log"
   pids+=("$pid")
 }
 

+ 306 - 0
net/scripts/azure-provider.sh

@@ -0,0 +1,306 @@
+# |source| this file
+#
+# Utilities for working with Azure instances
+#
+
+# Default zone
+cloud_DefaultZone() {
+  # Print the name of the default Azure location on stdout.
+  printf '%s\n' "westus"
+}
+
+#
+# cloud_GetConfigValueFromInstanceName [instanceName] [key]
+#
+# Look up a piece of configuration information about an instance.
+# Provide the exact name of an instance and the configuration key; the
+# corresponding value is stored in the global `config_value` variable.
+#
+# example:
+#   This will store the resource group of the instance named
+#   some-instance-name in `config_value`:
+#   cloud_GetConfigValueFromInstanceName some-instance-name resourceGroup
+
+cloud_GetConfigValueFromInstanceName() {
+  # Look up one configuration value of the VM whose name is exactly $1.
+  #   $1 - exact instance name
+  #   $2 - query key to extract, eg `resourceGroup`
+  # The result is stored in the global `config_value`, which the caller reads
+  # after this function returns.
+  #
+  # The query fragments are kept local so they cannot clobber same-named
+  # variables in the caller's scope.
+  declare query="[?name=='$1']"
+  declare key="[$2]"
+  config_value=$(az vm list -d -o tsv --query "$query.$key")
+}
+
+cloud_GetResourceGroupFromInstanceName() {
+  # Store the resource group of the VM whose name is exactly $1 in the global
+  # `resourceGroup`.
+  declare nameFilter="[?name=='$1']"
+  resourceGroup=$(az vm list -o tsv --query "${nameFilter}.[resourceGroup]")
+}
+cloud_GetIdFromInstanceName() {
+  # Store the resource ID of the VM whose name is exactly $1 in the global
+  # `id`.
+  declare nameFilter="[?name=='$1']"
+  id=$(az vm list -o tsv --query "${nameFilter}.[id]")
+}
+
+#
+# __cloud_FindInstances
+#
+# Find instances matching the specified pattern.
+#
+# For each matching instance, an entry in the `instances` array will be added with the
+# following information about the instance:
+#   "name:public IP:private IP:location"
+#
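+# filter   - The kind of match to perform: `prefix` or `name`
+# value    - The instance name prefix (for `prefix`) or the exact instance
+#            name (for `name`) to match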
+#
+# examples:
+#   $ __cloud_FindInstances prefix some-machine-prefix
+#   $ __cloud_FindInstances name exact-machine-name
+#
+#  Examples of plain-text filter command
+#
+#  This will return an exact match for a machine named pgnode
+#  az vm list -d --query "[?name=='pgnode'].[name,publicIps,privateIps,location]"
+#
+#  This will return a match for any machine with prefix pgnode, ex: pgnode and pgnode2
+#  az vm list -d --query "[?starts_with(name,'pgnode')].[name,publicIps,privateIps,location]"
+__cloud_FindInstances() {
+  case $1 in
+    prefix)
+      query="[?starts_with(name,'$2')]"
+      ;;
+    name)
+      query="[?name=='$2']"
+      ;;
+    *)
+      echo "Unknown filter command: $1"
+      ;;
+  esac
+
+  keys="[name,publicIps,privateIps,location]"
+
+  instances=()
+  while read -r name publicIp privateIp location; do
+    instances+=("$name:$publicIp:$privateIp:$location")
+  done < <(az vm list -d -o tsv --query "$query.$keys")
+  echo "${instances[*]}"
+}
+
+#
+# cloud_FindInstances [namePrefix]
+#
+# Find instances with names matching the specified prefix
+#
+# For each matching instance, an entry in the `instances` array will be added with the
+# following information about the instance:
+#   "name:public IP:private IP:location"
+#
+# namePrefix - The instance name prefix to look for
+#
+# examples:
+#   $ cloud_FindInstances all-machines-with-a-common-machine-prefix
+#
+cloud_FindInstances() {
+  # Prefix match: every instance whose name starts with $1 is collected.
+  declare namePrefix="$1"
+  __cloud_FindInstances prefix "$namePrefix"
+}
+
+#
+# cloud_FindInstance [name]
+#
+# Find an instance with a name matching the exact pattern.
+#
+# For each matching instance, an entry in the `instances` array will be added with the
+# following information about the instance:
+#   "name:public IP:private IP:location"
+#
+# name - The instance name to look for
+#
+# examples:
+#   $ cloud_FindInstance exact-machine-name
+#
+cloud_FindInstance() {
+  # Exact match: only the instance whose name equals $1 is collected.
+  declare exactName="$1"
+  __cloud_FindInstances name "$exactName"
+}
+
+#
+# cloud_Initialize [networkName]
+#
+# Perform one-time initialization that may be required for the given testnet.
+#
+# networkName   - unique name of this testnet
+#
+# This function will be called before |cloud_CreateInstances|
+cloud_Initialize() {
+  declare testnetName="$1"
+  # Nothing is set up on the Azure side yet.  ec2-provider.sh creates its
+  # firewall rules programmatically and this provider should do the same;
+  # until then only a reminder is printed.
+  printf '%s\n' "TODO: create $testnetName firewall rules programmatically instead of assuming the 'testnet' tag exists"
+}
+
+#
+# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [enableGpu]
+#                       [machineType] [zone] [bootDiskSize] [startupScript]
+#                       [address] [bootDiskType]
+#
+# Creates one or more identical instances.
+#
+# networkName   - unique name of this testnet; also used as the name of the
+#                 Azure resource group that holds the instances
+# namePrefix    - unique string to prefix all the instance names with
+# numNodes      - number of instances to create
+# enableGpu     - Optionally enable GPU, use the value "true" to install the
+#                 NVIDIA GPU driver extension on every instance
+# machineType   - Azure VM size, eg "Standard_D16s_v3"
+# zone          - Azure location to create the instances in, eg "westus"
+# bootDiskSize  - Optional size of the boot disk in GB
+# startupScript - Optional startup script to execute when the instance boots
+# address       - Optional public IP address to attach to the instance.
+#                 Requires that |numNodes| = 1
+# bootDiskType  - Optional; not configurable on Azure, a notice is printed
+#                 when it is supplied
+#
+# Tip: use cloud_FindInstances to locate the instances once this function
+#      returns
+cloud_CreateInstances() {
+  # Positional parameters, in order: networkName namePrefix numNodes enableGpu
+  # machineType zone [bootDiskSize] [startupScript] [address] [bootDiskType]
+  declare networkName="$1"
+  declare namePrefix="$2"
+  declare numNodes="$3"
+  declare enableGpu="$4"
+  declare machineType="$5"
+  declare zone="$6"
+  declare optionalBootDiskSize="$7"
+  declare optionalStartupScript="$8"
+  declare optionalAddress="$9"
+  declare optionalBootDiskType="${10}"
+
+  # A single node is named exactly |namePrefix|; multiple nodes get a
+  # zero-padded numeric suffix as wide as the number of digits in |numNodes|.
+  declare -a nodes
+  if [[ $numNodes = 1 ]]; then
+    nodes=("$namePrefix")
+  else
+    for node in $(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes"); do
+      nodes+=("$node")
+    done
+  fi
+
+  # Arguments shared by every |az vm create| invocation below.
+  declare -a args
+  args=(
+    --resource-group "$networkName"
+    --tags testnet
+    --image UbuntuLTS
+    --size "$machineType"
+    --location "$zone"
+    --generate-ssh-keys
+  )
+
+  if [[ -n $optionalBootDiskSize ]]; then
+    args+=(
+      --os-disk-size-gb "$optionalBootDiskSize"
+    )
+  fi
+  if [[ -n $optionalStartupScript ]]; then
+    args+=(
+      --custom-data "$optionalStartupScript"
+    )
+  fi
+
+  if [[ -n $optionalBootDiskType ]]; then
+    echo Boot disk type not configurable
+  fi
+
+  if [[ -n $optionalAddress ]]; then
+    [[ $numNodes = 1 ]] || {
+      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
+      exit 1
+    }
+    args+=(
+      --public-ip-address "$optionalAddress"
+    )
+  fi
+
+  (
+    set -x
+    # 1: Check if resource group exists.  If not, create it.
+    numGroup=$(az group list --query "length([?name=='$networkName'])")
+    if [[ $numGroup -eq 0 ]]; then
+      echo Resource Group "$networkName" does not exist.  Creating it now.
+      az group create --name "$networkName" --location "$zone"
+    else
+      echo Resource group "$networkName" already exists.
+      az group show --name "$networkName"
+    fi
+
+    # 2: For node in numNodes, create VM and put the creation process in the background with --no-wait
+    for nodeName in "${nodes[@]}"; do
+      az vm create --name "$nodeName" "${args[@]}" --no-wait
+    done
+
+    # 3: Wait until all nodes are created
+    for nodeName in "${nodes[@]}"; do
+      az vm wait --created --name "$nodeName" --resource-group "$networkName"
+    done
+
+    # 4. If GPU is to be enabled, install the appropriate extension.
+    #    Compare against the literal "true" instead of executing $enableGpu as
+    #    a command: an empty value would expand to an empty command, which
+    #    succeeds and would install the extension unintentionally.
+    if [[ $enableGpu == true ]]; then
+      for nodeName in "${nodes[@]}"; do
+        az vm extension set \
+        --resource-group "$networkName" \
+        --vm-name "$nodeName" \
+        --name NvidiaGpuDriverLinux \
+        --publisher Microsoft.HpcCompute \
+        --version 1.2 \
+        --no-wait
+      done
+
+      # 5. Wait until all nodes have GPU extension installed
+      for nodeName in "${nodes[@]}"; do
+        az vm wait --updated --name "$nodeName" --resource-group "$networkName"
+      done
+    fi
+  )
+}
+
+#
+# cloud_DeleteInstances
+#
+# Deletes all the instances listed in the `instances` array
+#
+cloud_DeleteInstances() {
+  # Reads the global `instances` array ("name:publicIp:privateIp:location"
+  # entries) and deletes the corresponding VMs.
+  #
+  # Check both the element count and the first element, so that an empty
+  # array and an array holding one empty string are both treated as
+  # "nothing to do".
+  if [[ ${#instances[@]} -eq 0 || ${#instances[0]} -eq 0 ]]; then
+    echo No instances to delete
+    return
+  fi
+
+  # Strip everything from the first ':' onwards, leaving just the names
+  declare names=("${instances[@]/:*/}")
+  (
+    set -x
+    id_list=()
+
+    # Build the list of all resource IDs to delete
+    for instance in "${names[@]}"; do
+      # Sets the global `id`
+      cloud_GetIdFromInstanceName "$instance"
+      if [[ -z $id ]]; then
+        # Never hand an empty ID to |az vm delete --ids|
+        echo "Warning: no resource ID found for $instance, skipping" >&2
+        continue
+      fi
+      id_list+=("$id")
+    done
+
+    if [[ ${#id_list[@]} -eq 0 ]]; then
+      echo No instances to delete
+    else
+      # Delete all instances in the id_list and return once they are all deleted
+      az vm delete --ids "${id_list[@]}" --yes --verbose
+    fi
+  )
+}
+
+#
+# cloud_WaitForInstanceReady [instanceName] [instanceIp] [instanceZone] [timeout]
+#
+# Return once the newly created VM instance is responding.  This function is cloud-provider specific.
+#
+cloud_WaitForInstanceReady() {
+  # Only the instance name ($1) and the timeout ($4) are used here; the IP
+  # address ($2) and zone ($3) are accepted but ignored.
+  declare vmName="$1"
+  declare waitTimeout="$4"
+
+  # Sets the global `resourceGroup` for the VM
+  cloud_GetResourceGroupFromInstanceName "$vmName"
+  az vm wait \
+    --resource-group "$resourceGroup" \
+    --name "$vmName" \
+    --created \
+    --interval 10 \
+    --timeout "$waitTimeout"
+}
+
+#
+# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
+#
+# Fetch a file from the given instance.  This function uses a cloud-specific
+# mechanism to fetch the file
+#
+cloud_FetchFile() {
+  declare vmName="$1"
+  declare vmPublicIp="$2"
+  declare sourcePath="$3"
+  declare destinationPath="$4"
+
+  # The admin username of the VM is looked up first; the helper leaves it in
+  # the global `config_value`.
+  cloud_GetConfigValueFromInstanceName "$vmName" osProfile.adminUsername
+  declare adminUser="$config_value"
+
+  scp "${adminUser}@${vmPublicIp}:${sourcePath}" "$destinationPath"
+}

+ 13 - 0
net/scripts/ec2-provider.sh

@@ -340,6 +340,19 @@ cloud_DeleteInstances() {
   done
 }
 
+#
+# cloud_WaitForInstanceReady [instanceName] [instanceIp] [instanceZone] [timeout]
+#
+# Return once the newly created VM instance is responding.  This function is cloud-provider specific.
+#
+cloud_WaitForInstanceReady() {
+#  declare instanceName="$1"  # unused
+  declare instanceIp="$2"
+#  declare instanceZone="$3"  # unused
+  declare timeout="$4"
+
+  # Keep pinging until one round of 3 pings succeeds, giving up after
+  # |timeout| seconds.  The IP address is passed as a positional argument
+  # rather than spliced into the command string, so it is never re-parsed as
+  # shell code.
+  timeout "${timeout}s" bash -c \
+    'set -o pipefail; until ping -c 3 "$1" | tr - _; do echo .; done' \
+    _ "$instanceIp"
+}
 
 #
 # cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]

+ 13 - 0
net/scripts/gce-provider.sh

@@ -215,6 +215,19 @@ cloud_DeleteInstances() {
   done
 }
 
+#
+# cloud_WaitForInstanceReady [instanceName] [instanceIp] [instanceZone] [timeout]
+#
+# Return once the newly created VM instance is responding.  This function is cloud-provider specific.
+#
+cloud_WaitForInstanceReady() {
+#  declare instanceName="$1"  # unused
+  declare instanceIp="$2"
+#  declare instanceZone="$3"  # unused
+  declare timeout="$4"
+
+  # Keep pinging until one round of 3 pings succeeds, giving up after
+  # |timeout| seconds.  The IP address is passed as a positional argument
+  # rather than spliced into the command string, so it is never re-parsed as
+  # shell code.
+  timeout "${timeout}s" bash -c \
+    'set -o pipefail; until ping -c 3 "$1" | tr - _; do echo .; done' \
+    _ "$instanceIp"
+}
 
 #
 # cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]