瀏覽代碼

Add AWS EC2 support

Michael Vines 7 年之前
父節點
當前提交
f89f121d2b
共有 7 個文件被更改,包括 645 次插入272 次删除
  1. 39 5
      net/README.md
  2. 1 0
      net/ec2.sh
  3. 142 80
      net/gce.sh
  4. 20 0
      net/scripts/add-solana-user-authorized_keys.sh
  5. 242 0
      net/scripts/ec2-provider.sh
  6. 201 0
      net/scripts/gce-provider.sh
  7. 0 187
      net/scripts/gcloud.sh

+ 39 - 5
net/README.md

@@ -5,15 +5,30 @@ intended to be both dev and CD friendly.
 
 ### User Account Prerequisites
 
-Log in to GCP with:
+GCP and AWS are supported.
+
+#### GCP
+First authenticate with
 ```bash
 $ gcloud auth login
 ```
 
-Also ensure that `$(whoami)` is the name of an InfluxDB user account with enough
-access to create a new database.
+#### AWS
+Obtain your credentials from the AWS IAM Console and configure the AWS CLI with
+```bash
+$ aws configure
+```
+More information on AWS CLI configuration can be found [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html#cli-quick-configuration).
+
+### Metrics configuration
+Ensure that `$(whoami)` is the name of an InfluxDB user account with enough
+access to create a new InfluxDB database.  Ask mvines@ for help if needed.
 
 ## Quick Start
+
+NOTE: This example uses GCP.  If you are using AWS, replace `./gce.sh` with
+`./ec2.sh` in the commands.
+
 ```bash
 $ cd net/
 $ ./gce.sh create -n 5 -c 1  #<-- Create a GCE testnet with 5 validators, 1 client (billing starts here)
@@ -32,6 +47,10 @@ network over public IP addresses:
 ```bash
 $ ./gce.sh create -P ...
 ```
+or
+```bash
+$ ./ec2.sh create -P ...
+```
 
 ### Deploying a Snap-based network
 To deploy the latest pre-built `edge` channel Snap (ie, latest from the `master`
@@ -46,6 +65,10 @@ First ensure the network instances are created with GPU enabled:
 ```bash
 $ ./gce.sh create -g ...
 ```
+or
+```bash
+$ ./ec2.sh create -g ...
+```
 
 If deploying a Snap-based network nothing further is required, as GPU presence
 is detected at runtime and the CUDA build is auto selected.
@@ -58,9 +81,20 @@ $ ./net.sh start -f "cuda,erasure"
 
 ### How to interact with a CD testnet deployed by ci/testnet-deploy.sh
 
+**AWS-Specific Extra Setup**: Follow the steps in `scripts/add-solana-user-authorized_keys.sh`,
+then redeploy the testnet before continuing in this section.
+
 Taking **master-testnet-solana-com** as an example, configure your workspace for
 the testnet using:
-```
+```bash
 $ ./gce.sh config -p master-testnet-solana-com
-$ ./ssh.sh                                     # <-- Details on how to ssh into any testnet node
+```
+or
+```bash
+$ ./ec2.sh config -p master-testnet-solana-com
+```
+
+Then run the following for details on how to ssh into any testnet node:
+```bash
+$ ./ssh.sh
 ```

+ 1 - 0
net/ec2.sh

@@ -0,0 +1 @@
+gce.sh

+ 142 - 80
net/gce.sh

@@ -1,27 +1,44 @@
 #!/bin/bash -e
 
 here=$(dirname "$0")
-# shellcheck source=net/scripts/gcloud.sh
-source "$here"/scripts/gcloud.sh
 # shellcheck source=net/common.sh
 source "$here"/common.sh
 
+# Select the cloud provider from the name this script was invoked as, then
+# load that provider's implementation of the cloud_* functions along with its
+# default image and machine types.
+cloudProvider=$(basename "$0" .sh)
+case $cloudProvider in
+gce)
+  # shellcheck source=net/scripts/gce-provider.sh
+  source "$here"/scripts/gce-provider.sh
+
+  imageName="ubuntu-16-04-cuda-9-2-new"
+  leaderMachineType=n1-standard-16
+  validatorMachineType=n1-standard-4
+  clientMachineType=n1-standard-16
+  ;;
+ec2)
+  # shellcheck source=net/scripts/ec2-provider.sh
+  source "$here"/scripts/ec2-provider.sh
+
+  imageName="ami-04169656fea786776"
+  leaderMachineType=m4.4xlarge
+  validatorMachineType=m4.xlarge
+  clientMachineType=m4.4xlarge
+  ;;
+*)
+  # No provider script was sourced, so none of the cloud_* functions exist.
+  # Abort now rather than failing later with "command not found".
+  echo "Error: Unknown cloud provider: $cloudProvider"
+  exit 1
+  ;;
+esac
+
+
 prefix=testnet-dev-${USER//[^A-Za-z0-9]/}
 validatorNodeCount=5
 clientNodeCount=1
-leaderBootDiskSize=1TB
-leaderMachineType=n1-standard-16
-leaderAccelerator=
-validatorMachineType=n1-standard-4
-validatorBootDiskSize=$leaderBootDiskSize
-validatorAccelerator=
-clientMachineType=n1-standard-16
-clientBootDiskSize=40GB
-clientAccelerator=
-
-imageName="ubuntu-16-04-cuda-9-2-new"
+leaderBootDiskSizeInGb=1000
+validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
+clientBootDiskSizeInGb=40
+
 publicNetwork=false
-zone="us-west1-b"
+enableGpu=false
 leaderAddress=
 
 usage() {
@@ -33,7 +50,7 @@ usage() {
   cat <<EOF
 usage: $0 [create|config|delete] [common options] [command-specific options]
 
-Configure a GCE-based testnet
+Manage testnet instances
 
  create - create a new testnet (implies 'config')
  config - configure the testnet and write a config file describing it
@@ -47,10 +64,13 @@ Configure a GCE-based testnet
    -n [number]      - Number of validator nodes (default: $validatorNodeCount)
    -c [number]      - Number of client nodes (default: $clientNodeCount)
    -P               - Use public network IP addresses (default: $publicNetwork)
-   -z [zone]        - GCP Zone for the nodes (default: $zone)
-   -i [imageName]   - Existing image on GCE (default: $imageName)
-   -g               - Enable GPU
-   -a [address]     - Set the leader node's external IP address to this GCE address
+   -z [zone]        - Zone for the nodes (default: $zone)
+   -g               - Enable GPU (default: $enableGpu)
+   -a [address]     - Set the leader node's external IP address to this value.
+                      For GCE, [address] is the "name" of the desired External
+                      IP Address.
+                      For EC2, [address] is the "allocation ID" of the desired
+                      Elastic IP.
 
  config-specific options:
    none
@@ -68,7 +88,7 @@ command=$1
 shift
 [[ $command = create || $command = config || $command = delete ]] || usage "Invalid command: $command"
 
-while getopts "h?p:Pi:n:c:z:ga:" opt; do
+while getopts "h?p:Pn:c:z:ga:" opt; do
   case $opt in
   h | \?)
     usage
@@ -80,9 +100,6 @@ while getopts "h?p:Pi:n:c:z:ga:" opt; do
   P)
     publicNetwork=true
     ;;
-  i)
-    imageName=$OPTARG
-    ;;
   n)
     validatorNodeCount=$OPTARG
     ;;
@@ -90,10 +107,10 @@ while getopts "h?p:Pi:n:c:z:ga:" opt; do
     clientNodeCount=$OPTARG
     ;;
   z)
-    zone=$OPTARG
+    cloud_SetZone "$OPTARG"
     ;;
   g)
-    leaderAccelerator="count=4,type=nvidia-tesla-k80"
+    enableGpu=true
     ;;
   a)
     leaderAddress=$OPTARG
@@ -108,6 +125,37 @@ shift $((OPTIND - 1))
 [[ -z $1 ]] || usage "Unexpected argument: $1"
 sshPrivateKey="$netConfigDir/id_$prefix"
 
+
+# cloud_ForEachInstance [cmd] [extra args to cmd]
+#
+# Execute a command for each element in the `instances` array
+#
+#   cmd   - The command to execute on each instance
+#           The command will receive the following arguments, followed by
+#           any additional arguments supplied to cloud_ForEachInstance:
+#               name      - name of the instance
+#               publicIp  - The public IP address of this instance
+#               privateIp - The private IP address of this instance
+#               count     - Monotonically increasing count for each
+#                           invocation of cmd, starting at 1
+#               ...       - Extra args to cmd..
+#
+# Each `instances` entry is split on ":" into name, publicIp and privateIp.
+#
+cloud_ForEachInstance() {
+  declare cmd="$1"
+  shift
+  [[ -n $cmd ]] || { echo cloud_ForEachInstance: cmd not specified; exit 1; }
+
+  declare count=1
+  declare info name publicIp privateIp
+  for info in "${instances[@]}"; do
+    IFS=: read -r name publicIp privateIp <<<"$info"
+
+    # The arguments stay inside single quotes so that eval expands them
+    # exactly once, each as its own word.  Expanding them before eval would
+    # re-split the values and silently drop empty fields (eg, an instance
+    # with no public IP), shifting every later argument.
+    eval "$cmd"' "$name" "$publicIp" "$privateIp" "$count" "$@"'
+    count=$((count + 1))
+  done
+}
+
 prepareInstancesAndWriteConfigFile() {
   $metricsWriteDatapoint "testnet-deploy net-config-begin=1"
 
@@ -122,10 +170,10 @@ EOF
 
   recordInstanceIp() {
     declare name="$1"
-    declare publicIp="$3"
-    declare privateIp="$4"
+    declare publicIp="$2"
+    declare privateIp="$3"
 
-    declare arrayName="$6"
+    declare arrayName="$5"
 
     echo "$arrayName+=($publicIp)  # $name" >> "$configFile"
     if [[ $arrayName = "leaderIp" ]]; then
@@ -139,121 +187,133 @@ EOF
 
   waitForStartupComplete() {
     declare name="$1"
-    declare publicIp="$3"
+    declare publicIp="$2"
 
     echo "Waiting for $name to finish booting..."
     (
       for i in $(seq 1 30); do
-        if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.gce-startup-complete"); then
+        if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.instance-startup-complete"); then
           break
         fi
         sleep 2
         echo "Retry $i..."
       done
     )
+    echo "$name has booted."
   }
 
   echo "Looking for leader instance..."
-  gcloud_FindInstances "name=$prefix-leader" show
+  cloud_FindInstance "$prefix-leader"
   [[ ${#instances[@]} -eq 1 ]] || {
     echo "Unable to find leader"
     exit 1
   }
 
-  echo "Fetching $sshPrivateKey from $leaderName"
   (
-    rm -rf "$sshPrivateKey"{,pub}
-
     declare leaderName
-    declare leaderZone
     declare leaderIp
-    IFS=: read -r leaderName leaderZone leaderIp _ < <(echo "${instances[0]}")
+    IFS=: read -r leaderName leaderIp _ < <(echo "${instances[0]}")
 
-    set -x
+    # Try to ping the machine first.
+    timeout 60s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
 
-    # Try to ping the machine first.  There can be a delay between when the
-    # instance is reported as RUNNING and when it's reachable over the network
-    timeout 30s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
+    if [[ ! -r $sshPrivateKey ]]; then
+      echo "Fetching $sshPrivateKey from $leaderName"
 
-    # Try to scp in a couple times, sshd may not yet be up even though the
-    # machine can be pinged...
-    set -o pipefail
-    for i in $(seq 1 10); do
-      if gcloud compute scp --zone "$leaderZone" \
-          "$leaderName:/solana-id_ecdsa" "$sshPrivateKey"; then
-        break
-      fi
-      sleep 1
-      echo "Retry $i..."
-    done
+      # Try to scp in a couple times, sshd may not yet be up even though the
+      # machine can be pinged...
+      set -x -o pipefail
+      for i in $(seq 1 30); do
+        if cloud_FetchFile "$leaderName" "$leaderIp" /solana-id_ecdsa "$sshPrivateKey"; then
+          break
+        fi
+
+        sleep 1
+        echo "Retry $i..."
+      done
 
-    chmod 400 "$sshPrivateKey"
+      chmod 400 "$sshPrivateKey"
+      ls -l "$sshPrivateKey"
+    fi
   )
 
   echo "leaderIp=()" >> "$configFile"
-  gcloud_ForEachInstance recordInstanceIp leaderIp
-  gcloud_ForEachInstance waitForStartupComplete
+  cloud_ForEachInstance recordInstanceIp leaderIp
+  cloud_ForEachInstance waitForStartupComplete
 
   echo "Looking for validator instances..."
-  gcloud_FindInstances "name~^$prefix-validator" show
+  cloud_FindInstances "$prefix-validator"
   [[ ${#instances[@]} -gt 0 ]] || {
     echo "Unable to find validators"
     exit 1
   }
   echo "validatorIpList=()" >> "$configFile"
-  gcloud_ForEachInstance recordInstanceIp validatorIpList
-  gcloud_ForEachInstance waitForStartupComplete
+  cloud_ForEachInstance recordInstanceIp validatorIpList
+  cloud_ForEachInstance waitForStartupComplete
 
   echo "clientIpList=()" >> "$configFile"
   echo "Looking for client instances..."
-  gcloud_FindInstances "name~^$prefix-client" show
+  cloud_FindInstances "$prefix-client"
   [[ ${#instances[@]} -eq 0 ]] || {
-    gcloud_ForEachInstance recordInstanceIp clientIpList
-    gcloud_ForEachInstance waitForStartupComplete
+    cloud_ForEachInstance recordInstanceIp clientIpList
+    cloud_ForEachInstance waitForStartupComplete
   }
 
   echo "Wrote $configFile"
   $metricsWriteDatapoint "testnet-deploy net-config-complete=1"
 }
 
-case $command in
-delete)
+delete() {
   $metricsWriteDatapoint "testnet-deploy net-delete-begin=1"
 
   # Delete the leader node first to prevent unusual metrics on the dashboard
   # during shutdown.
   # TODO: It would be better to fully cut-off metrics reporting before any
   # instances are deleted.
-  for filter in "^$prefix-leader" "^$prefix-"; do
-    gcloud_FindInstances "name~$filter"
+  for filter in "$prefix-leader" "$prefix-"; do
+    echo "Searching for instances: $filter"
+    cloud_FindInstances "$filter"
 
     if [[ ${#instances[@]} -eq 0 ]]; then
       echo "No instances found matching '$filter'"
     else
-      gcloud_DeleteInstances true
+      cloud_DeleteInstances true
     fi
   done
   rm -f "$configFile"
 
   $metricsWriteDatapoint "testnet-deploy net-delete-complete=1"
+
+}
+
+case $command in
+delete)
+  delete
   ;;
 
 create)
   [[ -n $validatorNodeCount ]] || usage "Need number of nodes"
+  if [[ $validatorNodeCount -le 0 ]]; then
+    usage "One or more validator nodes is required"
+  fi
+
+  delete
 
   $metricsWriteDatapoint "testnet-deploy net-create-begin=1"
 
   rm -rf "$sshPrivateKey"{,.pub}
-  ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
+
+  # Note: using rsa because |aws ec2 import-key-pair| seems to fail for ecdsa
+  ssh-keygen -t rsa -N '' -f "$sshPrivateKey"
 
   printNetworkInfo() {
     cat <<EOF
 ========================================================================================
 
 Network composition:
-  Leader = $leaderMachineType (GPU=${leaderAccelerator:-none})
-  Validators = $validatorNodeCount x $validatorMachineType (GPU=${validatorAccelerator:-none})
-  Client(s) = $clientNodeCount x $clientMachineType (GPU=${clientAccelerator:-none})
+  Leader = $leaderMachineType (GPU=$enableGpu)
+  Validators = $validatorNodeCount x $validatorMachineType
+  Client(s) = $clientNodeCount x $clientMachineType
 
 ========================================================================================
 
@@ -261,7 +321,7 @@ EOF
   }
   printNetworkInfo
 
-  declare startupScript="$netConfigDir"/gce-startup-script.sh
+  declare startupScript="$netConfigDir"/instance-startup-script.sh
   cat > "$startupScript" <<EOF
 #!/bin/bash -ex
 # autogenerated at $(date)
@@ -270,11 +330,12 @@ cat > /etc/motd <<EOM
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
   This instance has not been fully configured.
-  See "startup-script" log messages in /var/log/syslog for status:
-    $ sudo cat /var/log/syslog | grep startup-script
+
+  See startup script log messages in /var/log/syslog for status:
+    $ sudo cat /var/log/syslog | egrep \\(startup-script\\|cloud-init\)
 
   To block until setup is complete, run:
-    $ until [[ -f /.gce-startup-complete ]]; do sleep 1; done
+    $ until [[ -f /.instance-startup-complete ]]; do sleep 1; done
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 EOM
@@ -296,6 +357,7 @@ $(
   cat \
     disable-background-upgrades.sh \
     create-solana-user.sh \
+    add-solana-user-authorized_keys.sh \
     install-earlyoom.sh \
     install-libssl-compatability.sh \
     install-rsync.sh \
@@ -305,21 +367,21 @@ cat > /etc/motd <<EOM
 $(printNetworkInfo)
 EOM
 
-touch /.gce-startup-complete
+touch /.instance-startup-complete
 
 EOF
 
-  gcloud_CreateInstances "$prefix-leader" 1 "$zone" \
-    "$imageName" "$leaderMachineType" "$leaderBootDiskSize" "$leaderAccelerator" \
+  cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
+    "$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" "$enableGpu" \
     "$startupScript" "$leaderAddress"
 
-  gcloud_CreateInstances "$prefix-validator" "$validatorNodeCount" "$zone" \
-    "$imageName" "$validatorMachineType" "$validatorBootDiskSize" "$validatorAccelerator" \
+  cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
+    "$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" false \
     "$startupScript" ""
 
   if [[ $clientNodeCount -gt 0 ]]; then
-    gcloud_CreateInstances "$prefix-client" "$clientNodeCount" "$zone" \
-      "$imageName" "$clientMachineType" "$clientBootDiskSize" "$clientAccelerator" \
+    cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
+      "$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" false \
       "$startupScript" ""
   fi
 

+ 20 - 0
net/scripts/add-solana-user-authorized_keys.sh

@@ -0,0 +1,20 @@
+#!/bin/bash -ex
+#
+# Grants a fixed set of public keys ssh access to the `solana` user by
+# appending them to /home/solana/.ssh/authorized_keys.
+#
+
+# Only run on Linux, as root, and only once the solana user's .ssh directory
+# already exists.
+[[ $(uname) = Linux ]] || exit 1
+[[ $USER = root ]] || exit 1
+
+[[ -d /home/solana/.ssh ]] || exit 1
+
+# /solana-authorized_keys contains the public keys for users that should
+# automatically be granted access to ALL testnets.
+#
+# To add an entry into this list:
+# 1. Run: ssh-keygen -t ecdsa -N '' -f ~/.ssh/id-solana-testnet
+# 2. Inline ~/.ssh/id-solana-testnet.pub below
+cat > /solana-authorized_keys <<EOF
+ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFBNwLw0i+rI312gWshojFlNw9NV7WfaKeeUsYADqOvM2o4yrO2pPw+sgW8W+/rPpVyH7zU9WVRgTME8NgFV1Vc=
+EOF
+
+# Append as the solana user so that authorized_keys is written by its owner.
+# NOTE(review): the keys are appended unconditionally, so running this script
+# twice on the same machine duplicates the entries.
+sudo -u solana bash -c "
+  cat /solana-authorized_keys >> /home/solana/.ssh/authorized_keys
+"

+ 242 - 0
net/scripts/ec2-provider.sh

@@ -0,0 +1,242 @@
+# |source| this file
+#
+# Utilities for working with EC2 instances
+#
+
+# Availability zone and region used by the cloud_* functions in this file.
+# Change them through cloud_SetZone so that the two stay in sync.
+zone=
+region=
+
+#
+# cloud_SetZone [zone]
+#
+# Sets the global `zone` and derives the global `region` from it.
+#
+# zone - availability zone name, eg "us-east-1b"
+#
+cloud_SetZone() {
+  zone="$1"
+  # AWS region is zone with the last character removed.  `%?` leaves an empty
+  # zone empty instead of raising a "substring expression < 0" error.
+  region="${zone%?}"
+}
+
+# Set the default zone
+cloud_SetZone "us-east-1b"
+
+#
+# __cloud_SshPrivateKeyCheck
+#
+# Verifies that the `sshPrivateKey` global is set and names a readable file,
+# and exits with status 1 otherwise.
+#
+# TODO: Remove usage of the sshPrivateKey global
+__cloud_SshPrivateKeyCheck() {
+  # shellcheck disable=SC2154
+  [[ -n $sshPrivateKey ]] || {
+    echo "Error: sshPrivateKey not defined"
+    exit 1
+  }
+  [[ -r $sshPrivateKey ]] || {
+    echo "Error: file is not readable: $sshPrivateKey"
+    exit 1
+  }
+}
+
+#
+# __cloud_FindInstances [filter]
+#
+# Populates the global `instances` array with every pending or running
+# instance whose "name" tag matches the filter.
+#
+# Each entry has the form:
+#   "name:public IP:private IP"
+# where the name field holds the EC2 instance ID.  A summary line is also
+# printed to stdout for each instance found.
+#
+# filter   - value matched against the "name" tag of the instances
+#
+# examples:
+#   $ __cloud_FindInstances "exact-machine-name"
+#   $ __cloud_FindInstances "all-machines-with-a-common-machine-prefix*"
+#
+__cloud_FindInstances() {
+  declare filter="$1"
+
+  declare -a describeArgs
+  describeArgs=(
+    --region "$region"
+    --filters
+      "Name=tag:name,Values=$filter"
+      "Name=instance-state-name,Values=pending,running"
+    --query "Reservations[].Instances[].[InstanceId,PublicIpAddress,PrivateIpAddress]"
+    --output text
+  )
+
+  instances=()
+  declare instanceId publicIp privateIp
+  while read -r instanceId publicIp privateIp; do
+    printf "%-30s | publicIp=%-16s privateIp=%s\n" "$instanceId" "$publicIp" "$privateIp"
+    instances+=("$instanceId:$publicIp:$privateIp")
+  done < <(aws ec2 describe-instances "${describeArgs[@]}")
+}
+
+#
+# cloud_FindInstances [namePrefix]
+#
+# Locates every instance whose name begins with namePrefix and records each
+# one in the `instances` array as:
+#   "name:public IP:private IP"
+#
+# namePrefix - leading portion of the instance names to search for
+#
+# examples:
+#   $ cloud_FindInstances all-machines-with-a-common-machine-prefix
+#
+cloud_FindInstances() {
+  # A trailing `*` is appended so the filter matches on the prefix
+  __cloud_FindInstances "${1}*"
+}
+
+#
+# cloud_FindInstance [name]
+#
+# Locates the instance whose name is exactly `name` and records it in the
+# `instances` array as:
+#   "name:public IP:private IP"
+#
+# name - The instance name to look for
+#
+# examples:
+#   $ cloud_FindInstance exact-machine-name
+#
+cloud_FindInstance() {
+  __cloud_FindInstances "$1"
+}
+
+
+#
+# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
+#                       [machineType] [bootDiskSize] [enableGpu]
+#                       [startupScript] [address]
+#
+# Creates one or more identical instances.
+#
+# networkName   - unique name of this testnet, also used as the name of the
+#                 EC2 key pair imported from "${sshPrivateKey}.pub"
+# namePrefix    - value of the "name" tag applied to every instance created
+# numNodes      - number of instances to create
+# imageName     - AMI ID for the instances
+# machineType   - EC2 instance type
+# bootDiskSize  - Optional size of the boot disk in GB
+# enableGpu     - Optionally enable GPU, use the value "true" to enable.
+#                 Not implemented yet for EC2: "true" exits with an error
+# startupScript - Optional startup script to execute when the instance boots
+# address       - Optional allocation ID of the Elastic IP to attach to the
+#                 instance.  Requires that |numNodes| = 1
+#
+# Tip: use cloud_FindInstances to locate the instances once this function
+#      returns
+cloud_CreateInstances() {
+  declare networkName="$1"
+  declare namePrefix="$2"
+  declare numNodes="$3"
+  declare imageName="$4"
+  declare machineType="$5"
+  declare optionalBootDiskSize="$6"
+  declare optionalGpu="$7"
+  declare optionalStartupScript="$8"
+  declare optionalAddress="$9"
+
+  __cloud_SshPrivateKeyCheck
+  (
+    set -x
+    # Delete any existing key pair of the same name before importing
+    aws ec2 delete-key-pair --region "$region" --key-name "$networkName"
+    aws ec2 import-key-pair --region "$region" --key-name "$networkName" \
+      --public-key-material file://"${sshPrivateKey}".pub
+  )
+
+  declare -a args
+  args=(
+    --key-name "$networkName"
+    --count "$numNodes"
+    --region "$region"
+    --placement "AvailabilityZone=$zone"
+    --security-groups testnet
+    --image-id "$imageName"
+    --instance-type "$machineType"
+    --tag-specifications "ResourceType=instance,Tags=[{Key=name,Value=$namePrefix}]"
+  )
+  if [[ -n $optionalBootDiskSize ]]; then
+    args+=(
+      --block-device-mappings "[{\"DeviceName\": \"/dev/sda1\", \"Ebs\": { \"VolumeSize\": $optionalBootDiskSize }}]"
+    )
+  fi
+  if [[ $optionalGpu = true ]]; then
+    echo TODO: GPU support not implemented yet
+    exit 1
+  fi
+  if [[ -n $optionalStartupScript ]]; then
+    args+=(
+      --user-data "file://$optionalStartupScript"
+    )
+  fi
+
+  if [[ -n $optionalAddress ]]; then
+    [[ $numNodes = 1 ]] || {
+      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
+      exit 1
+    }
+  fi
+
+  (
+    set -x
+    aws ec2 run-instances "${args[@]}"
+  )
+
+  if [[ -n $optionalAddress ]]; then
+    cloud_FindInstance "$namePrefix"
+    if [[ ${#instances[@]} -ne 1 ]]; then
+      # Without exactly one match there is no instance ID to attach the
+      # address to, so stop here instead of calling aws with an empty ID
+      echo "Failed to find newly created instance: $namePrefix"
+      exit 1
+    fi
+
+    declare instanceId
+    IFS=: read -r instanceId _ < <(echo "${instances[0]}")
+    aws ec2 associate-address \
+      --instance-id "$instanceId" \
+      --region "$region" \
+      --allocation-id "$optionalAddress"
+  fi
+}
+
+#
+# cloud_DeleteInstances
+#
+# Deletes all the instances listed in the `instances` array.  Any arguments
+# are ignored.
+#
+cloud_DeleteInstances() {
+  # Test the number of entries, not the string length of the first entry
+  if [[ ${#instances[@]} -eq 0 ]]; then
+    echo No instances to delete
+    return
+  fi
+  # Keep only the text before the first ":" of each entry, which is the
+  # identifier passed to --instance-ids
+  declare names=("${instances[@]/:*/}")
+  (
+    set -x
+    aws ec2 terminate-instances --region "$region" --instance-ids "${names[@]}"
+  )
+}
+
+
+#
+# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
+#
+# Copies remoteFile from the instance at publicIp to localFile using scp as
+# the `solana` user, authenticating with the `sshPrivateKey` global.
+#
+cloud_FetchFile() {
+  # shellcheck disable=SC2034 # instanceName is unused
+  declare instanceName="$1"
+  declare publicIp="$2"
+  declare remoteFile="$3"
+  declare localFile="$4"
+
+  __cloud_SshPrivateKeyCheck
+
+  declare -a scpOptions
+  scpOptions=(
+    -o "StrictHostKeyChecking=no"
+    -o "UserKnownHostsFile=/dev/null"
+    -o "User=solana"
+    -o "IdentityFile=$sshPrivateKey"
+    -o "LogLevel=ERROR"
+    -F /dev/null
+  )
+  (
+    set -x
+    scp "${scpOptions[@]}" "solana@$publicIp:$remoteFile" "$localFile"
+  )
+}

+ 201 - 0
net/scripts/gce-provider.sh

@@ -0,0 +1,201 @@
+# |source| this file
+#
+# Utilities for working with GCE instances
+#
+
+# Zone used by the cloud_* functions in this file, initialized to the default
+zone=us-west1-b
+
+#
+# cloud_SetZone [zone]
+#
+# Replaces the global `zone` with the given value.
+#
+cloud_SetZone() {
+  zone=$1
+}
+
+
+#
+# __cloud_FindInstances [filter]
+#
+# Find instances matching the specified pattern.
+#
+# For each matching instance that is RUNNING, an entry in the `instances`
+# array will be added with the following information about the instance:
+#   "name:public IP:private IP"
+# Instances in any other state are reported with a warning and skipped.
+#
+# filter   - The instances to filter on
+#
+# examples:
+#   $ __cloud_FindInstances "name=exact-machine-name"
+#   $ __cloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
+#
+__cloud_FindInstances() {
+  declare filter="$1"
+  instances=()
+
+  declare name publicIp privateIp status
+  while read -r name publicIp privateIp status; do
+    if [[ $status != RUNNING ]]; then
+      echo "Warning: $name is not RUNNING, ignoring it."
+      continue
+    fi
+    printf "%-30s | publicIp=%-16s privateIp=%s\n" "$name" "$publicIp" "$privateIp"
+
+    instances+=("$name:$publicIp:$privateIp")
+  done < <(gcloud compute instances list \
+             --filter="$filter" \
+             --format 'value(name,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
+}
+#
+# cloud_FindInstances [namePrefix]
+#
+# Locates every instance whose name begins with namePrefix and records each
+# one in the `instances` array as:
+#   "name:public IP:private IP"
+#
+# namePrefix - leading portion of the instance names to search for
+#
+# examples:
+#   $ cloud_FindInstances all-machines-with-a-common-machine-prefix
+#
+cloud_FindInstances() {
+  # `name~^...` is a regular expression match anchored at the start of the name
+  __cloud_FindInstances "name~^${1}"
+}
+
+#
+# cloud_FindInstance [name]
+#
+# Locates the instance whose name is exactly `name` and records it in the
+# `instances` array as:
+#   "name:public IP:private IP"
+#
+# name - The instance name to look for
+#
+# examples:
+#   $ cloud_FindInstance exact-machine-name
+#
+cloud_FindInstance() {
+  __cloud_FindInstances "name=${1}"
+}
+
+#
+# cloud_CreateInstances [networkName] [namePrefix] [numNodes] [imageName]
+#                       [machineType] [bootDiskSize] [enableGpu]
+#                       [startupScript] [address]
+#
+# Creates one or more identical instances.
+#
+# networkName   - unique name of this testnet, recorded in the instance
+#                 metadata as testnet=[networkName]
+# namePrefix    - unique string to prefix all the instance names with
+# numNodes      - number of instances to create
+# imageName     - Disk image for the instances
+# machineType   - GCE machine type
+# bootDiskSize  - Optional size of the boot disk in GB
+# enableGpu     - Optionally enable GPU, use the value "true" to enable.
+#                 Enabling requests "count=4,type=nvidia-tesla-k80"
+# startupScript - Optional startup script to execute when the instance boots
+# address       - Optional name of the GCE static IP address to attach to the
+#                 instance.  Requires that |numNodes| = 1 and that addressName
+#                 has been provisioned in the GCE region that is hosting `$zone`
+#
+# Tip: use cloud_FindInstances to locate the instances once this function
+#      returns
+cloud_CreateInstances() {
+  declare networkName="$1"
+  declare namePrefix="$2"
+  declare numNodes="$3"
+  declare imageName="$4"
+  declare machineType="$5"
+  declare optionalBootDiskSize="$6"
+  declare optionalGpu="$7"
+  declare optionalStartupScript="$8"
+  declare optionalAddress="$9"
+
+  declare -a nodes
+  if [[ $numNodes = 1 ]]; then
+    nodes=("$namePrefix")
+  else
+    # Read the generated names one line at a time.  A single
+    # `read <<<$(seq ...)` only sees the first name on Bash 4.4 and later,
+    # where here-strings are no longer word split onto one line.
+    nodes=()
+    declare node
+    while read -r node; do
+      nodes+=("$node")
+    done < <(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
+  fi
+
+  declare -a args
+  args=(
+    "--zone=$zone"
+    "--tags=testnet"
+    "--metadata=testnet=$networkName"
+    "--image=$imageName"
+    "--machine-type=$machineType"
+  )
+  if [[ -n $optionalBootDiskSize ]]; then
+    args+=(
+      "--boot-disk-size=${optionalBootDiskSize}GB"
+    )
+  fi
+  if [[ $optionalGpu = true ]]; then
+    args+=(
+      "--accelerator=count=4,type=nvidia-tesla-k80"
+      --maintenance-policy TERMINATE
+      --restart-on-failure
+    )
+  fi
+  if [[ -n $optionalStartupScript ]]; then
+    args+=(
+      --metadata-from-file "startup-script=$optionalStartupScript"
+    )
+  fi
+
+  if [[ -n $optionalAddress ]]; then
+    [[ $numNodes = 1 ]] || {
+      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
+      exit 1
+    }
+    args+=(
+      "--address=$optionalAddress"
+    )
+  fi
+
+  (
+    set -x
+    gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
+  )
+}
+
+#
+# cloud_DeleteInstances
+#
+# Deletes all the instances listed in the `instances` array, without
+# prompting for confirmation.  Any arguments are ignored.
+#
+cloud_DeleteInstances() {
+  # Test the number of entries, not the string length of the first entry
+  if [[ ${#instances[@]} -eq 0 ]]; then
+    echo No instances to delete
+    return
+  fi
+  # Keep only the text before the first ":" of each entry, ie the name
+  declare names=("${instances[@]/:*/}")
+
+  (
+    set -x
+    gcloud beta compute instances delete --zone "$zone" --quiet "${names[@]}"
+  )
+}
+
+
+#
+# cloud_FetchFile [instanceName] [publicIp] [remoteFile] [localFile]
+#
+# Copies remoteFile from the named instance in `$zone` to localFile using
+# `gcloud compute scp`.
+#
+cloud_FetchFile() {
+  declare instanceName="$1"
+  # shellcheck disable=SC2034 # publicIp is unused
+  declare publicIp="$2"
+  declare remoteFile="$3"
+  declare localFile="$4"
+
+  declare remotePath="$instanceName:$remoteFile"
+  (
+    set -x
+    gcloud compute scp --zone "$zone" "$remotePath" "$localFile"
+  )
+}

+ 0 - 187
net/scripts/gcloud.sh

@@ -1,187 +0,0 @@
-# |source| this file
-#
-# Utilities for working with gcloud
-#
-
-
-#
-# gcloud_FindInstances [filter] [options]
-#
-# Find instances matching the specified pattern.
-#
-# For each matching instance, an entry in the `instances` array will be added with the
-# following information about the instance:
-#   "name:zone:public IP:private IP"
-#
-# filter   - The instances to filter on
-# options  - If set to the string "show", the list of instances will be echoed
-#            to stdout
-#
-# examples:
-#   $ gcloud_FindInstances "name=exact-machine-name"
-#   $ gcloud_FindInstances "name~^all-machines-with-a-common-machine-prefix"
-#
-gcloud_FindInstances() {
-  declare filter="$1"
-  declare options="$2"
-  instances=()
-
-  declare name zone publicIp privateIp status
-  while read -r name zone publicIp privateIp status; do
-    if [[ $status != RUNNING ]]; then
-      echo "Warning: $name is not RUNNING, ignoring it."
-      continue
-    fi
-    if [[ $options = show ]]; then
-      printf "%-30s | %-16s publicIp=%-16s privateIp=%s\n" "$name" "$zone" "$publicIp" "$privateIp"
-    fi
-
-    instances+=("$name:$zone:$publicIp:$privateIp")
-  done < <(gcloud compute instances list \
-             --filter="$filter" \
-             --format 'value(name,zone,networkInterfaces[0].accessConfigs[0].natIP,networkInterfaces[0].networkIP,status)')
-}
-
-#
-# gcloud_ForEachInstance [cmd] [extra args to cmd]
-#
-# Execute a command for each element in the `instances` array
-#
-#   cmd   - The command to execute on each instance
-#           The command will receive arguments followed by any
-#           additionl arguments supplied to gcloud_ForEachInstance:
-#               name     - name of the instance
-#               zone     - zone the instance is located in
-#               publicIp - The public IP address of this instance
-#               privateIp - The priate IP address of this instance
-#               count    - Monotonically increasing count for each
-#                          invocation of cmd, starting at 1
-#               ...      - Extra args to cmd..
-#
-#
-gcloud_ForEachInstance() {
-  declare cmd="$1"
-  shift
-  [[ -n $cmd ]] || { echo gcloud_ForEachInstance: cmd not specified; exit 1; }
-
-  declare count=1
-  for info in "${instances[@]}"; do
-    declare name zone publicIp privateIp
-    IFS=: read -r name zone publicIp privateIp < <(echo "$info")
-
-    eval "$cmd" "$name" "$zone" "$publicIp" "$privateIp" "$count" "$@"
-    count=$((count + 1))
-  done
-}
-
-#
-# gcloud_CreateInstances [namePrefix] [numNodes] [zone] [imageName]
-#                        [machineType] [bootDiskSize] [accelerator]
-#                        [startupScript] [address]
-#
-# Creates one more identical instances.
-#
-# namePrefix    - unique string to prefix all the instance names with
-# numNodes      - number of instances to create
-# zone          - zone to create the instances in
-# imageName     - Disk image for the instances
-# machineType   - GCE machine type
-# bootDiskSize  - Optional disk of the boot disk
-# accelerator   - Optional accelerator to attach to the instance(s), see
-#                 eg, request 4 K80 GPUs with "count=4,type=nvidia-tesla-k80"
-# startupScript - Optional startup script to execute when the instance boots
-# address       - Optional name of the GCE static IP address to attach to the
-#                 instance.  Requires that |numNodes| = 1 and that addressName
-#                 has been provisioned in the GCE region that is hosting |zone|
-#
-# Tip: use gcloud_FindInstances to locate the instances once this function
-#      returns
-gcloud_CreateInstances() {
-  declare namePrefix="$1"
-  declare numNodes="$2"
-  declare zone="$3"
-  declare imageName="$4"
-  declare machineType="$5"
-  declare optionalBootDiskSize="$6"
-  declare optionalAccelerator="$7"
-  declare optionalStartupScript="$8"
-  declare optionalAddress="$9"
-
-  declare nodes
-  if [[ $numNodes = 1 ]]; then
-    nodes=("$namePrefix")
-  else
-    read -ra nodes <<<$(seq -f "${namePrefix}%0${#numNodes}g" 1 "$numNodes")
-  fi
-
-  declare -a args
-  args=(
-    "--zone=$zone"
-    "--tags=testnet"
-    "--image=$imageName"
-    "--machine-type=$machineType"
-  )
-  if [[ -n $optionalBootDiskSize ]]; then
-    args+=(
-      "--boot-disk-size=$optionalBootDiskSize"
-    )
-  fi
-  if [[ -n $optionalAccelerator ]]; then
-    args+=(
-      "--accelerator=$optionalAccelerator"
-      --maintenance-policy TERMINATE
-      --restart-on-failure
-    )
-  fi
-  if [[ -n $optionalStartupScript ]]; then
-    args+=(
-      --metadata-from-file "startup-script=$optionalStartupScript"
-    )
-  fi
-
-  if [[ -n $optionalAddress ]]; then
-    [[ $numNodes = 1 ]] || {
-      echo "Error: address may not be supplied when provisioning multiple nodes: $optionalAddress"
-      exit 1
-    }
-    args+=(
-      "--address=$optionalAddress"
-    )
-  fi
-
-  (
-    set -x
-    gcloud beta compute instances create "${nodes[@]}" "${args[@]}"
-  )
-}
-
-#
-# gcloud_DeleteInstances [yes]
-#
-# Deletes all the instances listed in the `instances` array
-#
-# If yes = "true", skip the delete confirmation
-#
-gcloud_DeleteInstances() {
-  declare maybeQuiet=
-  if [[ $1 = true ]]; then
-    maybeQuiet=--quiet
-  fi
-
-  if [[ ${#instances[0]} -eq 0 ]]; then
-    echo No instances to delete
-    return
-  fi
-  declare names=("${instances[@]/:*/}")
-
-  # Assume all instances are in the same zone
-  # TODO: One day this assumption will be invalid
-  declare zone
-  IFS=: read -r _ zone _ < <(echo "${instances[0]}")
-
-  (
-    set -x
-    gcloud beta compute instances delete --zone "$zone" $maybeQuiet "${names[@]}"
-  )
-}
-