
@moki9
Forked from piyueh/GCP_Slurm_Terraform.md
Created October 30, 2022 02:33

Revisions

  1. @piyueh revised this gist Mar 3, 2021. 1 changed file with 9 additions and 3 deletions.
    12 changes: 9 additions & 3 deletions GCP_Slurm_Terraform.md
    @@ -79,6 +79,12 @@ README
    ### Note
    -The creation of the resources may fail at step 8 because of the quotas of the
    -resources. GCP sets very low quotas for C2-type instances and V100 GPUs for new
    -projects. You may need to request a higher quota from GCP.
    +* The creation of the resources may fail at step 8 because of the quotas of the
    +resources. GCP sets very low quotas for C2-type instances and V100 GPUs for new
    +projects. You may need to request a higher quota from GCP.
    +* The nodes in `debug-cpu` were automatically terminated without problems when no
    +jobs were running, as described previously. However, those in `debug-gpu` were
    +not; I have not figured out what went wrong, so keep an eye on the bill for
    +those GPU nodes.
    +* It seems the NVIDIA driver was not automatically installed, though I didn't spend
    +much time investigating this issue (a quick check is sketched below).
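    A quick way to check whether the driver is present is to run `nvidia-smi` on a
    `debug-gpu` node, for example via `srun` from the login node; this is only a
    sketch and assumes the partition can start a node at all.
    ```
    # If the NVIDIA driver is missing, nvidia-smi is absent or reports a driver error.
    $ srun --partition=debug-gpu nvidia-smi
    ```
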
  2. @piyueh renamed this gist Mar 3, 2021. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. @piyueh created this gist Mar 3, 2021.
    84 changes: 84 additions & 0 deletions README.md
    @@ -0,0 +1,84 @@
    README
    ======

    ### Steps

    1. Install [Google Cloud SDK](https://cloud.google.com/sdk): to manipulate cloud
    resources

    2. Install [Terraform](https://www.terraform.io/): to create/destroy clusters
    from pre-defined specs

    3. Create/prepare a project on [Google Cloud Platform (GCP)](https://cloud.google.com/)

    4. [Enable Compute Engine API](https://cloud.google.com/apis/docs/getting-started)

    5. [Create a service account](https://cloud.google.com/iam/docs/creating-managing-service-accounts)
    with a role of [project editor](https://cloud.google.com/iam/docs/understanding-roles#basic)

    6. [Create/download a JSON key file](https://cloud.google.com/iam/docs/creating-managing-service-account-keys#creating_service_account_keys)
    for the service account. Note that this file cannot be re-downloaded, so keep
    it safe; if it is lost, create a new key.
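    Steps 3-6 can also be done with `gcloud` instead of the web console. The sketch
    below uses a hypothetical project ID `my-slurm-project` and service-account name
    `slurm-tf`; adjust them to your own setup (project creation may additionally
    require linking a billing account).
    ```
    $ gcloud projects create my-slurm-project
    $ gcloud config set project my-slurm-project
    $ gcloud services enable compute.googleapis.com
    $ gcloud iam service-accounts create slurm-tf --display-name="Terraform Slurm"
    $ gcloud projects add-iam-policy-binding my-slurm-project \
        --member="serviceAccount:slurm-tf@my-slurm-project.iam.gserviceaccount.com" \
        --role="roles/editor"
    $ gcloud iam service-accounts keys create credentials.json \
        --iam-account="slurm-tf@my-slurm-project.iam.gserviceaccount.com"
    ```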

    7. In the terminal, under this directory, execute
    ```
    $ terraform init
    ```

    8. In the terminal, under this directory, execute
    ```
    $ terraform apply \
    -var "project_id=<PROJECT ID>" \
    -var "credential_file=<CREDENTIAL FILE NAME>"
    ```

    The `<PROJECT ID>` can be found on the GCP console. This command creates all
    the resources on GCP, and users can check the status of these resources on the
    console.
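    The created resources can also be listed from the command line, for example:
    ```
    $ gcloud compute instances list   # the controller, login, and *-image nodes should appear
    $ gcloud compute networks list    # includes gcp-cluster-network
    ```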

    9. To log in to the master node:
    ```
    $ gcloud compute ssh gcp-cluster-login0 --zone=us-central1-a
    ```
    Note that even when the GCP console shows the login node and other nodes are
    ready, it does not mean Slurm is ready; it takes some time for Slurm to become
    usable.
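    One way to tell whether Slurm is ready is to run `sinfo` on the login node;
    until the setup scripts finish, the command is missing or the partitions do not
    show up. For example:
    ```
    $ gcloud compute ssh gcp-cluster-login0 --zone=us-central1-a --command="sinfo"
    ```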

    10. To destroy the cluster:
    ```
    $ terraform destroy \
    -var "project_id=<PROJECT ID>" \
    -var "credential_file=<CREDENTIAL FILE NAME>"
    ```
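    Instead of repeating the `-var` flags for every `apply`/`destroy`, the same
    values can be supplied through Terraform's `TF_VAR_*` environment variables,
    for example:
    ```
    $ export TF_VAR_project_id="<PROJECT ID>"
    $ export TF_VAR_credential_file="<CREDENTIAL FILE NAME>"
    $ terraform destroy
    ```
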
    ### Description of the resources
    * Node `gcp-cluster-controller`: where the Slurm controller daemon runs. This
    node is always on. The NFS server also lives here; `/home`, `/app`, and
    `/etc/munge` are mounted on all other nodes in the cluster, which is why this
    node has a larger disk.
    * Node `gcp-cluster-login0`: the master/login node of the cluster. Users submit
    jobs from this node. This node is always on.
    * Node `gcp-cluster-compute-0-image`: the template node for the Slurm partition
    `debug-cpu`. It is shut down after being created successfully. The cluster
    creates compute nodes on demand and destroys them once no job has been running
    on them for 300 seconds. Compute nodes are created from this template node's
    image, so they do not take long to become usable.
    * Node `gcp-cluster-compute-1-image`: similar to `gcp-cluster-compute-0-image`
    but for the partition `debug-gpu`.
    * Node `gcp-cluster-compute-<x>-<y>`: the actual compute nodes in partition
    `<x>` with node ID `<y>`. These compute nodes are only created, and only shown,
    while there are Slurm jobs (see the example after this list).
    * Network-related: `gcp-cluster-network`, `gcp-cluster-router`,
    `gcp-cluster-nat`, and an external IP used by the virtual router. The default
    SSH port (i.e., 22) is enabled in the firewall and allows connections from any
    external source IP. Another open port provides external access for GCP's
    command-line tool `gcloud`; users can also log in to the controller and the
    master nodes with `gcloud`.
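    To see the on-demand behavior, submit a small test job from the login node. The
    command below is a sketch targeting the `debug-cpu` partition defined in
    `main.tf`; the exact node name that appears may differ.
    ```
    # Slurm should create gcp-cluster-compute-0-0 to run the job, then shut it
    # down again about 300 seconds after the queue becomes empty.
    $ sbatch --partition=debug-cpu --ntasks=1 --wrap="hostname"
    $ squeue   # the job may sit in the CF (configuring) state while the node boots
    $ sinfo    # node states per partition ("~" marks powered-down cloud nodes)
    ```
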
    ### Note
    The creation of the resources may fail at step 8 because of the quotas of the
    resources. GCP sets very low quotas for C2-type instances and V100 GPUs for new
    projects. You may need to request a higher quota from GCP.
    171 changes: 171 additions & 0 deletions main.tf
    @@ -0,0 +1,171 @@
    # Description: terraform scripts to create a slurm cluster on Google Cloud Platform
    # Author: Pi-Yueh Chuang ([email protected])
    # License: BSD 3-Clause
    # Based on https://github.com/SchedMD/slurm-gcp

    terraform {
      required_providers {
        google = {
          source = "hashicorp/google"
          version = "3.37.0"
        }
      }
    }

    provider "google" {
      credentials = file(var.credential_file)
      project = var.project_id
      region = var.region
      zone = var.zone
    }

    # hard-coded variables
    locals {
      cluster_name = "gcp-cluster"
      disable_login_public_ips = true
      disable_controller_public_ips = true
      disable_compute_public_ips = true
      partitions = [
        {
          name = "debug-cpu",
          machine_type = "c2-standard-4",
          max_node_count = 2,
          zone = var.zone,
          compute_disk_type = "pd-ssd",
          compute_disk_size_gb = 30,
          compute_labels = {},
          cpu_platform = "Intel Cascade Lake",
          gpu_count = 0,
          gpu_type = null,
          network_storage = [],
          preemptible_bursting = true,
          vpc_subnet = null,
          static_node_count = 0
        },
        {
          name = "debug-gpu",
          machine_type = "n1-standard-4",
          max_node_count = 1,
          zone = var.zone,
          compute_disk_type = "pd-ssd",
          compute_disk_size_gb = 30,
          compute_labels = {},
          cpu_platform = null,
          gpu_count = 1,
          gpu_type = "nvidia-tesla-v100",
          network_storage = [],
          preemptible_bursting = true,
          vpc_subnet = null,
          static_node_count = 1
        },
      ]
      ompi_version = "v4.0.x"
    }

    module "slurm_cluster_network" {
    source = "github.com/SchedMD/slurm-gcp//tf/modules/network"

    cluster_name = local.cluster_name
    disable_login_public_ips = local.disable_login_public_ips
    disable_controller_public_ips = local.disable_controller_public_ips
    disable_compute_public_ips = local.disable_compute_public_ips
    network_name = null
    partitions = local.partitions
    private_ip_google_access = true
    project = var.project_id
    region = var.region
    shared_vpc_host_project = null
    subnetwork_name = null
    }

    module "slurm_cluster_controller" {
    source = "github.com/SchedMD/slurm-gcp//tf/modules/controller"

    boot_disk_size = 100
    boot_disk_type = "pd-ssd"
    cloudsql = null
    cluster_name = local.cluster_name
    compute_node_scopes = [
    "https://www.googleapis.com/auth/monitoring.write",
    "https://www.googleapis.com/auth/logging.write"
    ]
    compute_node_service_account = "default"
    disable_compute_public_ips = local.disable_compute_public_ips
    disable_controller_public_ips = local.disable_controller_public_ips
    labels = {}
    login_network_storage = []
    login_node_count = 1
    machine_type = "n1-standard-2"
    munge_key = null
    network_storage = var.network_storage
    ompi_version = local.ompi_version
    partitions = local.partitions
    project = var.project_id
    region = var.region
    secondary_disk = false
    secondary_disk_size = 100
    secondary_disk_type = "pd-ssd"
    scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    service_account = "default"
    shared_vpc_host_project = null
    slurm_version = "19.05-latest"
    subnet_depend = module.slurm_cluster_network.subnet_depend
    subnetwork_name = null
    suspend_time = 300
    zone = var.zone
    }

    module "slurm_cluster_login" {
    source = "github.com/SchedMD/slurm-gcp//tf/modules/login"

    boot_disk_size = 20
    boot_disk_type = "pd-standard"
    cluster_name = local.cluster_name
    controller_name = module.slurm_cluster_controller.controller_node_name
    controller_secondary_disk = false
    disable_login_public_ips = local.disable_login_public_ips
    labels = {}
    login_network_storage = []
    machine_type = "n1-standard-2"
    munge_key = null
    network_storage = var.network_storage
    node_count = 1
    ompi_version = local.ompi_version
    region = var.region
    scopes = [
    "https://www.googleapis.com/auth/monitoring.write",
    "https://www.googleapis.com/auth/logging.write"
    ]
    service_account = "default"
    shared_vpc_host_project = null
    subnet_depend = module.slurm_cluster_network.subnet_depend
    subnetwork_name = null
    zone = var.zone
    }

    module "slurm_cluster_compute" {
    source = "github.com/SchedMD/slurm-gcp//tf/modules/compute"

    compute_image_disk_size_gb = 20
    compute_image_disk_type = "pd-ssd"
    compute_image_labels = {}
    compute_image_machine_type = "n1-standard-2"
    controller_name = module.slurm_cluster_controller.controller_node_name
    controller_secondary_disk = 0
    cluster_name = local.cluster_name
    disable_compute_public_ips = local.disable_compute_public_ips
    network_storage = var.network_storage
    ompi_version = local.ompi_version
    partitions = local.partitions
    project = var.project_id
    region = var.region
    scopes = [
    "https://www.googleapis.com/auth/monitoring.write",
    "https://www.googleapis.com/auth/logging.write"
    ]
    service_account = "default"
    shared_vpc_host_project = null
    subnet_depend = module.slurm_cluster_network.subnet_depend
    subnetwork_name = null
    zone = var.zone
    }
    44 changes: 44 additions & 0 deletions variables.tf
    @@ -0,0 +1,44 @@
    # Description: Input variables of main.tf
    # Author: Pi-Yueh Chuang ([email protected])
    # License: BSD 3-Clause


    # project_id is a mandatory variable from users
    variable "project_id" {
    type = string
    description = "The GCP project where the cluster will be created in."
    }

    # credential_file is a mandatory variable from users
    variable "credential_file" {
    type = string
    description = "The JSON credential file of a service account with project editor role."
    }

    variable "region" {
    type = string
    description = "The region where the resources will be allocated in."
    default = "us-central1"
    }

    variable "zone" {
    type = string
    description = "The zone under the region where the resources will be allocated in."
    default = "us-central1-a"
    }

    variable "network_storage" {
    type = list(
    object(
    {
    server_ip = string,
    remote_mount = string,
    local_mount = string,
    fs_type = string,
    mount_options = string
    }
    )
    )
    description = " An array of network attached storage mounts to be configured on all instances."
    default = []
    }
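
    When editing `main.tf` or `variables.tf` (for example, to change the
    partitions), it can help to format and validate the configuration before
    applying; both are standard Terraform subcommands:
    ```
    $ terraform fmt        # normalizes indentation and alignment of the .tf files
    $ terraform validate   # checks that the configuration is internally consistent
    ```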