# Overview:
# Install cloud_sql_proxy binary with appropriate credentials on all driver and worker nodes.
# References:
# https://cloud.google.com/sql/docs/postgres/connect-external-app#proxy
# https://cloud.google.com/sql/docs/postgres/sql-proxy#authentication-options

# Step 1.
# Follow https://cloud.google.com/sql/docs/postgres/connect-external-app#proxy to set up:
# - enable Cloud SQL API
# - service account credentials, with "Cloud SQL Client" role. download the .json credential file
# - find your instance connection name from the Cloud SQL Instance details page

# Step 2.
# Set up a Databricks init script: it downloads the Cloud SQL proxy binary and starts
# it in the background on every driver/worker node when the cluster boots, so the
# database becomes reachable on localhost from every executor.
dbutils.fs.put("/databricks/init/cluster_name/gcloud.sh", """
#!/bin/bash
wget -q https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O cloud_sql_proxy
chmod +x cloud_sql_proxy
./cloud_sql_proxy -instances=_your_cloud_sql_instance_name_=tcp:3306 -credential_file=/path/to/gcloud_creds.json 2>&1 &
echo "finished installing gcloud postgres proxy"
""")
# Run the above code in notebook, and (re)start cluster to pick-up the init script results

# Step 3.
# now in a notebook attached to the cluster initialized as above:
jdbcHostname = "127.0.0.1"  # talk to local proxy
jdbcDatabase = "database_name"
jdbcPort = 3306  # this should line up with the tcp port number specified in init script
username = "db_username"
password = "db_password"

# this example shows postgresql, I'd imagine mySQL to be quite similar.
# NOTE(review): credentials are deliberately NOT embedded in the URL -- a URL with
# ?user=...&password=... tends to leak into logs and Spark UIs that echo the
# connection string. They are passed as separate JDBC options below instead.
jdbcUrl = "jdbc:postgresql://{0}:{1}/{2}".format(jdbcHostname, jdbcPort, jdbcDatabase)

tableName = '_your_table_name_'

# Load the table through the local proxy; bind the result so it can be reused.
df = (spark.read
      .format("jdbc")
      .option("driver", "org.postgresql.Driver")
      .option("url", jdbcUrl)
      .option("user", username)
      .option("password", password)
      .option("dbtable", tableName)
      .load())
df  # last expression in the cell -> notebook displays the DataFrame, as before