#
# Some constants
#
aws_profile = "your_profile"
aws_region = "your_region"
s3_bucket = "your_bucket"

#
# Reading environment variables from aws credential file
#
import os
import configparser

config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))

access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")

#
# Configuring pyspark
#

# see https://github.com/jupyter/docker-stacks/issues/127#issuecomment-214594895
# and https://github.com/radanalyticsio/pyspark-s3-notebook/blob/master/s3-source-example.ipynb
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"

# If this doesn't work you might have to delete your ~/.ivy2 directory to reset your package cache.
# (see https://github.com/databricks/spark-redshift/issues/244#issuecomment-239950148)
import pyspark

sc = pyspark.SparkContext()
# see https://github.com/databricks/spark-redshift/issues/298#issuecomment-271834485
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")

# see https://stackoverflow.com/questions/28844631/how-to-set-hadoop-configuration-values-from-pyspark
hadoop_conf = sc._jsc.hadoopConfiguration()
# see https://stackoverflow.com/questions/43454117/how-do-you-use-s3a-with-spark-2-1-0-on-aws-us-east-2
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.access.key", access_id)
hadoop_conf.set("fs.s3a.secret.key", access_key)
# see http://blog.encomiabile.it/2015/10/29/apache-spark-amazon-s3-and-apache-mesos/
hadoop_conf.set("fs.s3a.connection.maximum", "100000")
# see https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region
hadoop_conf.set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")

#
# Downloading the parquet file
#
sql = pyspark.sql.SparkSession(sc)
path = s3_bucket + "your_path"
dataS3 = sql.read.parquet("s3a://" + path)
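
#
# Inspecting the result (a minimal sketch, not part of the original steps)
#
# Once the read above succeeds, the standard DataFrame calls work on dataS3.
# Any column names or filters would be assumptions about your data, so this
# sticks to schema/count/show, which apply to any parquet file.
dataS3.printSchema()   # list columns and their types
print(dataS3.count())  # number of rows
dataS3.show(5)         # preview the first few rows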