etl.py: sample AWS Glue ETL job by @nitinmlvya (created April 23, 2019)
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

## Resolve the job parameters that Glue passes in (job name and S3 temp dir)
args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

## Initialize the SparkContext, GlueContext, and the Glue job
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
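
## Note (added commentary, not in the original gist): the transformation_ctx
## strings used below give each step a stable name that Glue's job-bookmark
## feature uses to track processed data between runs.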

## Read the two source tables via the Glue Data Catalog (the underlying CSV data lives in Amazon S3)
datasource1 = glueContext.create_dynamic_frame.from_catalog(
    database = "db_demo1",
    table_name = "tbl_syn_source_1_csv",
    transformation_ctx = "datasource1")

datasource2 = glueContext.create_dynamic_frame.from_catalog(
    database = "db_demo1",
    table_name = "tbl_syn_source_2_csv",
    transformation_ctx = "datasource2")
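
## Optional sanity check (my addition, not part of the original gist): print the
## schemas Glue inferred from the catalog so the join keys below can be verified.
datasource1.printSchema()
datasource2.printSchema()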

## Transform: join the two tables on their state-code columns
join1 = Join.apply(
    frame1 = datasource1,
    frame2 = datasource2,
    keys1 = "statecode",
    keys2 = "code",
    transformation_ctx = "join1")
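
## Optional cleanup (my addition, a hedged sketch): Join.apply keeps both key
## columns ("statecode" and "code"). DropFields, also pulled in by the wildcard
## import above, can strip the redundant one; pass joined_clean to the write
## below instead of join1 to use it.
joined_clean = DropFields.apply(frame = join1, paths = ["code"], transformation_ctx = "dropfields1")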

## Write the joined data into Amazon Redshift, staging through the S3 temp dir
datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame = join1,
    catalog_connection = "my-redshift-1",
    connection_options = {"dbtable": "sch_demo_1.tbl_joined", "database": "dev"},
    redshift_tmp_dir = args["TempDir"],
    transformation_ctx = "datasink1")
job.commit()
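
## Notes (added commentary, not in the original gist):
## - When the job runs in AWS, Glue injects --JOB_NAME and --TempDir itself; for
##   a local dry run you would supply hypothetical values, e.g.
##   etl.py --JOB_NAME demo-job --TempDir s3://some-bucket/temp/
## - TempDir matters for the Redshift sink: from_jdbc_conf stages the frame in
##   that S3 path and loads it into Redshift with COPY.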