
Revisions

  1. jsamuel1 revised this gist Jun 9, 2020. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions aws-athena-auto-partition-lambda.py
    @@ -11,6 +11,7 @@
     Twitter: https://twitter.com/SqlAdmin
     License: Free for educational purpose.
    +Minor modifications for my personal account: use environment variables + support organizations.
     NOTE:
     -----
     1) Before schedule it, you need to create partitions for till current date.
  2. jsamuel1 revised this gist Jun 9, 2020. 1 changed file with 46 additions and 31 deletions.
    77 changes: 46 additions & 31 deletions aws-athena-auto-partition-lambda.py
    @@ -14,7 +14,7 @@
     NOTE:
     -----
     1) Before schedule it, you need to create partitions for till current date.
    -2) This is will start creating partitions with next day [current date +1].
    +2) This is will start creating partitions with current day [current date].
     3) This will not return the Athena query is successful or not. But this
     will return the Query Execution ID.
    @@ -30,15 +30,18 @@
     -----------------------
     s3://bucket/AWSLogs/Account_ID/Cloudtrail/regions/year/month/day/log_files
    -PARAMETERS NEEDS TO CHANGE:
    +ENVIRONMENT VARIABLES CAN BE SET:
     ---------------------------
    -1) s3_buckcet - Bucket name where your cloudtrail logs stored.
    -2) s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
    +* organizationid
    +* accountid - comma delimited list of accounts
    +* s3_bucket - Bucket name where your cloudtrail logs stored.
    +* s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
     for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
     So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
    -3) s3_ouput - Path for where your Athena query results need to be saved.
    -4) database - Name of the DB where your Cloudtrail logs table located.
    -5) table_name - Nanme of the table where your Cloudtrail logs table located.
    +* s3_output - Path for where your Athena query results need to be saved.
    +* database - Name of the DB where your Cloudtrail logs table located.
    +* table_name - Name of the table where your Cloudtrail logs table located.
    +* dateoverride - run for a specific date, instead of current date
     DEBUGGING:
     ----------
    @@ -48,26 +51,29 @@

     #Import libraries
     import boto3
    -import datetime
    +from datetime import datetime
    +import os

     #Connection for S3 and Athena
     s3 = boto3.client('s3')
     athena = boto3.client('athena')

     #Get Year, Month, Day for partition (this will get tomorrow date's value)
    -date = datetime.datetime.now()
    +if os.getenv('dateoverride'):
    +    date = datetime.strptime(os.getenv('dateoverride'), '%Y-%m-%d')
    +else:
    +    date = datetime.now()
     athena_year = str(date.year)
     athena_month = str(date.month).rjust(2, '0')
    -athena_day = str(date.day + 1).rjust(2, '0')
    +athena_day = str(date.day).rjust(2, '0')

     #Parameters for S3 log location and Athena table
     #Fill this carefully (Read the commented section on top to help)
    -s3_buckcet = 'cloudtrail-logs'
    -s3_prefix = 'AWSLogs/XXXXXXXXXXXX/CloudTrail/'
    -s3_input = 's3://' + s3_buckcet + '/' + s3_prefix
    -s3_ouput = 's3://aws-athena-query-results-XXXXXXXXXXXXXX-us-east-1'
    -database = 'athena_log_database'
    -table_name = 'cloudtrail_logs_table'
    +organizationid = os.getenv('organizationid') or 'o-xxxxxxx'
    +accountid = os.getenv('accountid') or s3.accountid
    +s3_bucket = os.getenv('s3_bucket') or 'cloudtrail-logs'
    +database = os.getenv('database') or 'athena_log_database'
    +s3_output = os.getenv('s3_output') or f's3://{s3_bucket}/queryresults'

    #Executing the athena query:
    @@ -86,18 +92,27 @@ def run_query(query, database, s3_output):

     #Main function for get regions and run the query on the captured regions
     def lambda_handler(event, context):
    -    result = s3.list_objects(Bucket=s3_buckcet,Prefix=s3_prefix, Delimiter='/')
    -    for regions in result.get('CommonPrefixes'):
    -        get_region=(regions.get('Prefix','').replace(s3_prefix,'').replace('/',''))
    -        query = str("ALTER TABLE "+ table_name +" ADD PARTITION (region='"
    -                    + get_region + "',year="
    -                    + athena_year + ",month="
    -                    + athena_month + ",day="
    -                    + athena_day
    -                    + ") location '"+s3_input
    -                    + get_region
    -                    + "/" + athena_year + "/" + athena_month + "/"
    -                    + athena_day + "';")
    -        #print(get_region) -- for debug
    -        #print(query) -- for debug
    -        run_query(query, database, s3_ouput)
    +    for account in accountid.split(','):
    +        print(f'account: ${account}')
    +
    +        s3_prefix = os.getenv('s3_prefix') or f'{organizationid}/AWSLogs/{account}/CloudTrail/'
    +        s3_input = f's3://{s3_bucket}/{s3_prefix}'
    +        table_name = os.getenv('table_name') or 'cloudtrail_logs_' + account
    +
    +        result = s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, Delimiter='/')
    +        for regions in result.get('CommonPrefixes'):
    +            get_region = (regions.get('Prefix', '').replace(s3_prefix, '').replace('/', ''))
    +            query = f'''
    +                ALTER TABLE {table_name}
    +                ADD PARTITION (
    +                    region='{get_region}',
    +                    year='{athena_year}',
    +                    month='{athena_month}',
    +                    day='{athena_day}'
    +                ) location '{s3_input}{get_region}/{athena_year}/{athena_month}/{athena_day}/';
    +            '''
    +
    +            print(get_region) #-- for debug
    +            print(query) #-- for debug
    +            run_query(query, database, s3_output)
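
    The revised handler above reads all of its configuration from Lambda environment variables. Below is a minimal local smoke-test sketch, assuming the revised file is saved as aws_athena_auto_partition_lambda.py (underscores, so it can be imported) and AWS credentials are available; the org ID, account IDs, bucket, database, and date are placeholder values, not ones taken from the gist.

    # Local smoke test for the environment-variable driven handler (sketch only).
    # Every value below is a hypothetical placeholder.
    import os

    os.environ['organizationid'] = 'o-exampleorg'
    os.environ['accountid'] = '111111111111,222222222222'   # comma-delimited accounts
    os.environ['s3_bucket'] = 'cloudtrail-logs'
    os.environ['database'] = 'athena_log_database'
    os.environ['dateoverride'] = '2020-06-01'                # backfill one specific day

    # Import after the variables are set, because the module reads them at import time.
    import aws_athena_auto_partition_lambda as partitioner

    partitioner.lambda_handler({}, None)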
  3. @TheSuperAgent revised this gist Jul 8, 2018. No changes.
  4. @TheSuperAgent revised this gist Jun 1, 2018. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion aws-athena-auto-partition-lambda.py
    @@ -9,7 +9,6 @@
     Version 1.0
     Author: SqlAdmin
     Twitter: https://twitter.com/SqlAdmin
    -Blog: https://www.sqlgossip.com
     License: Free for educational purpose.
     NOTE:
  5. @TheSuperAgent revised this gist Jun 1, 2018. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions aws-athena-auto-partition-lambda.py
    @@ -38,8 +38,8 @@
     for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
     So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
     3) s3_ouput - Path for where your Athena query results need to be saved.
    -4) database - Name of the DB where your cloudwatch logs table located.
    -5) table_name - Nanme of the table where your cloudwatch logs table located.
    +4) database - Name of the DB where your Cloudtrail logs table located.
    +5) table_name - Nanme of the table where your Cloudtrail logs table located.
     DEBUGGING:
     ----------
    @@ -63,7 +63,7 @@

     #Parameters for S3 log location and Athena table
     #Fill this carefully (Read the commented section on top to help)
    -s3_buckcet = 'sqladmin-cloudtrail'
    +s3_buckcet = 'cloudtrail-logs'
     s3_prefix = 'AWSLogs/XXXXXXXXXXXX/CloudTrail/'
     s3_input = 's3://' + s3_buckcet + '/' + s3_prefix
     s3_ouput = 's3://aws-athena-query-results-XXXXXXXXXXXXXX-us-east-1'
  6. @TheSuperAgent revised this gist May 29, 2018. 1 changed file with 6 additions and 0 deletions.
    6 changes: 6 additions & 0 deletions aws-athena-auto-partition-lambda.py
    @@ -41,6 +41,10 @@
     4) database - Name of the DB where your cloudwatch logs table located.
     5) table_name - Nanme of the table where your cloudwatch logs table located.
    +DEBUGGING:
    +----------
    +1) comment the line 103 [run_query(query, database, s3_ouput]
    +2) remove comment from line 101 and 102 [print(get-regions), print(query)]
     ---------------------------------------------------------------------------------'''

     #Import libraries
    @@ -95,4 +99,6 @@ def lambda_handler(event, context):
                     + get_region
                     + "/" + athena_year + "/" + athena_month + "/"
                     + athena_day + "';")
    +        #print(get_region) -- for debug
    +        #print(query) -- for debug
             run_query(query, database, s3_ouput)
  7. @TheSuperAgent renamed this gist May 29, 2018. 1 changed file with 0 additions and 0 deletions.
  8. @TheSuperAgent created this gist May 29, 2018.
    98 changes: 98 additions & 0 deletions aws-athena-auto-partition-lamba.py
    @@ -0,0 +1,98 @@
    # Lambda function to create partition for Cloudtrail log on daily basis.
    # You need to schedule it in AWS Lambda.

    '''
    -------------------------------------------
    AWS Athena Create Partitions Automatically
    -------------------------------------------
    Version 1.0
    Author: SqlAdmin
    Twitter: https://twitter.com/SqlAdmin
    Blog: https://www.sqlgossip.com
    License: Free for educational purpose.
    NOTE:
    -----
    1) Before schedule it, you need to create partitions for till current date.
    2) This is will start creating partitions with next day [current date +1].
    3) This will not return the Athena query is successful or not. But this
    will return the Query Execution ID.
    HOW THIS WORKS:
    ---------------
    1) It'll check the list of regions that cloudwatch logs captured from the
    S3. Becuase few peoples will use only particular region. So they won't
    get any logs on other regions.
    2) Then it'll start executing the create partition queries against all
    the regions.
    Example Cloudtrail Path:
    -----------------------
    s3://bucket/AWSLogs/Account_ID/Cloudtrail/regions/year/month/day/log_files
    PARAMETERS NEEDS TO CHANGE:
    ---------------------------
    1) s3_buckcet - Bucket name where your cloudtrail logs stored.
    2) s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
    for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
    So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
    3) s3_ouput - Path for where your Athena query results need to be saved.
    4) database - Name of the DB where your cloudwatch logs table located.
    5) table_name - Nanme of the table where your cloudwatch logs table located.
    ---------------------------------------------------------------------------------'''

    #Import libraries
    import boto3
    import datetime

    #Connection for S3 and Athena
    s3 = boto3.client('s3')
    athena = boto3.client('athena')

    #Get Year, Month, Day for partition (this will get tomorrow date's value)
    date = datetime.datetime.now()
    athena_year = str(date.year)
    athena_month = str(date.month).rjust(2, '0')
    athena_day = str(date.day + 1).rjust(2, '0')

    #Parameters for S3 log location and Athena table
    #Fill this carefully (Read the commented section on top to help)
    s3_buckcet = 'sqladmin-cloudtrail'
    s3_prefix = 'AWSLogs/XXXXXXXXXXXX/CloudTrail/'
    s3_input = 's3://' + s3_buckcet + '/' + s3_prefix
    s3_ouput = 's3://aws-athena-query-results-XXXXXXXXXXXXXX-us-east-1'
    database = 'athena_log_database'
    table_name = 'cloudtrail_logs_table'


    #Executing the athena query:
    def run_query(query, database, s3_output):
        query_response = athena.start_query_execution(
            QueryString=query,
            QueryExecutionContext={
                'Database': database
            },
            ResultConfiguration={
                'OutputLocation': s3_output,
            }
        )
        print('Execution ID: ' + query_response['QueryExecutionId'])
        return query_response

    #Main function for get regions and run the query on the captured regions
    def lambda_handler(event, context):
        result = s3.list_objects(Bucket=s3_buckcet, Prefix=s3_prefix, Delimiter='/')
        for regions in result.get('CommonPrefixes'):
            get_region = (regions.get('Prefix', '').replace(s3_prefix, '').replace('/', ''))
            query = str("ALTER TABLE " + table_name + " ADD PARTITION (region='"
                        + get_region + "',year="
                        + athena_year + ",month="
                        + athena_month + ",day="
                        + athena_day
                        + ") location '" + s3_input
                        + get_region
                        + "/" + athena_year + "/" + athena_month + "/"
                        + athena_day + "';")
            run_query(query, database, s3_ouput)
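
    The header of the original file notes that the function "needs to be scheduled in AWS Lambda". One way to do that is a daily CloudWatch Events (EventBridge) rule; the sketch below is a hedged example using boto3, where the function name, ARN, account ID, and schedule are placeholders rather than values from the gist.

    # Sketch: trigger the Lambda once a day with an EventBridge rule (placeholder names/ARNs).
    import boto3

    events = boto3.client('events')
    lambda_client = boto3.client('lambda')

    function_name = 'athena-auto-partition'   # hypothetical function name
    function_arn = f'arn:aws:lambda:us-east-1:111111111111:function:{function_name}'

    # Fire shortly after midnight UTC so the day's partition exists early.
    rule = events.put_rule(
        Name='athena-auto-partition-daily',
        ScheduleExpression='cron(15 0 * * ? *)',
        State='ENABLED',
    )

    # Allow the rule to invoke the function, then attach the function as the rule's target.
    lambda_client.add_permission(
        FunctionName=function_name,
        StatementId='athena-auto-partition-daily',
        Action='lambda:InvokeFunction',
        Principal='events.amazonaws.com',
        SourceArn=rule['RuleArn'],
    )
    events.put_targets(
        Rule='athena-auto-partition-daily',
        Targets=[{'Id': 'athena-auto-partition-target', 'Arn': function_arn}],
    )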