Forked from SQLadmin/aws-athena-auto-partition-lambda.py
Last active June 9, 2020 04:24
Revisions
jsamuel1 revised this gist
Jun 9, 2020 · 1 changed file with 1 addition and 0 deletions.
```diff
@@ -11,6 +11,7 @@
 Twitter: https://twitter.com/SqlAdmin
 License: Free for educational purpose.
+Minor modifications for my personal account: use environment variables + support organizations.

 NOTE:
 -----
 1) Before schedule it, you need to create partitions for till current date.
```
jsamuel1 revised this gist
Jun 9, 2020 · 1 changed file with 46 additions and 31 deletions.
```
@@ -14,7 +14,7 @@
 NOTE:
 -----
 1) Before schedule it, you need to create partitions for till current date.
 2) This is will start creating partitions with current day [current date].
 3) This will not return the Athena query is successful or not. But this will
    return the Query Execution ID.

@@ -30,15 +30,18 @@
 -----------------------
 s3://bucket/AWSLogs/Account_ID/Cloudtrail/regions/year/month/day/log_files

 ENVIRONMENT VARIABLES CAN BE SET:
 ---------------------------
 * organizationid
 * accountid - comma delimited list of accounts
 * s3_bucket - Bucket name where your cloudtrail logs stored.
 * s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
    for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
    So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
 * s3_output - Path for where your Athena query results need to be saved.
 * database - Name of the DB where your Cloudtrail logs table located.
 * table_name - Name of the table where your Cloudtrail logs table located.
 * dateoverride - run for a specific date, instead of current date

 DEBUGGING:
 ----------

@@ -48,26 +51,29 @@
 #Import libraries
 import boto3
 from datetime import datetime
 import os

 #Connection for S3 and Athena
 s3 = boto3.client('s3')
 athena = boto3.client('athena')

 #Get Year, Month, Day for partition (this will get tomorrow date's value)
 if os.getenv('dateoverride'):
     date = datetime.strptime(os.getenv('dateoverride'), '%Y-%m-%d')
 else:
     date = datetime.now()
 athena_year = str(date.year)
 athena_month = str(date.month).rjust(2, '0')
 athena_day = str(date.day).rjust(2, '0')

 #Parameters for S3 log location and Athena table
 #Fill this carefully (Read the commented section on top to help)
 organizationid = os.getenv('organizationid') or 'o-xxxxxxx'
 accountid = os.getenv('accountid') or s3.accountid
 s3_bucket = os.getenv('s3_bucket') or 'cloudtrail-logs'
 database = os.getenv('database') or 'athena_log_database'
 s3_output = os.getenv('s3_output') or f's3://{s3_bucket}/queryresults'

 #Executing the athena query:

@@ -86,18 +92,27 @@ def run_query(query, database, s3_output):
 #Main function for get regions and run the query on the captured regions
 def lambda_handler(event, context):
     for account in accountid.split(','):
         print(f'account: ${account}')
         s3_prefix = os.getenv('s3_prefix') or f'{organizationid}/AWSLogs/{account}/CloudTrail/'
         s3_input = f's3://{s3_bucket}/{s3_prefix}'
         table_name = os.getenv('table_name') or 'cloudtrail_logs_' + account
         result = s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, Delimiter='/')
         for regions in result.get('CommonPrefixes'):
             get_region = (regions.get('Prefix','').replace(s3_prefix,'').replace('/',''))
             query = f'''
                 ALTER TABLE {table_name}
                 ADD PARTITION (
                     region='{get_region}',
                     year='{athena_year}',
                     month='{athena_month}',
                     day='{athena_day}'
                 )
                 location '{s3_input}{get_region}/{athena_year}/{athena_month}/{athena_day}/';
             '''
             print(get_region)  #-- for debug
             print(query)  #-- for debug
             run_query(query, database, s3_output)
```
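The NOTE still applies to this revision: run_query only surfaces the QueryExecutionId, so the schedule cannot tell whether a partition was actually added. If that signal matters, one option is to poll Athena for the terminal query state. A minimal sketch, assuming a hypothetical wait_for_query helper (the helper name, timeout, and poll interval are not part of the gist):

```python
import time
import boto3

athena = boto3.client('athena')

def wait_for_query(query_execution_id, timeout_seconds=60):
    """Poll Athena until the query reaches a terminal state; raise on failure."""
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        state = athena.get_query_execution(
            QueryExecutionId=query_execution_id
        )['QueryExecution']['Status']['State']
        if state == 'SUCCEEDED':
            return state
        if state in ('FAILED', 'CANCELLED'):
            raise RuntimeError(f'Query {query_execution_id} ended as {state}')
        time.sleep(2)  # ALTER TABLE ADD PARTITION usually finishes in seconds
    raise TimeoutError(f'Query {query_execution_id} still running after {timeout_seconds}s')
```

Calling wait_for_query(query_response['QueryExecutionId']) after run_query would make a failed partition query fail the Lambda invocation, so a scheduled run shows up as an error instead of silently succeeding.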
TheSuperAgent revised this gist
Jul 8, 2018 · No changes.
TheSuperAgent revised this gist
Jun 1, 2018 · 1 changed file with 0 additions and 1 deletion.
```diff
@@ -9,7 +9,6 @@
 Version 1.0
 Author: SqlAdmin
 Twitter: https://twitter.com/SqlAdmin
-Blog: https://www.sqlgossip.com
 License: Free for educational purpose.

 NOTE:
```
TheSuperAgent revised this gist
Jun 1, 2018 · 1 changed file with 3 additions and 3 deletions.
```diff
@@ -38,8 +38,8 @@
    for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
    So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
 3) s3_ouput - Path for where your Athena query results need to be saved.
-4) database - Name of the DB where your cloudwatch logs table located.
-5) table_name - Nanme of the table where your cloudwatch logs table located.
+4) database - Name of the DB where your Cloudtrail logs table located.
+5) table_name - Nanme of the table where your Cloudtrail logs table located.

 DEBUGGING:
 ----------
@@ -63,7 +63,7 @@
 #Parameters for S3 log location and Athena table
 #Fill this carefully (Read the commented section on top to help)
-s3_buckcet = 'sqladmin-cloudtrail'
+s3_buckcet = 'cloudtrail-logs'
 s3_prefix = 'AWSLogs/XXXXXXXXXXXX/CloudTrail/'
 s3_input = 's3://' + s3_buckcet + '/' + s3_prefix
 s3_ouput = 's3://aws-athena-query-results-XXXXXXXXXXXXXX-us-east-1'
```
TheSuperAgent revised this gist
May 29, 2018 · 1 changed file with 6 additions and 0 deletions.
```diff
@@ -41,6 +41,10 @@
 4) database - Name of the DB where your cloudwatch logs table located.
 5) table_name - Nanme of the table where your cloudwatch logs table located.

+DEBUGGING:
+----------
+1) comment the line 103 [run_query(query, database, s3_ouput]
+2) remove comment from line 101 and 102 [print(get-regions), print(query)]
 ---------------------------------------------------------------------------------'''

 #Import libraries
@@ -95,4 +99,6 @@ def lambda_handler(event, context):
     + get_region + "/" + athena_year + "/" + athena_month + "/" + athena_day + "';")
+    #print(get_region) -- for debug
+    #print(query) -- for debug
     run_query(query, database, s3_ouput)
```
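Commenting and uncommenting lines 101-103 by hand works, but the same dry-run behaviour can be driven by configuration. A hedged sketch of an alternative (the DEBUG variable name and maybe_run_query helper are assumptions, not part of the gist):

```python
import os

# Hypothetical toggle: set DEBUG=1 in the Lambda environment to print
# queries instead of executing them.
DEBUG = os.getenv('DEBUG') == '1'

def maybe_run_query(query, database, s3_output, runner):
    """Print the query in debug mode; otherwise execute it via the runner."""
    if DEBUG:
        print(query)
        return None
    return runner(query, database, s3_output)
```

The handler would then call maybe_run_query(query, database, s3_ouput, run_query), and switching debugging on or off becomes an environment-variable change rather than a source edit.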
TheSuperAgent renamed this gist
May 29, 2018 · 1 changed file with 0 additions and 0 deletions.
File renamed without changes.
TheSuperAgent created this gist
May 29, 2018
```python
# Lambda function to create partition for Cloudtrail log on daily basis.
# You need to schedule it in AWS Lambda.

'''
-------------------------------------------
AWS Athena Create Partitions Automatically
-------------------------------------------
Version 1.0
Author: SqlAdmin
Twitter: https://twitter.com/SqlAdmin
Blog: https://www.sqlgossip.com
License: Free for educational purpose.

NOTE:
-----
1) Before schedule it, you need to create partitions for till current date.
2) This is will start creating partitions with next day [current date +1].
3) This will not return the Athena query is successful or not. But this will
   return the Query Execution ID.

HOW THIS WORKS:
---------------
1) It'll check the list of regions that cloudwatch logs captured from the S3.
   Becuase few peoples will use only particular region. So they won't get
   any logs on other regions.
2) Then it'll start executing the create partition queries against all the
   regions.

Example Cloudtrail Path:
-----------------------
s3://bucket/AWSLogs/Account_ID/Cloudtrail/regions/year/month/day/log_files

PARAMETERS NEEDS TO CHANGE:
---------------------------
1) s3_buckcet - Bucket name where your cloudtrail logs stored.
2) s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
   for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
   So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
3) s3_ouput - Path for where your Athena query results need to be saved.
4) database - Name of the DB where your cloudwatch logs table located.
5) table_name - Nanme of the table where your cloudwatch logs table located.
---------------------------------------------------------------------------------'''

#Import libraries
import boto3
import datetime

#Connection for S3 and Athena
s3 = boto3.client('s3')
athena = boto3.client('athena')

#Get Year, Month, Day for partition (this will get tomorrow date's value)
date = datetime.datetime.now()
athena_year = str(date.year)
athena_month = str(date.month).rjust(2, '0')
athena_day = str(date.day + 1).rjust(2, '0')

#Parameters for S3 log location and Athena table
#Fill this carefully (Read the commented section on top to help)
s3_buckcet = 'sqladmin-cloudtrail'
s3_prefix = 'AWSLogs/XXXXXXXXXXXX/CloudTrail/'
s3_input = 's3://' + s3_buckcet + '/' + s3_prefix
s3_ouput = 's3://aws-athena-query-results-XXXXXXXXXXXXXX-us-east-1'
database = 'athena_log_database'
table_name = 'cloudtrail_logs_table'

#Executing the athena query:
def run_query(query, database, s3_output):
    query_response = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database
        },
        ResultConfiguration={
            'OutputLocation': s3_output,
        }
    )
    print('Execution ID: ' + query_response['QueryExecutionId'])
    return query_response

#Main function for get regions and run the query on the captured regions
def lambda_handler(event, context):
    result = s3.list_objects(Bucket=s3_buckcet, Prefix=s3_prefix, Delimiter='/')
    for regions in result.get('CommonPrefixes'):
        get_region = (regions.get('Prefix', '').replace(s3_prefix, '').replace('/', ''))
        query = str("ALTER TABLE " + table_name + " ADD PARTITION (region='"
                    + get_region + "',year=" + athena_year + ",month=" + athena_month
                    + ",day=" + athena_day + ") location '" + s3_input
                    + get_region + "/" + athena_year + "/" + athena_month + "/"
                    + athena_day + "';")
        run_query(query, database, s3_ouput)
```
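One caveat in this original version: computing "tomorrow" as str(date.day + 1) breaks on the last day of a month (for example, January 31 yields day 32, so the created partition never matches any logs and February 1 gets no partition). A safer sketch, assuming the "next day" behaviour the NOTE describes is still wanted, is to shift the whole date with a timedelta before splitting it into parts:

```python
from datetime import datetime, timedelta

# Compute tomorrow as a real date so day, month, and year roll over correctly.
date = datetime.now() + timedelta(days=1)
athena_year = str(date.year)
athena_month = str(date.month).rjust(2, '0')
athena_day = str(date.day).rjust(2, '0')
```

The later revision above sidesteps this by partitioning for the current day instead, which is why its date logic drops the + 1 entirely.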