
Revisions

  1. jsamuel1 revised this gist Jun 9, 2020. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions aws-athena-auto-partition-lambda.py
    @@ -11,6 +11,7 @@
     Twitter: https://twitter.com/SqlAdmin
     License: Free for educational purpose.
    +Minor modifications for my personal account: use environment variables + support organizations.
     NOTE:
     -----
     1) Before schedule it, you need to create partitions for till current date.
  2. jsamuel1 revised this gist Jun 9, 2020. 1 changed file with 46 additions and 31 deletions.
    77 changes: 46 additions & 31 deletions aws-athena-auto-partition-lambda.py
    @@ -14,7 +14,7 @@
     NOTE:
     -----
     1) Before schedule it, you need to create partitions for till current date.
    -2) This is will start creating partitions with next day [current date +1].
    +2) This is will start creating partitions with current day [current date].
     3) This will not return the Athena query is successful or not. But this
     will return the Query Execution ID.
    @@ -30,15 +30,18 @@
     -----------------------
     s3://bucket/AWSLogs/Account_ID/Cloudtrail/regions/year/month/day/log_files
    -PARAMETERS NEEDS TO CHANGE:
    +ENVIRONMENT VARIABLES CAN BE SET:
     ---------------------------
    -1) s3_buckcet - Bucket name where your cloudtrail logs stored.
    -2) s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
    +* organizationid
    +* accountid - comma delimited list of accounts
    +* s3_bucket - Bucket name where your cloudtrail logs stored.
    +* s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
     for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
     So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
    -3) s3_ouput - Path for where your Athena query results need to be saved.
    -4) database - Name of the DB where your Cloudtrail logs table located.
    -5) table_name - Nanme of the table where your Cloudtrail logs table located.
    +* s3_output - Path for where your Athena query results need to be saved.
    +* database - Name of the DB where your Cloudtrail logs table located.
    +* table_name - Name of the table where your Cloudtrail logs table located.
    +* dateoverride - run for a specific date, instead of current date
     DEBUGGING:
     ----------
    @@ -48,26 +51,29 @@

     #Import libraries
     import boto3
    -import datetime
    +from datetime import datetime
    +import os

     #Connection for S3 and Athena
     s3 = boto3.client('s3')
     athena = boto3.client('athena')

     #Get Year, Month, Day for partition (this will get tomorrow date's value)
    -date = datetime.datetime.now()
    +if os.getenv('dateoverride'):
    +    date = datetime.strptime(os.getenv('dateoverride'), '%Y-%m-%d')
    +else:
    +    date = datetime.now()
     athena_year = str(date.year)
     athena_month = str(date.month).rjust(2, '0')
    -athena_day = str(date.day + 1).rjust(2, '0')
    +athena_day = str(date.day).rjust(2, '0')

     #Parameters for S3 log location and Athena table
     #Fill this carefully (Read the commented section on top to help)
    -s3_buckcet = 'cloudtrail-logs'
    -s3_prefix = 'AWSLogs/XXXXXXXXXXXX/CloudTrail/'
    -s3_input = 's3://' + s3_buckcet + '/' + s3_prefix
    -s3_ouput = 's3://aws-athena-query-results-XXXXXXXXXXXXXX-us-east-1'
    -database = 'athena_log_database'
    -table_name = 'cloudtrail_logs_table'
    +organizationid = os.getenv('organizationid') or 'o-xxxxxxx'
    +accountid = os.getenv('accountid') or s3.accountid
    +s3_bucket = os.getenv('s3_bucket') or 'cloudtrail-logs'
    +database = os.getenv('database') or 'athena_log_database'
    +s3_output = os.getenv('s3_output') or f's3://{s3_bucket}/queryresults'

    #Executing the athena query:
    @@ -86,18 +92,27 @@ def run_query(query, database, s3_output):

     #Main function for get regions and run the query on the captured regions
     def lambda_handler(event, context):
    -    result = s3.list_objects(Bucket=s3_buckcet,Prefix=s3_prefix, Delimiter='/')
    -    for regions in result.get('CommonPrefixes'):
    -        get_region=(regions.get('Prefix','').replace(s3_prefix,'').replace('/',''))
    -        query = str("ALTER TABLE "+ table_name +" ADD PARTITION (region='"
    -                    + get_region + "',year="
    -                    + athena_year + ",month="
    -                    + athena_month + ",day="
    -                    + athena_day
    -                    + ") location '"+s3_input
    -                    + get_region
    -                    + "/" + athena_year + "/" + athena_month + "/"
    -                    + athena_day + "';")
    -        #print(get_region) -- for debug
    -        #print(query) -- for debug
    -        run_query(query, database, s3_ouput)
    +    for account in accountid.split(','):
    +        print(f'account: ${account}')
    +
    +        s3_prefix = os.getenv('s3_prefix') or f'{organizationid}/AWSLogs/{account}/CloudTrail/'
    +        s3_input = f's3://{s3_bucket}/{s3_prefix}'
    +        table_name = os.getenv('table_name') or 'cloudtrail_logs_' + account
    +
    +        result = s3.list_objects(Bucket=s3_bucket, Prefix=s3_prefix, Delimiter='/')
    +        for regions in result.get('CommonPrefixes'):
    +            get_region = (regions.get('Prefix', '').replace(s3_prefix, '').replace('/', ''))
    +            query = f'''
    +                ALTER TABLE {table_name}
    +                ADD PARTITION (
    +                    region='{get_region}',
    +                    year='{athena_year}',
    +                    month='{athena_month}',
    +                    day='{athena_day}'
    +                ) location '{s3_input}{get_region}/{athena_year}/{athena_month}/{athena_day}/';
    +            '''
    +
    +            print(get_region) #-- for debug
    +            print(query) #-- for debug
    +            run_query(query, database, s3_output)
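
    The revised handler above reads all of its configuration from Lambda environment variables. Below is a minimal local smoke-test sketch, assuming the revised file is saved as aws_athena_auto_partition_lambda.py (underscores, so it can be imported) and AWS credentials are available; the org ID, account IDs, bucket, database, and date are placeholder values, not ones taken from the gist.

    # Local smoke test for the environment-variable driven handler (sketch only).
    # Every value below is a hypothetical placeholder.
    import os

    os.environ['organizationid'] = 'o-exampleorg'
    os.environ['accountid'] = '111111111111,222222222222'   # comma-delimited accounts
    os.environ['s3_bucket'] = 'cloudtrail-logs'
    os.environ['database'] = 'athena_log_database'
    os.environ['dateoverride'] = '2020-06-01'                # backfill one specific day

    # Import after the variables are set, because the module reads them at import time.
    import aws_athena_auto_partition_lambda as partitioner

    partitioner.lambda_handler({}, None)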
  3. @TheSuperAgent revised this gist Jul 8, 2018. No changes.
  4. @TheSuperAgent revised this gist Jun 1, 2018. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion aws-athena-auto-partition-lambda.py
    @@ -9,7 +9,6 @@
     Version 1.0
     Author: SqlAdmin
     Twitter: https://twitter.com/SqlAdmin
    -Blog: https://www.sqlgossip.com
     License: Free for educational purpose.
     NOTE:
  5. @TheSuperAgent revised this gist Jun 1, 2018. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions aws-athena-auto-partition-lambda.py
    @@ -38,8 +38,8 @@
     for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
     So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
     3) s3_ouput - Path for where your Athena query results need to be saved.
    -4) database - Name of the DB where your cloudwatch logs table located.
    -5) table_name - Nanme of the table where your cloudwatch logs table located.
    +4) database - Name of the DB where your Cloudtrail logs table located.
    +5) table_name - Nanme of the table where your Cloudtrail logs table located.
     DEBUGGING:
     ----------
    @@ -63,7 +63,7 @@

     #Parameters for S3 log location and Athena table
     #Fill this carefully (Read the commented section on top to help)
    -s3_buckcet = 'sqladmin-cloudtrail'
    +s3_buckcet = 'cloudtrail-logs'
     s3_prefix = 'AWSLogs/XXXXXXXXXXXX/CloudTrail/'
     s3_input = 's3://' + s3_buckcet + '/' + s3_prefix
     s3_ouput = 's3://aws-athena-query-results-XXXXXXXXXXXXXX-us-east-1'
  6. @TheSuperAgent revised this gist May 29, 2018. 1 changed file with 6 additions and 0 deletions.
    6 changes: 6 additions & 0 deletions aws-athena-auto-partition-lambda.py
    @@ -41,6 +41,10 @@
     4) database - Name of the DB where your cloudwatch logs table located.
     5) table_name - Nanme of the table where your cloudwatch logs table located.
    +DEBUGGING:
    +----------
    +1) comment the line 103 [run_query(query, database, s3_ouput]
    +2) remove comment from line 101 and 102 [print(get-regions), print(query)]
     ---------------------------------------------------------------------------------'''

     #Import libraries
    @@ -95,4 +99,6 @@ def lambda_handler(event, context):
                     + get_region
                     + "/" + athena_year + "/" + athena_month + "/"
                     + athena_day + "';")
    +        #print(get_region) -- for debug
    +        #print(query) -- for debug
             run_query(query, database, s3_ouput)
  7. @TheSuperAgent renamed this gist May 29, 2018. 1 changed file with 0 additions and 0 deletions.
  8. @TheSuperAgent created this gist May 29, 2018.
    98 changes: 98 additions & 0 deletions aws-athena-auto-partition-lamba.py
    @@ -0,0 +1,98 @@
    # Lambda function to create partition for Cloudtrail log on daily basis.
    # You need to schedule it in AWS Lambda.

    '''
    -------------------------------------------
    AWS Athena Create Partitions Automatically
    -------------------------------------------
    Version 1.0
    Author: SqlAdmin
    Twitter: https://twitter.com/SqlAdmin
    Blog: https://www.sqlgossip.com
    License: Free for educational purpose.
    NOTE:
    -----
    1) Before schedule it, you need to create partitions for till current date.
    2) This is will start creating partitions with next day [current date +1].
    3) This will not return the Athena query is successful or not. But this
    will return the Query Execution ID.
    HOW THIS WORKS:
    ---------------
    1) It'll check the list of regions that cloudwatch logs captured from the
    S3. Becuase few peoples will use only particular region. So they won't
    get any logs on other regions.
    2) Then it'll start executing the create partition queries against all
    the regions.
    Example Cloudtrail Path:
    -----------------------
    s3://bucket/AWSLogs/Account_ID/Cloudtrail/regions/year/month/day/log_files
    PARAMETERS NEEDS TO CHANGE:
    ---------------------------
    1) s3_buckcet - Bucket name where your cloudtrail logs stored.
    2) s3_prefix - Path for your cloudtrail logs (give the prefix before the regions.
    for eg: s3://bucket/AWSLogs/AccountID/Cloudtrail/regions/year/month/day/log_files
    So you need to use path: AWSLogs/AccountID/Cloudtrail/ )
    3) s3_ouput - Path for where your Athena query results need to be saved.
    4) database - Name of the DB where your cloudwatch logs table located.
    5) table_name - Nanme of the table where your cloudwatch logs table located.
    ---------------------------------------------------------------------------------'''

    #Import libraries
    import boto3
    import datetime

    #Connection for S3 and Athena
    s3 = boto3.client('s3')
    athena = boto3.client('athena')

    #Get Year, Month, Day for partition (this will get tomorrow date's value)
    date = datetime.datetime.now()
    athena_year = str(date.year)
    athena_month = str(date.month).rjust(2, '0')
    athena_day = str(date.day + 1).rjust(2, '0')

    #Parameters for S3 log location and Athena table
    #Fill this carefully (Read the commented section on top to help)
    s3_buckcet = 'sqladmin-cloudtrail'
    s3_prefix = 'AWSLogs/XXXXXXXXXXXX/CloudTrail/'
    s3_input = 's3://' + s3_buckcet + '/' + s3_prefix
    s3_ouput = 's3://aws-athena-query-results-XXXXXXXXXXXXXX-us-east-1'
    database = 'athena_log_database'
    table_name = 'cloudtrail_logs_table'


    #Executing the athena query:
    def run_query(query, database, s3_output):
        query_response = athena.start_query_execution(
            QueryString=query,
            QueryExecutionContext={
                'Database': database
            },
            ResultConfiguration={
                'OutputLocation': s3_output,
            }
        )
        print('Execution ID: ' + query_response['QueryExecutionId'])
        return query_response

    #Main function for get regions and run the query on the captured regions
    def lambda_handler(event, context):
        result = s3.list_objects(Bucket=s3_buckcet, Prefix=s3_prefix, Delimiter='/')
        for regions in result.get('CommonPrefixes'):
            get_region = (regions.get('Prefix', '').replace(s3_prefix, '').replace('/', ''))
            query = str("ALTER TABLE " + table_name + " ADD PARTITION (region='"
                        + get_region + "',year="
                        + athena_year + ",month="
                        + athena_month + ",day="
                        + athena_day
                        + ") location '" + s3_input
                        + get_region
                        + "/" + athena_year + "/" + athena_month + "/"
                        + athena_day + "';")
            run_query(query, database, s3_ouput)
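
    The header of the original file notes that the function "needs to be scheduled in AWS Lambda". One way to do that is a daily CloudWatch Events (EventBridge) rule; the sketch below is a hedged example using boto3, where the function name, ARN, account ID, and schedule are placeholders rather than values from the gist.

    # Sketch: trigger the Lambda once a day with an EventBridge rule (placeholder names/ARNs).
    import boto3

    events = boto3.client('events')
    lambda_client = boto3.client('lambda')

    function_name = 'athena-auto-partition'   # hypothetical function name
    function_arn = f'arn:aws:lambda:us-east-1:111111111111:function:{function_name}'

    # Fire shortly after midnight UTC so the day's partition exists early.
    rule = events.put_rule(
        Name='athena-auto-partition-daily',
        ScheduleExpression='cron(15 0 * * ? *)',
        State='ENABLED',
    )

    # Allow the rule to invoke the function, then attach the function as the rule's target.
    lambda_client.add_permission(
        FunctionName=function_name,
        StatementId='athena-auto-partition-daily',
        Action='lambda:InvokeFunction',
        Principal='events.amazonaws.com',
        SourceArn=rule['RuleArn'],
    )
    events.put_targets(
        Rule='athena-auto-partition-daily',
        Targets=[{'Id': 'athena-auto-partition-target', 'Arn': function_arn}],
    )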