# A collection of useful Athena Queries
## Web Application Firewall
### Create WAF logs table with partition projection (no need to load partitions)
- See the [docs](https://docs.aws.amazon.com/athena/latest/ug/waf-logs.html) for reference
- Replace the values `BUCKET_NAME`, `AWS_ACCOUNT_NUMBER`, `AWS_REGION`, `WEB_ACL_NAME` accordingly
- Replace the start date in `'projection.date.range' = '2023/01/01/00/00,NOW'` with the date your WAF logging begins (check the earliest log prefix in S3)
Query
```sql
CREATE EXTERNAL TABLE `waf_logs`(
  `timestamp` bigint,
  `formatversion` int,
  `webaclid` string,
  `terminatingruleid` string,
  `terminatingruletype` string,
  `action` string,
  `terminatingrulematchdetails` array <
    struct <
      conditiontype: string,
      sensitivitylevel: string,
      location: string,
      matcheddata: array < string >
    >
  >,
  `httpsourcename` string,
  `httpsourceid` string,
  `rulegrouplist` array <
    struct <
      rulegroupid: string,
      terminatingrule: struct <
        ruleid: string,
        action: string,
        rulematchdetails: array <
          struct <
            conditiontype: string,
            sensitivitylevel: string,
            location: string,
            matcheddata: array < string >
          >
        >
      >,
      nonterminatingmatchingrules: array <
        struct <
          ruleid: string,
          action: string,
          rulematchdetails: array <
            struct <
              conditiontype: string,
              sensitivitylevel: string,
              location: string,
              matcheddata: array < string >
            >
          >
        >
      >,
      excludedrules: string
    >
  >,
  `ratebasedrulelist` array <
    struct <
      ratebasedruleid: string,
      limitkey: string,
      maxrateallowed: int
    >
  >,
  `nonterminatingmatchingrules` array <
    struct <
      ruleid: string,
      action: string,
      rulematchdetails: array <
        struct <
          conditiontype: string,
          sensitivitylevel: string,
          location: string,
          matcheddata: array < string >
        >
      >,
      captcharesponse: struct <
        responsecode: string,
        solvetimestamp: string
      >
    >
  >,
  `requestheadersinserted` array <
    struct <
      name: string,
      value: string
    >
  >,
  `responsecodesent` string,
  `httprequest` struct <
    clientip: string,
    country: string,
    headers: array <
      struct <
        name: string,
        value: string
      >
    >,
    uri: string,
    args: string,
    httpversion: string,
    httpmethod: string,
    requestid: string
  >,
  `labels` array <
    struct <
      name: string
    >
  >,
  `captcharesponse` struct <
    responsecode: string,
    solvetimestamp: string,
    failureReason: string
  >
)
PARTITIONED BY (
  `region` string,
  `date` string)
ROW FORMAT SERDE
  'org.openx.data.jsonserde.JsonSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  's3://BUCKET_NAME/AWSLogs/AWS_ACCOUNT_NUMBER/WAFLogs/AWS_REGION/WEB_ACL_NAME/'
TBLPROPERTIES(
  'projection.enabled' = 'true',
  'projection.region.type' = 'enum',
  'projection.region.values' = 'AWS_REGION',
  'projection.date.type' = 'date',
  'projection.date.range' = '2023/01/01/00/00,NOW',
  'projection.date.format' = 'yyyy/MM/dd/HH/mm',
  'projection.date.interval' = '1',
  'projection.date.interval.unit' = 'HOURS',
  'storage.location.template' = 's3://BUCKET_NAME/AWSLogs/AWS_ACCOUNT_NUMBER/WAFLogs/${region}/WEB_ACL_NAME/${date}')
```
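### Query blocked requests (example)
- A minimal sketch against the `waf_logs` table above; `AWS_REGION` and the date range are placeholders to adjust to your own Web ACL and projection settings
Query
```sql
-- Recent blocked requests in a one-day window, using the projected region/date partitions to keep the scan small.
SELECT
  from_unixtime("timestamp" / 1000) AS request_time, -- WAF logs store the timestamp in epoch milliseconds
  terminatingruleid,
  action,
  httprequest.clientip,
  httprequest.country,
  httprequest.uri
FROM waf_logs
WHERE region = 'AWS_REGION'
  AND date >= '2023/01/01/00/00' -- partition values use the yyyy/MM/dd/HH/mm format defined above
  AND date < '2023/01/02/00/00'
  AND action = 'BLOCK'
ORDER BY request_time DESC
LIMIT 100
```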
## Application Load Balancer Access Logs
### Create ALB logs table with partition projection
- See the [docs](https://aws.amazon.com/premiumsupport/knowledge-center/athena-analyze-access-logs/) here
- Replace the values `BUCKET_NAME`, `AWS_ACCOUNT_NUMBER`, `AWS_REGION` accordingly
- Replace the start date in `'projection.date.range' = '2022/01/01,NOW'` with the date your ALB logging begins (check the earliest log prefix in S3)
Query
```sql
CREATE EXTERNAL TABLE IF NOT EXISTS alb_logs_projected (
type string,
time string,
elb string,
client_ip string,
client_port int,
target_ip string,
target_port int,
request_processing_time double,
target_processing_time double,
response_processing_time double,
elb_status_code int,
target_status_code string,
received_bytes bigint,
sent_bytes bigint,
request_verb string,
request_url string,
request_proto string,
user_agent string,
ssl_cipher string,
ssl_protocol string,
target_group_arn string,
trace_id string,
domain_name string,
chosen_cert_arn string,
matched_rule_priority string,
request_creation_time string,
actions_executed string,
redirect_url string,
lambda_error_reason string,
target_port_list string,
target_status_code_list string,
classification string,
classification_reason string
)
PARTITIONED BY
(
`date` STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' =
'([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) (.*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^ ]*)\" \"([^\\s]+?)\" \"([^\\s]+)\" \"([^ ]*)\" \"([^ ]*)\" ?([^ ]*)?'
)
LOCATION 's3://BUCKET_NAME/AWSLogs/AWS_ACCOUNT_NUMBER/elasticloadbalancing/AWS_REGION/'
TBLPROPERTIES
(
"projection.enabled" = "true",
"projection.date.type" = "date",
"projection.date.range" = "2022/01/01,NOW",
"projection.date.format" = "yyyy/MM/dd",
"projection.date.interval" = "1",
"projection.date.interval.unit" = "DAYS",
"storage.location.template" = "s3://BUCKET_NAME/AWSLogs/AWS_ACCOUNT_NUMBER/elasticloadbalancing/AWS_REGION/${date}"
)
```
### Get last requests of a stopped instance/task
Query
```sql
-- Get the IP address of a crashed task from the Stopped Tasks Widget on the Production-API-Segmented CW Dashboard
SELECT *
FROM alb_logs_projected
WHERE date = '2022/10/17'
AND target_processing_time > 10 -- seconds: requests that take long to process can be problematic
AND received_bytes > 8000 -- bytes: large payloads can typically cause OutOfMemory crashes on ECS tasks
-- AND request_url LIKE '%some_url'
-- AND regexp_like(request_verb, 'PUT')
AND regexp_like(target_ip, '172.31.29.76') -- check the IP of the failed task
-- AND regexp_like(target_ip, '172.31.29.76|172.31.14.108|172.31.18.14|172.31.11.49|172.31.27.153') -- check multiple IPs
ORDER BY time DESC
```
### Get the amount of data sent and received per url in an ALB
Query
```sql
SELECT
request_verb,
request_url,
count(*) as count,
elb_status_code,
count(*)/(12 * 60) as count_per_min, -- divisor is the number of minutes in the queried window; adjust it to match the date filter below
ceil(avg(received_bytes/1024)) as avg_received_kb,
ceil(avg(sent_bytes/1024)) as avg_sent_kb,
ceil(avg(target_processing_time)) as avg_target_processing_time_secs,
ceil(max(received_bytes/1024)) as max_received_kb,
ceil(max(sent_bytes/1024)) as max_sent_kb,
ceil(max(target_processing_time)) as max_target_processing_time_secs,
ceil((ceil(avg(target_processing_time)) * count(*))/3600) as total_processing_time_hrs
FROM alb_logs_projected
WHERE date like '2023/01/%'
GROUP BY 1, 2, 4
ORDER BY 3 DESC
```
## CloudTrail
### Create CloudTrail logs table with partition projection
- Replace `CLOUDTRAIL_BUCKET_NAME`, the organization ID (`o-l09l6xxxx`), and the `ACCOUNT_N` placeholders in `projection.account.values` accordingly
- Replace the start date in `'projection.timestamp.range'='2023/01/01,NOW'` with the date your CloudTrail logging begins (check the earliest log prefix in S3)
Query
```sql
CREATE EXTERNAL TABLE cloudtrail_logs_pp(
  eventVersion STRING,
  userIdentity STRUCT<
    type: STRING,
    principalId: STRING,
    arn: STRING,
    accountId: STRING,
    invokedBy: STRING,
    accessKeyId: STRING,
    userName: STRING,
    sessionContext: STRUCT<
      attributes: STRUCT<
        mfaAuthenticated: STRING,
        creationDate: STRING>,
      sessionIssuer: STRUCT<
        type: STRING,
        principalId: STRING,
        arn: STRING,
        accountId: STRING,
        userName: STRING>,
      ec2RoleDelivery: STRING,
      webIdFederationData: MAP<STRING,STRING>
    >
  >,
  eventTime STRING,
  eventSource STRING,
  eventName STRING,
  awsRegion STRING,
  sourceIpAddress STRING,
  userAgent STRING,
  errorCode STRING,
  errorMessage STRING,
  requestparameters STRING,
  responseelements STRING,
  additionaleventdata STRING,
  requestId STRING,
  eventId STRING,
  readOnly STRING,
  resources ARRAY<STRUCT<
    arn: STRING,
    accountId: STRING,
    type: STRING>>,
  eventType STRING,
  apiVersion STRING,
  recipientAccountId STRING,
  serviceEventDetails STRING,
  sharedEventID STRING,
  vpcendpointid STRING,
  tlsDetails STRUCT<
    tlsVersion: STRING,
    cipherSuite: STRING,
    clientProvidedHostHeader: STRING>
)
PARTITIONED BY (
  `account` STRING,
  `region` STRING,
  `timestamp` STRING)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS INPUTFORMAT 'com.amazon.emr.cloudtrail.CloudTrailInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  's3://CLOUDTRAIL_BUCKET_NAME/AWSLogs/o-l09l6xxxx'
TBLPROPERTIES (
  'projection.enabled'='true',
  'projection.account.type'='enum',
  'projection.account.values'='ACCOUNT_1,ACCOUNT_2,ACCOUNT_3,ACCOUNT_4,ACCOUNT_5,ACCOUNT_6,ACCOUNT_7',
  'projection.region.type'='enum',
  'projection.region.values'='ca-central-1,us-east-1,us-east-2,eu-central-1',
  'projection.timestamp.format'='yyyy/MM/dd',
  'projection.timestamp.interval'='1',
  'projection.timestamp.interval.unit'='DAYS',
  'projection.timestamp.range'='2023/01/01,NOW',
  'projection.timestamp.type'='date',
  'storage.location.template'='s3://CLOUDTRAIL_BUCKET_NAME/AWSLogs/o-l09l6xxxx/${account}/CloudTrail/${region}/${timestamp}')
```
### Notes
- Choose the right projection [type](https://docs.aws.amazon.com/athena/latest/ug/partition-projection-supported-types.html) based on the S3 location. Since I was logging CloudTrail across an organization, the S3 location turned out different from the one in the AWS docs: an organization ID like `o-l09l6xxxx` is added to the S3 path, and I needed to account for that.
- It's important to note that even though SQL is case insensitive, the partition column references in `storage.location.template` (e.g. `${account}`, `${region}`, `${timestamp}`) **have to be in lower case**, no exceptions.
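### Query failed API calls (example)
- A minimal sketch against the `cloudtrail_logs_pp` table above; the account, region, and date values are placeholders to adjust to your own organization
Query
```sql
-- Failed API calls for one account and region on a given day, using all three projected partition columns.
SELECT
  eventtime,
  eventsource,
  eventname,
  errorcode,
  errormessage,
  useridentity.arn AS caller_arn,
  sourceipaddress
FROM cloudtrail_logs_pp
WHERE account = 'ACCOUNT_1'
  AND region = 'us-east-1'
  AND "timestamp" = '2023/01/01' -- partition values use the yyyy/MM/dd format defined above
  AND errorcode IS NOT NULL
ORDER BY eventtime DESC
LIMIT 100
```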