Skip to content

Instantly share code, notes, and snippets.

@TheRockStarDBA
Created September 26, 2024 21:11
Show Gist options
  • Save TheRockStarDBA/49b8829cafbdcb79ea2cb587aba4f579 to your computer and use it in GitHub Desktop.
Save TheRockStarDBA/49b8829cafbdcb79ea2cb587aba4f579 to your computer and use it in GitHub Desktop.

Revisions

  1. TheRockStarDBA created this gist Sep 26, 2024.
    64 changes: 64 additions & 0 deletions airflow_metatdata_mining.sql
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,64 @@
    -- ref: https://github.com/jimtodd92/airflow_metadata_mining/blob/main/airflow_metatdata_mining.sql
    -- To get Critical user activities like paused and dagrun_clear in past 24 hours.
    SELECT event
    ,DATE (dttm)
    ,count(*)
    FROM PUBLIC.log
    WHERE DATE (dttm) BETWEEN (
    SELECT max(DATE (dttm)) - 1
    FROM PUBLIC.log
    )
    AND (
    SELECT max(DATE (dttm))
    FROM PUBLIC.log
    )
    AND event IN (
    'paused'
    ,'dagrun_clear'
    )
    GROUP BY event
    ,DATE (dttm)
    ORDER BY DATE (dttm) DESC;


    -- To get 10 long ran tasks in a day.
    SELECT dag_id
    ,task_id
    ,execution_date
    ,round(cast(duration / (60 * 60) AS NUMERIC), 2) AS duration_in_hour
    FROM PUBLIC.task_instance
    WHERE DATE (execution_date) = '2022-01-03'
    AND STATE IN ('success')
    AND duration IS NOT NULL
    GROUP BY task_id
    ,dag_id
    ,execution_date
    ORDER BY duration DESC limit 10;


    -- To understand how occupied the airflow scheduler is, and decide a maintenance window.
    SELECT x.start_time_window
    ,count(*)
    FROM (
    SELECT start_date
    ,CASE
    WHEN extract(hour FROM start_date) BETWEEN 0
    AND 5
    THEN '0-5'
    WHEN extract(hour FROM start_date) BETWEEN 6
    AND 11
    THEN '6-11'
    WHEN extract(hour FROM start_date) BETWEEN 12
    AND 17
    THEN '12-17'
    WHEN extract(hour FROM start_date) BETWEEN 18
    AND 23
    THEN '18-23'
    END AS start_time_window
    FROM PUBLIC.task_instance
    WHERE execution_date BETWEEN '2022-01-01'
    AND '2022-01-02'
    AND start_date IS NOT NULL
    ) AS x
    GROUP BY x.start_time_window
    ORDER BY cast(split_part(x.start_time_window, '-', 1) AS INT)