Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save krisjan-oldekamp/0a1a1c0085e6e6f68306f184ed1af4a1 to your computer and use it in GitHub Desktop.

Select an option

Save krisjan-oldekamp/0a1a1c0085e6e6f68306f184ed1af4a1 to your computer and use it in GitHub Desktop.

Revisions

  1. krisjan-oldekamp revised this gist Nov 18, 2021. 1 changed file with 69 additions and 78 deletions.
    147 changes: 69 additions & 78 deletions google_analytics_bigquery_user_mapping_table_customer.sql
    Original file line number Diff line number Diff line change
    @@ -1,86 +1,77 @@
    -- Author: Krisjan Oldekamp
    -- https://stacktonic.com/article/create-a-user-mapping-table-based-on-the-google-analytics-4-big-query-dataset

    declare lookback_window int64 default 90; -- how many days to lookback into the dataset to search for ids (compared to today)

    DECLARE lookback_window INT64 DEFAULT 90; -- How many days to lookback into the dataset to search for IDs (compared to today)

    -- Deduplicate array of struct
    CREATE TEMP FUNCTION DEDUP(val ANY TYPE) AS ((
    SELECT ARRAY_AGG(t)
    FROM (SELECT MAX(v.timestamp) AS timestamp, v.id FROM UNNEST(val) v GROUP BY v.id ORDER BY timestamp DESC LIMIT 100) t
    -- udf: deduplicate array of struct
    create temp function dedup(arr any type) as ((
    select
    array_agg(t)
    from (
    select max(a.timestamp) as timestamp, a.id from unnest(arr) a group by a.id order by timestamp desc limit 100
    ) t
    ));

    WITH ga_user_ids AS (
    -- Select all user-IDs from the GA4 dataset in the specified time period
    SELECT
    *
    FROM (
    SELECT
    user_pseudo_id AS ga_client_id, -- Device-ID
    user_id AS customer_id, -- Custom defined User-ID (e.g. Customer-ID)
    (SELECT value.string_value FROM UNNEST(user_properties) WHERE key = "relay_id") AS customer_id_secondary, -- Secondary custom user-ID
    (SELECT value.string_value FROM UNNEST(event_params) WHERE event_name = "page_view" AND key = "gclid") AS gclid, -- Click-ID
    ecommerce.transaction_id AS transaction_id, -- Order-IDs
    MAX(event_timestamp) AS event_timestamp -- Timestamps
    FROM
    `<your-project>.analytics_<your-dataset>.events_*`
    WHERE
    _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d",DATE_SUB(CURRENT_DATE(), INTERVAL lookback_window DAY))
    AND FORMAT_DATE("%Y%m%d",DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
    GROUP BY
    ga_client_id,
    customer_id,
    customer_id_secondary,
    gclid,
    transaction_id
    )
    -- Filter out users without any additional user-IDs besides ga_client_id
    WHERE
    customer_id IS NOT NULL
    OR customer_id_secondary IS NOT NULL
    OR gclid IS NOT NULL
    OR transaction_id IS NOT NULL
    with ga_user_ids as (
    -- select all user-ids from the ga4 dataset in the specified time period
    select
    *
    from (
    select
    user_pseudo_id as ga_client_id, -- device-id
    user_id as customer_id, -- custom defined user-id (e.g. customer-id)
    (select value.string_value from unnest(user_properties) where key = 'relay_id') as customer_id_secondary, -- secondary custom user-id
    (select value.string_value from unnest(event_params) where event_name = 'page_view' and key = 'gclid') as gclid, -- click-id
    ecommerce.transaction_id as transaction_id, -- order-ids
    max(event_timestamp) as event_timestamp -- timestamps
    from `<your-project>.analytics_<your-dataset>.events_*`
    where
    _table_suffix between format_date('%Y%m%d',date_sub(current_date(), interval lookback_window day))
    and format_date('%Y%m%d',date_sub(current_date(), interval 1 day))
    group by
    1,2,3,4,5
    )
    -- filter out users without any additional user-ids besides ga_client_id
    where
    customer_id is not null
    or customer_id_secondary is not null
    or gclid is not null
    or transaction_id is not null
    )

    -- Select all identifiers and group on customer-ID.
    SELECT
    customer_id,
    MAX(TIMESTAMP_MICROS(event_timestamp)) AS timestamp_last_visit,
    -- Aggegrate customer-IDs to an array of structs
    DEDUP(ARRAY_AGG(
    IF(ga_client_id IS NOT NULL,
    STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp, ga_client_id AS id),
    NULL
    )
    IGNORE NULLS ORDER BY event_timestamp DESC
    )) AS ga_client_id,
    -- Aggegrate secondary customer-IDs to an array of structs
    DEDUP(ARRAY_AGG(
    IF(customer_id_secondary IS NOT NULL,
    STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp, customer_id_secondary AS id),
    NULL
    )
    IGNORE NULLS ORDER BY event_timestamp DESC
    )) AS customer_id_secondary,
    -- Aggegrate GCLIDs to an array of structs
    DEDUP(ARRAY_AGG(
    IF(gclid IS NOT NULL,
    STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp, gclid AS id),
    NULL
    )
    IGNORE NULLS ORDER BY event_timestamp DESC
    )) AS gclid,
    -- Aggegrate transaction-IDs to an array of structs
    DEDUP(ARRAY_AGG(
    IF(transaction_id IS NOT NULL,
    STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp, transaction_id AS id),
    NULL
    )
    IGNORE NULLS ORDER BY event_timestamp DESC
    )) AS transaction_id,
    FROM
    ga_user_ids
    WHERE
    customer_id IS NOT NULL
    GROUP BY
    customer_id
    -- select all identifiers and group on customer-id.
    select
    customer_id,
    max(timestamp_micros(event_timestamp)) as timestamp_last_visit,
    -- aggegrate customer-ids to an array of structs
    dedup(array_agg(
    if(ga_client_id is not null,
    struct(timestamp_micros(event_timestamp) as timestamp, ga_client_id as id),
    null
    ) ignore nulls order by event_timestamp desc
    )) as ga_client_id,
    -- aggegrate secondary customer-ids to an array of structs
    dedup(array_agg(
    if(customer_id_secondary is not null,
    struct(timestamp_micros(event_timestamp) as timestamp, customer_id_secondary as id),
    null
    )ignore nulls order by event_timestamp desc
    )) as customer_id_secondary,
    -- aggegrate gclids to an array of structs
    dedup(array_agg(
    if(gclid is not null,
    struct(timestamp_micros(event_timestamp) as timestamp, gclid as id),
    null
    ) ignore nulls order by event_timestamp desc
    )) as gclid,
    -- aggegrate transaction-ids to an array of structs
    dedup(array_agg(
    if(transaction_id is not null,
    struct(timestamp_micros(event_timestamp) as timestamp, transaction_id as id),
    null
    ) ignore nulls order by event_timestamp desc
    )) as transaction_id,
    from ga_user_ids
    where
    customer_id is not null
    group by 1
  2. krisjan-oldekamp revised this gist Oct 12, 2021. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions google_analytics_bigquery_user_mapping_table_customer.sql
    Original file line number Diff line number Diff line change
    @@ -80,7 +80,7 @@ SELECT
    )) AS transaction_id,
    FROM
    ga_user_ids
    GROUP BY
    customer_id
    WHERE
    customer_id IS NOT NULL
    GROUP BY
    customer_id
  3. krisjan-oldekamp revised this gist Oct 3, 2021. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion google_analytics_bigquery_user_mapping_table_customer.sql
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    -- Author: Krisjan Oldekamp
    -- https://stacktonic.com/article/create-a-user-mapping-table-based-on-all-user-ids-in-the-google-analytics-4-big-query-dataset
    -- https://stacktonic.com/article/create-a-user-mapping-table-based-on-the-google-analytics-4-big-query-dataset


    DECLARE lookback_window INT64 DEFAULT 90; -- How many days to lookback into the dataset to search for IDs (compared to today)
  4. krisjan-oldekamp revised this gist Oct 3, 2021. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion google_analytics_bigquery_user_mapping_table_customer.sql
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,6 @@
    -- Author: Krisjan Oldekamp
    -- https://stacktonic.com/article/
    -- https://stacktonic.com/article/create-a-user-mapping-table-based-on-all-user-ids-in-the-google-analytics-4-big-query-dataset


    DECLARE lookback_window INT64 DEFAULT 90; -- How many days to lookback into the dataset to search for IDs (compared to today)

  5. krisjan-oldekamp created this gist Oct 3, 2021.
    85 changes: 85 additions & 0 deletions google_analytics_bigquery_user_mapping_table_customer.sql
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,85 @@
    -- Author: Krisjan Oldekamp
    -- https://stacktonic.com/article/

    DECLARE lookback_window INT64 DEFAULT 90; -- How many days to lookback into the dataset to search for IDs (compared to today)

    -- Deduplicate array of struct
    CREATE TEMP FUNCTION DEDUP(val ANY TYPE) AS ((
    SELECT ARRAY_AGG(t)
    FROM (SELECT MAX(v.timestamp) AS timestamp, v.id FROM UNNEST(val) v GROUP BY v.id ORDER BY timestamp DESC LIMIT 100) t
    ));

    WITH ga_user_ids AS (
    -- Select all user-IDs from the GA4 dataset in the specified time period
    SELECT
    *
    FROM (
    SELECT
    user_pseudo_id AS ga_client_id, -- Device-ID
    user_id AS customer_id, -- Custom defined User-ID (e.g. Customer-ID)
    (SELECT value.string_value FROM UNNEST(user_properties) WHERE key = "relay_id") AS customer_id_secondary, -- Secondary custom user-ID
    (SELECT value.string_value FROM UNNEST(event_params) WHERE event_name = "page_view" AND key = "gclid") AS gclid, -- Click-ID
    ecommerce.transaction_id AS transaction_id, -- Order-IDs
    MAX(event_timestamp) AS event_timestamp -- Timestamps
    FROM
    `<your-project>.analytics_<your-dataset>.events_*`
    WHERE
    _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d",DATE_SUB(CURRENT_DATE(), INTERVAL lookback_window DAY))
    AND FORMAT_DATE("%Y%m%d",DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
    GROUP BY
    ga_client_id,
    customer_id,
    customer_id_secondary,
    gclid,
    transaction_id
    )
    -- Filter out users without any additional user-IDs besides ga_client_id
    WHERE
    customer_id IS NOT NULL
    OR customer_id_secondary IS NOT NULL
    OR gclid IS NOT NULL
    OR transaction_id IS NOT NULL
    )

    -- Select all identifiers and group on customer-ID.
    SELECT
    customer_id,
    MAX(TIMESTAMP_MICROS(event_timestamp)) AS timestamp_last_visit,
    -- Aggegrate customer-IDs to an array of structs
    DEDUP(ARRAY_AGG(
    IF(ga_client_id IS NOT NULL,
    STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp, ga_client_id AS id),
    NULL
    )
    IGNORE NULLS ORDER BY event_timestamp DESC
    )) AS ga_client_id,
    -- Aggegrate secondary customer-IDs to an array of structs
    DEDUP(ARRAY_AGG(
    IF(customer_id_secondary IS NOT NULL,
    STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp, customer_id_secondary AS id),
    NULL
    )
    IGNORE NULLS ORDER BY event_timestamp DESC
    )) AS customer_id_secondary,
    -- Aggegrate GCLIDs to an array of structs
    DEDUP(ARRAY_AGG(
    IF(gclid IS NOT NULL,
    STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp, gclid AS id),
    NULL
    )
    IGNORE NULLS ORDER BY event_timestamp DESC
    )) AS gclid,
    -- Aggegrate transaction-IDs to an array of structs
    DEDUP(ARRAY_AGG(
    IF(transaction_id IS NOT NULL,
    STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp, transaction_id AS id),
    NULL
    )
    IGNORE NULLS ORDER BY event_timestamp DESC
    )) AS transaction_id,
    FROM
    ga_user_ids
    GROUP BY
    customer_id
    WHERE
    customer_id IS NOT NULL