Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save perXautomatik/f81d928ccbcaa69f9fa68e9686f0f97d to your computer and use it in GitHub Desktop.
Save perXautomatik/f81d928ccbcaa69f9fa68e9686f0f97d to your computer and use it in GitHub Desktop.

Revisions

  1. perXautomatik revised this gist Jun 19, 2023. 1 changed file with 0 additions and 0 deletions.
    Binary file modified MostCommonDuplicatePath.sqlnb
    Binary file not shown.
  2. perXautomatik revised this gist Jun 17, 2023. 1 changed file with 10 additions and 5 deletions.
    15 changes: 10 additions & 5 deletions 10pairs-Of-Folders-WithMostFilesIncommon.sql
    Original file line number Diff line number Diff line change
    @@ -2,14 +2,14 @@
    -- Create a CTE named folders that selects the group ID, folder and filename from dupes table
    -- and adds a column named full_path that combines the folder and filename with a backslash
    with folders as (
    select [Group ID] as group_id, Folder || '\' || filename as full_path from dupes
    select [Group ID] as group_id, Folder || '\' || Filename as full_path from dupes
    ),
    -- Create a CTE named split that recursively splits the full_path column into single characters
    -- and appends them to a column named path_prefix, while keeping track of the last two characters
    -- in columns last_char and common_denominator
    split AS (
    select group_id, '' as path_prefix,'' as last_char, '' as common_denominator from folders
    select group_id, '' as path_prefix,'' as last_char, full_path as common_denominator from folders
    UNION ALL
    SELECT group_id,
    path_prefix || substr(common_denominator, 1, 1) as path_prefix,
    @@ -30,10 +30,8 @@ folder_names as (
    -- columns are not.
    -- This finds pairs of folders that have the same files inside them.
    select leftX,rightX,count(*) c from qzy group by leftx,rightx
    order by c desc limit 10
    -- It also groups by the right_folder.group_id column and orders by the length of the common_denominator column.
    -- It also groups by the right_folder.group_id column and orders by the longest denominator column.
    ,folder_pairs as (
    select left_folder.path_prefix as left_folder_name,
    right_folder.path_prefix as right_folder_name,
    @@ -44,3 +42,10 @@ select leftX,rightX,count(*) c from qzy group by leftx,rightx
    group by right_group_id
    order by length(left_folder.common_denominator)
    )
    -- Finally, select the columns left_folder_name, right_folder_name and count from folder_pairs,
    -- group by them, and order by count in descending order. It also limits the result to 10 rows.
    -- This shows the top 10 pairs of folders that have the most files in common.
    select left_folder_name,right_folder_name,count(*) as common_files from folder_pairs
    group by left_folder_name,right_folder_name
    order by common_files desc limit 10
  3. perXautomatik revised this gist Jun 17, 2023. 1 changed file with 15 additions and 2 deletions.
    17 changes: 15 additions & 2 deletions 10pairs-Of-Folders-WithMostFilesIncommon.sql
    Original file line number Diff line number Diff line change
    @@ -25,9 +25,22 @@ folder_names as (
    WHERE path_prefix != last_char and last_char = '\' and (common_denominator = '' OR last_char = '\')
    )
    -- Create a CTE named folder_pairs that performs a cross join between folder_names as left_folder
    -- and folder_names as right_folder, where the common_denominator columns are equal and the path_prefix
    -- columns are not.
    ,qzy as (select a.s as LeftX,b.s as RightX,b.id as bs from qq as a cross join qq as b where a.rest = b.rest and a.s != b.s group by bs order by length(a.rest))
    -- This finds pairs of folders that have the same files inside them.
    select leftX,rightX,count(*) c from qzy group by leftx,rightx
    order by c desc limit 10
    -- It also groups by the right_folder.group_id column and orders by the length of the common_denominator column.
    ,folder_pairs as (
    select left_folder.path_prefix as left_folder_name,
    right_folder.path_prefix as right_folder_name,
    right_folder.group_id as right_group_id
    from folder_names as left_folder cross join folder_names as right_folder
    where left_folder.common_denominator = right_folder.common_denominator
    and left_folder.path_prefix != right_folder.path_prefix
    group by right_group_id
    order by length(left_folder.common_denominator)
    )
  4. perXautomatik revised this gist Jun 17, 2023. 1 changed file with 6 additions and 1 deletion.
    7 changes: 6 additions & 1 deletion 10pairs-Of-Folders-WithMostFilesIncommon.sql
    Original file line number Diff line number Diff line change
    @@ -18,8 +18,13 @@ split AS (
    WHERE common_denominator <> ''
    ),
    -- Create a CTE named folder_names that filters the rows from split.
    -- This extracts the folder names from the original paths.
    folder_names as (
    SELECT group_id, path_prefix, last_char, common_denominator FROM split
    WHERE path_prefix != last_char and last_char = '\' and (common_denominator = '' OR last_char = '\')
    )
    ,qq as (SELECT id,s,last2,rest FROM split WHERE (rest = '' OR last2 = '\') and s != last2 and last2 = '\')
    ,qzy as (select a.s as LeftX,b.s as RightX,b.id as bs from qq as a cross join qq as b where a.rest = b.rest and a.s != b.s group by bs order by length(a.rest))
  5. perXautomatik revised this gist Jun 17, 2023. 1 changed file with 9 additions and 8 deletions.
    17 changes: 9 additions & 8 deletions 10pairs-Of-Folders-WithMostFilesIncommon.sql
    Original file line number Diff line number Diff line change
    @@ -8,15 +8,16 @@ with folders as (
    -- Create a CTE named split that recursively splits the full_path column into single characters
    -- and appends them to a column named path_prefix, while keeping track of the last two characters
    -- in columns last_char and common_denominator
    split AS (
    select group_id, '' as path_prefix,'' as last_char, '' as common_denominator from folders
    UNION ALL
    SELECT group_id,
    path_prefix || substr(common_denominator, 1, 1) as path_prefix,
    substr(common_denominator, 1, 1) as last_char,
    substr(common_denominator, 2) as common_denominator FROM split
    WHERE common_denominator <> ''
    ),
    split AS (
    select * from q
    UNION ALL
    SELECT id,s || substr(rest, 1, 1) as ss,
    substr(rest, 1, 1) as last2,
    substr(rest, 2) as last3 FROM split
    WHERE rest <> ''
    )
    ,qq as (SELECT id,s,last2,rest FROM split WHERE (rest = '' OR last2 = '\') and s != last2 and last2 = '\')
  6. perXautomatik revised this gist Jun 17, 2023. 1 changed file with 6 additions and 1 deletion.
    7 changes: 6 additions & 1 deletion 10pairs-Of-Folders-WithMostFilesIncommon.sql
    Original file line number Diff line number Diff line change
    @@ -1,9 +1,14 @@
    --show the top 10 pairs of folders that have the most files in common.
    -- Create a CTE named folders that selects the group ID, folder and filename from dupes table
    -- and adds a column named full_path that combines the folder and filename with a backslash
    with folders as (
    select [Group ID] as group_id, Folder || '\' || filename as full_path from dupes
    ),
    -- Create a CTE named split that recursively splits the full_path column into single characters
    -- and appends them to a column named path_prefix, while keeping track of the last two characters
    -- in columns last_char and common_denominator
    with q as (select [Group ID] as id,'' as s, '' as last2, Folder || '\' || filename as rest from dupes ),
    split AS (
    select * from q
    UNION ALL
  7. perXautomatik revised this gist Jun 17, 2023. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions 10pairs-Of-Folders-WithMostFilesIncommon.sql
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,6 @@
    --show the top 10 pairs of folders that have the most files in common.
    -- Create a CTE named folders that selects the group ID, folder and filename from dupes table
    -- and adds a column named full_path that combines the folder and filename with a backslash


    with q as (select [Group ID] as id,'' as s, '' as last2, Folder || '\' || filename as rest from dupes ),
  8. perXautomatik revised this gist Jun 17, 2023. 1 changed file with 0 additions and 0 deletions.
    Binary file added MostCommonDuplicatePath.sqlnb
    Binary file not shown.
  9. perXautomatik created this gist Jun 17, 2023.
    20 changes: 20 additions & 0 deletions 10pairs-Of-Folders-WithMostFilesIncommon.sql
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,20 @@
    --show the top 10 pairs of folders that have the most files in common.


    with q as (select [Group ID] as id,'' as s, '' as last2, Folder || '\' || filename as rest from dupes ),
    split AS (
    select * from q
    UNION ALL
    SELECT id,s || substr(rest, 1, 1) as ss,
    substr(rest, 1, 1) as last2,
    substr(rest, 2) as last3 FROM split
    WHERE rest <> ''
    )
    ,qq as (SELECT id,s,last2,rest FROM split WHERE (rest = '' OR last2 = '\') and s != last2 and last2 = '\')
    ,qzy as (select a.s as LeftX,b.s as RightX,b.id as bs from qq as a cross join qq as b where a.rest = b.rest and a.s != b.s group by bs order by length(a.rest))
    select leftX,rightX,count(*) c from qzy group by leftx,rightx
    order by c desc limit 10