--show the top 10 pairs of folders that have the most files in common. -- Create a CTE named folders that selects the group ID, folder and filename from dupes table -- and adds a column named full_path that combines the folder and filename with a backslash with folders as ( select [Group ID] as group_id, Folder || '\' || Filename as full_path from dupes ), -- Create a CTE named split that recursively splits the full_path column into single characters -- and appends them to a column named path_prefix, while keeping track of the last two characters -- in columns last_char and common_denominator split AS ( select group_id, '' as path_prefix,'' as last_char, full_path as common_denominator from folders UNION ALL SELECT group_id, path_prefix || substr(common_denominator, 1, 1) as path_prefix, substr(common_denominator, 1, 1) as last_char, substr(common_denominator, 2) as common_denominator FROM split WHERE common_denominator <> '' ), -- Create a CTE named folder_names that filters the rows from split. -- This extracts the folder names from the original paths. folder_names as ( SELECT group_id, path_prefix, last_char, common_denominator FROM split WHERE path_prefix != last_char and last_char = '\' and (common_denominator = '' OR last_char = '\') ) -- Create a CTE named folder_pairs that performs a cross join between folder_names as left_folder -- and folder_names as right_folder, where the common_denominator columns are equal and the path_prefix -- columns are not. -- This finds pairs of folders that have the same files inside them. -- It also groups by the right_folder.group_id column and orders by the longest denominator column. ,folder_pairs as ( select left_folder.path_prefix as left_folder_name, right_folder.path_prefix as right_folder_name, right_folder.group_id as right_group_id from folder_names as left_folder cross join folder_names as right_folder where left_folder.common_denominator = right_folder.common_denominator and left_folder.path_prefix != right_folder.path_prefix group by right_group_id order by length(left_folder.common_denominator) ) -- Finally, select the columns left_folder_name, right_folder_name and count from folder_pairs, -- group by them, and order by count in descending order. It also limits the result to 10 rows. -- This shows the top 10 pairs of folders that have the most files in common. select left_folder_name,right_folder_name,count(*) as common_files from folder_pairs group by left_folder_name,right_folder_name order by common_files desc limit 10