Last active
May 9, 2025 12:37
-
-
Save jefft/4a845d74203d971f479a88c0deed4bf4 to your computer and use it in GitHub Desktop.
Revisions
-
jefft revised this gist
May 9, 2025 . 1 changed file with 5 additions and 6 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -13,7 +13,7 @@ -- https://www.redradishtech.com -- -- CREATE SCHEMA IF NOT EXISTS autorag; -- Whether a page visible to groupname should be 'visible' to us. Return null if groupname is null. @@ -37,14 +37,15 @@ $$ LANGUAGE PLPGSQL IMMUTABLE; -- - restricted to username '[email protected]' + 'confluence-users' group = page is visible unless hidden elsewhere (return true) -- - no username restrictions + no group visibility restrictions = page is visible unless hidden elsewhere (return true). -- CREATE OR REPLACE FUNCTION pg_temp.pagepermissions_allow(username text, groupname text) RETURNS BOOLEAN AS $$ BEGIN return (coalesce(pg_temp.isvisibleuser(username), false) OR coalesce(pg_temp.isvisiblegroup(groupname), false)) OR (username is null AND groupname is null); END; $$ LANGUAGE PLPGSQL IMMUTABLE; -- We select dates, and want the text in ISO format SET datestyle = iso, ymd; @@ -76,8 +77,7 @@ WITH RECURSIVE parent_pages AS c.parentid FROM content c JOIN spaces USING (spaceid) WHERE contenttype='PAGE' AND prevver IS NULL AND content_status='current' -- -- Note: for testing, if you want to limit output to particular pages, do it here, e.g.: @@ -211,8 +211,7 @@ OUTPUT AS JOIN user_mapping ON content.creator=user_mapping.user_key JOIN cwd_user ON cwd_user.lower_user_name=user_mapping.lower_username JOIN bodycontent USING (contentid) WHERE space_and_parents_allow -- AND content.creationdate >= '2024-01-01' ) SELECT * FROM OUTPUT; -
jefft created this gist
May 9, 2025 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,218 @@ -- Run in a Confluence DC Postgres database, this SQL identifies pages that are visible to a particular user or group, respecting space permissions and page hierarchy permissions. -- -- My use-case is to dump all Confluence content into XML files, to be fed into Cloudflare AutoRAG (https://developers.cloudflare.com/autorag/) to create a company-internal chatbot. This would be easy, except that not _all_ content must be exported: -- - People might have marked certain pages or page trees as private (visible only to themselves) -- - there might be certain spaces private to select groups. -- I actually only want to export content visible to a universal group, like 'confluence-users', or to a particular person. This requires emulating the Confluence permissions system in SQL. -- -- Jump right to the end of the file to customize what, of identified content records, you actually want out. The default is to dump some useful fields plus the content body. -- -- Note: You can run this against a read-only database (e.g. a replica), but first separate out the 3 pl/pgsql functions, change the schema to something real (e.g. 'autorag'), and source them in the RW production instance. -- -- [email protected], 9/May/2025 -- https://www.redradishtech.com -- CREATE SCHEMA IF NOT EXISTS pg_temp; -- Whether a page visible to groupname should be 'visible' to us. Return null if groupname is null. CREATE OR REPLACE FUNCTION pg_temp.isvisiblegroup(groupname TEXT) RETURNS BOOLEAN AS $$ BEGIN RETURN groupname IN ('confluence-users'); END; $$ LANGUAGE PLPGSQL IMMUTABLE; -- Whether a page visible to username should be 'visible' to us. Return null if username is null. CREATE OR REPLACE FUNCTION pg_temp.isvisibleuser(username TEXT) RETURNS BOOLEAN AS $$ BEGIN RETURN username = 'matchesnobody'; END; $$ LANGUAGE PLPGSQL IMMUTABLE; -- Given one of a page's username/group visibility restrictions (so either username or groupname will be set, not both), returns true if it implies page visibility to us. -- E.g. if a page is: -- - restricted to username '[email protected]' + no group restrictions = page is hidden (return false). -- - restricted to username '[email protected]' + 'confluence-users' group = page is visible unless hidden elsewhere (return true) -- - no username restrictions + no group visibility restrictions = page is visible unless hidden elsewhere (return true). -- CREATE OR REPLACE FUNCTION pg_temp.pagepermissions_allow(username text, groupname text) RETURNS BOOLEAN AS $$ BEGIN return (coalesce(pg_temp.isvisibleuser(username), false) OR coalesce(pg_temp.isvisiblegroup(groupname), false)) OR (username is null AND groupname is null); END; $$ LANGUAGE PLPGSQL IMMUTABLE; -- We select dates, and want the text in ISO format SET datestyle = iso, ymd; -- The parent_pages CTE is a funky recursive query that fills out rows for the ancestor of each page, for later permission checks. -- Say we have: -- Space: [email protected] (public) -- Page 393785: [email protected]/Jeff Turner's Home (restricted to confluence-users and [email protected]') -- Page 283513404: [email protected]/AutoRAG test level 1 (public) -- Page 283513410: [email protected]/AutoRAG test level 1/AutoRAG test level 2 (restricted) -- Page 283513546: [email protected]/AutoRAG test level 1/AutoRAG test level 2/AutoRAG test level 3 (unrestricted) -- -- parent_pages returns rows for each page and its ancestors. E.g. for the level 3 page: -- ┌─────────┬─────────────────────────┬────────────────┬──────────────────────┬───────────┬──────────────────────┬───────────┐ -- │ spaceid │ spacekey │ childcontentid │ childtitle │ contentid │ title │ parentid │ -- ├─────────┼─────────────────────────┼────────────────┼──────────────────────┼───────────┼──────────────────────┼───────────┤ -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ 283513546 │ AutoRAG test level 3 │ 283513410 │ -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ 283513410 │ AutoRAG test level 2 │ 283513404 │ -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ 283513404 │ AutoRAG test level 1 │ 393785 │ -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ 393785 │ Jeff Turner’s Home │ ␀ │ -- └─────────┴─────────────────────────┴────────────────┴──────────────────────┴───────────┴──────────────────────┴───────────┘ -- WITH RECURSIVE parent_pages AS (SELECT c.spaceid AS spaceid, spacekey, c.contentid AS childcontentid, c.title AS childtitle, c.contentid, c.title, c.parentid FROM content c JOIN spaces USING (spaceid) WHERE spacekey NOT IN ('all') AND contenttype='PAGE' AND prevver IS NULL AND content_status='current' -- -- Note: for testing, if you want to limit output to particular pages, do it here, e.g.: -- AND title ~ 'AutoRAG test' AND title != '' UNION ALL SELECT pp.spaceid, pp.spacekey, childcontentid, childtitle, c2.contentid, c2.title, c2.parentid FROM content c2 INNER JOIN parent_pages pp ON pp.parentid = c2.contentid), -- -- -- Join each page with its 'View' page group permissions. Note how the 'user' restriction on level 2 makes the page hidden -- -- ┌──────────────────────┬──────────────────────┬───────────────────┬──────────────────┐ -- │ childtitle │ title │ viewgroups │ any_group_allows │ -- ├──────────────────────┼──────────────────────┼───────────────────┼──────────────────┤ -- │ AutoRAG test level 3 │ Jeff Turner’s Home │ confluence-users │ t │ -- │ AutoRAG test level 3 │ AutoRAG test level 1 │ ␀ │ t │ -- │ AutoRAG test level 3 │ AutoRAG test level 2 │ ␀ │ f │ -- │ AutoRAG test level 3 │ AutoRAG test level 3 │ ␀ │ t │ -- └──────────────────────┴──────────────────────┴───────────────────┴──────────────────┘ pageancestors_with_aggregated_viewperms AS (SELECT DISTINCT spaceid, spacekey, childcontentid, childtitle, contentid, title, string_agg(cp.groupname, ',') AS viewgroups, bool_or(pg_temp.pagepermissions_allow(cp.username, cp.groupname)) AS any_group_allows FROM parent_pages pages LEFT JOIN (SELECT * FROM content_perm_set WHERE cont_perm_type='View') cps ON cps.content_id=pages.contentid LEFT JOIN (SELECT * FROM content_perm WHERE cp_type='View') cp ON cp.cps_id=cps.id GROUP BY 1, 2, 3, 4, 5, 6 ORDER BY contentid), -- -- -- Squash each set of page-and-page-ancestors rows into one, AND-ing the permissions to simulate the behaviour that permissions are all levels must be granted to a page to be visible. Eg. our 'level 3' page boils down to one row: -- -- ┌─────────┬─────────────────────────┬────────────────┬──────────────────────┬───────────────────┐ -- │ spaceid │ spacekey │ childcontentid │ childtitle │ all_parents_allow │ -- ├─────────┼─────────────────────────┼────────────────┼──────────────────────┼───────────────────┤ -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ f │ -- └─────────┴─────────────────────────┴────────────────┴──────────────────────┴───────────────────┘ -- -- Here are rolled-up lines for each page in our page hierarchy. Note how the restriction on level 2 causes level 3 to be hidden. -- ┌─────────┬─────────────────────────┬────────────────┬──────────────────────┬───────────────────┐ -- │ spaceid │ spacekey │ childcontentid │ childtitle │ all_parents_allow │ -- ├─────────┼─────────────────────────┼────────────────┼──────────────────────┼───────────────────┤ -- │ 524292 │ [email protected] │ 283513404 │ AutoRAG test level 1 │ t │ -- │ 524292 │ [email protected] │ 283513410 │ AutoRAG test level 2 │ f │ -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ f │ -- └─────────┴─────────────────────────┴────────────────┴──────────────────────┴───────────────────┘ -- pages_with_aggregated_parentperms AS (SELECT DISTINCT spaceid, spacekey, childcontentid, childtitle, bool_and(coalesce(any_group_allows, TRUE)) AS all_parents_allow FROM pageancestors_with_aggregated_viewperms GROUP BY 1, 2, 3, 4), -- -- Does the space permissions allow access to this page? This table, built upon parent_pages for scoping (but it could have been built on "content"), creates a spaceperms_allow boolean specifying whether the space permissions grant visibility. E.g. for the level 1 page in a public space: -- -- ┌────────────────┬──────────────────────┬──────────────────┐ -- │ childcontentid │ childtitle │ spaceperms_allow │ -- ├────────────────┼──────────────────────┼──────────────────┤ -- │ 283513404 │ AutoRAG test level 1 │ t │ -- └────────────────┴──────────────────────┴──────────────────┘ spaceperms AS (SELECT DISTINCT childcontentid, childtitle, coalesce(bool_or(pg_temp.pagepermissions_allow(permusername, permgroupname)), FALSE) spaceperms_allow FROM parent_pages LEFT JOIN (SELECT * FROM spacepermissions WHERE permtype='VIEWSPACE') spacepermissions USING (spaceid) GROUP BY 1, 2), -- -- Combined space permissions (spaceperms) and page-level permissions (pages_with_aggregated_parentperms), ANDing them together to determine final page visibility -- -- ┌────────────────┬──────────────────────┬───────────────────┬──────────────────┬─────────────────────────┐ -- │ childcontentid │ childtitle │ all_parents_allow │ spaceperms_allow │ space_and_parents_allow │ -- ├────────────────┼──────────────────────┼───────────────────┼──────────────────┼─────────────────────────┤ -- │ 283513404 │ AutoRAG test level 1 │ t │ t │ t │ -- │ 283513410 │ AutoRAG test level 2 │ f │ t │ f │ -- │ 283513546 │ AutoRAG test level 3 │ f │ t │ f │ -- └────────────────┴──────────────────────┴───────────────────┴──────────────────┴─────────────────────────┘ -- pages_with_aggregated_parent_and_space_perms AS (SELECT *, (spaceperms_allow AND all_parents_allow) AS space_and_parents_allow FROM pages_with_aggregated_parentperms JOIN spaceperms USING (childcontentid, childtitle)), -- -- Print visible pages with extra details OUTPUT AS (SELECT contentid, spaces.spacekey, spaces.spacename, content.creationdate::date, display_name AS creator, cwd_user.email_address AS creator_email, content.lastmoddate, title, BODY FROM pages_with_aggregated_parent_and_space_perms JOIN spaces USING (spaceid, spacekey) JOIN content ON childcontentid=content.contentid JOIN user_mapping ON content.creator=user_mapping.user_key JOIN cwd_user ON cwd_user.lower_user_name=user_mapping.lower_username JOIN bodycontent USING (contentid) WHERE space_and_parents_allow -- AND content.creationdate >= '2024-01-01' ) SELECT * FROM OUTPUT;