Skip to content

Instantly share code, notes, and snippets.

@jefft
Last active May 9, 2025 12:37
Show Gist options
  • Save jefft/4a845d74203d971f479a88c0deed4bf4 to your computer and use it in GitHub Desktop.
Save jefft/4a845d74203d971f479a88c0deed4bf4 to your computer and use it in GitHub Desktop.

Revisions

  1. jefft revised this gist May 9, 2025. 1 changed file with 5 additions and 6 deletions.
    11 changes: 5 additions & 6 deletions confluence_visiblepages.sql
    Original file line number Diff line number Diff line change
    @@ -13,7 +13,7 @@
    -- https://www.redradishtech.com
    --

    CREATE SCHEMA IF NOT EXISTS pg_temp;
    -- CREATE SCHEMA IF NOT EXISTS autorag;

    -- Whether a page visible to groupname should be 'visible' to us. Return null if groupname is null.

    @@ -37,14 +37,15 @@ $$ LANGUAGE PLPGSQL IMMUTABLE;
    -- - restricted to username '[email protected]' + 'confluence-users' group = page is visible unless hidden elsewhere (return true)
    -- - no username restrictions + no group visibility restrictions = page is visible unless hidden elsewhere (return true).
    --

    CREATE OR REPLACE FUNCTION pg_temp.pagepermissions_allow(username text, groupname text) RETURNS BOOLEAN AS $$
    BEGIN
    return (coalesce(pg_temp.isvisibleuser(username), false) OR coalesce(pg_temp.isvisiblegroup(groupname), false)) OR
    (username is null AND groupname is null);
    END;
    $$ LANGUAGE PLPGSQL IMMUTABLE;

    -- We select dates, and want the text in ISO format
    -- We select dates, and want the text in ISO format
    SET datestyle = iso,
    ymd;

    @@ -76,8 +77,7 @@ WITH RECURSIVE parent_pages AS
    c.parentid
    FROM content c
    JOIN spaces USING (spaceid)
    WHERE spacekey NOT IN ('all')
    AND contenttype='PAGE'
    WHERE contenttype='PAGE'
    AND prevver IS NULL
    AND content_status='current' --
    -- Note: for testing, if you want to limit output to particular pages, do it here, e.g.:
    @@ -211,8 +211,7 @@ OUTPUT AS
    JOIN user_mapping ON content.creator=user_mapping.user_key
    JOIN cwd_user ON cwd_user.lower_user_name=user_mapping.lower_username
    JOIN bodycontent USING (contentid)
    WHERE space_and_parents_allow
    -- AND content.creationdate >= '2024-01-01'
    WHERE space_and_parents_allow -- AND content.creationdate >= '2024-01-01'
    )
    SELECT *
    FROM OUTPUT;
  2. jefft created this gist May 9, 2025.
    218 changes: 218 additions & 0 deletions confluence_visiblepages.sql
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,218 @@
    -- Run in a Confluence DC Postgres database, this SQL identifies pages that are visible to a particular user or group, respecting space permissions and page hierarchy permissions.
    --
    -- My use-case is to dump all Confluence content into XML files, to be fed into Cloudflare AutoRAG (https://developers.cloudflare.com/autorag/) to create a company-internal chatbot. This would be easy, except that not _all_ content must be exported:
    -- - People might have marked certain pages or page trees as private (visible only to themselves)
    -- - there might be certain spaces private to select groups.
    -- I actually only want to export content visible to a universal group, like 'confluence-users', or to a particular person. This requires emulating the Confluence permissions system in SQL.
    --
    -- Jump right to the end of the file to customize what, of identified content records, you actually want out. The default is to dump some useful fields plus the content body.
    --
    -- Note: You can run this against a read-only database (e.g. a replica), but first separate out the 3 pl/pgsql functions, change the schema to something real (e.g. 'autorag'), and source them in the RW production instance.
    --
    -- [email protected], 9/May/2025
    -- https://www.redradishtech.com
    --

    CREATE SCHEMA IF NOT EXISTS pg_temp;

    -- Whether a page visible to groupname should be 'visible' to us. Return null if groupname is null.

    CREATE OR REPLACE FUNCTION pg_temp.isvisiblegroup(groupname TEXT) RETURNS BOOLEAN AS $$
    BEGIN
    RETURN groupname IN ('confluence-users');
    END;
    $$ LANGUAGE PLPGSQL IMMUTABLE;

    -- Whether a page visible to username should be 'visible' to us. Return null if username is null.

    CREATE OR REPLACE FUNCTION pg_temp.isvisibleuser(username TEXT) RETURNS BOOLEAN AS $$
    BEGIN
    RETURN username = 'matchesnobody';
    END;
    $$ LANGUAGE PLPGSQL IMMUTABLE;

    -- Given one of a page's username/group visibility restrictions (so either username or groupname will be set, not both), returns true if it implies page visibility to us.
    -- E.g. if a page is:
    -- - restricted to username '[email protected]' + no group restrictions = page is hidden (return false).
    -- - restricted to username '[email protected]' + 'confluence-users' group = page is visible unless hidden elsewhere (return true)
    -- - no username restrictions + no group visibility restrictions = page is visible unless hidden elsewhere (return true).
    --
    CREATE OR REPLACE FUNCTION pg_temp.pagepermissions_allow(username text, groupname text) RETURNS BOOLEAN AS $$
    BEGIN
    return (coalesce(pg_temp.isvisibleuser(username), false) OR coalesce(pg_temp.isvisiblegroup(groupname), false)) OR
    (username is null AND groupname is null);
    END;
    $$ LANGUAGE PLPGSQL IMMUTABLE;

    -- We select dates, and want the text in ISO format
    SET datestyle = iso,
    ymd;

    -- The parent_pages CTE is a funky recursive query that fills out rows for the ancestor of each page, for later permission checks.
    -- Say we have:
    -- Space: [email protected] (public)
    -- Page 393785: [email protected]/Jeff Turner's Home (restricted to confluence-users and [email protected]')
    -- Page 283513404: [email protected]/AutoRAG test level 1 (public)
    -- Page 283513410: [email protected]/AutoRAG test level 1/AutoRAG test level 2 (restricted)
    -- Page 283513546: [email protected]/AutoRAG test level 1/AutoRAG test level 2/AutoRAG test level 3 (unrestricted)
    --
    -- parent_pages returns rows for each page and its ancestors. E.g. for the level 3 page:
    -- ┌─────────┬─────────────────────────┬────────────────┬──────────────────────┬───────────┬──────────────────────┬───────────┐
    -- │ spaceid │ spacekey │ childcontentid │ childtitle │ contentid │ title │ parentid │
    -- ├─────────┼─────────────────────────┼────────────────┼──────────────────────┼───────────┼──────────────────────┼───────────┤
    -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ 283513546 │ AutoRAG test level 3 │ 283513410 │
    -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ 283513410 │ AutoRAG test level 2 │ 283513404 │
    -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ 283513404 │ AutoRAG test level 1 │ 393785 │
    -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ 393785 │ Jeff Turner’s Home │ ␀ │
    -- └─────────┴─────────────────────────┴────────────────┴──────────────────────┴───────────┴──────────────────────┴───────────┘
    --
    WITH RECURSIVE parent_pages AS
    (SELECT c.spaceid AS spaceid,
    spacekey,
    c.contentid AS childcontentid,
    c.title AS childtitle,
    c.contentid,
    c.title,
    c.parentid
    FROM content c
    JOIN spaces USING (spaceid)
    WHERE spacekey NOT IN ('all')
    AND contenttype='PAGE'
    AND prevver IS NULL
    AND content_status='current' --
    -- Note: for testing, if you want to limit output to particular pages, do it here, e.g.:
    -- AND title ~ 'AutoRAG test'

    AND title != ''
    UNION ALL SELECT pp.spaceid,
    pp.spacekey,
    childcontentid,
    childtitle,
    c2.contentid,
    c2.title,
    c2.parentid
    FROM content c2
    INNER JOIN parent_pages pp ON pp.parentid = c2.contentid), --
    --
    -- Join each page with its 'View' page group permissions. Note how the 'user' restriction on level 2 makes the page hidden
    --
    -- ┌──────────────────────┬──────────────────────┬───────────────────┬──────────────────┐
    -- │ childtitle │ title │ viewgroups │ any_group_allows │
    -- ├──────────────────────┼──────────────────────┼───────────────────┼──────────────────┤
    -- │ AutoRAG test level 3 │ Jeff Turner’s Home │ confluence-users │ t │
    -- │ AutoRAG test level 3 │ AutoRAG test level 1 │ ␀ │ t │
    -- │ AutoRAG test level 3 │ AutoRAG test level 2 │ ␀ │ f │
    -- │ AutoRAG test level 3 │ AutoRAG test level 3 │ ␀ │ t │
    -- └──────────────────────┴──────────────────────┴───────────────────┴──────────────────┘
    pageancestors_with_aggregated_viewperms AS
    (SELECT DISTINCT spaceid,
    spacekey,
    childcontentid,
    childtitle,
    contentid,
    title,
    string_agg(cp.groupname, ',') AS viewgroups,
    bool_or(pg_temp.pagepermissions_allow(cp.username, cp.groupname)) AS any_group_allows
    FROM parent_pages pages
    LEFT JOIN
    (SELECT *
    FROM content_perm_set
    WHERE cont_perm_type='View') cps ON cps.content_id=pages.contentid
    LEFT JOIN
    (SELECT *
    FROM content_perm
    WHERE cp_type='View') cp ON cp.cps_id=cps.id
    GROUP BY 1,
    2,
    3,
    4,
    5,
    6
    ORDER BY contentid), --
    --
    -- Squash each set of page-and-page-ancestors rows into one, AND-ing the permissions to simulate the behaviour that permissions are all levels must be granted to a page to be visible. Eg. our 'level 3' page boils down to one row:
    --
    -- ┌─────────┬─────────────────────────┬────────────────┬──────────────────────┬───────────────────┐
    -- │ spaceid │ spacekey │ childcontentid │ childtitle │ all_parents_allow │
    -- ├─────────┼─────────────────────────┼────────────────┼──────────────────────┼───────────────────┤
    -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ f │
    -- └─────────┴─────────────────────────┴────────────────┴──────────────────────┴───────────────────┘
    --
    -- Here are rolled-up lines for each page in our page hierarchy. Note how the restriction on level 2 causes level 3 to be hidden.
    -- ┌─────────┬─────────────────────────┬────────────────┬──────────────────────┬───────────────────┐
    -- │ spaceid │ spacekey │ childcontentid │ childtitle │ all_parents_allow │
    -- ├─────────┼─────────────────────────┼────────────────┼──────────────────────┼───────────────────┤
    -- │ 524292 │ [email protected] │ 283513404 │ AutoRAG test level 1 │ t │
    -- │ 524292 │ [email protected] │ 283513410 │ AutoRAG test level 2 │ f │
    -- │ 524292 │ [email protected] │ 283513546 │ AutoRAG test level 3 │ f │
    -- └─────────┴─────────────────────────┴────────────────┴──────────────────────┴───────────────────┘
    --
    pages_with_aggregated_parentperms AS
    (SELECT DISTINCT spaceid,
    spacekey,
    childcontentid,
    childtitle,
    bool_and(coalesce(any_group_allows, TRUE)) AS all_parents_allow
    FROM pageancestors_with_aggregated_viewperms
    GROUP BY 1,
    2,
    3,
    4), --
    -- Does the space permissions allow access to this page? This table, built upon parent_pages for scoping (but it could have been built on "content"), creates a spaceperms_allow boolean specifying whether the space permissions grant visibility. E.g. for the level 1 page in a public space:
    --
    -- ┌────────────────┬──────────────────────┬──────────────────┐
    -- │ childcontentid │ childtitle │ spaceperms_allow │
    -- ├────────────────┼──────────────────────┼──────────────────┤
    -- │ 283513404 │ AutoRAG test level 1 │ t │
    -- └────────────────┴──────────────────────┴──────────────────┘
    spaceperms AS
    (SELECT DISTINCT childcontentid,
    childtitle,
    coalesce(bool_or(pg_temp.pagepermissions_allow(permusername, permgroupname)), FALSE) spaceperms_allow
    FROM parent_pages
    LEFT JOIN
    (SELECT *
    FROM spacepermissions
    WHERE permtype='VIEWSPACE') spacepermissions USING (spaceid)
    GROUP BY 1,
    2), --
    -- Combined space permissions (spaceperms) and page-level permissions (pages_with_aggregated_parentperms), ANDing them together to determine final page visibility
    --
    -- ┌────────────────┬──────────────────────┬───────────────────┬──────────────────┬─────────────────────────┐
    -- │ childcontentid │ childtitle │ all_parents_allow │ spaceperms_allow │ space_and_parents_allow │
    -- ├────────────────┼──────────────────────┼───────────────────┼──────────────────┼─────────────────────────┤
    -- │ 283513404 │ AutoRAG test level 1 │ t │ t │ t │
    -- │ 283513410 │ AutoRAG test level 2 │ f │ t │ f │
    -- │ 283513546 │ AutoRAG test level 3 │ f │ t │ f │
    -- └────────────────┴──────────────────────┴───────────────────┴──────────────────┴─────────────────────────┘
    --
    pages_with_aggregated_parent_and_space_perms AS
    (SELECT *,
    (spaceperms_allow
    AND all_parents_allow) AS space_and_parents_allow
    FROM pages_with_aggregated_parentperms
    JOIN spaceperms USING (childcontentid,
    childtitle)), --
    -- Print visible pages with extra details
    OUTPUT AS
    (SELECT contentid,
    spaces.spacekey,
    spaces.spacename,
    content.creationdate::date,
    display_name AS creator,
    cwd_user.email_address AS creator_email,
    content.lastmoddate,
    title,
    BODY
    FROM pages_with_aggregated_parent_and_space_perms
    JOIN spaces USING (spaceid,
    spacekey)
    JOIN content ON childcontentid=content.contentid
    JOIN user_mapping ON content.creator=user_mapping.user_key
    JOIN cwd_user ON cwd_user.lower_user_name=user_mapping.lower_username
    JOIN bodycontent USING (contentid)
    WHERE space_and_parents_allow
    -- AND content.creationdate >= '2024-01-01'
    )
    SELECT *
    FROM OUTPUT;