Skip to content

Instantly share code, notes, and snippets.

@ateucher
Last active May 19, 2022 23:48
Show Gist options
  • Save ateucher/a60e539f70bdaff2e13362fda4ec4deb to your computer and use it in GitHub Desktop.
Save ateucher/a60e539f70bdaff2e13362fda4ec4deb to your computer and use it in GitHub Desktop.

Revisions

  1. ateucher revised this gist May 19, 2022. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion extract_pptx_notes.R
    Original file line number Diff line number Diff line change
    @@ -64,4 +64,4 @@ date: "{Sys.Date()}"
    }

    out
    }
    }
  2. ateucher revised this gist May 19, 2022. 1 changed file with 1 addition and 6 deletions.
    7 changes: 1 addition & 6 deletions extract_pptx_notes.R
    Original file line number Diff line number Diff line change
    @@ -64,9 +64,4 @@ date: "{Sys.Date()}"
    }

    out
    }


    pp_file <- "/Users/ateucher/OneDrive - Government of BC/LUP ppt prep_2020-10-14-2-ACT.pptx"

    extract_pptx_notes(pp_file, "docx")
    }
  3. ateucher created this gist May 19, 2022.
    72 changes: 72 additions & 0 deletions extract_pptx_notes.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,72 @@
    #' Extract speaker notes from a PowerPoint file into markdown or Word
    #'
    #' Reads `pp_file` with officer, pulls the text runs out of every notes
    #' slide, and writes them to `<pp_file without extension>_notes.md` under
    #' one `## Slide N:` heading per slide (slides without notes get an empty
    #' section). If the markdown file already exists the user is asked
    #' interactively whether to overwrite it.
    #'
    #' @param pp_file Path to a `.pptx` file.
    #' @param format Either `"md"` (default) to stop after writing the markdown
    #'   file, or `"docx"` to additionally render it with `rmarkdown::render()`
    #'   to `<pp_file without extension>_notes.docx`.
    #'
    #' @return For `format = "md"`, the path of the markdown file. For
    #'   `format = "docx"`, whatever `rmarkdown::render()` returns (presumably
    #'   the path of the rendered document).
    #'
    #' @details Errors if a required package is missing, if the presentation
    #'   has no notes, or if the user declines (or cancels) the overwrite
    #'   prompt.
    extract_pptx_notes <- function(pp_file, format = c("md", "docx")) {
      # requireNamespace() loads but does not attach a package, so every call
      # into these packages below is namespace-qualified.
      for (pkg in c("officer", "xml2", "rmarkdown", "glue")) {
        if (!requireNamespace(pkg, quietly = TRUE)) {
          stop("package '", pkg, "' required.", call. = FALSE)
        }
      }

      format <- match.arg(format)

      pp <- officer::read_pptx(pp_file)

      file_sans_ext <- tools::file_path_sans_ext(pp_file)

      # Build a lookup from slide number to notes-slide number:
      #   - the notes-slide index is parsed from the metadata row names,
      #   - rows pointing at the notes master are dropped,
      #   - the slide index is parsed from the relationship target.
      slides_with_notes_meta <- pp$notesSlide$get_metadata()
      slides_with_notes_meta$notes <- as.numeric(
        gsub("notesSlide(\\d{1,3}).*", "\\1", rownames(slides_with_notes_meta))
      )
      slides_with_notes_meta <- slides_with_notes_meta[
        !grepl("notesMaster", slides_with_notes_meta$target),
      ]
      slides_with_notes_meta$slide <- as.numeric(
        gsub(".+slide(\\d{1,3})\\.xml", "\\1", slides_with_notes_meta$target)
      )

      if (!nrow(slides_with_notes_meta)) stop("No notes in this presentation")

      # Cover every slide up to the last one that has notes, so slides without
      # notes still get a heading in the output.
      slide_nums <- seq_len(max(slides_with_notes_meta$slide))

      # xpath search from here: https://robaboukhalil.medium.com/your-slide-deck-is-a-zip-file-in-disguise-36bb14f11c0b
      # Each matched text run becomes its own element of the result.
      xpath <- "//*[local-name()='txBody']/*[local-name()='p']/*[local-name()='r']/*[local-name()='t']/text()"

      notes <- lapply(slide_nums, \(x) {
        notes_slide <- slides_with_notes_meta[slides_with_notes_meta$slide == x, "notes"]
        if (!length(notes_slide)) return(character(0))
        slide <- pp$notesSlide$get_slide(notes_slide)
        xml <- slide$get()
        node <- xml2::xml_find_all(xml, xpath)
        as.character(node)
      })

      names(notes) <- paste("Slide", slide_nums)

      out <- paste0(file_sans_ext, "_notes.md")

      if (file.exists(out)) {
        overwrite <- utils::askYesNo(
          glue::glue("File {out} already exists. Overwrite?")
        )
        # askYesNo() returns NA when the prompt is cancelled; treat anything
        # other than an explicit "yes" as a refusal instead of erroring on NA.
        if (!isTRUE(overwrite)) stop("Quitting", call. = FALSE)
        file.remove(out)
      }

      # YAML front matter consumed by rmarkdown::render() for the docx output.
      cat(glue::glue('---
    title: "{basename(file_sans_ext)}"
    output: word_document
    date: "{Sys.Date()}"
    ---\n\n\n'), file = out)

      for (n in names(notes)) {
        cat(paste0("## ", n, ":\n\n"), file = out, append = TRUE)
        if (length(notes[[n]])) {
          cat(notes[[n]], file = out, sep = "\n\n", append = TRUE)
          # End the last paragraph AND leave a blank line: pandoc markdown
          # only recognises a heading that is preceded by a blank line.
          cat("\n\n", file = out, append = TRUE)
        }
      }

      if (format == "docx") {
        out_docx <- paste0(file_sans_ext, "_notes.docx")
        return(rmarkdown::render(out, output_file = out_docx))
      }

      out
    }


    # Example invocation. The path below is specific to the original author's
    # machine, so only run the extraction when that file is actually present;
    # otherwise sourcing this script elsewhere would fail.
    pp_file <- "/Users/ateucher/OneDrive - Government of BC/LUP ppt prep_2020-10-14-2-ACT.pptx"

    if (file.exists(pp_file)) {
      extract_pptx_notes(pp_file, "docx")
    } else {
      message("Skipping example run; file not found: ", pp_file)
    }