Skip to content

Instantly share code, notes, and snippets.

@mcshaz
Last active July 16, 2020 21:03
Show Gist options
  • Select an option

  • Save mcshaz/b4793c19096669a89ef83522437ec79d to your computer and use it in GitHub Desktop.

Select an option

Save mcshaz/b4793c19096669a89ef83522437ec79d to your computer and use it in GitHub Desktop.

Revisions

  1. mcshaz revised this gist Jul 16, 2020. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions oneSampleSurvival.R
    Original file line number Diff line number Diff line change
    @@ -104,8 +104,8 @@ oneSampleSurvival <- function(df, description) {
    df <- byfirstsurg[, c("Male", "Dead", "DateofBirth", "DateOfEvent", "DatelastKnown")]
    # rename variables to required dataframe names for oneSampleSurvival function
    names(df) <- c("male", "dead", "dob", "entry", "exit")
    df$male = df$male == 1
    df$dead = df$dead == 1
    df$male <- df$male == 1
    df$dead <- df$dead == 1

    # looking at subsets of the data - trying to identify where/which subset the proportional hazard assumption might hold
    oneSampleSurvival (df[df$exit - df$entry > 30,], "surviving >30 days from operation")
  2. mcshaz revised this gist Jul 16, 2020. 1 changed file with 11 additions and 8 deletions.
    19 changes: 11 additions & 8 deletions oneSampleSurvival.R
    Original file line number Diff line number Diff line change
    @@ -1,10 +1,12 @@
    # see http://hedwig.mgh.harvard.edu/biostatistics/node/30 for the original paper and an excel spreadsheet with macros
    # the spreadsheet does much the same in VBA, but with considerably more code and complexity
    # the spreadsheet does much the same in VBA, but with considerably more code
    # unlike the vba spreadsheet, this also takes into acount the fraction of age in years at entry and exit (in creating per patient risk)
    # if the age is greater than the maximum data (101 years old), each subsequent year is a repeat of the final annual risk available
    library(survival)

    oneSampleSurvival <- function(df, description) {
    # NZ data between 0 and 100 for each year
    # 10 years of population and death data was averaged (2009-2018) for each year until 89 years old
    # 10 years of population and death data was averaged (2009-2018) for each year of age until 89 years old
    # data is then repeated in 5 yearly blocks as more granular data was hard to obtain
    # mortality at 0 years excludes mortality before 28 days (not relevant)
    maleMortality <- c(0.001843361228536, 0.000426012398328, 0.000205162390909, 0.000159933000221, 0.000131106236558, 0.000109847867736, 7.54877026981029E-05, 0.000104275266973, 8.46092661589332E-05, 0.000105655560873, 9.83646791092508E-05, 9.85469708619511E-05, 0.000107171040591, 0.000136745787619, 0.000211693246977, 0.000409680498681, 0.00047908243582, 0.00068725838029, 0.000792558666334, 0.00084616668714, 0.000794068160984, 0.000907776641227, 0.000800008665019, 0.000828096770177, 0.000806997128346, 0.000840005277716, 0.000802987629156, 0.000825605870321, 0.000765479018619, 0.000763137039117, 0.000806246482394, 0.000903749143428, 0.000924492304174, 0.000857232269108, 0.00092020098225, 0.000998230025349, 0.001085433512175, 0.001027818977332, 0.001122918474693, 0.00119988265766, 0.00137785701308, 0.001442949518615, 0.00153845943647, 0.00164270871642, 0.00188256662831, 0.002023312990214, 0.002030541981486, 0.002272865187846, 0.002607033588397, 0.002696261347275, 0.002969802371852, 0.003330445145048, 0.003450994308472, 0.003766824109133, 0.004382106432976, 0.004619214946603, 0.00507076963448, 0.005168789456058, 0.006025729167394, 0.006502154100548, 0.006825708117681, 0.00758116723691, 0.008280373380387, 0.009053044757868, 0.00965343239435, 0.010853634625976, 0.012058271659544, 0.013235643983316, 0.014237071816918, 0.016323379469387, 0.01765435043422, 0.019773974552901, 0.021931935366736, 0.024825087322466, 0.027006375178927, 0.029820680952757, 0.033902362368077, 0.03837648923878, 0.042932432116485, 0.046894126378063, 0.054108256726034, 0.060411289976837, 0.068653770714218, 0.077960340417016, 0.088759284493938, 0.099283037280737, 0.112244152400086, 0.128687561933202, 0.144916978613787, 0.157930508606968, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.4025)
    @@ -67,10 +69,11 @@ oneSampleSurvival <- function(df, description) {
    })

    #main output
    print(description)
    print("using proportional hazards assumption")
    cat(description, "(using proportional hazards assumption)", "\n")
    totalDeaths <- sum(df$dead)
    print(smrProportionalHazard(totalDeaths, sum(riskOfDeath)))
    totalRisk <- sum(riskOfDeath)
    cat("observed / expected =", totalDeaths, "/", totalRisk, "\n")
    print(smrProportionalHazard(totalDeaths, totalRisk))

    df$yrsOldAtEntry <- as.numeric(df$entry - df$dob) / 365.25
    df$yrsInStudy <- as.numeric(df$exit - df$entry) / 365.25
    @@ -105,8 +108,8 @@ df$male = df$male == 1
    df$dead = df$dead == 1

    # looking at subsets of the data - trying to identify where/which subset the proportional hazard assumption might hold
    oneSampleSurvival(df[df$exit - df$entry > 30,], "alive >30 days from operation")
    oneSampleSurvival (df[df$exit - df$entry > 30,], "surviving >30 days from operation")

    oneSampleSurvival(df[df$exit - df$entry > 30 & df$entry - df$dob > 365.25 * 15,], "alive >30 days from operation and >14yo at op.")
    oneSampleSurvival (df[df$exit - df$entry > 30 & df$entry - df$dob > 365.25 * 15,], "surviving >30 days and >14 yr old at operation")

    oneSampleSurvival(df[df$exit - df$entry > 365,], "alive at least 1 yr from op")
    oneSampleSurvival (df[df$exit - df$entry > 365,], "surviving at least 1 yr from operation")
  3. mcshaz revised this gist Jul 16, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion oneSampleSurvival.R
    Original file line number Diff line number Diff line change
    @@ -99,7 +99,7 @@ oneSampleSurvival <- function(df, description) {
    }

    df <- byfirstsurg[, c("Male", "Dead", "DateofBirth", "DateOfEvent", "DatelastKnown")]
    # rename variables to required names for getStats function
    # rename variables to required dataframe names for oneSampleSurvival function
    names(df) <- c("male", "dead", "dob", "entry", "exit")
    df$male = df$male == 1
    df$dead = df$dead == 1
  4. mcshaz revised this gist Jul 16, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion oneSampleSurvival.R
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,7 @@ library(survival)

    oneSampleSurvival <- function(df, description) {
    # NZ data between 0 and 100 for each year
    # 10 years of population and death data was averaged (2009-2018) for each year until 89
    # 10 years of population and death data was averaged (2009-2018) for each year until 89 years old
    # data is then repeated in 5 yearly blocks as more granular data was hard to obtain
    # mortality at 0 years excludes mortality before 28 days (not relevant)
    maleMortality <- c(0.001843361228536, 0.000426012398328, 0.000205162390909, 0.000159933000221, 0.000131106236558, 0.000109847867736, 7.54877026981029E-05, 0.000104275266973, 8.46092661589332E-05, 0.000105655560873, 9.83646791092508E-05, 9.85469708619511E-05, 0.000107171040591, 0.000136745787619, 0.000211693246977, 0.000409680498681, 0.00047908243582, 0.00068725838029, 0.000792558666334, 0.00084616668714, 0.000794068160984, 0.000907776641227, 0.000800008665019, 0.000828096770177, 0.000806997128346, 0.000840005277716, 0.000802987629156, 0.000825605870321, 0.000765479018619, 0.000763137039117, 0.000806246482394, 0.000903749143428, 0.000924492304174, 0.000857232269108, 0.00092020098225, 0.000998230025349, 0.001085433512175, 0.001027818977332, 0.001122918474693, 0.00119988265766, 0.00137785701308, 0.001442949518615, 0.00153845943647, 0.00164270871642, 0.00188256662831, 0.002023312990214, 0.002030541981486, 0.002272865187846, 0.002607033588397, 0.002696261347275, 0.002969802371852, 0.003330445145048, 0.003450994308472, 0.003766824109133, 0.004382106432976, 0.004619214946603, 0.00507076963448, 0.005168789456058, 0.006025729167394, 0.006502154100548, 0.006825708117681, 0.00758116723691, 0.008280373380387, 0.009053044757868, 0.00965343239435, 0.010853634625976, 0.012058271659544, 0.013235643983316, 0.014237071816918, 0.016323379469387, 0.01765435043422, 0.019773974552901, 0.021931935366736, 0.024825087322466, 0.027006375178927, 0.029820680952757, 0.033902362368077, 0.03837648923878, 0.042932432116485, 0.046894126378063, 0.054108256726034, 0.060411289976837, 0.068653770714218, 0.077960340417016, 0.088759284493938, 0.099283037280737, 0.112244152400086, 0.128687561933202, 0.144916978613787, 0.157930508606968, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.4025)
  5. mcshaz revised this gist Jul 16, 2020. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions oneSampleSurvival.R
    Original file line number Diff line number Diff line change
    @@ -93,8 +93,8 @@ oneSampleSurvival <- function(df, description) {
    lines(0:(as.integer(maxYears) + 1L), hazards, col="grey")
    legend("bottomleft",
    inset = 0.05,
    legend=c("general pop. matched for age/sex", "cardiac", "95% conf. int."),
    col=c("grey", "black", "black"),
    legend = c("general pop. matched for age/sex", "cardiac", "95% conf. int."),
    col = c("grey", "black", "black"),
    lty = c(1, 1, 2))
    }

  6. mcshaz revised this gist Jul 16, 2020. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions oneSampleSurvival.R
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,5 @@
    # see http://hedwig.mgh.harvard.edu/biostatistics/node/30
    # see http://hedwig.mgh.harvard.edu/biostatistics/node/30 for the original paper and an excel spreadsheet with macros
    # the spreadsheet does much the same in VBA, but with considerably more code and complexity
    library(survival)

    oneSampleSurvival <- function(df, description) {
    @@ -78,7 +79,6 @@ oneSampleSurvival <- function(df, description) {
    maxYears <- ceiling(max(df$yrsInStudy))
    hazards <- by(df, seq_len(nrow(df)), function(rw){
    mortVector <- getYrMortality(rw$yrsOldAtEntry, rw$yrsOldAtEntry + maxYears, rw$male)

    return(exp(-cumsum(mortVector)))
    })
    hazards <- c(1, rowMeans(simplify2array(hazards)))
  7. mcshaz created this gist Jul 16, 2020.
    112 changes: 112 additions & 0 deletions oneSampleSurvival.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,112 @@
    # see http://hedwig.mgh.harvard.edu/biostatistics/node/30
    library(survival)

    oneSampleSurvival <- function(df, description) {
    # NZ data between 0 and 100 for each year
    # 10 years of population and death data was averaged (2009-2018) for each year until 89
    # data is then repeated in 5 yearly blocks as more granular data was hard to obtain
    # mortality at 0 years excludes mortality before 28 days (not relevant)
    maleMortality <- c(0.001843361228536, 0.000426012398328, 0.000205162390909, 0.000159933000221, 0.000131106236558, 0.000109847867736, 7.54877026981029E-05, 0.000104275266973, 8.46092661589332E-05, 0.000105655560873, 9.83646791092508E-05, 9.85469708619511E-05, 0.000107171040591, 0.000136745787619, 0.000211693246977, 0.000409680498681, 0.00047908243582, 0.00068725838029, 0.000792558666334, 0.00084616668714, 0.000794068160984, 0.000907776641227, 0.000800008665019, 0.000828096770177, 0.000806997128346, 0.000840005277716, 0.000802987629156, 0.000825605870321, 0.000765479018619, 0.000763137039117, 0.000806246482394, 0.000903749143428, 0.000924492304174, 0.000857232269108, 0.00092020098225, 0.000998230025349, 0.001085433512175, 0.001027818977332, 0.001122918474693, 0.00119988265766, 0.00137785701308, 0.001442949518615, 0.00153845943647, 0.00164270871642, 0.00188256662831, 0.002023312990214, 0.002030541981486, 0.002272865187846, 0.002607033588397, 0.002696261347275, 0.002969802371852, 0.003330445145048, 0.003450994308472, 0.003766824109133, 0.004382106432976, 0.004619214946603, 0.00507076963448, 0.005168789456058, 0.006025729167394, 0.006502154100548, 0.006825708117681, 0.00758116723691, 0.008280373380387, 0.009053044757868, 0.00965343239435, 0.010853634625976, 0.012058271659544, 0.013235643983316, 0.014237071816918, 0.016323379469387, 0.01765435043422, 0.019773974552901, 0.021931935366736, 0.024825087322466, 0.027006375178927, 0.029820680952757, 0.033902362368077, 0.03837648923878, 0.042932432116485, 0.046894126378063, 0.054108256726034, 0.060411289976837, 0.068653770714218, 0.077960340417016, 0.088759284493938, 0.099283037280737, 0.112244152400086, 0.128687561933202, 0.144916978613787, 0.157930508606968, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.4025)
    femaleMortality <- c(0.001440725276587, 0.000378599537757, 0.00021709924675, 9.97502891438584E-05, 9.85006425048784E-05, 8.92050090982164E-05, 8.95206096251493E-05, 7.96114330270264E-05, 6.92762143799792E-05, 0.000101432919309, 5.08040463311748E-05, 9.22608680532037E-05, 0.000122942485987, 0.000153294565916, 0.000172502569892, 0.000260513869301, 0.000255523432279, 0.000368255538044, 0.000356330469863, 0.000267139191191, 0.000358230732596, 0.000369833546194, 0.000305476109726, 0.000319588939803, 0.000297956887714, 0.000363010043724, 0.000362859918462, 0.000432002032646, 0.000321026034948, 0.000361625954016, 0.00040452437153, 0.000431745162574, 0.000512770712544, 0.00054317376104, 0.000488063712367, 0.000486563128689, 0.00057339122496, 0.00072400070304, 0.00072060329081, 0.000754517692419, 0.000849463968011, 0.000831645042867, 0.001042677971853, 0.001132957559664, 0.001207180709918, 0.00131585417895, 0.001513946743796, 0.001606617997058, 0.001723479078906, 0.001933808631927, 0.002226989986555, 0.002231269823877, 0.002626438738956, 0.002696431092572, 0.002966902852488, 0.003080837093037, 0.003413401888315, 0.003427147560834, 0.004010391720794, 0.004217567593114, 0.004606013461916, 0.005031607028952, 0.005503746741787, 0.006108782143772, 0.006845816438536, 0.007141344621113, 0.007955395638506, 0.009127032877493, 0.010042400111416, 0.010803760162666, 0.012057728392066, 0.01323141371974, 0.014562716593054, 0.015873768931767, 0.018880147634345, 0.02056309402693, 0.022568564901875, 0.025811635811795, 0.029461369991122, 0.032920253916872, 0.038426882965907, 0.042458470397799, 0.050308505460449, 0.05592788582796, 0.063922777668064, 0.074695945529539, 0.084828676018797, 0.095857321745606, 0.110779421454886, 0.129589230549757, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.444805194805195)

    getYrMortality <- function(startAgeYr, endAgeYr, isMale){
    if (isMale) {
    lookupVector <- maleMortality
    } else {
    lookupVector <- femaleMortality
    }
    startIndx <- as.integer(startAgeYr) + 1L # add 1 as mortality begins at 0 years of age (died <28d excluded)
    endIndx <- as.integer(endAgeYr) + 1L
    if (endIndx > length(lookupVector)) {
    lastIndx <- length(lookupVector)
    returnVar <- c(lookupVector[startIndx:lastIndx], rep(lookupVector[lastIndx], endIndx - lastIndx))
    } else {
    returnVar <- lookupVector[startIndx:endIndx]
    }
    return(returnVar)
    }

    smrProportionalHazard <- function(observed, expected, ci = 95) {
    stat <- (observed - expected) ^ 2 / expected

    smr <- observed / expected

    chiterm <- qchisq((100 + ci) / 200, 1)
    ub <- smr + chiterm / (2 * expected) + ((chiterm * (4 * observed + chiterm)) ^ 0.5) / (2 * expected)
    lb <- smr + chiterm / (2 * expected) - ((chiterm * (4 * observed + chiterm)) ^ 0.5) / (2 * expected)

    return(c("smr" = smr,
    "p_value" = 1 - pchisq(stat, 1),
    "lower_ci" = lb,
    "upper_ci" = ub))
    }

    riskOfDeath <- by(df, seq_len(nrow(df)), function(rw){
    # extracts mortality data for each year of age between entry and exit date
    # first and last date will be fractionally reduced according to how much of thay age year was remaining
    startAgeYr <- as.numeric(rw$entry - rw$dob) / 365.25
    startProportion <- startAgeYr %% 1
    startAgeYr <- as.integer(startAgeYr)

    endAgeYr <- as.numeric(rw$exit - rw$dob) / 365.25
    endProportion <- endAgeYr %% 1
    endAgeYr <- as.integer(endAgeYr)

    returnVar <- getYrMortality(startAgeYr, endAgeYr, rw$male)

    if (startAgeYr == endAgeYr) {
    returnVar <- returnVar * (endProportion - startProportion)
    } else {
    returnVar[1] <- returnVar[1] * (1 - startProportion)
    len <- length(returnVar)
    returnVar[len] <- returnVar[len] * endProportion
    }

    return(sum(returnVar))
    })

    #main output
    print(description)
    print("using proportional hazards assumption")
    totalDeaths <- sum(df$dead)
    print(smrProportionalHazard(totalDeaths, sum(riskOfDeath)))

    df$yrsOldAtEntry <- as.numeric(df$entry - df$dob) / 365.25
    df$yrsInStudy <- as.numeric(df$exit - df$entry) / 365.25

    # create reference for population hazard
    maxYears <- ceiling(max(df$yrsInStudy))
    hazards <- by(df, seq_len(nrow(df)), function(rw){
    mortVector <- getYrMortality(rw$yrsOldAtEntry, rw$yrsOldAtEntry + maxYears, rw$male)

    return(exp(-cumsum(mortVector)))
    })
    hazards <- c(1, rowMeans(simplify2array(hazards)))
    survData <- survfit(Surv(df$yrsInStudy, df$dead) ~ 1, data = df)
    lowerY <- floor(survData$lower[length(survData$lower)] * 100) / 100
    par(cex.sub = 0.8)
    plot(survData,
    xlab = "Years from op",
    ylab = "Survival probability",
    ylim = c(lowerY, 1),
    sub = description)
    lines(0:(as.integer(maxYears) + 1L), hazards, col="grey")
    legend("bottomleft",
    inset = 0.05,
    legend=c("general pop. matched for age/sex", "cardiac", "95% conf. int."),
    col=c("grey", "black", "black"),
    lty = c(1, 1, 2))
    }

    df <- byfirstsurg[, c("Male", "Dead", "DateofBirth", "DateOfEvent", "DatelastKnown")]
    # rename variables to required names for getStats function
    names(df) <- c("male", "dead", "dob", "entry", "exit")
    df$male = df$male == 1
    df$dead = df$dead == 1

    # looking at subsets of the data - trying to identify where/which subset the proportional hazard assumption might hold
    oneSampleSurvival(df[df$exit - df$entry > 30,], "alive >30 days from operation")

    oneSampleSurvival(df[df$exit - df$entry > 30 & df$entry - df$dob > 365.25 * 15,], "alive >30 days from operation and >14yo at op.")

    oneSampleSurvival(df[df$exit - df$entry > 365,], "alive at least 1 yr from op")