Created
January 13, 2014 19:25
-
-
Save jdavidson/8406379 to your computer and use it in GitHub Desktop.
Revisions
-
jdavidson created this gist
Jan 13, 2014 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,114 @@ library(ggplot2) library(ggthemes) library(plyr) library(lubridate) library(scales) library(data.table) options(scipen=999) options(stringsAsFactors = FALSE) rounds <- read.csv("2014-01-06-crunchbase_monthly_export_rounds.csv") # exclude non venture rounds (other, private equity, post-ipo) rounds <- subset(rounds, funding_round_type %in% c("venture", "angel", "series-a", "series-b"))# , "series-c+")) rounds <- subset(rounds, funded_month != "1960-01") # fix strange date data rounds$funded_at <- ymd(paste(rounds$funded_month, "01", sep="-")) rounds$round_raised_amount_usd <- cut(rounds$raised_amount_usd, breaks=c(0, 500000, 1000000, 5000000, 10000000, 20000000, 40000000, 80000000, Inf), right=FALSE)# round(rounds $raised_amount_usd / 500000) * 500000 # dedup rounds <- data.table(rounds) setkeyv(rounds, c("company_name", "funded_at", "funding_round_type")) rounds <- unique(rounds) # restrict to companies first funded after 2008 companies <- rounds[, list(first_funded_at = min(funded_at)), by = company_name] rounds <- join(rounds, companies) rounds <- subset(rounds, first_funded_at > ymd("2008-01-01")) # fix strange difference in units from diff my.diff <- function(x, lag=1) { n <- length(x) round(difftime(x[(1+lag):n], x[1:(n-lag)], units="days") / 30) } # round sequences rounds_index <- rounds[, id := seq_along(funded_at), by=company_name] rounds_index <- rounds_index[, diff := c(my.diff(funded_at), NA), by=company_name] rounds_index$lifetime <- rounds_index$diff rounds_index[is.na(rounds_index$lifetime),]$lifetime <- round(as.numeric(difftime(max(rounds_index$funded_at), rounds_index[is.na(rounds_index$lifetime),]$funded_at, units="days") / 30)) # rounds_index <- ddply(rounds, .(company_name), transform, index=seq_along(funded_at), diff=c(my.diff(funded_at), NA)) # aggregate medians <- ddply(rounds_index, .(funding_round_type), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T)) medians <- medians[order(medians$median),] diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "diff")] setnames(diff_summary, "diff", "lifetime") diff_summary <- diff_summary[!is.na(diff_summary$lifetime),] diff_summary <- diff_summary[order(funding_round_type, lifetime, decreasing=T),] diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= funding_round_type] round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "lifetime")] round_lifetimes <- round_lifetimes[order(funding_round_type, lifetime, decreasing=T),] round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by= funding_round_type] diff_summary <- join(diff_summary, round_lifetimes) diff_summary <- diff_summary[order(funding_round_type, lifetime),] diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds ggplot(diff_summary, aes(x=lifetime, y=rounds, color=funding_round_type)) + geom_point() + scale_x_continuous(breaks = 0:4 * 12, limits=c(0,48)) + geom_smooth() + ggtitle("Financings") + ylab("Financings") + xlab("Months After Funding") + scale_color_discrete(name = "Round") ggplot(diff_summary, aes(x=lifetime, y= cum_rounds / cum_total_rounds, color=funding_round_type)) + geom_line() + scale_x_continuous(breaks = 0:4 * 12, limits=c(0,48)) + ggtitle("Likelihood Of Raising A Follow On Round By Time") + ylab("Percent of Companies that Raise a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + scale_color_discrete(name = "Round") ggsave("follow-on-likelihood-by-time.png") #### round_raised_amount_usd medians <- ddply(rounds_index, .(funding_round_type, round_raised_amount_usd), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T)) medians[order(medians$funding_round_type, medians$median),] diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "round_raised_amount_usd", "diff")] setnames(diff_summary, "diff", "lifetime") diff_summary <- diff_summary[!is.na(diff_summary$lifetime),] diff_summary <- diff_summary[order(funding_round_type, round_raised_amount_usd, lifetime, decreasing=T),] diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= c("funding_round_type", "round_raised_amount_usd")] round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "round_raised_amount_usd", "lifetime")] round_lifetimes <- round_lifetimes[order(funding_round_type, round_raised_amount_usd, lifetime, decreasing=T),] round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by=c("funding_round_type", "round_raised_amount_usd")] diff_summary <- join(diff_summary, round_lifetimes) diff_summary <- diff_summary[order(funding_round_type, round_raised_amount_usd, lifetime),] diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds diff_summary <- diff_summary[!is.na(diff_summary$round_raised_amount_usd),] ggplot(subset(diff_summary, funding_round_type == "angel"), aes(x=lifetime, y=rounds, color= as.factor(round_raised_amount_usd))) + geom_point() + xlim(0,48) + geom_smooth() + ggtitle("") + ylab("Rounds") + xlab("Months After Funding") ggplot(subset(diff_summary, funding_round_type == "angel"), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color= as.factor(round_raised_amount_usd))) + geom_line() + xlim(0,48) + ggtitle("") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) diff_summary$funding_round_type <- factor(diff_summary$funding_round_type, levels=c("angel", "venture", "series-a", "series-b")) diff_summary$round_raised_amount_usd <- mapvalues(diff_summary$round_raised_amount_usd, from = c("[0,5e+05)", "[5e+05,1e+06)", "[1e+06,5e+06)", "[5e+06,1e+07)", "[1e+07,2e+07)", "[2e+07,4e+07)", "[4e+07,8e+07)", "[8e+07,Inf)"), to = c("$0-.5M", "$.5-1M", "$1-5M", "$5-10M", "$10-20M", "$20-40M", "$40-80M", "$80M+")) ggplot(subset(diff_summary, cum_total_rounds > 20 & round_raised_amount_usd != "$80M+"), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color=round_raised_amount_usd)) + geom_line() + xlim(0,48) + ggtitle("Follow On Likelihood by Round and Amount") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + facet_wrap(~ funding_round_type) + scale_colour_few() ggsave("follow-on-likelihood-by-round-size.png") #### company_category_code medians <- ddply(rounds_index, .(funding_round_type, company_category_code), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T)) medians <- medians[order(medians$funding_round_type, medians$median),] diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "company_category_code", "diff")] setnames(diff_summary, "diff", "lifetime") diff_summary <- diff_summary[!is.na(diff_summary$lifetime),] diff_summary <- diff_summary[order(funding_round_type, company_category_code, lifetime, decreasing=T),] diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= c("funding_round_type", "company_category_code")] round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "company_category_code", "lifetime")] round_lifetimes <- round_lifetimes[order(funding_round_type, company_category_code, lifetime, decreasing=T),] round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by=c("funding_round_type", "company_category_code")] diff_summary <- join(diff_summary, round_lifetimes) diff_summary <- diff_summary[order(funding_round_type, company_category_code, lifetime),] diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds category_counts <- ddply(rounds, .(company_category_code), summarize, counts=length(unique(company_name))) category_counts <- category_counts[order(category_counts$counts, decreasing=T),] ggplot(subset(diff_summary, company_category_code %in% category_counts[1:9, "company_category_code"]), aes(x=lifetime, y=rounds, color=funding_round_type)) + geom_point() + xlim(0,48) + geom_smooth() + ggtitle("") + ylab("Rounds") + xlab("Months After Funding") + facet_wrap(~ company_category_code) ggplot(subset(diff_summary, company_category_code %in% category_counts[1:9, "company_category_code"]), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color= funding_round_type)) + geom_line() + xlim(0,48) + ggtitle("Follow On Likelihood by Category") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + facet_wrap(~ company_category_code) ggsave("follow-on-likelihood-by-round-category.png")