Created
March 17, 2015 14:42
-
-
Save milesgrimshaw/477b32e661518de9b6c2 to your computer and use it in GitHub Desktop.
Revisions
-
milesgrimshaw created this gist
Mar 17, 2015. There are no files selected for viewing
# MindBodyOnline client analysis: read the client list, remove duplicate
# businesses, and chart client counts by country, city and industry segment.

# Load libraries
library(ggplot2)

# Set working directory
# NOTE(review): machine-specific path. It is kept because the input CSV and the
# PDFs written later in this script are resolved relative to it.
setwd("~/Dropbox (Personal)/Personal/Github/MindBody/New/")

# ---- Helpers ----

# Tally how often each distinct value occurs in `values`.
#
# values: vector (character or factor) of category labels.
# label:  name to give the category column of the result.
#
# Returns a data frame with two columns: `label` (a factor whose levels are
# ordered by ascending count, so bars plot in count order) and `Count`
# (integer). Rows are sorted by ascending count. Only values that actually
# occur are tallied; NA values are omitted.
count_by <- function(values, label) {
  # as.character() so unused factor levels do not show up as zero counts
  tally <- table(as.character(values))
  counts <- data.frame(
    value = as.character(names(tally)),
    Count = as.integer(tally),
    stringsAsFactors = FALSE
  )
  counts <- counts[order(counts$Count, decreasing = FALSE), , drop = FALSE]
  counts$value <- reorder(factor(counts$value), counts$Count)
  names(counts)[1] <- label
  rownames(counts) <- NULL
  counts
}

# Build a horizontal bar chart from a count table.
#
# counts:  data frame produced by count_by().
# mapping: aes() mapping the category column to x and Count to y.
# title:   plot title.
#
# Returns the ggplot object (it is not printed here).
bar_plot <- function(counts, mapping, title) {
  ggplot(counts, mapping) +
    geom_bar(stat = "identity") +
    coord_flip() +
    ggtitle(title)
}

# ---- Load and clean ----

# Read in the data
data <- read.csv("mind_new.csv", header = TRUE, as.is = TRUE)

# Convert everything to lower case
data$Country <- tolower(data$Country)
data$City <- tolower(data$City)
data$Name <- tolower(data$Name)

# Eliminate duplicates: a business is identified by name plus coordinates.
# Comparing the columns directly avoids the false matches a separator-less
# pasted key can produce (e.g. "a1" + "2" vs "a" + "12").
data <- data[!duplicated(data[, c("Name", "Lon", "Lat")]), ]

# Descriptive exploration
nrow(data)
length(unique(data$Country))
length(unique(data$City))

# ---- Country breakdown ----

country <- count_by(data$Country, "Country")
summary(country$Count)
# droplevels() so countries below the cut-off do not linger as factor levels
country_top <- droplevels(country[country$Count > 10, ])

# Plot the countries: first every country, then only those with 10+ clients.
# `countryplot` ends up holding the 10+ version, which is the one saved later.
countryplot <- bar_plot(
  country,
  aes(x = Country, y = Count),
  "MindBodyOnline Clients by Country"
)
print(countryplot)
countryplot <- bar_plot(
  country_top,
  aes(x = Country, y = Count),
  "MindBodyOnline Clients by Country (10+ Customers)"
)
print(countryplot)

# ---- Industry clean-up ----

# Rename industries to be easy to read
# (double quotes, because the label itself contains an apostrophe)
data$Industry[data$Industry %in% "Children's Programs"] <- "Child Programs"

# Create levels for each unique industry
data$Industry <- factor(data$Industry)
levels(data$Industry)

# Select only US businesses
US <- data[data$Country %in% "united states", ]

# ---- City breakdown across all countries ----

city <- count_by(data$City, "City")
city_top <- droplevels(city[city$Count > 50, ])

# Plot the top cities
cityplot <- bar_plot(
  city_top,
  aes(x = City, y = Count),
  "MindBodyOnline Clients by City (50+ Customers)"
)
print(cityplot)

# ---- Industry breakdown ----

# Count by industry tag: all countries, the U.S. only, and New York City only
all <- count_by(data$Industry, "Industry")
us <- count_by(US$Industry, "Industry")
NYC <- US[US$City %in% "new york", ]
nyc <- count_by(NYC$Industry, "Industry")

# Plot the industries
allplot <- bar_plot(
  all,
  aes(x = Industry, y = Count),
  "MindBodyOnline Clients by Segment"
)
print(allplot)
usplot <- bar_plot(
  us,
  aes(x = Industry, y = Count),
  "MindBodyOnline Clients by Segment in the U.S."
)
print(usplot)
nycplot <- bar_plot(
  nyc,
  aes(x = Industry, y = Count),
  "MindBodyOnline Clients by Segment in New York City"
)
print(nycplot)

# ---- Keyword counts ----

# How many business names mention each keyword
sum(grepl("crossfit", data$Name, fixed = TRUE))
# Count business names that mention each keyword (names were lower-cased when
# the data was loaded, so the patterns are lower case too).
sum(grepl("crossfit", US$Name, fixed = TRUE))
sum(grepl("zumba", data$Name, fixed = TRUE))
sum(grepl("yoga", data$Name, fixed = TRUE))
sum(grepl("bikram", data$Name, fixed = TRUE))
sum(grepl("bikram", US$Name, fixed = TRUE))

# Save the plots

# Write one plot to a PDF file.
#
# plot:   the plot object to render.
# file:   output path, relative to the current working directory.
# width:  page width in inches.
# height: page height in inches.
#
# Returns `file` invisibly.
save_plot <- function(plot, file, width = 11, height = 8.5) {
  pdf(file = file, width = width, height = height)
  # Close the device even if rendering fails, so no PDF device is left open
  on.exit(dev.off(), add = TRUE)
  # Explicit print(): ggplot objects only auto-print at top level, so a bare
  # name would leave the PDF empty when this script is run via source()
  print(plot)
  invisible(file)
}

save_plot(countryplot, "MindBody Countries.pdf")
save_plot(cityplot, "MindBody Cities.pdf")
save_plot(allplot, "MindBody Industry Segments.pdf")
save_plot(usplot, "MindBody Industry Segments US.pdf")
# NOTE(review): nycplot is built earlier in the script but never saved here;
# confirm whether that is intentional.