library(dplyr) # Helps manipulate data # Let's create example datasets records <- data.frame( user_id = c(1, 2, 3, 4, 5), location = c("A", "B", "C", "C", "D") ) # user_id location # 1 1 A # 2 2 B # 3 3 C # 4 4 C # 5 5 D transactions <- data.frame( user_id = c(1, 1, 1, 2, 3, 3, 3, 4, 5, 5), amt = c(50, 30, 20, 25, 42, 67, 43, 82, 90, 9) ) # user_id amt # 1 1 50 # 2 1 30 # 3 1 20 # 4 2 25 # 5 3 42 # 6 3 67 # 7 3 43 # 8 4 82 # 9 5 90 # 10 5 9 census <- data.frame( locale = c("A", "B", "E"), poverty_rate = c("0.19", "0.15", "0.25") ) # locale poverty_rate # 1 A 0.19 # 2 B 0.15 # 3 E 0.25 # A few types of matches # - inner join # - full join (also know as outer join) # - left join # - right join # inner join: # - Return only matched in both datasets # - Notice in "by" we use the name of merging variable in the # first data.frame first and the name of the second data.frame inner_join( records, census, by = c("location" = "locale") ) # user_id location poverty_rate # 1 1 A 0.19 # 2 2 B 0.15 # full join: # - Return matched and unmatched cases full_join( records, census, by = c("location" = "locale") ) # user_id location poverty_rate # 1 1 A 0.19 # 2 2 B 0.15 # 3 3 C # 4 4 C # 5 5 D # 6 NA E 0.25 # left join: # - Return matched cases and return unmatched cases from the first data.frame left_join( records, census, by = c("location" = "locale") ) # user_id location poverty_rate # 1 1 A 0.19 # 2 2 B 0.15 # 3 3 C # 4 4 C # 5 5 D # right join: # - Return matched cases and unmatched cases from the second data.frame right_join( records, census, by = c("location" = "locale") ) # user_id location poverty_rate # 1 1 A 0.19 # 2 2 B 0.15 # 3 NA E 0.25