
@lmullen
Created January 23, 2015 16:16

scrape-smith-papers.R
    library(rvest)
    library(dplyr)
    library(magrittr)

    # First find the list of people and parse out their names and urls.
base <- "http://josephsmithpapers.org"
list_of_people <- "/reference/people#a::"
results <- paste0(base, list_of_people) %>%
  read_html() %>%    # html() has since been deprecated in rvest; read_html() is the current equivalent
  html_nodes(".alphaItem")

# html_text() and html_attr() already return character vectors, so unlist() is unnecessary
names <- results %>%
  html_text()

path <- results %>%
  html_attr("href")

people <- tibble(names, path)    # data_frame() is deprecated in favor of tibble()

get_person_data <- function(url) {
  result <- read_html(url)
  full_name <- result %>%
    html_node(".metadata:nth-child(1) dd") %>%
    html_text()
  gender <- result %>%
    html_node(".metadata:nth-child(2) dd") %>%
    html_text()
  # The third <p> on a person page holds the biography
  bio <- result %>%
    html_nodes("p") %>%
    .[3] %>%
    html_text()
  # Collapse the links to papers mentioning this person into one string;
  # bind_cols() on a raw list of nodes would fail
  mentions <- result %>%
    html_nodes("#paper-link a") %>%
    html_attr("href") %>%
    paste(collapse = "; ")
  tibble(full_name, gender, bio, mentions)
}

temp <- paste0(base, people$path[1]) %>%    # people[1, 2] returns a one-cell tibble, not a string
  get_person_data()
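The last line only scrapes the first person as a test. To collect the whole list, the same function can be mapped over every path, a sketch assuming the selectors above still match the site and adding a pause to be polite to the server:

```r
# Scrape every person page and stack the results into one data frame
all_people <- lapply(paste0(base, people$path), function(u) {
  Sys.sleep(1)    # pause between requests to avoid hammering the server
  get_person_data(u)
}) %>%
  bind_rows()
```

Wrapping `get_person_data()` in `purrr::possibly()` would let the loop skip pages that fail to parse rather than aborting the whole run.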