@lmullen
Created January 23, 2015 16:16
Scraping the Joseph Smith Papers
library(rvest)
library(dplyr)
library(magrittr)

# First find the list of people and parse out their names and urls.
base <- "http://josephsmithpapers.org"
list_of_people <- "/reference/people#a::"

results <- paste0(base, list_of_people) %>%
  read_html() %>%
  html_nodes(".alphaItem")

names <- results %>%
  html_text()

path <- results %>%
  html_attr("href")

people <- tibble(names, path)
# Given the url of a person's page, extract the biographical metadata
# and the papers that mention that person.
get_person_data <- function(url) {
  result <- read_html(url)
  full_name <- result %>%
    html_node(".metadata:nth-child(1) dd") %>%
    html_text()
  gender <- result %>%
    html_node(".metadata:nth-child(2) dd") %>%
    html_text()
  # The biographical sketch is the third paragraph on the page
  bio <- result %>%
    html_nodes("p") %>%
    .[3] %>%
    html_text()
  # Titles of the papers that mention this person, kept as a list-column
  # so that each person stays a single row
  mentions <- result %>%
    html_nodes("#paper-link a") %>%
    html_text()
  tibble(full_name, gender, bio, mentions = list(mentions))
}
# Try it out on the first person in the list.
temp <- paste0(base, people$path[1]) %>%
  get_person_data()
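
# A possible extension, not part of the steps above: a minimal sketch of how
# one might scrape every person rather than just the first, assuming the same
# selectors hold on every person page. It pauses between requests to be
# polite to the server.
all_people <- people$path %>%
  lapply(function(p) {
    Sys.sleep(1)  # wait one second between requests
    get_person_data(paste0(base, p))
  }) %>%
  bind_rows()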