# Houston R Users Group - Web Scraping Script
# =========================================================================================================================================
# Loading Necessary Libraries
library(rvest)
library(tidyverse)
library(reshape2)
# Other useful libraries
#library(XML) # Hides xml function from rvest
#library(RSelenium) # Helpful in dealing with dynamic webpages
# I always use Chrome to inspect pages and copy XPaths (other browsers might work too, but I'm not familiar with them)
# The tutorial PDF on the basics that I used is here: http://stanford.edu/~wpmarble/webscraping_tutorial/webscraping_tutorial.pdf
# Disclaimer: Always review the website terms of service agreement
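# Optional: a programmatic check of a site's robots.txt (a sketch; requires the robotstxt package, not used in the rest of this script)
# robotstxt::paths_allowed("https://www.basketball-reference.com/")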
# ============================================= NBA Data from Sports Reference ========================================================
# NBA Player-Season Data
# https://www.basketball-reference.com/
url <- paste0(
  "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36",
  "&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99",
  "&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=ws"
)
# Including XPath
nba <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="stats"]') %>%
  html_table() %>%
  .[[1]]
# But XPath is not actually needed here
nba <- url %>%
  read_html() %>%
  html_table() %>%
  .[[1]]
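# A third option: a CSS selector (the rvest default); "#stats" matches the element with id="stats"
nba <- url %>%
  read_html() %>%
  html_nodes("#stats") %>%
  html_table() %>%
  .[[1]]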
# Cleaning up the scraped table: the first row holds the real column names,
# and the header row repeats throughout the table body
names(nba) <- nba[1,]
nba <- nba[-1,]
nba <- nba[nba$Player != "Player",]
# html_table() returns character columns here, so convert the plotting variables
nba$Age <- as.numeric(nba$Age)
nba$WS <- as.numeric(nba$WS)
# Checking out the data
table(nba$Player) %>% data.frame()
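# The dplyr equivalent: one row per player, sorted by number of seasons
nba %>% count(Player, sort = TRUE)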
ggplot(nba, aes(x = Age, y = WS, group = Player, color = Player)) +
  geom_line()
# ============================================== Opiates - CDC Data Scrape Script =======================================================
# Data on Opiate prescriptions at the County-Year level
# https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html
# Scraping a single page
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html")
cdc16 <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>% # Try commenting out this line
  html_table()
cdc16 <- cdc16[[1]]
head(cdc16)
names(cdc16) <- make.names(names(cdc16))
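# make.names() creates syntactic names, e.g. a column like "2016 Prescribing Rate" becomes "X2016.Prescribing.Rate"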
#rm(cdc16)
# Scraping 2010 to 2016 into a list, then combining into a single data frame
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty20", 10:16, ".html")
dfList <- lapply(url, function(i) {
  webpage <- read_html(i)
  draft_table <- html_nodes(webpage, xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table')
  html_table(draft_table)[[1]]
})
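# The same loop with purrr's map() (loaded with the tidyverse), as an equivalent alternative:
dfList <- map(url, ~ read_html(.x) %>%
  html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>%
  html_table() %>%
  .[[1]])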
str(dfList)
dfList[[1]] %>% head()
# reduce() left_joins the yearly tables one by one on the county identifiers, giving one wide data frame
cdc <- dfList %>% reduce(left_join, by = c("County", "State", "FIPS County Code"))
rm(dfList)
# Fixing Format and Names
names(cdc) <- make.names(names(cdc))
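# A tidyr alternative to the reshape2 melt below (a sketch; written to a separate
# object so the melt-based pipeline is unaffected):
cdc_long <- gather(cdc, key = "variable", value = "value", ends_with("Prescribing.Rate"))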
cdc <- melt(cdc, id.vars = c("County", "State", "FIPS.County.Code"),
            measure.vars = paste0("X20", 10:16, ".Prescribing.Rate"))
cdc$variable <- gsub("X", "", cdc$variable)
names(cdc)[4:5] <- c("Year", "Prescribing.Rate")
cdc$Year <- gsub(".Prescribing.Rate", "", cdc$Year)
cdc$Prescribing.Rate <- as.numeric(cdc$Prescribing.Rate)
# Keeping TX Only
cdc <- cdc[cdc$State %in% "TX",]
# Checking out the data
str(cdc)
table(cdc$Year)
gplots::plotmeans(cdc$Prescribing.Rate ~ cdc$Year) # requires the gplots package
ggplot(cdc, aes(x = Year, y = Prescribing.Rate, group = County, color = County)) +
  geom_line() +
  theme(legend.position = "none")
# =============================================== Scraping State of the Union =========================================================
# Transcribed State of the Union Speeches
# https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses
# Scraping State of the Union Speech
url <- "https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-2"
# XPath to the speech
speech <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="block-system-main"]/div/div/div[1]/div[3]') %>%
  html_text()
speech
# Another way to do it: grab every <p> element, which returns one character string per paragraph
speechbypara <- url %>%
  read_html() %>%
  html_nodes("p") %>%
  html_text()
speechbypara
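# Quick sanity checks on the scraped text
nchar(speech)                          # total characters
length(strsplit(speech, "\\s+")[[1]])  # rough word count
length(speechbypara)                   # number of <p> elements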
# <h1>, <h2>, ..., <h6>: Largest heading, second largest heading, etc.
# <p>: Paragraph elements
# <ul>: Unordered bulleted list
# <ol>: Ordered list
# <li>: Individual List item
# <div>: Division or section
# <table>: Table
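# These tag names double as CSS selectors in rvest, e.g. pulling the headings from the page above:
url %>%
  read_html() %>%
  html_nodes("h1") %>%
  html_text()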