# Houston R Users Group - Web Scraping Script
# =========================================================================================================================================
# Loading Necessary Libraries
library(rvest)
library(tidyverse)
library(reshape2)
# Other useful libraries
#library(XML) # Hides xml function from rvest
#library(RSelenium) # Helpful in dealing with dynamic webpages
# I always use Chrome to inspect pages and copy XPaths (other browsers might work too, but I'm not familiar with them)
# The tutorial PDF on the basics that I used is here: http://stanford.edu/~wpmarble/webscraping_tutorial/webscraping_tutorial.pdf
# Disclaimer: Always review the website terms of service agreement
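# Optional: a programmatic check of a site's robots.txt (a sketch; requires the robotstxt package, not used in the rest of this script)
# robotstxt::paths_allowed("https://www.basketball-reference.com/")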
# ============================================= NBA Data from Sports Reference ========================================================
# NBA Player-Season Data
# https://www.basketball-reference.com/
url <- paste0(
  "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36",
  "&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99",
  "&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=ws"
)
# Including XPath
nba <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="stats"]') %>%
  html_table() %>%
  .[[1]]
# But XPath is not actually needed here
nba <- url %>%
  read_html() %>%
  html_table() %>%
  .[[1]]
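# A third option: a CSS selector (the rvest default); "#stats" matches the element with id="stats"
nba <- url %>%
  read_html() %>%
  html_nodes("#stats") %>%
  html_table() %>%
  .[[1]]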
# Cleaning up the scraped table: the first row holds the real column names,
# and the header row repeats throughout the table body
names(nba) <- nba[1,]
nba <- nba[-1,]
nba <- nba[nba$Player != "Player",]
# html_table() returns character columns here, so convert the plotting variables
nba$Age <- as.numeric(nba$Age)
nba$WS <- as.numeric(nba$WS)
# Checking out the data
table(nba$Player) %>% data.frame()
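# The dplyr equivalent: one row per player, sorted by number of seasons
nba %>% count(Player, sort = TRUE)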
ggplot(nba, aes(x = Age, y = WS, group = Player, color = Player)) +
  geom_line()
# ============================================== Opiates - CDC Data Scrape Script =======================================================
# Data on Opiate prescriptions at the County-Year level
# https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html
# Scraping a single page
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html")
cdc16 <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>% # Try commenting out this line
  html_table()
cdc16 <- cdc16[[1]]
head(cdc16)
names(cdc16) <- make.names(names(cdc16))
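# make.names() creates syntactic names, e.g. a column like "2016 Prescribing Rate" becomes "X2016.Prescribing.Rate"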
#rm(cdc16)
# Scraping 2010 to 2016 into a list, then combining into a single data frame
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty20", 10:16, ".html")
dfList <- lapply(url, function(i) {
  webpage <- read_html(i)
  draft_table <- html_nodes(webpage, xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table')
  html_table(draft_table)[[1]]
})
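# The same loop with purrr's map() (loaded with the tidyverse), as an equivalent alternative:
dfList <- map(url, ~ read_html(.x) %>%
  html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>%
  html_table() %>%
  .[[1]])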
str(dfList)
dfList[[1]] %>% head()
# reduce() left_joins the yearly tables one by one on the county identifiers, giving one wide data frame
cdc <- dfList %>% reduce(left_join, by = c("County", "State", "FIPS County Code"))
rm(dfList)
# Fixing Format and Names
names(cdc) <- make.names(names(cdc))
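# A tidyr alternative to the reshape2 melt below (a sketch; written to a separate
# object so the melt-based pipeline is unaffected):
cdc_long <- gather(cdc, key = "variable", value = "value", ends_with("Prescribing.Rate"))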
cdc <- melt(cdc, id.vars = c("County", "State", "FIPS.County.Code"),
            measure.vars = paste0("X20", 10:16, ".Prescribing.Rate"))
cdc$variable <- gsub("X", "", cdc$variable)
names(cdc)[4:5] <- c("Year", "Prescribing.Rate")
cdc$Year <- gsub(".Prescribing.Rate", "", cdc$Year)
cdc$Prescribing.Rate <- as.numeric(cdc$Prescribing.Rate)
# Keeping TX Only
cdc <- cdc[cdc$State %in% "TX",]
# Checking out the data
str(cdc)
table(cdc$Year)
gplots::plotmeans(cdc$Prescribing.Rate ~ cdc$Year) # requires the gplots package
ggplot(cdc, aes(x = Year, y = Prescribing.Rate, group = County, color = County)) +
  geom_line() +
  theme(legend.position = "none")
# =============================================== Scraping State of the Union =========================================================
# Transcribed State of the Union Speeches
# https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses
# Scraping State of the Union Speech
url <- "https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-2"
# XPath to the speech
speech <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="block-system-main"]/div/div/div[1]/div[3]') %>%
  html_text()
speech
# Another way to do it: grab every <p> element, which returns one character string per paragraph
speechbypara <- url %>%
  read_html() %>%
  html_nodes("p") %>%
  html_text()
speechbypara
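# Quick sanity checks on the scraped text
nchar(speech)                          # total characters
length(strsplit(speech, "\\s+")[[1]])  # rough word count
length(speechbypara)                   # number of <p> elements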
# <h1>, <h2>, ..., <h6>: Largest heading, second largest heading, etc.
# <p>: Paragraph elements
# <ul>: Unordered bulleted list
# <ol>: Ordered list
# <li>: Individual List item
# <div>: Division or section
# <table>: Table
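# These tag names double as CSS selectors in rvest, e.g. pulling the headings from the page above:
url %>%
  read_html() %>%
  html_nodes("h1") %>%
  html_text()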