Created
January 8, 2019 23:26
-
-
Save neerajt/e66e028ee1e954388a043a9087318537 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Houston R Users Group - Web Scraping Script | |
# ========================================================================================================================================= | |
# Loading Necessary Libraries | |
library(rvest) | |
library(tidyverse) | |
library(reshape2) | |
# Other useful libraries | |
#library(XML) # Hides xml function from rvest | |
#library(RSelenium) # Helpful in dealing with dynamic webpages | |
# I always use Chrome (might work in other browsers, but I'm not familiar with them) | |
# Article PDF on basics that I used is here: http://stanford.edu/~wpmarble/webscraping_tutorial/webscraping_tutorial.pdf | |
# Disclaimer: Always review the website terms of service agreement | |
# ============================================= NBA Data from Sports Reference ======================================================== | |
# NBA Player-Season Data | |
# https://www.basketball-reference.com/ | |
url <- paste0( | |
"https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36", | |
"&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99", | |
"&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=ws") | |
# Including XPath | |
nba <- url %>% | |
read_html() %>% | |
html_nodes(xpath = '//*[@id="stats"]') %>% | |
html_table() %>% | |
.[[1]] | |
# But XPath is not actually needed here | |
nba <- url %>% | |
read_html() %>% | |
html_table() %>% | |
.[[1]] | |
# Cleaning up the scraped table | |
names(nba) <- nba[1,] | |
nba <- nba[-1,] | |
nba <- nba[nba$Player != "Player",] | |
# Checking out the data | |
table(nba$Player) %>% data.frame | |
ggplot(nba, aes(x = Age, y = WS, group = Player, color = Player)) + | |
geom_line() | |
#============================================== Opiates - CDC Data Scrape Script ======================================================= | |
# Data on Opiate prescriptions at the County-Year level | |
# https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html | |
# Scraping a single page | |
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html") | |
cdc16 <- url %>% | |
read_html() %>% | |
html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>% # Try commenting out this line | |
html_table() | |
cdc16 <- cdc16[[1]] | |
head(cdc16) | |
names(cdc16) <- make.names(names(cdc16)) | |
#rm(cdc16) | |
# Scrapping 2010 to 2016 to List and Making into a DF | |
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty20", 10:16, ".html") | |
dfList <- lapply(url, function(i) { | |
webpage <- read_html(i) | |
draft_table <- html_nodes(webpage, xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') | |
draft <- html_table(draft_table)[[1]] | |
}) | |
str(dfList) | |
dfList[[1]] %>% head | |
cdc <- dfList %>% reduce(left_join, by = c("County", "State", "FIPS County Code")) | |
rm(dfList) | |
# Fixing Format and Names | |
names(cdc) <- make.names(names(cdc)) | |
cdc <- melt(cdc, paste0("X20", 10:16, ".Prescribing.Rate"), id.vars = c("County", "State", "FIPS.County.Code")) | |
cdc$variable <- gsub("X", "", cdc$variable) | |
names(cdc)[4:5] <- c("Year", "Prescribing.Rate") | |
cdc$Year <- gsub(".Prescribing.Rate", "", cdc$Year) | |
cdc$Prescribing.Rate <- as.numeric(cdc$Prescribing.Rate) | |
# Keeping TX Only | |
cdc <- cdc[cdc$State %in% "TX",] | |
# Checking out the data | |
str(cdc) | |
table(cdc$Year) | |
require:gplots::plotmeans(cdc$Prescribing.Rate ~ cdc$Year) | |
ggplot(cdc, aes(x = Year, y = Prescribing.Rate, group = County, color = County)) + | |
geom_line() + | |
theme(legend.position="none") | |
# =============================================== Scraping State of the Union ========================================================= | |
# Transcribed State of the Union Speeches | |
# https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses | |
# Scraping State of the Union Speech | |
url <- "https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-2" | |
# XPath to the speech | |
speech <- url %>% | |
read_html() %>% | |
html_nodes(xpath = '//*[@id="block-system-main"]/div/div/div[1]/div[3]') %>% | |
html_text() | |
speech | |
# Another way to do it that gives a slightly different format | |
speechbypara <- url %>% | |
read_html() %>% | |
html_nodes("p") %>% | |
html_text() | |
speechbypara | |
# <h1>, <h2>,.,<h6>: Largest heading, second largest heading, etc. | |
# <p>: Paragraph elements | |
# <ul>: Unordered bulleted list | |
# <ol>: Ordered list | |
# <li>: Individual List item | |
# <div>: Division or section | |
# <table>: Table |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment