Skip to content

Instantly share code, notes, and snippets.

@valentinitnelav
Last active April 9, 2017 08:32
Show Gist options
  • Save valentinitnelav/27abd6f9cf7128e4b7241d3ce7e74f3c to your computer and use it in GitHub Desktop.
Save valentinitnelav/27abd6f9cf7128e4b7241d3ce7e74f3c to your computer and use it in GitHub Desktop.
Read <table> HTML tag with {rvest} using CSS selectors
# Read <table> HTML tag with {rvest} using CSS selectors
# ====================================================
# Load library
library(rvest)
# =======================
# Read the web page [accessed 08-Apr-2017]
# =======================
# URL of the page to scrape
link <- "http://www.theplantlist.org/1.1/statistics/"
# NOTE: it is good etiquette to fetch the page once and reuse the stored copy,
# rather than re-reading it many times and overloading their server
link.scrap <- read_html(x = link)
# =======================
# Use the general "table" selector to read all tables,
# then select desired table
# =======================
# -----------------------
# Without piping:
# -----------------------
# Grab every <table> node from the stored page, then parse them all at once
tbl.nodes <- html_nodes(link.scrap, css = "table")
tbls.lst <- html_table(x = tbl.nodes)
# html_table() on a set of nodes yields a list (one entry per table),
# so a single table is reached by list indexing.
# Take the first table and, in the same step, discard its unwanted first column
my.tbl <- tbls.lst[[1]][, -1]
# give the third remaining column a proper name
names(my.tbl)[3] <- "Total_prc"
my.tbl
## Status Total Total_prc
## 1 Accepted 350,699 33.0%
## 2 Synonym 470,624 44.2%
## 3 Unplaced 243 0.0%
## 4 Unassessed 242,469 22.8%
# Note that every column comes back as character (see the structure below)!
str(my.tbl)
## 'data.frame': 4 obs. of 3 variables:
## $ Status : chr "Accepted" "Synonym" "Unplaced" "Unassessed"
## $ Total : chr "350,699" "470,624" "243" "242,469"
## $ Total_prc: chr "33.0%" "44.2%" "0.0%" "22.8%"
# -----------------------
# With piping:
# -----------------------
# Same steps as the non-piped version, expressed as a single chain
my.tbl.2 <- link.scrap %>%
  html_nodes(css = "table") %>%
  html_table() %>%
  .[[1]] %>%   # keep only the first table of the list
  .[, -1]      # discard the unwanted first column
# give the third remaining column a proper name
names(my.tbl.2)[3] <- "Total_prc"
my.tbl.2
# =======================
# Read one specific <table> tag from the page
# by directly using that table's own CSS selector
# =======================
# Target a single table through its full CSS selector path
my.tbl.3 <- link.scrap %>%
  html_nodes(css = "#columns > section > div:nth-child(5) > table") %>%
  html_table() %>%
  .[[1]] %>%   # the result is still a list; pull out its only element
  .[, -1]      # discard the unwanted first column
# give the third remaining column a proper name
names(my.tbl.3)[3] <- "Total_prc"
my.tbl.3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment