valentinitnelav · April 9, 2017 08:32
diff --git a/rvest_read_tbl.R b/rvest_read_tbl.R
 # Read <table> HTML tag with {rvest} using CSS selectors
 # ====================================================

 # Load library
 library(rvest)

 # ---- Download the page once [accessed 08-Apr-2017] ----
 # Be considerate to the host: fetch the page a single time, keep the parsed
 # document in memory, and reuse it below instead of requesting it again
 # and again (which would needlessly load their server).
 link <- "http://www.theplantlist.org/1.1/statistics/"
 link.scrap <- read_html(x = link)

 # ---- Approach 1: match every <table>, then pick one (no pipes) ----
 # The generic "table" CSS selector matches all <table> nodes of the page.
 tbl.nodes <- html_nodes(link.scrap, css = "table")
 # html_table() returns a list of tables, so a single table is reached
 # by indexing into that list.
 tbls.lst <- html_table(x = tbl.nodes)
 # Keep only the first table and, in the same step, drop its unwanted
 # first column.
 my.tbl <- tbls.lst[[1]][, -1]
 # Rename the third column.
 colnames(my.tbl)[3] <- "Total_prc"
 my.tbl
 ##       Status   Total Total_prc
 ## 1   Accepted 350,699     33.0%
 ## 2    Synonym 470,624     44.2%
 ## 3   Unplaced     243      0.0%
 ## 4 Unassessed 242,469     22.8%

 # In the output recorded below, every column is of type character!
 str(my.tbl)
 ## 'data.frame': 4 obs. of  3 variables:
 ##  $ Status   : chr  "Accepted" "Synonym" "Unplaced" "Unassessed"
 ##  $ Total    : chr  "350,699" "470,624" "243" "242,469"
 ##  $ Total_prc: chr  "33.0%" "44.2%" "0.0%" "22.8%"

 # ---- Approach 1 again, this time as a single pipeline ----
 # Same steps as the non-piped version: match all tables, parse them,
 # take the first one, remove its first column.
 my.tbl.2 <- link.scrap %>%
   html_nodes(css = "table") %>%
   html_table() %>%
   .[[1]] %>%  # first table of the list
   .[, -1]     # without the unwanted first column
 # Rename the third column.
 colnames(my.tbl.2)[3] <- "Total_prc"
 my.tbl.2

 # ---- Approach 2: target one table directly by its CSS selector ----
 # Instead of matching every <table>, pass the selector of the desired
 # table itself.
 my.tbl.3 <- link.scrap %>%
   html_nodes(css = "#columns > section > div:nth-child(5) > table") %>%
   html_table() %>%
   .[[1]] %>%  # html_table() still returns a list; take its only element
   .[, -1]     # without the unwanted first column
 # Rename the third column.
 colnames(my.tbl.3)[3] <- "Total_prc"
 my.tbl.3
	# Read <table> HTML tag with {rvest} using CSS selectors
	# ====================================================

	# Load library
	library(rvest)

	# ---- Download the page once [accessed 08-Apr-2017] ----
	# Be considerate to the host: fetch the page a single time, keep the parsed
	# document in memory, and reuse it below instead of requesting it again
	# and again (which would needlessly load their server).
	link <- "http://www.theplantlist.org/1.1/statistics/"
	link.scrap <- read_html(x = link)

	# ---- Approach 1: match every <table>, then pick one (no pipes) ----
	# The generic "table" CSS selector matches all <table> nodes of the page.
	tbl.nodes <- html_nodes(link.scrap, css = "table")
	# html_table() returns a list of tables, so a single table is reached
	# by indexing into that list.
	tbls.lst <- html_table(x = tbl.nodes)
	# Keep only the first table and, in the same step, drop its unwanted
	# first column.
	my.tbl <- tbls.lst[[1]][, -1]
	# Rename the third column.
	colnames(my.tbl)[3] <- "Total_prc"
	my.tbl
	##       Status   Total Total_prc
	## 1   Accepted 350,699     33.0%
	## 2    Synonym 470,624     44.2%
	## 3   Unplaced     243      0.0%
	## 4 Unassessed 242,469     22.8%

	# In the output recorded below, every column is of type character!
	str(my.tbl)
	## 'data.frame': 4 obs. of  3 variables:
	##  $ Status   : chr  "Accepted" "Synonym" "Unplaced" "Unassessed"
	##  $ Total    : chr  "350,699" "470,624" "243" "242,469"
	##  $ Total_prc: chr  "33.0%" "44.2%" "0.0%" "22.8%"

	# ---- Approach 1 again, this time as a single pipeline ----
	# Same steps as the non-piped version: match all tables, parse them,
	# take the first one, remove its first column.
	my.tbl.2 <- link.scrap %>%
	  html_nodes(css = "table") %>%
	  html_table() %>%
	  .[[1]] %>%  # first table of the list
	  .[, -1]     # without the unwanted first column
	# Rename the third column.
	colnames(my.tbl.2)[3] <- "Total_prc"
	my.tbl.2

	# ---- Approach 2: target one table directly by its CSS selector ----
	# Instead of matching every <table>, pass the selector of the desired
	# table itself.
	my.tbl.3 <- link.scrap %>%
	  html_nodes(css = "#columns > section > div:nth-child(5) > table") %>%
	  html_table() %>%
	  .[[1]] %>%  # html_table() still returns a list; take its only element
	  .[, -1]     # without the unwanted first column
	# Rename the third column.
	colnames(my.tbl.3)[3] <- "Total_prc"
	my.tbl.3