Skip to content

Instantly share code, notes, and snippets.

@potterzot
Last active October 11, 2019 19:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save potterzot/c55e827372a30a100f84249678f7e150 to your computer and use it in GitHub Desktop.
Save potterzot/c55e827372a30a100f84249678f7e150 to your computer and use it in GitHub Desktop.
R script for data carpentry lesson
################################################################
# REFERENCE CODE THAT THE LESSON IS BASED ON.
# THE NEXT SECTION HOLDS THE CODE ACTUALLY WRITTEN DURING THE LESSON.
################################################################
# Install the packages used in the lesson, but only those that are not
# already available, so that re-running the script does not reinstall
# everything each time.
lesson_pkgs <- c("tidyverse", "here", "reprex")
missing_pkgs <- lesson_pkgs[
  !vapply(lesson_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
### Resources
# Stack Overflow: https://stackoverflow.com
# Jenny Bryan (@jennybryan) and the #rstats hashtag on twitter
# Advanced R book (for all levels): https://adv-r.hadley.nz/
# Geocomputation for R book: https://geocompr.robinlovelace.net/
### R Package examples
# https://github.com/ropensci/rnoaa
# https://github.com/tidyverse/reprex
# https://cran.r-project.org/web/packages/rnassqs/index.html
### Remember these shortcuts:
# CTRL+Enter: runs the code block
# CTRL+1: Switch to script window
# CTRL+2: Switch to console window
#### Load libraries ----
library(tidyverse)
library(reprex)
library(here)
# Set up a project (usually you've already done this)
# 1. Create a project in RStudio
# 2. Create the project sub-directories. Each directory is only created
#    when it does not exist yet, so re-running the script does not trigger
#    "already exists" warnings.
for (project_dir in c("data_raw", "data", "fig", "src")) {
  if (!dir.exists(project_dir)) {
    dir.create(project_dir)
  }
}
#### Download and load data ----
# Build the file path once with here() so that the existence check, the
# download target and the read all point at the same file, whatever the
# current working directory is.
rodent_counts_path <- here("data_raw", "rodent_counts.csv")
if (!file.exists(rodent_counts_path)) {
  # Make sure the target directory exists before downloading into it.
  dir.create(dirname(rodent_counts_path),
             showWarnings = FALSE, recursive = TRUE)
  download.file(
    url = "https://ndownloader.figshare.com/files/2292169",
    destfile = rodent_counts_path
  )
}
# using rodent_counts
surveys <- read_csv(rodent_counts_path)
# Other ways to read delimited files:
# read.csv
# read.table
# read_
# help(read.csv)
#### dplyr verbs ----
## Selecting
# Keep only three columns of the survey data.
sel_surveys <- select(surveys, plot_id, species_id, weight)
# Inspect the selection just created (the original referenced a
# non-existent object, sub_surveys).
head(sel_surveys)

## Filtering
range(surveys$year)
# Keep only the rows recorded in 1995.
fil_surveys <- filter(surveys, year == 1995)

## Pipes
# EXERCISE: How would you filter weight < 5 and include species_id, sex,
# and weight?
# Nested-call version:
surveys_sml <- select(filter(surveys, weight < 5), species_id, sex, weight)
# Piping is more readable
surveys_sml <- surveys %>%
  select(species_id, sex, weight) %>%
  filter(weight < 5)
## Mutate
# Add one derived column.
mutate(surveys, weight_kg = weight / 1000)

# Add two derived columns; the second one is built from the first.
surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_lb = weight_kg * 2.2
  )

# Drop rows with a missing weight before converting, then peek at the result.
filter(surveys, !is.na(weight)) %>%
  mutate(weight_kg = weight / 1000) %>%
  head()

# EXERCISE: Create a new data frame that meets the following criteria:
# 1. contains only the species_id column and a new column called
#    hindfoot_half
# 2. hindfoot_half is half of hindfoot_length values
# 3. There are no NA values and all values are less than 30.
surveys_hindfoot_half <- filter(surveys, !is.na(hindfoot_length)) %>%
  transmute(species_id, hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30)
## Split-Apply-Combine
# Mean weight per sex; missing weights are skipped via na.rm = TRUE.
surveys %>%
  group_by(sex) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))

# Mean weight per sex / species_id combination.
surveys %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))

# Same summary, with the rows lacking a weight removed up front.
filter(surveys, !is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))

# Several summaries per group in a single summarise() call.
filter(surveys, !is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(
    mean_weight = mean(weight),
    min_weight = min(weight)
  )

# As above, then ordered by the per-group minimum weight.
filter(surveys, !is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarise(
    mean_weight = mean(weight),
    min_weight = min(weight)
  ) %>%
  arrange(min_weight)
## count(), n(), N(), row_number()
# Number of rows per sex, sorted by count.
count(surveys, sex, sort = TRUE)

# Number of rows per species / sex group, sorted by count.
surveys %>%
  group_by(species, sex) %>%
  count(sort = TRUE)

# First row of every species / sex group.
surveys %>%
  group_by(species, sex) %>%
  filter(row_number() == 1)

# Last row of every species / sex group, with the group size stored in a
# helper column first.
surveys %>%
  group_by(species, sex) %>%
  mutate(n_grp = n()) %>%
  filter(row_number() == n_grp) %>%
  head()

# Last row of every species / plot_id group, using n() directly.
surveys %>%
  group_by(species, plot_id) %>%
  filter(row_number() == n()) %>%
  head()
# EXERCISE: How would you select the record with the group max weight?
# 1. Group by sex, species
# 2. For each group, select the observation with the maximum
# 3. Sort by species and sex
# 4. return just species, sex, and weight

# Approach 1: compare each weight with the maximum of its group.
filter(surveys, !is.na(weight)) %>%
  group_by(sex, species) %>%
  filter(weight == max(weight)) %>%
  select(species, sex, weight) %>%
  arrange(species, sex)

# Approach 2: sort ascending by weight and keep the last row of each group.
filter(surveys, !is.na(weight)) %>%
  group_by(sex, species) %>%
  arrange(weight) %>%
  filter(row_number() == n()) %>%
  select(species, sex, weight) %>%
  arrange(species, sex)

# Approach 3: sort descending by weight and keep the first row of each group.
filter(surveys, !is.na(weight)) %>%
  group_by(sex, species) %>%
  arrange(desc(weight)) %>%
  filter(row_number() == 1) %>%
  select(species, sex, weight) %>%
  arrange(species, sex)
## Joins and mutate_at, mutate_if
# left_join: keeps all records of the first data.frame
# right_join: keeps all records of the second data.frame
# inner_join: keeps records that are in both
# full_join: keeps all records from either

# Per-group means of weight and hindfoot length.
survey_means <- surveys %>%
  group_by(species, sex) %>%
  summarise(
    weight_mean = mean(weight, na.rm = TRUE),
    hindfoot_mean = mean(hindfoot_length, na.rm = TRUE)
  )

# Per-group standard deviations of weight and hindfoot length.
survey_sd <- surveys %>%
  group_by(species, sex) %>%
  summarise(
    weight_sd = sd(weight, na.rm = TRUE),
    hindfoot_sd = sd(hindfoot_length, na.rm = TRUE)
  )

# Attach each summary table to the full survey data.
survey_join1 <- surveys %>%
  left_join(survey_means, by = c("species", "sex"))
survey_join2 <- surveys %>%
  left_join(survey_sd, by = c("species", "sex"))

# Reduce alternative: chain left_join() over a list of tables.
Reduce(left_join, list(surveys, survey_means, survey_sd))
# Exercise: How could you do this in one call with mutate?
# A grouped mutate() computes the group statistics and the per-row
# deviations in one step, so no join is needed.
# (The original was missing the commas after the weight_sd and weight_dev
# arguments, which made the whole expression a syntax error.)
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species, sex) %>%
  mutate(
    weight_mean = mean(weight, na.rm = TRUE),
    weight_sd = sd(weight),
    weight_dev = weight - weight_mean,
    hindfoot_mean = mean(hindfoot_length, na.rm = TRUE),
    hindfoot_sd = sd(hindfoot_length, na.rm = TRUE),
    hindfoot_dev = hindfoot_length - hindfoot_mean
  ) %>%
  select(
    species, sex,
    weight, weight_dev, weight_mean,
    hindfoot_length, hindfoot_dev, hindfoot_mean
  )
## Tidy data:
# Each variable has its own column
# Each observation has its own row
# Each value must have its own cell
# Each type of observational unit forms a table

# Mean weight for every genus / plot_id combination (long format).
surveys_gw <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(genus, plot_id) %>%
  summarize(mean_weight = mean(weight))

## Spread
# https://datacarpentry.org/R-ecology-lesson/img/spread_data_R.png
help(spread)
# One column per genus, one row per plot_id.
surveys_gw %>%
  spread(genus, mean_weight) %>%
  head()
# Same reshape, with 0 in place of missing genus / plot combinations.
surveys_wide <- surveys_gw %>%
  spread(key = genus, value = mean_weight, fill = 0)
# Inspect the wide table just created (the original referenced a
# non-existent object, surveys_spread).
head(surveys_wide)

## Gather
# https://datacarpentry.org/R-ecology-lesson/img/gather_data_R.png
# Collapse every column except plot_id back into genus / mean_weight pairs.
surveys_wide %>%
  gather(key = genus, value = mean_weight, -plot_id)
surveys_long <- surveys_wide %>%
  gather(key = genus, value = mean_weight, -plot_id)
#########################################
# CODE ACTUALLY WRITTEN DURING THE LESSON
#########################################
# Install the packages used in the lesson, but only those that are not
# already available, so that re-running the script does not reinstall
# everything each time.
lesson_pkgs <- c("tidyverse", "here", "reprex")
missing_pkgs <- lesson_pkgs[
  !vapply(lesson_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
### Resources
# Stack Overflow: https://stackoverflow.com
# Jenny Bryan (@jennybryan) and the #rstats hashtag on twitter
# Advanced R book (for all levels): https://adv-r.hadley.nz/
# Geocomputation for R book: https://geocompr.robinlovelace.net/
### R Package examples
# https://github.com/ropensci/rnoaa
# https://github.com/tidyverse/reprex
# https://cran.r-project.org/web/packages/rnassqs/index.html
#### Setup and load libraries ----
# "here" and "reprex" are already installed at the top of this section, so
# the second install.packages() call that used to sit here was dropped.
library(tidyverse)
library(here)
library(reprex)
library(purrr)
library(forcats)
# Create the project sub-directories. Every directory gets the same
# existence check (the original only guarded "src"), so re-running the
# script does not trigger "already exists" warnings.
for (project_dir in c("data_raw", "data", "fig", "src")) {
  if (!dir.exists(project_dir)) {
    dir.create(project_dir)
  }
}
#### Load and setup data ----
# Build the file path once with here() so that the download target and both
# reads point at the same file, whatever the current working directory is.
rodent_counts_path <- here("data_raw", "rodent_counts.csv")
# Only download when the file is not there yet, so re-running the script
# does not fetch the data again.
if (!file.exists(rodent_counts_path)) {
  # Make sure the target directory exists before downloading into it.
  dir.create(dirname(rodent_counts_path),
             showWarnings = FALSE, recursive = TRUE)
  download.file(
    url = "https://ndownloader.figshare.com/files/2292169",
    destfile = rodent_counts_path
  )
}
# readr version of the import.
surveys <- read_csv(rodent_counts_path)
dim(surveys)
head(surveys)
class(surveys)
surveys
# Base R version of the import, to compare the class of the result.
surveys_alt <- read.csv(rodent_counts_path)
class(surveys_alt)
## Using dplyr
# Base R indexing: first three rows of the first column.
surveys[1:3, 1]
# dplyr: pick columns by position or by name.
surveys %>% select(1)
surveys %>% select(record_id, year)
head(surveys)
# A leading minus drops the named columns.
surveys %>% select(-genus, -species)
# Row subsetting: base R versus filter().
surveys[surveys$year < 1995, ]
surveys %>% filter(year < 1995)
surveys %>% filter(!is.na(weight))

## Exercise: Filter weight less than 5 and include species_id, sex, and weight
# Version 1: intermediate object.
filteredWeight <- filter(surveys, weight < 5)
select(filteredWeight, species_id, sex, weight)
# Version 2: pipe the filtered rows into select().
filter(surveys, weight < 5) %>%
  select(species_id, sex, weight)
# Version 3: start the pipe from the data itself.
surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)
## Mutate / transmute
# mutate() adds columns next to the existing ones; select() then narrows
# the result down to the columns of interest.
filter(surveys, !is.na(weight)) %>%
  mutate(
    weight_kg = weight / 1000,
    weight_lb = weight_kg * 2.2
  ) %>%
  select(record_id, weight, weight_kg, weight_lb)

# transmute() returns only the columns named in the call.
filter(surveys, !is.na(weight)) %>%
  transmute(
    record_id,
    weight_kg = weight / 1000,
    weight_lb = weight_kg * 2.2
  )
## Exercise
# 1. contains only the species_id column and a new column hindfoot_half
# 2. hindfoot_half = hindfoot_length / 2
# 3. values of hindfoot_half < 30 and are not NA

# Solution A: build the two columns first, then filter on the new column.
surveys %>%
  transmute(species_id,
            hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30, !is.na(hindfoot_half))

# Solution B: same as A with the filter conditions in the other order.
surveys %>%
  transmute(species_id,
            hindfoot_half = hindfoot_length / 2) %>%
  filter(!is.na(hindfoot_half),
         hindfoot_half < 30)

# Solution C: filter on the original column (half < 30 means length < 60),
# then build the result.
surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  filter(hindfoot_length < 60) %>%
  transmute(species_id,
            hindfoot_half = hindfoot_length / 2)

# Solution D: mutate, filter, and select the two columns at the end.
surveys %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30) %>%
  filter(!is.na(hindfoot_length)) %>%
  select(species_id, hindfoot_half)

# Solution E: keeps hindfoot_length next to hindfoot_half for comparison.
# mutate() is required here: transmute() would drop species_id and
# hindfoot_length, and the select() below would then fail.
surveys %>%
  filter(!is.na(hindfoot_length) & hindfoot_length < 60) %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  select(species_id, hindfoot_length, hindfoot_half)

# For reference: mutate() alone keeps every original column.
surveys %>%
  mutate(hindfoot_half = hindfoot_length / 2)
## Grouping variables and applying
## Split-Apply-Combine
names(surveys)

# Mean weight per species_id / sex group.
filter(surveys, !is.na(weight)) %>%
  group_by(species_id, sex) %>%
  summarise(weight = mean(weight))

# Records of species "AB", reduced to three columns.
ab <- filter(surveys, species_id == "AB") %>%
  select(species_id, sex, weight)

# Several statistics per group, including the group size.
filter(surveys, !is.na(weight)) %>%
  group_by(species_id, sex) %>%
  summarise(
    weight_mean = mean(weight),
    weight_sd = sd(weight),
    weight_min = min(weight),
    grp_n = n()
  )

# Rows whose weight equals the minimum of their group.
filter(surveys, !is.na(weight)) %>%
  group_by(species_id, sex) %>%
  filter(weight == min(weight))

# (Same query again, as run a second time during the lesson.)
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  filter(weight == min(weight))

# Group means ordered from largest to smallest.
filter(surveys, !is.na(weight)) %>%
  group_by(species_id, sex) %>%
  summarise(weight_mean = mean(weight)) %>%
  arrange(desc(weight_mean))

# Last row of each group.
filter(surveys, !is.na(weight)) %>%
  group_by(species_id, sex) %>%
  filter(row_number() == n())
## Exercise:
# Select the row with the maximum weight for each species_id and sex

# Sort by weight, then keep the last row of each group.
# (The original was missing the pipe after arrange(), so filter() was run
# as a separate call without any data and failed.)
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  arrange(weight) %>%
  filter(row_number() == n())

# Small example to see how arrange() + row_number() pick the maximum.
x <- data.frame(
  record_id = c("a", "b", "c"),
  weight = c(1, 4.5, 3)
)
x
# Ascending sort: the maximum is the last row.
x %>%
  arrange(weight) %>%
  filter(row_number() == n())
# Descending sort: the maximum is the first row.
x %>%
  arrange(desc(weight)) %>%
  filter(row_number() == 1)

# Compare with the group maximum, then keep a single row per group in case
# several rows share that maximum.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  filter(weight == max(weight)) %>%
  filter(row_number() == 1)

# Descending sort within the data, also dropping rows with unknown sex.
surveys %>%
  filter(!is.na(weight), !is.na(sex)) %>%
  group_by(species_id, sex) %>%
  arrange(desc(weight)) %>%
  filter(row_number() == 1)
## Means and standard deviations by group (species_id, plot_id)
# gm: mean weight of each species_id / plot_id group.
gm <- filter(surveys, !is.na(weight)) %>%
  group_by(species_id, plot_id) %>%
  summarise(weight_gm = mean(weight))

# So that we don't have to filter by is.na every time
surveys_w_weight <- filter(surveys, !is.na(weight))

# gd: each record's deviation from the mean weight of its group.
gd <- surveys_w_weight %>%
  group_by(species_id, plot_id) %>%
  transmute(
    record_id,
    weight_gd = weight - mean(weight)
  ) %>%
  ungroup()
head(gd)
## Merging data
# left_join(x, y, by = <some variable>)  # keeps all records from x
# right_join(x, y, by = ...)  # keeps all records from y, only matching from x
# inner_join(x, y, by = ...)  # keeps only records that matched from both
# full_join(x, y, by = ...)   # keeps all records from x and all from y

# Attach the group means; only the rows returned by head() are stored.
surveys2 <- surveys %>%
  left_join(gm, by = c("species_id", "plot_id")) %>%
  select(record_id, species_id, plot_id, weight, weight_gm) %>%
  head()

# Attach the group deviations by record, without the grouping columns of gd.
surveys3 <- surveys2 %>%
  left_join(select(gd, -species_id, -plot_id), by = "record_id")
head(surveys3)

# Exercise
# 1. Create a data set like surveys but with the two additional variables:
# a. weight_gm
# b. weight_gd
surveys_w_weight %>%
  group_by(species_id, plot_id) %>%
  mutate(
    weight_gm = mean(weight),
    weight_gd = weight - weight_gm
  ) %>%
  select(
    record_id, species_id, plot_id,
    weight, weight_gm, weight_gd
  ) %>%
  head()
# Compare with the joined version.
head(surveys3)

## One more thing on merging
# Reduce() chains left_join() over a list of tables.
surveys3a <- Reduce(left_join, list(surveys, gm, gd))
## Tidy Data
# Each variable has its own column
# Each observation has its own row
# Each value has its own cell
head(surveys)

# Mean weight for every genus / plot_id combination.
surveys_gm <- surveys_w_weight %>%
  group_by(genus, plot_id) %>%
  summarise(weight_mean = mean(weight))
surveys_gm

## Long to wide
# One column per genus.
surveys_wide <- spread(surveys_gm, key = genus, value = weight_mean)

## Wide to long
# Collapse every column except plot_id back into genus / weight_mean pairs.
surveys_long <- gather(
  surveys_wide,
  key = genus, value = weight_mean, -plot_id
)
head(surveys_long)
head(surveys_gm)

# Exercise: reshape so that each year has its own column
surveys_year <- filter(surveys, !is.na(weight)) %>%
  select(plot_id, year, weight) %>%
  group_by(plot_id, year) %>%
  summarise(weight_mean = mean(weight)) %>%
  spread(key = year, value = weight_mean)

## Save your output!!!!
write_csv(surveys_year, "data/surveys_year.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment