-
-
Save potterzot/c55e827372a30a100f84249678f7e150 to your computer and use it in GitHub Desktop.
R script for data carpentry lesson
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################ | |
# EVERYTHING HERE IS THE CODE THE LESSON IS BASED OFF OF | |
# IN THE NEXT SECTION IS CODE ACTUALLY WRITTEN DURING THE LESSON | |
################################################################ | |
# Install the packages this lesson uses, skipping any that are already
# installed so that re-running the script does not reinstall them each time.
lesson_pkgs <- c("tidyverse", "here", "reprex")
missing_pkgs <- setdiff(lesson_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
### Resources | |
# Stack overflow: stackoverflow.com | |
# Jenny Bryan (@jennybryan) and the #rstats hashtag on twitter | |
# Advanced R book (for all levels): https://adv-r.hadley.nz/ | |
# Geocomputation for R book: https://geocompr.robinlovelace.net/ | |
### R Package examples | |
# https://github.com/ropensci/rnoaa | |
# https://github.com/tidyverse/reprex | |
# https://cran.r-project.org/web/packages/rnassqs/index.html | |
### Remember these shortcuts: | |
# CTRL+Enter: runs the code block | |
# CTRL+1: Switch to script window | |
# CTRL+2: Switch to console window | |
#### Load libraries ---- | |
library(tidyverse) | |
library(reprex) | |
library(here) | |
# Setup a project (usually you've already done this)
# 1. Create a project in Rstudio
# 2. Create the project folders. Only the missing ones are created, so
#    re-running the script does not raise "already exists" warnings.
for (project_dir in c("data_raw", "data", "fig", "src")) {
  if (!dir.exists(project_dir)) {
    dir.create(project_dir)
  }
}
#### Download and load data ----
# Build the path once with here() so that the existence check, the download
# target and the read all refer to the same file, whatever the current
# working directory happens to be.
rodent_csv <- here("data_raw/rodent_counts.csv")
if (!file.exists(rodent_csv)) {
  download.file(
    url = "https://ndownloader.figshare.com/files/2292169",
    destfile = rodent_csv
  )
}

# using rodent_counts
surveys <- read_csv(rodent_csv)

# Related readers worth looking up:
# read.csv
# read.table
# read_
# help(read.csv)
#### dplyr verbs ----
## Selecting
# Keep only the plot_id, species_id and weight columns.
sel_surveys <- select(surveys, plot_id, species_id, weight)
# Preview the selection that was just created.
head(sel_surveys)

## Filtering
# Check which years are present, then keep only the 1995 records.
range(surveys$year)
fil_surveys <- filter(surveys, year == 1995)
## Pipes
# EXERCISE: How would you filter weight < 5 and include species_id, sex, and weight?
# Without pipes the calls nest inside-out: filter() runs first, then select().
surveys_sml <- select(
  filter(surveys, weight < 5),
  species_id, sex, weight
)

# Piping is more readable
surveys_sml <-
  surveys %>%
  select(species_id, sex, weight) %>%
  filter(weight < 5)
## Mutate
# Add one derived column: weight divided by 1000.
surveys %>% mutate(weight_kg = weight / 1000)

# A later column in the same mutate() can build on an earlier one.
surveys %>%
  mutate(
    weight_kg = weight / 1000,
    weight_lb = weight_kg * 2.2
  )

# Drop rows with missing weight first, then preview the result.
surveys %>%
  filter(!is.na(weight)) %>%
  mutate(weight_kg = weight / 1000) %>%
  head()

# EXERCISE: Create a new data frame that meets the following criteria:
# 1. contains only the species_id column and a new column called hindfoot_half
# 2. hindfoot_half is half of hindfoot_length values
# 3. There are no NA values and all values are less than 30.
surveys_hindfoot_half <-
  surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30) %>%
  select(species_id, hindfoot_half)
## Split-Apply-Combine
# Mean weight for each sex.
surveys %>%
  group_by(sex) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))

# Mean weight for each sex/species_id combination.
surveys %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))

# Same summary, with missing weights removed before grouping.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))

# Several summary columns can be computed in one summarize() call.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(
    mean_weight = mean(weight),
    min_weight = min(weight)
  )

# ... and the summary can then be sorted by one of the new columns.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(
    mean_weight = mean(weight),
    min_weight = min(weight)
  ) %>%
  arrange(min_weight)
## count(), n(), N(), row_number()
# Tally rows per sex, largest count first.
surveys %>% count(sex, sort = TRUE)

# Tally rows within each species/sex group.
surveys %>%
  group_by(species, sex) %>%
  count(sort = TRUE)

# First row of every species/sex group.
surveys %>%
  group_by(species, sex) %>%
  filter(row_number() == 1)

# Last row of every species/sex group: store the group size with n(), then
# keep the row whose position equals it.
surveys %>%
  group_by(species, sex) %>%
  mutate(n_grp = n()) %>%
  filter(row_number() == n_grp) %>%
  head()

# Same idea without the helper column, this time grouped by species and plot.
surveys %>%
  group_by(species, plot_id) %>%
  filter(row_number() == n()) %>%
  head()
# EXERCISE: How would you select the record with the group max weight?
# 1. Group by sex, species
# 2. For each group, select the observation with the maximum
# 3. Sort by species and sex
# 4. return just species, sex, and weight

# Approach 1: compare each weight with its group maximum (ties are all kept).
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species) %>%
  filter(weight == max(weight)) %>%
  select(species, sex, weight) %>%
  arrange(species, sex)

# Approach 2: sort ascending by weight and keep the last row of each group.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species) %>%
  arrange(weight) %>%
  filter(row_number() == n()) %>%
  select(species, sex, weight) %>%
  arrange(species, sex)

# Approach 3: sort descending by weight and keep the first row of each group.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species) %>%
  arrange(desc(weight)) %>%
  filter(row_number() == 1) %>%
  select(species, sex, weight) %>%
  arrange(species, sex)
## Joins and mutate_at, mutate_if
# left_join: keeps all records of the first data.frame
# right_join: keeps all records of the second data.frame
# inner_join: keeps records that are in both
# full_join: keeps all records from either

# Per-group means of weight and hindfoot length.
survey_means <-
  surveys %>%
  group_by(species, sex) %>%
  summarize(
    weight_mean = mean(weight, na.rm = TRUE),
    hindfoot_mean = mean(hindfoot_length, na.rm = TRUE)
  )

# Per-group standard deviations of the same two measurements.
survey_sd <-
  surveys %>%
  group_by(species, sex) %>%
  summarize(
    weight_sd = sd(weight, na.rm = TRUE),
    hindfoot_sd = sd(hindfoot_length, na.rm = TRUE)
  )

# Attach each summary table to the individual records.
survey_join1 <- left_join(surveys, survey_means, by = c("species", "sex"))
survey_join2 <- left_join(surveys, survey_sd, by = c("species", "sex"))

# Reduce alternative
# Folds left_join() over the list, joining the tables one after another.
Reduce(left_join, list(surveys, survey_means, survey_sd))
# Exercise: How could you do this in one call with mutate?
# A grouped mutate() computes the group statistics while keeping every row,
# so no separate summary tables or joins are needed.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species, sex) %>%
  mutate(
    weight_mean = mean(weight, na.rm = TRUE),
    weight_sd = sd(weight),
    weight_dev = weight - weight_mean,
    hindfoot_mean = mean(hindfoot_length, na.rm = TRUE),
    hindfoot_sd = sd(hindfoot_length, na.rm = TRUE),
    hindfoot_dev = hindfoot_length - hindfoot_mean
  ) %>%
  select(
    species, sex,
    weight, weight_dev, weight_mean,
    hindfoot_length, hindfoot_dev, hindfoot_mean
  )
## Tidy data:
# Each variable has its own column
# Each observation has its own row
# Each value must have its own cell
# Each type of observational unit forms a table

# Mean weight for every genus/plot combination (long format).
surveys_gw <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(genus, plot_id) %>%
  summarize(mean_weight = mean(weight))

## Spread
# https://datacarpentry.org/R-ecology-lesson/img/spread_data_R.png
# NOTE: spread()/gather() are superseded in tidyr by pivot_wider()/
# pivot_longer(); they are kept here because the lesson teaches them.
help(spread)

# One column per genus; combinations with no data become NA.
surveys_gw %>%
  spread(genus, mean_weight) %>%
  head()

# Same reshape, but fill the missing combinations with 0 instead of NA.
surveys_wide <- surveys_gw %>%
  spread(key = genus, value = mean_weight, fill = 0)
head(surveys_wide)

## Gather
# https://datacarpentry.org/R-ecology-lesson/img/gather_data_R.png
# Collapse every column except plot_id back into genus/mean_weight pairs.
surveys_wide %>%
  gather(key = genus, value = mean_weight, -plot_id)

surveys_long <- surveys_wide %>%
  gather(key = genus, value = mean_weight, -plot_id)
######################################### | |
# CODE ACTUALLY WRITTEN DURING THE LESSON | |
######################################### | |
# Install the packages this lesson uses, skipping any that are already
# installed so that re-running the script does not reinstall them each time.
lesson_pkgs <- c("tidyverse", "here", "reprex")
missing_pkgs <- setdiff(lesson_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
### Resources | |
# Stack overflow: stackoverflow.com | |
# Jenny Bryan (@jennybryan) and the #rstats hashtag on twitter | |
# Advanced R book (for all levels): https://adv-r.hadley.nz/ | |
# Geocomputation for R book: https://geocompr.robinlovelace.net/ | |
### R Package examples | |
# https://github.com/ropensci/rnoaa | |
# https://github.com/tidyverse/reprex | |
# https://cran.r-project.org/web/packages/rnassqs/index.html | |
#### Setup and load libraries ----
# Install the helper packages only when they are missing.
helper_pkgs <- c("here", "reprex")
helper_missing <- setdiff(helper_pkgs, rownames(installed.packages()))
if (length(helper_missing) > 0) {
  install.packages(helper_missing)
}

library(tidyverse)
library(here)
library(reprex)
library(purrr)
library(forcats)

# Create the project folders. Every directory gets the same existence guard,
# so re-running the script does not raise "already exists" warnings.
for (project_dir in c("data_raw", "data", "fig", "src")) {
  if (!dir.exists(project_dir)) {
    dir.create(project_dir)
  }
}
#### Load and setup data ----
# Resolve the file location once with here() and reuse it for the download,
# the readr import and the base R import, so all three use the same file.
rodent_csv <- here("data_raw/rodent_counts.csv")

# Download only when the file is not already on disk.
if (!file.exists(rodent_csv)) {
  download.file(
    url = "https://ndownloader.figshare.com/files/2292169",
    destfile = rodent_csv
  )
}

# Import with readr and inspect the result.
surveys <- read_csv(rodent_csv)
dim(surveys)
head(surveys)
class(surveys)
surveys

# Import the same file with base R's read.csv() and compare the class.
surveys_alt <- read.csv(rodent_csv)
class(surveys_alt)
## Using dplyr
# Base R indexing: rows 1 to 3 of the first column.
surveys[1:3, 1]

# dplyr column selection, by position and by name.
surveys %>% select(1)
surveys %>% select(record_id, year)
head(surveys)

# A leading minus drops the named columns.
surveys %>% select(-genus, -species)

# Row subsetting: base R bracket syntax first, then filter().
surveys[surveys$year < 1995, ]
surveys %>% filter(year < 1995)
surveys %>% filter(!is.na(weight))
## Exercise: Filter weight less than 5 and include species_id, sex, and weight
# Version 1: store the filtered rows, then select from the stored object.
filteredWeight <- filter(surveys, weight < 5)
select(filteredWeight, species_id, sex, weight)

# Version 2: pipe the filtered rows straight into select().
filter(surveys, weight < 5) %>%
  select(species_id, sex, weight)

# Version 3: start the pipeline from the data itself.
surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)
## Mutate / transmute
# mutate() adds the new columns; select() then trims to the ones of interest.
surveys %>%
  filter(!is.na(weight)) %>%
  mutate(
    weight_kg = weight / 1000,
    weight_lb = weight_kg * 2.2
  ) %>%
  select(record_id, weight, weight_kg, weight_lb)

# transmute() keeps only the columns named in the call.
surveys %>%
  filter(!is.na(weight)) %>%
  transmute(
    record_id,
    weight_kg = weight / 1000,
    weight_lb = weight_kg * 2.2
  )
## Exercise
# 1. contains only the species_id column and a new column hindfoot_half
# 2. hindfoot_half = hindfoot_length / 2
# 3. values of hindfoot_half < 30 and are not NA

# Solution A: transmute, then filter on the new column.
surveys %>%
  transmute(species_id,
            hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30, !is.na(hindfoot_half))

# Solution B: same as A with the filter conditions in the other order.
surveys %>%
  transmute(species_id,
            hindfoot_half = hindfoot_length / 2) %>%
  filter(!is.na(hindfoot_half),
         hindfoot_half < 30)

# Solution C: filter on the raw length first (half < 30 means length < 60).
surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  filter(hindfoot_length < 60) %>%
  transmute(species_id,
            hindfoot_half = hindfoot_length / 2)

# Solution D: mutate, filter, then select the two columns.
surveys %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30) %>%
  filter(!is.na(hindfoot_length)) %>%
  select(species_id, hindfoot_half)

# Variant that also keeps the raw length. mutate() is required here:
# transmute() would drop species_id and hindfoot_length, and the following
# select() would then fail because those columns no longer exist.
surveys %>%
  filter(!is.na(hindfoot_length) & hindfoot_length < 60) %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  select(species_id, hindfoot_length, hindfoot_half)

# For reference: the new column without any filtering or selecting.
surveys %>%
  mutate(hindfoot_half = hindfoot_length / 2)
## Grouping variables and applying
## Split-Apply-Combine
names(surveys)

# Mean weight for each species_id/sex group.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  summarize(weight = mean(weight))

# Records for species_id "AB", kept for inspection.
ab <-
  surveys %>%
  filter(species_id == "AB") %>%
  select(species_id, sex, weight)

# Several statistics per group, including the group size via n().
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  summarize(
    weight_mean = mean(weight),
    weight_sd = sd(weight),
    weight_min = min(weight),
    grp_n = n()
  )

# A grouped filter() keeps the rows that match the group minimum.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  filter(weight == min(weight))

# The same grouped filter, run a second time.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  filter(weight == min(weight))

# Group means sorted from heaviest to lightest.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  summarize(weight_mean = mean(weight)) %>%
  arrange(desc(weight_mean))

# Last row of each group: the row whose position equals the group size.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  filter(row_number() == n())
## Exercise:
# Select the row with the maximum weight for each species_id and sex

# Sort ascending by weight, then keep the last row of each group. The pipe
# after arrange() is required: without it filter() runs on its own, with no
# data argument, and fails.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  arrange(weight) %>%
  filter(row_number() == n())

# Small example to show how the two sorting approaches behave.
x <- data.frame(
  record_id = c("a", "b", "c"),
  weight = c(1, 4.5, 3)
)
x

# Ascending sort: the maximum is the last row.
x %>%
  arrange(weight) %>%
  filter(row_number() == n())

# Descending sort: the maximum is the first row.
x %>%
  arrange(desc(weight)) %>%
  filter(row_number() == 1)

# Compare against the group maximum, then keep one row per group when tied.
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, sex) %>%
  filter(weight == max(weight)) %>%
  filter(row_number() == 1)

# Descending sort with records of unknown sex removed as well.
surveys %>%
  filter(!is.na(weight), !is.na(sex)) %>%
  group_by(species_id, sex) %>%
  arrange(desc(weight)) %>%
  filter(row_number() == 1)
## Means and standard deviations by group (species_id, plot_id)
# gm: one row per species_id/plot_id group, holding the group mean weight.
gm <-
  surveys %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, plot_id) %>%
  summarize(weight_gm = mean(weight))

# So that we don't have to filter by is.na every time
surveys_w_weight <- filter(surveys, !is.na(weight))

# gd: one row per record, holding its deviation from the group mean weight.
gd <-
  surveys_w_weight %>%
  group_by(species_id, plot_id) %>%
  transmute(record_id, weight_gd = weight - mean(weight)) %>%
  ungroup()
head(gd)
## Merging data
# left_join(x,y, by = <some variable>) # keeps all records from x
# right_join(x,y, by = ...) #keeps all records from y and only matching from x
# inner_join(x,y, by = ...) #keeps only records that matched from both
# full_join(x,y, by = ...) # Keeps all records from x and all from y

# Attach the group means to every record. head() is called separately so
# that surveys2 holds the full joined table rather than a 6-row preview.
surveys2 <- left_join(surveys, gm, by = c("species_id", "plot_id")) %>%
  select(record_id, species_id, plot_id, weight, weight_gm)
head(surveys2)

# Attach the deviations by record. The grouping columns are dropped from gd
# first so that they are not duplicated in the result.
surveys3 <- left_join(surveys2, select(gd, -species_id, -plot_id), by = "record_id")
head(surveys3)
# Exercise
# 1. Create a data set like surveys but with the two additional variables:
# a. weight_gm
# b. weight_gd

# A grouped mutate() adds both columns in one step, without any joins.
surveys_w_weight %>%
  group_by(species_id, plot_id) %>%
  mutate(
    weight_gm = mean(weight),
    weight_gd = weight - weight_gm
  ) %>%
  select(
    record_id, species_id, plot_id,
    weight, weight_gm, weight_gd
  ) %>%
  head()

# Compare with the joined version.
head(surveys3)

## One more thing on merging
# Reduce() folds left_join() over the list, joining the tables in turn.
surveys3a <- Reduce(left_join, list(surveys, gm, gd))
## Tidy Data
# Each variable has its own column
# Each observation has its own row
# Each value has its own cell
head(surveys)

# Mean weight for every genus/plot_id combination (long format).
surveys_gm <-
  surveys_w_weight %>%
  group_by(genus, plot_id) %>%
  summarize(weight_mean = mean(weight))
surveys_gm

## Long to wide
# One column per genus, one row per plot_id.
surveys_wide <-
  surveys_gm %>%
  spread(key = genus, value = weight_mean)

## Wide to long
# Gather every column except plot_id back into genus/weight_mean pairs.
surveys_long <-
  surveys_wide %>%
  gather(key = genus, value = weight_mean, -plot_id)

# Compare the round-tripped table with the original summary.
head(surveys_long)
head(surveys_gm)
# Exercise: Transform to wide so that each year has its own column
# Mean weight per plot_id and year, then one column per year.
surveys_year <- surveys %>%
  filter(!is.na(weight)) %>%
  select(plot_id, year, weight) %>%
  group_by(plot_id, year) %>%
  summarize(weight_mean = mean(weight)) %>%
  spread(key = year, value = weight_mean)

## Save your output!!!!
# Write through here() so the file lands in the project's data/ folder
# regardless of the current working directory, matching how the data is read.
write_csv(surveys_year, here("data/surveys_year.csv"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment