Skip to content

Instantly share code, notes, and snippets.

@monkeycycle
Created May 30, 2018 09:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save monkeycycle/c9006c6e006c1408e42591d734f10fb7 to your computer and use it in GitHub Desktop.
Save monkeycycle/c9006c6e006c1408e42591d734f10fb7 to your computer and use it in GitHub Desktop.
Assorted R utilities
# Load required packages
load_requirements <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
#Returns all items in a list that are not contained in toMatch
#toMatch can be a single item or a list of items
exclude <- function (theList, toMatch){
return(setdiff(theList,include(theList,toMatch)))
}
#Returns all items in a list that ARE contained in toMatch
#toMatch can be a single item or a list of items
include <- function (theList, toMatch){
matches <- unique (grep(paste(toMatch,collapse="|"),
theList, value=TRUE))
return(matches)
}
# Clean up bad XLS formatting
remove_comma_from_numeric <- function(s) {
gsub(",", "", s, fixed = TRUE)
}
# Read in x rows, format as header and sub that in as the header
read_datafile_2header <- function(filename, skip_rows=0, header_rows=1){
filepath <- paste(data_src_path, filename, sep='')
# Read and format the headers which span multiple lines
headers <- read.csv(filepath, nrows=header_rows, header=FALSE)
headers_names <- sapply(headers,paste,collapse="_")
headers_names <- str_replace_all(headers_names, '-', '_')
headers_names <- str_replace_all(headers_names, ' ', '_')
headers_names <- str_replace_all(headers_names, '__', '_')
headers_names <- str_replace_all(headers_names, '__', '_')
df <- read.csv(file=filepath, skip = skip_rows, header=FALSE, stringsAsFactors=FALSE )
names(df) <- headers_names
names(df) <- gsub(' ', '_', names(df), fixed = TRUE)
names(df) <- gsub(' ', '_', names(df), fixed = TRUE)
names(df) <- gsub('__', '_', names(df), fixed = TRUE)
names(df) <- gsub('-', '_', names(df), fixed = TRUE)
names(df) <- gsub(',', '', names(df), fixed = TRUE)
names(df) <- gsub('.', '', names(df), fixed = TRUE)
return(df)
}
# https://stackoverflow.com/questions/12945687/read-all-worksheets-in-an-excel-workbook-into-an-r-list-with-data-frames
read_excel_allsheets <- function(filename, tibble = FALSE) {
# I prefer straight data.frames
# but if you like tidyverse tibbles (the default with read_excel)
# then just pass tibble = TRUE
sheets <- readxl::excel_sheets(filename)
x <- lapply(sheets, function(X) readxl::read_excel(filename, sheet = X))
if(!tibble) x <- lapply(x, as.data.frame)
names(x) <- sheets
x
}
cleaner_worksheet <- function(df){
# # drop rows with missing values
# df <- df[rowSums(is.na(df)) == 0,]
# # remove serial comma from all variables
# df[,-1] <- as.numeric(gsub(",", "", as.matrix(df[,-1])))
# # create numeric version of year variable for graphing
# df$Year <- as.numeric(substr(df$year, 1, 4))
# return cleaned df
return(df)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment