monkeycycle · May 30, 2018 09:48
diff --git a/utils.R b/utils.R


 # Load required packages
 load_requirements <- function(pkg){
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg))
        install.packages(new.pkg, dependencies = TRUE)
    sapply(pkg, require, character.only = TRUE)
 }

 #Returns all items in a list that are not contained in toMatch
 #toMatch can be a single item or a list of items
 exclude <- function (theList, toMatch){
  return(setdiff(theList,include(theList,toMatch)))
 }

 #Returns all items in a list that ARE contained in toMatch
 #toMatch can be a single item or a list of items
 include <- function (theList, toMatch){
  matches <- unique (grep(paste(toMatch,collapse="|"),
                          theList, value=TRUE))
  return(matches)
 }

 # Clean up bad XLS formatting
 remove_comma_from_numeric <- function(s) {
  gsub(",", "", s, fixed = TRUE)
 }

 # Read in x rows, format as header and sub that in as the header
 read_datafile_2header <- function(filename, skip_rows=0, header_rows=1){

  filepath <- paste(data_src_path, filename, sep='')

  # Read and format the headers which span multiple lines
  headers <- read.csv(filepath, nrows=header_rows, header=FALSE)
  headers_names <- sapply(headers,paste,collapse="_")
  headers_names <- str_replace_all(headers_names, '-', '_')
  headers_names <- str_replace_all(headers_names, '  ', '_')
  headers_names <- str_replace_all(headers_names, '__', '_')
  headers_names <- str_replace_all(headers_names, '__', '_')

  df <- read.csv(file=filepath, skip = skip_rows, header=FALSE, stringsAsFactors=FALSE )
  names(df) <- headers_names

  names(df) <- gsub('  ', '_', names(df), fixed = TRUE)
  names(df) <- gsub(' ', '_', names(df), fixed = TRUE)
  names(df) <- gsub('__', '_', names(df), fixed = TRUE)
  names(df) <- gsub('-', '_', names(df), fixed = TRUE)
  names(df) <- gsub(',', '', names(df), fixed = TRUE)
  names(df) <- gsub('.', '', names(df), fixed = TRUE)

  return(df)

 }

 # https://stackoverflow.com/questions/12945687/read-all-worksheets-in-an-excel-workbook-into-an-r-list-with-data-frames
 read_excel_allsheets <- function(filename, tibble = FALSE) {
    # I prefer straight data.frames
    # but if you like tidyverse tibbles (the default with read_excel)
    # then just pass tibble = TRUE
    sheets <- readxl::excel_sheets(filename)
    x <- lapply(sheets, function(X) readxl::read_excel(filename, sheet = X))
    if(!tibble) x <- lapply(x, as.data.frame)
    names(x) <- sheets
    x
 }


 cleaner_worksheet <- function(df){
  # # drop rows with missing values
  # df <- df[rowSums(is.na(df)) == 0,]
  # # remove serial comma from all variables
  # df[,-1] <- as.numeric(gsub(",", "", as.matrix(df[,-1])))
  # # create numeric version of year variable for graphing
  # df$Year <- as.numeric(substr(df$year, 1, 4))
  # return cleaned df
  return(df)
 }


	# Load required packages
	load_requirements <- function(pkg){
	new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
	if (length(new.pkg))
	install.packages(new.pkg, dependencies = TRUE)
	sapply(pkg, require, character.only = TRUE)
	}

	#Returns all items in a list that are not contained in toMatch
	#toMatch can be a single item or a list of items
	exclude <- function (theList, toMatch){
	return(setdiff(theList,include(theList,toMatch)))
	}

	#Returns all items in a list that ARE contained in toMatch
	#toMatch can be a single item or a list of items
	include <- function (theList, toMatch){
	matches <- unique (grep(paste(toMatch,collapse="\|"),
	theList, value=TRUE))
	return(matches)
	}

	# Clean up bad XLS formatting
	remove_comma_from_numeric <- function(s) {
	gsub(",", "", s, fixed = TRUE)
	}

	# Read in x rows, format as header and sub that in as the header
	read_datafile_2header <- function(filename, skip_rows=0, header_rows=1){

	filepath <- paste(data_src_path, filename, sep='')

	# Read and format the headers which span multiple lines
	headers <- read.csv(filepath, nrows=header_rows, header=FALSE)
	headers_names <- sapply(headers,paste,collapse="_")
	headers_names <- str_replace_all(headers_names, '-', '_')
	headers_names <- str_replace_all(headers_names, ' ', '_')
	headers_names <- str_replace_all(headers_names, '__', '_')
	headers_names <- str_replace_all(headers_names, '__', '_')

	df <- read.csv(file=filepath, skip = skip_rows, header=FALSE, stringsAsFactors=FALSE )
	names(df) <- headers_names

	names(df) <- gsub(' ', '_', names(df), fixed = TRUE)
	names(df) <- gsub(' ', '_', names(df), fixed = TRUE)
	names(df) <- gsub('__', '_', names(df), fixed = TRUE)
	names(df) <- gsub('-', '_', names(df), fixed = TRUE)
	names(df) <- gsub(',', '', names(df), fixed = TRUE)
	names(df) <- gsub('.', '', names(df), fixed = TRUE)

	return(df)

	}

	# https://stackoverflow.com/questions/12945687/read-all-worksheets-in-an-excel-workbook-into-an-r-list-with-data-frames
	read_excel_allsheets <- function(filename, tibble = FALSE) {
	# I prefer straight data.frames
	# but if you like tidyverse tibbles (the default with read_excel)
	# then just pass tibble = TRUE
	sheets <- readxl::excel_sheets(filename)
	x <- lapply(sheets, function(X) readxl::read_excel(filename, sheet = X))
	if(!tibble) x <- lapply(x, as.data.frame)
	names(x) <- sheets
	x
	}


	cleaner_worksheet <- function(df){
	# # drop rows with missing values
	# df <- df[rowSums(is.na(df)) == 0,]
	# # remove serial comma from all variables
	# df[,-1] <- as.numeric(gsub(",", "", as.matrix(df[,-1])))
	# # create numeric version of year variable for graphing
	# df$Year <- as.numeric(substr(df$year, 1, 4))
	# return cleaned df
	return(df)
	}