Skip to content

Instantly share code, notes, and snippets.

@nstrayer
Created January 21, 2020 21:38
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nstrayer/7af117b3b423335fc73f7fc51b9cf530 to your computer and use it in GitHub Desktop.
Save nstrayer/7af117b3b423335fc73f7fc51b9cf530 to your computer and use it in GitHub Desktop.
R Script to simulate missing not at random data and look at performance of different imputation strategies.
library(tidyverse)
n <- 150
sensitivity_threshold <- 5
data <- tibble(
a = rgamma(n = n, shape = 5, rate = 0.5),
b = rgamma(n = n, shape = a/2, rate = 0.5)
)
generate_missing_data <- function(i){
data_w_missing <- data %>%
mutate(obs = 1:n()) %>%
pivot_longer(c(a, b), names_to = "variable") %>%
mutate(
distance_from_threshold = sensitivity_threshold - value,
prob_of_missing = ifelse(distance_from_threshold > 0,
distance_from_threshold/sensitivity_threshold,
0),
missing = as.logical(rbinom(n = n(), size = 1, prob = prob_of_missing)),
value = ifelse(missing, NA, value)
) %>%
pivot_wider(
id_cols = obs,
names_from = variable,
values_from = value
) %>%
mutate(type = "no impute")
bind_rows(
data_w_missing,
mutate(data_w_missing,
a = ifelse(is.na(a), 0, a),
b = ifelse(is.na(b), 0, b),
type = "impute zero"),
mutate(data_w_missing,
a = ifelse(is.na(a), mean(a, na.rm = TRUE), a),
b = ifelse(is.na(b), mean(b, na.rm = TRUE), b),
type = "impute mean"),
mutate(data_w_missing,
a = ifelse(is.na(a), median(a, na.rm = TRUE), a),
b = ifelse(is.na(b), median(b, na.rm = TRUE), b),
type = "impute median"),
mutate(data_w_missing,
a = ifelse(is.na(a), min(a, na.rm = TRUE), a),
b = ifelse(is.na(b), min(b, na.rm = TRUE), b),
type = "impute min")
) %>%
mutate(i = i)
}
num_sims <- 100
correlations <- 1:num_sims %>%
map_dfr(generate_missing_data) %>%
group_by(type, i) %>%
summarise(
pearson = cor(a,b, use = "complete.obs"),
spearman = cor(a,b, method = "spearman", use = "complete.obs")
) %>%
pivot_longer(c(pearson, spearman), names_to = "correlation_type")
true_correlations <- tibble(
correlation_type = c("pearson", "spearman"),
truth = c(cor(data$a, data$b), cor(data$a, data$b, method = "spearman")))
ggplot(correlations, aes(x = value)) +
geom_density(fill = "steelblue", color = "steelblue") +
facet_grid(type~correlation_type) +
geom_vline(
data = true_correlations,
aes(xintercept = truth),
color = "black"
) +
labs(title = "Distribution of correlations by imputation type",
subtitle = "Missingness structure is related to distance from minimum threashold\nVertical line is true correlation value",
x = "correlation")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment