# Data tranformation functions ------------------------------------------ # Function to overview variable names, labels and value labels # for data imported from SPSS using haven package overview <- function(data) { var_names <- colnames(data) var_labels <- map(data, attributes) %>% map_chr(~.x$label) value_labels <- map(data, attributes) %>% map(~.x$labels) %>% map_chr(~str_c(.x, " = ", names(.x), collapse = "; ")) value_labels[value_labels == " = "] <- NA_character_ tibble(var_names, var_labels, value_labels) } # Function to extract labels from vector attributes and convert to factor fct_label <- function(x) { levs <- attributes(x)$labels labs <- names(levs) factor(x, levels = levs, labels = labs) } # function to generate variable names # prefix = variable prefix # range = integer vector, # suffix = any suffix if needed, # width = add leading zeros to numeric range to keep selected width var_range <- function(prefix, range, suffix = "", width = NULL) { range <- sprintf( paste0("%0.", width, "d"), range ) paste0(prefix, range, suffix) } # Function to recode reversed scored items # data = dataframe # vars = variable to recode # min = minimum attainable item score # max = maximum attainable item score # drop = keep only newly created variables? reverse <- function(data, vars, min, max, drop = FALSE) { data <- data %>% mutate( across(all_of(vars), ~min + max - .x) ) if (drop == TRUE) { data <- data %>% select(all_of(vars)) } return(data) } # Function to compute number of missing values in a row # data = dataframe, # vars = character vector of variable (columns) names, # if NULL (default), computes number of missing values across all columns # new_col = name of the new column created # drop = keep only newly created variable? row_missing <- function(data, vars = NULL, new_col = "n_miss", drop = FALSE) { if (!is.null(vars)) { x <- data %>% select(all_of(vars)) %>% is.na() %>% rowSums() } else { x <- data %>% is.na() %>% rowSums() } data[new_col] <- x if (drop == TRUE) { data[[new_col]] } else { data } } # Function to compute row means # data = dataframe, # vars = character vector of variable (column) names # new_col = name of the new column created # max_na = maximum number of missing values (otherwise returns NA) # drop = keep only newly created variable? row_mean <- function(data, vars, new_col = "total_score", max_na = 0, drop = FALSE) { slct <- data %>% select(all_of(vars)) n_miss <- slct %>% is.na() %>% rowSums() x <- slct %>% rowMeans(na.rm = TRUE) x[n_miss > max_na] <- NA_real_ data[, new_col] <- x if (drop == TRUE) { data[new_col] } else { data } } # Data exploration -------------------------------------------------------- # Returns TRUE, if a values is flagged as outlier # x = numeric vector # method = method to identify outliers # outher arguments specify the cut-offs for outlier detection # zscore = z-score difference from the mean is_outlier <- function(x, method = c("zscore", "madmedian", "boxplot"), zscore = 2, madmed = 2.24, iqrmult = 1.5) { method <- match.arg(method) if (method == "zscore") { abs(scale(x)) > zscore } else if (method == "madmedian") { abs(x - median(x, na.rm = TRUE)) / mad(x, na.rm = TRUE) > madmed } else if (method == "boxplot") { q <- quantile(x, probs = c(.25, .75), na.rm = TRUE) iqr <- (q[2] - q[1]) * iqrmult (x < q[1] - iqr) | (x > q[2] + iqr) } else { NULL } } # * Barplots ---------------------------------------------------------------- plot_bar <- function(data, var, ..., fill = NULL, facet = NULL, add_n = TRUE, sort = TRUE) { if (sort) { data[[var]] <- fct_infreq(data[[var]]) } if (is.null(fill) && is.null(facet)) { data <- data %>% count(.data[[var]]) } else if (!is.null(fill) && is.null(facet)) { data <- data %>% count(.data[[var]], .data[[fill]]) } else if (is.null(fill) && !is.null(facet)) { data <- data %>% count(.data[[var]], .data[[facet]]) %>% group_by(.data[[facet]]) } else { data <- data %>% count(.data[[var]], .data[[fill]], .data[[facet]]) %>% group_by(.data[[facet]]) } data <- data %>% mutate( p = n/sum(n) ) if (is.null(facet)) { data_labels <- data %>% group_by(.data[[var]]) %>% summarise( p = sum(p), n = sum(n)) } else { data_labels <- data %>% group_by(.data[[var]], .data[[facet]]) %>% summarise( p = sum(p), n = sum(n)) } p <- ggplot( data, aes_string( var, "p", fill = fill)) + geom_col(...) + coord_flip() + labs(y = "Rel. Freq.") + theme(legend.position = "top") if (add_n) { p <- p + geom_text( data = data_labels, aes(label = n, fill = NULL), fontface = "bold", hjust = -.25) } if (!is.null(facet)) { p <-p + facet_wrap(vars(.data[[facet]]), labeller = label_both) } p + scale_y_continuous( labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.2)) ) } plot_bar_mult <- function(data, var, fill = NULL, facet = NULL, add_n = TRUE, sort = TRUE) { map( var, ~plot_bar(data = data, .x, fill = fill, facet = facet, add_n = add_n, sort = sort) ) } plot_stackbar <- function(data, var, fill, ..., facet = NULL) { p <- data %>% ggplot(aes_string(var, fill = fill)) + geom_bar(..., position = "fill", color = "black") + ylab("Rel. Freq.") + coord_flip() + theme(legend.position = "top") + guides(fill = guide_legend( reverse = TRUE, byrow = TRUE, nrow = 1) ) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) if(!is.null(facet)) { p <- p + facet_wrap(vars(.data[[facet]])) } p } plot_stackbar_mult <- function(data, var, fill, facet = NULL) { map2( var, fill, ~plot_stackbar(data = data, var = .x, fill = .y, facet = facet) ) } # * Histogram ------------------------------------------------------------- plot_histogram <- function(data, var, ..., facet = NULL, geom = c("histogram", "density", "freqpoly"), scales = "fixed", transform_fun = NULL) { geom <- match.arg(geom) if (!is.null(transform_fun)) { data[[var]] <- transform_fun(data[[var]]) } p <- data %>% ggplot(aes_string(var)) switch (geom, histogram = p <- p + geom_histogram(color = "white", ...), density = p <- p + geom_density(...), freqpoly = p <- p + geom_freqpoly(...) ) if (!is.null(facet)) { p <- p + facet_wrap(vars(.data[[facet]]), scales = scales) } p } plot_histogram_mult <- function(data, var, facet = NULL, geom = c("histogram", "density", "freqpoly"), scales = "fixed", transform_fun = NULL) { map( var, ~plot_histogram(data, var = .x, facet = facet, geom = geom, scales = scales, transform_fun = transform_fun) ) } # * Boxplot -------------------------------------------------------------- plot_box <- function(data, var, groups = "var", ..., fill = NULL, facet = NULL, geom = c("boxplot", "violin"), order = TRUE) { geom = match.arg(geom) if (order && groups != "var") { p <- data %>% ggplot(aes_string( var, paste0("fct_reorder(", groups, ",", var, ", na.rm = TRUE)"), fill = fill) ) } else { p <-data %>% ggplot(aes_string(var, groups)) } switch (geom, boxplot = p <- p + geom_boxplot(...), violin = p <- p + geom_boxplot(...) ) if (!is.null(fill) && groups == fill) { p <- p + theme(legend.position = "none") } else { p <- p + theme(legend.position = "top") } if (!is.null(facet)) { p <- p + facet_wrap(vars(.data[[facet]])) } p + labs(x = var, y = groups) } plot_box_mult <- function(data, var, groups = "var", fill = NULL, facet = NULL, geom = c("boxplot", "violin"), order = TRUE) { map( var, ~plot_box(data = data, var = .x, groups = groups, fill = fill, facet = facet, geom = geom, order = order) ) } # * Scatterplot ----------------------------------------------------------- plot_scatter <- function(data, xvar, yvar, ..., geom = c("point", "jitter", "count"), alpha = 1, sample = NULL, fit_line = TRUE, method = NULL) { if (!is.null(sample)) { data <- sample_n(data, size = sample) } p <- data %>% ggplot(aes_string(xvar, yvar)) switch (geom, point = p <- p + geom_point(..., alpha = alpha), jitter = p <- p + geom_jitter(..., alpha = alpha), count = p <- p + geom_count(..., alpha = alpha) ) if (fit_line) { p <- p + geom_smooth(method = method) } return(p) } plot_scatter_mult <- function(data, xvar, yvar, geom = c("point", "jitter", "count"), alpha = 1, sample = NULL, fit_line = TRUE, method = NULL){ map2( xvar, yvar, ~plot_scatter( data = data, xvar = .x, yvar = .y, alpha = alpha, geom = geom, sample = sample, fit_line = fit_line, method = method ) ) } # * mosaic plots ------------------------------------------------------------ plot_mosaic <- function(data, xvar, yvar) { require(ggmosaic) if (!is.factor(data[[yvar]])) { data[[yvar]] <- factor(data[[yvar]]) } data %>% ggplot() + ggmosaic::geom_mosaic( aes_string( x = paste0("product(", yvar, ",", xvar, ")"), fill = yvar ) ) + guides(fill = guide_legend(reverse = TRUE)) } plot_mosaic_mult <- function(data, xvar, yvar) { map2( xvar, yvar, ~plot_mosaic(data = data, xvar = .x, yvar = .y) ) } # Cross-tabs -------------------------------------------------------------- crosstabs <- function(df, row_var, col_var, ..., digits = 2) { freq <- df %>% select( {{row_var}}, {{col_var}} ) %>% table() row_p <- freq %>% prop.table(margin = 1) %>% (function(x) x*100) %>% round(digits = digits) col_p <- freq %>% prop.table(margin = 2) %>% (function(x) x*100) %>% round(digits = digits) tot_p <- freq %>% prop.table() %>% (function(x) x*100) %>% round(digits = digits) print(chisq.test(freq, ...)) list(n = freq, `row %` = row_p, `col %` = col_p, `tot %` = tot_p) %>% map(as.data.frame) %>% bind_rows(.id = "stat") %>% pivot_wider(names_from = {{col_var}}, values_from = Freq) } # Tests ------------------------------------------------------------------- cor_table <- function(data, use = "pairwise", method="pearson", sig_levels = c(0.05, 0.01, 0.001), adjust = "holm", conf = 0.95) { f <- function(x) format(round(x, 2), nsmall = 2) n <- map_int(data, ~sum(!is.na(.x))) M <- map_dbl(data, ~mean(.x, na.rm = TRUE)) SD <- map_dbl(data, ~sd(.x, na.rm = TRUE)) nms <- colnames(data) lgt <- length(nms) results <- psych::corr.test(data, use = use, method = method, adjust = adjust, alpha = 1 - conf) r <- results$ci$r if (adjust == "none") { ci <- results$ci[, c("lower", "upper")] p <- results$ci$p } else { ci <- results$ci.adj %>% set_names(c("lower", "upper")) p <- results$ci2$p.adj } input <- tibble(r, ci, p, stars = "") if (!is.null(sig_levels)) { input <- input %>% mutate( stars = case_when( p < sig_levels[[3]] ~ "***", p < sig_levels[[2]] ~ "** ", p < sig_levels[[1]] ~ "* ", TRUE ~ " " ) ) } m <- matrix("", nrow = lgt, ncol = lgt, dimnames = list(nms, nms)) m1 <- m m2 <- m m1[lower.tri(m1)] <- paste0(f(input$r), input$stars) if (length(results$n) == 1) { m1[upper.tri(m1)] <- results$n } else { m1[upper.tri(m1)] <- results$n[upper.tri(results$n)] } m2[lower.tri(m1)] <- paste0("[", f(input$lower),";", f(input$upper), "]") m3 <- matrix("", ncol = lgt, nrow = 2*lgt) colnames(m3) <- nms m3[c(TRUE, FALSE), ] <- m1 m3[c(FALSE, TRUE), ] <- m2 df1 <- data.frame(var = nms, n = n, M = f(M), SD = f(SD)) df2 <- data.frame(var = rep("", 2*lgt), n = rep("", 2*lgt), M = rep("", 2*lgt), SD = rep("", 2*lgt)) df2[c(TRUE, FALSE), ] <- df1 bind_cols(df2, as.data.frame(m3)) } ci_mean <- function(x, conf.level = 0.95, na.rm = TRUE) { if (na.rm) x <- x[!is.na(x)] n <- length(x) se <- sd(x) / sqrt(n) q <- (1 - conf.level) / 2 q <- 1 - q t <- qt(q, df = n - 1) M <- mean(x) CI_low = M - t*se CI_high = M + t*se data.frame(M = M, CI_low = CI_low, CI_high = CI_high) } quantiles <- function(x, probs = c(0, .25, .5, .75, 1), na.rm = TRUE) { q <- quantile(x, probs = probs, na.rm = na.rm) data.frame( Min = q[1], Q2 = q[2], Mdn = q[3], Q3 = q[4], Max = q[5] ) } # Power ------------------------------------------------------------------- pwr_df <- function(data, fn) { out <- data %>% pmap(fn) %>% map_df(~.x[names(.x)]) meth <- out$method[[1]] web <- out$url[[1]] cat("Method: ", meth, " ", "URL: ", web, "\n") out %>% select(-method, -url) } plot_pwr <- function(data, x, y, grp = NULL, facet = NULL) { p <- data %>% ggplot(aes({{x}}, {{y}})) + theme(legend.position = "top") if (is.null(grp)) { p <- p + geom_line(size = 1) } else { p <- p + geom_line(size = 1, aes(color = as.factor({{grp}}), group = as.factor({{grp}}))) + labs(color = dplyr::select(data, {{grp}}) %>% names()) } if (!is.null(facet)) { p <- p + facet_wrap(vars({{facet}}), labeller = "label_both") } p } r2_to_f <- function(r2_full, r2_null = 0, f_squared = FALSE) { if (f_squared) { (r2_full - r2_null) / (1 - r2_full) } else { sqrt((r2_full - r2_null) / (1 - r2_full)) } } f_to_r2 <- function(f) { f^2 / (1+f^2) } peta2_to_f <- function(peta2, f_squared = FALSE) { if (f_squared) { peta2 / (1 - peta2) } else { sqrt(peta2 / (1 - peta2)) } }