Show percentiles, mean, sd, min, max, first, last, and unique counts across quantitative variables in a dataset. This file works out how the ff_summ_bygroup function works from Fan’s REconTools Package.

There is a quantitative variable, summarize this quantitative variable by multiple categorical groups, show a variety of statistics. including:

  1. percentiles
  2. mean, sd, etc
  3. min, max 4, first, last, unique counts

Load Packages

rm(list = ls(all.names = TRUE))
library(tibble)
library(tidyr)
library(dplyr)
library(purrr)

# library(ggplot2)
library(kableExtra)

Load Data and Parameers

data(mtcars)
df_mtcars <- mtcars
df <- df_mtcars
vars.group <- c('am', 'vs')
var.numeric <- 'mpg'
str.stats.group <- 'allperc'
ar.perc <- c(0.10, 0.25, 0.5, 0.75, 0.9)
boo.overall.stats <- TRUE

By Group Summarizing

Statistics to Include

# List of statistics
# https://rdrr.io/cran/dplyr/man/summarise.html
strs.center <- c('mean', 'median')
strs.spread <- c('sd', 'IQR', 'mad')
strs.range <- c('min', 'max')
strs.pos <- c('first', 'last')
strs.count <- c('n_distinct')

# Grouping of Statistics
if (str.stats.group == 'main') {
    strs.all <- c('mean', 'min', 'max', 'sd')
}
if (str.stats.group == 'all') {
    strs.all <- c(strs.center, strs.spread, strs.range, strs.pos, strs.count)
}
if (str.stats.group == 'allperc') {
    ar_st_percentile_func_names <- paste0(ar.perc*100, "%")
    funs_percentiles <- map(ar.perc, ~partial(quantile, probs = .x, na.rm = TRUE)) %>% set_names(nm = ar_st_percentile_func_names)
    strs.all <- c(strs.center, strs.spread, funs_percentiles, strs.range, strs.pos, strs.count)
}

Overall Statistics


# Start Transform
df <- df %>% drop_na() %>% mutate(!!(var.numeric) := as.numeric(!!sym(var.numeric)))

# Overall Statistics
if (boo.overall.stats) {
    df.overall.stats <- df %>% summarize_at(vars(var.numeric), funs(!!!strs.all))
    if (length(strs.all) == 1) {
        # give it a name, otherwise if only one stat, name of stat not saved
        df.overall.stats <- df.overall.stats %>% rename(!!strs.all := !!sym(var.numeric))
    }
    names(df.overall.stats) <- paste0(var.numeric, '.', names(df.overall.stats))
}
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(var.numeric)` instead of `var.numeric` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.
#> Warning: `funs()` was deprecated in dplyr 0.8.0.
#> Please use a list of either functions or lambdas: 
#> 
#>   # Simple named list: 
#>   list(mean = mean, median = median)
#> 
#>   # Auto named with `tibble::lst()`: 
#>   tibble::lst(mean, median)
#> 
#>   # Using lambdas
#>   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Display Results
kable(df.overall.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg.mean mpg.median mpg.sd mpg.IQR mpg.mad mpg.10% mpg.25% mpg.50% mpg.75% mpg.90% mpg.min mpg.max mpg.first mpg.last mpg.n_distinct
20.09062 19.2 6.026948 7.375 5.41149 14.34 15.425 19.2 22.8 30.09 10.4 33.9 21 21.4 25

Summarizing by Groups


# Group Sort
df.select <- df %>%
              group_by(!!!syms(vars.group)) %>%
              arrange(!!!syms(c(vars.group, var.numeric)))



# Table of Statistics
df.table.grp.stats <- df.select %>% summarize_at(vars(var.numeric), funs(!!!strs.all))

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
am vs mean median sd IQR mad 10% 25% 50% 75% 90% min max first last n_distinct
0 0 15.05000 15.20 2.774396 2.575 2.29803 10.69 14.050 15.20 16.625 18.56 10.4 19.2 10.4 19.2 10
0 1 20.74286 21.40 2.471071 3.500 3.26172 17.98 18.650 21.40 22.150 23.44 17.8 24.4 17.8 24.4 7
1 0 19.75000 20.35 4.008865 4.225 3.85476 15.40 16.775 20.35 21.000 23.50 15.0 26.0 15.0 26.0 5
1 1 28.37143 30.40 4.757701 6.350 4.59606 22.24 25.050 30.40 31.400 33.00 21.4 33.9 21.4 33.9 6

# Add Stat Name
if (length(strs.all) == 1) {
    # give it a name, otherwise if only one stat, name of stat not saved
    df.table.grp.stats <- df.table.grp.stats %>% rename(!!strs.all := !!sym(var.numeric))
}

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
am vs mean median sd IQR mad 10% 25% 50% 75% 90% min max first last n_distinct
0 0 15.05000 15.20 2.774396 2.575 2.29803 10.69 14.050 15.20 16.625 18.56 10.4 19.2 10.4 19.2 10
0 1 20.74286 21.40 2.471071 3.500 3.26172 17.98 18.650 21.40 22.150 23.44 17.8 24.4 17.8 24.4 7
1 0 19.75000 20.35 4.008865 4.225 3.85476 15.40 16.775 20.35 21.000 23.50 15.0 26.0 15.0 26.0 5
1 1 28.37143 30.40 4.757701 6.350 4.59606 22.24 25.050 30.40 31.400 33.00 21.4 33.9 21.4 33.9 6

Statistics as Row


# Row of Statistics
str.vars.group.combine <- paste0(vars.group, collapse='_')
if (length(vars.group) == 1) {
    df.row.grp.stats <- df.table.grp.stats %>%
            mutate(!!(str.vars.group.combine) := paste0(var.numeric, '.',
                                           vars.group, '.g',
                                           (!!!syms(vars.group)))) %>%
            gather(variable, value, -one_of(vars.group)) %>%
            unite(str.vars.group.combine, c(str.vars.group.combine, 'variable')) %>%
            spread(str.vars.group.combine, value)
} else {
    df.row.grp.stats <- df.table.grp.stats %>%
                            mutate(vars.groups.combine := paste0(paste0(vars.group, collapse='.')),
                                   !!(str.vars.group.combine) := paste0(interaction(!!!(syms(vars.group))))) %>%
                            mutate(!!(str.vars.group.combine) := paste0(var.numeric, '.', vars.groups.combine, '.',
                                                                       (!!sym(str.vars.group.combine)))) %>%
                            ungroup() %>%
                            select(-vars.groups.combine, -one_of(vars.group)) %>%
            gather(variable, value, -one_of(str.vars.group.combine))  %>%
            unite(str.vars.group.combine, c(str.vars.group.combine, 'variable')) %>%
            spread(str.vars.group.combine, value)
}
#> Warning: attributes are not identical across measure variables;
#> they will be dropped
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(str.vars.group.combine)` instead of `str.vars.group.combine` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.

# Display Results
kable(df.row.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg.am.vs.0.0_10% mpg.am.vs.0.0_25% mpg.am.vs.0.0_50% mpg.am.vs.0.0_75% mpg.am.vs.0.0_90% mpg.am.vs.0.0_first mpg.am.vs.0.0_IQR mpg.am.vs.0.0_last mpg.am.vs.0.0_mad mpg.am.vs.0.0_max mpg.am.vs.0.0_mean mpg.am.vs.0.0_median mpg.am.vs.0.0_min mpg.am.vs.0.0_n_distinct mpg.am.vs.0.0_sd mpg.am.vs.0.1_10% mpg.am.vs.0.1_25% mpg.am.vs.0.1_50% mpg.am.vs.0.1_75% mpg.am.vs.0.1_90% mpg.am.vs.0.1_first mpg.am.vs.0.1_IQR mpg.am.vs.0.1_last mpg.am.vs.0.1_mad mpg.am.vs.0.1_max mpg.am.vs.0.1_mean mpg.am.vs.0.1_median mpg.am.vs.0.1_min mpg.am.vs.0.1_n_distinct mpg.am.vs.0.1_sd mpg.am.vs.1.0_10% mpg.am.vs.1.0_25% mpg.am.vs.1.0_50% mpg.am.vs.1.0_75% mpg.am.vs.1.0_90% mpg.am.vs.1.0_first mpg.am.vs.1.0_IQR mpg.am.vs.1.0_last mpg.am.vs.1.0_mad mpg.am.vs.1.0_max mpg.am.vs.1.0_mean mpg.am.vs.1.0_median mpg.am.vs.1.0_min mpg.am.vs.1.0_n_distinct mpg.am.vs.1.0_sd mpg.am.vs.1.1_10% mpg.am.vs.1.1_25% mpg.am.vs.1.1_50% mpg.am.vs.1.1_75% mpg.am.vs.1.1_90% mpg.am.vs.1.1_first mpg.am.vs.1.1_IQR mpg.am.vs.1.1_last mpg.am.vs.1.1_mad mpg.am.vs.1.1_max mpg.am.vs.1.1_mean mpg.am.vs.1.1_median mpg.am.vs.1.1_min mpg.am.vs.1.1_n_distinct mpg.am.vs.1.1_sd
10.69 14.05 15.2 16.625 18.56 10.4 2.575 19.2 2.29803 19.2 15.05 15.2 10.4 10 2.774396 17.98 18.65 21.4 22.15 23.44 17.8 3.5 24.4 3.26172 24.4 20.74286 21.4 17.8 7 2.471071 15.4 16.775 20.35 21 23.5 15 4.225 26 3.85476 26 19.75 20.35 15 5 4.008865 22.24 25.05 30.4 31.4 33 21.4 6.35 33.9 4.59606 33.9 28.37143 30.4 21.4 6 4.757701

Process Outputs


# Clean up name strings
names(df.table.grp.stats) <- gsub(x = names(df.table.grp.stats),pattern = "_", replacement = "\\.")
names(df.row.grp.stats) <- gsub(x = names(df.row.grp.stats),pattern = "_", replacement = "\\.")

# Return
list.return <- list(df_table_grp_stats = df.table.grp.stats, df_row_grp_stats = df.row.grp.stats)

# Overall Statistics, without grouping
if (boo.overall.stats) {
    df.row.stats.all <- c(df.row.grp.stats, df.overall.stats)
    list.return <- append(list.return, list(df_overall_stats = df.overall.stats,
                                            df_row_stats_all = df.row.stats.all))
}

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
am vs mean median sd IQR mad 10% 25% 50% 75% 90% min max first last n.distinct
0 0 15.05000 15.20 2.774396 2.575 2.29803 10.69 14.050 15.20 16.625 18.56 10.4 19.2 10.4 19.2 10
0 1 20.74286 21.40 2.471071 3.500 3.26172 17.98 18.650 21.40 22.150 23.44 17.8 24.4 17.8 24.4 7
1 0 19.75000 20.35 4.008865 4.225 3.85476 15.40 16.775 20.35 21.000 23.50 15.0 26.0 15.0 26.0 5
1 1 28.37143 30.40 4.757701 6.350 4.59606 22.24 25.050 30.40 31.400 33.00 21.4 33.9 21.4 33.9 6
kable(df.row.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg.am.vs.0.0.10% mpg.am.vs.0.0.25% mpg.am.vs.0.0.50% mpg.am.vs.0.0.75% mpg.am.vs.0.0.90% mpg.am.vs.0.0.first mpg.am.vs.0.0.IQR mpg.am.vs.0.0.last mpg.am.vs.0.0.mad mpg.am.vs.0.0.max mpg.am.vs.0.0.mean mpg.am.vs.0.0.median mpg.am.vs.0.0.min mpg.am.vs.0.0.n.distinct mpg.am.vs.0.0.sd mpg.am.vs.0.1.10% mpg.am.vs.0.1.25% mpg.am.vs.0.1.50% mpg.am.vs.0.1.75% mpg.am.vs.0.1.90% mpg.am.vs.0.1.first mpg.am.vs.0.1.IQR mpg.am.vs.0.1.last mpg.am.vs.0.1.mad mpg.am.vs.0.1.max mpg.am.vs.0.1.mean mpg.am.vs.0.1.median mpg.am.vs.0.1.min mpg.am.vs.0.1.n.distinct mpg.am.vs.0.1.sd mpg.am.vs.1.0.10% mpg.am.vs.1.0.25% mpg.am.vs.1.0.50% mpg.am.vs.1.0.75% mpg.am.vs.1.0.90% mpg.am.vs.1.0.first mpg.am.vs.1.0.IQR mpg.am.vs.1.0.last mpg.am.vs.1.0.mad mpg.am.vs.1.0.max mpg.am.vs.1.0.mean mpg.am.vs.1.0.median mpg.am.vs.1.0.min mpg.am.vs.1.0.n.distinct mpg.am.vs.1.0.sd mpg.am.vs.1.1.10% mpg.am.vs.1.1.25% mpg.am.vs.1.1.50% mpg.am.vs.1.1.75% mpg.am.vs.1.1.90% mpg.am.vs.1.1.first mpg.am.vs.1.1.IQR mpg.am.vs.1.1.last mpg.am.vs.1.1.mad mpg.am.vs.1.1.max mpg.am.vs.1.1.mean mpg.am.vs.1.1.median mpg.am.vs.1.1.min mpg.am.vs.1.1.n.distinct mpg.am.vs.1.1.sd
10.69 14.05 15.2 16.625 18.56 10.4 2.575 19.2 2.29803 19.2 15.05 15.2 10.4 10 2.774396 17.98 18.65 21.4 22.15 23.44 17.8 3.5 24.4 3.26172 24.4 20.74286 21.4 17.8 7 2.471071 15.4 16.775 20.35 21 23.5 15 4.225 26 3.85476 26 19.75 20.35 15 5 4.008865 22.24 25.05 30.4 31.4 33 21.4 6.35 33.9 4.59606 33.9 28.37143 30.4 21.4 6 4.757701
kable(df.overall.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg.mean mpg.median mpg.sd mpg.IQR mpg.mad mpg.10% mpg.25% mpg.50% mpg.75% mpg.90% mpg.min mpg.max mpg.first mpg.last mpg.n_distinct
20.09062 19.2 6.026948 7.375 5.41149 14.34 15.425 19.2 22.8 30.09 10.4 33.9 21 21.4 25