Summarize Variable by Factor Groups • REconTools

Show percentiles, mean, sd, min, max, first, last, and unique counts across quantitative variables in a dataset. This file works out how the ff_summ_bygroup function works from Fan’s REconTools Package.

There is a quantitative variable, summarize this quantitative variable by multiple categorical groups, show a variety of statistics. including:

percentiles
mean, sd, etc
min, max 4, first, last, unique counts

Load Packages

rm(list = ls(all.names = TRUE))
library(tibble)
library(tidyr)
library(dplyr)
library(purrr)

# library(ggplot2)
library(kableExtra)

Load Data and Parameers

data(mtcars)
df_mtcars <- mtcars
df <- df_mtcars
vars.group <- c('am', 'vs')
var.numeric <- 'mpg'
str.stats.group <- 'allperc'
ar.perc <- c(0.10, 0.25, 0.5, 0.75, 0.9)
boo.overall.stats <- TRUE

By Group Summarizing

Statistics to Include

# List of statistics
# https://rdrr.io/cran/dplyr/man/summarise.html
strs.center <- c('mean', 'median')
strs.spread <- c('sd', 'IQR', 'mad')
strs.range <- c('min', 'max')
strs.pos <- c('first', 'last')
strs.count <- c('n_distinct')

# Grouping of Statistics
if (str.stats.group == 'main') {
    strs.all <- c('mean', 'min', 'max', 'sd')
}
if (str.stats.group == 'all') {
    strs.all <- c(strs.center, strs.spread, strs.range, strs.pos, strs.count)
}
if (str.stats.group == 'allperc') {
    ar_st_percentile_func_names <- paste0(ar.perc*100, "%")
    funs_percentiles <- map(ar.perc, ~partial(quantile, probs = .x, na.rm = TRUE)) %>% set_names(nm = ar_st_percentile_func_names)
    strs.all <- c(strs.center, strs.spread, funs_percentiles, strs.range, strs.pos, strs.count)
}

Overall Statistics


# Start Transform
df <- df %>% drop_na() %>% mutate(!!(var.numeric) := as.numeric(!!sym(var.numeric)))

# Overall Statistics
if (boo.overall.stats) {
    df.overall.stats <- df %>% summarize_at(vars(var.numeric), funs(!!!strs.all))
    if (length(strs.all) == 1) {
        # give it a name, otherwise if only one stat, name of stat not saved
        df.overall.stats <- df.overall.stats %>% rename(!!strs.all := !!sym(var.numeric))
    }
    names(df.overall.stats) <- paste0(var.numeric, '.', names(df.overall.stats))
}
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(var.numeric)` instead of `var.numeric` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.
#> Warning: `funs()` was deprecated in dplyr 0.8.0.
#> Please use a list of either functions or lambdas: 
#> 
#>   # Simple named list: 
#>   list(mean = mean, median = median)
#> 
#>   # Auto named with `tibble::lst()`: 
#>   tibble::lst(mean, median)
#> 
#>   # Using lambdas
#>   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Display Results
kable(df.overall.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

mpg.mean	mpg.median	mpg.sd	mpg.IQR	mpg.mad	mpg.10%	mpg.25%	mpg.50%	mpg.75%	mpg.90%	mpg.min	mpg.max	mpg.first	mpg.last	mpg.n_distinct
20.09062	19.2	6.026948	7.375	5.41149	14.34	15.425	19.2	22.8	30.09	10.4	33.9	21	21.4	25

Summarizing by Groups


# Group Sort
df.select <- df %>%
              group_by(!!!syms(vars.group)) %>%
              arrange(!!!syms(c(vars.group, var.numeric)))



# Table of Statistics
df.table.grp.stats <- df.select %>% summarize_at(vars(var.numeric), funs(!!!strs.all))

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

am	vs	mean	median	sd	IQR	mad	10%	25%	50%	75%	90%	min	max	first	last	n_distinct
0	0	15.05000	15.20	2.774396	2.575	2.29803	10.69	14.050	15.20	16.625	18.56	10.4	19.2	10.4	19.2	10
0	1	20.74286	21.40	2.471071	3.500	3.26172	17.98	18.650	21.40	22.150	23.44	17.8	24.4	17.8	24.4	7
1	0	19.75000	20.35	4.008865	4.225	3.85476	15.40	16.775	20.35	21.000	23.50	15.0	26.0	15.0	26.0	5
1	1	28.37143	30.40	4.757701	6.350	4.59606	22.24	25.050	30.40	31.400	33.00	21.4	33.9	21.4	33.9	6


# Add Stat Name
if (length(strs.all) == 1) {
    # give it a name, otherwise if only one stat, name of stat not saved
    df.table.grp.stats <- df.table.grp.stats %>% rename(!!strs.all := !!sym(var.numeric))
}

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

am	vs	mean	median	sd	IQR	mad	10%	25%	50%	75%	90%	min	max	first	last	n_distinct
0	0	15.05000	15.20	2.774396	2.575	2.29803	10.69	14.050	15.20	16.625	18.56	10.4	19.2	10.4	19.2	10
0	1	20.74286	21.40	2.471071	3.500	3.26172	17.98	18.650	21.40	22.150	23.44	17.8	24.4	17.8	24.4	7
1	0	19.75000	20.35	4.008865	4.225	3.85476	15.40	16.775	20.35	21.000	23.50	15.0	26.0	15.0	26.0	5
1	1	28.37143	30.40	4.757701	6.350	4.59606	22.24	25.050	30.40	31.400	33.00	21.4	33.9	21.4	33.9	6

Statistics as Row


# Row of Statistics
str.vars.group.combine <- paste0(vars.group, collapse='_')
if (length(vars.group) == 1) {
    df.row.grp.stats <- df.table.grp.stats %>%
            mutate(!!(str.vars.group.combine) := paste0(var.numeric, '.',
                                           vars.group, '.g',
                                           (!!!syms(vars.group)))) %>%
            gather(variable, value, -one_of(vars.group)) %>%
            unite(str.vars.group.combine, c(str.vars.group.combine, 'variable')) %>%
            spread(str.vars.group.combine, value)
} else {
    df.row.grp.stats <- df.table.grp.stats %>%
                            mutate(vars.groups.combine := paste0(paste0(vars.group, collapse='.')),
                                   !!(str.vars.group.combine) := paste0(interaction(!!!(syms(vars.group))))) %>%
                            mutate(!!(str.vars.group.combine) := paste0(var.numeric, '.', vars.groups.combine, '.',
                                                                       (!!sym(str.vars.group.combine)))) %>%
                            ungroup() %>%
                            select(-vars.groups.combine, -one_of(vars.group)) %>%
            gather(variable, value, -one_of(str.vars.group.combine))  %>%
            unite(str.vars.group.combine, c(str.vars.group.combine, 'variable')) %>%
            spread(str.vars.group.combine, value)
}
#> Warning: attributes are not identical across measure variables;
#> they will be dropped
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(str.vars.group.combine)` instead of `str.vars.group.combine` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.

# Display Results
kable(df.row.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

mpg.am.vs.0.0_10%	mpg.am.vs.0.0_25%	mpg.am.vs.0.0_50%	mpg.am.vs.0.0_75%	mpg.am.vs.0.0_90%	mpg.am.vs.0.0_first	mpg.am.vs.0.0_IQR	mpg.am.vs.0.0_last	mpg.am.vs.0.0_mad	mpg.am.vs.0.0_max	mpg.am.vs.0.0_mean	mpg.am.vs.0.0_median	mpg.am.vs.0.0_min	mpg.am.vs.0.0_n_distinct	mpg.am.vs.0.0_sd	mpg.am.vs.0.1_10%	mpg.am.vs.0.1_25%	mpg.am.vs.0.1_50%	mpg.am.vs.0.1_75%	mpg.am.vs.0.1_90%	mpg.am.vs.0.1_first	mpg.am.vs.0.1_IQR	mpg.am.vs.0.1_last	mpg.am.vs.0.1_mad	mpg.am.vs.0.1_max	mpg.am.vs.0.1_mean	mpg.am.vs.0.1_median	mpg.am.vs.0.1_min	mpg.am.vs.0.1_n_distinct	mpg.am.vs.0.1_sd	mpg.am.vs.1.0_10%	mpg.am.vs.1.0_25%	mpg.am.vs.1.0_50%	mpg.am.vs.1.0_75%	mpg.am.vs.1.0_90%	mpg.am.vs.1.0_first	mpg.am.vs.1.0_IQR	mpg.am.vs.1.0_last	mpg.am.vs.1.0_mad	mpg.am.vs.1.0_max	mpg.am.vs.1.0_mean	mpg.am.vs.1.0_median	mpg.am.vs.1.0_min	mpg.am.vs.1.0_n_distinct	mpg.am.vs.1.0_sd	mpg.am.vs.1.1_10%	mpg.am.vs.1.1_25%	mpg.am.vs.1.1_50%	mpg.am.vs.1.1_75%	mpg.am.vs.1.1_90%	mpg.am.vs.1.1_first	mpg.am.vs.1.1_IQR	mpg.am.vs.1.1_last	mpg.am.vs.1.1_mad	mpg.am.vs.1.1_max	mpg.am.vs.1.1_mean	mpg.am.vs.1.1_median	mpg.am.vs.1.1_min	mpg.am.vs.1.1_n_distinct	mpg.am.vs.1.1_sd
10.69	14.05	15.2	16.625	18.56	10.4	2.575	19.2	2.29803	19.2	15.05	15.2	10.4	10	2.774396	17.98	18.65	21.4	22.15	23.44	17.8	3.5	24.4	3.26172	24.4	20.74286	21.4	17.8	7	2.471071	15.4	16.775	20.35	21	23.5	15	4.225	26	3.85476	26	19.75	20.35	15	5	4.008865	22.24	25.05	30.4	31.4	33	21.4	6.35	33.9	4.59606	33.9	28.37143	30.4	21.4	6	4.757701

Process Outputs


# Clean up name strings
names(df.table.grp.stats) <- gsub(x = names(df.table.grp.stats),pattern = "_", replacement = "\\.")
names(df.row.grp.stats) <- gsub(x = names(df.row.grp.stats),pattern = "_", replacement = "\\.")

# Return
list.return <- list(df_table_grp_stats = df.table.grp.stats, df_row_grp_stats = df.row.grp.stats)

# Overall Statistics, without grouping
if (boo.overall.stats) {
    df.row.stats.all <- c(df.row.grp.stats, df.overall.stats)
    list.return <- append(list.return, list(df_overall_stats = df.overall.stats,
                                            df_row_stats_all = df.row.stats.all))
}

# Display Results
kable(df.table.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

am	vs	mean	median	sd	IQR	mad	10%	25%	50%	75%	90%	min	max	first	last	n.distinct
0	0	15.05000	15.20	2.774396	2.575	2.29803	10.69	14.050	15.20	16.625	18.56	10.4	19.2	10.4	19.2	10
0	1	20.74286	21.40	2.471071	3.500	3.26172	17.98	18.650	21.40	22.150	23.44	17.8	24.4	17.8	24.4	7
1	0	19.75000	20.35	4.008865	4.225	3.85476	15.40	16.775	20.35	21.000	23.50	15.0	26.0	15.0	26.0	5
1	1	28.37143	30.40	4.757701	6.350	4.59606	22.24	25.050	30.40	31.400	33.00	21.4	33.9	21.4	33.9	6

kable(df.row.grp.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

mpg.am.vs.0.0.10%	mpg.am.vs.0.0.25%	mpg.am.vs.0.0.50%	mpg.am.vs.0.0.75%	mpg.am.vs.0.0.90%	mpg.am.vs.0.0.first	mpg.am.vs.0.0.IQR	mpg.am.vs.0.0.last	mpg.am.vs.0.0.mad	mpg.am.vs.0.0.max	mpg.am.vs.0.0.mean	mpg.am.vs.0.0.median	mpg.am.vs.0.0.min	mpg.am.vs.0.0.n.distinct	mpg.am.vs.0.0.sd	mpg.am.vs.0.1.10%	mpg.am.vs.0.1.25%	mpg.am.vs.0.1.50%	mpg.am.vs.0.1.75%	mpg.am.vs.0.1.90%	mpg.am.vs.0.1.first	mpg.am.vs.0.1.IQR	mpg.am.vs.0.1.last	mpg.am.vs.0.1.mad	mpg.am.vs.0.1.max	mpg.am.vs.0.1.mean	mpg.am.vs.0.1.median	mpg.am.vs.0.1.min	mpg.am.vs.0.1.n.distinct	mpg.am.vs.0.1.sd	mpg.am.vs.1.0.10%	mpg.am.vs.1.0.25%	mpg.am.vs.1.0.50%	mpg.am.vs.1.0.75%	mpg.am.vs.1.0.90%	mpg.am.vs.1.0.first	mpg.am.vs.1.0.IQR	mpg.am.vs.1.0.last	mpg.am.vs.1.0.mad	mpg.am.vs.1.0.max	mpg.am.vs.1.0.mean	mpg.am.vs.1.0.median	mpg.am.vs.1.0.min	mpg.am.vs.1.0.n.distinct	mpg.am.vs.1.0.sd	mpg.am.vs.1.1.10%	mpg.am.vs.1.1.25%	mpg.am.vs.1.1.50%	mpg.am.vs.1.1.75%	mpg.am.vs.1.1.90%	mpg.am.vs.1.1.first	mpg.am.vs.1.1.IQR	mpg.am.vs.1.1.last	mpg.am.vs.1.1.mad	mpg.am.vs.1.1.max	mpg.am.vs.1.1.mean	mpg.am.vs.1.1.median	mpg.am.vs.1.1.min	mpg.am.vs.1.1.n.distinct	mpg.am.vs.1.1.sd
10.69	14.05	15.2	16.625	18.56	10.4	2.575	19.2	2.29803	19.2	15.05	15.2	10.4	10	2.774396	17.98	18.65	21.4	22.15	23.44	17.8	3.5	24.4	3.26172	24.4	20.74286	21.4	17.8	7	2.471071	15.4	16.775	20.35	21	23.5	15	4.225	26	3.85476	26	19.75	20.35	15	5	4.008865	22.24	25.05	30.4	31.4	33	21.4	6.35	33.9	4.59606	33.9	28.37143	30.4	21.4	6	4.757701

kable(df.overall.stats) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

mpg.mean	mpg.median	mpg.sd	mpg.IQR	mpg.mad	mpg.10%	mpg.25%	mpg.50%	mpg.75%	mpg.90%	mpg.min	mpg.max	mpg.first	mpg.last	mpg.n_distinct
20.09062	19.2	6.026948	7.375	5.41149	14.34	15.425	19.2	22.8	30.09	10.4	33.9	21	21.4	25