Show percentiles, mean, sd, min, max, first, last, and unique counts across quantitative variables in a dataset. This file works out how the ff_summ_bygroup function works from Fan’s REconTools Package.
There is a quantitative variable, summarize this quantitative variable by multiple categorical groups, show a variety of statistics. including:
- percentiles
- mean, sd, etc
- min, max 4, first, last, unique counts
Load Data and Parameers
data(mtcars)
df_mtcars <- mtcars
df <- df_mtcars
vars.group <- c('am', 'vs')
var.numeric <- 'mpg'
str.stats.group <- 'allperc'
ar.perc <- c(0.10, 0.25, 0.5, 0.75, 0.9)
boo.overall.stats <- TRUE
By Group Summarizing
Statistics to Include
# List of statistics
# https://rdrr.io/cran/dplyr/man/summarise.html
strs.center <- c('mean', 'median')
strs.spread <- c('sd', 'IQR', 'mad')
strs.range <- c('min', 'max')
strs.pos <- c('first', 'last')
strs.count <- c('n_distinct')
# Grouping of Statistics
if (str.stats.group == 'main') {
strs.all <- c('mean', 'min', 'max', 'sd')
}
if (str.stats.group == 'all') {
strs.all <- c(strs.center, strs.spread, strs.range, strs.pos, strs.count)
}
if (str.stats.group == 'allperc') {
ar_st_percentile_func_names <- paste0(ar.perc*100, "%")
funs_percentiles <- map(ar.perc, ~partial(quantile, probs = .x, na.rm = TRUE)) %>% set_names(nm = ar_st_percentile_func_names)
strs.all <- c(strs.center, strs.spread, funs_percentiles, strs.range, strs.pos, strs.count)
}
Overall Statistics
# Start Transform
df <- df %>% drop_na() %>% mutate(!!(var.numeric) := as.numeric(!!sym(var.numeric)))
# Overall Statistics
if (boo.overall.stats) {
df.overall.stats <- df %>% summarize_at(vars(var.numeric), funs(!!!strs.all))
if (length(strs.all) == 1) {
# give it a name, otherwise if only one stat, name of stat not saved
df.overall.stats <- df.overall.stats %>% rename(!!strs.all := !!sym(var.numeric))
}
names(df.overall.stats) <- paste0(var.numeric, '.', names(df.overall.stats))
}
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(var.numeric)` instead of `var.numeric` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.
#> Warning: `funs()` was deprecated in dplyr 0.8.0.
#> Please use a list of either functions or lambdas:
#>
#> # Simple named list:
#> list(mean = mean, median = median)
#>
#> # Auto named with `tibble::lst()`:
#> tibble::lst(mean, median)
#>
#> # Using lambdas
#> list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Display Results
kable(df.overall.stats) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg.mean
|
mpg.median
|
mpg.sd
|
mpg.IQR
|
mpg.mad
|
mpg.10%
|
mpg.25%
|
mpg.50%
|
mpg.75%
|
mpg.90%
|
mpg.min
|
mpg.max
|
mpg.first
|
mpg.last
|
mpg.n_distinct
|
20.09062
|
19.2
|
6.026948
|
7.375
|
5.41149
|
14.34
|
15.425
|
19.2
|
22.8
|
30.09
|
10.4
|
33.9
|
21
|
21.4
|
25
|
Summarizing by Groups
am
|
vs
|
mean
|
median
|
sd
|
IQR
|
mad
|
10%
|
25%
|
50%
|
75%
|
90%
|
min
|
max
|
first
|
last
|
n_distinct
|
0
|
0
|
15.05000
|
15.20
|
2.774396
|
2.575
|
2.29803
|
10.69
|
14.050
|
15.20
|
16.625
|
18.56
|
10.4
|
19.2
|
10.4
|
19.2
|
10
|
0
|
1
|
20.74286
|
21.40
|
2.471071
|
3.500
|
3.26172
|
17.98
|
18.650
|
21.40
|
22.150
|
23.44
|
17.8
|
24.4
|
17.8
|
24.4
|
7
|
1
|
0
|
19.75000
|
20.35
|
4.008865
|
4.225
|
3.85476
|
15.40
|
16.775
|
20.35
|
21.000
|
23.50
|
15.0
|
26.0
|
15.0
|
26.0
|
5
|
1
|
1
|
28.37143
|
30.40
|
4.757701
|
6.350
|
4.59606
|
22.24
|
25.050
|
30.40
|
31.400
|
33.00
|
21.4
|
33.9
|
21.4
|
33.9
|
6
|
# Add Stat Name
if (length(strs.all) == 1) {
# give it a name, otherwise if only one stat, name of stat not saved
df.table.grp.stats <- df.table.grp.stats %>% rename(!!strs.all := !!sym(var.numeric))
}
# Display Results
kable(df.table.grp.stats) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
am
|
vs
|
mean
|
median
|
sd
|
IQR
|
mad
|
10%
|
25%
|
50%
|
75%
|
90%
|
min
|
max
|
first
|
last
|
n_distinct
|
0
|
0
|
15.05000
|
15.20
|
2.774396
|
2.575
|
2.29803
|
10.69
|
14.050
|
15.20
|
16.625
|
18.56
|
10.4
|
19.2
|
10.4
|
19.2
|
10
|
0
|
1
|
20.74286
|
21.40
|
2.471071
|
3.500
|
3.26172
|
17.98
|
18.650
|
21.40
|
22.150
|
23.44
|
17.8
|
24.4
|
17.8
|
24.4
|
7
|
1
|
0
|
19.75000
|
20.35
|
4.008865
|
4.225
|
3.85476
|
15.40
|
16.775
|
20.35
|
21.000
|
23.50
|
15.0
|
26.0
|
15.0
|
26.0
|
5
|
1
|
1
|
28.37143
|
30.40
|
4.757701
|
6.350
|
4.59606
|
22.24
|
25.050
|
30.40
|
31.400
|
33.00
|
21.4
|
33.9
|
21.4
|
33.9
|
6
|
Statistics as Row
# Row of Statistics
str.vars.group.combine <- paste0(vars.group, collapse='_')
if (length(vars.group) == 1) {
df.row.grp.stats <- df.table.grp.stats %>%
mutate(!!(str.vars.group.combine) := paste0(var.numeric, '.',
vars.group, '.g',
(!!!syms(vars.group)))) %>%
gather(variable, value, -one_of(vars.group)) %>%
unite(str.vars.group.combine, c(str.vars.group.combine, 'variable')) %>%
spread(str.vars.group.combine, value)
} else {
df.row.grp.stats <- df.table.grp.stats %>%
mutate(vars.groups.combine := paste0(paste0(vars.group, collapse='.')),
!!(str.vars.group.combine) := paste0(interaction(!!!(syms(vars.group))))) %>%
mutate(!!(str.vars.group.combine) := paste0(var.numeric, '.', vars.groups.combine, '.',
(!!sym(str.vars.group.combine)))) %>%
ungroup() %>%
select(-vars.groups.combine, -one_of(vars.group)) %>%
gather(variable, value, -one_of(str.vars.group.combine)) %>%
unite(str.vars.group.combine, c(str.vars.group.combine, 'variable')) %>%
spread(str.vars.group.combine, value)
}
#> Warning: attributes are not identical across measure variables;
#> they will be dropped
#> Note: Using an external vector in selections is ambiguous.
#> i Use `all_of(str.vars.group.combine)` instead of `str.vars.group.combine` to silence this message.
#> i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#> This message is displayed once per session.
# Display Results
kable(df.row.grp.stats) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg.am.vs.0.0_10%
|
mpg.am.vs.0.0_25%
|
mpg.am.vs.0.0_50%
|
mpg.am.vs.0.0_75%
|
mpg.am.vs.0.0_90%
|
mpg.am.vs.0.0_first
|
mpg.am.vs.0.0_IQR
|
mpg.am.vs.0.0_last
|
mpg.am.vs.0.0_mad
|
mpg.am.vs.0.0_max
|
mpg.am.vs.0.0_mean
|
mpg.am.vs.0.0_median
|
mpg.am.vs.0.0_min
|
mpg.am.vs.0.0_n_distinct
|
mpg.am.vs.0.0_sd
|
mpg.am.vs.0.1_10%
|
mpg.am.vs.0.1_25%
|
mpg.am.vs.0.1_50%
|
mpg.am.vs.0.1_75%
|
mpg.am.vs.0.1_90%
|
mpg.am.vs.0.1_first
|
mpg.am.vs.0.1_IQR
|
mpg.am.vs.0.1_last
|
mpg.am.vs.0.1_mad
|
mpg.am.vs.0.1_max
|
mpg.am.vs.0.1_mean
|
mpg.am.vs.0.1_median
|
mpg.am.vs.0.1_min
|
mpg.am.vs.0.1_n_distinct
|
mpg.am.vs.0.1_sd
|
mpg.am.vs.1.0_10%
|
mpg.am.vs.1.0_25%
|
mpg.am.vs.1.0_50%
|
mpg.am.vs.1.0_75%
|
mpg.am.vs.1.0_90%
|
mpg.am.vs.1.0_first
|
mpg.am.vs.1.0_IQR
|
mpg.am.vs.1.0_last
|
mpg.am.vs.1.0_mad
|
mpg.am.vs.1.0_max
|
mpg.am.vs.1.0_mean
|
mpg.am.vs.1.0_median
|
mpg.am.vs.1.0_min
|
mpg.am.vs.1.0_n_distinct
|
mpg.am.vs.1.0_sd
|
mpg.am.vs.1.1_10%
|
mpg.am.vs.1.1_25%
|
mpg.am.vs.1.1_50%
|
mpg.am.vs.1.1_75%
|
mpg.am.vs.1.1_90%
|
mpg.am.vs.1.1_first
|
mpg.am.vs.1.1_IQR
|
mpg.am.vs.1.1_last
|
mpg.am.vs.1.1_mad
|
mpg.am.vs.1.1_max
|
mpg.am.vs.1.1_mean
|
mpg.am.vs.1.1_median
|
mpg.am.vs.1.1_min
|
mpg.am.vs.1.1_n_distinct
|
mpg.am.vs.1.1_sd
|
10.69
|
14.05
|
15.2
|
16.625
|
18.56
|
10.4
|
2.575
|
19.2
|
2.29803
|
19.2
|
15.05
|
15.2
|
10.4
|
10
|
2.774396
|
17.98
|
18.65
|
21.4
|
22.15
|
23.44
|
17.8
|
3.5
|
24.4
|
3.26172
|
24.4
|
20.74286
|
21.4
|
17.8
|
7
|
2.471071
|
15.4
|
16.775
|
20.35
|
21
|
23.5
|
15
|
4.225
|
26
|
3.85476
|
26
|
19.75
|
20.35
|
15
|
5
|
4.008865
|
22.24
|
25.05
|
30.4
|
31.4
|
33
|
21.4
|
6.35
|
33.9
|
4.59606
|
33.9
|
28.37143
|
30.4
|
21.4
|
6
|
4.757701
|
Process Outputs
# Clean up name strings
names(df.table.grp.stats) <- gsub(x = names(df.table.grp.stats),pattern = "_", replacement = "\\.")
names(df.row.grp.stats) <- gsub(x = names(df.row.grp.stats),pattern = "_", replacement = "\\.")
# Return
list.return <- list(df_table_grp_stats = df.table.grp.stats, df_row_grp_stats = df.row.grp.stats)
# Overall Statistics, without grouping
if (boo.overall.stats) {
df.row.stats.all <- c(df.row.grp.stats, df.overall.stats)
list.return <- append(list.return, list(df_overall_stats = df.overall.stats,
df_row_stats_all = df.row.stats.all))
}
# Display Results
kable(df.table.grp.stats) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
am
|
vs
|
mean
|
median
|
sd
|
IQR
|
mad
|
10%
|
25%
|
50%
|
75%
|
90%
|
min
|
max
|
first
|
last
|
n.distinct
|
0
|
0
|
15.05000
|
15.20
|
2.774396
|
2.575
|
2.29803
|
10.69
|
14.050
|
15.20
|
16.625
|
18.56
|
10.4
|
19.2
|
10.4
|
19.2
|
10
|
0
|
1
|
20.74286
|
21.40
|
2.471071
|
3.500
|
3.26172
|
17.98
|
18.650
|
21.40
|
22.150
|
23.44
|
17.8
|
24.4
|
17.8
|
24.4
|
7
|
1
|
0
|
19.75000
|
20.35
|
4.008865
|
4.225
|
3.85476
|
15.40
|
16.775
|
20.35
|
21.000
|
23.50
|
15.0
|
26.0
|
15.0
|
26.0
|
5
|
1
|
1
|
28.37143
|
30.40
|
4.757701
|
6.350
|
4.59606
|
22.24
|
25.050
|
30.40
|
31.400
|
33.00
|
21.4
|
33.9
|
21.4
|
33.9
|
6
|
kable(df.row.grp.stats) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg.am.vs.0.0.10%
|
mpg.am.vs.0.0.25%
|
mpg.am.vs.0.0.50%
|
mpg.am.vs.0.0.75%
|
mpg.am.vs.0.0.90%
|
mpg.am.vs.0.0.first
|
mpg.am.vs.0.0.IQR
|
mpg.am.vs.0.0.last
|
mpg.am.vs.0.0.mad
|
mpg.am.vs.0.0.max
|
mpg.am.vs.0.0.mean
|
mpg.am.vs.0.0.median
|
mpg.am.vs.0.0.min
|
mpg.am.vs.0.0.n.distinct
|
mpg.am.vs.0.0.sd
|
mpg.am.vs.0.1.10%
|
mpg.am.vs.0.1.25%
|
mpg.am.vs.0.1.50%
|
mpg.am.vs.0.1.75%
|
mpg.am.vs.0.1.90%
|
mpg.am.vs.0.1.first
|
mpg.am.vs.0.1.IQR
|
mpg.am.vs.0.1.last
|
mpg.am.vs.0.1.mad
|
mpg.am.vs.0.1.max
|
mpg.am.vs.0.1.mean
|
mpg.am.vs.0.1.median
|
mpg.am.vs.0.1.min
|
mpg.am.vs.0.1.n.distinct
|
mpg.am.vs.0.1.sd
|
mpg.am.vs.1.0.10%
|
mpg.am.vs.1.0.25%
|
mpg.am.vs.1.0.50%
|
mpg.am.vs.1.0.75%
|
mpg.am.vs.1.0.90%
|
mpg.am.vs.1.0.first
|
mpg.am.vs.1.0.IQR
|
mpg.am.vs.1.0.last
|
mpg.am.vs.1.0.mad
|
mpg.am.vs.1.0.max
|
mpg.am.vs.1.0.mean
|
mpg.am.vs.1.0.median
|
mpg.am.vs.1.0.min
|
mpg.am.vs.1.0.n.distinct
|
mpg.am.vs.1.0.sd
|
mpg.am.vs.1.1.10%
|
mpg.am.vs.1.1.25%
|
mpg.am.vs.1.1.50%
|
mpg.am.vs.1.1.75%
|
mpg.am.vs.1.1.90%
|
mpg.am.vs.1.1.first
|
mpg.am.vs.1.1.IQR
|
mpg.am.vs.1.1.last
|
mpg.am.vs.1.1.mad
|
mpg.am.vs.1.1.max
|
mpg.am.vs.1.1.mean
|
mpg.am.vs.1.1.median
|
mpg.am.vs.1.1.min
|
mpg.am.vs.1.1.n.distinct
|
mpg.am.vs.1.1.sd
|
10.69
|
14.05
|
15.2
|
16.625
|
18.56
|
10.4
|
2.575
|
19.2
|
2.29803
|
19.2
|
15.05
|
15.2
|
10.4
|
10
|
2.774396
|
17.98
|
18.65
|
21.4
|
22.15
|
23.44
|
17.8
|
3.5
|
24.4
|
3.26172
|
24.4
|
20.74286
|
21.4
|
17.8
|
7
|
2.471071
|
15.4
|
16.775
|
20.35
|
21
|
23.5
|
15
|
4.225
|
26
|
3.85476
|
26
|
19.75
|
20.35
|
15
|
5
|
4.008865
|
22.24
|
25.05
|
30.4
|
31.4
|
33
|
21.4
|
6.35
|
33.9
|
4.59606
|
33.9
|
28.37143
|
30.4
|
21.4
|
6
|
4.757701
|
kable(df.overall.stats) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
mpg.mean
|
mpg.median
|
mpg.sd
|
mpg.IQR
|
mpg.mad
|
mpg.10%
|
mpg.25%
|
mpg.50%
|
mpg.75%
|
mpg.90%
|
mpg.min
|
mpg.max
|
mpg.first
|
mpg.last
|
mpg.n_distinct
|
20.09062
|
19.2
|
6.026948
|
7.375
|
5.41149
|
14.34
|
15.425
|
19.2
|
22.8
|
30.09
|
10.4
|
33.9
|
21
|
21.4
|
25
|