Summarize one variable in a dataset, by another categorical variable

Generate distributional and other summary statistics for a continuous variable, computed separately within each category of one or more discrete grouping variables; for example, wage by gender.

ff_summ_bygroup(
  df,
  vars.group,
  var.numeric,
  str.stats.group = "main",
  ar.perc = c(0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99),
  str.stats.specify = NULL,
  boo.overall.stats = TRUE
)

Arguments

df: dataframe, the input dataframe of interest
vars.group: character vector of grouping variable names, for example gender and age group
var.numeric: string, the name of the continuous quantitative variable to summarize
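str.stats.group: string selecting which set of statistics to compute, for example "main", "mainperc" or "allperc"; see the option definitions in the function source (line 31 and below of the original R file) for the full list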
ar.perc: numeric array of percentiles (as fractions between 0 and 1) to calculate; only used when str.stats.group selects a set that includes percentiles, such as 'mainperc' or 'allperc'

Value

a named list containing the following elements:

df_table_grp_stats - A dataframe where each row is a combination of categories, and columns are categories and statistics
df_row_grp_stats - A single row with all statistics
df_overall_stats - A dataframe with non-grouped overall summaries
df_row_stats_all - A named list of all statistics generated

References

https://fanwangecon.github.io/REconTools/reference/ff_summ_bygroup.html
https://fanwangecon.github.io/REconTools/articles/fv_summ_bygroup.html
https://github.com/FanWangEcon/REconTools/blob/master/R/ff_summ_bygroup.R

Author

Fan Wang, http://fanwangecon.github.io

Examples

data(mtcars)
df_mtcars <- mtcars
df <- df_mtcars
vars.group <- c('am', 'vs')
var.numeric <- 'mpg'
str.stats.group <- 'allperc'
ar.perc <- c(0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99)
ls_summ_by_group <- ff_summ_bygroup(df, vars.group, var.numeric, str.stats.group, ar.perc)
#> Warning: attributes are not identical across measure variables;
#> they will be dropped
df_table_grp_stats <- ls_summ_by_group$df_table_grp_stats
df_row_grp_stats <- ls_summ_by_group$df_row_grp_stats
df_overall_stats <- ls_summ_by_group$df_overall_stats
df_row_stats_all <- ls_summ_by_group$df_row_stats_all
print(df_table_grp_stats)
#> # A tibble: 4 x 21
#> # Groups:   am [2]
#>      am    vs  mean median    sd   IQR   mad  `1%`  `5%` `10%` `25%` `50%` `75%`
#>   <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     0     0  15.0   15.2  2.77  2.57  2.30  10.4  10.4  10.7  14.0  15.2  16.6
#> 2     0     1  20.7   21.4  2.47  3.5   3.26  17.8  17.9  18.0  18.6  21.4  22.2
#> 3     1     0  19.8   20.4  4.01  4.22  3.85  15.0  15.2  15.4  16.8  20.4  21  
#> 4     1     1  28.4   30.4  4.76  6.35  4.60  21.5  21.8  22.2  25.0  30.4  31.4
#> # ... with 8 more variables: 90% <dbl>, 95% <dbl>, 99% <dbl>, min <dbl>,
#> #   max <dbl>, first <dbl>, last <dbl>, n.distinct <int>
print(df_row_grp_stats)
#> # A tibble: 1 x 76
#>   `mpg.am.vs.0.0.1%` `mpg.am.vs.0.0.10%` `mpg.am.vs.0.0.25%` `mpg.am.vs.0.0.5%`
#>                <dbl>               <dbl>               <dbl>              <dbl>
#> 1               10.4                10.7                14.0               10.4
#> # ... with 72 more variables: mpg.am.vs.0.0.50% <dbl>, mpg.am.vs.0.0.75% <dbl>,
#> #   mpg.am.vs.0.0.90% <dbl>, mpg.am.vs.0.0.95% <dbl>, mpg.am.vs.0.0.99% <dbl>,
#> #   mpg.am.vs.0.0.IQR <dbl>, mpg.am.vs.0.0.first <dbl>,
#> #   mpg.am.vs.0.0.last <dbl>, mpg.am.vs.0.0.mad <dbl>, mpg.am.vs.0.0.max <dbl>,
#> #   mpg.am.vs.0.0.mean <dbl>, mpg.am.vs.0.0.median <dbl>,
#> #   mpg.am.vs.0.0.min <dbl>, mpg.am.vs.0.0.n.distinct <dbl>,
#> #   mpg.am.vs.0.0.sd <dbl>, mpg.am.vs.0.1.1% <dbl>, ...
print(df_overall_stats)
#>   mpg.mean mpg.median   mpg.sd mpg.IQR mpg.mad mpg.1% mpg.5% mpg.10% mpg.25%
#> 1 20.09062       19.2 6.026948   7.375 5.41149   10.4 11.995   14.34  15.425
#>   mpg.50% mpg.75% mpg.90% mpg.95% mpg.99% mpg.min mpg.max mpg.first mpg.last
#> 1    19.2    22.8   30.09    31.3  33.435    10.4    33.9        21     21.4
#>   mpg.n_distinct
#> 1             25
print(df_row_stats_all)
#> $`mpg.am.vs.0.0.1%`
#> [1] 10.4
#> 
#> $`mpg.am.vs.0.0.10%`
#> [1] 10.69
#> 
#> $`mpg.am.vs.0.0.25%`
#> [1] 14.05
#> 
#> $`mpg.am.vs.0.0.5%`
#> [1] 10.4
#> 
#> $`mpg.am.vs.0.0.50%`
#> [1] 15.2
#> 
#> $`mpg.am.vs.0.0.75%`
#> [1] 16.625
#> 
#> $`mpg.am.vs.0.0.90%`
#> [1] 18.56
#> 
#> $`mpg.am.vs.0.0.95%`
#> [1] 18.925
#> 
#> $`mpg.am.vs.0.0.99%`
#> [1] 19.145
#> 
#> $mpg.am.vs.0.0.IQR
#> [1] 2.575
#> 
#> $mpg.am.vs.0.0.first
#> [1] 10.4
#> 
#> $mpg.am.vs.0.0.last
#> [1] 19.2
#> 
#> $mpg.am.vs.0.0.mad
#> [1] 2.29803
#> 
#> $mpg.am.vs.0.0.max
#> [1] 19.2
#> 
#> $mpg.am.vs.0.0.mean
#> [1] 15.05
#> 
#> $mpg.am.vs.0.0.median
#> [1] 15.2
#> 
#> $mpg.am.vs.0.0.min
#> [1] 10.4
#> 
#> $mpg.am.vs.0.0.n.distinct
#> [1] 10
#> 
#> $mpg.am.vs.0.0.sd
#> [1] 2.774396
#> 
#> $`mpg.am.vs.0.1.1%`
#> [1] 17.818
#> 
#> $`mpg.am.vs.0.1.10%`
#> [1] 17.98
#> 
#> $`mpg.am.vs.0.1.25%`
#> [1] 18.65
#> 
#> $`mpg.am.vs.0.1.5%`
#> [1] 17.89
#> 
#> $`mpg.am.vs.0.1.50%`
#> [1] 21.4
#> 
#> $`mpg.am.vs.0.1.75%`
#> [1] 22.15
#> 
#> $`mpg.am.vs.0.1.90%`
#> [1] 23.44
#> 
#> $`mpg.am.vs.0.1.95%`
#> [1] 23.92
#> 
#> $`mpg.am.vs.0.1.99%`
#> [1] 24.304
#> 
#> $mpg.am.vs.0.1.IQR
#> [1] 3.5
#> 
#> $mpg.am.vs.0.1.first
#> [1] 17.8
#> 
#> $mpg.am.vs.0.1.last
#> [1] 24.4
#> 
#> $mpg.am.vs.0.1.mad
#> [1] 3.26172
#> 
#> $mpg.am.vs.0.1.max
#> [1] 24.4
#> 
#> $mpg.am.vs.0.1.mean
#> [1] 20.74286
#> 
#> $mpg.am.vs.0.1.median
#> [1] 21.4
#> 
#> $mpg.am.vs.0.1.min
#> [1] 17.8
#> 
#> $mpg.am.vs.0.1.n.distinct
#> [1] 7
#> 
#> $mpg.am.vs.0.1.sd
#> [1] 2.471071
#> 
#> $`mpg.am.vs.1.0.1%`
#> [1] 15.04
#> 
#> $`mpg.am.vs.1.0.10%`
#> [1] 15.4
#> 
#> $`mpg.am.vs.1.0.25%`
#> [1] 16.775
#> 
#> $`mpg.am.vs.1.0.5%`
#> [1] 15.2
#> 
#> $`mpg.am.vs.1.0.50%`
#> [1] 20.35
#> 
#> $`mpg.am.vs.1.0.75%`
#> [1] 21
#> 
#> $`mpg.am.vs.1.0.90%`
#> [1] 23.5
#> 
#> $`mpg.am.vs.1.0.95%`
#> [1] 24.75
#> 
#> $`mpg.am.vs.1.0.99%`
#> [1] 25.75
#> 
#> $mpg.am.vs.1.0.IQR
#> [1] 4.225
#> 
#> $mpg.am.vs.1.0.first
#> [1] 15
#> 
#> $mpg.am.vs.1.0.last
#> [1] 26
#> 
#> $mpg.am.vs.1.0.mad
#> [1] 3.85476
#> 
#> $mpg.am.vs.1.0.max
#> [1] 26
#> 
#> $mpg.am.vs.1.0.mean
#> [1] 19.75
#> 
#> $mpg.am.vs.1.0.median
#> [1] 20.35
#> 
#> $mpg.am.vs.1.0.min
#> [1] 15
#> 
#> $mpg.am.vs.1.0.n.distinct
#> [1] 5
#> 
#> $mpg.am.vs.1.0.sd
#> [1] 4.008865
#> 
#> $`mpg.am.vs.1.1.1%`
#> [1] 21.484
#> 
#> $`mpg.am.vs.1.1.10%`
#> [1] 22.24
#> 
#> $`mpg.am.vs.1.1.25%`
#> [1] 25.05
#> 
#> $`mpg.am.vs.1.1.5%`
#> [1] 21.82
#> 
#> $`mpg.am.vs.1.1.50%`
#> [1] 30.4
#> 
#> $`mpg.am.vs.1.1.75%`
#> [1] 31.4
#> 
#> $`mpg.am.vs.1.1.90%`
#> [1] 33
#> 
#> $`mpg.am.vs.1.1.95%`
#> [1] 33.45
#> 
#> $`mpg.am.vs.1.1.99%`
#> [1] 33.81
#> 
#> $mpg.am.vs.1.1.IQR
#> [1] 6.35
#> 
#> $mpg.am.vs.1.1.first
#> [1] 21.4
#> 
#> $mpg.am.vs.1.1.last
#> [1] 33.9
#> 
#> $mpg.am.vs.1.1.mad
#> [1] 4.59606
#> 
#> $mpg.am.vs.1.1.max
#> [1] 33.9
#> 
#> $mpg.am.vs.1.1.mean
#> [1] 28.37143
#> 
#> $mpg.am.vs.1.1.median
#> [1] 30.4
#> 
#> $mpg.am.vs.1.1.min
#> [1] 21.4
#> 
#> $mpg.am.vs.1.1.n.distinct
#> [1] 6
#> 
#> $mpg.am.vs.1.1.sd
#> [1] 4.757701
#> 
#> $mpg.mean
#> [1] 20.09062
#> 
#> $mpg.median
#> [1] 19.2
#> 
#> $mpg.sd
#> [1] 6.026948
#> 
#> $mpg.IQR
#> [1] 7.375
#> 
#> $mpg.mad
#> [1] 5.41149
#> 
#> $`mpg.1%`
#>   1% 
#> 10.4 
#> 
#> $`mpg.5%`
#>     5% 
#> 11.995 
#> 
#> $`mpg.10%`
#>   10% 
#> 14.34 
#> 
#> $`mpg.25%`
#>    25% 
#> 15.425 
#> 
#> $`mpg.50%`
#>  50% 
#> 19.2 
#> 
#> $`mpg.75%`
#>  75% 
#> 22.8 
#> 
#> $`mpg.90%`
#>   90% 
#> 30.09 
#> 
#> $`mpg.95%`
#>  95% 
#> 31.3 
#> 
#> $`mpg.99%`
#>    99% 
#> 33.435 
#> 
#> $mpg.min
#> [1] 10.4
#> 
#> $mpg.max
#> [1] 33.9
#> 
#> $mpg.first
#> [1] 21
#> 
#> $mpg.last
#> [1] 21.4
#> 
#> $mpg.n_distinct
#> [1] 25
#>