Apply the Same Function over Columns and Row Groups

1.1 Convert Subset of Variables to Numeric

Multiply a subset of variables all by 10. We use dplyr’s across function to achieve this.

Note that in the example below, we also use across with group by to include a string array of grouping by variables.

Note: “across() makes it easy to apply the same transformation to multiple columns, allowing you to use select() semantics inside in”data-masking” functions like summarise() and mutate().”

# grouping by variables
ar_st_groups <- c("gear", "carb")
# use across to conduct operation over multiple variables
mtcars_3var_times10 <- mtcars %>% 
  group_by(across(one_of(ar_st_groups))) %>%
  mutate(across(matches("mpg|cyl|disp"), ~ .x * 10)) %>%
  select(gear, carb, mpg, cyl, disp) %>% head(n=5)
# pring
# Multiply several variables by 10
kable(mtcars_3var_times10 %>% slice_head(n = 5)) %>% 
    kable_styling_fc()

gear	carb	mpg	cyl	disp
3	1	214	60	2580
3	2	187	80	3600
4	1	228	40	1080
4	4	NA	60	1600
4	4	210	60	1600

1.2 Compute Row-specific Quantiles using Data Across Columns

We want to compute quantiles for each location, based on monthly variations in columns.

First, we generate a table with 12 columns (for months) and 3 rows (for locations).

# Generate data, 12 months as columns, and 
mt_data_rand <- matrix(rnorm(36, mean=0, sd=1), nrow=3, ncol=12)
it_rows <- seq(1, dim(mt_data_rand)[1])
it_cols <- seq(1, dim(mt_data_rand)[2])
# convert to table, column as month with leading 0
colnames(mt_data_rand) <- paste0('m', sprintf("%02d", it_cols))
tb_data_full <- as_tibble(mt_data_rand, rownames = NA) %>% 
    mutate(loc = paste0("loc", sprintf("%02d", row_number()))) %>%
    select(loc, everything())
# Display
kable(tb_data_full) %>% kable_styling_fc_wide()

loc	m01	m02	m03	m04	m05	m06	m07	m08	m09	m10	m11	m12
loc01	1.7869131	0.7013559	-0.2179749	-0.6250393	0.1533731	0.4264642	0.8781335	0.5539177	-0.3804710	-1.265396	-1.1231086	0.7799651
loc02	0.4978505	-0.4727914	-1.0260044	-1.6866933	-1.1381369	-0.2950715	0.8215811	-0.0619117	-0.6947070	2.168956	-0.4028848	-0.0833691
loc03	-1.9666172	-1.0678237	-0.7288912	0.8377870	1.2538149	0.8951257	0.6886403	-0.3059627	-0.2079173	1.207962	-0.4666554	0.2533185

Second, using apply to compute quantiles, row by row

# Extract the data components from the tibble, tibble has row and column names
tb_data_only <- tb_data_full %>% 
    column_to_rownames(var = "loc") %>% 
    select(contains("m"))
# Compute row specific quantiles
ar_quantiles_by_row <- apply(tb_data_only, 1, quantile, probs=0.75)
# Display
print(ar_quantiles_by_row)

##      loc01      loc02      loc03 
## 0.72100821 0.07802884 0.85212170

Third, generate matrix of two columns, ID and quantile.

# One particular quantil from location
tb_loc_quantile <- as_tibble(ar_quantiles_by_row) %>% 
    mutate(loc = names(ar_quantiles_by_row)) %>%
    rename(quantile = value) %>%
    select(loc, everything())
# Display
kable(tb_loc_quantile) %>% kable_styling_fc()

loc	quantile
loc01	0.7210082
loc02	0.0780288
loc03	0.8521217

1.3 Compute Row-specific Sums using Data Across Columns

We compute sum over several variables in the mtcars dataset. We will sum over several variables with shared prefix, after adding these prefix first. We introduce an NA value to make sure that we can sum ignoring NA

We sum using three different methods below: (1) purrr:reduce(), (2) base::rowSums(), (3) Manual sum. Note that the rowSums option is able to sum ignoring NA.

# we introduce NA value to first row
mtcars[1,1] <- NA
# Rename variables, and sum across
mtcars_rowsum <- mtcars %>%
    rename(stats_mpg = mpg, stats_cyl = cyl, stats_hp = hp) %>%
    mutate(
        cs_reduce = purrr::reduce(
            dplyr::pick(contains("stats")),
            `+`
        ),
        cs_rowsum = base::rowSums(
            dplyr::pick(contains("stats")),
            na.rm = TRUE
        ),
        cs_manual = stats_mpg + stats_cyl + stats_hp
    ) %>%
    select(matches("stats|cs"), gear)
# Display
# caption: "sum across columns"
kable(mtcars_rowsum %>% slice_head(n = 5)) %>% kable_styling_fc_wide()

	stats_mpg	stats_cyl	stats_hp	cs_reduce	cs_rowsum	cs_manual	gear
Mazda RX4	NA	6	110	NA	116.0	NA	4
Mazda RX4 Wag	21.0	6	110	137.0	137.0	137.0	4
Datsun 710	22.8	4	93	119.8	119.8	119.8	4
Hornet 4 Drive	21.4	6	110	137.4	137.4	137.4	3
Hornet Sportabout	18.7	8	175	201.7	201.7	201.7	3

See this discussion for column sum peed comparisons.

1.4 Sum Across Rows within Group

Following from the prior section, we now sum across rows within group.

# we introduce NA value to first row
# mtcars[1,1] <- NA
# Rename variables, and sum across
mtcars_grpsum <- mtcars_rowsum %>%  
    arrange(gear) %>% group_by(gear) %>%
    # srs = sum row sum
    mutate_at(vars(matches("stats|cs")), 
        .funs = list(gs = ~sum(., na.rm=TRUE))
    ) %>% 
    select(gear, matches("gs")) %>% 
    slice_head(n=1)
# Display
# caption: "gs = group sum, cs = col sum over the columns 
# with stats as prefix, sum across rows after col sum; gear = 4 
# difference for cs-rowsum-gs because it allowed for summing 
# ignoring NA for values across columns"
kable(mtcars_grpsum) %>% kable_styling_fc_wide()

gear	stats_mpg_gs	stats_cyl_gs	stats_hp_gs	cs_reduce_gs	cs_rowsum_gs	cs_manual_gs
3	241.6	112	2642	2995.6	2995.6	2995.6
4	273.4	56	1074	1287.4	1403.4	1287.4
5	106.9	30	978	1114.9	1114.9	1114.9

1.5 Replace NA for Multiple Variables

Replace some variables NA by some values, and other variables’ NAs by other values.

# Define
it_N <- 3
it_M <- 5
svr_id <- "date"

# NA dataframe, note need to define as NA_real_
# if define as NA, will not be able to replace with 99 column
# would be logical rather than double.
df_NA <- as_tibble(matrix(NA_real_, nrow = it_N, ncol = it_M)) %>%
    rowid_to_column(var = svr_id) %>%
    rename_at(
        vars(starts_with("V")),
        funs(str_replace(., "V", "var"))
    )
kable(df_NA) %>%
    kable_styling_fc()

date	var1	var2	var3	var4	var5
1	NA	NA	NA	NA	NA
2	NA	NA	NA	NA	NA
3	NA	NA	NA	NA	NA

# Replace NA
df_NA_replace <- df_NA %>%
    mutate_at(vars(one_of(c("var1", "var2"))), list(~ replace_na(., 0))) %>%
    mutate_at(vars(one_of(c("var3", "var5"))), list(~ replace_na(., 99)))

kable(df_NA_replace) %>%
    kable_styling_fc()

date	var3	var4	var5
1	99	NA	99
2	99	NA	99
3	99	NA	99

1.6 Cumulative Sum Multiple Variables

Each row is a different date, each column is the profit a firms earns on a date, we want to compute cumulatively how much a person is earning. Also renames variable names below jointly.

# Define
it_N <- 3
it_M <- 5
svr_id <- "date"

# random dataframe, daily profit of firms
# dp_fx: daily profit firm ID something
set.seed(123)
df_daily_profit <- as_tibble(matrix(rnorm(it_N * it_M), nrow = it_N, ncol = it_M)) %>%
    rowid_to_column(var = svr_id) %>%
    rename_at(
        vars(starts_with("V")),
        funs(str_replace(., "V", "dp_f"))
    )
kable(df_daily_profit) %>%
    kable_styling_fc()

date	dp_f1	dp_f2	dp_f3	dp_f4	dp_f5
1	-0.5604756	0.0705084	0.4609162	-0.4456620	0.4007715
2	-0.2301775	0.1292877	-1.2650612	1.2240818	0.1106827
3	1.5587083	1.7150650	-0.6868529	0.3598138	-0.5558411

# cumulative sum with suffix
df_cumu_profit_suffix <- df_daily_profit %>%
    mutate_at(vars(contains("dp_f")), .funs = list(cumu = ~ cumsum(.)))
kable(df_cumu_profit_suffix) %>%
    kable_styling_fc_wide()

date	dp_f1	dp_f2	dp_f3	dp_f4	dp_f5	dp_f1_cumu	dp_f2_cumu	dp_f3_cumu	dp_f4_cumu	dp_f5_cumu
1	-0.5604756	0.0705084	0.4609162	-0.4456620	0.4007715	-0.5604756	0.0705084	0.4609162	-0.4456620	0.4007715
2	-0.2301775	0.1292877	-1.2650612	1.2240818	0.1106827	-0.7906531	0.1997961	-0.8041450	0.7784198	0.5114542
3	1.5587083	1.7150650	-0.6868529	0.3598138	-0.5558411	0.7680552	1.9148611	-1.4909979	1.1382337	-0.0443870

# cumulative sum variables naming to prefix
df_cumu_profit <- df_cumu_profit_suffix %>%
    rename_at(vars(contains("_cumu")), list(~ paste("cp_f", gsub("_cumu", "", .), sep = ""))) %>%
    rename_at(vars(contains("cp_f")), list(~ gsub("dp_f", "", .)))
kable(df_cumu_profit) %>%
    kable_styling_fc_wide()

date	dp_f1	dp_f2	dp_f3	dp_f4	dp_f5	cp_f1	cp_f2	cp_f3	cp_f4	cp_f5
1	-0.5604756	0.0705084	0.4609162	-0.4456620	0.4007715	-0.5604756	0.0705084	0.4609162	-0.4456620	0.4007715
2	-0.2301775	0.1292877	-1.2650612	1.2240818	0.1106827	-0.7906531	0.1997961	-0.8041450	0.7784198	0.5114542
3	1.5587083	1.7150650	-0.6868529	0.3598138	-0.5558411	0.7680552	1.9148611	-1.4909979	1.1382337	-0.0443870