Go to the RMD, R, PDF, or HTML version of this file. Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
Multiply a subset of variables all by 10. We use dplyr’s across function to achieve this.
Note that in the example below, we also use across with group by to include a string array of grouping by variables.
Note: “across() makes it easy to apply the same transformation to multiple columns, allowing you to use select() semantics inside in”data-masking” functions like summarise() and mutate().”
# grouping by variables
ar_st_groups <- c("gear", "carb")
# use across to conduct operation over multiple variables
mtcars_3var_times10 <- mtcars %>%
group_by(across(one_of(ar_st_groups))) %>%
mutate(across(matches("mpg|cyl|disp"), ~ .x * 10)) %>%
select(gear, carb, mpg, cyl, disp) %>% head(n=5)
# pring
# Multiply several variables by 10
kable(mtcars_3var_times10 %>% slice_head(n = 5)) %>%
kable_styling_fc()
gear | carb | mpg | cyl | disp |
---|---|---|---|---|
3 | 1 | 214 | 60 | 2580 |
3 | 2 | 187 | 80 | 3600 |
4 | 1 | 228 | 40 | 1080 |
4 | 4 | NA | 60 | 1600 |
4 | 4 | 210 | 60 | 1600 |
We want to compute quantiles for each location, based on monthly variations in columns.
First, we generate a table with 12 columns (for months) and 3 rows (for locations).
# Generate data, 12 months as columns, and
mt_data_rand <- matrix(rnorm(36, mean=0, sd=1), nrow=3, ncol=12)
it_rows <- seq(1, dim(mt_data_rand)[1])
it_cols <- seq(1, dim(mt_data_rand)[2])
# convert to table, column as month with leading 0
colnames(mt_data_rand) <- paste0('m', sprintf("%02d", it_cols))
tb_data_full <- as_tibble(mt_data_rand, rownames = NA) %>%
mutate(loc = paste0("loc", sprintf("%02d", row_number()))) %>%
select(loc, everything())
# Display
kable(tb_data_full) %>% kable_styling_fc_wide()
loc | m01 | m02 | m03 | m04 | m05 | m06 | m07 | m08 | m09 | m10 | m11 | m12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
loc01 | 1.7869131 | 0.7013559 | -0.2179749 | -0.6250393 | 0.1533731 | 0.4264642 | 0.8781335 | 0.5539177 | -0.3804710 | -1.265396 | -1.1231086 | 0.7799651 |
loc02 | 0.4978505 | -0.4727914 | -1.0260044 | -1.6866933 | -1.1381369 | -0.2950715 | 0.8215811 | -0.0619117 | -0.6947070 | 2.168956 | -0.4028848 | -0.0833691 |
loc03 | -1.9666172 | -1.0678237 | -0.7288912 | 0.8377870 | 1.2538149 | 0.8951257 | 0.6886403 | -0.3059627 | -0.2079173 | 1.207962 | -0.4666554 | 0.2533185 |
Second, using apply to compute quantiles, row by row
# Extract the data components from the tibble, tibble has row and column names
tb_data_only <- tb_data_full %>%
column_to_rownames(var = "loc") %>%
select(contains("m"))
# Compute row specific quantiles
ar_quantiles_by_row <- apply(tb_data_only, 1, quantile, probs=0.75)
# Display
print(ar_quantiles_by_row)
## loc01 loc02 loc03
## 0.72100821 0.07802884 0.85212170
Third, generate matrix of two columns, ID and quantile.
# One particular quantil from location
tb_loc_quantile <- as_tibble(ar_quantiles_by_row) %>%
mutate(loc = names(ar_quantiles_by_row)) %>%
rename(quantile = value) %>%
select(loc, everything())
# Display
kable(tb_loc_quantile) %>% kable_styling_fc()
loc | quantile |
---|---|
loc01 | 0.7210082 |
loc02 | 0.0780288 |
loc03 | 0.8521217 |
We compute sum over several variables in the mtcars dataset. We will sum over several variables with shared prefix, after adding these prefix first. We introduce an NA value to make sure that we can sum ignoring NA
We sum using three different methods below: (1) purrr:reduce(), (2) base::rowSums(), (3) Manual sum. Note that the rowSums option is able to sum ignoring NA.
# we introduce NA value to first row
mtcars[1,1] <- NA
# Rename variables, and sum across
mtcars_rowsum <- mtcars %>%
rename(stats_mpg = mpg, stats_cyl = cyl, stats_hp = hp) %>%
mutate(
cs_reduce = purrr::reduce(
dplyr::pick(contains("stats")),
`+`
),
cs_rowsum = base::rowSums(
dplyr::pick(contains("stats")),
na.rm = TRUE
),
cs_manual = stats_mpg + stats_cyl + stats_hp
) %>%
select(matches("stats|cs"), gear)
# Display
# caption: "sum across columns"
kable(mtcars_rowsum %>% slice_head(n = 5)) %>% kable_styling_fc_wide()
stats_mpg | stats_cyl | stats_hp | cs_reduce | cs_rowsum | cs_manual | gear | |
---|---|---|---|---|---|---|---|
Mazda RX4 | NA | 6 | 110 | NA | 116.0 | NA | 4 |
Mazda RX4 Wag | 21.0 | 6 | 110 | 137.0 | 137.0 | 137.0 | 4 |
Datsun 710 | 22.8 | 4 | 93 | 119.8 | 119.8 | 119.8 | 4 |
Hornet 4 Drive | 21.4 | 6 | 110 | 137.4 | 137.4 | 137.4 | 3 |
Hornet Sportabout | 18.7 | 8 | 175 | 201.7 | 201.7 | 201.7 | 3 |
See this discussion for column sum peed comparisons.
Following from the prior section, we now sum across rows within group.
# we introduce NA value to first row
# mtcars[1,1] <- NA
# Rename variables, and sum across
mtcars_grpsum <- mtcars_rowsum %>%
arrange(gear) %>% group_by(gear) %>%
# srs = sum row sum
mutate_at(vars(matches("stats|cs")),
.funs = list(gs = ~sum(., na.rm=TRUE))
) %>%
select(gear, matches("gs")) %>%
slice_head(n=1)
# Display
# caption: "gs = group sum, cs = col sum over the columns
# with stats as prefix, sum across rows after col sum; gear = 4
# difference for cs-rowsum-gs because it allowed for summing
# ignoring NA for values across columns"
kable(mtcars_grpsum) %>% kable_styling_fc_wide()
gear | stats_mpg_gs | stats_cyl_gs | stats_hp_gs | cs_reduce_gs | cs_rowsum_gs | cs_manual_gs |
---|---|---|---|---|---|---|
3 | 241.6 | 112 | 2642 | 2995.6 | 2995.6 | 2995.6 |
4 | 273.4 | 56 | 1074 | 1287.4 | 1403.4 | 1287.4 |
5 | 106.9 | 30 | 978 | 1114.9 | 1114.9 | 1114.9 |
Replace some variables NA by some values, and other variables’ NAs by other values.
# Define
it_N <- 3
it_M <- 5
svr_id <- "date"
# NA dataframe, note need to define as NA_real_
# if define as NA, will not be able to replace with 99 column
# would be logical rather than double.
df_NA <- as_tibble(matrix(NA_real_, nrow = it_N, ncol = it_M)) %>%
rowid_to_column(var = svr_id) %>%
rename_at(
vars(starts_with("V")),
funs(str_replace(., "V", "var"))
)
kable(df_NA) %>%
kable_styling_fc()
date | var1 | var2 | var3 | var4 | var5 |
---|---|---|---|---|---|
1 | NA | NA | NA | NA | NA |
2 | NA | NA | NA | NA | NA |
3 | NA | NA | NA | NA | NA |
# Replace NA
df_NA_replace <- df_NA %>%
mutate_at(vars(one_of(c("var1", "var2"))), list(~ replace_na(., 0))) %>%
mutate_at(vars(one_of(c("var3", "var5"))), list(~ replace_na(., 99)))
kable(df_NA_replace) %>%
kable_styling_fc()
date | var1 | var2 | var3 | var4 | var5 |
---|---|---|---|---|---|
1 | 0 | 0 | 99 | NA | 99 |
2 | 0 | 0 | 99 | NA | 99 |
3 | 0 | 0 | 99 | NA | 99 |
Each row is a different date, each column is the profit a firms earns on a date, we want to compute cumulatively how much a person is earning. Also renames variable names below jointly.
# Define
it_N <- 3
it_M <- 5
svr_id <- "date"
# random dataframe, daily profit of firms
# dp_fx: daily profit firm ID something
set.seed(123)
df_daily_profit <- as_tibble(matrix(rnorm(it_N * it_M), nrow = it_N, ncol = it_M)) %>%
rowid_to_column(var = svr_id) %>%
rename_at(
vars(starts_with("V")),
funs(str_replace(., "V", "dp_f"))
)
kable(df_daily_profit) %>%
kable_styling_fc()
date | dp_f1 | dp_f2 | dp_f3 | dp_f4 | dp_f5 |
---|---|---|---|---|---|
1 | -0.5604756 | 0.0705084 | 0.4609162 | -0.4456620 | 0.4007715 |
2 | -0.2301775 | 0.1292877 | -1.2650612 | 1.2240818 | 0.1106827 |
3 | 1.5587083 | 1.7150650 | -0.6868529 | 0.3598138 | -0.5558411 |
# cumulative sum with suffix
df_cumu_profit_suffix <- df_daily_profit %>%
mutate_at(vars(contains("dp_f")), .funs = list(cumu = ~ cumsum(.)))
kable(df_cumu_profit_suffix) %>%
kable_styling_fc_wide()
date | dp_f1 | dp_f2 | dp_f3 | dp_f4 | dp_f5 | dp_f1_cumu | dp_f2_cumu | dp_f3_cumu | dp_f4_cumu | dp_f5_cumu |
---|---|---|---|---|---|---|---|---|---|---|
1 | -0.5604756 | 0.0705084 | 0.4609162 | -0.4456620 | 0.4007715 | -0.5604756 | 0.0705084 | 0.4609162 | -0.4456620 | 0.4007715 |
2 | -0.2301775 | 0.1292877 | -1.2650612 | 1.2240818 | 0.1106827 | -0.7906531 | 0.1997961 | -0.8041450 | 0.7784198 | 0.5114542 |
3 | 1.5587083 | 1.7150650 | -0.6868529 | 0.3598138 | -0.5558411 | 0.7680552 | 1.9148611 | -1.4909979 | 1.1382337 | -0.0443870 |
# cumulative sum variables naming to prefix
df_cumu_profit <- df_cumu_profit_suffix %>%
rename_at(vars(contains("_cumu")), list(~ paste("cp_f", gsub("_cumu", "", .), sep = ""))) %>%
rename_at(vars(contains("cp_f")), list(~ gsub("dp_f", "", .)))
kable(df_cumu_profit) %>%
kable_styling_fc_wide()
date | dp_f1 | dp_f2 | dp_f3 | dp_f4 | dp_f5 | cp_f1 | cp_f2 | cp_f3 | cp_f4 | cp_f5 |
---|---|---|---|---|---|---|---|---|---|---|
1 | -0.5604756 | 0.0705084 | 0.4609162 | -0.4456620 | 0.4007715 | -0.5604756 | 0.0705084 | 0.4609162 | -0.4456620 | 0.4007715 |
2 | -0.2301775 | 0.1292877 | -1.2650612 | 1.2240818 | 0.1106827 | -0.7906531 | 0.1997961 | -0.8041450 | 0.7784198 | 0.5114542 |
3 | 1.5587083 | 1.7150650 | -0.6868529 | 0.3598138 | -0.5558411 | 0.7680552 | 1.9148611 | -1.4909979 | 1.1382337 | -0.0443870 |