vignettes/fv_panel_expand_longandwide.Rmd
fv_panel_expand_longandwide.Rmd
Compute averages over sub-groups of dates for each variable. Merge so that each date observation has as variables all lagged and forward information as additional variables. This file works out how the ff_panel_expand_longandwide function works.
There is a panel dataset, where each row is an observation at a date.
Doing this allows for lagged intereaction that are time specific in an arbitrary way.
# Load Library
rm(list = ls(all.names = TRUE))
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
library(broom)
# library(tidymodels)
library(REconTools)
# Select Cebu Only
df_hw_cebu <- df_hgt_wgt %>% filter(S.country == 'Cebu' & svymthRound <= 24 & svymthRound > 0)
str(df_hw_cebu)
#> spec_tbl_df [16,188 x 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#> $ S.country : chr [1:16188] "Cebu" "Cebu" "Cebu" "Cebu" ...
#> $ vil.id : num [1:16188] 1 1 1 1 1 1 1 1 1 1 ...
#> $ indi.id : num [1:16188] 1 1 1 1 1 1 1 1 1 1 ...
#> $ sex : chr [1:16188] "Male" "Male" "Male" "Male" ...
#> $ svymthRound: num [1:16188] 2 4 6 8 10 12 14 16 18 20 ...
#> $ momEdu : num [1:16188] 5.3 5.3 5.3 5.3 5.3 5.3 5.3 5.3 5.3 5.3 ...
#> $ wealthIdx : num [1:16188] 6.3 6.3 6.3 6.3 6.3 6.3 6.3 6.3 6.3 6.3 ...
#> $ hgt : num [1:16188] 54.6 58.8 65.6 67.4 70.4 70.8 NA NA NA NA ...
#> $ wgt : num [1:16188] 3679 7954 6386 6039 8892 ...
#> $ hgt0 : num [1:16188] 44.2 44.2 44.2 44.2 44.2 44.2 44.2 44.2 44.2 44.2 ...
#> $ wgt0 : num [1:16188] 2044 2044 2044 2044 2044 ...
#> $ prot : num [1:16188] 0.5 1.7 5.6 2.6 15 11.3 NA NA NA NA ...
#> $ cal : num [1:16188] 4.7 10.3 180.7 85.3 288 ...
#> $ p.A.prot : num [1:16188] 2042 1480 1908 2808 2526 ...
#> $ p.A.nProt : num [1:16188] 231 232 213 155 149 ...
#> - attr(*, "spec")=List of 3
#> ..$ cols :List of 15
#> .. ..$ S.country : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
#> .. ..$ vil.id : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ indi.id : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ sex : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
#> .. ..$ svymthRound: list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ momEdu : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ wealthIdx : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ hgt : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ wgt : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ hgt0 : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ wgt0 : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ prot : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ cal : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ p.A.prot : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> .. ..$ p.A.nProt : list()
#> .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#> ..$ default: list()
#> .. ..- attr(*, "class")= chr [1:2] "collector_guess" "collector"
#> ..$ skip : num 1
#> ..- attr(*, "class")= chr "col_spec"
# To Save Processing Time, only Expand Panel for Individuals with low ID numbers
df_hw_cebu <- df_hw_cebu %>% filter(indi.id <= 50)
# Count Unique
svr_unique_identifier = 'indi.id'
df_uniques_vil <- ff_summ_count_unique_by_groups(
df_hw_cebu, ar_svr_group=c('S.country', 'vil.id'), svr_unique_identifier)
#> `mutate_if()` ignored the following grouping variables:
#> Columns `S.country`, `vil.id`
#> Warning: `funs()` was deprecated in dplyr 0.8.0.
#> Please use a list of either functions or lambdas:
#>
#> # Simple named list:
#> list(mean = mean, median = median)
#>
#> # Auto named with `tibble::lst()`:
#> tibble::lst(mean, median)
#>
#> # Using lambdas
#> list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
#> Adding missing grouping variables: `S.country`, `vil.id`
print(df_uniques_vil, n=20)
#> # A tibble: 1 x 15
#> # Groups: S.country, vil.id [1]
#> S.country vil.id unique_indi indi.id_n svymthRound_n momEdu_n wealthIdx_n
#> <chr> <dbl> <int> <int> <int> <int> <int>
#> 1 Cebu 1 50 600 600 600 600
#> # ... with 8 more variables: hgt_n <int>, wgt_n <int>, hgt0_n <int>,
#> # wgt0_n <int>, prot_n <int>, cal_n <int>, p.A.prot_n <int>,
#> # p.A.nProt_n <int>
df_uniques_mth <- ff_summ_count_unique_by_groups(
df_hw_cebu, ar_svr_group=c('S.country', 'svymthRound'),svr_unique_identifier)
#> `mutate_if()` ignored the following grouping variables:
#> Columns `S.country`, `svymthRound`
#> Adding missing grouping variables: `S.country`, `svymthRound`
print(df_uniques_mth, n=20)
#> # A tibble: 12 x 15
#> # Groups: S.country, svymthRound [12]
#> S.country svymthRound unique_indi vil.id_n indi.id_n momEdu_n wealthIdx_n
#> <chr> <dbl> <int> <int> <int> <int> <int>
#> 1 Cebu 2 50 50 50 50 50
#> 2 Cebu 4 50 50 50 50 50
#> 3 Cebu 6 50 50 50 50 50
#> 4 Cebu 8 50 50 50 50 50
#> 5 Cebu 10 50 50 50 50 50
#> 6 Cebu 12 50 50 50 50 50
#> 7 Cebu 14 50 50 50 50 50
#> 8 Cebu 16 50 50 50 50 50
#> 9 Cebu 18 50 50 50 50 50
#> 10 Cebu 20 50 50 50 50 50
#> 11 Cebu 22 50 50 50 50 50
#> 12 Cebu 24 50 50 50 50 50
#> # ... with 8 more variables: hgt_n <int>, wgt_n <int>, hgt0_n <int>,
#> # wgt0_n <int>, prot_n <int>, cal_n <int>, p.A.prot_n <int>,
#> # p.A.nProt_n <int>
df_uniques_mth <- ff_summ_count_unique_by_groups(
df_hw_cebu, ar_svr_group=c('S.country'),svr_unique_identifier)
#> `mutate_if()` ignored the following grouping variables:
#> Column `S.country`
#> Adding missing grouping variables: `S.country`
print(df_uniques_mth, n=20)
#> # A tibble: 1 x 15
#> # Groups: S.country [1]
#> S.country unique_indi vil.id_n indi.id_n svymthRound_n momEdu_n wealthIdx_n
#> <chr> <int> <int> <int> <int> <int> <int>
#> 1 Cebu 50 600 600 600 600 600
#> # ... with 8 more variables: hgt_n <int>, wgt_n <int>, hgt0_n <int>,
#> # wgt0_n <int>, prot_n <int>, cal_n <int>, p.A.prot_n <int>,
#> # p.A.nProt_n <int>
# Create Additional Categorical Variables, ever 6 months for averaging
df_hw_cebu = df_hw_cebu %>% mutate(mth6 = recode(svymthRound,
`0`="m00t06", `2`="m00t06", `4`="m00t06", `6`="m00t06",
`8`="m08t12", `10`="m08t12", `12`="m08t12",
`14`="m14t18", `16`="m14t18", `18`="m14t18",
`20`="m20t24", `22`="m20t24", `24`="m20t24"))
# Create Additional Categorical Variables, ever 8 months for averaging
df_hw_cebu = df_hw_cebu %>% mutate(mth8 = recode(svymthRound,
`0`="m00t08", `2`="m00t08", `4`="m00t08", `6`="m00t08", `8`="m00t08",
`10`="m10t16", `12`="m10t16", `14`="m10t16", `16`="m10t16",
`18`="m18t24", `20`="m18t24", `22`="m18t24", `24`="m18t24"))
# Create Additional Categorical Variables, ever 10 months for averaging
df_hw_cebu = df_hw_cebu %>% mutate(mth10 = recode(svymthRound,
`0`="m00t08", `2`="m00t08", `4`="m00t08",
`6`="m00t08", `8`="m00t08", `10`="m10t16", `12`="m10t16", `14`="m10t16",
`16`="m10t16", `18`="m18t24", `20`="m18t24", `22`="m18t24", `24`="m18t24"))
# Re-order variables so that months come earlier
df_hw_cebu = df_hw_cebu %>% select(indi.id, svymthRound, mth6, mth8, mth10, everything())
Data per month, can still average if grouping is by months. If grouping not by per month, clearly need to average over the svr_data variable. Below, first try no averaging, directly reshape and merge. Then do averaging for subgroup.
# Generate New Variable
# Variables to do groupings by
svr_id_t <- 'svymthRound'
svr_id_i <- 'indi.id'
svr_data <- 'cal'
bl_gen_newgrp <- FALSE
# Select vars to keep for spreading
ls_svr_mth_keep <- c(svr_id_i, svr_id_t, svr_data)
df_hw_cebu_mth_2spread <- df_hw_cebu %>% select(!!!syms(ls_svr_mth_keep))
# Spread
df_hw_cebu_mth_wide <- df_hw_cebu_mth_2spread %>% spread(!!sym(svr_id_t), !!sym(svr_data), sep = "")
str(df_hw_cebu_mth_wide)
#> tibble [50 x 13] (S3: tbl_df/tbl/data.frame)
#> $ indi.id : num [1:50] 1 2 3 4 5 6 7 8 9 10 ...
#> $ svymthRound2 : num [1:50] 4.7 166.2 0.5 0.5 12.1 ...
#> $ svymthRound4 : num [1:50] 10.3 0.5 12.9 0.5 25.3 ...
#> $ svymthRound6 : num [1:50] 180.7 24.1 44.8 51.4 262.3 ...
#> $ svymthRound8 : num [1:50] 85.3 155.1 57 39.4 71.3 ...
#> $ svymthRound10: num [1:50] 288 142.4 73.7 47 87.8 ...
#> $ svymthRound12: num [1:50] 342.6 146.3 488.2 44.9 234.5 ...
#> $ svymthRound14: num [1:50] NA 228 459 221 453 ...
#> $ svymthRound16: num [1:50] NA 194 408 242 1484 ...
#> $ svymthRound18: num [1:50] NA 111 692 300 1206 ...
#> $ svymthRound20: num [1:50] NA 305 612 581 1178 ...
#> $ svymthRound22: num [1:50] NA 332 1226 420 1221 ...
#> $ svymthRound24: num [1:50] NA 363 555 466 943 ...
# Merge Back, now dataframe is both wide and long
df_hw_cebu_mth_widelong <- df_hw_cebu %>% left_join(df_hw_cebu_mth_wide)
#> Joining, by = "indi.id"
First, do not create additional dates based subgroups, just use raw dates variables, with unique indi/date, and generate the wide + long version of data.
# Generate New Variable
# Variables to do groupings by
svr_id_t <- 'mth6'
svr_id_i <- 'indi.id'
svr_data <- 'cal'
bl_gen_newgrp <- FALSE
# Select vars to keep for spreading
ls_svr_6mth_keep <- c(svr_id_i, svr_id_t, svr_data)
df_hw_cebu_6mth_2spread <- df_hw_cebu %>% select(!!!syms(ls_svr_6mth_keep))
# Aggregate
svr_data_mean <- paste(svr_data,svr_id_t,'mean',sep='_')
df_hw_cebu_6mth_2spread <- df_hw_cebu_6mth_2spread %>%
group_by(!!sym(svr_id_i), !!sym(svr_id_t)) %>%
summarise(!!sym(svr_data_mean) := mean(!!sym(svr_data)))
#> `summarise()` has grouped output by 'indi.id'. You can override using the `.groups` argument.
# Spread
df_hw_cebu_6mth_wide <- df_hw_cebu_6mth_2spread %>% spread(!!sym(svr_id_t), !!sym(svr_data_mean), sep = "")
# Merge Back, now dataframe is both wide and long
df_hw_cebu_widelong <- df_hw_cebu %>%
left_join(df_hw_cebu_mth_wide) %>%
left_join(df_hw_cebu_6mth_wide)
#> Joining, by = "indi.id"
#> Joining, by = "indi.id"
## Testing Regression Linear Including RHS Lag Input interaction with time
attach(df_hw_cebu_widelong)
vf_months <- model.matrix(~factor(svymthRound))
vf_mth6 <- model.matrix(~factor(mth6))
# Regression Model:
# h_t = a_0*t + a_1*1{t}*N_t + a_2*1{t}*N_{t-1}
rs_test = lm(hgt ~
svymthRound +
vf_months[,9:12]:svymthRound18 +
vf_months[,10:12]:svymthRound20 +
vf_months[,11:12]:svymthRound22 +
vf_months[,12]:svymthRound24 ,
data=df_hw_cebu_widelong)
rsm_test = summary(rs_test)
rsm_test =
rs_test = lm(hgt ~
svymthRound +
log(svymthRound) +
1/(svymthRound) +
vf_months[,1:12]:svymthRound2 +
vf_months[,2:12]:svymthRound4 +
vf_months[,2:12]:svymthRound6 +
vf_months[,4:12]:svymthRound8 +
vf_months[,5:12]:svymthRound10 +
vf_months[,6:12]:svymthRound12 +
vf_months[,7:12]:svymthRound14 +
vf_months[,8:12]:svymthRound16 +
vf_months[,9:12]:svymthRound18 +
vf_months[,10:12]:svymthRound20 +
vf_months[,11:12]:svymthRound22 +
vf_months[,12]:svymthRound24 ,
data=df_hw_cebu_widelong)
rsm_test = summary(rs_test)
#rsm_test
rs_test = lm(hgt ~
factor(indi.id):svymthRound +
factor(indi.id):log(svymthRound) +
factor(indi.id):1/(svymthRound) +
vf_months[,1:12]:svymthRound2 +
vf_months[,2:12]:svymthRound4 +
vf_months[,3:12]:svymthRound6 +
vf_months[,4:12]:svymthRound8 +
vf_months[,5:12]:svymthRound10 +
vf_months[,6:12]:svymthRound12 +
vf_months[,7:12]:svymthRound14 +
vf_months[,8:12]:svymthRound16 +
vf_months[,9:12]:svymthRound18 +
vf_months[,10:12]:svymthRound20 +
vf_months[,11:12]:svymthRound22 +
vf_months[,12]:svymthRound24 ,
data=df_hw_cebu_widelong)
rsm_coef = tidy(rs_test)
# vf_months[, 11:12]factor(svymthRound)24:svymthRound22
rsm_coef_inputs = rsm_coef %>% filter(grepl('vf_months', term))
rsm_coef_inputs
#> # A tibble: 78 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 vf_months[, 1:12](Intercept):svymthRoun~ -0.0145 0.0145 -1.00 0.318
#> 2 vf_months[, 1:12]factor(svymthRound)4:s~ -0.0252 0.0144 -1.75 0.0801
#> 3 vf_months[, 1:12]factor(svymthRound)6:s~ -0.0333 0.0196 -1.70 0.0896
#> 4 vf_months[, 1:12]factor(svymthRound)8:s~ -0.0324 0.0212 -1.52 0.129
#> 5 vf_months[, 1:12]factor(svymthRound)10:~ -0.0351 0.0210 -1.67 0.0950
#> 6 vf_months[, 1:12]factor(svymthRound)12:~ -0.0365 0.0195 -1.88 0.0613
#> 7 vf_months[, 1:12]factor(svymthRound)14:~ -0.0310 0.0171 -1.81 0.0709
#> 8 vf_months[, 1:12]factor(svymthRound)16:~ -0.0242 0.0142 -1.71 0.0890
#> 9 vf_months[, 1:12]factor(svymthRound)18:~ -0.0193 0.0108 -1.78 0.0756
#> 10 vf_months[, 1:12]factor(svymthRound)20:~ -0.0104 0.00731 -1.42 0.157
#> # ... with 68 more rows
rs_test_mth4 = lm(hgt ~
factor(indi.id):svymthRound +
factor(indi.id):log(svymthRound) +
factor(indi.id):1/(svymthRound) +
vf_mth6[,1:4]:mth6m00t06 +
vf_mth6[,2:4]:mth6m08t12 +
vf_mth6[,3:4]:mth6m14t18 +
vf_mth6[,4]:mth6m20t24 ,
data=df_hw_cebu_widelong)
rsm_coef_mth4 = tidy(rs_test_mth4)
rsm_coef_inputs = rsm_coef_mth4 %>% filter(grepl('vf_mth6', term))
rsm_coef_inputs
#> # A tibble: 10 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 vf_mth6[, 1:4](Intercept):mth6m00t06 -0.00413 0.00217 -1.90 0.0577
#> 2 vf_mth6[, 1:4]factor(mth6)m08t12:mth6m0~ -0.00782 0.00333 -2.35 0.0193
#> 3 vf_mth6[, 1:4]factor(mth6)m14t18:mth6m0~ -0.00747 0.00557 -1.34 0.181
#> 4 vf_mth6[, 1:4]factor(mth6)m20t24:mth6m0~ -0.00641 0.00852 -0.752 0.453
#> 5 vf_mth6[, 2:4]factor(mth6)m08t12:mth6m0~ 0.00539 0.00193 2.80 0.00537
#> 6 vf_mth6[, 2:4]factor(mth6)m14t18:mth6m0~ 0.00781 0.00392 1.99 0.0469
#> 7 vf_mth6[, 2:4]factor(mth6)m20t24:mth6m0~ 0.0103 0.00631 1.63 0.105
#> 8 vf_mth6[, 3:4]factor(mth6)m14t18:mth6m1~ -0.00144 0.00118 -1.22 0.225
#> 9 vf_mth6[, 3:4]factor(mth6)m20t24:mth6m1~ -0.00164 0.00235 -0.700 0.484
#> 10 vf_mth6[, 4]:mth6m20t24 -0.00129 0.00136 -0.945 0.345