Append Group Mean Lagged Values as Additional Panel Variables • REconTools

Compute averages over sub-groups of dates for each variable. Merge so that each date observation has as variables all lagged and forward information as additional variables. This file works out how the ff_panel_expand_longandwide function works.

There is a panel dataset, where each row is an observation at a date.

There is a variable recorded for a panel, and a variable with date info, and another with obs info

@param svr_id_t string time variable name
@param svr_id_i string individual ID name
@param svr_data string variable name
related to new variable for t to be generated:
- @param bl_gen_newgrp boolean if to generate new group or use svr_id_t levels directly
- @param svr_tgrp string new of new group variable to be generated
ar_it_ct: array indicating new variable generated by svr_id_t

Compute averages over sub-groups of dates for each variable, with different ways of specifying date subgroups.

average(svr_data) over svr

Reshape data so each date is a variable for selected subset of key variables
Merge results from 2 back to main, so that each indi/date observation has as variables all lagged and forward information as additional variables. Append not n lag m forward, but full history as additional variables

Doing this allows for lagged intereaction that are time specific in an arbitrary way.

Package and Data Loading and Parameter Setting

# Load Library
rm(list = ls(all.names = TRUE))
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)
library(broom)
# library(tidymodels)
library(REconTools)

# Select Cebu Only
df_hw_cebu <- df_hgt_wgt %>% filter(S.country == 'Cebu' & svymthRound <= 24 & svymthRound > 0)
str(df_hw_cebu)
#> spec_tbl_df [16,188 x 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
#>  $ S.country  : chr [1:16188] "Cebu" "Cebu" "Cebu" "Cebu" ...
#>  $ vil.id     : num [1:16188] 1 1 1 1 1 1 1 1 1 1 ...
#>  $ indi.id    : num [1:16188] 1 1 1 1 1 1 1 1 1 1 ...
#>  $ sex        : chr [1:16188] "Male" "Male" "Male" "Male" ...
#>  $ svymthRound: num [1:16188] 2 4 6 8 10 12 14 16 18 20 ...
#>  $ momEdu     : num [1:16188] 5.3 5.3 5.3 5.3 5.3 5.3 5.3 5.3 5.3 5.3 ...
#>  $ wealthIdx  : num [1:16188] 6.3 6.3 6.3 6.3 6.3 6.3 6.3 6.3 6.3 6.3 ...
#>  $ hgt        : num [1:16188] 54.6 58.8 65.6 67.4 70.4 70.8 NA NA NA NA ...
#>  $ wgt        : num [1:16188] 3679 7954 6386 6039 8892 ...
#>  $ hgt0       : num [1:16188] 44.2 44.2 44.2 44.2 44.2 44.2 44.2 44.2 44.2 44.2 ...
#>  $ wgt0       : num [1:16188] 2044 2044 2044 2044 2044 ...
#>  $ prot       : num [1:16188] 0.5 1.7 5.6 2.6 15 11.3 NA NA NA NA ...
#>  $ cal        : num [1:16188] 4.7 10.3 180.7 85.3 288 ...
#>  $ p.A.prot   : num [1:16188] 2042 1480 1908 2808 2526 ...
#>  $ p.A.nProt  : num [1:16188] 231 232 213 155 149 ...
#>  - attr(*, "spec")=List of 3
#>   ..$ cols   :List of 15
#>   .. ..$ S.country  : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
#>   .. ..$ vil.id     : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ indi.id    : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ sex        : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_character" "collector"
#>   .. ..$ svymthRound: list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ momEdu     : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ wealthIdx  : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ hgt        : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ wgt        : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ hgt0       : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ wgt0       : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ prot       : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ cal        : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ p.A.prot   : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   .. ..$ p.A.nProt  : list()
#>   .. .. ..- attr(*, "class")= chr [1:2] "collector_double" "collector"
#>   ..$ default: list()
#>   .. ..- attr(*, "class")= chr [1:2] "collector_guess" "collector"
#>   ..$ skip   : num 1
#>   ..- attr(*, "class")= chr "col_spec"

# To Save Processing Time, only Expand Panel for Individuals with low ID numbers
df_hw_cebu <- df_hw_cebu %>% filter(indi.id <= 50)

# Count Unique
svr_unique_identifier = 'indi.id'
df_uniques_vil <- ff_summ_count_unique_by_groups(
  df_hw_cebu, ar_svr_group=c('S.country', 'vil.id'), svr_unique_identifier)
#> `mutate_if()` ignored the following grouping variables:
#> Columns `S.country`, `vil.id`
#> Warning: `funs()` was deprecated in dplyr 0.8.0.
#> Please use a list of either functions or lambdas: 
#> 
#>   # Simple named list: 
#>   list(mean = mean, median = median)
#> 
#>   # Auto named with `tibble::lst()`: 
#>   tibble::lst(mean, median)
#> 
#>   # Using lambdas
#>   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
#> Adding missing grouping variables: `S.country`, `vil.id`
print(df_uniques_vil, n=20)
#> # A tibble: 1 x 15
#> # Groups:   S.country, vil.id [1]
#>   S.country vil.id unique_indi indi.id_n svymthRound_n momEdu_n wealthIdx_n
#>   <chr>      <dbl>       <int>     <int>         <int>    <int>       <int>
#> 1 Cebu           1          50       600           600      600         600
#> # ... with 8 more variables: hgt_n <int>, wgt_n <int>, hgt0_n <int>,
#> #   wgt0_n <int>, prot_n <int>, cal_n <int>, p.A.prot_n <int>,
#> #   p.A.nProt_n <int>
df_uniques_mth <- ff_summ_count_unique_by_groups(
  df_hw_cebu, ar_svr_group=c('S.country', 'svymthRound'),svr_unique_identifier)
#> `mutate_if()` ignored the following grouping variables:
#> Columns `S.country`, `svymthRound`
#> Adding missing grouping variables: `S.country`, `svymthRound`
print(df_uniques_mth, n=20)
#> # A tibble: 12 x 15
#> # Groups:   S.country, svymthRound [12]
#>    S.country svymthRound unique_indi vil.id_n indi.id_n momEdu_n wealthIdx_n
#>    <chr>           <dbl>       <int>    <int>     <int>    <int>       <int>
#>  1 Cebu                2          50       50        50       50          50
#>  2 Cebu                4          50       50        50       50          50
#>  3 Cebu                6          50       50        50       50          50
#>  4 Cebu                8          50       50        50       50          50
#>  5 Cebu               10          50       50        50       50          50
#>  6 Cebu               12          50       50        50       50          50
#>  7 Cebu               14          50       50        50       50          50
#>  8 Cebu               16          50       50        50       50          50
#>  9 Cebu               18          50       50        50       50          50
#> 10 Cebu               20          50       50        50       50          50
#> 11 Cebu               22          50       50        50       50          50
#> 12 Cebu               24          50       50        50       50          50
#> # ... with 8 more variables: hgt_n <int>, wgt_n <int>, hgt0_n <int>,
#> #   wgt0_n <int>, prot_n <int>, cal_n <int>, p.A.prot_n <int>,
#> #   p.A.nProt_n <int>
df_uniques_mth <- ff_summ_count_unique_by_groups(
  df_hw_cebu, ar_svr_group=c('S.country'),svr_unique_identifier)
#> `mutate_if()` ignored the following grouping variables:
#> Column `S.country`
#> Adding missing grouping variables: `S.country`
print(df_uniques_mth, n=20)
#> # A tibble: 1 x 15
#> # Groups:   S.country [1]
#>   S.country unique_indi vil.id_n indi.id_n svymthRound_n momEdu_n wealthIdx_n
#>   <chr>           <int>    <int>     <int>         <int>    <int>       <int>
#> 1 Cebu               50      600       600           600      600         600
#> # ... with 8 more variables: hgt_n <int>, wgt_n <int>, hgt0_n <int>,
#> #   wgt0_n <int>, prot_n <int>, cal_n <int>, p.A.prot_n <int>,
#> #   p.A.nProt_n <int>

Generate Additional Group Variables

# Create Additional Categorical Variables, ever 6 months for averaging
df_hw_cebu = df_hw_cebu %>% mutate(mth6 = recode(svymthRound,
                                    `0`="m00t06", `2`="m00t06", `4`="m00t06", `6`="m00t06",
                                    `8`="m08t12", `10`="m08t12", `12`="m08t12",
                                    `14`="m14t18", `16`="m14t18", `18`="m14t18",
                                    `20`="m20t24", `22`="m20t24", `24`="m20t24"))

# Create Additional Categorical Variables, ever 8 months for averaging
df_hw_cebu = df_hw_cebu %>% mutate(mth8 = recode(svymthRound,
                                    `0`="m00t08", `2`="m00t08", `4`="m00t08", `6`="m00t08", `8`="m00t08",
                                    `10`="m10t16", `12`="m10t16", `14`="m10t16", `16`="m10t16",
                                    `18`="m18t24", `20`="m18t24", `22`="m18t24", `24`="m18t24"))

# Create Additional Categorical Variables, ever 10 months for averaging
df_hw_cebu = df_hw_cebu %>% mutate(mth10 = recode(svymthRound,
                                    `0`="m00t08", `2`="m00t08", `4`="m00t08",
                                     `6`="m00t08",  `8`="m00t08", `10`="m10t16", `12`="m10t16", `14`="m10t16",
                                    `16`="m10t16", `18`="m18t24", `20`="m18t24", `22`="m18t24", `24`="m18t24"))

# Re-order variables so that months come earlier
df_hw_cebu = df_hw_cebu %>% select(indi.id, svymthRound, mth6, mth8, mth10, everything())

Averaging over Group Variables

Data per month, can still average if grouping is by months. If grouping not by per month, clearly need to average over the svr_data variable. Below, first try no averaging, directly reshape and merge. Then do averaging for subgroup.

Expand columns by Info for Each Month of Age


# Generate New Variable

# Variables to do groupings by
svr_id_t <- 'svymthRound'
svr_id_i <- 'indi.id'
svr_data <- 'cal'
bl_gen_newgrp <- FALSE

# Select vars to keep for spreading
ls_svr_mth_keep <- c(svr_id_i, svr_id_t, svr_data)
df_hw_cebu_mth_2spread <- df_hw_cebu %>% select(!!!syms(ls_svr_mth_keep))

# Spread
df_hw_cebu_mth_wide <- df_hw_cebu_mth_2spread %>% spread(!!sym(svr_id_t), !!sym(svr_data), sep = "")
str(df_hw_cebu_mth_wide)
#> tibble [50 x 13] (S3: tbl_df/tbl/data.frame)
#>  $ indi.id      : num [1:50] 1 2 3 4 5 6 7 8 9 10 ...
#>  $ svymthRound2 : num [1:50] 4.7 166.2 0.5 0.5 12.1 ...
#>  $ svymthRound4 : num [1:50] 10.3 0.5 12.9 0.5 25.3 ...
#>  $ svymthRound6 : num [1:50] 180.7 24.1 44.8 51.4 262.3 ...
#>  $ svymthRound8 : num [1:50] 85.3 155.1 57 39.4 71.3 ...
#>  $ svymthRound10: num [1:50] 288 142.4 73.7 47 87.8 ...
#>  $ svymthRound12: num [1:50] 342.6 146.3 488.2 44.9 234.5 ...
#>  $ svymthRound14: num [1:50] NA 228 459 221 453 ...
#>  $ svymthRound16: num [1:50] NA 194 408 242 1484 ...
#>  $ svymthRound18: num [1:50] NA 111 692 300 1206 ...
#>  $ svymthRound20: num [1:50] NA 305 612 581 1178 ...
#>  $ svymthRound22: num [1:50] NA 332 1226 420 1221 ...
#>  $ svymthRound24: num [1:50] NA 363 555 466 943 ...

# Merge Back, now dataframe is both wide and long
df_hw_cebu_mth_widelong <- df_hw_cebu %>% left_join(df_hw_cebu_mth_wide)
#> Joining, by = "indi.id"

Aggregate by Subgroup first columns by Info for Each Month of Age

First, do not create additional dates based subgroups, just use raw dates variables, with unique indi/date, and generate the wide + long version of data.


# Generate New Variable

# Variables to do groupings by
svr_id_t <- 'mth6'
svr_id_i <- 'indi.id'
svr_data <- 'cal'
bl_gen_newgrp <- FALSE

# Select vars to keep for spreading
ls_svr_6mth_keep <- c(svr_id_i, svr_id_t, svr_data)
df_hw_cebu_6mth_2spread <- df_hw_cebu %>% select(!!!syms(ls_svr_6mth_keep))

# Aggregate
svr_data_mean <- paste(svr_data,svr_id_t,'mean',sep='_')
df_hw_cebu_6mth_2spread <- df_hw_cebu_6mth_2spread %>%
  group_by(!!sym(svr_id_i), !!sym(svr_id_t)) %>%
  summarise(!!sym(svr_data_mean) := mean(!!sym(svr_data)))
#> `summarise()` has grouped output by 'indi.id'. You can override using the `.groups` argument.

# Spread
df_hw_cebu_6mth_wide <- df_hw_cebu_6mth_2spread %>% spread(!!sym(svr_id_t), !!sym(svr_data_mean), sep = "")

# Merge Back, now dataframe is both wide and long
df_hw_cebu_widelong <- df_hw_cebu %>%
                          left_join(df_hw_cebu_mth_wide) %>%
                          left_join(df_hw_cebu_6mth_wide)
#> Joining, by = "indi.id"
#> Joining, by = "indi.id"

## Testing Regression Linear Including RHS Lag Input interaction with time

Regression Testings

Regression Testings Small Subsets

attach(df_hw_cebu_widelong)
vf_months <- model.matrix(~factor(svymthRound))
vf_mth6 <- model.matrix(~factor(mth6))

Regression Testings Small Subsets Single Months

# Regression Model:
# h_t = a_0*t + a_1*1{t}*N_t + a_2*1{t}*N_{t-1}
rs_test = lm(hgt ~
               svymthRound +
               vf_months[,9:12]:svymthRound18 +
               vf_months[,10:12]:svymthRound20 +
               vf_months[,11:12]:svymthRound22 +
               vf_months[,12]:svymthRound24 ,
             data=df_hw_cebu_widelong)
rsm_test = summary(rs_test)
rsm_test =

rs_test = lm(hgt ~
               svymthRound +
               log(svymthRound) +
               1/(svymthRound) +
               vf_months[,1:12]:svymthRound2 +
               vf_months[,2:12]:svymthRound4 +
               vf_months[,2:12]:svymthRound6 +
               vf_months[,4:12]:svymthRound8 +
               vf_months[,5:12]:svymthRound10 +
               vf_months[,6:12]:svymthRound12 +
               vf_months[,7:12]:svymthRound14 +
               vf_months[,8:12]:svymthRound16 +
               vf_months[,9:12]:svymthRound18 +
               vf_months[,10:12]:svymthRound20 +
               vf_months[,11:12]:svymthRound22 +
               vf_months[,12]:svymthRound24 ,
             data=df_hw_cebu_widelong)
rsm_test = summary(rs_test)
#rsm_test

Regression Testing Every Single Month

rs_test = lm(hgt ~
             factor(indi.id):svymthRound +
             factor(indi.id):log(svymthRound) +
             factor(indi.id):1/(svymthRound) +
             vf_months[,1:12]:svymthRound2 +
             vf_months[,2:12]:svymthRound4 +
             vf_months[,3:12]:svymthRound6 +
             vf_months[,4:12]:svymthRound8 +
             vf_months[,5:12]:svymthRound10 +
             vf_months[,6:12]:svymthRound12 +
             vf_months[,7:12]:svymthRound14 +
             vf_months[,8:12]:svymthRound16 +
             vf_months[,9:12]:svymthRound18 +
             vf_months[,10:12]:svymthRound20 +
             vf_months[,11:12]:svymthRound22 +
             vf_months[,12]:svymthRound24 ,
           data=df_hw_cebu_widelong)
rsm_coef = tidy(rs_test)
# vf_months[, 11:12]factor(svymthRound)24:svymthRound22
rsm_coef_inputs = rsm_coef %>% filter(grepl('vf_months', term))
rsm_coef_inputs
#> # A tibble: 78 x 5
#>    term                                     estimate std.error statistic p.value
#>    <chr>                                       <dbl>     <dbl>     <dbl>   <dbl>
#>  1 vf_months[, 1:12](Intercept):svymthRoun~  -0.0145   0.0145      -1.00  0.318 
#>  2 vf_months[, 1:12]factor(svymthRound)4:s~  -0.0252   0.0144      -1.75  0.0801
#>  3 vf_months[, 1:12]factor(svymthRound)6:s~  -0.0333   0.0196      -1.70  0.0896
#>  4 vf_months[, 1:12]factor(svymthRound)8:s~  -0.0324   0.0212      -1.52  0.129 
#>  5 vf_months[, 1:12]factor(svymthRound)10:~  -0.0351   0.0210      -1.67  0.0950
#>  6 vf_months[, 1:12]factor(svymthRound)12:~  -0.0365   0.0195      -1.88  0.0613
#>  7 vf_months[, 1:12]factor(svymthRound)14:~  -0.0310   0.0171      -1.81  0.0709
#>  8 vf_months[, 1:12]factor(svymthRound)16:~  -0.0242   0.0142      -1.71  0.0890
#>  9 vf_months[, 1:12]factor(svymthRound)18:~  -0.0193   0.0108      -1.78  0.0756
#> 10 vf_months[, 1:12]factor(svymthRound)20:~  -0.0104   0.00731     -1.42  0.157 
#> # ... with 68 more rows

Regression Testing 6 months Inputs

rs_test_mth4 = lm(hgt ~
             factor(indi.id):svymthRound +
             factor(indi.id):log(svymthRound) +
             factor(indi.id):1/(svymthRound) +
             vf_mth6[,1:4]:mth6m00t06 +
             vf_mth6[,2:4]:mth6m08t12 +
             vf_mth6[,3:4]:mth6m14t18 +
               vf_mth6[,4]:mth6m20t24 ,
           data=df_hw_cebu_widelong)
rsm_coef_mth4 = tidy(rs_test_mth4)
rsm_coef_inputs = rsm_coef_mth4 %>% filter(grepl('vf_mth6', term))
rsm_coef_inputs
#> # A tibble: 10 x 5
#>    term                                     estimate std.error statistic p.value
#>    <chr>                                       <dbl>     <dbl>     <dbl>   <dbl>
#>  1 vf_mth6[, 1:4](Intercept):mth6m00t06     -0.00413   0.00217    -1.90  0.0577 
#>  2 vf_mth6[, 1:4]factor(mth6)m08t12:mth6m0~ -0.00782   0.00333    -2.35  0.0193 
#>  3 vf_mth6[, 1:4]factor(mth6)m14t18:mth6m0~ -0.00747   0.00557    -1.34  0.181  
#>  4 vf_mth6[, 1:4]factor(mth6)m20t24:mth6m0~ -0.00641   0.00852    -0.752 0.453  
#>  5 vf_mth6[, 2:4]factor(mth6)m08t12:mth6m0~  0.00539   0.00193     2.80  0.00537
#>  6 vf_mth6[, 2:4]factor(mth6)m14t18:mth6m0~  0.00781   0.00392     1.99  0.0469 
#>  7 vf_mth6[, 2:4]factor(mth6)m20t24:mth6m0~  0.0103    0.00631     1.63  0.105  
#>  8 vf_mth6[, 3:4]factor(mth6)m14t18:mth6m1~ -0.00144   0.00118    -1.22  0.225  
#>  9 vf_mth6[, 3:4]factor(mth6)m20t24:mth6m1~ -0.00164   0.00235    -0.700 0.484  
#> 10 vf_mth6[, 4]:mth6m20t24                  -0.00129   0.00136    -0.945 0.345