Join Panel with Multiple Keys
We have two datasets, one for student enrollment, panel over time, but some students do not show up on some dates. The other is a skeleton panel with all student ID and all dates. Often we need to join dataframes together, and we need to join by the student ID and the panel time Key at the same time. When students show up, there is a quiz score for that day, so the joined panel should have as data column quiz score
Student count is N, total dates are M. First we generate two panels below, then we join by both keys using left_join. First, define dataframes:
it_N <- 4
it_M <- 3
svr_id <- 'sid'
svr_date <- 'classday'
svr_attend <- 'date_in_class'
df_panel_balanced_skeleton <- as_tibble(matrix(it_M, nrow=it_N, ncol=1)) %>%
rowid_to_column(var = svr_id) %>%
uncount(V1) %>%
group_by(!!sym(svr_id)) %>% mutate(!!sym(svr_date) := row_number()) %>%
ungroup()
kable(df_panel_balanced_skeleton) %>%
kable_styling_fc()
sid
|
classday
|
1
|
1
|
1
|
2
|
1
|
3
|
2
|
1
|
2
|
2
|
2
|
3
|
3
|
1
|
3
|
2
|
3
|
3
|
4
|
1
|
4
|
2
|
4
|
3
|
set.seed(456)
df_panel_attend <- as_tibble(matrix(it_M, nrow=it_N, ncol=1)) %>%
rowid_to_column(var = svr_id) %>%
uncount(V1) %>%
group_by(!!sym(svr_id)) %>% mutate(!!sym(svr_date) := row_number()) %>%
ungroup() %>% mutate(in_class = case_when(rnorm(n(),mean=0,sd=1) < 0 ~ 1, TRUE ~ 0)) %>%
filter(in_class == 1) %>% select(!!sym(svr_id), !!sym(svr_date)) %>%
rename(!!sym(svr_attend) := !!sym(svr_date)) %>%
mutate(dayquizscore = rnorm(n(),mean=80,sd=10))
kable(df_panel_attend) %>%
kable_styling_fc()
sid
|
date_in_class
|
dayquizscore
|
1
|
1
|
89.88726
|
2
|
1
|
96.53929
|
2
|
2
|
65.59195
|
2
|
3
|
99.47356
|
4
|
2
|
97.36936
|
Second, now join dataframes:
df_quiz_joined_multikey <- df_panel_balanced_skeleton %>%
left_join(df_panel_attend,
by=(c('sid'='sid', 'classday'='date_in_class')))
df_quiz_joined_multikey_setnames <- df_panel_balanced_skeleton %>%
left_join(df_panel_attend, by=setNames(c('sid', 'date_in_class'), c('sid', 'classday')))
kable(df_quiz_joined_multikey) %>%
kable_styling_fc()
sid
|
classday
|
dayquizscore
|
1
|
1
|
89.88726
|
1
|
2
|
NA
|
1
|
3
|
NA
|
2
|
1
|
96.53929
|
2
|
2
|
65.59195
|
2
|
3
|
99.47356
|
3
|
1
|
NA
|
3
|
2
|
NA
|
3
|
3
|
NA
|
4
|
1
|
NA
|
4
|
2
|
97.36936
|
4
|
3
|
NA
|
kable(df_quiz_joined_multikey_setnames) %>%
kable_styling_fc()
sid
|
classday
|
dayquizscore
|
1
|
1
|
89.88726
|
1
|
2
|
NA
|
1
|
3
|
NA
|
2
|
1
|
96.53929
|
2
|
2
|
65.59195
|
2
|
3
|
99.47356
|
3
|
1
|
NA
|
3
|
2
|
NA
|
3
|
3
|
NA
|
4
|
1
|
NA
|
4
|
2
|
97.36936
|
4
|
3
|
NA
|
Stack Panel Frames Together
There are multiple panel dataframe, each for different subsets of dates. All variable names and units of observations are identical. Use DPLYR bind_rows.
it_N <- 2
it_M <- 3
svr_id <- 'sid'
svr_date <- 'date'
df_panel_m1tom3 <- as_tibble(matrix(it_M, nrow=it_N, ncol=1)) %>%
rowid_to_column(var = svr_id) %>%
uncount(V1) %>%
group_by(!!sym(svr_id)) %>% mutate(!!sym(svr_date) := row_number()) %>%
ungroup()
df_panel_m4tom6 <- as_tibble(matrix(it_M, nrow=it_N, ncol=1)) %>%
rowid_to_column(var = svr_id) %>%
uncount(V1) %>%
group_by(!!sym(svr_id)) %>% mutate(!!sym(svr_date) := row_number() + 3) %>%
ungroup()
df_panel_m1tm6 <- bind_rows(df_panel_m1tom3, df_panel_m4tom6) %>% arrange(!!!syms(c(svr_id, svr_date)))
kable(df_panel_m1tom3) %>%
kable_styling_fc()
sid
|
date
|
1
|
1
|
1
|
2
|
1
|
3
|
2
|
1
|
2
|
2
|
2
|
3
|
kable(df_panel_m4tom6) %>%
kable_styling_fc()
sid
|
date
|
1
|
4
|
1
|
5
|
1
|
6
|
2
|
4
|
2
|
5
|
2
|
6
|
kable(df_panel_m1tm6) %>%
kable_styling_fc()
sid
|
date
|
1
|
1
|
1
|
2
|
1
|
3
|
1
|
4
|
1
|
5
|
1
|
6
|
2
|
1
|
2
|
2
|
2
|
3
|
2
|
4
|
2
|
5
|
2
|
6
|