Join Panel with Multiple Keys
We have two datasets, one for student enrollment, panel over time, but some students do not show up on some dates. The other is a skeleton panel with all student ID and all dates. Often we need to join dataframes together, and we need to join by the student ID and the panel time Key at the same time. When students show up, there is a quiz score for that day, so the joined panel should have as data column quiz score
Student count is \(N\), total dates are \(M\). First we generate two panels below, then we join by both keys using left_join. First, define dataframes:
# Define
it_N <- 4
it_M <- 3
svr_id <- 'sid'
svr_date <- 'classday'
svr_attend <- 'date_in_class'
# Panel Skeleton
df_panel_balanced_skeleton <- as_tibble(matrix(it_M, nrow=it_N, ncol=1)) %>%
rowid_to_column(var = svr_id) %>%
uncount(V1) %>%
group_by(!!sym(svr_id)) %>% mutate(!!sym(svr_date) := row_number()) %>%
ungroup()
# Print
kable(df_panel_balanced_skeleton) %>%
kable_styling_fc()
sid
|
classday
|
1
|
1
|
1
|
2
|
1
|
3
|
2
|
1
|
2
|
2
|
2
|
3
|
3
|
1
|
3
|
2
|
3
|
3
|
4
|
1
|
4
|
2
|
4
|
3
|
# Smaller Panel of Random Days in School
set.seed(456)
df_panel_attend <- as_tibble(matrix(it_M, nrow=it_N, ncol=1)) %>%
rowid_to_column(var = svr_id) %>%
uncount(V1) %>%
group_by(!!sym(svr_id)) %>% mutate(!!sym(svr_date) := row_number()) %>%
ungroup() %>% mutate(in_class = case_when(rnorm(n(),mean=0,sd=1) < 0 ~ 1, TRUE ~ 0)) %>%
filter(in_class == 1) %>% select(!!sym(svr_id), !!sym(svr_date)) %>%
rename(!!sym(svr_attend) := !!sym(svr_date)) %>%
mutate(dayquizscore = rnorm(n(),mean=80,sd=10))
# Print
kable(df_panel_attend) %>%
kable_styling_fc()
sid
|
date_in_class
|
dayquizscore
|
1
|
1
|
89.88726
|
2
|
1
|
96.53929
|
2
|
2
|
65.59195
|
2
|
3
|
99.47356
|
4
|
2
|
97.36936
|
Second, now join dataframes:
# Join with explicit names
df_quiz_joined_multikey <- df_panel_balanced_skeleton %>%
left_join(df_panel_attend,
by=(c('sid'='sid', 'classday'='date_in_class')))
# Join with setname strings
df_quiz_joined_multikey_setnames <- df_panel_balanced_skeleton %>%
left_join(df_panel_attend, by=setNames(c('sid', 'date_in_class'), c('sid', 'classday')))
# Print
kable(df_quiz_joined_multikey) %>%
kable_styling_fc()
sid
|
classday
|
dayquizscore
|
1
|
1
|
89.88726
|
1
|
2
|
NA
|
1
|
3
|
NA
|
2
|
1
|
96.53929
|
2
|
2
|
65.59195
|
2
|
3
|
99.47356
|
3
|
1
|
NA
|
3
|
2
|
NA
|
3
|
3
|
NA
|
4
|
1
|
NA
|
4
|
2
|
97.36936
|
4
|
3
|
NA
|
kable(df_quiz_joined_multikey_setnames) %>%
kable_styling_fc()
sid
|
classday
|
dayquizscore
|
1
|
1
|
89.88726
|
1
|
2
|
NA
|
1
|
3
|
NA
|
2
|
1
|
96.53929
|
2
|
2
|
65.59195
|
2
|
3
|
99.47356
|
3
|
1
|
NA
|
3
|
2
|
NA
|
3
|
3
|
NA
|
4
|
1
|
NA
|
4
|
2
|
97.36936
|
4
|
3
|
NA
|
Stack Panel Frames Together
There are multiple panel dataframe, each for different subsets of dates. All variable names and units of observations are identical. Use DPLYR bind_rows.
# Define
it_N <- 2 # Number of individuals
it_M <- 3 # Number of Months
svr_id <- 'sid'
svr_date <- 'date'
# Panel First Half of Year
df_panel_m1tom3 <- as_tibble(matrix(it_M, nrow=it_N, ncol=1)) %>%
rowid_to_column(var = svr_id) %>%
uncount(V1) %>%
group_by(!!sym(svr_id)) %>% mutate(!!sym(svr_date) := row_number()) %>%
ungroup()
# Panel Second Half of Year
df_panel_m4tom6 <- as_tibble(matrix(it_M, nrow=it_N, ncol=1)) %>%
rowid_to_column(var = svr_id) %>%
uncount(V1) %>%
group_by(!!sym(svr_id)) %>% mutate(!!sym(svr_date) := row_number() + 3) %>%
ungroup()
# Bind Rows
df_panel_m1tm6 <- bind_rows(df_panel_m1tom3, df_panel_m4tom6) %>% arrange(!!!syms(c(svr_id, svr_date)))
# Print
kable(df_panel_m1tom3) %>%
kable_styling_fc()
sid
|
date
|
1
|
1
|
1
|
2
|
1
|
3
|
2
|
1
|
2
|
2
|
2
|
3
|
kable(df_panel_m4tom6) %>%
kable_styling_fc()
sid
|
date
|
1
|
4
|
1
|
5
|
1
|
6
|
2
|
4
|
2
|
5
|
2
|
6
|
kable(df_panel_m1tm6) %>%
kable_styling_fc()
sid
|
date
|
1
|
1
|
1
|
2
|
1
|
3
|
1
|
4
|
1
|
5
|
1
|
6
|
2
|
1
|
2
|
2
|
2
|
3
|
2
|
4
|
2
|
5
|
2
|
6
|