vignettes/fv_panel_cumsum_grouplast.Rmd
fv_panel_cumsum_grouplast.Rmd
Dataset tracks individuals over time. At each date, calculate the cumulative sum across all individuals based on each individual’s final date of observation. This file works out how the ff_panel_cumsum_grouplast function works.
There are \(N\) groups indexed by \(j\), each group has \(\left\{M_{i}\right\}_{i=1}^{N}\) individuals. Variable \(x_{ji}\) is some variable measured at the group/individual level. There is a scheme that selects only individuals whose \(x_{ji} < W\). But we will only take one individual from each group, could be the individual from the group with the highest \(x_{ji}\) value. One could construct a variable \(G\), where each row corresponds to a \(x_{ji}\) value. What is the sum of all \(y_{ji}\) conditional on \(x_{ji} \le W\), only counting the largest individual within group.
Suppose we have data from N years, but each year’s data is incomplete, so information from some months is unavailable. Generate a cumulative sum up to any calendar month, where we only sum up the value observed on the last available date of each year, and on the last available month of the current year up to the month of accounting.
# Load Library
rm(list = ls(all.names = TRUE))
# library(tidyverse)
# library(tidymodels)
# library(rlang)
library(tibble)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
library(REconTools)
library(knitr)
library(kableExtra)
#>
#> Attaching package: 'kableExtra'
#> The following object is masked from 'package:dplyr':
#>
#> group_rows
# Generate X vector
set.seed(12345)
# Number of N
it_N <- 5
# M values for each i
ar_it_M <- sample(1:10, it_N, replace = TRUE)
ar_it_M_ID <- sample(1:it_N, it_N, replace = FALSE)
# Generate dataframe
tb_combine <- as_tibble(cbind(ar_it_M, ar_it_M_ID)) %>% rowid_to_column(var = "id")
# Generate X Vector
tb_long <- tb_combine %>% uncount(ar_it_M)
tb_long <- tb_long %>% add_column(xrand = runif(dim(tb_long)[1])) %>% arrange(xrand) %>% mutate(x = row_number())
# Generate within Group Rank
tb_long <- tb_long %>% arrange(id, x) %>% group_by(id) %>% mutate(rank_l = row_number())
# Select Core
tb_data <- tb_long %>% select(id, x) %>% add_column(y = runif(dim(tb_long)[1])) %>%
arrange(id,x) %>% group_by(id) %>% mutate(y = cumsum(y))
# Display
kable(tb_long) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
id | ar_it_M_ID | xrand | x | rank_l |
---|---|---|---|---|
1 | 2 | 0.1789636 | 6 | 1 |
1 | 2 | 0.3881440 | 16 | 2 |
1 | 2 | 0.4024851 | 18 | 3 |
2 | 5 | 0.2264672 | 9 | 1 |
2 | 5 | 0.3267524 | 13 | 2 |
2 | 5 | 0.3898285 | 17 | 3 |
2 | 5 | 0.4537281 | 20 | 4 |
2 | 5 | 0.5440579 | 22 | 5 |
2 | 5 | 0.6445426 | 25 | 6 |
2 | 5 | 0.6985436 | 27 | 7 |
2 | 5 | 0.7074819 | 28 | 8 |
2 | 5 | 0.9516588 | 37 | 9 |
2 | 5 | 0.9654153 | 39 | 10 |
3 | 3 | 0.0059876 | 1 | 1 |
3 | 3 | 0.1877124 | 7 | 2 |
3 | 3 | 0.3616256 | 14 | 3 |
3 | 3 | 0.3701041 | 15 | 4 |
3 | 3 | 0.4845578 | 21 | 5 |
3 | 3 | 0.6818336 | 26 | 6 |
3 | 3 | 0.7930072 | 32 | 7 |
3 | 3 | 0.8687949 | 34 | 8 |
4 | 4 | 0.0601952 | 4 | 1 |
4 | 4 | 0.1340316 | 5 | 2 |
4 | 4 | 0.2596812 | 10 | 3 |
4 | 4 | 0.3212247 | 12 | 4 |
4 | 4 | 0.4291988 | 19 | 5 |
4 | 4 | 0.6174246 | 23 | 6 |
4 | 4 | 0.7732432 | 30 | 7 |
4 | 4 | 0.7821933 | 31 | 8 |
4 | 4 | 0.9041547 | 35 | 9 |
4 | 4 | 0.9272740 | 36 | 10 |
5 | 1 | 0.0434565 | 2 | 1 |
5 | 1 | 0.0550538 | 3 | 2 |
5 | 1 | 0.2130255 | 8 | 3 |
5 | 1 | 0.3150282 | 11 | 4 |
5 | 1 | 0.6255428 | 24 | 5 |
5 | 1 | 0.7324961 | 29 | 6 |
5 | 1 | 0.8273029 | 33 | 7 |
5 | 1 | 0.9644703 | 38 | 8 |
kable(tb_data ) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
id | x | y |
---|---|---|
1 | 6 | 0.4992410 |
1 | 16 | 1.2290130 |
1 | 18 | 1.3093490 |
2 | 9 | 0.4355305 |
2 | 13 | 0.6721109 |
2 | 17 | 1.4636787 |
2 | 20 | 1.7223631 |
2 | 22 | 2.7083469 |
2 | 25 | 3.4652206 |
2 | 27 | 4.4449989 |
2 | 28 | 4.6639467 |
2 | 37 | 5.6126539 |
2 | 39 | 5.7621118 |
3 | 1 | 0.6003570 |
3 | 7 | 1.5467877 |
3 | 14 | 2.2351411 |
3 | 15 | 2.7406748 |
3 | 21 | 3.1136110 |
3 | 26 | 3.4494161 |
3 | 32 | 3.4976674 |
3 | 34 | 4.1166150 |
4 | 4 | 0.9614473 |
4 | 5 | 1.6164078 |
4 | 10 | 2.1266998 |
4 | 12 | 2.2767980 |
4 | 19 | 3.1472459 |
4 | 23 | 3.6616876 |
4 | 30 | 3.6703355 |
4 | 31 | 3.6895302 |
4 | 35 | 3.8340421 |
4 | 36 | 4.1390739 |
5 | 2 | 0.8256569 |
5 | 3 | 1.3280015 |
5 | 8 | 2.1315741 |
5 | 11 | 2.1922141 |
5 | 24 | 3.1201692 |
5 | 29 | 3.9283478 |
5 | 33 | 4.0071611 |
5 | 38 | 4.6080894 |
The dataframe we constructed has three columns, id, x and y.
This is a slow algorithm, redo all computations with the addition of each row, this works, but is very slow. Recompute row by row fully.
tb_data <- tb_data %>% arrange(x)
tb_data_cum_sum_top <- tb_data %>% mutate(cum_sum_top = 0,
cum_mean_top = 0,
cum_median_top = 0)
for (row_ctr in seq(1, dim(tb_data)[1])) {
# select up to current row sort and group
tb_data_up2row <- tb_data[1:row_ctr,] %>% arrange(id, x) %>% group_by(id)
# Obtain last element sorted by x for each group, and resort by x
tb_data_up2row <- tb_data_up2row %>% slice(n()) %>% arrange(x)
# cumulative sum of the highest element of each group below row_ctr
fl_cum_sum_top_cur <- tb_data_up2row %>% ungroup() %>%
summarize(y_sum_top_up2row = sum(y)) %>% pull(y_sum_top_up2row)
fl_cum_mean_top_cur <- tb_data_up2row %>% ungroup() %>%
summarize(y_mean_top_up2row = mean(y)) %>% pull(y_mean_top_up2row)
fl_cum_median_top_cur <- tb_data_up2row %>% ungroup() %>%
summarize(y_median_top_up2row = median(y)) %>% pull(y_median_top_up2row)
# Store results
tb_data_cum_sum_top[row_ctr, 'cum_sum_top'] <- fl_cum_sum_top_cur
tb_data_cum_sum_top[row_ctr, 'cum_mean_top'] <- fl_cum_mean_top_cur
tb_data_cum_sum_top[row_ctr, 'cum_median_top'] <- fl_cum_median_top_cur
# Display
if (row_ctr %% 10 == 0) {
cat('row_ctr:', row_ctr, '\n')
print(tb_data_up2row)
}
}
#> row_ctr: 10
#> # A tibble: 5 x 3
#> # Groups: id [5]
#> id x y
#> <int> <int> <dbl>
#> 1 1 6 0.499
#> 2 3 7 1.55
#> 3 5 8 2.13
#> 4 2 9 0.436
#> 5 4 10 2.13
#> row_ctr: 20
#> # A tibble: 5 x 3
#> # Groups: id [5]
#> id x y
#> <int> <int> <dbl>
#> 1 5 11 2.19
#> 2 3 15 2.74
#> 3 1 18 1.31
#> 4 4 19 3.15
#> 5 2 20 1.72
#> row_ctr: 30
#> # A tibble: 5 x 3
#> # Groups: id [5]
#> id x y
#> <int> <int> <dbl>
#> 1 1 18 1.31
#> 2 3 26 3.45
#> 3 2 28 4.66
#> 4 5 29 3.93
#> 5 4 30 3.67
# Display Final
kable(tb_data_cum_sum_top) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
id | x | y | cum_sum_top | cum_mean_top | cum_median_top |
---|---|---|---|---|---|
3 | 1 | 0.6003570 | 0.600357 | 0.6003570 | 0.6003570 |
5 | 2 | 0.8256569 | 1.426014 | 0.7130069 | 0.7130069 |
5 | 3 | 1.3280015 | 1.928359 | 0.9641792 | 0.9641792 |
4 | 4 | 0.9614473 | 2.889806 | 0.9632686 | 0.9614473 |
4 | 5 | 1.6164078 | 3.544766 | 1.1815888 | 1.3280015 |
1 | 6 | 0.4992410 | 4.044007 | 1.0110018 | 0.9641792 |
3 | 7 | 1.5467877 | 4.990438 | 1.2476095 | 1.4373946 |
5 | 8 | 2.1315741 | 5.794011 | 1.4485027 | 1.5815978 |
2 | 9 | 0.4355305 | 6.229541 | 1.2459082 | 1.5467877 |
4 | 10 | 2.1266998 | 6.739833 | 1.3479666 | 1.5467877 |
5 | 11 | 2.1922141 | 6.800473 | 1.3600946 | 1.5467877 |
4 | 12 | 2.2767980 | 6.950571 | 1.3901143 | 1.5467877 |
2 | 13 | 0.6721109 | 7.187152 | 1.4374304 | 1.5467877 |
3 | 14 | 2.2351411 | 7.875505 | 1.5751010 | 2.1922141 |
3 | 15 | 2.7406748 | 8.381039 | 1.6762078 | 2.1922141 |
1 | 16 | 1.2290130 | 9.110811 | 1.8221622 | 2.1922141 |
2 | 17 | 1.4636787 | 9.902379 | 1.9804757 | 2.1922141 |
1 | 18 | 1.3093490 | 9.982715 | 1.9965429 | 2.1922141 |
4 | 19 | 3.1472459 | 10.853163 | 2.1706325 | 2.1922141 |
2 | 20 | 1.7223631 | 11.111847 | 2.2223694 | 2.1922141 |
3 | 21 | 3.1136110 | 11.484783 | 2.2969566 | 2.1922141 |
2 | 22 | 2.7083469 | 12.470767 | 2.4941534 | 2.7083469 |
4 | 23 | 3.6616876 | 12.985209 | 2.5970417 | 2.7083469 |
5 | 24 | 3.1201692 | 13.913164 | 2.7826328 | 3.1136110 |
2 | 25 | 3.4652206 | 14.670037 | 2.9340075 | 3.1201692 |
3 | 26 | 3.4494161 | 15.005843 | 3.0011685 | 3.4494161 |
2 | 27 | 4.4449989 | 15.985621 | 3.1971242 | 3.4494161 |
2 | 28 | 4.6639467 | 16.204569 | 3.2409137 | 3.4494161 |
5 | 29 | 3.9283478 | 17.012747 | 3.4025494 | 3.6616876 |
4 | 30 | 3.6703355 | 17.021395 | 3.4042790 | 3.6703355 |
4 | 31 | 3.6895302 | 17.040590 | 3.4081180 | 3.6895302 |
3 | 32 | 3.4976674 | 17.088841 | 3.4177682 | 3.6895302 |
5 | 33 | 4.0071611 | 17.167655 | 3.4335309 | 3.6895302 |
3 | 34 | 4.1166150 | 17.786602 | 3.5573204 | 4.0071611 |
4 | 35 | 3.8340421 | 17.931114 | 3.5862228 | 4.0071611 |
4 | 36 | 4.1390739 | 18.236146 | 3.6472292 | 4.1166150 |
2 | 37 | 5.6126539 | 19.184853 | 3.8369706 | 4.1166150 |
5 | 38 | 4.6080894 | 19.785781 | 3.9571562 | 4.1390739 |
2 | 39 | 5.7621118 | 19.935239 | 3.9870478 | 4.1390739 |
tb_data <- tb_data %>% arrange(x)
tb_data_cum_sum_top <- tb_data %>% mutate(cum_sum_top = 0,
cum_mean_top = 0,
cum_median_top = 0)
ar_latest_indi <- rep(NA, 1, it_N)
for (row_ctr in seq(1, dim(tb_data)[1])) {
# current rank, what is the ID of the person at this rank
it_id_row <- tb_data[['id']][row_ctr]
# update the overall individual array with highest, latest value
ar_latest_indi[it_id_row] <- tb_data[['y']][row_ctr]
# Compute sum.
fl_cum_sum_top_cur <- sum(ar_latest_indi, na.rm = TRUE)
fl_cum_mean_top_cur <- mean(ar_latest_indi, na.rm = TRUE)
fl_cum_median_top_cur <- median(ar_latest_indi, na.rm = TRUE)
# Store results
tb_data_cum_sum_top[row_ctr, 'cum_sum_top'] <- fl_cum_sum_top_cur
tb_data_cum_sum_top[row_ctr, 'cum_mean_top'] <- fl_cum_mean_top_cur
tb_data_cum_sum_top[row_ctr, 'cum_median_top'] <- fl_cum_median_top_cur
# Display
if (row_ctr %% 10 == 0) {
cat('row_ctr:', row_ctr, '\n')
print(tb_data_up2row)
}
}
#> row_ctr: 10
#> # A tibble: 5 x 3
#> # Groups: id [5]
#> id x y
#> <int> <int> <dbl>
#> 1 1 18 1.31
#> 2 3 34 4.12
#> 3 4 36 4.14
#> 4 5 38 4.61
#> 5 2 39 5.76
#> row_ctr: 20
#> # A tibble: 5 x 3
#> # Groups: id [5]
#> id x y
#> <int> <int> <dbl>
#> 1 1 18 1.31
#> 2 3 34 4.12
#> 3 4 36 4.14
#> 4 5 38 4.61
#> 5 2 39 5.76
#> row_ctr: 30
#> # A tibble: 5 x 3
#> # Groups: id [5]
#> id x y
#> <int> <int> <dbl>
#> 1 1 18 1.31
#> 2 3 34 4.12
#> 3 4 36 4.14
#> 4 5 38 4.61
#> 5 2 39 5.76
# Display Final
kable(tb_data_cum_sum_top) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
id | x | y | cum_sum_top | cum_mean_top | cum_median_top |
---|---|---|---|---|---|
3 | 1 | 0.6003570 | 0.600357 | 0.6003570 | 0.6003570 |
5 | 2 | 0.8256569 | 1.426014 | 0.7130069 | 0.7130069 |
5 | 3 | 1.3280015 | 1.928359 | 0.9641792 | 0.9641792 |
4 | 4 | 0.9614473 | 2.889806 | 0.9632686 | 0.9614473 |
4 | 5 | 1.6164078 | 3.544766 | 1.1815888 | 1.3280015 |
1 | 6 | 0.4992410 | 4.044007 | 1.0110018 | 0.9641792 |
3 | 7 | 1.5467877 | 4.990438 | 1.2476095 | 1.4373946 |
5 | 8 | 2.1315741 | 5.794011 | 1.4485027 | 1.5815978 |
2 | 9 | 0.4355305 | 6.229541 | 1.2459082 | 1.5467877 |
4 | 10 | 2.1266998 | 6.739833 | 1.3479666 | 1.5467877 |
5 | 11 | 2.1922141 | 6.800473 | 1.3600946 | 1.5467877 |
4 | 12 | 2.2767980 | 6.950571 | 1.3901143 | 1.5467877 |
2 | 13 | 0.6721109 | 7.187152 | 1.4374304 | 1.5467877 |
3 | 14 | 2.2351411 | 7.875505 | 1.5751010 | 2.1922141 |
3 | 15 | 2.7406748 | 8.381039 | 1.6762078 | 2.1922141 |
1 | 16 | 1.2290130 | 9.110811 | 1.8221622 | 2.1922141 |
2 | 17 | 1.4636787 | 9.902379 | 1.9804757 | 2.1922141 |
1 | 18 | 1.3093490 | 9.982715 | 1.9965429 | 2.1922141 |
4 | 19 | 3.1472459 | 10.853163 | 2.1706325 | 2.1922141 |
2 | 20 | 1.7223631 | 11.111847 | 2.2223694 | 2.1922141 |
3 | 21 | 3.1136110 | 11.484783 | 2.2969566 | 2.1922141 |
2 | 22 | 2.7083469 | 12.470767 | 2.4941534 | 2.7083469 |
4 | 23 | 3.6616876 | 12.985209 | 2.5970417 | 2.7083469 |
5 | 24 | 3.1201692 | 13.913164 | 2.7826328 | 3.1136110 |
2 | 25 | 3.4652206 | 14.670037 | 2.9340075 | 3.1201692 |
3 | 26 | 3.4494161 | 15.005843 | 3.0011685 | 3.4494161 |
2 | 27 | 4.4449989 | 15.985621 | 3.1971242 | 3.4494161 |
2 | 28 | 4.6639467 | 16.204569 | 3.2409137 | 3.4494161 |
5 | 29 | 3.9283478 | 17.012747 | 3.4025494 | 3.6616876 |
4 | 30 | 3.6703355 | 17.021395 | 3.4042790 | 3.6703355 |
4 | 31 | 3.6895302 | 17.040590 | 3.4081180 | 3.6895302 |
3 | 32 | 3.4976674 | 17.088841 | 3.4177682 | 3.6895302 |
5 | 33 | 4.0071611 | 17.167655 | 3.4335309 | 3.6895302 |
3 | 34 | 4.1166150 | 17.786602 | 3.5573204 | 4.0071611 |
4 | 35 | 3.8340421 | 17.931114 | 3.5862228 | 4.0071611 |
4 | 36 | 4.1390739 | 18.236146 | 3.6472292 | 4.1166150 |
2 | 37 | 5.6126539 | 19.184853 | 3.8369706 | 4.1166150 |
5 | 38 | 4.6080894 | 19.785781 | 3.9571562 | 4.1390739 |
2 | 39 | 5.7621118 | 19.935239 | 3.9870478 | 4.1390739 |
Testing ff_panel_cumsum_grouplast.
Within group last occurance, cumulative sum of last occurance of individual by date. Think of date as x, y as individual outcomes, and id as individual ID. Each individual is observed at multiple x points. In the example here, at each x, only one id observed.
ff_panel_cumsum_grouplast(tb_data, svr_id='id', svr_x='x', svr_y='y', svr_cumsumtop = 'y_cumsum', stat='sum')
#> # A tibble: 39 x 4
#> # Groups: id [5]
#> id x y y_cumsum
#> <int> <int> <dbl> <dbl>
#> 1 3 1 0.600 0.600
#> 2 5 2 0.826 1.43
#> 3 5 3 1.33 1.93
#> 4 4 4 0.961 2.89
#> 5 4 5 1.62 3.54
#> 6 1 6 0.499 4.04
#> 7 3 7 1.55 4.99
#> 8 5 8 2.13 5.79
#> 9 2 9 0.436 6.23
#> 10 4 10 2.13 6.74
#> # ... with 29 more rows
Thinking of taking Average SAT scores but, only of the last score people have. Calculate a moving average, where the moving window is all past information, and the average is across individuals, and for each individual, we only take the most recent score.
ff_panel_cumsum_grouplast(tb_data, svr_id='id', svr_x='x', svr_y='y', svr_cumsumtop = 'y_movingaverage_mean', stat='mean')
#> # A tibble: 39 x 4
#> # Groups: id [5]
#> id x y y_movingaverage_mean
#> <int> <int> <dbl> <dbl>
#> 1 3 1 0.600 0.600
#> 2 5 2 0.826 0.713
#> 3 5 3 1.33 0.964
#> 4 4 4 0.961 0.963
#> 5 4 5 1.62 1.18
#> 6 1 6 0.499 1.01
#> 7 3 7 1.55 1.25
#> 8 5 8 2.13 1.45
#> 9 2 9 0.436 1.25
#> 10 4 10 2.13 1.35
#> # ... with 29 more rows
Suppose I have 100,000 records each at a unique sequencial date, records for over 500 individuals, each individual multiple times. Cumulative sum at each date, only summing the latest value for each individual. What is the speed of the algorithm here.
First Explore algorithms, initial algorithm very slow, improved algorithm dramtically faster. Using option quick to control
# Initialize
set.seed(67890)
it_loop <- 3
it_N_min <- 5
it_N_max <- 100
it_N_max <- 30
it_max_M_min <- 10
it_max_M_max <- 100
it_max_M_max <- 30
ar_it_N <- floor(seq(it_N_min, it_N_max, length.out=it_loop))
ar_it_max_M <- floor(seq(it_max_M_min, it_max_M_max, length.out=it_loop))
for (it_ctr in seq(1, it_loop)) {
# Set df size
it_N <- ar_it_N[it_ctr]
it_max_M <- ar_it_max_M[it_ctr]
# Generate Panel Frame
df_start_time <- Sys.time()
ar_it_M <- sample(1:it_max_M, it_N, replace = TRUE)
ar_it_M_ID <- sample(1:it_N, it_N, replace = FALSE)
tb_combine <- as_tibble(cbind(ar_it_M, ar_it_M_ID)) %>% rowid_to_column(var = "id")
tb_long <- tb_combine %>% uncount(ar_it_M)
tb_long <- tb_long %>% add_column(xrand = runif(dim(tb_long)[1])) %>% arrange(xrand) %>% mutate(x = row_number())
tb_long <- tb_long %>% arrange(id, x) %>% group_by(id) %>% mutate(rank_l = row_number())
df <- tb_long %>% select(id, x) %>% add_column(y = runif(dim(tb_long)[1])) %>% arrange(id,x) %>% group_by(id) %>% mutate(y = cumsum(y))
df_end_time <- Sys.time()
# Timing Test FAST
start_time_fast <- Sys.time()
tb_data_cum_sum_top <- ff_panel_cumsum_grouplast(df, svr_id='id', svr_x='x', svr_y='y', svr_cumsumtop = 'y_movingsum_lastestscore', stat='sum', quick=TRUE)
end_time_fast <- Sys.time()
# Timing Test SLOW VERBOSE TRUE
start_time_slow <- Sys.time()
tb_data_cum_sum_top <- ff_panel_cumsum_grouplast(df, svr_id='id', svr_x='x', svr_y='y', svr_cumsumtop = 'y_movingsum_lastestscore', stat='sum', quick=FALSE, verbose = TRUE)
end_time_slow <- Sys.time()
# Timing Test SLOW VERBOSE FALSE
start_time_slow_verbose <- Sys.time()
tb_data_cum_sum_top <- ff_panel_cumsum_grouplast(df, svr_id='id', svr_x='x', svr_y='y', svr_cumsumtop = 'y_movingsum_lastestscore', stat='sum', quick=FALSE, verbose = FALSE)
end_time_slow_verbose <- Sys.time()
cat('it_ctr:', it_ctr, ', it_N:', it_N, ', it_max_N:', it_max_M, '\n')
print(paste0('Df Generation Took:', df_end_time - df_start_time))
print(paste0('Moving Stat Took QUICK True:', end_time_fast - start_time_fast))
print(paste0('Moving Stat Took SLOW VERBOSE True:', end_time_slow - start_time_slow))
print(paste0('Moving Stat Took SLOW VERBOSE False:', end_time_slow_verbose - start_time_slow_verbose))
}
#> it_ctr: 1 , it_N: 5 , it_max_N: 10
#> [1] "Df Generation Took:0.0268862247467041"
#> [1] "Moving Stat Took QUICK True:0.0159308910369873"
#> [1] "Moving Stat Took SLOW VERBOSE True:0.121488094329834"
#> [1] "Moving Stat Took SLOW VERBOSE False:0.101566791534424"
#> it_ctr: 2 , it_N: 17 , it_max_N: 20
#> [1] "Df Generation Took:0.0209119319915771"
#> [1] "Moving Stat Took QUICK True:0.191187143325806"
#> [1] "Moving Stat Took SLOW VERBOSE True:1.85213279724121"
#> [1] "Moving Stat Took SLOW VERBOSE False:1.43789315223694"
#> it_ctr: 3 , it_N: 30 , it_max_N: 30
#> [1] "Df Generation Took:0.0209121704101562"
#> [1] "Moving Stat Took QUICK True:0.658204793930054"
#> [1] "Moving Stat Took SLOW VERBOSE True:6.17477202415466"
#> [1] "Moving Stat Took SLOW VERBOSE False:5.06150007247925"
Panel structure gets somewhat large, 1000 individuals, each observed on average 100 times. This would be a very substantial panel. About 33 seconds, not very fast, could be improved later.
The structure is iterative by construction, difficult to fully vectorize. Perhaps achievable as well and faster with some moving average function, rolling window functions.
# Initialize
set.seed(67890)
it_loop <- 5
it_N_min <- 5
it_N_max <- 50
it_max_M_min <- 10
it_max_M_max <- 20
ar_it_N <- floor(seq(it_N_min, it_N_max, length.out=it_loop))
ar_it_max_M <- floor(seq(it_max_M_min, it_max_M_max, length.out=it_loop))
for (it_ctr in seq(1, it_loop)) {
# Set df size
it_N <- ar_it_N[it_ctr]
it_max_M <- ar_it_max_M[it_ctr]
# Generate Panel Frame
df_start_time <- Sys.time()
ar_it_M <- sample(1:it_max_M, it_N, replace = TRUE)
ar_it_M_ID <- sample(1:it_N, it_N, replace = FALSE)
tb_combine <- as_tibble(cbind(ar_it_M, ar_it_M_ID)) %>% rowid_to_column(var = "id")
tb_long <- tb_combine %>% uncount(ar_it_M)
tb_long <- tb_long %>% add_column(xrand = runif(dim(tb_long)[1])) %>% arrange(xrand) %>% mutate(x = row_number())
tb_long <- tb_long %>% arrange(id, x) %>% group_by(id) %>% mutate(rank_l = row_number())
df <- tb_long %>% select(id, x) %>% add_column(y = runif(dim(tb_long)[1])) %>% arrange(id,x) %>% group_by(id) %>% mutate(y = cumsum(y))
df_end_time <- Sys.time()
# Timing Test FAST
start_time_fast <- Sys.time()
tb_data_cum_sum_top <- ff_panel_cumsum_grouplast(df, svr_id='id', svr_x='x', svr_y='y', svr_cumsumtop = 'y_movingsum_lastestscore', stat='sum', quick=TRUE)
end_time_fast <- Sys.time()
cat('it_ctr:', it_ctr, ', it_N:', it_N, ', it_max_N:', it_max_M, '\n')
print(paste0('Df Generation Took:', df_end_time - df_start_time))
print(paste0('Moving Stat Took QUICK True:', end_time_fast - start_time_fast))
}
#> it_ctr: 1 , it_N: 5 , it_max_N: 10
#> [1] "Df Generation Took:0.0229029655456543"
#> [1] "Moving Stat Took QUICK True:0.0159330368041992"
#> it_ctr: 2 , it_N: 16 , it_max_N: 12
#> [1] "Df Generation Took:0.0238981246948242"
#> [1] "Moving Stat Took QUICK True:0.164302825927734"
#> it_ctr: 3 , it_N: 27 , it_max_N: 15
#> [1] "Df Generation Took:0.0209109783172607"
#> [1] "Moving Stat Took QUICK True:0.317649841308594"
#> it_ctr: 4 , it_N: 38 , it_max_N: 17
#> [1] "Df Generation Took:0.022907018661499"
#> [1] "Moving Stat Took QUICK True:0.479958057403564"
#> it_ctr: 5 , it_N: 50 , it_max_N: 20
#> [1] "Df Generation Took:0.0219070911407471"
#> [1] "Moving Stat Took QUICK True:0.615386009216309"