Go to the RMD, R, PDF, or HTML version of this file. Go back to fan’s REconTools Package, R Code Examples Repository (bookdown site), or Intro Stats with R Repository (bookdown site).
There are multiple csv files, each was simulated with a different combination of parameters, each file has the same columns and perhaps even the same number of rows. We want to combine the files together, and provide correct attributes to rows from each table stacked, based on each underlying csv file’s file name.
This is necessary, for example, when running computational exercises across EC2 instances in batch array and files are saved to different S3 folders. Need to gather parallel computational results together in a single file after syncing files locally with S3.
In the csv folder under this section, there are four subfolder, each containing 3 files with identical file structures.
We want to find the relevant csv files from these directories, and stack the results together.
First, search and find all files with certain prefix.
# can search in multiple paths, second path here has no relevant contents
spt_roots <- c('C:/Users/fan/R4Econ/panel/basic/_file/csv',
'C:/Users/fan/R4Econ/panel/basic/_file/tex')
# can skip file names with certain strings
spn_skip <- c('A3420')
# prefix search patther,
st_search_str <- 'solu_19E1NEp99r99x_ITG_PE_cev_*'
# Search and get all Path
ls_sfls <- list.files(path=spt_roots,
recursive=T,
pattern=st_search_str,
full.names=T)
# Skip path if contains words in skip list
if(!missing(spn_skip)) {
ls_sfls <- ls_sfls[!grepl(paste(spn_skip, collapse = "|"), ls_sfls)]
}
Second, show all the files found, show their full path, the file name and the two folder names above the file name.
# Loop and print found files
it_folders_names_to_keep = 2
for (spt_file in ls_sfls) {
ls_srt_folders_name_keep <- tail(strsplit(spt_file, "/")[[1]], n=it_folders_names_to_keep+1)
snm_file_name <- tail(ls_srt_folders_name_keep, 1)
ls_srt_folders_keep <- head(ls_srt_folders_name_keep, it_folders_names_to_keep)
print(paste0('path:', spt_file))
print(snm_file_name)
print(ls_srt_folders_keep)
}
## [1] "path:C:/Users/fan/R4Econ/panel/basic/_file/csv/cev-2000/solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A0.csv"
## [1] "solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A0.csv"
## [1] "csv" "cev-2000"
## [1] "path:C:/Users/fan/R4Econ/panel/basic/_file/csv/cev-2000/solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A6840.csv"
## [1] "solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A6840.csv"
## [1] "csv" "cev-2000"
## [1] "path:C:/Users/fan/R4Econ/panel/basic/_file/csv/cev-947/solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A0.csv"
## [1] "solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A0.csv"
## [1] "csv" "cev-947"
## [1] "path:C:/Users/fan/R4Econ/panel/basic/_file/csv/cev-947/solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A6840.csv"
## [1] "solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A6840.csv"
## [1] "csv" "cev-947"
## [1] "path:C:/Users/fan/R4Econ/panel/basic/_file/csv/cev2000/solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A0.csv"
## [1] "solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A0.csv"
## [1] "csv" "cev2000"
## [1] "path:C:/Users/fan/R4Econ/panel/basic/_file/csv/cev2000/solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A6840.csv"
## [1] "solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A6840.csv"
## [1] "csv" "cev2000"
## [1] "path:C:/Users/fan/R4Econ/panel/basic/_file/csv/cev947/solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947_A0.csv"
## [1] "solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947_A0.csv"
## [1] "csv" "cev947"
## [1] "path:C:/Users/fan/R4Econ/panel/basic/_file/csv/cev947/solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947_A6840.csv"
## [1] "solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947_A6840.csv"
## [1] "csv" "cev947"
Third, create a dataframe with the folder and file names:
# String matrix empty
mt_st_paths_names <- matrix(data=NA, nrow=length(ls_sfls), ncol=4)
# Loop and print found files
it_folders_names_to_keep = 2
it_file_counter = 0
for (spt_file in ls_sfls) {
# row counter
it_file_counter = it_file_counter + 1
# get file paths
ls_srt_folders_name_keep <- tail(strsplit(spt_file, "/")[[1]], n=it_folders_names_to_keep+1)
snm_file_name <- tail(ls_srt_folders_name_keep, 1)
ls_srt_folders_keep <- head(ls_srt_folders_name_keep, it_folders_names_to_keep)
# store
# tools::file_path_sans_ext to drop suffix
mt_st_paths_names[it_file_counter,1] = tools::file_path_sans_ext(snm_file_name)
mt_st_paths_names[it_file_counter,2] = ls_srt_folders_keep[1]
mt_st_paths_names[it_file_counter,3] = ls_srt_folders_keep[2]
mt_st_paths_names[it_file_counter,4] = spt_file
}
# Column Names
ar_st_varnames <- c('fileid','name','folder1','folder2', 'fullpath')
# Combine to tibble, add name col1, col2, etc.
tb_csv_info <- as_tibble(mt_st_paths_names) %>%
rowid_to_column(var = "id") %>%
rename_all(~c(ar_st_varnames))
# Display
kable(tb_csv_info[,1:4]) %>% kable_styling_fc()
fileid | name | folder1 | folder2 |
---|---|---|---|
1 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A0 | csv | cev-2000 |
2 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A6840 | csv | cev-2000 |
3 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A0 | csv | cev-947 |
4 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A6840 | csv | cev-947 |
5 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A0 | csv | cev2000 |
6 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A6840 | csv | cev2000 |
7 | solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947_A0 | csv | cev947 |
8 | solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947_A6840 | csv | cev947 |
Fourth, create a dataframe by expanding each row with the datafile loaded in, use apply with anonymous function.
# Generate a list of dataframes
ls_df_loaded_files =
apply(tb_csv_info,
1,
function(row) {
# Loading file
spn_full_path <- row[5]
mt_csv = read.csv(file = spn_full_path)
# dataframe
it_fileid <- row[1]
snm_filename <- row[2]
srt_folder_level2 <- row[3]
srt_folder_level1 <- row[4]
tb_combine = as_tibble(mt_csv) %>%
na.omit %>%
rowid_to_column(var = "statesid") %>%
mutate(fileid = it_fileid,
filename = snm_filename,
folder_lvl1 = srt_folder_level1,
folder_lvl2 = srt_folder_level2) %>%
select(fileid, filename, folder_lvl1, folder_lvl2,
statesid, everything())
# return
return(tb_combine)
})
# Stack dataframes together
df_all_files = do.call(bind_rows, ls_df_loaded_files)
# show stacked table
kable(df_all_files[seq(1,601,50),1:6]) %>% kable_styling_fc_wide()
fileid | filename | folder_lvl1 | folder_lvl2 | statesid | EjV |
---|---|---|---|---|---|
1 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A0 | cev-2000 | csv | 1 | -28.8586860 |
1 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A0 | cev-2000 | csv | 51 | -0.2106603 |
2 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A6840 | cev-2000 | csv | 3 | -28.8586860 |
2 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A6840 | cev-2000 | csv | 53 | -0.0642997 |
3 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A0 | cev-947 | csv | 5 | -5.8826609 |
3 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A0 | cev-947 | csv | 55 | 0.0353187 |
4 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A6840 | cev-947 | csv | 7 | -2.7046907 |
4 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A6840 | cev-947 | csv | 57 | 0.1094474 |
5 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A0 | cev2000 | csv | 9 | -2.9782236 |
5 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A0 | cev2000 | csv | 59 | 0.3389275 |
6 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A6840 | cev2000 | csv | 11 | -1.7229647 |
6 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A6840 | cev2000 | csv | 61 | -14.6880377 |
7 | solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947_A0 | cev947 | csv | 13 | -1.7623279 |
Fifth, get additional information from the file name and file folder. Extract those as separate variables. The file names is dash connected, with various information. First, split just the final element of the string file name out, which is A###. Then, also extract the number next to N as a separate numeric column. Additional folder_lvl1 separate out the numeric number from the initial word cev.
# separate last eleemtnafter underscore
df_all_files_finalA <- df_all_files %>%
separate(filename, into = c("filename_main", "prod_type_st"),
sep="_(?=[^_]+$)",
remove = FALSE) %>%
select(fileid, filename, filename_main, prod_type_st, folder_lvl1, folder_lvl2,
statesid, everything())
# show stacked table
kable(df_all_files_finalA[seq(1,601,50),1:10]) %>% kable_styling_fc_wide()
fileid | filename | filename_main | prod_type_st | folder_lvl1 | folder_lvl2 | statesid | EjV | k_tt | b_tt |
---|---|---|---|---|---|---|---|---|---|
1 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A0 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000 | A0 | cev-2000 | csv | 1 | -28.8586860 | 0.000000 | 0.000000 |
1 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A0 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000 | A0 | cev-2000 | csv | 51 | -0.2106603 | 0.000000 | 78.005300 |
2 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A6840 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000 | A6840 | cev-2000 | csv | 3 | -28.8586860 | 0.000000 | 0.000000 |
2 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000_A6840 | solu_19E1NEp99r99x_ITG_PE_cev_c0_cev-2000 | A6840 | cev-2000 | csv | 53 | -0.0642997 | 0.000000 | 84.215368 |
3 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A0 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947 | A0 | cev-947 | csv | 5 | -5.8826609 | 0.000000 | 3.869909 |
3 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A0 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947 | A0 | cev-947 | csv | 55 | 0.0353187 | 0.000000 | 90.611739 |
4 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A6840 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947 | A6840 | cev-947 | csv | 7 | -2.7046907 | 0.000000 | 7.855916 |
4 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947_A6840 | solu_19E1NEp99r99x_ITG_PE_cev_c5_cev-947 | A6840 | cev-947 | csv | 57 | 0.1094474 | 0.000000 | 90.611739 |
5 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A0 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000 | A0 | cev2000 | csv | 9 | -2.9782236 | 0.000000 | 7.855916 |
5 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A0 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000 | A0 | cev2000 | csv | 59 | 0.3389275 | 0.000000 | 97.200000 |
6 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A6840 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000 | A6840 | cev2000 | csv | 11 | -1.7229647 | 0.000000 | 11.961502 |
6 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000_A6840 | solu_19E1NEp99r99x_ITG_PE_cev_c19_cev2000 | A6840 | cev2000 | csv | 61 | -14.6880377 | 1.990694 | -1.879215 |
7 | solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947_A0 | solu_19E1NEp99r99x_ITG_PE_cev_c14_cev947 | A0 | cev947 | csv | 13 | -1.7623279 | 0.000000 | 16.190257 |
Split “A###” to “A” and “A###”. Additionally, also split cev#### to cev and ####, allow for positive and negative numbers. See regular expression 101 helper
# string and number separation
df_all_files_finalB <- df_all_files_finalA %>%
separate(prod_type_st,
into = c("prod_type_st_prefix", "prod_type_lvl"),
sep="(?<=[A-Za-z])(?=[-0-9])", # positive or negative numbers
remove=FALSE) %>%
separate(folder_lvl1,
into = c("cev_prefix", "cev_lvl"),
sep="(?<=[A-Za-z])(?=[-0-9])", # positive or negative numbers
remove=FALSE) %>%
mutate(cev_st = folder_lvl1,
prod_type_lvl = as.numeric(prod_type_lvl),
cev_lvl = as.numeric(cev_lvl)/10000) %>%
select(fileid,
prod_type_st, prod_type_lvl,
cev_st, cev_lvl,
statesid, EjV,
filename, folder_lvl1, folder_lvl2)
# Ordering, sort by cev_lvl, then prod_type_lvl, then stateid
df_all_files_finalB <- df_all_files_finalB %>%
arrange(cev_lvl, prod_type_lvl, statesid)
# show stacked table
kable(df_all_files_finalB[seq(1,49*16,49),1:7]) %>% kable_styling_fc_wide()
fileid | prod_type_st | prod_type_lvl | cev_st | cev_lvl | statesid | EjV |
---|---|---|---|---|---|---|
1 | A0 | 0 | cev-2000 | -0.2000 | 1 | -28.8586860 |
1 | A0 | 0 | cev-2000 | -0.2000 | 50 | -0.2106603 |
2 | A6840 | 6840 | cev-2000 | -0.2000 | 1 | -28.8586860 |
2 | A6840 | 6840 | cev-2000 | -0.2000 | 50 | -0.1311749 |
3 | A0 | 0 | cev-947 | -0.0947 | 1 | -28.0399281 |
3 | A0 | 0 | cev-947 | -0.0947 | 50 | -0.0911499 |
4 | A6840 | 6840 | cev-947 | -0.0947 | 1 | -28.0399281 |
4 | A6840 | 6840 | cev-947 | -0.0947 | 50 | -0.0134719 |
7 | A0 | 0 | cev947 | 0.0947 | 1 | -26.8243673 |
7 | A0 | 0 | cev947 | 0.0947 | 50 | 0.0857474 |
8 | A6840 | 6840 | cev947 | 0.0947 | 1 | -26.8243673 |
8 | A6840 | 6840 | cev947 | 0.0947 | 50 | 0.1608382 |
5 | A0 | 0 | cev2000 | 0.2000 | 1 | -26.2512036 |
5 | A0 | 0 | cev2000 | 0.2000 | 50 | 0.1694524 |
6 | A6840 | 6840 | cev2000 | 0.2000 | 1 | -26.2512036 |
6 | A6840 | 6840 | cev2000 | 0.2000 | 50 | 0.2432677 |