Chapter 1 Array, Matrix, Dataframe
1.1 List
1.1.1 Lists
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
- r list tutorial
- r vector vs list
- r initialize empty multiple element list
- r name rows and columns of 2 dimensional list
- r row and colum names of list
- list dimnames
- r named list to string
1.1.1.1 Iteratively Build Up a List of Strings
Build up a list of strings, where the strings share common components. Iterate over lists to generate variations in elements of the string list.
# name fragments shared by every string
st_base_name <- 'snwx_v_planner_docdense'
st_base_middle <- 'b1_xi0_manna_88'
# fragments that vary across strings
ar_st_beta_val <- c('bt60', 'bt70', 'bt80', 'bt90')
ar_st_edu_type <- c('e1lm2', 'e2hm2')
# all (edu, beta) pairs; expand.grid varies its first column fastest, so the
# education type cycles within each beta value
df_combo <- expand.grid(
  edu = ar_st_edu_type,
  beta = ar_st_beta_val,
  stringsAsFactors = FALSE
)
# e.g. 'snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt90'
ls_snm <- as.list(
  paste(st_base_name, df_combo$edu, st_base_middle, df_combo$beta, sep = '_')
)
# print one string per line
for (it_snm in seq_along(ls_snm)) {
  print(ls_snm[[it_snm]])
}
## [1] "snwx_v_planner_docdense_e1lm2_b1_xi0_manna_88_bt60"
## [1] "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt60"
## [1] "snwx_v_planner_docdense_e1lm2_b1_xi0_manna_88_bt70"
## [1] "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt70"
## [1] "snwx_v_planner_docdense_e1lm2_b1_xi0_manna_88_bt80"
## [1] "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt80"
## [1] "snwx_v_planner_docdense_e1lm2_b1_xi0_manna_88_bt90"
## [1] "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt90"
## [1] TRUE
1.1.1.2 Named List of Matrixes
Save a list of matrixes. Retrieve Element of that list via loop.
# Define an array of means to loop over
ar_fl_mean <- c(10, 20, 30)
# Store results in a named list, one slot per mean
ls_mt_res <- vector(mode = "list", length = length(ar_fl_mean))
ar_st_names <- paste0('mean', ar_fl_mean)
names(ls_mt_res) <- ar_st_names
# Loop and generate one tibble per mean
for (it_fl_mean in seq_along(ar_fl_mean)) {
  fl_mean <- ar_fl_mean[it_fl_mean]
  # seed by position so that each element is reproducible on its own
  set.seed(it_fl_mean)
  # draw exactly nrow * ncol = 6 values; drawing only 4 made matrix() recycle
  # the draws and emit a data-length warning
  mt_draws <- matrix(rnorm(6, mean = fl_mean, sd = 1), nrow = 2, ncol = 3)
  tb_combine <- as_tibble(mt_draws) %>%
    rowid_to_column(var = "id") %>%
    rename_all(~c('id', 'var1', 'varb', 'vartheta'))
  ls_mt_res[[it_fl_mean]] <- tb_combine
}
# Retrieve elements: by position, by $name, and by [['name']]
print(ls_mt_res[[1]])
print(ls_mt_res$mean10)
print(ls_mt_res[['mean10']])
# Print via Loop
for (it_fl_mean in seq_along(ar_fl_mean)) {
  tb_combine <- ls_mt_res[[it_fl_mean]]
  print(tb_combine)
}
1.1.1.3 One Dimensional Named List
- define list
- slice list
- print r named list as a single line string
# Unnamed lists: all numeric, all character, and mixed
ls_num <- list(1, 2, 3)
ls_str <- list('1', '2', '3')
ls_num_str <- list(1, 2, '3')
# Attach element names to a copy of the mixed list
ar_st_names <- c('e1', 'e2', 'e3')
ls_num_str_named <- setNames(ls_num_str, ar_st_names)
# Append a fourth, named element
ls_num_str_named[['e4']] <- 'this is added'
Initiate an empty list and add to it
# Initiate List
ls_abc <- vector(mode = "list", length = 0)
# Add Named Elements to List Sequentially
ls_abc$a <- 1
ls_abc$b <- 2
ls_abc$c <- 'abc\'s third element'
# Get all Names Added to List
ar_st_list_names <- names(ls_abc)
# Print the whole list
print(ls_abc)
## $a
## [1] 1
##
## $b
## [1] 2
##
## $c
## [1] "abc's third element"
# Print each element as name=value.
# seq_along() gives zero iterations for an empty list, and [[ ]] extracts the
# element itself rather than a one-element sub-list.
for (it_list_ele_ctr in seq_along(ar_st_list_names)) {
  st_list_ele_name <- ar_st_list_names[it_list_ele_ctr]
  st_list_ele_val <- ls_abc[[it_list_ele_ctr]]
  print(paste0(st_list_ele_name, '=', st_list_ele_val))
}
## [1] "a=1"
## [1] "b=2"
## [1] "c=abc's third element"
1.1.1.4 Named List Print Function
- r print input as string
- r print parameter code as string
- How to convert variable (object) name into String
The function below ffi_lst2str is also a function in REconTools: ff_sup_lst2str.
# Collapse a list into a single "desc:name1=value1;name2=value2" string.
#
# ls_list:  list (or vector) to summarise; unnamed elements appear as "=value".
# st_desc:  label placed before the colon; when missing, the expression the
#           caller supplied for ls_list is used.
# bl_print: if TRUE (the default), print the string.
# Returns the string invisibly, whether or not it was printed.
ffi_lst2str <- function(ls_list, st_desc, bl_print = TRUE) {
  if (missing(st_desc)) {
    # deparse() splits long expressions over several strings; join them so
    # that the label, and hence the result, stays a single string
    st_desc <- paste(deparse(substitute(ls_list)), collapse = "")
  }
  st_string_from_list <- paste0(
    st_desc, ':',
    paste(names(ls_list), ls_list, sep = "=", collapse = ";")
  )
  if (bl_print) {
    print(st_string_from_list)
  }
  invisible(st_string_from_list)
}
# print full
# ls_num is the unnamed numeric list from the one-dimensional list example,
# so every entry is shown as "=value"
ffi_lst2str(ls_num)
## [1] "ls_num:=1;=2;=3"
## [1] "ls_str:=1;=2;=3"
## [1] "ls_num_str:=1;=2;=3"
## [1] "ls_num_str_named:e1=1;e2=2;e3=3;e4=this is added"
## [1] "ls_num[2:3]:=2;=3"
## [1] "ls_str[2:3]:=2;=3"
## [1] "ls_num_str[2:4]:=2;=3;=NULL"
## [1] "ls_num_str_named[c(\"e2\", \"e3\", \"e4\")]:e2=2;e3=3;e4=this is added"
1.1.1.5 Two Dimensional Unnamed List
Generate a multiple dimensional list:
- Initiate with an N element empty list
- Reshape list to M by Q
- Fill list elements
- Get list element by row and column number
List allows for different data types to be stored together.
Note that element specific names in named list are not preserved when the list is reshaped to be two dimensional. Two dimensional list, however, could have row and column names.
# Target shape: it_M rows by it_Q columns, it_N cells in total
it_M <- 2
it_Q <- 3
it_N <- it_M * it_Q
ar_it_shape <- c(it_M, it_Q)
# Flat, empty list with one NULL slot per cell
ls_2d_flat <- vector(mode = "list", length = it_N)
# Flat copy whose elements are named e1, ..., eN
ls_2d_flat_named <- setNames(ls_2d_flat, paste0('e', seq_len(it_N)))
# Reshape an unnamed copy into an M by Q list
ls_2d <- ls_2d_flat
dim(ls_2d) <- ar_it_shape
# Reshape a named copy as well; the 1d element names do not survive the reshape
ls_2d_named <- ls_2d_flat_named
dim(ls_2d_named) <- ar_it_shape
Print Various objects generated above, print list flattened.
## [1] "ls_2d_flat_named:e1=NULL;e2=NULL;e3=NULL;e4=NULL;e5=NULL;e6=NULL"
## [1] "ls_2d_named:=NULL;=NULL;=NULL;=NULL;=NULL;=NULL"
## [,1] [,2] [,3]
## [1,] NULL NULL NULL
## [2,] NULL NULL NULL
Select element from list:
## [1] "ls_2d[[1,2]]"
## NULL
1.1.1.6 Define Two Dimensional Named List
For naming two dimensional lists, rownames and colnames do not work. Rather, we need to use dimnames. Note that in addition to dimnames, we can continue to have element-specific names; both can co-exist. However, element-specific names are not preserved after a dimension transform, so they need to be redefined afterwards.
How to select an element of a two dimensional list:
- row and column names: dimnames, ls_2d_flat_named[[‘row2’,‘col2’]]
- named elements: names, ls_2d_flat_named[[‘e5’]]
- select by index: index, ls_2d_flat_named[[5]]
- converted two dimensional named list to tibble/matrix
Neither dimnames nor names are required, but both can be used to select elements.
# Dimensions
it_M <- 3
it_Q <- 4
it_N <- it_M * it_Q
# Initiate an empty M by Q list with N elements
ls_2d_flat_named <- vector(mode = "list", length = it_N)
dim(ls_2d_flat_named) <- c(it_M, it_Q)
# Fill column by column; each cell stores its own linear index
for (it_Q_ctr in seq_len(it_Q)) {
  for (it_M_ctr in seq_len(it_M)) {
    ls_2d_flat_named[[it_M_ctr, it_Q_ctr]] <- (it_Q_ctr - 1) * it_M + it_M_ctr
  }
}
# Replace row names, note rownames does not work
dimnames(ls_2d_flat_named)[[1]] <- paste0('row', seq_len(it_M))
dimnames(ls_2d_flat_named)[[2]] <- paste0('col', seq_len(it_Q))
# Element Specific Names, set after the reshape so that they are kept
names(ls_2d_flat_named) <- paste0('e', seq_len(it_N))
# Convert to tibble, then to matrix.
# unnest() needs an explicit cols argument in tidyr >= 1.0; all columns are
# list-columns here, so unnest every one of them.
tb_2d_flat_named <- as_tibble(ls_2d_flat_named) %>%
  unnest(cols = everything())
mt_2d_flat_named <- as.matrix(tb_2d_flat_named)
Print various objects generated above:
## [1] "ls_2d_flat_named"
## col1 col2 col3 col4
## row1 1 4 7 10
## row2 2 5 8 11
## row3 3 6 9 12
## attr(,"names")
## [1] "e1" "e2" "e3" "e4" "e5" "e6" "e7" "e8" "e9" "e10" "e11" "e12"
## [1] "tb_2d_flat_named"
## [1] "mt_2d_flat_named"
## col1 col2 col3 col4
## [1,] 1 4 7 10
## [2,] 2 5 8 11
## [3,] 3 6 9 12
Select elements from list:
## [1] "ls_2d_flat_named[[\"row2\", \"col2\"]]:=5"
## [1] "ls_2d_flat_named[[\"e5\"]]:=5"
## [1] "ls_2d_flat_named[[5]]:=5"
1.1.1.7 Two-Dimensional Named List for Joint Probability Mass
There are two discrete random variables, generate some random discrete probability mass, name the columns and rows, and then convert to matrix.
set.seed(123)
# Generate prob list: it_M rows (E outcomes) by it_Q columns (A outcomes)
it_Q <- 2
it_M <- 2
ls_2d <- vector(mode = "list", length = it_M * it_Q)
# rows first, then columns, to agree with the [[row, column]] indexing and
# the dimnames below (this was c(it_Q, it_M), which only worked because M == Q)
dim(ls_2d) <- c(it_M, it_Q)
# Random joint mass, normalised to sum to one
ar_rand <- runif(it_Q * it_M)
ar_rand <- ar_rand / sum(ar_rand)
# Fill with values, column by column
for (it_Q_ctr in seq_len(it_Q)) {
  for (it_M_ctr in seq_len(it_M)) {
    # linear (column-major) index into ar_rand
    ls_2d[[it_M_ctr, it_Q_ctr]] <- ar_rand[(it_Q_ctr - 1) * it_M + it_M_ctr]
  }
}
# Name rows (E) and columns (A) through dimnames
dimnames(ls_2d) <- list(
  paste0('E', seq_len(it_M)),
  paste0('A', seq_len(it_Q))
)
# rename
ls_prob_joint_E_A <- ls_2d
# one matrix column per A outcome, filled column by column like the list
mt_prob_joint_E_A <- matrix(
  unlist(ls_prob_joint_E_A),
  nrow = it_M, ncol = it_Q, byrow = FALSE
)
print('ls_prob_joint_E_A')
## [1] "ls_prob_joint_E_A"
## A1 A2
## E1 0.1214495 0.1727188
## E2 0.3329164 0.3729152
## [,1] [,2]
## [1,] 0.1214495 0.1727188
## [2,] 0.3329164 0.3729152
Create conditional probabilities: \(F=P(A_1|E_1)\), \(B=P(A_1|E_2)\), \(C=P(E_1|A_1)\), \(D=P(E_1|A_2)\)
# Rows of the joint mass matrix are E outcomes, columns are A outcomes
ar_fl_joint_E1 <- mt_prob_joint_E_A[1, ]
ar_fl_joint_E2 <- mt_prob_joint_E_A[2, ]
ar_fl_joint_A1 <- mt_prob_joint_E_A[, 1]
ar_fl_joint_A2 <- mt_prob_joint_E_A[, 2]
# Conditioning on E: joint mass over the row total
fl_F <- ar_fl_joint_E1[1] / sum(ar_fl_joint_E1)
fl_B <- ar_fl_joint_E2[1] / sum(ar_fl_joint_E2)
# Conditioning on A: joint mass over the column total
fl_C <- ar_fl_joint_A1[1] / sum(ar_fl_joint_A1)
fl_D <- ar_fl_joint_A2[1] / sum(ar_fl_joint_A2)
st_cond_probs <- paste0('fl_F=', fl_F, ',fl_B=', fl_B, ',fl_C=', fl_C, ',fl_D=', fl_D)
print(st_cond_probs)
## [1] "fl_F=0.412857205138471,fl_B=0.471665472604598,fl_C=0.267294503388642,fl_D=0.316546995323062"
1.2 Array
1.2.1 Array Basics
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
1.2.1.2 Multidimensional Arrays
1.2.1.2.2 Generate 2 Dimensional Array
First, we will generate an NA matrix with 3 rows and 3 columns.
## [1] 3 3
## [,1] [,2] [,3]
## [1,] NA NA NA
## [2,] NA NA NA
## [3,] NA NA NA
Second, we will generate a matrix with 2 rows and four columns.
## [1] 2 4
## [,1] [,2] [,3] [,4]
## [1,] 1.0 0 0 0
## [2,] 1.5 2 4 3
1.2.1.2.3 Generate 3 Dimensional Array
First, we will create a three dimensional array with the same data as what was used to create the 2-dimensional array on top.
# Three-dimensional array: rows first, columns second, tensor (slice) third.
# Values fill column by column within a slice, then slice by slice:
# 1 lands in r1c1t1, 1.5 in r2c1t1, 0 in r1c2t1, and so on.
ar_fl_values <- c(1, 1.5, 0, 2, 0, 4, 0, 3)
ar_it_shape <- c(2, 2, 2)
x <- array(ar_fl_values, dim = ar_it_shape)
dim(x)
## [1] 2 2 2
## , , 1
##
## [,1] [,2]
## [1,] 1.0 0
## [2,] 1.5 2
##
## , , 2
##
## [,1] [,2]
## [1,] 0 0
## [2,] 4 3
Second, in the example below, we will generate a 3-dimensional array. The first dimension corresponds to different income levels, the second to marital status, and the third to the number of kids. We compute taxable income in 2008 at each income level, following IRS rules for exemptions and deductions.
# A. Income array: three equally spaced income levels
ar_income <- seq(0, 200000, length.out = 3)
# B. Exemptions and deductions
# exemption amount per household member
fl_exemption <- 3500
# deduction by marital status (rows) and number of children 0 to 4 (columns)
mt_deduction <- matrix(data = NA, nrow = 2, ncol = 5)
mt_deduction[1, 1] <- 5450 # single filers
mt_deduction[1, 2:5] <- 8000 # single filer with children
mt_deduction[2, ] <- 10900 # married couples filing jointly
# C. Taxable income, filled one number-of-kids slice at a time
ar_it_adults <- 1:2
mn_taxable_income <- array(NA, dim = c(length(ar_income), 2, 5))
for (it_kids in 0:4) {
  # exemptions for the adults and the kids plus the matching deduction,
  # one entry per marital status
  ar_fl_offset <- fl_exemption * ar_it_adults + fl_exemption * it_kids +
    mt_deduction[, it_kids + 1]
  # rows are income levels, columns are marital status
  mn_taxable_income[, , it_kids + 1] <- outer(ar_income, ar_fl_offset, "-")
}
# D. Name dimensions
dimnames(mn_taxable_income) <- list(
  paste0('income=', round(ar_income, 0)),
  paste0('married=', 0:1),
  paste0('kids=', 0:4)
)
# E. Print
dim(mn_taxable_income)
## [1] 3 2 5
## , , kids=0
##
## married=0 married=1
## income=0 -8950 -17900
## income=1e+05 91050 82100
## income=2e+05 191050 182100
##
## , , kids=1
##
## married=0 married=1
## income=0 -15000 -21400
## income=1e+05 85000 78600
## income=2e+05 185000 178600
##
## , , kids=2
##
## married=0 married=1
## income=0 -18500 -24900
## income=1e+05 81500 75100
## income=2e+05 181500 175100
##
## , , kids=3
##
## married=0 married=1
## income=0 -22000 -28400
## income=1e+05 78000 71600
## income=2e+05 178000 171600
##
## , , kids=4
##
## married=0 married=1
## income=0 -25500 -31900
## income=1e+05 74500 68100
## income=2e+05 174500 168100
1.2.1.3 Array Slicing
1.2.1.3.1 Get a Subset of Array Elements, N Cuts from M Points
There is an array with M elements, get N elements from the M elements.
First cut including the starting and ending points.
1.2.1.3.2 Remove Elements of Array
Select elements with direct indexing, or with head and tail functions. Get the first two elements of three elements array.
# Remove last element of array
vars.group.bydf <- c('23','dfa', 'wer')
# a negative index drops that position, so -length(x) drops the final element
vars.group.bydf[-length(vars.group.bydf)]
## [1] "23" "dfa"
## [1] "23" "dfa"
## [1] "23" "dfa"
Get last two elements of array.
# Remove first element of array
vars.group.bydf <- c('23','dfa', 'wer')
# keep positions 2 through the end; this form assumes at least two elements
vars.group.bydf[2:length(vars.group.bydf)]
## [1] "dfa" "wer"
## [1] "dfa" "wer"
## [1] "dfa" "wer"
Select all except for the first and the last element of an array.
# define array
ar_amin <- c(0, 0.25, 0.50, 0.75, 1)
# select without head and tail
# head(x, -1) drops the last element, then tail(x, -1) drops the first
tail(head(ar_amin, -1), -1)
## [1] 0.25 0.50 0.75
Select the first and the last element of an array. The extreme values.
# define array
ar_amin <- c(0, 0.25, 0.50, 0.75, 1)
# select head and tail
# head(x, 1) is the first element and tail(x, 1) is the last
c(head(ar_amin, 1), tail(ar_amin, 1))
## [1] 0 1
1.2.1.5 Complex Number
Handling numbers with real and imaginary components. Two separate issues, given an array of numbers that includes real as well as imaginary numbers, keep subset that only has real components. Additionally, for the same array, generate an equal length version of the array that includes the real components of all numbers.
Define complex numbers.
# A purely imaginary number, built from its two components
cx_number_a <- complex(real = 0, imaginary = 0.0460246857561777)
# A number with both a real and an imaginary part, written as a literal
cx_number_b <- 0.02560982+0.0460246857561777i
# An array in which each element is either purely real or purely imaginary
ar_fl_real_part <- c(0.02560982, 0.00000000, 0.00000000,
                     0.05462045, 0.00000000, 0.00000000)
ar_fl_imag_part <- c(0.000000000, 0.044895305, 0.009153429,
                     0.000000000, 0.001198538, 0.019267050)
ar_cx_number <- complex(real = ar_fl_real_part, imaginary = ar_fl_imag_part)
Extract real components from a complex array.
## [1] 0.02560982 0.00000000 0.00000000 0.05462045 0.00000000 0.00000000
## [1] 0.000000000 0.044895305 0.009153429 0.000000000 0.001198538 0.019267050
Keep only real elements of array.
# subset of array that is real: keep the elements whose imaginary part is zero.
# Testing Re(.) != 0 instead would drop a real zero and would keep numbers
# such as 1+2i that are not real.
ar_fl_number_re_subset <- Re(ar_cx_number[Im(ar_cx_number) == 0])
print(ar_fl_number_re_subset)
## [1] 0.02560982 0.05462045
1.2.1.7 String Conversions
1.2.1.7.1 Add Positive and Negative Sign in Front of Values
We have a sequence of integers, some positive and some negative. We convert this into a string array, and prepend a plus sign to positive values and a plus-minus sign to zero.
# Integers from -5 to 5
ar_it_vals <- seq(-5, 5, by = 1)
# String version; negative values already carry their minus sign
st_it_vals <- as.character(ar_it_vals)
# Prefix "+" to positive values and "±" to zero
bl_is_positive <- ar_it_vals > 0
bl_is_zero <- ar_it_vals == 0
st_it_vals[bl_is_positive] <- paste0("+", st_it_vals[bl_is_positive])
st_it_vals[bl_is_zero] <- paste0("±", st_it_vals[bl_is_zero])
# Display
print(st_it_vals)
## [1] "-5" "-4" "-3" "-2" "-1" "±0" "+1" "+2" "+3" "+4" "+5"
1.2.1.8 Basic array calculations
First, we demonstrate how purrr::reduce() works with a simple summation example. We use the addition operator.
## [1] 6
Second, what if there is an NA value? To ignore NA values, we write a custom function. The custom function, to work with reduce, should be such that it is “a binary function that takes two values and returns a single value”.
# Sum of two scalars that ignores NA.
#
# x, y: length-one values.
# Returns x + y when neither is NA, the non-NA value when exactly one is NA,
# and NA when both are NA. The separate "both NA" branch was unreachable:
# is.na(x) already returns y, which is NA in that case.
sum_ignore_na <- function(x, y) {
  if (is.na(x)) {
    y
  } else if (is.na(y)) {
    x
  } else {
    x + y
  }
}
# Using R pipe operator
# reduce applies sum_ignore_na pairwise along the array, so the NA is skipped
# 1 + 10 + 1 = 12
fl_sum <- c(1, 10, NA, 1) |> purrr::reduce(sum_ignore_na)
print(fl_sum)
## [1] 12
1.2.2 Generate Arrays
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
1.2.2.1 Generate Often Used Arrays
1.2.2.1.1 Equi-distance Array with Bound
Consider multiple income groups in income bins that are equal-width, for the final income group, consider all individuals above some final bin minimum bound. Below the code generates this array of numbers: \(0, 20000, 40000, 60000, 80000, 100000, 100000000\).
# income cut-offs: the first bin starts at zero
fl_bin_start <- 0
# every regular bin is 20,000 wide
fl_bin_width <- 2e4
# the open-ended final bin ends at 100 million
fl_bin_final_end <- 1e8
# and starts at 100,000 dollars
fl_bin_final_start <- 1e5
# equal-width cut-offs up to the start of the final bin, then its end point
ar_fl_regular_cuts <- seq(
  from = fl_bin_start, to = fl_bin_final_start, by = fl_bin_width
)
ar_income_bins <- append(ar_fl_regular_cuts, fl_bin_final_end)
# Display
print(ar_income_bins)
## [1] 0e+00 2e+04 4e+04 6e+04 8e+04 1e+05 1e+08
Generate finer bins, at 5000 USD intervals, and stopping at 200 thousand dollars.
# finer grid: 5,000-wide bins up to 200,000, then one bin up to 100 million
fl_bin_start <- 0
fl_bin_width <- 5e3
fl_bin_final_end <- 1e8
fl_bin_final_start <- 2e5
# equal-width cut-offs up to the start of the final bin, then its end point
ar_fl_regular_cuts <- seq(
  from = fl_bin_start, to = fl_bin_final_start, by = fl_bin_width
)
ar_income_bins <- append(ar_fl_regular_cuts, fl_bin_final_end)
print(ar_income_bins)
## [1] 0.00e+00 5.00e+03 1.00e+04 1.50e+04 2.00e+04 2.50e+04 3.00e+04 3.50e+04 4.00e+04
## [10] 4.50e+04 5.00e+04 5.50e+04 6.00e+04 6.50e+04 7.00e+04 7.50e+04 8.00e+04 8.50e+04
## [19] 9.00e+04 9.50e+04 1.00e+05 1.05e+05 1.10e+05 1.15e+05 1.20e+05 1.25e+05 1.30e+05
## [28] 1.35e+05 1.40e+05 1.45e+05 1.50e+05 1.55e+05 1.60e+05 1.65e+05 1.70e+05 1.75e+05
## [37] 1.80e+05 1.85e+05 1.90e+05 1.95e+05 2.00e+05 1.00e+08
1.2.2.1.2 Log Space Arrays
Often need to generate arrays on log rather than linear scale, below is log 10 scaled grid.
# Parameters
# NOTE(review): it.lower.bd.inc.cnt is not used anywhere in this snippet
it.lower.bd.inc.cnt <- 3
# lower and upper end points of the final grid
fl.log.lower <- -10
fl.log.higher <- -9
# small positive offset so that log10() is defined at the lower end point
fl.min.rescale <- 0.01
# number of grid points
it.log.count <- 4
# Generate
# points equally spaced in log10 between fl.min.rescale and
# fl.min.rescale + (fl.log.higher - fl.log.lower); exp(log(10) * z) is 10^z
ar.fl.log.rescaled <- exp(log(10) * seq(log10(fl.min.rescale),
log10(fl.min.rescale +
(fl.log.higher - fl.log.lower)),
length.out = it.log.count
))
# shift the grid so that it runs from fl.log.lower to fl.log.higher
ar.fl.log <- ar.fl.log.rescaled + fl.log.lower - fl.min.rescale
# Print
ar.fl.log
## [1] -10.000000 -9.963430 -9.793123 -9.000000
1.2.2.2 Generate Arrays Based on Existing Arrays
1.2.2.2.1 Probability Mass Array and Discrete Value Array
There are two arrays, an array of values, and an array of probabilities. The probability array sums to 1. The array of values, however, might not be unique.
First, generate an unsorted array of numbers and a probability mass for each non-sorted, non-unique element of the array.
set.seed(123)
it_len <- 10
# unsorted, possibly repeated integer values between 11 and 15
ar_fl_unif <- runif(it_len)
ar_x <- ceiling(ar_fl_unif * 5 + 10)
# binomial(it_len - 1, 0.5) mass at 0, 1, ..., it_len - 1, which sums to one
it_binom_size <- it_len - 1
ar_prob <- dbinom(0:it_binom_size, size = it_binom_size, prob = 0.5)
print(cbind(ar_x, ar_prob))
## ar_x ar_prob
## [1,] 12 0.001953125
## [2,] 14 0.017578125
## [3,] 13 0.070312500
## [4,] 15 0.164062500
## [5,] 15 0.246093750
## [6,] 11 0.246093750
## [7,] 13 0.164062500
## [8,] 15 0.070312500
## [9,] 13 0.017578125
## [10,] 13 0.001953125
## [1] "sum(ar_prob)=1"
Second, sorting index for ar_x, and resort ar_prob with the same index:
# positions that put ar_x in increasing order; ties keep their original order
ar_idx_increasing_x <- order(ar_x, decreasing = FALSE)
# apply the same ordering to the values and to their probability mass
ar_x_sorted <- ar_x[ar_idx_increasing_x]
ar_prob_sorted <- ar_prob[ar_idx_increasing_x]
print(cbind(ar_x_sorted, ar_prob_sorted))
## ar_x_sorted ar_prob_sorted
## [1,] 11 0.246093750
## [2,] 12 0.001953125
## [3,] 13 0.070312500
## [4,] 13 0.164062500
## [5,] 13 0.017578125
## [6,] 13 0.001953125
## [7,] 14 0.017578125
## [8,] 15 0.164062500
## [9,] 15 0.246093750
## [10,] 15 0.070312500
Third, sum within group and generate unique, using the aggregate function. Then we have a column of unique values and associated probabilities.
# distinct values, in increasing order because ar_x_sorted is already sorted
ar_x_unique <- unique(ar_x_sorted)
# sum the probability mass within each distinct value of ar_x_sorted
mt_prob_unique <- aggregate(ar_prob_sorted, by = list(ar_x_sorted), FUN = sum)
# column x of the aggregate result holds the summed mass per distinct value
ar_x_unique_prob <- mt_prob_unique$x
print(cbind(ar_x_unique, ar_x_unique_prob))
## ar_x_unique ar_x_unique_prob
## [1,] 11 0.246093750
## [2,] 12 0.001953125
## [3,] 13 0.253906250
## [4,] 14 0.017578125
## [5,] 15 0.480468750
Finally, the several steps together.
# data: 30 integer values between 11 and 30 with random, normalised masses
set.seed(123)
it_len <- 30
ar_x <- ceiling(runif(it_len) * 20 + 10)
ar_prob <- runif(it_len)
ar_prob <- ar_prob / sum(ar_prob)
# distinct values in increasing order
ar_x_unique <- sort(unique(ar_x))
# total mass at each distinct value; groups follow the sorted distinct values
ar_fl_mass_by_x <- tapply(ar_prob, ar_x, sum)
# drop the group labels to leave a plain numeric array
ar_x_unique_prob <- as.vector(ar_fl_mass_by_x)
# results
print(cbind(ar_x_unique, ar_x_unique_prob))
## ar_x_unique ar_x_unique_prob
## [1,] 11 0.071718383
## [2,] 13 0.040040920
## [3,] 15 0.017708800
## [4,] 16 0.141199002
## [5,] 17 0.020211876
## [6,] 19 0.052488290
## [7,] 20 0.049104113
## [8,] 21 0.067328518
## [9,] 22 0.109454333
## [10,] 23 0.060712145
## [11,] 24 0.107671406
## [12,] 25 0.015694798
## [13,] 26 0.068567789
## [14,] 28 0.090925756
## [15,] 29 0.001870451
## [16,] 30 0.085303420
1.2.2.3 Generate Integer Sequences
1.2.2.3.1 Gapped Possibly Overlapping Consecutive Sequences
Now, we generate a set of integer sequences, with gaps in between, but possibly overlapping, for example: \((1,2,3,4,5), (5,6), (10,11)\).
First, we select a small random subset of integers between min and max as starting indexes, and we randomly draw an array of durations of the same length, each duration up to a max. (In the next block, we cap each sequence so that start plus duration does not exceed the upper bound.)
# Number of random starting index
# lowest and highest index at which a sequence may start
it_start_idx <- 11
it_end_idx <- 100
# how many starting indexes to draw
it_startdraws <- 6
# Maximum duration
it_duramax <- 3
# Random seed
set.seed(987)
# Draw random index between min and max
# replace = FALSE so that the starting indexes are distinct
ar_it_start_idx <- sample(
x = seq(from = it_start_idx, to = it_end_idx, by = 1),
size = it_startdraws, replace = FALSE
)
ar_it_start_idx <- sort(ar_it_start_idx)
# Draw random durations, replace = TRUE because can repeat
# a single number as x means sample from 1, ..., it_duramax
ar_it_duration <- sample(
x = it_duramax, size = it_startdraws, replace = TRUE
)
# Print
# glue is vectorised, so one line is printed per start and duration pair
print(glue::glue(
"random starts + duration: ",
"{ar_it_start_idx} + {ar_it_duration}"
))
## random starts + duration: 35 + 3
## random starts + duration: 39 + 3
## random starts + duration: 42 + 1
## random starts + duration: 56 + 2
## random starts + duration: 57 + 1
## random starts + duration: 73 + 1
Second, we expand the indexes with neighboring values, and create a list of consecutive integer sequences.
# start and end sequences
# cap each end point at it_end_idx so that no sequence runs past the bound
ar_it_end_idx <- pmin(ar_it_start_idx + ar_it_duration, it_end_idx)
# Map() always returns a list with one sequence per start; apply() over the
# rows would collapse to a matrix whenever all sequences have equal length
ls_ar_it_recession <- Map(
  function(it_start, it_end) seq(it_start, it_end),
  ar_it_start_idx, ar_it_end_idx
)
# Print
print("ls_ar_it_recession")
## [1] "ls_ar_it_recession"
## [[1]]
## [1] 35 36 37 38
##
## [[2]]
## [1] 39 40 41 42
##
## [[3]]
## [1] 42 43
##
## [[4]]
## [1] 56 57 58
##
## [[5]]
## [1] 57 58
##
## [[6]]
## [1] 73 74
Third, we can bring the sequences generated together if we want to
# Combine arrays
# do.call(base::c, ...) concatenates all list elements into one array,
# which is then sorted; values shared by overlapping sequences are repeated
ar_it_recession_year <- (
sort(do.call(base::c, ls_ar_it_recession))
)
# Print
# glue is vectorised, so one line is printed per element
print(glue::glue(
"print full as array:",
"{ar_it_recession_year}"
))
## print full as array:35
## print full as array:36
## print full as array:37
## print full as array:38
## print full as array:39
## print full as array:40
## print full as array:41
## print full as array:42
## print full as array:42
## print full as array:43
## print full as array:56
## print full as array:57
## print full as array:57
## print full as array:58
## print full as array:58
## print full as array:73
## print full as array:74
1.2.2.3.2 Gapped non-Overlapping Consecutive Sequences
Now, we generate a set of integer sequences, with gaps in between, but not overlapping, for example: \((1,2,3), (5,6), (10,11)\). We follow a very similar structure as above, but now adjust starting draws by prior accumulated durations.
Note that in the code below, we could end up with fewer than it_startdraws_max sequences if there are consecutive start draws. We keep only non-consecutive start draws to avoid overlaps.
# Number of random starting index
it_start_idx <- 11
it_end_idx <- 100
it_startdraws_max <- 6
it_duramax <- 3
# Random seed
set.seed(987)
# Draw random index between min and max
ar_it_start_idx <- sort(sample(
  seq(it_start_idx, it_end_idx),
  it_startdraws_max,
  replace = FALSE
))
# Draw random durations, replace = TRUE because can repeat
ar_it_duration <- sample(it_duramax, it_startdraws_max, replace = TRUE)
# Check space between starts
ar_it_startgap <- diff(ar_it_start_idx)
# durations of all but the last start; head(x, -1) is also safe when there
# is only a single start, unlike 1:(length(x) - 1)
ar_it_dura_lenm1 <- head(ar_it_duration, -1)
# Adjust durations so that a sequence ends at least two before the next start
ar_it_dura_bd <- pmin(ar_it_startgap - 2, ar_it_dura_lenm1)
ar_it_duration[seq_along(ar_it_dura_bd)] <- ar_it_dura_bd
# Drop consecutive starts, which are left with a negative duration
ar_bl_dura_nonneg <- which(ar_it_duration >= 0)
ar_it_start_idx <- ar_it_start_idx[ar_bl_dura_nonneg]
ar_it_duration <- ar_it_duration[ar_bl_dura_nonneg]
# list of recession periods, each capped at it_end_idx.
# Map() always returns a list; apply() over the rows would collapse to a
# matrix whenever all sequences have equal length.
ar_it_end_idx <- pmin(ar_it_start_idx + ar_it_duration, it_end_idx)
ls_ar_it_recession_non_overlap <- Map(
  function(it_start, it_end) seq(it_start, it_end),
  ar_it_start_idx, ar_it_end_idx
)
# print
print("ls_ar_it_recession_non_overlap")
## [1] "ls_ar_it_recession_non_overlap"
## [[1]]
## [1] 35 36 37
##
## [[2]]
## [1] 39 40
##
## [[3]]
## [1] 42 43
##
## [[4]]
## [1] 57 58
##
## [[5]]
## [1] 73 74
1.2.3 String Arrays
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
1.2.3.1 Positive or Negative Floating Number to String
There is a number that possibly has a negative sign and some decimals; convert it to a string that is more easily used as a file or folder name.
# values to convert; a missing comma used to merge -1.5 and -100 into -101.5
ls_fl_rho <- c(1, -1, -1.5, -100, 0.5, 0.11111111, -199.22123)
for (fl_rho in ls_fl_rho) {
  # keep at most four decimals
  st_rho <- paste0(round(fl_rho, 4))
  # "n" stands in for the minus sign
  st_rho <- gsub(x = st_rho, pattern = "-", replacement = "n")
  # "p" stands in for the decimal point; the dot is escaped because an
  # unescaped dot is a regex wildcard
  st_rho <- gsub(x = st_rho, pattern = "\\.", replacement = "p")
  print(paste0('st_rho=', st_rho))
}
## [1] "st_rho=1"
## [1] "st_rho=n1"
## [1] "st_rho=n1p5"
## [1] "st_rho=n100"
## [1] "st_rho=0p5"
## [1] "st_rho=0p1111"
## [1] "st_rho=n199p2212"
1.2.3.2 String Replace
- r string wildcard replace between regex
- R - replace part of a string using wildcards
# String replacement
# NOTE(review): df.slds.stats.perc, var and var.input are not defined in this
# snippet; presumably they stand for the reader's own objects
# build "counter:value" pairs joined by ';', then remove newline characters
gsub(x = paste0(unique(df.slds.stats.perc$it.inner.counter), ':',
unique(df.slds.stats.perc$z_n_a_n), collapse = ';'),
pattern = "\n",
replacement = "")
# remove newline characters
gsub(x = var, pattern = "\n", replacement = "")
# replace each literal dot with an underscore; the dot is escaped because an
# unescaped dot is a regex wildcard
gsub(x = var.input, pattern = "\\.", replacement = "_")
String replaces a segment, search by wildcard. Given the string below, delete all text between a percent sign and the next newline (the LaTeX comment lines):
# LaTeX text with two comment lines around one command definition
st_tex_text <- "\n% Lat2ex Comments\n\\newcommand{\\exa}{\\text{from external file: } \\alpha + \\beta}\n% More LaLatex Comments\n"
# drop each comment: from a percent sign through the next newline (lazy match)
st_clean_a1 <- gsub(pattern = "\\%.*?\\\n", replacement = "", x = st_tex_text)
# replace each shortest run from an "L" to the next "x"
st_clean_a2 <- gsub(pattern = "L.*?x", replacement = "[LATEX]", x = st_tex_text)
st_tex_text_labelled <- paste0('st_tex_text:', st_tex_text)
print(st_tex_text_labelled)
## [1] "st_tex_text:\n% Lat2ex Comments\n\\newcommand{\\exa}{\\text{from external file: } \\alpha + \\beta}\n% More LaLatex Comments\n"
## [1] "st_clean_a1:\n\\newcommand{\\exa}{\\text{from external file: } \\alpha + \\beta}\n"
## [1] "st_clean_a2:\n% [LATEX] Comments\n\\newcommand{\\exa}{\\text{from external file: } \\alpha + \\beta}\n% More [LATEX] Comments\n"
String delete after a particular string:
# text that ends with a trailing LaTeX comment
st_tex_text <- "\\end{equation}\n}\n% Even more comments from Latex preamble"
# delete from a newline that is followed by a percent sign through the end
st_clean_a1 <- gsub(pattern = "\\\n%.*", replacement = "", x = st_tex_text)
st_tex_text_labelled <- paste0('st_tex_text:', st_tex_text)
print(st_tex_text_labelled)
## [1] "st_tex_text:\\end{equation}\n}\n% Even more comments from Latex preamble"
## [1] "st_clean_a1:\\end{equation}\n}"
1.2.3.3 Search If and Which String Contains
- r if string contains
- r if string contains either or grepl
- Use grepl to search either of multiple substrings in a text
Search for a single substring in a single string:
# two example paths; only the second contains '_main'
st_example_a <- 'C:/Users/fan/R4Econ/amto/tibble/fs_tib_basics.Rmd'
st_example_b <- 'C:/Users/fan/R4Econ/amto/tibble/_main.html'
# TRUE when the pattern '_main' occurs anywhere in the string
grepl('_main', st_example_a)
## [1] FALSE
## [1] TRUE
Search for if one of a set of substring exists in a set of strings. In particular which one of the elements of ls_spn contains at least one of the elements of ls_str_if_contains. In the example below, only the first path does not contain either the word aggregate or index in the path. This can be used after all paths have been found recursively in some folder to select only desired paths from the full set of possibilities:
# candidate paths
ls_spn <- c(
  "C:/Users/fan/R4Econ//panel/basic/fs_genpanel.Rmd",
  "C:/Users/fan/R4Econ//summarize/aggregate/main.Rmd",
  "C:/Users/fan/R4Econ//summarize/index/fs_index_populate.Rmd"
)
# substrings of interest, joined with "|" into one either-or regex
ls_str_if_contains <- c("aggregate", "index")
str_if_contains <- paste0(ls_str_if_contains, collapse = "|")
# one TRUE/FALSE per path: does it contain at least one of the substrings
grepl(pattern = str_if_contains, x = ls_spn)
## [1] FALSE TRUE TRUE
1.2.3.4 String Split
Given some string, generated for example by cut, get the lower cut starting points, and also the higher end point
# Extract 0.216 and 0.500 as lower and upper bounds
st_cut_cate <- '(0.216,0.500]'
# Extract Lower Part
# split at the comma, take the first piece, then drop its leading "("
substring(strsplit(st_cut_cate, ",")[[1]][1], 2)
## [1] "0.216"
# Extract second part except final bracket Option 1
# reverse the second piece, drop its first character (the "]"), reverse back
intToUtf8(rev(utf8ToInt(substring(intToUtf8(rev(utf8ToInt(strsplit(st_cut_cate, ",")[[1]][2]))), 2))))
## [1] "0.500"
# Extract second part except final bracket Option 2
# delete the "]" from the second piece
gsub(strsplit(st_cut_cate, ",")[[1]][2], pattern = "]", replacement = "")
## [1] "0.500"
Make a part of a string bold. Go from “ABC EFG, OPQ, RST” to “<b>ABC EFG</b>, OPQ, RST”. This could be for making the name of an author bold, and preserve affiliation information.
st_paper_author_ori <- "ABC EFG, OPQ, RST"
# split into comma-separated pieces; the first piece is the one to make bold
ar_st_ori <- strsplit(st_paper_author_ori, ", ")[[1]]
# everything after the first piece; [-1] is empty when there is no comma,
# whereas 2:length() would count down and pick up NA and the first piece
st_after_1stcomma <- paste0(ar_st_ori[-1], collapse = ", ")
# bold first piece, then the remaining pieces; with no remaining pieces no
# trailing comma is added
st_paper_author <- paste0(
  c(paste0('<b>', ar_st_ori[1], '</b>'), ar_st_ori[-1]),
  collapse = ", "
)
print(st_paper_author)
## [1] "<b>ABC EFG</b>, OPQ, RST"
1.2.3.5 String Concatenate
Concatenate string array into a single string.
## [1] "abc|efg"
Concatenate a numeric array into a single string.
# Five uniform draws, rounded and joined into one comma-separated string
set.seed(123)
ar_fl_numbers <- runif(5)
ar_fl_rounded <- round(ar_fl_numbers, 3)
paste0('ar_fl_numbers = ', toString(ar_fl_rounded))
## [1] "ar_fl_numbers = 0.288, 0.788, 0.409, 0.883, 0.94"
1.2.3.6 String Add Leading Zero
# Add Leading zero for integer values to allow for sorting when
# integers are combined into strings
it_z_n <- 1
it_a_n <- 192
# pad to two digits
print(sprintf("%02d", it_z_n))
# pad to four digits; it_a_n was defined but never printed
print(sprintf("%04d", it_a_n))
## [1] "01"
## [1] "0192"
1.2.3.7 Substring Components
Given a string, with certain structure, get components.
- r time string get month and year and day
# date string laid out as yyyymmdd
snm_full <- "20100701"
# characters 1-4 are the year; substr() positions start at 1, not 0
snm_year <- substr(snm_full, 1, 4)
# characters 5-6 are the month and 7-8 the day
snm_month <- substr(snm_full, 5, 6)
snm_day <- substr(snm_full, 7, 8)
print(paste0('full:', snm_full,
             ', year:', snm_year,
             ', month:', snm_month,
             ', day:', snm_day))
## [1] "full:20100701, year:2010, month:07, day:01"
1.2.4 Mesh Matrices, Arrays and Scalars
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
- r expand.grid meshed array to matrix
- r meshgrid
- r array to matrix
- r reshape array to matrix
- dplyr permuations rows of matrix and element of array
- tidyr expand_grid mesh matrix and vector
1.2.4.1 Mesh Two or More Vectors with expand_grid
In the example below, we have a matrix with 2 rows (endogenous states), a vector with 3 elements (choices), and a vector with 4 elements (exogenous state shocks).
We want to generate a tibble dataset that meshes the matrix and the vector, so that all combinations show up. Additionally, we want to add some additional values that are common across all rows to the meshed dataframe.
Note expand_grid is a function from tidyr 1.0.0.
# A. Generate the 2 by 4 matrix of endogenous states and fixed parameters
# it_N_child_cnt = N, the number of children
it_N_child_cnt <- 2
# P fixed parameters, nN is N dimensional, nP is P dimensional
ar_nN_A <- seq(-2, 2, length.out = it_N_child_cnt)
ar_nN_alpha <- seq(0.1, 0.9, length.out = it_N_child_cnt)
fl_rho <- 0.1
fl_lambda <- 1.1
# cbind recycles the two scalars, so every row carries the same rho and lambda
mt_nP_A_alpha <- cbind(ar_nN_A, ar_nN_alpha, fl_rho, fl_lambda)
ar_st_varnames <- c('s_A', 's_alpha', 'p_rho', 'p_lambda')
tb_states_endo <- as_tibble(mt_nP_A_alpha) %>%
  rename_all(~c(ar_st_varnames)) %>%
  rowid_to_column(var = "state_id")
# B. Choice Grid
it_N_choice_cnt <- 3
fl_max <- 10
fl_min <- 0
ar_nN_d <- seq(fl_min, fl_max, length.out = it_N_choice_cnt)
ar_st_varnames <- c('c_food')
tb_choices <- as_tibble(ar_nN_d) %>%
  rename_all(~c(ar_st_varnames)) %>%
  rowid_to_column(var = "choice_id")
# C. Shock Grid
set.seed(123)
it_N_shock_cnt <- 4
ar_nQ_shocks <- exp(rnorm(it_N_shock_cnt, mean = 0, sd = 1))
ar_st_varnames <- c('s_eps')
tb_states_exo <- as_tibble(ar_nQ_shocks) %>%
  rename_all(~c(ar_st_varnames)) %>%
  rowid_to_column(var = "shock_id")
# dataframe expand with other non expanded variables
# (a dangling "ar_st_varnames <-" used to sit here; it chained into the
# assignment below and overwrote ar_st_varnames with the meshed tibble)
tb_states_shk_choices <- tb_states_endo %>%
  expand_grid(tb_choices) %>%
  expand_grid(tb_states_exo) %>%
  select(state_id, choice_id, shock_id,
         s_A, s_alpha, s_eps, c_food,
         p_rho, p_lambda)
# display
kable(tb_states_shk_choices) %>% kable_styling_fc()
state_id | choice_id | shock_id | s_A | s_alpha | s_eps | c_food | p_rho | p_lambda |
---|---|---|---|---|---|---|---|---|
1 | 1 | 1 | -2 | 0.1 | 0.5709374 | 0 | 0.1 | 1.1 |
1 | 1 | 2 | -2 | 0.1 | 0.7943926 | 0 | 0.1 | 1.1 |
1 | 1 | 3 | -2 | 0.1 | 4.7526783 | 0 | 0.1 | 1.1 |
1 | 1 | 4 | -2 | 0.1 | 1.0730536 | 0 | 0.1 | 1.1 |
1 | 2 | 1 | -2 | 0.1 | 0.5709374 | 5 | 0.1 | 1.1 |
1 | 2 | 2 | -2 | 0.1 | 0.7943926 | 5 | 0.1 | 1.1 |
1 | 2 | 3 | -2 | 0.1 | 4.7526783 | 5 | 0.1 | 1.1 |
1 | 2 | 4 | -2 | 0.1 | 1.0730536 | 5 | 0.1 | 1.1 |
1 | 3 | 1 | -2 | 0.1 | 0.5709374 | 10 | 0.1 | 1.1 |
1 | 3 | 2 | -2 | 0.1 | 0.7943926 | 10 | 0.1 | 1.1 |
1 | 3 | 3 | -2 | 0.1 | 4.7526783 | 10 | 0.1 | 1.1 |
1 | 3 | 4 | -2 | 0.1 | 1.0730536 | 10 | 0.1 | 1.1 |
2 | 1 | 1 | 2 | 0.9 | 0.5709374 | 0 | 0.1 | 1.1 |
2 | 1 | 2 | 2 | 0.9 | 0.7943926 | 0 | 0.1 | 1.1 |
2 | 1 | 3 | 2 | 0.9 | 4.7526783 | 0 | 0.1 | 1.1 |
2 | 1 | 4 | 2 | 0.9 | 1.0730536 | 0 | 0.1 | 1.1 |
2 | 2 | 1 | 2 | 0.9 | 0.5709374 | 5 | 0.1 | 1.1 |
2 | 2 | 2 | 2 | 0.9 | 0.7943926 | 5 | 0.1 | 1.1 |
2 | 2 | 3 | 2 | 0.9 | 4.7526783 | 5 | 0.1 | 1.1 |
2 | 2 | 4 | 2 | 0.9 | 1.0730536 | 5 | 0.1 | 1.1 |
2 | 3 | 1 | 2 | 0.9 | 0.5709374 | 10 | 0.1 | 1.1 |
2 | 3 | 2 | 2 | 0.9 | 0.7943926 | 10 | 0.1 | 1.1 |
2 | 3 | 3 | 2 | 0.9 | 4.7526783 | 10 | 0.1 | 1.1 |
2 | 3 | 4 | 2 | 0.9 | 1.0730536 | 10 | 0.1 | 1.1 |
Using expand_grid directly over arrays
1.2.4.2 Mesh Arrays with expand.grid
Given two arrays, mesh the two arrays together.
# Mesh two arrays of different lengths with expand.grid
it_ar_A <- 5
it_ar_alpha <- 10
ar_A <- seq(-2, 2, length.out = it_ar_A)
ar_alpha <- seq(0.1, 0.9, length.out = it_ar_alpha)
# One row per (A, alpha) combination
mt_A_alpha <- expand.grid(A = ar_A, alpha = ar_alpha)
# Fold each long column into an it_ar_A by it_ar_alpha matrix, filled column
# by column
mt_A_meshed <- matrix(mt_A_alpha[, 1], nrow = it_ar_A, ncol = it_ar_alpha)
mt_alpha_meshed <- matrix(mt_A_alpha[, 2], nrow = it_ar_A, ncol = it_ar_alpha)
# display
mt_A_meshed %>%
  kable() %>%
  kable_styling_fc()
-2 | -2 | -2 | -2 | -2 | -2 | -2 | -2 | -2 | -2 |
-1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 | -1 |
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
0.1 | 0.1888889 | 0.2777778 | 0.3666667 | 0.4555556 | 0.5444444 | 0.6333333 | 0.7222222 | 0.8111111 | 0.9 |
0.1 | 0.1888889 | 0.2777778 | 0.3666667 | 0.4555556 | 0.5444444 | 0.6333333 | 0.7222222 | 0.8111111 | 0.9 |
0.1 | 0.1888889 | 0.2777778 | 0.3666667 | 0.4555556 | 0.5444444 | 0.6333333 | 0.7222222 | 0.8111111 | 0.9 |
0.1 | 0.1888889 | 0.2777778 | 0.3666667 | 0.4555556 | 0.5444444 | 0.6333333 | 0.7222222 | 0.8111111 | 0.9 |
0.1 | 0.1888889 | 0.2777778 | 0.3666667 | 0.4555556 | 0.5444444 | 0.6333333 | 0.7222222 | 0.8111111 | 0.9 |
Mesh two identical arrays of individual attributes: in the resulting matrices, each row corresponds to an individual, and each column also corresponds to an individual.
# Mesh an array with itself: use expand.grid to generate all pairs
it_ar_A <- 5
ar_A <- seq(-2, 2, length.out = it_ar_A)
# Give the two columns distinct names. Naming both of them "Arow" produces a
# data frame with two identically named columns that can only be told apart
# by position.
mt_A_A <- expand.grid(Arow = ar_A, Acol = ar_A)
# Reshape each long column into an it_ar_A by it_ar_A matrix (column-major):
# mt_Arow is constant along every row, mt_Acol is constant down every column
mt_Arow <- mt_A_A[["Arow"]]
dim(mt_Arow) <- c(it_ar_A, it_ar_A)
mt_Acol <- mt_A_A[["Acol"]]
dim(mt_Acol) <- c(it_ar_A, it_ar_A)
# display
kable(mt_Arow) %>%
  kable_styling_fc()
-2 | -2 | -2 | -2 | -2 |
-1 | -1 | -1 | -1 | -1 |
0 | 0 | 0 | 0 | 0 |
1 | 1 | 1 | 1 | 1 |
2 | 2 | 2 | 2 | 2 |
-2 | -1 | 0 | 1 | 2 |
-2 | -1 | 0 | 1 | 2 |
-2 | -1 | 0 | 1 | 2 |
-2 | -1 | 0 | 1 | 2 |
-2 | -1 | 0 | 1 | 2 |
1.3 Matrix
1.3.1 Generate Matrixes
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
1.3.1.1 Create a N by 2 Matrix from 3 arrays
Names of each array become row names automatically.
# Three length-2 arrays; each one becomes a row of the matrix
ar_row_one <- c(-1, 1)
ar_row_two <- c(-3, -2)
ar_row_three <- c(0.35, 0.75)
# Stack the arrays; rbind() labels each row with the name of its array
mt_n_by_2 <- rbind(ar_row_one, ar_row_two, ar_row_three)
mt_n_by_2 %>%
  kable() %>%
  kable_styling_fc()
ar_row_one | -1.00 | 1.00 |
ar_row_two | -3.00 | -2.00 |
ar_row_three | 0.35 | 0.75 |
1.3.1.2 Name Matrix Columns and Rows
# A 2 by 2 placeholder matrix of logical NA, labelled at construction time:
# rows r1, r2 and columns c1, c2
mt_named <- matrix(
  data = NA, nrow = 2, ncol = 2,
  dimnames = list(paste0("r", seq(1, 2)), paste0("c", seq(1, 2)))
)
mt_named
## c1 c2
## r1 NA NA
## r2 NA NA
1.3.1.3 Generate NA Matrix
Allocate with NA, NA_real_, or NA_integer_. Clarity in type definition is preferred.
## logi [1:2, 1:2] NA NA NA NA
# Placeholder matrices with an explicit storage type: NA_real_ gives a
# double matrix, NA_integer_ gives an integer matrix
mt_fl_na <- matrix(NA_real_, nrow = 2, ncol = 2)
mt_it_na <- matrix(NA_integer_, nrow = 2, ncol = 2)
# Inspect the type and dimensions of the double matrix
str(mt_fl_na)
## num [1:2, 1:2] NA NA NA NA
## num [1:2, 1:2] NA NA NA NA
1.3.1.4 Generate Matrixes with values
Random draw from the normal distribution, random draw from the uniform distribution, and combine resulting matrixes.
# Generate 15 random normal draws, put in 5 rows and 3 columns
mt_rnorm <- matrix(rnorm(15, mean = 0, sd = 1), nrow = 5, ncol = 3)
# Generate 25 random uniform draws, put in 5 rows and 5 columns.
# The number of draws must equal nrow * ncol: with only 15 draws for a 5 by 5
# matrix, matrix() recycles the data and the last two columns silently repeat
# the first two.
mt_runif <- matrix(runif(25, min = 0, max = 1), nrow = 5, ncol = 5)
# Combine side by side into a 5 by 8 matrix
mt_rnorm_runif <- cbind(mt_rnorm, mt_runif)
# Display
kable(round(mt_rnorm_runif, 3)) %>% kable_styling_fc()
0.129 | -0.446 | -0.556 | 0.318 | 0.369 | 0.266 | 0.318 | 0.369 |
1.715 | 1.224 | 1.787 | 0.232 | 0.152 | 0.858 | 0.232 | 0.152 |
0.461 | 0.360 | 0.498 | 0.143 | 0.139 | 0.046 | 0.143 | 0.139 |
-1.265 | 0.401 | -1.967 | 0.415 | 0.233 | 0.442 | 0.415 | 0.233 |
-0.687 | 0.111 | 0.701 | 0.414 | 0.466 | 0.799 | 0.414 | 0.466 |
Now we generate a matrix with sequential integers, and either fill matrix by columns or fill matrix by rows.
# byrow = FALSE (the default): the first column is filled top to bottom, then
# the second column, and so on
mt_index_colbycol <- matrix(0:15, nrow = 4, ncol = 4, byrow = FALSE)
# Display
mt_index_colbycol %>%
  kable(caption = "with byrow=FALSE, the default, will fill col by col") %>%
  kable_styling_fc()
0 | 4 | 8 | 12 |
1 | 5 | 9 | 13 |
2 | 6 | 10 | 14 |
3 | 7 | 11 | 15 |
# byrow = TRUE: the first row is filled left to right, then the second row,
# and so on
mt_index_rowbyrow <- matrix(0:15, nrow = 4, ncol = 4, byrow = TRUE)
# Display
mt_index_rowbyrow %>%
  kable(caption = " with byrow=TRUE, will fill row by row") %>%
  kable_styling_fc()
0 | 1 | 2 | 3 |
4 | 5 | 6 | 7 |
8 | 9 | 10 | 11 |
12 | 13 | 14 | 15 |
1.3.1.5 Replace a Subset of Matrix Values by NA_real_
For values in matrix that fall below or above some thresholds, we will replace these values by NA_real_.
# Bounds of the interval whose values are kept
fl_min_val <- 0.2
fl_max_val <- 0.8
# Work on a copy so that the source matrix stays untouched
mt_rnorm_runif_bd <- mt_rnorm_runif
# Blank out, in a single pass, every cell strictly below the lower bound or
# strictly above the upper bound
mt_rnorm_runif_bd[which(
  mt_rnorm_runif < fl_min_val | mt_rnorm_runif > fl_max_val
)] <- NA_real_
# Print
print(mt_rnorm_runif_bd)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] NA NA NA 0.3181810 0.3688455 0.2659726 0.3181810 0.3688455
## [2,] NA NA NA 0.2316258 NA NA 0.2316258 NA
## [3,] 0.4609162 0.3598138 0.4978505 NA NA NA NA NA
## [4,] NA 0.4007715 NA 0.4145463 0.2330341 0.4422001 0.4145463 0.2330341
## [5,] NA NA 0.7013559 0.4137243 0.4659625 0.7989248 0.4137243 0.4659625
1.3.1.6 Sort Each Matrix Row or Column
Now we sort within each row or within each column of the random matrix.
# Sort the values inside each row. apply() over rows returns every sorted row
# as a column, so transpose to get back the original orientation.
mt_rnorm_runif_row_sort <- t(apply(X = mt_rnorm_runif, MARGIN = 1, FUN = sort))
# Sort the values inside each column; no transpose is needed here
mt_rnorm_runif_col_sort <- apply(X = mt_rnorm_runif, MARGIN = 2, FUN = sort)
# Display
mt_rnorm_runif_row_sort %>%
  round(3) %>%
  kable(caption = "Each row sort low to high") %>%
  kable_styling_fc()
-0.556 | -0.446 | 0.129 | 0.266 | 0.318 | 0.318 | 0.369 | 0.369 |
0.152 | 0.152 | 0.232 | 0.232 | 0.858 | 1.224 | 1.715 | 1.787 |
0.046 | 0.139 | 0.139 | 0.143 | 0.143 | 0.360 | 0.461 | 0.498 |
-1.967 | -1.265 | 0.233 | 0.233 | 0.401 | 0.415 | 0.415 | 0.442 |
-0.687 | 0.111 | 0.414 | 0.414 | 0.466 | 0.466 | 0.701 | 0.799 |
# Display the column-sorted matrix, rounded to three decimals
mt_rnorm_runif_col_sort %>%
  round(3) %>%
  kable(caption = "Each column sort low to high") %>%
  kable_styling_fc()
-1.265 | -0.446 | -1.967 | 0.143 | 0.139 | 0.046 | 0.143 | 0.139 |
-0.687 | 0.111 | -0.556 | 0.232 | 0.152 | 0.266 | 0.232 | 0.152 |
0.129 | 0.360 | 0.498 | 0.318 | 0.233 | 0.442 | 0.318 | 0.233 |
0.461 | 0.401 | 0.701 | 0.414 | 0.369 | 0.799 | 0.414 | 0.369 |
1.715 | 1.224 | 1.787 | 0.415 | 0.466 | 0.858 | 0.415 | 0.466 |
1.3.1.7 Compute Column and Row Statistics
Compute column and row means, and also column and row sums
## [1] "colSums=0.353,1.65,0.464,1.521,1.359,2.411,1.521,1.359"
## [1] "colMeans=0.071,0.33,0.093,0.304,0.272,0.482,0.304,0.272"
## [1] "rowSums=0.768,6.352,1.928,-1.094,2.683"
## [1] "rowMeans=0.096,0.794,0.241,-0.137,0.335"
1.3.1.8 Add Column to Matrix with Common Scalar Value
Given some matrix of information, add a column, where all rows of the column have the same numerical value. Use the matrix created prior.
- R add column to matrix
- r append column to matrix constant value
# Constants that fill the new leading and trailing columns
fl_new_first_col_val <- 111
fl_new_last_col_val <- 999
# Repeat each constant once per row, then bind: constant | matrix | constant
mt_with_more_columns <- cbind(
  rep(fl_new_first_col_val, times = nrow(mt_rnorm_runif)),
  mt_rnorm_runif,
  rep(fl_new_last_col_val, times = nrow(mt_rnorm_runif))
)
# Display
mt_with_more_columns %>%
  kable() %>%
  kable_styling_fc_wide()
111 | 0.1292877 | -0.4456620 | -0.5558411 | 0.3181810 | 0.3688455 | 0.2659726 | 0.3181810 | 0.3688455 | 999 |
111 | 1.7150650 | 1.2240818 | 1.7869131 | 0.2316258 | 0.1524447 | 0.8578277 | 0.2316258 | 0.1524447 | 999 |
111 | 0.4609162 | 0.3598138 | 0.4978505 | 0.1428000 | 0.1388061 | 0.0458312 | 0.1428000 | 0.1388061 | 999 |
111 | -1.2650612 | 0.4007715 | -1.9666172 | 0.4145463 | 0.2330341 | 0.4422001 | 0.4145463 | 0.2330341 | 999 |
111 | -0.6868529 | 0.1106827 | 0.7013559 | 0.4137243 | 0.4659625 | 0.7989248 | 0.4137243 | 0.4659625 | 999 |
1.3.2 Linear Algebra
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
1.3.2.1 Matrix Multiplication
Multiply Together a 3 by 2 matrix and a 2 by 1 vector
# Rows of the 3 by 2 matrix
ar_row_one <- c(-1, 1)
ar_row_two <- c(-3, -2)
ar_row_three <- c(0.35, 0.75)
mt_n_by_2 <- rbind(ar_row_one, ar_row_two, ar_row_three)
# Length-2 vector that %*% uses as a 2 by 1 column
ar_row_four <- c(3, 4)
# Matrix product: (3 by 2) times (2 by 1) gives a 3 by 1 matrix
mt_out <- mt_n_by_2 %*% ar_row_four
print(mt_n_by_2)
## [,1] [,2]
## ar_row_one -1.00 1.00
## ar_row_two -3.00 -2.00
## ar_row_three 0.35 0.75
## [1] 3 4
## [,1]
## ar_row_one 1.00
## ar_row_two -17.00
## ar_row_three 4.05
1.4 Regular Expression, Date, etc.
1.4.1 String Regular Expression
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
1.4.1.1 Character Class
The regex documentation states that: “A character class is a list of characters enclosed between ‘[’ and ’]’ which matches any single character in that list”
First, in the example below, we look for strings that contain a single letter, symbol, or number listed inside the square brackets.
# Five example strings that mix letters, digits and metacharacters
ls_st_regex_charclass <- c(
  "00d",
  "z\\12323_4",
  "pa(_2+\\3",
  "p99.9_sdfasdpf0",
  "k9p.e_d+fd"
)
# TRUE for strings that contain the letter p
print(grepl(pattern = "[p]", x = ls_st_regex_charclass))
# TRUE for strings that contain a backslash
print(grepl(pattern = "[\\]", x = ls_st_regex_charclass))
# TRUE for strings that contain the digit 3
print(grepl(pattern = "[3]", x = ls_st_regex_charclass))
# Expected output:
# > print(grepl("[p]", ls_st_regex_charclass))
# [1] FALSE FALSE TRUE TRUE TRUE
# > print(grepl("[\\]", ls_st_regex_charclass))
# [1] FALSE TRUE TRUE FALSE FALSE
# > print(grepl("[3]", ls_st_regex_charclass))
# [1] FALSE TRUE TRUE FALSE FALSE
Second, using the same set of words as examples, we now test if the strings contain at least one of the letters, symbols, or numbers in the list enclosed in square brackets.
# TRUE for strings that contain either the letter p or the letter d
print(grepl(pattern = "[pd]", x = ls_st_regex_charclass))
# TRUE for strings that contain either the letter p or an underscore
print(grepl(pattern = "[p_]", x = ls_st_regex_charclass))
# TRUE for strings that contain the letter p, an underscore, or the digit 0
print(grepl(pattern = "[p_0]", x = ls_st_regex_charclass))
# Expected output:
# > print(grepl('[pd]', ls_st_regex_charclass))
# [1] TRUE FALSE TRUE TRUE TRUE
# > print(grepl('[p_]', ls_st_regex_charclass))
# [1] FALSE TRUE TRUE TRUE TRUE
# > print(grepl('[p_0]', ls_st_regex_charclass))
# [1] TRUE TRUE TRUE TRUE TRUE
Third, using ‘^’, the caret, we exclude strings that include characters, letters, and symbols. The documentation states that: “unless the first character of the list is the caret ‘^’, when it matches any character not in the list”.
1.4.1.2 Repetition Quantifiers
We have the following quantifiers:
‘?’: The preceding item is optional and will be matched at most once.
‘*’: The preceding item will be matched zero or more times.
‘+’: The preceding item will be matched one or more times.
‘{n}’: The preceding item is matched exactly n times.
‘{n,}’: The preceding item is matched n or more times.
‘{n,m}’: The preceding item is matched at least n times, but not more than m times.
Now, we identify strings where certain characters appear a certain number of times.
# Five example strings that mix letters, digits and metacharacters
ls_st_regex_rep_quantifer <- c(
  "00d",
  "z\\12323_40",
  "ppa(_2+\\3",
  "p99.9_sdfasdpf0",
  "k9p.e_d+fd"
)
# TRUE for strings that contain two consecutive p characters
print(grepl(pattern = "[p]{2}", x = ls_st_regex_rep_quantifer))
# TRUE for strings that contain two consecutive 9 digits
print(grepl(pattern = "[9]{2}", x = ls_st_regex_rep_quantifer))
# Expected output:
# > print(grepl("[p]{2}", ls_st_regex_rep_quantifer))
# [1] FALSE FALSE TRUE FALSE FALSE
# > print(grepl("[9]{2}", ls_st_regex_rep_quantifer))
# [1] FALSE FALSE FALSE TRUE FALSE
1.4.1.3 Matches Strings With Multiple Conditions with Repetition Quantifiers
Now we match strings that satisfy multiple conditions jointly. We have the following quantifiers:
- ‘?’: The preceding item is optional and will be matched at most once.
- ‘*’: The preceding item will be matched zero or more times.
- ‘+’: The preceding item will be matched one or more times.
- ‘{n}’: The preceding item is matched exactly n times.
- ‘{n,}’: The preceding item is matched n or more times.
- ‘{n,m}’: The preceding item is matched at least n times, but not more than m times.
First, we define our string array.
# Six example strings in which "p" and "_" appear in different arrangements
ls_st_regex_joint <- c(
  "_asdf123p",
  "pz12p323_40_",
  "ppa(_2+\\3",
  "p9_sdfasdpf0",
  "p_k9p.e_d+fd",
  "p123k_dfk"
)
Second, we identify three cases below:
- Matching words containing just “p_”
- Matching words containing “p9_” (replace 9 by any other alpha-numeric character)
- Matching words containing either “p_” or “p9_”
# Contains p immediately followed by _ (the pattern is not anchored, so the
# pair may sit anywhere in the string)
print(grepl(pattern = "p_", x = ls_st_regex_joint))
# Contains p, then exactly one alpha-numeric character, then _
print(grepl(pattern = "p[[:alnum:]]_", x = ls_st_regex_joint))
# Contains p followed by either:
#   one alpha-numeric character, then _
#   no alpha-numeric character, then _
print(grepl(pattern = "p[[:alnum:]]?_", x = ls_st_regex_joint))
# Expected output:
# > print(grepl("p_", ls_st_regex_joint))
# [1] FALSE FALSE FALSE FALSE TRUE FALSE
# > print(grepl("p[[:alnum:]]_", ls_st_regex_joint))
# [1] FALSE FALSE FALSE TRUE FALSE FALSE
# > print(grepl("p[[:alnum:]]?_", ls_st_regex_joint))
# [1] FALSE FALSE FALSE TRUE TRUE FALSE
Third, we identify cases where the word contains a substring starting with “p” and ending with “_”, with any number (including 0) of alpha-numeric characters in between. Note:
- In the first string, both “_” and “p” appear, but “p” appears after, so does not match
- Note in the second word, “p” and “_” appear multiple times
- Note in the third word, “p” and “_” both appear, but are separated by a non-alpha-numeric character
# Contains p, then zero or more alpha-numeric characters, then _
print(grepl(pattern = "p[[:alnum:]]*_", x = ls_st_regex_joint))
# Expected output:
# > print(grepl("p[[:alnum:]]*_", ls_st_regex_joint))
# [1] FALSE TRUE FALSE TRUE TRUE TRUE
Fourth, we use alternative repetition quantifiers, plus, rather than asterisks, which means we must have at least one alpha-numeric character in between “p” and the “_“, in which case, the fifth word no longer satisfies the search condition.