Chapter 1 Array, Matrix, Dataframe

1.1 List

1.1.1 Lists

Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.

  • r list tutorial
  • r vector vs list
  • r initialize empty multiple element list
  • r name rows and columns of 2 dimensional list
  • r row and column names of list
  • list dimnames
  • r named list to string

1.1.1.1 Iteratively Build Up a List of Strings

Build up a list of strings, where the strings share common components. Iterate over lists to generate variations in elements of the string list.

# shared string components
st_base_name <- "snwx_v_planner_docdense"
st_base_middle <- "b1_xi0_manna_88"
# string labels to loop over
ar_st_beta_val <- c("bt60", "bt70", "bt80", "bt90")
ar_st_edu_type <- c("e1lm2", "e2hm2")

# pre-allocate one list slot per (beta, education type) combination
it_snm_count <- length(ar_st_beta_val) * length(ar_st_edu_type)
ls_snm <- vector(mode = "list", length = it_snm_count)

# fill the list: beta is the outer loop, education type the inner loop
it_ctr <- 0
for (st_beta_val in ar_st_beta_val) {
  for (st_edu_type in ar_st_edu_type) {
    it_ctr <- it_ctr + 1
    # e.g. "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt90"
    ar_st_parts <- c(st_base_name, st_edu_type, st_base_middle, st_beta_val)
    snm_file_name <- paste(ar_st_parts, collapse = "_")
    ls_snm[[it_ctr]] <- snm_file_name
  }
}

# print
for (snm in ls_snm) {
  print(snm)
}
## [1] "snwx_v_planner_docdense_e1lm2_b1_xi0_manna_88_bt60"
## [1] "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt60"
## [1] "snwx_v_planner_docdense_e1lm2_b1_xi0_manna_88_bt70"
## [1] "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt70"
## [1] "snwx_v_planner_docdense_e1lm2_b1_xi0_manna_88_bt80"
## [1] "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt80"
## [1] "snwx_v_planner_docdense_e1lm2_b1_xi0_manna_88_bt90"
## [1] "snwx_v_planner_docdense_e2hm2_b1_xi0_manna_88_bt90"
# if string in string
grepl('snwx_v_planner', snm)
## [1] TRUE

1.1.1.2 Named List of Matrixes

Save a list of matrixes. Retrieve Element of that list via loop.

# Define an array of means to loop over; one result table is stored per mean
ar_fl_mean <- c(10, 20, 30)

# store results in a pre-allocated named list (mean10, mean20, mean30)
ls_mt_res <- vector(mode = "list", length = length(ar_fl_mean))
ar_st_names <- paste0('mean', ar_fl_mean)
names(ls_mt_res) <- ar_st_names

# dimensions and column names shared by every random matrix
it_rows <- 2
ar_st_vars <- c('var1', 'varb', 'vartheta')

# Loop and generate a list of dataframes
for (it_fl_mean in seq_along(ar_fl_mean)) {
  fl_mean <- ar_fl_mean[it_fl_mean]

  # draw exactly one value per matrix cell; drawing fewer values than cells
  # would make matrix() recycle the draws (with a warning)
  set.seed(it_fl_mean)
  mt_draws <- matrix(
    rnorm(it_rows * length(ar_st_vars), mean = fl_mean, sd = 1),
    nrow = it_rows, ncol = length(ar_st_vars),
    dimnames = list(NULL, ar_st_vars)
  )

  # dataframe: column names come from the matrix, the row id goes first
  tb_combine <- as_tibble(mt_draws) %>%
    rowid_to_column(var = "id")

  ls_mt_res[[it_fl_mean]] <- tb_combine
}

# Retrieve elements: by position, by name with $, and by name with [[ ]]
print(ls_mt_res[[1]])
print(ls_mt_res$mean10)
print(ls_mt_res[['mean10']])

# Print via Loop
for (it_fl_mean in seq_along(ar_fl_mean)) {
  tb_combine <- ls_mt_res[[it_fl_mean]]
  print(tb_combine)
}

1.1.1.3 One Dimensional Named List

  1. define list
  2. slice list
  3. print r named list as a single line string
# Unnamed lists: all numeric, all character, and mixed types
ls_num <- list(1, 2, 3)
ls_str <- list("1", "2", "3")
ls_num_str <- list(1, 2, "3")

# Named copy of the mixed list
ar_st_names <- c("e1", "e2", "e3")
ls_num_str_named <- setNames(ls_num_str, ar_st_names)

# Append a fourth named element
ls_num_str_named[["e4"]] <- "this is added"

Initiate an empty list and add to it

# Initiate List
ls_abc <- vector(mode = "list", length = 0)
# Add Named Elements to List Sequentially
ls_abc$a <- 1
ls_abc$b <- 2
ls_abc$c <- 'abc\'s third element'
# Get all Names Added to List
ar_st_list_names <- names(ls_abc)
# Print the full list
print(ls_abc)
## $a
## [1] 1
## 
## $b
## [1] 2
## 
## $c
## [1] "abc's third element"
# Print each name=value pair in a loop.
# seq_along() yields an empty sequence when the list has no names, and
# [[ ]] extracts the element itself rather than a one-element sub-list.
for (it_list_ele_ctr in seq_along(ar_st_list_names)) {
  st_list_ele_name <- ar_st_list_names[it_list_ele_ctr]
  st_list_ele_val <- ls_abc[[it_list_ele_ctr]]
  print(paste0(st_list_ele_name, '=', st_list_ele_val))
}
## [1] "a=1"
## [1] "b=2"
## [1] "c=abc's third element"

1.1.1.4 Named List Print Function

The function below ffi_lst2str is also a function in REconTools: ff_sup_lst2str.

# list to String printing function
#
# Collapse a list into one string of the form "desc:name1=val1;name2=val2".
# Elements of an unnamed list appear with an empty name, e.g. "desc:=1;=2".
#
# ls_list:  list to convert
# st_desc:  label placed before the colon; when missing, the expression that
#           was passed as ls_list is used
# bl_print: if TRUE, print the string
# returns:  the string, invisibly (so it is available even if bl_print=FALSE)
ffi_lst2str <- function(ls_list, st_desc, bl_print=TRUE) {

  # string desc: deparse() can split a long expression over several strings,
  # so collapse them to keep the description a single string
  if (missing(st_desc)) {
    st_desc <- paste(deparse(substitute(ls_list)), collapse = "")
  }

  # create string
  st_string_from_list <- paste0(
    st_desc, ':',
    paste(names(ls_list), ls_list, sep = "=", collapse = ";")
  )

  if (bl_print) {
    print(st_string_from_list)
  }

  invisible(st_string_from_list)
}

# print full
ffi_lst2str(ls_num)
## [1] "ls_num:=1;=2;=3"
ffi_lst2str(ls_str)
## [1] "ls_str:=1;=2;=3"
ffi_lst2str(ls_num_str)
## [1] "ls_num_str:=1;=2;=3"
ffi_lst2str(ls_num_str_named)
## [1] "ls_num_str_named:e1=1;e2=2;e3=3;e4=this is added"
# print subset
ffi_lst2str(ls_num[2:3])
## [1] "ls_num[2:3]:=2;=3"
ffi_lst2str(ls_str[2:3])
## [1] "ls_str[2:3]:=2;=3"
ffi_lst2str(ls_num_str[2:4])
## [1] "ls_num_str[2:4]:=2;=3;=NULL"
ffi_lst2str(ls_num_str_named[c('e2','e3','e4')])
## [1] "ls_num_str_named[c(\"e2\", \"e3\", \"e4\")]:e2=2;e3=3;e4=this is added"

1.1.1.5 Two Dimensional Unnamed List

Generate a multiple dimensional list:

  1. Initiate with an N element empty list
  2. Reshape list to M by Q
  3. Fill list elements
  4. Get list element by row and column number

List allows for different data types to be stored together.

Note that element specific names in named list are not preserved when the list is reshaped to be two dimensional. Two dimensional list, however, could have row and column names.

# Dimensions: it_M rows by it_Q columns, it_N cells in total
it_M <- 2
it_Q <- 3
it_N <- it_M * it_Q

# Flat, empty list with one slot per cell
ls_2d_flat <- vector(mode = "list", length = it_N)

# Named version of the flat list: e1, ..., eN
ls_2d_flat_named <- setNames(ls_2d_flat, paste0("e", seq(1, it_N)))

# Reshape a copy of the unnamed flat list to it_M by it_Q
ls_2d <- ls_2d_flat
dim(ls_2d) <- c(it_M, it_Q)

# Reshape a copy of the named flat list as well;
# the 1d element names are not carried over by the reshape
ls_2d_named <- ls_2d_flat_named
dim(ls_2d_named) <- c(it_M, it_Q)

Print Various objects generated above, print list flattened.

# display
ffi_lst2str(ls_2d_flat_named)
## [1] "ls_2d_flat_named:e1=NULL;e2=NULL;e3=NULL;e4=NULL;e5=NULL;e6=NULL"
# print(ls_2d_flat_named)
ffi_lst2str(ls_2d_named)
## [1] "ls_2d_named:=NULL;=NULL;=NULL;=NULL;=NULL;=NULL"
print(ls_2d_named)
##      [,1] [,2] [,3]
## [1,] NULL NULL NULL
## [2,] NULL NULL NULL

Select element from list:

# Select Values, double bracket to select from 2dim list
print('ls_2d[[1,2]]')
## [1] "ls_2d[[1,2]]"
print(ls_2d[[1,2]])
## NULL

1.1.1.6 Define Two Dimensional Named List

For naming two dimensional lists, rowname and colname does not work. Rather, we need to use dimnames. Note that in addition to dimnames, we can continue to have element specific names. Both can co-exist. But note that the element specific names are not preserved after dimension transform, so need to be redefined afterwards.

How to select an element of a two dimensional list:

  1. row and column names: dimnames, ls_2d_flat_named[[‘row2’,‘col2’]]
  2. named elements: names, ls_2d_flat_named[[‘e5’]]
  3. select by index: index, ls_2d_flat_named[[5]]
  4. converted two dimensional named list to tibble/matrix

Neither dimnames nor names are required, but both can be used to select elements.

# Dimensions: it_M rows by it_Q columns, it_N cells in total
it_M <- 3
it_Q <- 4
it_N <- it_M*it_Q

# Initiate an Empty MxQ=N element list
ls_2d_flat_named <- vector(mode = "list", length = it_N)
dim(ls_2d_flat_named) <- c(it_M, it_Q)

# Fill with values
for (it_Q_ctr in seq_len(it_Q)) {
  for (it_M_ctr in seq_len(it_M)) {
    # column-major linear index of cell (it_M_ctr, it_Q_ctr)
    ls_2d_flat_named[[it_M_ctr, it_Q_ctr]] <- (it_Q_ctr-1)*it_M+it_M_ctr
  }
}

# Row and column names are set through dimnames
dimnames(ls_2d_flat_named) <- list(
  paste0('row', seq_len(it_M)),
  paste0('col', seq_len(it_Q))
)

# Element Specific Names, set after the reshape so that they are kept
names(ls_2d_flat_named) <- paste0('e', seq_len(it_N))

# Convert to tibble and then to matrix; unnest() is given its columns
# explicitly because calling it without `cols` is deprecated
tb_2d_flat_named <- as_tibble(ls_2d_flat_named) %>% unnest(cols = everything())
mt_2d_flat_named <- as.matrix(tb_2d_flat_named)

Print various objects generated above:

# These are not element names, can still name each element
# display
print('ls_2d_flat_named')
## [1] "ls_2d_flat_named"
print(ls_2d_flat_named)
##      col1 col2 col3 col4
## row1 1    4    7    10  
## row2 2    5    8    11  
## row3 3    6    9    12  
## attr(,"names")
##  [1] "e1"  "e2"  "e3"  "e4"  "e5"  "e6"  "e7"  "e8"  "e9"  "e10" "e11" "e12"
print('tb_2d_flat_named')
## [1] "tb_2d_flat_named"
print(tb_2d_flat_named)
print('mt_2d_flat_named')
## [1] "mt_2d_flat_named"
print(mt_2d_flat_named)
##      col1 col2 col3 col4
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12

Select elements from list:

# Select elements with with dimnames
ffi_lst2str(ls_2d_flat_named[['row2','col2']])
## [1] "ls_2d_flat_named[[\"row2\", \"col2\"]]:=5"
# Select elements with element names
ffi_lst2str(ls_2d_flat_named[['e5']])
## [1] "ls_2d_flat_named[[\"e5\"]]:=5"
# Select elements with index
ffi_lst2str(ls_2d_flat_named[[5]])
## [1] "ls_2d_flat_named[[5]]:=5"

1.1.1.7 Two-Dimensional Named List for Joint Probability Mass

There are two discrete random variables, generate some random discrete probability mass, name the columns and rows, and then convert to matrix.

set.seed(123)

# Generate prob list: it_M rows (E, see dimnames below) by it_Q columns (A)
it_Q <- 2
it_M <- 2
ls_2d <- vector(mode = "list", length = it_Q*it_M)
# rows first, columns second, matching the [[it_M_ctr, it_Q_ctr]] indexing
# below, so that the code also works when it_M differs from it_Q
dim(ls_2d) <- c(it_M, it_Q)
# Random joint mass, normalized to sum to one
ar_rand <- runif(it_Q*it_M)
ar_rand <- ar_rand/sum(ar_rand)
# Fill with values
for (it_Q_ctr in seq_len(it_Q)) {
  for (it_M_ctr in seq_len(it_M)) {
    # column-major linear index of cell (it_M_ctr, it_Q_ctr)
    ls_2d[[it_M_ctr, it_Q_ctr]] <- ar_rand[(it_Q_ctr-1)*it_M+it_M_ctr]
  }
}
# Replace row names, note rownames does not work
dimnames(ls_2d)[[1]] <- paste0('E',seq_len(it_M))
dimnames(ls_2d)[[2]] <- paste0('A',seq_len(it_Q))
# rename
ls_prob_joint_E_A <- ls_2d
# numeric matrix with the same it_M by it_Q layout as the list
mt_prob_joint_E_A <- matrix(unlist(ls_prob_joint_E_A), nrow=it_M, ncol=it_Q, byrow=FALSE)
print('ls_prob_joint_E_A')
## [1] "ls_prob_joint_E_A"
print(ls_prob_joint_E_A)
##    A1        A2       
## E1 0.1214495 0.1727188
## E2 0.3329164 0.3729152
print(mt_prob_joint_E_A)
##           [,1]      [,2]
## [1,] 0.1214495 0.1727188
## [2,] 0.3329164 0.3729152

Create conditional probabilities: \(F=P(A_1|E_1)\), \(B=P(A_1|E_2)\), \(C=P(E_1|A_1)\), \(D=P(E_1|A_2)\)

fl_F <- mt_prob_joint_E_A[1,1]/sum(mt_prob_joint_E_A[1,])
fl_B <- mt_prob_joint_E_A[2,1]/sum(mt_prob_joint_E_A[2,])
fl_C <- mt_prob_joint_E_A[1,1]/sum(mt_prob_joint_E_A[,1])
fl_D <- mt_prob_joint_E_A[1,2]/sum(mt_prob_joint_E_A[,2])
print(paste0('fl_F=', fl_F, ',fl_B=',fl_B,',fl_C=',fl_C,',fl_D=',fl_D))
## [1] "fl_F=0.412857205138471,fl_B=0.471665472604598,fl_C=0.267294503388642,fl_D=0.316546995323062"

1.2 Array

1.2.1 Array Basics

Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.

1.2.1.1 Sum and Product of Elements in Array

Product of Elements in Array.

ar_a <- c(1,2,3)
ar_b <- c(1,2,3,4)
prod(ar_a)
## [1] 6
prod(ar_b)
## [1] 24

1.2.1.2 Multidimensional Arrays

1.2.1.2.1 Repeat one Number by the Size of an Array
ar_a <- c(1,2,3)
# six elements; the original `3/1` reads as a typo for `3,1`
ar_b <- c(1,2,3,1,2,3)
# repeat the number 0 once for each element of ar_a
rep(0, length(ar_a))
## [1] 0 0 0
1.2.1.2.2 Generate 2 Dimensional Array

First, we will generate an NA matrix with 3 rows and 3 columns.

mt_x <- array(NA, dim=c(3, 3))
dim(mt_x)
## [1] 3 3
print(mt_x)
##      [,1] [,2] [,3]
## [1,]   NA   NA   NA
## [2,]   NA   NA   NA
## [3,]   NA   NA   NA

Second, we will generate a matrix with 2 rows and four columns.

mt_x <- array(c(1, 1.5, 0, 2, 0, 4, 0, 3), dim=c(2, 4))
dim(mt_x)
## [1] 2 4
print(mt_x)
##      [,1] [,2] [,3] [,4]
## [1,]  1.0    0    0    0
## [2,]  1.5    2    4    3
1.2.1.2.3 Generate 3 Dimensional Array

First, we will create a three dimensional array with the same data as what was used to create the 2-dimensional array on top.

# Multidimensional Array
# 1 is r1c1t1, 1.5 in r2c1t1, 0 in r1c2t1, etc.
# Three dimensions, row first, column second, and tensor third
x <- array(c(1, 1.5, 0, 2, 0, 4, 0, 3), dim=c(2, 2, 2))
dim(x)
## [1] 2 2 2
print(x)
## , , 1
## 
##      [,1] [,2]
## [1,]  1.0    0
## [2,]  1.5    2
## 
## , , 2
## 
##      [,1] [,2]
## [1,]    0    0
## [2,]    4    3

Second, in the example below, we will generate a 3-dimensional array. The first dimension corresponds to different income levels, the second to marital status, and the third to the number of kids. We compute taxable income in 2008 for each income level under IRS rules.

# A. Income Array
ar_income <- seq(0, 200000, length.out = 3)

# B. Exemptions and Deductions
fl_exemption <- 3500 # exemption amount per household member
# Marital-status (rows) and number-of-children (columns) specific deduction
mt_deduction <- matrix(data = NA, nrow = 2, ncol = 5)
mt_deduction[1, 1] <- 5450 # Single filers
mt_deduction[1, 2:5] <- 8000 # Single filer with children
mt_deduction[2, ] <- 10900 # Married couples filing jointly

# C. Taxable Income
# Loop bounds come from the deduction table, so the array always matches it
it_marital_n <- nrow(mt_deduction)
it_kids_n <- ncol(mt_deduction)
mn_taxable_income <- array(NA, dim = c(length(ar_income), it_marital_n, it_kids_n))
for (y in seq_along(ar_income)) {
  for (m in seq_len(it_marital_n)) {
    # k is the number of kids, stored at position k + 1
    for (k in seq_len(it_kids_n) - 1) {
      # income less m and k exemptions and the (m, k)-specific deduction
      mn_taxable_income[y, m, k + 1] <-
        ar_income[y] - fl_exemption * m - fl_exemption * k - mt_deduction[m, k + 1]
    }
  }
}

# D. Name dimensions
dimnames(mn_taxable_income) <- list(
  paste0('income=', round(ar_income, 0)),
  paste0('married=', seq_len(it_marital_n) - 1),
  paste0('kids=', seq_len(it_kids_n) - 1)
)

# E. Print
dim(mn_taxable_income)
## [1] 3 2 5
print(mn_taxable_income)
## , , kids=0
## 
##              married=0 married=1
## income=0         -8950    -17900
## income=1e+05     91050     82100
## income=2e+05    191050    182100
## 
## , , kids=1
## 
##              married=0 married=1
## income=0        -15000    -21400
## income=1e+05     85000     78600
## income=2e+05    185000    178600
## 
## , , kids=2
## 
##              married=0 married=1
## income=0        -18500    -24900
## income=1e+05     81500     75100
## income=2e+05    181500    175100
## 
## , , kids=3
## 
##              married=0 married=1
## income=0        -22000    -28400
## income=1e+05     78000     71600
## income=2e+05    178000    171600
## 
## , , kids=4
## 
##              married=0 married=1
## income=0        -25500    -31900
## income=1e+05     74500     68100
## income=2e+05    174500    168100

1.2.1.3 Array Slicing

1.2.1.3.1 Get a Subset of Array Elements, N Cuts from M Points

There is an array with M elements, get N elements from the M elements.

First cut including the starting and ending points.

it_M <- 5
it_N <- 4
# array with it_M equally spaced elements between 1 and 10
# (seq(1, 10, 10) would step by 10 and return the single value 1)
ar_all_elements <- seq(1, 10, length.out = it_M)
# it_N positions spread evenly over 1..it_M, always including the first and
# the last position; unique() guards against repeats after rounding
ar_it_select_idx <- unique(round(seq(1, it_M, length.out = it_N)))
ar_sub_elements <- ar_all_elements[ar_it_select_idx]
1.2.1.3.2 Remove Elements of Array

Select elements with direct indexing, or with head and tail functions. Get the first two elements of three elements array.

# Remove last element of array
vars.group.bydf <- c('23','dfa', 'wer')
vars.group.bydf[-length(vars.group.bydf)]
## [1] "23"  "dfa"
# Use the head function to remove last element
head(vars.group.bydf, -1)
## [1] "23"  "dfa"
head(vars.group.bydf, 2)
## [1] "23"  "dfa"

Get last two elements of array.

# Remove first element of array
vars.group.bydf <- c('23','dfa', 'wer')
vars.group.bydf[2:length(vars.group.bydf)]
## [1] "dfa" "wer"
# Use Tail function
tail(vars.group.bydf, -1)
## [1] "dfa" "wer"
tail(vars.group.bydf, 2)
## [1] "dfa" "wer"

Select all except for the first and the last element of an array.

# define array
ar_amin <- c(0, 0.25, 0.50, 0.75, 1)
# select without head and tail
tail(head(ar_amin, -1), -1)
## [1] 0.25 0.50 0.75

Select the first and the last element of an array. The extreme values.

# define array
ar_amin <- c(0, 0.25, 0.50, 0.75, 1)
# select head and tail
c(head(ar_amin, 1), tail(ar_amin, 1))
## [1] 0 1

1.2.1.4 NA in Array

1.2.1.4.1 Check if NA is in Array
# Convert Inf and -Inf to NA
x <- c(1, -1, Inf, 10, -Inf)
na_if(na_if(x, -Inf), Inf)
## [1]  1 -1 NA 10 NA

1.2.1.5 Complex Number

Handling numbers with real and imaginary components. Two separate issues, given an array of numbers that includes real as well as imaginary numbers, keep subset that only has real components. Additionally, for the same array, generate an equal length version of the array that includes the real components of all numbers.

Define complex numbers.

# Define a complex number
cx_number_a <- 0+0.0460246857561777i
# Define another complex number
cx_number_b <- complex(real = 0.02560982, imaginary = 0.0460246857561777)
# An array of numbers some of which are complex
ar_cx_number <- c(0.02560982+0.000000000i, 0.00000000+0.044895305i, 
                  0.00000000+0.009153429i, 0.05462045+0.000000000i, 
                  0.00000000+0.001198538i, 0.00000000+0.019267050i)

Extract real components from a complex array.

# equi-length real component
ar_fl_number_re <- Re(ar_cx_number)
print(ar_fl_number_re)
## [1] 0.02560982 0.00000000 0.00000000 0.05462045 0.00000000 0.00000000
# equi-length img component
ar_fl_number_im <- Im(ar_cx_number)
print(ar_fl_number_im)
## [1] 0.000000000 0.044895305 0.009153429 0.000000000 0.001198538 0.019267050

Keep only real elements of array.

# subset of array that is real: an element is real when its imaginary
# component is zero (testing Re != 0 instead would drop a real zero and
# keep numbers that have both real and imaginary components)
ar_fl_number_re_subset <- Re(ar_cx_number[Im(ar_cx_number) == 0])
print(ar_fl_number_re_subset)
## [1] 0.02560982 0.05462045

1.2.1.6 Number Formatting

1.2.1.6.1 e notation
  1. Case one: 1.149946e+00
    • this is approximately: 1.14995
  2. Case two: 9.048038e-01
    • this is approximately: 0.90480
  3. Case three: 9.048038e-01
    • this is approximately: 0.90480

1.2.1.7 String Conversions

1.2.1.7.1 Add Positive and Negative Sign in Front of Values

We have a sequence of integers, some positive and some negative. We convert this into a string array, prepend a plus sign to positive values, and prepend a plus-minus sign to zero.

# Integer sequence from -5 to 5
ar_it_vals <- seq(-5, 5, by = 1)
# Sign prefix per element: "+" for positive values, "±" for zero, and
# nothing for negative values, which already carry their "-"
ar_st_sign <- rep("", length(ar_it_vals))
ar_st_sign[ar_it_vals > 0] <- "+"
ar_st_sign[ar_it_vals == 0] <- "±"
# Prefix followed by the number as a string
st_it_vals <- paste0(ar_st_sign, ar_it_vals)
# Display
print(st_it_vals)
##  [1] "-5" "-4" "-3" "-2" "-1" "±0" "+1" "+2" "+3" "+4" "+5"

1.2.1.8 Basic array calculations

First, we demonstrate how purrr::reduce() works with a simple summation example. We use the addition operator.

# Using R pipe operator
# 1 + 2 + 3 = 6
fl_sum <- 1:3 |> purrr::reduce(`+`)
print(fl_sum)
## [1] 6

Second, what if there is an NA value? To ignore NA values, we write a custom function. To work with reduce, the custom function should be “a binary function that takes two values and returns a single value”.

# define sum function that ignores NA
#
# Binary sum for use with reduce: a missing argument is skipped and the other
# argument is returned. If both arguments are NA, the result is NA.
#
# x, y:    single values to add
# returns: x + y, or whichever of the two is not NA
sum_ignore_na <- function(x, y) {
  # x missing: the result is y (which is NA when y is missing as well)
  if (is.na(x)) {
    return(y)
  }
  # only y missing: keep x
  if (is.na(y)) {
    return(x)
  }
  x + y
}

# Using R pipe operator
# 1 + 10 + 1 = 12
fl_sum <- c(1, 10, NA, 1) |> purrr::reduce(sum_ignore_na)
print(fl_sum)
## [1] 12

1.2.2 Generate Arrays

Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.

1.2.2.1 Generate Often Used Arrays

1.2.2.1.1 Equi-distance Array with Bound

Consider multiple income groups in income bins that are equal-width, for the final income group, consider all individuals above some final bin minimum bound. Below the code generates this array of numbers: \(0, 20000, 40000, 60000, 80000, 100000, 100000000\).

# lower edge of the first income bin
fl_bin_start <- 0
# each regular bin is 20,000 wide
fl_bin_width <- 2e4
# end point of the final bin: 100 million
fl_bin_final_end <- 1e8
# starting point of the final bin: 100,000 dollars
fl_bin_final_start <- 1e5
# equal-width cut-offs up to the start of the final bin ...
ar_income_bins_regular <- seq(
  from = fl_bin_start, to = fl_bin_final_start, by = fl_bin_width
)
# ... followed by the end point of the final bin
ar_income_bins <- append(ar_income_bins_regular, fl_bin_final_end)
# Display
print(ar_income_bins)
## [1] 0e+00 2e+04 4e+04 6e+04 8e+04 1e+05 1e+08

Generate finer bins, at 5000 USD intervals, and stopping at 200 thousand dollars.

fl_bin_start <- 0
fl_bin_width <- 5e3
fl_bin_final_end <- 1e8
fl_bin_final_start <- 2e5
ar_income_bins <- c(
  seq(fl_bin_start, fl_bin_final_start, by = fl_bin_width),
  fl_bin_final_end
)
print(ar_income_bins)
##  [1] 0.00e+00 5.00e+03 1.00e+04 1.50e+04 2.00e+04 2.50e+04 3.00e+04 3.50e+04 4.00e+04
## [10] 4.50e+04 5.00e+04 5.50e+04 6.00e+04 6.50e+04 7.00e+04 7.50e+04 8.00e+04 8.50e+04
## [19] 9.00e+04 9.50e+04 1.00e+05 1.05e+05 1.10e+05 1.15e+05 1.20e+05 1.25e+05 1.30e+05
## [28] 1.35e+05 1.40e+05 1.45e+05 1.50e+05 1.55e+05 1.60e+05 1.65e+05 1.70e+05 1.75e+05
## [37] 1.80e+05 1.85e+05 1.90e+05 1.95e+05 2.00e+05 1.00e+08
1.2.2.1.2 Log Space Arrays

Often need to generate arrays on log rather than linear scale, below is log 10 scaled grid.

# Parameters
it.lower.bd.inc.cnt <- 3
fl.log.lower <- -10
fl.log.higher <- -9
fl.min.rescale <- 0.01
it.log.count <- 4
# Equally spaced grid in log10 units, from log10 of the rescaling minimum to
# log10 of the rescaling minimum plus the width of the target interval
fl.log10.start <- log10(fl.min.rescale)
fl.log10.end <- log10(fl.min.rescale + (fl.log.higher - fl.log.lower))
ar.fl.log10.grid <- seq(fl.log10.start, fl.log10.end, length.out = it.log.count)
# Back to levels: 10^grid, written as exp(log(10) * grid)
ar.fl.log.rescaled <- exp(log(10) * ar.fl.log10.grid)
# Shift so that the grid runs from fl.log.lower to fl.log.higher
ar.fl.log <- ar.fl.log.rescaled + fl.log.lower - fl.min.rescale
# Print
ar.fl.log
## [1] -10.000000  -9.963430  -9.793123  -9.000000

1.2.2.2 Generate Arrays Based on Existing Arrays

1.2.2.2.1 Probability Mass Array and Discrete Value Array

There are two arrays, an array of values, and an array of probabilities. The probability array sums to 1. The array of values, however, might not be unique.

First, generate an unsorted array of numbers and a probability mass for each non-sorted, non-unique element of the array.

set.seed(123)
it_len <- 10
ar_x <- ceiling(runif(it_len) * 5 + 10)
ar_prob <- dbinom(seq(0, it_len - 1, length.out = it_len), it_len - 1, prob = 0.5)
print(cbind(ar_x, ar_prob))
##       ar_x     ar_prob
##  [1,]   12 0.001953125
##  [2,]   14 0.017578125
##  [3,]   13 0.070312500
##  [4,]   15 0.164062500
##  [5,]   15 0.246093750
##  [6,]   11 0.246093750
##  [7,]   13 0.164062500
##  [8,]   15 0.070312500
##  [9,]   13 0.017578125
## [10,]   13 0.001953125
print(paste0("sum(ar_prob)=", sum(ar_prob)))
## [1] "sum(ar_prob)=1"

Second, sorting index for ar_x, and resort ar_prob with the same index:

ls_sorted_res <- sort(ar_x, decreasing = FALSE, index.return = TRUE)
ar_idx_increasing_x <- ls_sorted_res$ix
ar_x_sorted <- ls_sorted_res$x
ar_prob_sorted <- ar_prob[ar_idx_increasing_x]
print(cbind(ar_x_sorted, ar_prob_sorted))
##       ar_x_sorted ar_prob_sorted
##  [1,]          11    0.246093750
##  [2,]          12    0.001953125
##  [3,]          13    0.070312500
##  [4,]          13    0.164062500
##  [5,]          13    0.017578125
##  [6,]          13    0.001953125
##  [7,]          14    0.017578125
##  [8,]          15    0.164062500
##  [9,]          15    0.246093750
## [10,]          15    0.070312500

Third, sum within group and generate unique, using the aggregate function. Then we have a column of unique values and associated probabilities.

ar_x_unique <- unique(ar_x_sorted)
mt_prob_unique <- aggregate(ar_prob_sorted, by = list(ar_x_sorted), FUN = sum)
ar_x_unique_prob <- mt_prob_unique$x
print(cbind(ar_x_unique, ar_x_unique_prob))
##      ar_x_unique ar_x_unique_prob
## [1,]          11      0.246093750
## [2,]          12      0.001953125
## [3,]          13      0.253906250
## [4,]          14      0.017578125
## [5,]          15      0.480468750

Finally, the several steps together.

# data
set.seed(123)
it_len <- 30
ar_x <- ceiling(runif(it_len) * 20 + 10)
ar_prob <- runif(it_len)
ar_prob <- ar_prob / sum(ar_prob)
# step 1, sort
ls_sorted_res <- sort(ar_x, decreasing = FALSE, index.return = TRUE)
# step 2, unique sorted
ar_x_unique <- unique(ls_sorted_res$x)
# step 3, mass for each unique
mt_prob_unique <- aggregate(ar_prob[ls_sorted_res$ix], by = list(ls_sorted_res$x), FUN = sum)
ar_x_unique_prob <- mt_prob_unique$x
# results
print(cbind(ar_x_unique, ar_x_unique_prob))
##       ar_x_unique ar_x_unique_prob
##  [1,]          11      0.071718383
##  [2,]          13      0.040040920
##  [3,]          15      0.017708800
##  [4,]          16      0.141199002
##  [5,]          17      0.020211876
##  [6,]          19      0.052488290
##  [7,]          20      0.049104113
##  [8,]          21      0.067328518
##  [9,]          22      0.109454333
## [10,]          23      0.060712145
## [11,]          24      0.107671406
## [12,]          25      0.015694798
## [13,]          26      0.068567789
## [14,]          28      0.090925756
## [15,]          29      0.001870451
## [16,]          30      0.085303420

1.2.2.3 Generate Integer Sequences

1.2.2.3.1 Gapped Possibly Overlapping Consecutive Sequences

Now, we generate a set of integer sequences, with gaps in between, but possibly overlapping, for example: \((1,2,3,4,5), (5,6), (10,11)\).

First, we select a small random subset of integers between min and max as starting points, and we randomly draw a duration for each starting point, so that the two arrays have the same length. Each duration is at most a maximum duration. (In the next block, we make sure that a start plus its duration does not exceed the upper bound.)

# Number of random starting index
it_start_idx <- 11
it_end_idx <- 100
it_startdraws <- 6
# Maximum duration
it_duramax <- 3

# Random seed
set.seed(987)
# Draw random index between min and max
ar_it_start_idx <- sample(
  x = seq(from = it_start_idx, to = it_end_idx, by = 1),
  size = it_startdraws, replace = FALSE
)
ar_it_start_idx <- sort(ar_it_start_idx)
# Draw random durations, replace = TRUE because can repeat
ar_it_duration <- sample(
  x = it_duramax, size = it_startdraws, replace = TRUE
)

# Print
print(glue::glue(
  "random starts + duration: ",
  "{ar_it_start_idx} + {ar_it_duration}"
))
## random starts + duration: 35 + 3
## random starts + duration: 39 + 3
## random starts + duration: 42 + 1
## random starts + duration: 56 + 2
## random starts + duration: 57 + 1
## random starts + duration: 73 + 1

Second, we expand the indexes with neighboring values, and create a list of consecutive integer sequences.

# start and end sequences
# note the min operator inside, this makes sure we do not exceed max
# Map() always returns a list with one sequence per start; apply() over rows
# would instead return a matrix whenever all sequences have the same length
ls_ar_it_recession <- Map(
  function(it_start, it_duration) {
    seq(it_start, min(it_start + it_duration, it_end_idx))
  },
  ar_it_start_idx, ar_it_duration
)
# Draw it_m from indexed list of it_N
print("ls_ar_it_recession")
## [1] "ls_ar_it_recession"
print(ls_ar_it_recession)
## [[1]]
## [1] 35 36 37 38
## 
## [[2]]
## [1] 39 40 41 42
## 
## [[3]]
## [1] 42 43
## 
## [[4]]
## [1] 56 57 58
## 
## [[5]]
## [1] 57 58
## 
## [[6]]
## [1] 73 74

Third, we can bring the sequences generated together if we want to

# Combine arrays
ar_it_recession_year <- (
  sort(do.call(base::c, ls_ar_it_recession))
)
# Print
print(glue::glue(
  "print full as array:",
  "{ar_it_recession_year}"
))
## print full as array:35
## print full as array:36
## print full as array:37
## print full as array:38
## print full as array:39
## print full as array:40
## print full as array:41
## print full as array:42
## print full as array:42
## print full as array:43
## print full as array:56
## print full as array:57
## print full as array:57
## print full as array:58
## print full as array:58
## print full as array:73
## print full as array:74
1.2.2.3.2 Gapped non-Overlapping Consecutive Sequences

Now, we generate a set of integer sequences, with gaps in between, but not overlapping, for example: \((1,2,3), (5,6), (10,11)\). We follow a very similar structure as above, but now adjust starting draws by prior accumulated durations.

Note that in the code below, we could end up with fewer than it_startdraws_max sequences if there are consecutive start draws. We can only have non-consecutive start draws to avoid overlaps.

# Number of random starting index
it_start_idx <- 11
it_end_idx <- 100
it_startdraws_max <- 6
it_duramax <- 3

# Random seed
set.seed(987)
# Draw random index between min and max
ar_it_start_idx <- sort(sample(
  seq(it_start_idx, it_end_idx),
  it_startdraws_max,
  replace = FALSE
))
# Draw random durations, replace = TRUE because can repeat
ar_it_duration <- sample(it_duramax, it_startdraws_max, replace = TRUE)

# Check space between starts
ar_it_startgap <- diff(ar_it_start_idx)
# durations of all but the last start; head(x, -1) also works for length one
ar_it_dura_lenm1 <- head(ar_it_duration, -1)
# Adjust durations: each sequence must end at least two before the next start
ar_it_dura_bd <- pmin(ar_it_startgap - 2, ar_it_dura_lenm1)
ar_it_duration[seq_along(ar_it_dura_bd)] <- ar_it_dura_bd

# Drop consecutive starts: a start directly followed by the next start gets a
# negative bounded duration; which() returns the indexes of starts to keep
ar_bl_dura_nonneg <- which(ar_it_duration >= 0)
ar_it_start_idx <- ar_it_start_idx[ar_bl_dura_nonneg]
ar_it_duration <- ar_it_duration[ar_bl_dura_nonneg]

# list of recession periods
# Map() always returns a list with one sequence per start; apply() over rows
# would instead return a matrix whenever all sequences have the same length
ls_ar_it_recession_non_overlap <- Map(
  function(it_start, it_duration) {
    seq(it_start, min(it_start + it_duration, it_end_idx))
  },
  ar_it_start_idx, ar_it_duration
)

# print
print("ls_ar_it_recession_non_overlap")
## [1] "ls_ar_it_recession_non_overlap"
print(ls_ar_it_recession_non_overlap)
## [[1]]
## [1] 35 36 37
## 
## [[2]]
## [1] 39 40
## 
## [[3]]
## [1] 42 43
## 
## [[4]]
## [1] 57 58
## 
## [[5]]
## [1] 73 74

1.2.3 String Arrays

Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.

1.2.3.1 Positive or Negative Floating Number to String

There is a number that possibly has a negative sign and some decimals; convert it to a string that is more easily used as a file or folder name.

# values to convert; -1.5 and -100 are separate elements (without the comma
# between them they would be subtracted into the single value -101.5)
ls_fl_rho <- c(1, -1, -1.5, -100, 0.5, 0.11111111, -199.22123)
for (fl_rho in ls_fl_rho) {
  # round to four decimals and convert to string
  st_rho <- paste0(round(fl_rho, 4))
  # replace the minus sign with "n"
  st_rho <- gsub(x = st_rho,  pattern = "-", replacement = "n")
  # replace the decimal point with "p"
  st_rho <- gsub(x = st_rho,  pattern = "\\.", replacement = "p")
  print(paste0('st_rho=', st_rho))
}
## [1] "st_rho=1"
## [1] "st_rho=n1"
## [1] "st_rho=n1p5"
## [1] "st_rho=n100"
## [1] "st_rho=0p5"
## [1] "st_rho=0p1111"
## [1] "st_rho=n199p2212"

1.2.3.2 String Replace

# String replacement
gsub(x = paste0(unique(df.slds.stats.perc$it.inner.counter), ':',
                unique(df.slds.stats.perc$z_n_a_n), collapse = ';'),
     pattern = "\n",
     replacement = "")
gsub(x = var,  pattern = "\n", replacement = "")
gsub(x = var.input,  pattern = "\\.", replacement = "_")

String replaces a segment, search by wildcard. Given the string below, delete all text from each percent sign through the following newline:

st_tex_text <- "\n% Lat2ex Comments\n\\newcommand{\\exa}{\\text{from external file: } \\alpha + \\beta}\n% More LaLatex Comments\n"
st_clean_a1 <- gsub("\\%.*?\\\n", "", st_tex_text)
st_clean_a2 <- gsub("L.*?x", "[LATEX]", st_tex_text)
print(paste0('st_tex_text:', st_tex_text))
## [1] "st_tex_text:\n% Lat2ex Comments\n\\newcommand{\\exa}{\\text{from external file: } \\alpha + \\beta}\n% More LaLatex Comments\n"
print(paste0('st_clean_a1:', st_clean_a1))
## [1] "st_clean_a1:\n\\newcommand{\\exa}{\\text{from external file: } \\alpha + \\beta}\n"
print(paste0('st_clean_a2:', st_clean_a2))
## [1] "st_clean_a2:\n% [LATEX] Comments\n\\newcommand{\\exa}{\\text{from external file: } \\alpha + \\beta}\n% More [LATEX] Comments\n"

String delete after a particular string:

st_tex_text <- "\\end{equation}\n}\n% Even more comments from Latex preamble"
st_clean_a1 <- gsub("\\\n%.*","", st_tex_text)
print(paste0('st_tex_text:', st_tex_text))
## [1] "st_tex_text:\\end{equation}\n}\n% Even more comments from Latex preamble"
print(paste0('st_clean_a1:', st_clean_a1))
## [1] "st_clean_a1:\\end{equation}\n}"

1.2.3.3 Search If and Which String Contains

Search for a single substring in a single string:

st_example_a <- 'C:/Users/fan/R4Econ/amto/tibble/fs_tib_basics.Rmd'
st_example_b <- 'C:/Users/fan/R4Econ/amto/tibble/_main.html'
grepl('_main', st_example_a)
## [1] FALSE
grepl('_main', st_example_b)
## [1] TRUE

Search for if one of a set of substring exists in a set of strings. In particular which one of the elements of ls_spn contains at least one of the elements of ls_str_if_contains. In the example below, only the first path does not contain either the word aggregate or index in the path. This can be used after all paths have been found recursively in some folder to select only desired paths from the full set of possibilities:

# Candidate paths to screen
ls_spn <- c(
  "C:/Users/fan/R4Econ//panel/basic/fs_genpanel.Rmd",
  "C:/Users/fan/R4Econ//summarize/aggregate/main.Rmd",
  "C:/Users/fan/R4Econ//summarize/index/fs_index_populate.Rmd"
)
# Substrings to look for, joined with "|" into a single pattern of alternatives
ls_str_if_contains <- c("aggregate", "index")
str_if_contains <- paste0(ls_str_if_contains, collapse = "|")
# One logical per path: TRUE when at least one of the substrings is found
grepl(pattern = str_if_contains, x = ls_spn)
## [1] FALSE  TRUE  TRUE

1.2.3.4 String Split

Given a string describing an interval, generated for example by cut, extract the lower bound of the interval as well as the upper bound.

# Goal: pull "0.216" and "0.500" out of an interval label
st_cut_cate <- "(0.216,0.500]"
# Lower bound: the piece before the comma, without its leading parenthesis
substring(strsplit(x = st_cut_cate, split = ",")[[1]][1], first = 2)
## [1] "0.216"
# Upper bound, option 1: reverse the piece after the comma, drop what is now
# its first character (the closing bracket), then reverse back
intToUtf8(rev(utf8ToInt(
  substring(
    intToUtf8(rev(utf8ToInt(strsplit(x = st_cut_cate, split = ",")[[1]][2]))),
    first = 2
  )
)))
## [1] "0.500"
# Upper bound, option 2: delete the closing bracket from the piece after the comma
gsub(
  pattern = "]", replacement = "",
  x = strsplit(x = st_cut_cate, split = ",")[[1]][2]
)
## [1] "0.500"

Make a part of a string bold. Go from “ABC EFG, OPQ, RST” to “<b>ABC EFG</b>, OPQ, RST”, wrapping everything before the first comma in HTML bold tags. This could be used to make the name of an author bold while preserving the affiliation information that follows.

# Wrap the part of the string before the first ", " in <b></b> tags and keep
# the remaining comma-separated parts unchanged
st_paper_author_ori <- "ABC EFG, OPQ, RST"
ar_st_ori <- strsplit(st_paper_author_ori, ", ")[[1]]
# Negative indexing drops the first element and yields an empty vector when
# there is nothing after it; 2:length(x) would instead count down as c(2, 1)
st_after_1stcomma <- paste0(ar_st_ori[-1], collapse = ", ")
st_paper_author <- paste0("<b>", ar_st_ori[1], "</b>")
# Only append the separator and the remainder when a remainder exists
if (length(ar_st_ori) > 1) {
  st_paper_author <- paste0(st_paper_author, ", ", st_after_1stcomma)
}
print(st_paper_author)
## [1] "<b>ABC EFG</b>, OPQ, RST"

1.2.3.5 String Concatenate

Concatenate string array into a single string.

# Join the elements of a string array into one string separated by "|"
vars.group.by <- c("abc", "efg")
paste(vars.group.by, collapse = "|")
## [1] "abc|efg"

Concatenate a numeric array into a single string.

# Round five uniform draws to 3 digits and join them into one labelled string
set.seed(123)
ar_fl_numbers <- runif(5)
# toString() joins the elements with ", "; paste() adds the single space
# between the label and the joined numbers
paste("ar_fl_numbers =", toString(round(ar_fl_numbers, 3)))
## [1] "ar_fl_numbers = 0.288, 0.788, 0.409, 0.883, 0.94"

1.2.3.6 String Add Leading Zero

# Zero-pad integer values to a fixed width so that they sort correctly
# once they are embedded in strings
it_z_n <- 1
it_a_n <- 192
# "%02d" pads with zeros up to a width of 2
print(sprintf(fmt = "%02d", it_z_n))
## [1] "01"
# "%04d" pads with zeros up to a width of 4
print(sprintf(fmt = "%04d", it_a_n))
## [1] "0192"

1.2.3.7 Substring Components

Given a string, with certain structure, get components.

  • r time string get month and year and day
# Split a "YYYYMMDD" date string into its year, month and day components
snm_full <- "20100701"
# R strings are 1-indexed, so the year occupies characters 1 through 4
snm_year <- substr(snm_full, 1, 4)
snm_month <- substr(snm_full, 5, 6)
snm_day <- substr(snm_full, 7, 8)
print(paste0('full:', snm_full,
             ', year:', snm_year,
             ', month:', snm_month,
             ', day:', snm_day))
## [1] "full:20100701, year:2010, month:07, day:01"

1.2.4 Mesh Matrices, Arrays and Scalars

Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.

  • r expand.grid meshed array to matrix
  • r meshgrid
  • r array to matrix
  • r reshape array to matrix
  • dplyr permuations rows of matrix and element of array
  • tidyr expand_grid mesh matrix and vector

1.2.4.1 Mesh Two or More Vectors with expand_grid

In the example below, we have a matrix that is 2 by 2 (endogenous states), a vector that is 3 by 1 (choices), and another vector that is 4 by 1 (exogenous state shocks).

We want to generate a tibble dataset that meshes the matrix and the vector, so that all combinations show up. Additionally, we want to add some additional values that are common across all rows to the meshed dataframe.

Note that expand_grid is a function from tidyr, available since version 1.0.0.

# A. Endogenous states: an N by 4 matrix with one row per child; s_A and
#    s_alpha differ across rows, p_rho and p_lambda are common to all rows
# it_N_child_cnt = N, the number of children
it_N_child_cnt <- 2
# P fixed parameters, nN is N dimensional, nP is P dimensional
ar_nN_A <- seq(-2, 2, length.out = it_N_child_cnt)
ar_nN_alpha <- seq(0.1, 0.9, length.out = it_N_child_cnt)
fl_rho <- 0.1
fl_lambda <- 1.1
# cbind recycles the scalars fl_rho and fl_lambda to fill their columns
mt_nP_A_alpha <- cbind(ar_nN_A, ar_nN_alpha, fl_rho, fl_lambda)
ar_st_varnames <- c('s_A', 's_alpha', 'p_rho', 'p_lambda')
tb_states_endo <- as_tibble(mt_nP_A_alpha) %>%
  rename_all(~c(ar_st_varnames)) %>%
  rowid_to_column(var = "state_id")

# B. Choice Grid
it_N_choice_cnt <- 3
fl_max <- 10
fl_min <- 0
ar_nN_d <- seq(fl_min, fl_max, length.out = it_N_choice_cnt)
ar_st_varnames <- c('c_food')
# Build the one-column tibble with tibble() instead of as_tibble(): calling
# as_tibble() on an atomic vector is discouraged by the tibble package
tb_choices <- tibble(value = ar_nN_d) %>%
  rename_all(~c(ar_st_varnames)) %>%
  rowid_to_column(var = "choice_id")

# C. Shock Grid
set.seed(123)
it_N_shock_cnt <- 4
ar_nQ_shocks <- exp(rnorm(it_N_shock_cnt, mean = 0, sd = 1))
ar_st_varnames <- c('s_eps')
tb_states_exo <- tibble(value = ar_nQ_shocks) %>%
  rename_all(~c(ar_st_varnames)) %>%
  rowid_to_column(var = "shock_id")

# D. Mesh: pair every row of tb_states_endo with every row of tb_choices and
#    then with every row of tb_states_exo; the non-expanded parameters p_rho
#    and p_lambda are carried along on every row. Columns are then reordered
#    so that ids come first, states and choices next, and parameters last.
tb_states_shk_choices <- tb_states_endo %>%
  expand_grid(tb_choices) %>%
  expand_grid(tb_states_exo) %>%
  select(state_id, choice_id, shock_id,
         s_A, s_alpha, s_eps, c_food,
         p_rho, p_lambda)

# display
kable(tb_states_shk_choices) %>% kable_styling_fc()
state_id choice_id shock_id s_A s_alpha s_eps c_food p_rho p_lambda
1 1 1 -2 0.1 0.5709374 0 0.1 1.1
1 1 2 -2 0.1 0.7943926 0 0.1 1.1
1 1 3 -2 0.1 4.7526783 0 0.1 1.1
1 1 4 -2 0.1 1.0730536 0 0.1 1.1
1 2 1 -2 0.1 0.5709374 5 0.1 1.1
1 2 2 -2 0.1 0.7943926 5 0.1 1.1
1 2 3 -2 0.1 4.7526783 5 0.1 1.1
1 2 4 -2 0.1 1.0730536 5 0.1 1.1
1 3 1 -2 0.1 0.5709374 10 0.1 1.1
1 3 2 -2 0.1 0.7943926 10 0.1 1.1
1 3 3 -2 0.1 4.7526783 10 0.1 1.1
1 3 4 -2 0.1 1.0730536 10 0.1 1.1
2 1 1 2 0.9 0.5709374 0 0.1 1.1
2 1 2 2 0.9 0.7943926 0 0.1 1.1
2 1 3 2 0.9 4.7526783 0 0.1 1.1
2 1 4 2 0.9 1.0730536 0 0.1 1.1
2 2 1 2 0.9 0.5709374 5 0.1 1.1
2 2 2 2 0.9 0.7943926 5 0.1 1.1
2 2 3 2 0.9 4.7526783 5 0.1 1.1
2 2 4 2 0.9 1.0730536 5 0.1 1.1
2 3 1 2 0.9 0.5709374 10 0.1 1.1
2 3 2 2 0.9 0.7943926 10 0.1 1.1
2 3 3 2 0.9 4.7526783 10 0.1 1.1
2 3 4 2 0.9 1.0730536 10 0.1 1.1

Using expand_grid directly over arrays

# Mesh three integer sequences directly with tidyr's expand_grid
expand_grid(x = seq_len(3), y = seq_len(2), z = seq(-3, -1))

1.2.4.2 Mesh Arrays with expand.grid

Given two arrays, mesh the two arrays together.

# Mesh two arrays with expand.grid: every value of ar_A is paired with every
# value of ar_alpha

it_ar_A <- 5
it_ar_alpha <- 10

ar_A <- seq(-2, 2, length.out = it_ar_A)
ar_alpha <- seq(0.1, 0.9, length.out = it_ar_alpha)

# One row per (A, alpha) pair
mt_A_alpha <- expand.grid(A = ar_A, alpha = ar_alpha)

# Reshape each column into an it_ar_A by it_ar_alpha matrix, filled column
# by column
mt_A_meshed <- matrix(mt_A_alpha[["A"]], nrow = it_ar_A, ncol = it_ar_alpha)

mt_alpha_meshed <- matrix(mt_A_alpha[["alpha"]], nrow = it_ar_A, ncol = it_ar_alpha)

# Show the meshed A values as a styled table
kable_styling_fc(kable(mt_A_meshed))
-2 -2 -2 -2 -2 -2 -2 -2 -2 -2
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1
0 0 0 0 0 0 0 0 0 0
1 1 1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2 2 2
# Show the meshed alpha values as a wide styled table
kable_styling_fc_wide(kable(mt_alpha_meshed))
0.1 0.1888889 0.2777778 0.3666667 0.4555556 0.5444444 0.6333333 0.7222222 0.8111111 0.9
0.1 0.1888889 0.2777778 0.3666667 0.4555556 0.5444444 0.6333333 0.7222222 0.8111111 0.9
0.1 0.1888889 0.2777778 0.3666667 0.4555556 0.5444444 0.6333333 0.7222222 0.8111111 0.9
0.1 0.1888889 0.2777778 0.3666667 0.4555556 0.5444444 0.6333333 0.7222222 0.8111111 0.9
0.1 0.1888889 0.2777778 0.3666667 0.4555556 0.5444444 0.6333333 0.7222222 0.8111111 0.9

Mesh an array with itself. The array holds an attribute of each individual; in the resulting matrices, each row corresponds to an individual and each column also corresponds to an individual.

# Mesh an array with itself using expand.grid: all pairs of elements of ar_A

it_ar_A <- 5

ar_A <- seq(-2, 2, length.out = it_ar_A)
# Give the two columns distinct names: the first holds the row individual's
# value and the second the column individual's value
mt_A_A <- expand.grid(Arow = ar_A, Acol = ar_A)
# Reshape each column of pairs into an it_ar_A by it_ar_A matrix
mt_Arow <- mt_A_A[, 1]
dim(mt_Arow) <- c(it_ar_A, it_ar_A)
mt_Acol <- mt_A_A[, 2]
dim(mt_Acol) <- c(it_ar_A, it_ar_A)

# Show the matrix whose values are constant within each row
kable_styling_fc(kable(mt_Arow))
-2 -2 -2 -2 -2
-1 -1 -1 -1 -1
0 0 0 0 0
1 1 1 1 1
2 2 2 2 2
# Show the matrix whose values are constant within each column
kable_styling_fc(kable(mt_Acol))
-2 -1 0 1 2
-2 -1 0 1 2
-2 -1 0 1 2
-2 -1 0 1 2
-2 -1 0 1 2

1.3 Matrix

1.3.1 Generate Matrixes

Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.

1.3.1.1 Create a N by 2 Matrix from 3 arrays

Names of each array become row names automatically.

# Three length-2 arrays, each of which becomes one row of the matrix
ar_row_one <- c(-1, 1)
ar_row_two <- c(-3, -2)
ar_row_three <- c(0.35, 0.75)

# Stack the arrays as rows; the row names match the array names
mt_n_by_2 <- rbind(
  ar_row_one = ar_row_one,
  ar_row_two = ar_row_two,
  ar_row_three = ar_row_three
)
# Show the matrix, including its row names, as a styled table
kable_styling_fc(kable(mt_n_by_2))
ar_row_one -1.00 1.00
ar_row_two -3.00 -2.00
ar_row_three 0.35 0.75

1.3.1.2 Name Matrix Columns and Rows

# A 2 by 2 matrix of logical NA with rows named r1, r2 and columns named c1, c2
mt_named <- matrix(
  data = NA, nrow = 2, ncol = 2,
  dimnames = list(paste0("r", 1:2), paste0("c", 1:2))
)
mt_named
##    c1 c2
## r1 NA NA
## r2 NA NA

1.3.1.3 Generate NA Matrix

Allocate with NA, NA_real_, or NA_integer_. Being explicit about the type of the missing value is preferred.

# An empty matrix with logical NA
mt_na <- matrix(data = NA, nrow = 2, ncol = 2)
str(mt_na)
##  logi [1:2, 1:2] NA NA NA NA
# Empty matrixes with numeric NA: one double (NA_real_), one integer (NA_integer_)
mt_fl_na <- matrix(data = NA_real_, nrow = 2, ncol = 2)
mt_it_na <- matrix(data = NA_integer_, nrow = 2, ncol = 2)

str(mt_fl_na)
##  num [1:2, 1:2] NA NA NA NA
# Inspect the integer matrix as well, so that both numeric types are shown
str(mt_it_na)
##  int [1:2, 1:2] NA NA NA NA

1.3.1.4 Generate Matrixes with values

Random draw from the normal distribution, random draw from the uniform distribution, and combine resulting matrixes.

# Generate 15 random normal draws, arranged in 5 rows and 3 columns
mt_rnorm <- matrix(rnorm(15, mean = 0, sd = 1), nrow = 5, ncol = 3)

# Generate 15 random uniform draws and arrange them in 5 rows and 5 columns.
# Only 15 values are drawn for 25 cells, so the draws are recycled: columns 4
# and 5 repeat columns 1 and 2. The recycling is made explicit with rep_len()
# because newer versions of R warn when matrix() has to recycle data whose
# length does not match the size of the matrix.
mt_runif <- matrix(rep_len(runif(15, min = 0, max = 1), 25), nrow = 5, ncol = 5)

# Combine side by side into a 5 by 8 matrix
mt_rnorm_runif <- cbind(mt_rnorm, mt_runif)

# Show the combined matrix rounded to 3 digits
kable_styling_fc(kable(round(mt_rnorm_runif, 3)))
0.129 -0.446 -0.556 0.318 0.369 0.266 0.318 0.369
1.715 1.224 1.787 0.232 0.152 0.858 0.232 0.152
0.461 0.360 0.498 0.143 0.139 0.046 0.143 0.139
-1.265 0.401 -1.967 0.415 0.233 0.442 0.415 0.233
-0.687 0.111 0.701 0.414 0.466 0.799 0.414 0.466

Now we generate a matrix with sequential integers, and either fill matrix by columns or fill matrix by rows.

# byrow = FALSE fills the first column, then the second column, and so on
mt_index_colbycol <- matrix(0:15, nrow = 4, ncol = 4, byrow = FALSE)
# Show the column-filled matrix with a caption
kable_styling_fc(kable(
  mt_index_colbycol,
  caption = "with byrow=FALSE, the default, will fill col by col"
))
Table 1.1: with byrow=FALSE, the default, will fill col by col
0 4 8 12
1 5 9 13
2 6 10 14
3 7 11 15
# byrow = TRUE fills the first row, then the second row, and so on
mt_index_rowbyrow <- matrix(0:15, nrow = 4, ncol = 4, byrow = TRUE)
# Show the row-filled matrix with a caption
kable_styling_fc(kable(
  mt_index_rowbyrow,
  caption = " with byrow=TRUE, will fill row by row"
))
Table 1.1: with byrow=TRUE, will fill row by row
0 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15

1.3.1.5 Replace a Subset of Matrix Values by NA_real_

For values in matrix that fall below or above some thresholds, we will replace these values by NA_real_.

# Thresholds: cells below fl_min_val or above fl_max_val are blanked out
fl_max_val <- 0.8
fl_min_val <- 0.2
# Work on a copy so that mt_rnorm_runif itself is left unchanged
mt_rnorm_runif_bd <- mt_rnorm_runif
# Replace every out-of-bounds cell by NA_real_ using one logical mask
mt_rnorm_runif_bd[
  mt_rnorm_runif < fl_min_val | mt_rnorm_runif > fl_max_val
] <- NA_real_
# Print
print(mt_rnorm_runif_bd)
##           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]      [,8]
## [1,]        NA        NA        NA 0.3181810 0.3688455 0.2659726 0.3181810 0.3688455
## [2,]        NA        NA        NA 0.2316258        NA        NA 0.2316258        NA
## [3,] 0.4609162 0.3598138 0.4978505        NA        NA        NA        NA        NA
## [4,]        NA 0.4007715        NA 0.4145463 0.2330341 0.4422001 0.4145463 0.2330341
## [5,]        NA        NA 0.7013559 0.4137243 0.4659625 0.7989248 0.4137243 0.4659625

1.3.1.6 Sort Each Matrix Row or Column

Now we sort within each row or within each column of the random matrix.

# Sort within each row. apply() over rows returns each sorted row as a
# column, so the result is transposed back.
mt_rnorm_runif_row_sort <- t(apply(X = mt_rnorm_runif, MARGIN = 1, FUN = sort))
# Sort within each column; no transpose is needed in this direction
mt_rnorm_runif_col_sort <- apply(X = mt_rnorm_runif, MARGIN = 2, FUN = sort)
# Show the row-sorted matrix rounded to 3 digits
kable_styling_fc(kable(
  round(mt_rnorm_runif_row_sort, 3),
  caption = "Each row sort low to high"
))
Table 1.2: Each row sort low to high
-0.556 -0.446 0.129 0.266 0.318 0.318 0.369 0.369
0.152 0.152 0.232 0.232 0.858 1.224 1.715 1.787
0.046 0.139 0.139 0.143 0.143 0.360 0.461 0.498
-1.967 -1.265 0.233 0.233 0.401 0.415 0.415 0.442
-0.687 0.111 0.414 0.414 0.466 0.466 0.701 0.799
# Show the column-sorted matrix rounded to 3 digits
kable_styling_fc(kable(
  round(mt_rnorm_runif_col_sort, 3),
  caption = "Each column sort low to high"
))
Table 1.2: Each column sort low to high
-1.265 -0.446 -1.967 0.143 0.139 0.046 0.143 0.139
-0.687 0.111 -0.556 0.232 0.152 0.266 0.232 0.152
0.129 0.360 0.498 0.318 0.233 0.442 0.318 0.233
0.461 0.401 0.701 0.414 0.369 0.799 0.414 0.369
1.715 1.224 1.787 0.415 0.466 0.858 0.415 0.466

1.3.1.7 Compute Column and Row Statistics

Compute column and row means, and also column and row sums

# Column sums, rounded to 3 digits and joined with commas
print(paste0(
  "colSums=",
  paste(round(colSums(mt_rnorm_runif), 3), collapse = ",")
))
## [1] "colSums=0.353,1.65,0.464,1.521,1.359,2.411,1.521,1.359"
# Column means
print(paste0(
  "colMeans=",
  paste(round(colMeans(mt_rnorm_runif), 3), collapse = ",")
))
## [1] "colMeans=0.071,0.33,0.093,0.304,0.272,0.482,0.304,0.272"
# Row sums
print(paste0(
  "rowSums=",
  paste(round(rowSums(mt_rnorm_runif), 3), collapse = ",")
))
## [1] "rowSums=0.768,6.352,1.928,-1.094,2.683"
# Row means
print(paste0(
  "rowMeans=",
  paste(round(rowMeans(mt_rnorm_runif), 3), collapse = ",")
))
## [1] "rowMeans=0.096,0.794,0.241,-0.137,0.335"

1.3.1.8 Add Column to Matrix with Common Scalar Value

Given some matrix of information, add a column, where all rows of the column have the same numerical value. Use the matrix created prior. - R add column to matrix - r append column to matrix constant value

# Constant values for the new first and last columns
fl_new_first_col_val <- 111
fl_new_last_col_val <- 999
# Repeat each constant once per row of the existing matrix and bind the two
# new columns on either side of it
mt_with_more_columns <- cbind(
  rep(fl_new_first_col_val, times = nrow(mt_rnorm_runif)),
  mt_rnorm_runif,
  rep(fl_new_last_col_val, times = nrow(mt_rnorm_runif))
)
# Show the widened matrix
kable_styling_fc_wide(kable(mt_with_more_columns))
111 0.1292877 -0.4456620 -0.5558411 0.3181810 0.3688455 0.2659726 0.3181810 0.3688455 999
111 1.7150650 1.2240818 1.7869131 0.2316258 0.1524447 0.8578277 0.2316258 0.1524447 999
111 0.4609162 0.3598138 0.4978505 0.1428000 0.1388061 0.0458312 0.1428000 0.1388061 999
111 -1.2650612 0.4007715 -1.9666172 0.4145463 0.2330341 0.4422001 0.4145463 0.2330341 999
111 -0.6868529 0.1106827 0.7013559 0.4137243 0.4659625 0.7989248 0.4137243 0.4659625 999

1.3.2 Linear Algebra

Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.

1.3.2.1 Matrix Multiplication

Multiply Together a 3 by 2 matrix and a 2 by 1 vector

# Three length-2 arrays stacked as the rows of a 3 by 2 matrix
ar_row_one <- c(-1, 1)
ar_row_two <- c(-3, -2)
ar_row_three <- c(0.35, 0.75)
mt_n_by_2 <- rbind(
  ar_row_one = ar_row_one,
  ar_row_two = ar_row_two,
  ar_row_three = ar_row_three
)

# Length-2 array that the matrix is multiplied with
ar_row_four <- c(3, 4)

# Matrix product of the 3 by 2 matrix and the length-2 array: a 3 by 1 matrix
mt_out <- mt_n_by_2 %*% ar_row_four
print(mt_n_by_2)
##               [,1]  [,2]
## ar_row_one   -1.00  1.00
## ar_row_two   -3.00 -2.00
## ar_row_three  0.35  0.75
print(ar_row_four)
## [1] 3 4
print(mt_out)
##                [,1]
## ar_row_one     1.00
## ar_row_two   -17.00
## ar_row_three   4.05

1.4 Regular Expression, Date, etc.

1.4.1 String Regular Expression

Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.

1.4.1.1 Character Class

The regex documentation states that: “A character class is a list of characters enclosed between ‘[’ and ’]’ which matches any single character in that list”

First, in the example below, we look for strings that contain a single given letter, symbol, or number, which is listed inside the square brackets.

# Five strings, several of which contain regex metacharacters
ls_st_regex_charclass <- c(
  "00d",
  "z\\12323_4",
  "pa(_2+\\3",
  "p99.9_sdfasdpf0",
  "k9p.e_d+fd"
)
# TRUE for strings that contain the letter p
print(grepl(pattern = "[p]", x = ls_st_regex_charclass))
# TRUE for strings that contain a backslash
print(grepl(pattern = "[\\]", x = ls_st_regex_charclass))
# TRUE for strings that contain the number 3
print(grepl(pattern = "[3]", x = ls_st_regex_charclass))

# > print(grepl("[p]", ls_st_regex_charclass))
# [1] FALSE FALSE  TRUE  TRUE  TRUE
# > print(grepl("[\\]", ls_st_regex_charclass))
# [1] FALSE  TRUE  TRUE FALSE FALSE
# > print(grepl("[3]", ls_st_regex_charclass))
# [1] FALSE  TRUE  TRUE FALSE FALSE

Second, using the same set of words as examples, we now test whether the strings contain at least one of several letters, symbols, or numbers listed inside the square brackets.

# TRUE for strings that contain either the letter p or the letter d
print(grepl(pattern = "[pd]", x = ls_st_regex_charclass))
# TRUE for strings that contain either the letter p or an underscore
print(grepl(pattern = "[p_]", x = ls_st_regex_charclass))
# TRUE for strings that contain the letter p, an underscore, or the number 0
print(grepl(pattern = "[p_0]", x = ls_st_regex_charclass))

# > print(grepl('[pd]', ls_st_regex_charclass))
# [1]  TRUE FALSE  TRUE  TRUE  TRUE
# > print(grepl('[p_]', ls_st_regex_charclass))
# [1] FALSE  TRUE  TRUE  TRUE  TRUE
# > print(grepl('[p_0]', ls_st_regex_charclass))
# [1] TRUE TRUE TRUE TRUE TRUE

Third, using the caret ‘^’, we look for strings that contain any character other than the listed letters, numbers, and symbols. The documentation states that: “unless the first character of the list is the caret ‘^’, when it matches any character not in the list”.

# TRUE for strings that contain at least one character other than d and 0
print(grepl(pattern = "[^d0]", x = ls_st_regex_charclass))

# > print(grepl('[^d0]', ls_st_regex_charclass))
# [1] FALSE  TRUE  TRUE  TRUE  TRUE

1.4.1.2 Repetition Quantifiers

We have the following quantifiers:

  • ‘?’: The preceding item is optional and will be matched at most once.

  • ’*’: The preceding item will be matched zero or more times.

  • ‘+’:The preceding item will be matched one or more times.

  • ‘{n}’: The preceding item is matched exactly n times.

  • ‘{n,}’: The preceding item is matched n or more times.

  • ‘{n,m}’: The preceding item is matched at least n times, but not more than m times.

Now, we identify strings in which certain characters appear a certain number of times in a row.

# Five strings, several of which contain regex metacharacters
ls_st_regex_rep_quantifer <- c(
  "00d",
  "z\\12323_40",
  "ppa(_2+\\3",
  "p99.9_sdfasdpf0",
  "k9p.e_d+fd"
)
# TRUE for strings that contain two consecutive p characters
print(grepl(pattern = "[p]{2}", x = ls_st_regex_rep_quantifer))
# TRUE for strings that contain two consecutive 9 characters
print(grepl(pattern = "[9]{2}", x = ls_st_regex_rep_quantifer))

# > print(grepl("[p]{2}", ls_st_regex_rep_quantifer))
# [1] FALSE FALSE  TRUE FALSE FALSE
# > print(grepl("[9]{2}", ls_st_regex_rep_quantifer))
# [1] FALSE FALSE FALSE  TRUE FALSE

1.4.1.3 Matches Strings With Multiple Conditions with Repetition Quantifiers

Now we match string that satisfy multiple conditions jointly. We have the following quantifiers:

  • ‘?’: The preceding item is optional and will be matched at most once.
  • ’*’: The preceding item will be matched zero or more times.
  • ‘+’:The preceding item will be matched one or more times.
  • ‘{n}’: The preceding item is matched exactly n times.
  • ‘{n,}’: The preceding item is matched n or more times.
  • ‘{n,m}’: The preceding item is matched at least n times, but not more than m times.

First, we define our string array.

# Six example strings used by the joint-condition searches
ls_st_regex_joint <- c(
  "_asdf123p", "pz12p323_40_", "ppa(_2+\\3",
  "p9_sdfasdpf0", "p_k9p.e_d+fd", "p123k_dfk"
)

Second, we identify three cases below:

  1. Matching words containing just “p_”
  2. Matching words containing “p9_” (where 9 can be replaced by any other alpha-numeric character)
  3. Matching words containing either “p_” or “p9_”
# Case 1: a p immediately followed by an underscore
print(grepl(pattern = "p_", x = ls_st_regex_joint))
# Case 2: a p, then exactly one alpha-numeric character, then an underscore
print(grepl(pattern = "p[[:alnum:]]_", x = ls_st_regex_joint))
# Case 3: a p, then at most one alpha-numeric character, then an underscore,
# which covers both case 1 and case 2
print(grepl(pattern = "p[[:alnum:]]?_", x = ls_st_regex_joint))

# > print(grepl("p_", ls_st_regex_joint))
# [1] FALSE FALSE FALSE FALSE  TRUE FALSE
# > print(grepl("p[[:alnum:]]_", ls_st_regex_joint))
# [1] FALSE FALSE FALSE  TRUE FALSE FALSE
# > print(grepl("p[[:alnum:]]?_", ls_st_regex_joint))
# [1] FALSE FALSE FALSE  TRUE  TRUE FALSE

Third, we identify cases where the word contains a substring starting with “p” and ending with “_”, with any number (including 0) of alpha-numeric characters in between. Note:

  1. In the first string, both “_” and “p” appear, but “p” appears after, so does not match
  2. Note in the second word, “p” and “_” appear multiple times
  3. Note in the third word, “p” and “_” both appear, but are separated by a non-alpha-numeric character
# A p, then any number (including zero) of alpha-numerics, then an underscore
print(grepl(pattern = "p[[:alnum:]]*_", x = ls_st_regex_joint))

# > print(grepl("p[[:alnum:]]*_", ls_st_regex_joint))
# [1] FALSE  TRUE FALSE  TRUE  TRUE  TRUE

Fourth, we use an alternative repetition quantifier, the plus sign rather than the asterisk, which means there must be at least one alpha-numeric character between “p” and “_”. In this case, the fifth word no longer satisfies the search condition.

# A p, then one or more alpha-numeric characters, then an underscore
print(grepl(pattern = "p[[:alnum:]]+_", x = ls_st_regex_joint))

# > print(grepl("p[[:alnum:]]+_", ls_st_regex_joint))
# [1] FALSE  TRUE FALSE  TRUE FALSE  TRUE