Chapter 11 Get Data
11.1 Environmental Data
11.1.1 ECMWF ERA5 Data
Go back to fan’s REconTools research support package, R4Econ examples page, PkgTestR packaging guide, or Stat4Econ course page.
This file uses R with the reticulate package to download ECMWF ERA5 data. See this file for instructions and tutorials for downloading the data.
11.1.1.1 Program to Download, Unzip, Convert to combined CSV, derived-utci-historical data
The data downloaded from the CDS climate data store can become very large. We want to process the data one part at a time, summarize and aggregate over each part, and generate a final output file with aggregate statistics over the entire time period of interest.
The code below accomplishes the following tasks:
- download data from derived-utci-historical as ZIP
- unzip
- convert nc files to csv files
- write one combined csv file per half-year group of months
Parameter Control for the code below:
- spt_root: root folder where everything will be stored
- spth_conda_env: the conda virtual environment python path, eccodes and cdsapi packages are installed in the conda virtual environment. In the example below, the first env is: wk_ecmwf
- st_nc_prefix: each downloaded nc file name consists of a prefix, a date string, and a suffix. This is the string before the date.
- st_nc_suffix: see st_nc_prefix above; this is the string after the date in the nc file names
- ar_years: array of years to download and aggregate over
- ar_months_g1: months to download in first half year
- ar_months_g2: months to download in second half year
Note: area below corresponds to North, West, South, East.
#################################################
# ------------ Parameters
#################################################
# Root folder for all downloads and outputs, and the python executable of the
# conda environment passed to reticulate.
spt_root <- "C:/Users/fan/Downloads/_data/"
spth_conda_env <- "C:/ProgramData/Anaconda3/envs/wk_ecmwf/python.exe"

# Text surrounding the date string in each downloaded nc file name
st_nc_prefix <- "ECMWF_utci_"
st_nc_suffix <- "_v1.0_con.nc"

# Years to request (full range kept for reference)
# ar_years <- 2001:2019
ar_years <- c(2005, 2015)

# Months requested in the first half-year group
# ar_months_g1 <- c("01", "02", "03", "04", "05", "06")
ar_months_g1 <- c("01", "03")
# Months requested in the second half-year group
# ar_months_g2 <- c("07", "08", "09", "10", "11", "12")
ar_months_g2 <- c("07", "09")

# Bounding box of the request, in degrees: north, west, south, east
# # China
# fl_area_north <- 53.31
# fl_area_west <- 73
# fl_area_south <- 4.15
# fl_area_east <- 135
fl_area_north <- 53
fl_area_west <- 73
fl_area_south <- 52
fl_area_east <- 74

# Folder that receives the downloaded nc zip archives
nczippath <- spt_root
# Folder where the generated python API script (rewritten for each request)
# is stored
pyapipath <- spt_root
# Folder for the aggregate CSV covering all dates of a request
csvpath <- spt_root
#################################################
# ------------ Packages
#################################################
library("ncdf4")
library("chron")
library("lattice")
library("RColorBrewer")
library("stringr")
library("tibble")
library("dplyr")
Sys.setenv(RETICULATE_PYTHON = spth_conda_env)
library("reticulate")
#################################################
# ------------ Helpers
#################################################
# Return the date string embedded in an nc file name by removing the known
# prefix and suffix, e.g. ECMWF_utci_20100702_v1.0_con.nc -> 20100702.
# Matching is literal (startsWith/endsWith) rather than regex based, so the
# dots in the suffix cannot act as wildcards.
#   spt_file  : path or name of the nc file
#   st_prefix : text before the date string in the file name
#   st_suffix : text after the date string in the file name
# Returns the remaining middle part of the base name as a character scalar.
ff_nc_date_from_name <- function(spt_file, st_prefix, st_suffix) {
  snm_file <- basename(spt_file)
  if (endsWith(snm_file, st_suffix)) {
    snm_file <- substr(snm_file, 1, nchar(snm_file) - nchar(st_suffix))
  }
  if (startsWith(snm_file, st_prefix)) {
    snm_file <- substr(snm_file, nchar(st_prefix) + 1, nchar(snm_file))
  }
  snm_file
}

# Read the "utci" variable of one nc file into a long data frame with columns
# lon, lat, hours, temperature and date, keeping only grid points inside the
# given latitude/longitude rectangle (bounds inclusive).
#   spt_file      : path to the nc file
#   snm_file_date : date label stored in the `date` column
#   fl_lat_min, fl_lat_max, fl_lon_min, fl_lon_max : rectangle bounds
# Returns a tibble. The nc file is closed even if reading fails.
ff_utci_nc_to_df <- function(spt_file, snm_file_date,
                             fl_lat_min, fl_lat_max,
                             fl_lon_min, fl_lon_max) {
  ncin <- nc_open(spt_file)
  on.exit(nc_close(ncin), add = TRUE)

  lon <- ncvar_get(ncin, "lon")
  lat <- ncvar_get(ncin, "lat")
  # time is kept as the raw numeric value stored in the file
  tim <- ncvar_get(ncin, "time")

  # presumably Kelvin to Celsius -- TODO confirm the units of "utci"
  tmp_array <- ncvar_get(ncin, "utci") - 273.15

  # NOTE(review): assumes the utci array is ordered (lon, lat, time) so that
  # as.vector() lines up with the rows of expand.grid() -- confirm
  lonlat <- as.matrix(expand.grid(lon = lon, lat = lat, hours = tim))
  tmp_df <- data.frame(cbind(lonlat, temperature = as.vector(tmp_array)))

  # keep only the requested rectangle
  subset_df <- tmp_df[tmp_df$lat >= fl_lat_min & tmp_df$lat <= fl_lat_max &
                        tmp_df$lon >= fl_lon_min & tmp_df$lon <= fl_lon_max, ]

  as_tibble(subset_df) %>% mutate(date = snm_file_date)
}

#################################################
# ------------ Loop invariants
#################################################
# Rectangle extracted from every nc file; eps guards the inclusive bounds
# against floating point noise in the stored coordinates.
eps <- 1e-8
minlat <- 22.25 - eps
maxlat <- 23.50 + eps
minlon <- 113.00 - eps
maxlon <- 114.50 + eps

# The extraction rectangle is fixed while the download area is a parameter.
# If the two do not intersect, every output CSV will be empty, so say so.
if (maxlat < fl_area_south || minlat > fl_area_north ||
      maxlon < fl_area_west || minlon > fl_area_east) {
  warning(
    "Extraction rectangle (lat 22.25-23.50, lon 113.00-114.50) does not ",
    "overlap the download area; the output CSV files will contain no rows.",
    call. = FALSE
  )
}

# Location of the generated python API script
fl_test_tex <- paste0(pyapipath, "api.py")

# The working directory change is retained because code that runs after this
# section may rely on it; the python script itself is sourced by full path.
setwd(pyapipath)
use_python(spth_conda_env)

#################################################
# ------------ Define Loops
#################################################
for (it_yr in ar_years) {
  for (it_mth_group in c(1, 2)) {

    ar_months <- if (it_mth_group == 1) ar_months_g1 else ar_months_g2

    #################################################
    # ------------ Define Python API Call
    #################################################
    # Zip and unzip names carry the year and month group, so leftovers from
    # an interrupted iteration cannot be picked up by a different request.
    unzipfolder <- paste0("derived_utci_", it_yr, "_", it_mth_group)
    nczipname <- paste0(unzipfolder, ".zip")

    # The python source below is one multi-line string literal; its lines
    # deliberately start at column 0 because they are written to api.py as is.
    st_file <- paste0("import cdsapi
import urllib.request
# download folder
spt_root = '", nczippath, "'
spn_dl_test_grib = spt_root + '", nczipname, "'
# request
c = cdsapi.Client()
res = c.retrieve(
'derived-utci-historical',
{
'format': 'zip',
'variable': 'Universal thermal climate index',
'product_type': 'Consolidated dataset',
'year': '", it_yr, "',
'month': [
", paste0("'", ar_months, "'", collapse = ", "), "
],
'day': [
'01','03'
],
'area' : [", fl_area_north, ", ", fl_area_west, ", ", fl_area_south, ", ", fl_area_east, "],
'grid' : [0.25, 0.25],
},
spn_dl_test_grib)
# show results
print('print results')
print(res)
print(type(res))")

    # Store Python Api File (writeLines opens and closes the file itself)
    writeLines(st_file, fl_test_tex)

    #################################################
    # ------------ Run Python File
    #################################################
    source_python(fl_test_tex)

    #################################################
    # ------------ Unzip
    #################################################
    spn_zip <- paste0(nczippath, nczipname)
    spn_unzip_folder <- paste0(nczippath, unzipfolder)
    unzip(spn_zip, exdir = spn_unzip_folder)

    #################################################
    # ------------ Find All files
    #################################################
    # Only files whose name ends in ".nc"; list.files returns them sorted,
    # so the first and last entries give the start and end dates.
    ls_sfls <- list.files(
      path = spn_unzip_folder, recursive = TRUE,
      pattern = "\\.nc$", full.names = TRUE
    )

    if (length(ls_sfls) == 0) {
      # Nothing to aggregate: clean up and move on rather than writing a CSV
      # under start/end dates left over from an earlier iteration.
      warning(
        "No nc files found for year ", it_yr, ", month group ", it_mth_group,
        "; skipping.",
        call. = FALSE
      )
      unlink(spn_zip, recursive = TRUE, force = TRUE)
      unlink(spn_unzip_folder, recursive = TRUE, force = TRUE)
      next
    }

    # Date string of every file, given the structure
    # ECMWF_utci_20100702_v1.0_con.nc
    ar_snm_dates <- vapply(
      ls_sfls, ff_nc_date_from_name, character(1),
      st_prefix = st_nc_prefix, st_suffix = st_nc_suffix,
      USE.NAMES = FALSE
    )
    # With a single file the start and end dates coincide
    snm_start_date <- ar_snm_dates[1]
    snm_end_date <- ar_snm_dates[length(ar_snm_dates)]

    #################################################
    # ------------ Combine individual NC files to JOINT Dataframe
    #################################################
    ls_df <- vector(mode = "list", length = length(ls_sfls))
    for (it_df_ctr in seq_along(ls_sfls)) {
      spt_file <- ls_sfls[it_df_ctr]
      print(spt_file)
      ls_df[[it_df_ctr]] <- ff_utci_nc_to_df(
        spt_file, ar_snm_dates[it_df_ctr],
        fl_lat_min = minlat, fl_lat_max = maxlat,
        fl_lon_min = minlon, fl_lon_max = maxlon
      )
    }

    # List of DF to one DF
    df_all_nc <- bind_rows(ls_df)

    # Save File
    fname <- paste0(
      st_nc_prefix, snm_start_date, "_to_", snm_end_date, ".csv"
    )
    csvfile <- paste0(csvpath, fname)
    write.table(na.omit(df_all_nc), csvfile, row.names = FALSE, sep = ",")

    # Delete the downloaded zip and the unzipped folder
    unlink(spn_zip, recursive = TRUE, force = TRUE)
    unlink(spn_unzip_folder, recursive = TRUE, force = TRUE)

    # end loop months groups
  }
  # end loop year
}