#### Setup ####

library(tidyverse)
library(ncdf4)
library(Matrix)
#also need cdo installed for command line https://code.mpimet.mpg.de/projects/cdo/wiki

#make sure these are set to the correct locations
#project_dir is where 00_functions.R is sourced from and where results are written,
#data_dir is the root of the input data read by every section below
project_dir = '/mnt/r/wasserstein/'
data_dir = '/mnt/r/historical/'
#NOTE(review): ymd_range() and wasserstein_1d() are not defined in this script,
#presumably they come from this sourced file - confirm
source(paste0(project_dir,'00_functions.R'))

#directory name to save results
slice_dir = paste0(project_dir,'globalmean/')

#### Setup slicing method ####

#create folders to store sliced data
#every folder is checked on its own, so a tree that was only partially created
#(e.g. by an interrupted run) is completed instead of being skipped entirely
for(sub_dir in c('',
                 'tas/','tas/weights','tas/cmip5','tas/cmip6',
                 'pr/','pr/weights','pr/cmip5','pr/cmip6')){
  out_dir = paste0(slice_dir,sub_dir)
  if(!dir.exists(out_dir)){
    dir.create(out_dir,recursive = TRUE)
  }
}
rm(sub_dir,out_dir)

#### Daily Average Surface Air Temperature (TAS) ####

#### Slicing ERA5 ####

#use cdo to create our area weights; the 'cell_area' field of the output file is read below
system(paste0('cdo gridarea ',
              data_dir,'era5/tas/hourly/era5_tas_hourly_1979_01.nc ',
              slice_dir,'tas/weights/era5_tas_hourly_1979_01.nc'))

#set wd to location of daily files from 02_hourly_to_daily.R
setwd(paste0(data_dir,'era5/tas/daily/'))
Sys.sleep(1)

#calculate # of days ending in November, 2005 since each file has a month of data
n_days = length(seq(as.Date('1979-01-01'),as.Date('2005-11-30'),by='1 day'))

#save vector for full time series of global means
era5_gm = numeric(n_days)

#read the cell areas, then release the file handle since only the values are needed
era5_nc = nc_open(paste0(slice_dir,'tas/weights/era5_tas_hourly_1979_01.nc'))
era5_weights = ncvar_get(era5_nc,'cell_area')
nc_close(era5_nc)

#flatten the weights to one vector so they line up with the flattened data below
d = dim(era5_weights)
dim(era5_weights) = d[1]*d[2]

t = 1 #position in era5_gm where the next file's days start
#sort by year and subset files we need
#323 = 26 full years (1979-2004) * 12 + 11 months of 2005, at one file per month
#(assumes the first file in the folder is January 1979)
era5_files = sort(list.files())[1:323]
for(f in era5_files){
  #open daily file and reverse its dimension order
  era5_temp = aperm(readRDS(f),3:1) #lon, lat, day
  
  #flatten spatial dimension so that each column holds one day
  d = dim(era5_temp)
  dim(era5_temp) = c(d[1]*d[2],d[3])
  
  #calculate global means: area-weighted sum of each day divided by the total area
  era5_gm_subset = as.numeric(era5_weights %*% era5_temp)/sum(era5_weights)
  
  #store this file's days in the full series and move the position forward
  era5_gm[t:(t + d[3] - 1)] = era5_gm_subset
  t = t + d[3]
}

saveRDS(era5_gm,paste0(slice_dir,'tas/era5.RDS'))

#test the time series
ggplot()+
  geom_histogram(aes(era5_gm))


#clean up, remove the large era5 objects from memory
rm(era5_temp,era5_weights)
gc()



#### Slicing NCEP ####

#get file names
setwd(paste0(data_dir,'ncep/tas/'))
Sys.sleep(1)
ncep_files = sort(list.files())[1:27] #sort by year and subset files we need

#use cdo to compute the cell areas of the NCEP grid from the first file and save them for the weighting below
system(paste0('cdo gridarea ',
              data_dir,'ncep/tas/',ncep_files[1],' ',
              slice_dir,'tas/weights/',ncep_files[1]))

#calculate # of days (full years this time, we'll subset later)
n_days = length(seq(as.Date('1979-01-01'),as.Date('2005-12-31'),by='1 day'))

#save vector for full time series of global means
ncep_gm = numeric(n_days)

#read the cell areas, release the file handle, and flatten them to one vector
ncep_nc = nc_open(paste0(slice_dir,'tas/weights/',ncep_files[1]))
ncep_weights = ncvar_get(ncep_nc,'cell_area')
nc_close(ncep_nc)
d = dim(ncep_weights)
dim(ncep_weights) = d[1]*d[2]

t = 1 #position in ncep_gm where the next file's days start
for(f in ncep_files){
  #open daily file and convert to celsius
  ncep_nc = nc_open(f)
  ncep_temp = ncvar_get(ncep_nc,'air') - 273.15
  nc_close(ncep_nc) #each file is only needed for this one read
  
  #flatten spatial dimension so that each column holds one day
  d = dim(ncep_temp)
  dim(ncep_temp) = c(d[1]*d[2],d[3])
  
  #calculate global means: area-weighted sum of each day divided by the total area
  ncep_gm_subset = as.numeric(ncep_weights %*% ncep_temp)/sum(ncep_weights)
  
  #store this file's days in the full series and move the position forward
  ncep_gm[t:(t + d[3] - 1)] = ncep_gm_subset
  t = t + d[3]
}

#drop December 2005 so the series ends on 2005-11-30
pre_dec_2005 = length(seq(as.Date('1979-01-01'),as.Date('2005-11-30'),by='1 day'))
ncep_gm = ncep_gm[1:pre_dec_2005]

#test
ggplot()+
  geom_histogram(aes(ncep_gm))

saveRDS(ncep_gm,paste0(slice_dir,'tas/ncep.RDS'))

#clean up, remove the large ncep objects from memory
rm(ncep_temp,ncep_weights)
gc()



#### Slicing CMIP5 ####

setwd(paste0(data_dir,'cmip5tas'))
Sys.sleep(1)

#process file names to make a data frame: one row per file, one column per "_"-separated field
ncs = sort(list.files())
meta5 = strsplit(ncs,"_")
meta5 = do.call(rbind.data.frame,meta5)
colnames(meta5) = c('variable','frequency','model','experiment','ensemble','years')

#start date, end date (yyyymmdd as numbers), will show why we end in november in a few lines
sd = 19790101
ed = 20051130

#Given the start and end date, tag each file with its temporal contents for filtering
meta5 = meta5 %>%
  separate(years,c('start','end')) %>%
  mutate(start = as.numeric(start),
         end = as.numeric(end),
         pre1979 = start<=sd,
         post2005 = end>=ed,
         contains_full = pre1979 & post2005,
         out_of_range = (start>ed)|(end<sd),
         has200511 = (end>=ed)&(start<=ed))

#Take all models and view the end date, sorting by earliest date first
meta5 %>% group_by(model) %>% summarise(start = min(start),end = max(end)) %>% arrange(end)
#Two models, HadGEM2-CC and HadGEM2-ES, end in November 2005, so we'll take that as our end date

#append file names, then filter files that contain no relevant dates
meta5$file_name = ncs
meta5 = meta5 %>% arrange(model,start) %>% dplyr::filter(out_of_range==FALSE)

#gather some info (grid size and calendar of every file)
#the vectors are allocated up front instead of being grown inside the loop
n_files = nrow(meta5)
lon_res = rep(NA_integer_,n_files)
lat_res = rep(NA_integer_,n_files)
calendar = rep(NA_character_,n_files)
last_model='none'

for(i in seq_len(n_files)){
  curr_model = meta5$model[i]
  mod = nc_open(meta5$file_name[i])
  calendar[i] = mod$dim$time$calendar
  lon_res[i] = mod$dim$lon$len
  lat_res[i] = mod$dim$lat$len
  nc_close(mod) #only the header information is needed here
  
  #use cdo to compute the cell areas once per model (from the first file of each model)
  #and save them for the weighting step
  if(curr_model!=last_model){
    system(paste0('cdo gridarea -selvar,tas -seltimestep,1 ',
                  data_dir,'cmip5tas/',meta5$file_name[i],' ',
                  slice_dir,'tas/weights/',meta5$file_name[i]),ignore.stderr = TRUE)
  }
  
  last_model = curr_model
}

#final metadata data frame
#keep1/keep2 are the first and last time index to read from each file:
#files that straddle the start/end date are cut there, all others are read in full
meta5$lon_res = lon_res
meta5$lat_res = lat_res
meta5$calendar = calendar
meta5 = meta5 %>%
  rowwise() %>%
  mutate(keep1 = ifelse(pre1979,match(sd,ymd_range(start,end,calendar)),1),
         keep2 = ifelse(post2005,match(ed,ymd_range(start,end,calendar)),length(ymd_range(start,end,calendar)))) %>%
  ungroup() #drop the rowwise grouping once the per-file indices are computed

#clean up before processing the output
rm(lon_res,lat_res,calendar,sd,ed,i,ncs,n_files)
gc()

#number of days in the target period under each calendar a model may use
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))


#loop over the files (ordered by model, then start date), compute the area-weighted
#global mean of every day, and save one series per model once its files are done
last_model = "none"
for(i in seq_len(nrow(meta5))){
  
  #model this file belongs to
  curr_model = meta5$model[i]
  
  #open the model output file
  mod = nc_open(meta5$file_name[i])
  
  #if it's a new model, set some parameters
  if(curr_model != last_model){
    if(last_model != 'none'){
      #the previous model is complete: save and plot it, then clean up
      saveRDS(mod_gm,paste0(slice_dir,'tas/cmip5/',last_model,'.RDS'))
      print(ggplot()+geom_histogram(aes(mod_gm),binwidth=0.1)+ggtitle(last_model))
      rm(mod_weights)
      gc()
    }
    
    last_model = curr_model
    t=1 #position in mod_gm where the next file's days start
    
    #allocate the series with the day count that matches the model's calendar
    if(mod$dim$time$calendar %in% c('noleap','365_day')){
      mod_gm = numeric(n_days_365)
    }else if(mod$dim$time$calendar == '360_day'){
      mod_gm = numeric(n_days_360)
    }else{
      mod_gm = numeric(n_days)
    }
    
    #cell area weights (written by cdo for the first file of each model), flattened to one vector
    mod_nc = nc_open(paste0(slice_dir,'tas/weights/',meta5$file_name[i]))
    mod_weights = ncvar_get(mod_nc,'cell_area')
    nc_close(mod_nc) #release the handle before dropping the object
    d = dim(mod_weights)
    dim(mod_weights) = d[1]*d[2]
    rm(mod_nc)
    gc()
  }
  
  #read in the days keep1..keep2 of this file, convert to celsius
  mod_temp = ncvar_get(mod, "tas", c(1,1,meta5$keep1[i]), c(-1,-1,1+meta5$keep2[i]-meta5$keep1[i])) - 273.15 #long, lat, day
  nc_close(mod) #the file is not needed after this read
  
  #flatten spatial dimension so that each column holds one day
  d = dim(mod_temp)
  dim(mod_temp) = c(d[1]*d[2],d[3])
  
  #calculate global means: area-weighted sum of each day divided by the total area
  mod_gm_subset = as.numeric(mod_weights %*% mod_temp)/sum(mod_weights)
  
  #store this file's days in the model's series and move the position forward
  mod_gm[t:(t + d[3] - 1)] = mod_gm_subset
  t = t + d[3]

  rm(mod_temp)
  gc()
}
#the last model is never followed by a model change, so it is saved here
saveRDS(mod_gm,paste0(slice_dir,'tas/cmip5/',curr_model,'.RDS'))

#test histogram
ggplot()+
  geom_histogram(aes(mod_gm))+
  ggtitle(curr_model)

#mod_temp was already removed inside the loop, only the weights are left to drop
rm(mod_weights)
gc()



#### Slicing CMIP6 ####

setwd(paste0(data_dir,'cmip6tas'))
Sys.sleep(1)

#process file names to make a data frame: one row per file, one column per "_"-separated field
ncs = sort(list.files())
meta6 = strsplit(ncs,"_")
meta6 = do.call(rbind.data.frame,meta6)
colnames(meta6) = c('variable','frequency','model','experiment','ensemble','grid','years')

#start date, end date (yyyymmdd as numbers)
sd = 19790101
ed = 20051130

#Given the start and end date, tag each file with its temporal contents for filtering
meta6 = meta6 %>%
  separate(years,c('start','end')) %>%
  mutate(start = as.numeric(start),
         end = as.numeric(end),
         pre1979 = start<=sd,
         post2005 = end>=ed,
         contains_full = pre1979 & post2005,
         out_of_range = (start>ed)|(end<sd),
         has200511 = (end>=ed)&(start<=ed))

#Take all models and view the end date, sorting by earliest date first
meta6 %>% group_by(model) %>% summarise(start = min(start),end = max(end)) %>% arrange(end)
#NOTE(review): the comment that used to sit here named HadGEM2-CC and HadGEM2-ES ending in
#November 2005; the same remark appears in the CMIP5 part of this script, so the end date
#is presumably reused here to keep the periods comparable - confirm

#append file names, then filter files that contain no relevant dates
meta6$file_name = ncs
meta6 = meta6 %>% arrange(model,start) %>% dplyr::filter(out_of_range==FALSE)

#overlapping model dates, need to end early to compensate
#the four remaining CESM2-WACCM-FV2 files get hand-set end dates (one value per file, in start order)
meta6$end[which(meta6$model=='CESM2-WACCM-FV2')] = c(19791211,19891221,19991231,20100110)

#gather some info (grid size and calendar of every file)
#the vectors are allocated up front instead of being grown inside the loop
n_files = nrow(meta6)
lon_res = rep(NA_integer_,n_files)
lat_res = rep(NA_integer_,n_files)
calendar = rep(NA_character_,n_files)
last_model='none'

for(i in seq_len(n_files)){
  curr_model = meta6$model[i]
  mod = nc_open(meta6$file_name[i])
  calendar[i] = mod$dim$time$calendar
  #ICON-ESM-LR is special-cased: no lon/lat sizes are read and the resolution stays NA
  if(curr_model != "ICON-ESM-LR"){
    lon_res[i] = mod$dim$lon$len
    lat_res[i] = mod$dim$lat$len
  }
  nc_close(mod) #only the header information is needed here
  
  #use cdo to compute the cell areas once per model (from the first file of each model)
  #and save them for the weighting step
  if(curr_model!=last_model){
    system(paste0('cdo gridarea -selvar,tas -seltimestep,1 ',
                  data_dir,'cmip6tas/',meta6$file_name[i],' ',
                  slice_dir,'tas/weights/',meta6$file_name[i]),ignore.stderr = TRUE)
  }
  
  last_model = curr_model
}


meta6$calendar = calendar
meta6$lon_res = lon_res
meta6$lat_res = lat_res

#keep1/keep2 are the first and last time index to read from each file:
#files that straddle the start/end date are cut there, all others are read in full
meta6 = meta6 %>%
  rowwise() %>%
  mutate(keep1 = ifelse(pre1979,match(sd,ymd_range(start,end,calendar)),1),
         keep2 = ifelse(post2005,match(ed,ymd_range(start,end,calendar)),length(ymd_range(start,end,calendar)))) %>%
  ungroup() #drop the rowwise grouping once the per-file indices are computed
#clean up before processing the output
rm(lon_res,lat_res,calendar,sd,ed,i,ncs,n_files)
gc()

#number of days in the target period under each calendar a model may use
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))

#loop over the files (ordered by model, then start date), compute the area-weighted
#global mean of every day, and save one series per model once its files are done
last_model = 'none'
for(i in seq_len(nrow(meta6))){
  
  #model this file belongs to
  curr_model = meta6$model[i]
  
  #open the model output file
  mod = nc_open(meta6$file_name[i])
  
  #if it's a new model, set some parameters
  if(curr_model != last_model){
    if(last_model != 'none'){
      #the previous model is complete: save and plot it, then clean up
      saveRDS(mod_gm,paste0(slice_dir,'tas/cmip6/',last_model,'.RDS'))
      print(ggplot()+geom_histogram(aes(mod_gm),binwidth=0.1)+ggtitle(last_model))
      rm(mod_weights)
      gc()
    }
    
    last_model = curr_model
    t = 1 #position in mod_gm where the next file's days start
    
    #allocate the series with the day count that matches the model's calendar
    if(mod$dim$time$calendar %in% c('noleap','365_day')){
      mod_gm = numeric(n_days_365)
    }else if(mod$dim$time$calendar == '360_day'){
      mod_gm = numeric(n_days_360)
    }else{
      mod_gm = numeric(n_days)
    }
    
    #cell area weights (written by cdo for the first file of each model)
    mod_nc = nc_open(paste0(slice_dir,'tas/weights/',meta6$file_name[i]))
    mod_weights = ncvar_get(mod_nc,'cell_area')
    nc_close(mod_nc) #release the handle before dropping the object
    #ICON-ESM-LR weights are used as read, all other grids are flattened to one vector
    if(curr_model != "ICON-ESM-LR"){
      d = dim(mod_weights)
      dim(mod_weights) = d[1]*d[2]
    }
    rm(mod_nc)
    gc()
  }
  
  #special case handling for ICON icosahedron model (no need to flatten spatial dimension)
  if(curr_model == "ICON-ESM-LR"){
    #read in the days keep1..keep2 of this file, convert to celsius
    mod_temp = ncvar_get(mod, "tas", c(1,meta6$keep1[i]), c(-1,1+meta6$keep2[i]-meta6$keep1[i])) - 273.15 #cell, day
    #only d[3] (the number of days) is used below, so the spatial entries stay NA
    d = c(NA,NA,dim(mod_temp)[2])
  }else{
    #read in the days keep1..keep2 of this file, convert to celsius
    mod_temp = ncvar_get(mod, "tas", c(1,1,meta6$keep1[i]), c(-1,-1,1+meta6$keep2[i]-meta6$keep1[i])) - 273.15 #long, lat, day
    
    #flatten spatial dimension so that each column holds one day
    d = dim(mod_temp)
    dim(mod_temp) = c(d[1]*d[2],d[3])
  }
  nc_close(mod) #the file is not needed after this read
  
  #calculate global means: area-weighted sum of each day divided by the total area
  mod_gm_subset = as.numeric(mod_weights %*% mod_temp)/sum(mod_weights)
  
  #store this file's days in the model's series and move the position forward
  mod_gm[t:(t + d[3] - 1)] = mod_gm_subset
  t = t + d[3]
  
  rm(mod_temp)
  gc()
}
#the last model is never followed by a model change, so it is saved here
saveRDS(mod_gm,paste0(slice_dir,'tas/cmip6/',curr_model,'.RDS'))

#test histogram of the last model
print(ggplot()+geom_histogram(aes(mod_gm),binwidth=0.1)+ggtitle(last_model))

rm(mod_weights)
gc()



#### Daily Total Precipitation (PR) ####

#### Slicing ERA5 ####

#use cdo to create our area weights; the 'cell_area' field of the output file is read below
system(paste0('cdo gridarea ',
              data_dir,'era5/pr/hourly/era5_pr_hourly_1979_01.nc ',
              slice_dir,'pr/weights/era5_pr_hourly_1979_01.nc'))

#set wd to location of daily files from 02_hourly_to_daily.R
setwd(paste0(data_dir,'era5/pr/daily/'))
Sys.sleep(1)

#calculate # of days ending in November, 2005 since each file has a month of data
n_days = length(seq(as.Date('1979-01-01'),as.Date('2005-11-30'),by='1 day'))

#save vector for full time series of global means
era5_gm = numeric(n_days)

#read the cell areas, then release the file handle since only the values are needed
era5_nc = nc_open(paste0(slice_dir,'pr/weights/era5_pr_hourly_1979_01.nc'))
era5_weights = ncvar_get(era5_nc,'cell_area')
nc_close(era5_nc)

#flatten the weights to one vector so they line up with the flattened data below
d = dim(era5_weights)
dim(era5_weights) = d[1]*d[2]

t = 1 #position in era5_gm where the next file's days start
#sort by year and subset files we need
#323 = 26 full years (1979-2004) * 12 + 11 months of 2005, at one file per month
#(assumes the first file in the folder is January 1979)
era5_files = sort(list.files())[1:323]
for(f in era5_files){
  #open daily file, reverse its dimension order and scale by 1000
  #NOTE(review): the factor presumably converts metres to millimetres - confirm units
  era5_temp = aperm(readRDS(f),3:1)*1000 #lon, lat, day
  
  #flatten spatial dimension so that each column holds one day
  d = dim(era5_temp)
  dim(era5_temp) = c(d[1]*d[2],d[3])
  
  #calculate global means: area-weighted sum of each day divided by the total area
  era5_gm_subset = as.numeric(era5_weights %*% era5_temp)/sum(era5_weights)
  
  #store this file's days in the full series and move the position forward
  era5_gm[t:(t + d[3] - 1)] = era5_gm_subset
  t = t + d[3]
}

saveRDS(era5_gm,paste0(slice_dir,'pr/era5.RDS'))

#test the time series
ggplot()+
  geom_histogram(aes(era5_gm))

#clean up, remove the large era5 objects from memory
rm(era5_temp,era5_weights)
gc()



#### Slicing NCEP ####

#get file names
setwd(paste0(data_dir,'ncep/pr/'))
Sys.sleep(1)
ncep_files = sort(list.files())[1:27] #sort by year and subset files we need

#use cdo to compute the cell areas of the NCEP grid from the first file and save them for the weighting below
system(paste0('cdo gridarea ',
              data_dir,'ncep/pr/',ncep_files[1],' ',
              slice_dir,'pr/weights/',ncep_files[1]))

#calculate # of days (full years this time, we'll subset later)
n_days = length(seq(as.Date('1979-01-01'),as.Date('2005-12-31'),by='1 day'))

#save vector for full time series of global means
ncep_gm = numeric(n_days)

#read the cell areas, release the file handle, and flatten them to one vector
ncep_nc = nc_open(paste0(slice_dir,'pr/weights/',ncep_files[1]))
ncep_weights = ncvar_get(ncep_nc,'cell_area')
nc_close(ncep_nc)
d = dim(ncep_weights)
dim(ncep_weights) = d[1]*d[2]

t = 1 #position in ncep_gm where the next file's days start
for(f in ncep_files){
  #open daily file and scale the precipitation rate by 86400 (seconds per day)
  #NOTE(review): presumably converts a per-second rate to a daily total - confirm units
  ncep_nc = nc_open(f)
  ncep_temp = ncvar_get(ncep_nc,'prate')*86400
  nc_close(ncep_nc) #each file is only needed for this one read
  
  #flatten spatial dimension so that each column holds one day
  d = dim(ncep_temp)
  dim(ncep_temp) = c(d[1]*d[2],d[3])
  
  #calculate global means: area-weighted sum of each day divided by the total area
  ncep_gm_subset = as.numeric(ncep_weights %*% ncep_temp)/sum(ncep_weights)
  
  #store this file's days in the full series and move the position forward
  ncep_gm[t:(t + d[3] - 1)] = ncep_gm_subset
  t = t + d[3]
}

#drop December 2005 so the series ends on 2005-11-30
pre_dec_2005 = length(seq(as.Date('1979-01-01'),as.Date('2005-11-30'),by='1 day'))
ncep_gm = ncep_gm[1:pre_dec_2005]

#test
ggplot()+
  geom_histogram(aes(ncep_gm))

saveRDS(ncep_gm,paste0(slice_dir,'pr/ncep.RDS'))

#clean up, remove the large ncep objects from memory
rm(ncep_temp,ncep_weights)
gc()


#### Slicing GPCP ####

#get file names
setwd(paste0(data_dir,'gpcp/daily/'))
Sys.sleep(1)
gpcp_files = sort(list.files()) #sorted; the files we need are selected in the loop below

#use cdo to compute the cell areas of the GPCP grid from the first file and save them for the weighting below
system(paste0('cdo gridarea ',
              data_dir,'gpcp/daily/',gpcp_files[1],' ',
              slice_dir,'pr/weights/',gpcp_files[1]))

#calculate # of days from October 1996 through November 2005
n_days = length(seq(as.Date('1996-10-01'),as.Date('2005-11-30'),by='1 day'))

#save vector for full time series of global means
gpcp_gm = numeric(n_days)

#read the cell areas, release the file handle, and flatten them to one vector
gpcp_nc = nc_open(paste0(slice_dir,'pr/weights/',gpcp_files[1]))
gpcp_weights = ncvar_get(gpcp_nc,'cell_area')
nc_close(gpcp_nc)
d = dim(gpcp_weights)
dim(gpcp_weights) = d[1]*d[2]

t = 1 #position in gpcp_gm of the next day
#one value is stored per file, so the first n_days files are used
for(f in gpcp_files[1:n_days]){
  #start from the full set of weights for every day
  gpcp_weights_temp = gpcp_weights
  
  #open daily file, read precipitation, and release the handle straight away
  #(thousands of files are visited, so leaving them open would pile up handles)
  gpcp_nc = nc_open(f)
  gpcp_temp = ncvar_get(gpcp_nc,'precip')
  nc_close(gpcp_nc)
  d = dim(gpcp_temp)
  dim(gpcp_temp) = c(d[1]*d[2])
  
  #set weights for missing data to 0 to exclude them from the global mean
  #negative values are treated as missing; NA cells are treated the same way and
  #zeroed in the data too, because 0 * NA would still turn the whole mean into NA
  gpcp_missing = is.na(gpcp_temp) | gpcp_temp<0
  gpcp_weights_temp[gpcp_missing]=0
  gpcp_temp[gpcp_missing]=0
  
  #calculate global mean over the cells that have data
  gpcp_gm_subset = as.numeric(gpcp_weights_temp %*% gpcp_temp)/sum(gpcp_weights_temp)
  
  #store this day's value and move the position forward
  gpcp_gm[t] = gpcp_gm_subset
  t=t+1
}

#test
ggplot()+
  geom_histogram(aes(gpcp_gm))

saveRDS(gpcp_gm,paste0(slice_dir,'pr/gpcp.RDS'))

#clean up, remove the large gpcp objects from memory
rm(gpcp_temp,gpcp_weights,gpcp_weights_temp,gpcp_missing)
gc()




#### Slicing CMIP5 ####

setwd(paste0(data_dir,'cmip5pr'))
Sys.sleep(1)

#process file names to make a data frame: one row per file, one column per "_"-separated field
ncs = sort(list.files())
meta5 = strsplit(ncs,"_")
meta5 = do.call(rbind.data.frame,meta5)
colnames(meta5) = c('variable','frequency','model','experiment','ensemble','years')

#start date, end date (yyyymmdd as numbers), will show why we end in november in a few lines
sd = 19790101
ed = 20051130

#Given the start and end date, tag each file with its temporal contents for filtering
meta5 = meta5 %>%
  separate(years,c('start','end')) %>%
  mutate(start = as.numeric(start),
         end = as.numeric(end),
         pre1979 = start<=sd,
         post2005 = end>=ed,
         contains_full = pre1979 & post2005,
         out_of_range = (start>ed)|(end<sd),
         has200511 = (end>=ed)&(start<=ed))

#Take all models and view the end date, sorting by earliest date first
meta5 %>% group_by(model) %>% summarise(start = min(start),end = max(end)) %>% arrange(end)
#Two models, HadGEM2-CC and HadGEM2-ES, end in November 2005, so we'll take that as our end date

#append file names, then filter files that contain no relevant dates
meta5$file_name = ncs
meta5 = meta5 %>% arrange(model,start) %>% dplyr::filter(out_of_range==FALSE)

#gather some info (grid size and calendar of every file)
#the vectors are allocated up front instead of being grown inside the loop
n_files = nrow(meta5)
lon_res = rep(NA_integer_,n_files)
lat_res = rep(NA_integer_,n_files)
calendar = rep(NA_character_,n_files)
last_model='none'

for(i in seq_len(n_files)){
  curr_model = meta5$model[i]
  mod = nc_open(meta5$file_name[i])
  calendar[i] = mod$dim$time$calendar
  lon_res[i] = mod$dim$lon$len
  lat_res[i] = mod$dim$lat$len
  nc_close(mod) #only the header information is needed here
  
  #use cdo to compute the cell areas once per model (from the first file of each model)
  #and save them for the weighting step
  if(curr_model!=last_model){
    system(paste0('cdo gridarea -selvar,pr -seltimestep,1 ',
                  data_dir,'cmip5pr/',meta5$file_name[i],' ',
                  slice_dir,'pr/weights/',meta5$file_name[i]),ignore.stderr = TRUE)
  }
  
  last_model = curr_model
}

#final metadata data frame
#keep1/keep2 are the first and last time index to read from each file:
#files that straddle the start/end date are cut there, all others are read in full
meta5$lon_res = lon_res
meta5$lat_res = lat_res
meta5$calendar = calendar
meta5 = meta5 %>%
  rowwise() %>%
  mutate(keep1 = ifelse(pre1979,match(sd,ymd_range(start,end,calendar)),1),
         keep2 = ifelse(post2005,match(ed,ymd_range(start,end,calendar)),length(ymd_range(start,end,calendar)))) %>%
  ungroup() #drop the rowwise grouping once the per-file indices are computed

#clean up before processing the output
rm(lon_res,lat_res,calendar,sd,ed,i,ncs,n_files)
gc()

#number of days in the target period under each calendar a model may use
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))

#loop over the files (ordered by model, then start date), compute the area-weighted
#global mean of every day, and save one series per model once its files are done
last_model = "none"
for(i in seq_len(nrow(meta5))){
  
  #model this file belongs to
  curr_model = meta5$model[i]
  
  #open the model output file
  mod = nc_open(meta5$file_name[i])
  
  #if it's a new model, set some parameters
  if(curr_model != last_model){
    if(last_model != 'none'){
      #the previous model is complete: save and plot it, then clean up
      saveRDS(mod_gm,paste0(slice_dir,'pr/cmip5/',last_model,'.RDS'))
      print(ggplot()+geom_histogram(aes(mod_gm),binwidth=0.025)+ggtitle(last_model))
      rm(mod_weights)
      gc()
    }
    
    last_model = curr_model
    t=1 #position in mod_gm where the next file's days start
    
    #allocate the series with the day count that matches the model's calendar
    if(mod$dim$time$calendar %in% c('noleap','365_day')){
      mod_gm = numeric(n_days_365)
    }else if(mod$dim$time$calendar == '360_day'){
      mod_gm = numeric(n_days_360)
    }else{
      mod_gm = numeric(n_days)
    }
    
    #cell area weights (written by cdo for the first file of each model), flattened to one vector
    mod_nc = nc_open(paste0(slice_dir,'pr/weights/',meta5$file_name[i]))
    mod_weights = ncvar_get(mod_nc,'cell_area')
    nc_close(mod_nc) #release the handle before dropping the object
    d = dim(mod_weights)
    dim(mod_weights) = d[1]*d[2]
    rm(mod_nc)
    gc()
  }
  
  #read in the days keep1..keep2 of this file and scale by 86400 (seconds per day)
  #NOTE(review): presumably converts a per-second flux to a daily total - confirm units
  mod_temp = ncvar_get(mod, "pr", c(1,1,meta5$keep1[i]), c(-1,-1,1+meta5$keep2[i]-meta5$keep1[i]))*86400 #long, lat, day
  nc_close(mod) #the file is not needed after this read
  
  #flatten spatial dimension so that each column holds one day
  d = dim(mod_temp)
  dim(mod_temp) = c(d[1]*d[2],d[3])
  
  #calculate global means: area-weighted sum of each day divided by the total area
  mod_gm_subset = as.numeric(mod_weights %*% mod_temp)/sum(mod_weights)
  
  #store this file's days in the model's series and move the position forward
  mod_gm[t:(t + d[3] - 1)] = mod_gm_subset
  t = t + d[3]
  
  rm(mod_temp)
  gc()
}
#the last model is never followed by a model change, so it is saved here
saveRDS(mod_gm,paste0(slice_dir,'pr/cmip5/',curr_model,'.RDS'))

#test histogram
ggplot()+
  geom_histogram(aes(mod_gm),binwidth=0.025)+
  ggtitle(curr_model)

rm(mod_weights)
gc()




#### Slicing CMIP6 ####

setwd(paste0(data_dir,'cmip6pr'))
Sys.sleep(1)

#process file names to make a data frame: one row per file, one column per "_"-separated field
ncs = sort(list.files())
meta6 = strsplit(ncs,"_")
meta6 = do.call(rbind.data.frame,meta6)
colnames(meta6) = c('variable','frequency','model','experiment','ensemble','grid','years')

#start date, end date (yyyymmdd as numbers)
sd = 19790101
ed = 20051130

#Given the start and end date, tag each file with its temporal contents for filtering
meta6 = meta6 %>%
  separate(years,c('start','end')) %>%
  mutate(start = as.numeric(start),
         end = as.numeric(end),
         pre1979 = start<=sd,
         post2005 = end>=ed,
         contains_full = pre1979 & post2005,
         out_of_range = (start>ed)|(end<sd),
         has200511 = (end>=ed)&(start<=ed))

#Take all models and view the end date, sorting by earliest date first
meta6 %>% group_by(model) %>% summarise(start = min(start),end = max(end)) %>% arrange(end)
#NOTE(review): the comment that used to sit here named HadGEM2-CC and HadGEM2-ES ending in
#November 2005; the same remark appears in the CMIP5 part of this script, so the end date
#is presumably reused here to keep the periods comparable - confirm

#append file names, then filter files that contain no relevant dates
meta6$file_name = ncs
meta6 = meta6 %>% arrange(model,start) %>% dplyr::filter(out_of_range==FALSE)

#overlapping model dates, need to end early to compensate
#the four remaining CESM2-WACCM-FV2 files get hand-set end dates (one value per file, in start order)
meta6$end[which(meta6$model=='CESM2-WACCM-FV2')] = c(19791211,19891221,19991231,20100110)

#gather some info (grid size and calendar of every file)
#the vectors are allocated up front instead of being grown inside the loop
n_files = nrow(meta6)
lon_res = rep(NA_integer_,n_files)
lat_res = rep(NA_integer_,n_files)
calendar = rep(NA_character_,n_files)
last_model='none'

for(i in seq_len(n_files)){
  curr_model = meta6$model[i]
  mod = nc_open(meta6$file_name[i])
  calendar[i] = mod$dim$time$calendar
  #ICON-ESM-LR is special-cased: no lon/lat sizes are read and the resolution stays NA
  if(curr_model != "ICON-ESM-LR"){
    lon_res[i] = mod$dim$lon$len
    lat_res[i] = mod$dim$lat$len
  }
  nc_close(mod) #only the header information is needed here
  
  #use cdo to compute the cell areas once per model (from the first file of each model)
  #and save them for the weighting step
  if(curr_model!=last_model){
    system(paste0('cdo gridarea -selvar,pr -seltimestep,1 ',
                  data_dir,'cmip6pr/',meta6$file_name[i],' ',
                  slice_dir,'pr/weights/',meta6$file_name[i]),ignore.stderr = TRUE)
  }
  
  last_model = curr_model
}


meta6$calendar = calendar
meta6$lon_res = lon_res
meta6$lat_res = lat_res

#keep1/keep2 are the first and last time index to read from each file:
#files that straddle the start/end date are cut there, all others are read in full
meta6 = meta6 %>%
  rowwise() %>%
  mutate(keep1 = ifelse(pre1979,match(sd,ymd_range(start,end,calendar)),1),
         keep2 = ifelse(post2005,match(ed,ymd_range(start,end,calendar)),length(ymd_range(start,end,calendar)))) %>%
  ungroup() #drop the rowwise grouping once the per-file indices are computed
#clean up before processing the output
rm(lon_res,lat_res,calendar,sd,ed,i,ncs,n_files)
gc()


#number of days in the target period under each calendar a model may use
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))


#loop over the files (ordered by model, then start date), compute the area-weighted
#global mean of every day, and save one series per model once its files are done
last_model = 'none'
for(i in seq_len(nrow(meta6))){
  
  #model this file belongs to
  curr_model = meta6$model[i]
  
  #open the model output file
  mod = nc_open(meta6$file_name[i])
  
  #if it's a new model, set some parameters
  if(curr_model != last_model){
    if(last_model != 'none'){
      #the previous model is complete: save and plot it, then clean up
      saveRDS(mod_gm,paste0(slice_dir,'pr/cmip6/',last_model,'.RDS'))
      print(ggplot()+geom_histogram(aes(mod_gm),binwidth=0.025)+ggtitle(last_model))
      rm(mod_weights)
      gc()
    }
    
    last_model = curr_model
    t = 1 #position in mod_gm where the next file's days start
    
    #allocate the series with the day count that matches the model's calendar
    if(mod$dim$time$calendar %in% c('noleap','365_day')){
      mod_gm = numeric(n_days_365)
    }else if(mod$dim$time$calendar == '360_day'){
      mod_gm = numeric(n_days_360)
    }else{
      mod_gm = numeric(n_days)
    }
    
    #cell area weights (written by cdo for the first file of each model)
    mod_nc = nc_open(paste0(slice_dir,'pr/weights/',meta6$file_name[i]))
    mod_weights = ncvar_get(mod_nc,'cell_area')
    nc_close(mod_nc) #release the handle before dropping the object
    #ICON-ESM-LR weights are used as read, all other grids are flattened to one vector
    if(curr_model != "ICON-ESM-LR"){
      d = dim(mod_weights)
      dim(mod_weights) = d[1]*d[2]
    }
    rm(mod_nc)
    gc()
  }
  
  #special case handling for ICON icosahedron model (no need to flatten spatial dimension)
  #in both branches the data are scaled by 86400 (seconds per day)
  #NOTE(review): presumably converts a per-second flux to a daily total - confirm units
  if(curr_model == "ICON-ESM-LR"){
    #read in the days keep1..keep2 of this file
    mod_temp = ncvar_get(mod, "pr", c(1,meta6$keep1[i]), c(-1,1+meta6$keep2[i]-meta6$keep1[i]))*86400 #cell, day
    #only d[3] (the number of days) is used below, so the spatial entries stay NA
    d = c(NA,NA,dim(mod_temp)[2])
  }else{
    #read in the days keep1..keep2 of this file
    mod_temp = ncvar_get(mod, "pr", c(1,1,meta6$keep1[i]), c(-1,-1,1+meta6$keep2[i]-meta6$keep1[i]))*86400 #long, lat, day
    
    #flatten spatial dimension so that each column holds one day
    d = dim(mod_temp)
    dim(mod_temp) = c(d[1]*d[2],d[3])
  }
  
  #calculate global means: area-weighted sum of each day divided by the total area
  mod_gm_subset = as.numeric(mod_weights %*% mod_temp)/sum(mod_weights)
  
  #store this file's days in the model's series and move the position forward
  mod_gm[t:(t + d[3] - 1)] = mod_gm_subset
  t = t + d[3]
  
  rm(mod_temp)
  gc()
  nc_close(mod)
}
#the last model is never followed by a model change, so it is saved here
saveRDS(mod_gm,paste0(slice_dir,'pr/cmip6/',curr_model,'.RDS'))

#test histogram of the last model
print(ggplot()+geom_histogram(aes(mod_gm),binwidth=0.025)+ggtitle(last_model))

rm(mod_weights)
gc()



#### Calculating 1d Wasserstein distances from Vissio et al. ####

#### Surface Temperature ####

#list the saved global-mean series; the model name is the part of the file name before the first "."
cmip5_dir = paste0(slice_dir,'tas/cmip5/')
cmip5_files = list.files(cmip5_dir)
cmip5_models = vapply(strsplit(cmip5_files,"[.]"),function(x){x[1]},character(1))
n_cmip5 = length(cmip5_models)

cmip6_dir = paste0(slice_dir,'tas/cmip6/')
cmip6_files = list.files(cmip6_dir)
cmip6_models = vapply(strsplit(cmip6_files,"[.]"),function(x){x[1]},character(1))
n_cmip6 = length(cmip6_models)

#### Calculate distance to ERA5 Reanalysis ####

#quantiles of interest
q = seq(0,1,0.005)

#compare era5 and ncep, create objects to store results
era5_gm = readRDS(paste0(slice_dir,'tas/era5.RDS'))
ncep_gm = readRDS(paste0(slice_dir,'tas/ncep.RDS'))

#row 1 is NCEP, followed by the CMIP5 models and then the CMIP6 models
tas_results = data.frame(model = c('NCEP Reanalysis',cmip5_models,cmip6_models),
                         mip = c('NCEP Reanalysis',rep('CMIP5',n_cmip5),rep('CMIP6',n_cmip6)),
                         wd = numeric(1+n_cmip5+n_cmip6))

tas_results$wd[1] = wasserstein_1d(era5_gm,ncep_gm,q)

# distances for CMIP5
# seq_len() keeps the loop from running when the folder holds no files
for(i in seq_len(n_cmip5)){
  mod_gm = readRDS(paste0(cmip5_dir,cmip5_files[i]))
  tas_results$wd[i+1] = wasserstein_1d(era5_gm,mod_gm,q)
}

# distances for CMIP6 (stored after the NCEP row and the CMIP5 rows)
for(i in seq_len(n_cmip6)){
  mod_gm = readRDS(paste0(cmip6_dir,cmip6_files[i]))
  tas_results$wd[i+1+n_cmip5] = wasserstein_1d(era5_gm,mod_gm,q)
}

tas_results %>% arrange(wd)

save(tas_results,file=paste0(slice_dir,'tas/results.RDA'))



#### Total Precipitation ####

#list the saved global-mean series; the model name is the part of the file name before the first "."
cmip5_dir = paste0(slice_dir,'pr/cmip5/')
cmip5_files = list.files(cmip5_dir)
cmip5_models = vapply(strsplit(cmip5_files,"[.]"),function(x){x[1]},character(1))
n_cmip5 = length(cmip5_models)

cmip6_dir = paste0(slice_dir,'pr/cmip6/')
cmip6_files = list.files(cmip6_dir)
cmip6_models = vapply(strsplit(cmip6_files,"[.]"),function(x){x[1]},character(1))
n_cmip6 = length(cmip6_models)

#length of the full 1979-01-01 to 2005-11-30 period under each calendar
n_days = length(ymd_range(19790101,20051130, calendar = 'standard'))
n_days_365 = length(ymd_range(19790101,20051130, calendar = '365_day'))
n_days_360 = length(ymd_range(19790101,20051130, calendar = '360_day'))

#indices of 1996-10-01 through the end of the period under each calendar,
#used to cut the longer series down to the days GPCP covers
gpcp_period = length(ymd_range(19790101,19961001, calendar = 'standard')):n_days
gpcp_period_365 = length(ymd_range(19790101,19961001, calendar = '365_day')):n_days_365
gpcp_period_360 = length(ymd_range(19790101,19961001, calendar = '360_day')):n_days_360

#pick the GPCP-period indices for a model series, identifying its calendar by its length
#n_days_mod: length of the model's global-mean series
#stops when the length matches no known calendar, instead of silently reusing
#the indices chosen for the previous model
gpcp_period_for = function(n_days_mod){
  if(n_days_mod == n_days){
    gpcp_period
  }else if(n_days_mod == n_days_365){
    gpcp_period_365
  }else if(n_days_mod == n_days_360){
    gpcp_period_360
  }else{
    stop('series of length ',n_days_mod,' does not match any known calendar',call. = FALSE)
  }
}


#### Calculate distance to GPCP ####

#quantiles of interest
q = seq(0,1,0.005)

#GPCP is the reference here; era5 and ncep are compared against it like the models
gpcp_gm = readRDS(paste0(slice_dir,'pr/gpcp.RDS'))
era5_gm = readRDS(paste0(slice_dir,'pr/era5.RDS'))
ncep_gm = readRDS(paste0(slice_dir,'pr/ncep.RDS'))

#rows 1-2 are the reanalyses, followed by the CMIP5 models and then the CMIP6 models
pr_results = data.frame(model = c('ERA5','NCEP',cmip5_models,cmip6_models),
                        mip = c('Reanalysis','Reanalysis',rep('CMIP5',n_cmip5),rep('CMIP6',n_cmip6)),
                        wd = numeric(2+n_cmip5+n_cmip6))

pr_results$wd[1] = wasserstein_1d(gpcp_gm,era5_gm[gpcp_period],q)
pr_results$wd[2] = wasserstein_1d(gpcp_gm,ncep_gm[gpcp_period],q)

# distances for CMIP5
# seq_len() keeps the loop from running when the folder holds no files
for(i in seq_len(n_cmip5)){
  mod_gm = readRDS(paste0(cmip5_dir,cmip5_files[i]))
  
  n_days_mod = length(mod_gm)
  mod_gpcp_period = gpcp_period_for(n_days_mod)
  
  pr_results$wd[i+2] = wasserstein_1d(gpcp_gm,mod_gm[mod_gpcp_period],q)
}

# distances for CMIP6 (stored after the reanalysis rows and the CMIP5 rows)
for(i in seq_len(n_cmip6)){
  mod_gm = readRDS(paste0(cmip6_dir,cmip6_files[i]))
  
  n_days_mod = length(mod_gm)
  mod_gpcp_period = gpcp_period_for(n_days_mod)
  
  pr_results$wd[i+2+n_cmip5] = wasserstein_1d(gpcp_gm,mod_gm[mod_gpcp_period],q)
}

pr_results %>% arrange(wd)

save(pr_results,file=paste0(slice_dir,'pr/results.RDA'))
