potterzot · March 28, 2018 18:29
diff --git a/extract_spei.Rmd b/extract_spei.Rmd
 This documents the ease with which I used `tidync` to extract data from the drought index database [SPEI](http://spei.csic.es/database.html#p3) compared with using `ncdf4` directly.

 ### Setup necessary for both methods

 ```{r}
 # Establish dimensions ------------------------------------------------------------------

 ### Time limits. 
 time_first_idx <- as.integer(difftime(as.Date("1990-01-01"), as.Date("1900-01-01"))) + 15

 ### Spatial limits. These are based on available SCAD data points for
 # Africa and Latin America, but including the whole continent, so all of
 # Mexico and south and then the continent of Africa. 
 # Latitude: -57.75 to 37.75 degrees north
 # Longitude: -117.75 to 59.75 degrees east
 box <- list(lat = c(-57.75, 37.75), lon = c(-117.75, 57.75))

 # Open the file for reading and grab just the data we want. --------------------
 ncfile <- paste0(raw, "spei01.nc")
 ```

 ### Using tidync

 ```{r}
 ### Get the data for the given dimensions
 d <- tidync(ncfile) %>% 
  hyper_filter(
    lat = between(lat, box$lat[[1]], box$lat[[2]]),
    lon = between(lon, box$lon[[1]], box$lon[[2]]),
    time = time >= time_first_idx
    ) %>% hyper_tibble() %>%
  mutate(date = as.Date(time, origin = as.Date("1900-01-01"))) %>%
  select(-c(time))
 ```

 ### Using ncdf4

 ```{r}
 con <- nc_open(ncfile)

 ### Defining Dimensions

 ## Dates ("time") are "days since 1900-1-1". Probably this actually means "Days 
 # starting with 1900-1-1"
 time_vals <- con$dim$time$vals

 # We start with 1990 January 
 time_start <- as.integer(difftime(as.Date("1990-01-01"), as.Date("1900-01-01"))) + 15
 time_start_idx <- which(time_start == time_vals)
 time_count <- length(time_vals) - time_start_idx

 first_date <- as.Date(time_vals[time_start_idx], origin = as.Date("1900-01-01"))
 last_date <- as.Date(time_vals[length(time_vals)], origin = as.Date("1900-01-01"))

 ## Spatial points are defined by a box
 # Latitude ("lat") is "degrees_north" and runs from -89.75 to 89.75, having
 # 360 values
 lat_vals <- con$dim$lat$vals

 # Longitude ("lon") is "degrees_east" and runs from -179.75 to 179.25, having 
 # 720 values.
 lon_vals <- con$dim$lon$vals

 # create a grid of all the points
 latlons <- expand.grid(lon_vals, lat_vals)

 ### Get the data
 # We cycle through each time period, converting the data to long format, then 
 # append it all. We also limit by lat/lon to keep just Africa and LA

 dim_counts <- c(720, 360, 1)
 d <- rbindlist(lapply(0:time_count, function(t) {
  t2 <- time_start_idx + t # this iterations time index
  
  #initial dimension, collecting just 1 time dimension
  dim_starts <- c(1, 1, t2)
  
  # bind the data with the lat/lon points attached, and drop any NAs
  d <- cbind(
    latlons,
    matrix(ncvar_get(con, "spei", start = dim_starts, count = dim_counts), ncol = 1))
  d <- data.table(d)
  setnames(d, c("lon", "lat", "spei"))
  
  # Filter just the lat/lons within the box and values over land
  d <- d[lat %between% box$lat & lon %between% box$lon & is.na(spei)]
  
  # Exclude NAs
  # We don't want to do this so that we maintain a full grid.
  #d <- d[!is.na(spei)]
  
  # set the date
  today <- as.Date(time_vals[t2], origin = as.Date("1900-01-01"))
  d[, date := today]
  d
 }))
 ```
	This documents the ease with which I used `tidync` to extract data from the drought index database [SPEI](http://spei.csic.es/database.html#p3) compared with using `ncdf4` directly.

	### Setup necessary for both methods

	```{r}
	# Establish dimensions ------------------------------------------------------------------

	### Time limits.
	time_first_idx <- as.integer(difftime(as.Date("1990-01-01"), as.Date("1900-01-01"))) + 15

	### Spatial limits. These are based on available SCAD data points for
	# Africa and Latin America, but including the whole continent, so all of
	# Mexico and south and then the continent of Africa.
	# Latitude: -57.75 to 37.75 degrees north
	# Longitude: -117.75 to 59.75 degrees east
	box <- list(lat = c(-57.75, 37.75), lon = c(-117.75, 57.75))

	# Open the file for reading and grab just the data we want. --------------------
	ncfile <- paste0(raw, "spei01.nc")
	```

	### Using tidync

	```{r}
	### Get the data for the given dimensions
	d <- tidync(ncfile) %>%
	hyper_filter(
	lat = between(lat, box$lat[[1]], box$lat[[2]]),
	lon = between(lon, box$lon[[1]], box$lon[[2]]),
	time = time >= time_first_idx
	) %>% hyper_tibble() %>%
	mutate(date = as.Date(time, origin = as.Date("1900-01-01"))) %>%
	select(-c(time))
	```

	### Using ncdf4

	```{r}
	con <- nc_open(ncfile)

	### Defining Dimensions

	## Dates ("time") are "days since 1900-1-1". Probably this actually means "Days
	# starting with 1900-1-1"
	time_vals <- con$dim$time$vals

	# We start with 1990 January
	time_start <- as.integer(difftime(as.Date("1990-01-01"), as.Date("1900-01-01"))) + 15
	time_start_idx <- which(time_start == time_vals)
	time_count <- length(time_vals) - time_start_idx

	first_date <- as.Date(time_vals[time_start_idx], origin = as.Date("1900-01-01"))
	last_date <- as.Date(time_vals[length(time_vals)], origin = as.Date("1900-01-01"))

	## Spatial points are defined by a box
	# Latitude ("lat") is "degrees_north" and runs from -89.75 to 89.75, having
	# 360 values
	lat_vals <- con$dim$lat$vals

	# Longitude ("lon") is "degrees_east" and runs from -179.75 to 179.25, having
	# 720 values.
	lon_vals <- con$dim$lon$vals

	# create a grid of all the points
	latlons <- expand.grid(lon_vals, lat_vals)

	### Get the data
	# We cycle through each time period, converting the data to long format, then
	# append it all. We also limit by lat/lon to keep just Africa and LA

	dim_counts <- c(720, 360, 1)
	d <- rbindlist(lapply(0:time_count, function(t) {
	t2 <- time_start_idx + t # this iterations time index

	#initial dimension, collecting just 1 time dimension
	dim_starts <- c(1, 1, t2)

	# bind the data with the lat/lon points attached, and drop any NAs
	d <- cbind(
	latlons,
	matrix(ncvar_get(con, "spei", start = dim_starts, count = dim_counts), ncol = 1))
	d <- data.table(d)
	setnames(d, c("lon", "lat", "spei"))

	# Filter just the lat/lons within the box and values over land
	d <- d[lat %between% box$lat & lon %between% box$lon & is.na(spei)]

	# Exclude NAs
	# We don't want to do this so that we maintain a full grid.
	#d <- d[!is.na(spei)]

	# set the date
	today <- as.Date(time_vals[t2], origin = as.Date("1900-01-01"))
	d[, date := today]
	d
	}))
	```