Anthesis Date Estimation from Climate Data

Libraries

Load required libraries

library(dplyr)
library(purrr)
library(gsheet)
library(raster)
library(ncdf4)
library(lubridate)
library(readxl)
library(writexl)
library(caret)
library(tidyr)
library(r4pde)
library(refund)
library(ggplot2)
library(readr)
library(nasapower)

Importing and Processing the Experimental Data

#  Import trial dataset containing municipalities, planting dates, and coordinates
gib <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1MBiKsosQ8Hob6LkS65_1pPU25hx1CO9i42Sm_xf28ww/edit?gid=2128367221 gid=2128367221")

trials <- gib |> 
  mutate(
    planting_date = as.Date(planting_date, format = "%Y-%m-%d"),
    pd120 = as.Date(planting_date + 120),   #   120-day growing period
    study = as.factor(study),
    year = as.factor(year)) |> 
  dplyr::select(study, planting_date, lon, lat, pd120) |>
  rename(longitude = lon,
         latitude = lat)

Weather Data Acquisition

From NASA POWER

weather_nasa <- trials %>%
  rowwise() %>%
  mutate(
    weather = list(
      get_power(
        community = "AG", 
        lonlat = c(longitude, latitude),
        pars = c("T2M", "T2M_MAX", "T2M_MIN", "RH2M", "PRECTOTCORR"),
        dates = c(as.character(planting_date), as.character(pd120)),
        temporal_api = "DAILY"
      )
    )
  ) %>%
  unnest(weather) |> 
  rename(Tmax = T2M_MAX,
         Tmin = T2M_MIN,
         date = YYYYMMDD)

From Xavier Dataset (NetCDF)

This repository is intended to demonstrate the structure and workflow of the code used in the project. To keep the repository lightweight and accessible on GitHub, the original input data files (provided by Xavier et al. 2022) have been excluded because they exceed GitHub’s file size limits.

The corresponding lines in the code that load these large .nc files are commented out (using ). This way, the code can still be inspected and followed without requiring the full dataset. However, the scripts will not reproduce the complete analysis unless the data are available locally.

The full dataset can be obtained from Xavier et al. (2022) at the following link: Xavier Dataset – Download here

After downloading, place the files in the data/ directory of this repository and uncomment the relevant lines in the scripts to run the full analysis.

#  Keep only studies with available NetCDF coverage
trials <- trials |> 
  filter(study %in% 1:91)
#   #     Generic function to extract daily climate variables from NetCDF
# extract_nc_var <- function(netcdf_path, trials, varname, outfile) {
#   nc_data <- nc_open(netcdf_path)
  
#   #     Convert time variable to Date
#   time_var   <- ncvar_get(nc_data, "time")
#   time_units <- ncatt_get(nc_data, "time", "units")$value
#   origin_date <- as.POSIXct(sub("hours since ", "", time_units), tz = "UTC")
#   nc_dates   <- as.Date(origin_date + as.difftime(time_var, units = "hours"))
  
#     Grid coordinates
#   lon_vals <- ncvar_get(nc_data, "longitude")
#   lat_vals <- ncvar_get(nc_data, "latitude")
  
#     Internal extractor for each trial
#   get_data_for_row <- function(row) {
#     date_seq <- seq.Date(row$planting_date, row$pd120, by = "day")
#     date_idx <- match(date_seq, nc_dates)
#     date_idx <- date_idx[!is.na(date_idx)]
#     if (length(date_idx) == 0) return(NULL)
    
#     lon_idx <- which.min(abs(lon_vals - row$longitude))
#     lat_idx <- which.min(abs(lat_vals - row$latitude))
    
#     values <- sapply(date_idx, function(i) {
#       ncvar_get(nc_data, varname, start = c(lon_idx, lat_idx, i), count = c(1, 1, 1))
#     })
#     if (length(values) == 0 || all(is.na(values))) return(NULL)
    
#     df <- data.frame(
#       date          = date_seq[!is.na(match(date_seq, nc_dates))],
#       value         = values,
#       study         = row$study,
#       year          = format(row$planting_date, "%Y"),
#       planting_date = row$planting_date
#     )
#     names(df)[2] <- varname
#     return(df)
#   }
  
#   result <- trials %>%
#     split(1:nrow(.)) %>%
#     map_dfr(get_data_for_row)
  
#   write_xlsx(result, outfile)
#   nc_close(nc_data)
#   return(result)
# }

#      # Extract Tmax and Tmin
# tmax <- extract_nc_var("data/Tmax_20010101_20240320_BR-DWGD_UFES_UTEXAS_v_3.2.3.nc", trials, "Tmax", "data/br_dwgd_tmax.xlsx")
# tmin <- extract_nc_var("data/Tmin_20010101_20240320_BR-DWGD_UFES_UTEXAS_v_3.2.3.nc", trials, "Tmin", "data/br_dwgd_tmin.xlsx")

#   # Merge and compute mean daily temperature
#weather_xavier <- tmax |> 
#  mutate(Tmin = tmin$Tmin, T2M = (Tmax + Tmin)/2)

#write_xlsx(weather_xavier, "plan/weather_xavier_flowering.xlsx")
weather_xavier <- read_xlsx("plan/weather_xavier_flowering.xlsx")

Estimation of Anthesis (Flowering Date)

General Functions

#  Temperature limits for wheat growth
Tmin <- 0     #Lower limit
Tmax <- 40    #Upper limit

#  Daily growing degree-days (GDD) calculation
calculate_gdd_phase <- function(T2M, Tmin, Tmax, Tbase) {
  if (T2M < Tmin || T2M > Tmax) return(0)
  GDD <- T2M - Tbase
  return(ifelse(GDD > 0, GDD, 0))
}

#  Basal temperatures and cumulative thresholds (by growth stage)
Tbase <- c(2.1, 4.76, 0.86, 8.44)     #Sm-Em, Em-DA, DA-ET, ET-AN
thresholds <- c(80, 100, 300, 750)    #Accumulated GDD thresholds

#  Function to assign phenological phases by study
calculate_phases_by_study <- function(df) {
  total_gdd <- 0
  phase_num <- 1
  gd_values <- numeric(nrow(df))
  gdcumul_values <- numeric(nrow(df))
  phases <- character(nrow(df))
  
  for (day in 1:nrow(df)) {
    T2M <- df$T2M[day]
    GDD <- calculate_gdd_phase(T2M, Tmin, Tmax, Tbase[phase_num])
    total_gdd <- total_gdd + GDD
    gd_values[day] <- GDD
    gdcumul_values[day] <- total_gdd
    
    if (is.na(total_gdd)) {
      phases[day] <- NA
    } else if (total_gdd < thresholds[1]) {
      phases[day] <- "Planting"
    } else if (total_gdd < thresholds[2]) {
      phases[day] <- "Emergence"
    } else if (total_gdd < thresholds[3]) {
      phases[day] <- "Double Ridge"
    } else if (total_gdd < thresholds[4]) {
      phases[day] <- "Terminal Spikelet"
    } else {
      phases[day] <- "Anthesis"
    }
    
    if (phase_num <= length(thresholds) && total_gdd >= thresholds[phase_num]) {
      phase_num <- phase_num + 1
    }
  }
  
  df <- df %>%
    mutate(gd = gd_values, gdcumul = gdcumul_values, phase = phases)
  return(df)
}

#  Function to compute anthesis date per dataset
compute_anthesis <- function(weather_data) {
  weather_data %>%
    group_by(study) %>%
    group_modify(~ calculate_phases_by_study(.x)) %>%
    ungroup() %>%
    group_by(study) %>%
    mutate(flowering = as.Date(date[phase == "Anthesis"][1]),
           dias_para_antese = as.numeric(flowering - planting_date)) %>%
    distinct(study, dias_para_antese, .keep_all = TRUE) %>%
    dplyr::select(study, dias_para_antese)
}

Apply to NASA and Xavier Datasets

nasa   <- compute_anthesis(weather_nasa)
xavier <- compute_anthesis(weather_xavier)

Final Note on Flowering Date

For subsequent analyses, the flowering date was defined as the mean number of days to anthesis across the NASA POWER and Xavier datasets, when both were available. For studies where only NASA POWER data existed, flowering was estimated exclusively from that source. The anthesis date was calculated as:

Planting date + estimated days to anthesis.