From 76025d67b46364a905ffa94be4cc540491371f12 Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Tue, 17 Feb 2026 00:41:20 +0000 Subject: [PATCH 01/13] Created python implimentaions for cmorizing MODIS terra and aqua data in cmip6 --- .../cmorizers/data/cmor_config/MODIS-Aqua.yml | 49 ++ .../data/cmor_config/MODIS-Terra.yml | 49 ++ .../data/formatters/datasets/_modis_aqua.ncl | 253 ++++++++++ .../data/formatters/datasets/_modis_terra.ncl | 253 ++++++++++ .../data/formatters/datasets/modis_aqua.py | 447 ++++++++++++++++++ .../data/formatters/datasets/modis_terra.py | 447 ++++++++++++++++++ pyproject.toml | 1 + 7 files changed, 1499 insertions(+) create mode 100644 esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml create mode 100644 esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml create mode 100644 esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl create mode 100644 esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl create mode 100644 esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py create mode 100644 esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py diff --git a/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml b/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml new file mode 100644 index 0000000000..29329712c9 --- /dev/null +++ b/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml @@ -0,0 +1,49 @@ +--- +# Author: 2026-01-29 Kristi Webb + +# Common global attributes for Cmorizer output +attributes: + + project_id: CMIP6 + dataset_id: MODIS-Aqua + modeling_realm: atmos + type: sat + tier: 3 + version: MYD08_M3 + source: https://ladsweb.modaps.eosdis.nasa.gov/search/order + reference: modis1 + + filename_fmt: "MYD08_M3.A*.hdf" + +variables: + clwvi: + raw_name: Cloud_Water_Path_Liquid_Mean_Mean + mip: Amon + frequency: mon + clivi: + raw_name: Cloud_Water_Path_Ice_Mean_Mean + mip: Amon + frequency: mon + clt: + raw_name: Cloud_Fraction_Mean_Mean + mip: Amon + frequency: mon + lwpStderr: + raw_name: 
Cloud_Water_Path_Liquid_Mean_Uncertainty + mip: Amon + frequency: mon + project_id: custom + iwpStderr: + raw_name: Cloud_Water_Path_Ice_Mean_Uncertainty + mip: Amon + frequency: mon + project_id: custom + od550aer: + raw_name: AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean + mip: AERmon + frequency: mon + reference: modis2 + reffclwtop: + raw_name: Cloud_Effective_Radius_Liquid_Mean_Mean + mip: AERmon + frequency: mon diff --git a/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml b/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml new file mode 100644 index 0000000000..94a3a686cc --- /dev/null +++ b/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml @@ -0,0 +1,49 @@ +--- +# Author: 2026-01-29 Kristi Webb + +# Common global attributes for Cmorizer output +attributes: + + project_id: CMIP6 + dataset_id: MODIS-Terra + modeling_realm: atmos + type: sat + tier: 3 + version: MOD08_M3 + source: https://ladsweb.modaps.eosdis.nasa.gov/search/order + reference: modis1 + + filename_fmt: "MOD08_M3.A*.hdf" + +variables: + clwvi: + raw_name: Cloud_Water_Path_Liquid_Mean_Mean + mip: Amon + frequency: mon + clivi: + raw_name: Cloud_Water_Path_Ice_Mean_Mean + mip: Amon + frequency: mon + clt: + raw_name: Cloud_Fraction_Mean_Mean + mip: Amon + frequency: mon + lwpStderr: + raw_name: Cloud_Water_Path_Liquid_Mean_Uncertainty + mip: Amon + frequency: mon + project_id: custom + iwpStderr: + raw_name: Cloud_Water_Path_Ice_Mean_Uncertainty + mip: Amon + frequency: mon + project_id: custom + od550aer: + raw_name: AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean + mip: AERmon + frequency: mon + reference: modis2 + reffclwtop: + raw_name: Cloud_Effective_Radius_Liquid_Mean_Mean + mip: AERmon + frequency: mon diff --git a/esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl b/esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl new file mode 100644 index 0000000000..f1829bd6b6 --- /dev/null +++ b/esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl @@ 
-0,0 +1,253 @@ +; ############################################################################# +; ESMValTool CMORizer for MODIS data +; ############################################################################# +; +; Tier +; Tier 3: restricted dataset. +; +; Source +; https://ladsweb.modaps.eosdis.nasa.gov/search/order +; +; Last access +; 20190209 +; +; Download and processing instructions +; In Products: select "MODIS Aqua", "Collection 6.1" and +; "L3 Atmosphere Product", click on MYD08_M3. +; In Time: select from 2000-01-01 to today. +; In Location: skip, the global domain will be applied. +; In Files: select all. +; Submit the order. +; A registration is required to download the data. +; +; Caveats +; clwvi and clivi data are in-cloud values whereas CMIP5 models provide +; grid-box averages --> multiply MODIS clwvi and clivi values with cloud +; fraction as a first guess +; +; Modification history +; 20180209-righi_mattia: fixed bug in lwpStderr. +; 20180209-hassler_birgit: adapted to v2. +; 20180810-righi_mattia: fix minor calendar issue. +; 20180806-righi_mattia: code cleaning. +; 20170116-lauer_axel: using cirrus fraction to gridbox averages. +; 20160408-lauer_axel: added processing of uncertainties. +; 20151118-lauer_axel: bugfix: added unit conversion. +; 20150430-evaldsson_martin: written. 
+; +; ############################################################################# +loadscript(getenv("esmvaltool_root") + \ + "/data/formatters/interface.ncl") + +begin + + ; Script name (for logger) + DIAG_SCRIPT = "modis_aqua.ncl" + + ; Source name + OBSNAME = "MODIS-Aqua" + + ; Tier + TIER = 3 + + ; Selected variable (standard name) + VAR = (/"clwvi", \ + "clivi", \ + "clt", \ + "lwpStderr", \ + "iwpStderr", \ + "od550aer", \ + "reffclwtop"/) + + ; Name in the raw data + NAME = (/"Cloud_Water_Path_Liquid_Mean_Mean", \ + "Cloud_Water_Path_Ice_Mean_Mean", \ + "Cloud_Fraction_Mean_Mean", \ + "Cloud_Water_Path_Liquid_Mean_Uncertainty", \ + "Cloud_Water_Path_Ice_Mean_Uncertainty", \ + "AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean", \ + "Cloud_Effective_Radius_Liquid_Mean_Mean"/) + + ; MIP + MIP = (/"Amon", "Amon", "Amon", "Amon", "Amon", "aero", "aero"/) + + ; Frequency + FREQ = (/"mon", "mon", "mon", "mon", "mon", "mon", "mon"/) + + ; Version + VERSION = "MYD08_M3" + + ; CMOR table + CMOR_TABLE = getenv("cmor_tables") + \ + (/"/cmip5/Tables/CMIP5_Amon", \ + "/cmip5/Tables/CMIP5_Amon", \ + "/cmip5/Tables/CMIP5_Amon", \ + "/custom/CMOR_lwpStderr.dat", \ + "/custom/CMOR_iwpStderr.dat", \ + "/cmip5/Tables/CMIP5_aero", \ + "/cmip5/Tables/CMIP5_aero"/) + + ; Type + TYPE = "sat" + + ; Global attributes + SOURCE = "https://ladsweb.modaps.eosdis.nasa.gov/search/order" + REF1 = "Platnick et al., IEEE Trans. Geosci. Remote Sens., " + \ + "doi:10.1109/TGRS.2002.808301, 2003." + REF2 = "Levy et al., Atmos. Meas. Tech., " + \ + "doi:10.5194/amt-6-2989-2013, 2013." 
+ COMMENT = "" + +end + +begin + + ; List of files + FILES = systemfunc("ls -1 " + input_dir_path + VERSION + ".A*.hdf") + + do ff = 0, dimsizes(FILES) - 1 + + fin = addfile(FILES(ff), "r") + + ; Get time + infile = systemfunc("basename " + FILES(ff)) + date = yyyyddd_to_yyyymmdd(toint(str_get_cols(infile, 10, 16))) + year = toint(str_get_cols(tostring(date), 0, 3)) + month = toint(str_get_cols(tostring(date), 4, 5)) + dm = days_in_month(year, month) + + ; Loop over variables to fetch from input file + do vv = 0, dimsizes(VAR) - 1 + + invar = fin->$NAME(vv)$ + invar_fv = invar@_FillValue + invar_coords = invar + invar := tofloat(invar) + invar := where(invar.eq.tofloat(invar_fv), \ + default_fillvalue("float"), invar) + + ; Special case clwvi as the sum lwp + iwp + if (VAR(vv).eq."clwvi") then + if (NAME(vv).ne."Cloud_Water_Path_Liquid_Mean_Mean") then + error_msg("f", DIAG_SCRIPT, "", "cannot calculate clwvi") + end if + + ; Read cirrus fraction + ; cfin = fin->Cirrus_Fraction_SWIR_FMean + cfin = fin->Cirrus_Fraction_Infrared_FMean + cif = tofloat(cfin * cfin@scale_factor + cfin@add_offset) + ; liquid fraction is estimated assuming random overlap, i.e. 
+ ; ctot = 1 - (1 - cif) * (1 - lif) + ; --> lif = 1 - (1 - ctot) / (1 - cif) + delete(cfin) + cfin = fin->Cloud_Fraction_Mean_Mean + ctot = tofloat(cfin * cfin@scale_factor + cfin@add_offset) + delete(cfin) + cif = where(cif.gt.0.999, cif@_FillValue, cif) + lif = 1.0 - (1.0 - ctot) / (1.0 - cif) + lif = where(lif.lt.0, 0, lif) + tmpvar = fin->Cloud_Water_Path_Ice_Mean_Mean ; read ice water path + tmpvar_fv = tmpvar@_FillValue + tmpvar := tofloat(tmpvar) + tmpvar := where(tmpvar.eq.tofloat(tmpvar_fv), \ + default_fillvalue("float"), \ + tmpvar) + tmpvar = tmpvar * cif ; convert iwp in-cloud value to gridbox avg + invar = invar * lif ; convert lwp in-cloud value to grid-box avg + invar = invar + tmpvar ; clwvi = lwp + iwp + delete(tmpvar) + delete(lif) + delete(cif) + invar = 0.001 * invar ; [g/m2] --> [kg/m2] + end if + + ; lwp and iwp are in-cloud values + ; convert lwp/iwp to grid-box averages by multiplying with + ; average cloud fraction (not optimum but best we can do at the moment) + if (any((/"clivi", "iwpStderr", "lwpStderr"/) .eq. 
VAR(vv))) then + + ; Read cirrus fraction (0-1) + ; cfin = fin->Cirrus_Fraction_SWIR_FMean + cfin = fin->Cirrus_Fraction_Infrared_FMean + cf = tofloat(cfin * cfin@scale_factor + cfin@add_offset) + delete(cfin) + if (VAR(vv).eq."lwpStderr") then + cfin = fin->Cloud_Fraction_Mean_Mean + ctot = tofloat(cfin * cfin@scale_factor + cfin@add_offset) + delete(cfin) + cif = where(cf.gt.0.999, cf@_FillValue, cf) + cf = 1.0 - (1.0 - ctot) / (1.0 - cif) + cf = where(cf.lt.0, 0, cf) + delete(cif) + delete(ctot) + end if + invar = invar * cf ; ; "grid-box average" lwp/iwp + delete(cf) + invar = 0.001 * invar ; [g/m2] --> [kg/m2] + end if + + invar@_FillValue = default_fillvalue("float") + copy_VarCoords(invar_coords, invar) + if (isatt(invar_coords, "scale_factor")) then + invar = invar * tofloat(invar_coords@scale_factor) + end if + if (isatt(invar_coords, "add_offset")) then + invar = invar + tofloat(invar_coords@add_offset) + end if + + if (VAR(vv).eq."clt") then + invar = 100.0 * invar ; [1] --> [%] + end if + + ; Create output variable + lat = fin->YDim + lon = fin->XDim + output = new((/1, dimsizes(lat), dimsizes(lon)/), float) + output!0 = "time" + output!1 = "lat" + output!2 = "lon" + output&time = cd_inv_calendar(year, month, 15, 0, 0, 0, TUNITS, 0) + output&lat = lat + output&lon = lon + output(0, :, :) = (/invar/) + delete(invar) + delete(invar_coords) + + ; Format coordinates + format_coords(output, year + sprinti("%0.2i", month) + "01", \ + year + sprinti("%0.2i", month) + dm, FREQ(vv)) + + ; Set variable attributes + tmp = format_variable(output, VAR(vv), CMOR_TABLE(vv)) + delete(output) + output = tmp + delete(tmp) + + ; Calculate coordinate bounds + bounds = guess_coord_bounds(output, FREQ(vv)) + + ; Set global attributes + if (VAR(vv).ne."od550aer") then + gAtt = set_global_atts(OBSNAME, TIER, SOURCE, REF1, COMMENT) + else + gAtt = set_global_atts(OBSNAME, TIER, SOURCE, REF2, COMMENT) + end if + + ; Output file + DATESTR = \ + year + sprinti("%0.2i", month) + 
"-" + year + sprinti("%0.2i", month) + fout = output_dir_path + \ + str_join((/"OBS", OBSNAME, TYPE, str_sub_str(VERSION, "_", "-"), \ + MIP(vv), VAR(vv), DATESTR/), "_") + ".nc" + + ; Write variable + write_nc(fout, VAR(vv), output, bounds, gAtt) + delete(gAtt) + delete(output) + delete(bounds) + + end do + + end do + +end diff --git a/esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl b/esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl new file mode 100644 index 0000000000..2e02498a88 --- /dev/null +++ b/esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl @@ -0,0 +1,253 @@ +; ############################################################################# +; ESMValTool CMORizer for MODIS data +; ############################################################################# +; +; Tier +; Tier 3: restricted dataset. +; +; Source +; https://ladsweb.modaps.eosdis.nasa.gov/search/order +; +; Last access +; 20190209 +; +; Download and processing instructions +; In Products: select "MODIS Aqua", "Collection 6.1" and +; "L3 Atmosphere Product", click on MOD08_M3. +; In Time: select from 2000-01-01 to today. +; In Location: skip, the global domain will be applied. +; In Files: select all. +; Submit the order. +; A registration is required to download the data. +; +; Caveats +; clwvi and clivi data are in-cloud values whereas CMIP5 models provide +; grid-box averages --> multiply MODIS clwvi and clivi values with cloud +; fraction as a first guess +; +; Modification history +; 20180209-righi_mattia: fixed bug in lwpStderr. +; 20180209-hassler_birgit: adapted to v2. +; 20180810-righi_mattia: fix minor calendar issue. +; 20180806-righi_mattia: code cleaning. +; 20170116-lauer_axel: using cirrus fraction to gridbox averages. +; 20160408-lauer_axel: added processing of uncertainties. +; 20151118-lauer_axel: bugfix: added unit conversion. +; 20150430-evaldsson_martin: written. 
+; +; ############################################################################# +loadscript(getenv("esmvaltool_root") + \ + "/data/formatters/interface.ncl") + +begin + + ; Script name (for logger) + DIAG_SCRIPT = "modis_terra.ncl" + + ; Source name + OBSNAME = "MODIS-Terra" + + ; Tier + TIER = 3 + + ; Selected variable (standard name) + VAR = (/"clwvi", \ + "clivi", \ + "clt", \ + "lwpStderr", \ + "iwpStderr", \ + "od550aer", \ + "reffclwtop"/) + + ; Name in the raw data + NAME = (/"Cloud_Water_Path_Liquid_Mean_Mean", \ + "Cloud_Water_Path_Ice_Mean_Mean", \ + "Cloud_Fraction_Mean_Mean", \ + "Cloud_Water_Path_Liquid_Mean_Uncertainty", \ + "Cloud_Water_Path_Ice_Mean_Uncertainty", \ + "AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean", \ + "Cloud_Effective_Radius_Liquid_Mean_Mean"/) + + ; MIP + MIP = (/"Amon", "Amon", "Amon", "Amon", "Amon", "aero", "aero"/) + + ; Frequency + FREQ = (/"mon", "mon", "mon", "mon", "mon", "mon", "mon"/) + + ; Version + VERSION = "MOD08_M3" + + ; CMOR table + CMOR_TABLE = getenv("cmor_tables") + \ + (/"/cmip5/Tables/CMIP5_Amon", \ + "/cmip5/Tables/CMIP5_Amon", \ + "/cmip5/Tables/CMIP5_Amon", \ + "/custom/CMOR_lwpStderr.dat", \ + "/custom/CMOR_iwpStderr.dat", \ + "/cmip5/Tables/CMIP5_aero", \ + "/cmip5/Tables/CMIP5_aero"/) + + ; Type + TYPE = "sat" + + ; Global attributes + SOURCE = "https://ladsweb.modaps.eosdis.nasa.gov/search/order" + REF1 = "Platnick et al., IEEE Trans. Geosci. Remote Sens., " + \ + "doi:10.1109/TGRS.2002.808301, 2003." + REF2 = "Levy et al., Atmos. Meas. Tech., " + \ + "doi:10.5194/amt-6-2989-2013, 2013." 
+ COMMENT = "" + +end + +begin + + ; List of files + FILES = systemfunc("ls -1 " + input_dir_path + VERSION + ".A*.hdf") + + do ff = 0, dimsizes(FILES) - 1 + + fin = addfile(FILES(ff), "r") + + ; Get time + infile = systemfunc("basename " + FILES(ff)) + date = yyyyddd_to_yyyymmdd(toint(str_get_cols(infile, 10, 16))) + year = toint(str_get_cols(tostring(date), 0, 3)) + month = toint(str_get_cols(tostring(date), 4, 5)) + dm = days_in_month(year, month) + + ; Loop over variables to fetch from input file + do vv = 0, dimsizes(VAR) - 1 + + invar = fin->$NAME(vv)$ + invar_fv = invar@_FillValue + invar_coords = invar + invar := tofloat(invar) + invar := where(invar.eq.tofloat(invar_fv), \ + default_fillvalue("float"), invar) + + ; Special case clwvi as the sum lwp + iwp + if (VAR(vv).eq."clwvi") then + if (NAME(vv).ne."Cloud_Water_Path_Liquid_Mean_Mean") then + error_msg("f", DIAG_SCRIPT, "", "cannot calculate clwvi") + end if + + ; Read cirrus fraction + ; cfin = fin->Cirrus_Fraction_SWIR_FMean + cfin = fin->Cirrus_Fraction_Infrared_FMean + cif = tofloat(cfin * cfin@scale_factor + cfin@add_offset) + ; liquid fraction is estimated assuming random overlap, i.e. 
+ ; ctot = 1 - (1 - cif) * (1 - lif) + ; --> lif = 1 - (1 - ctot) / (1 - cif) + delete(cfin) + cfin = fin->Cloud_Fraction_Mean_Mean + ctot = tofloat(cfin * cfin@scale_factor + cfin@add_offset) + delete(cfin) + cif = where(cif.gt.0.999, cif@_FillValue, cif) + lif = 1.0 - (1.0 - ctot) / (1.0 - cif) + lif = where(lif.lt.0, 0, lif) + tmpvar = fin->Cloud_Water_Path_Ice_Mean_Mean ; read ice water path + tmpvar_fv = tmpvar@_FillValue + tmpvar := tofloat(tmpvar) + tmpvar := where(tmpvar.eq.tofloat(tmpvar_fv), \ + default_fillvalue("float"), \ + tmpvar) + tmpvar = tmpvar * cif ; convert iwp in-cloud value to gridbox avg + invar = invar * lif ; convert lwp in-cloud value to grid-box avg + invar = invar + tmpvar ; clwvi = lwp + iwp + delete(tmpvar) + delete(lif) + delete(cif) + invar = 0.001 * invar ; [g/m2] --> [kg/m2] + end if + + ; lwp and iwp are in-cloud values + ; convert lwp/iwp to grid-box averages by multiplying with + ; average cloud fraction (not optimum but best we can do at the moment) + if (any((/"clivi", "iwpStderr", "lwpStderr"/) .eq. 
VAR(vv))) then + + ; Read cirrus fraction (0-1) + ; cfin = fin->Cirrus_Fraction_SWIR_FMean + cfin = fin->Cirrus_Fraction_Infrared_FMean + cf = tofloat(cfin * cfin@scale_factor + cfin@add_offset) + delete(cfin) + if (VAR(vv).eq."lwpStderr") then + cfin = fin->Cloud_Fraction_Mean_Mean + ctot = tofloat(cfin * cfin@scale_factor + cfin@add_offset) + delete(cfin) + cif = where(cf.gt.0.999, cf@_FillValue, cf) + cf = 1.0 - (1.0 - ctot) / (1.0 - cif) + cf = where(cf.lt.0, 0, cf) + delete(cif) + delete(ctot) + end if + invar = invar * cf ; ; "grid-box average" lwp/iwp + delete(cf) + invar = 0.001 * invar ; [g/m2] --> [kg/m2] + end if + + invar@_FillValue = default_fillvalue("float") + copy_VarCoords(invar_coords, invar) + if (isatt(invar_coords, "scale_factor")) then + invar = invar * tofloat(invar_coords@scale_factor) + end if + if (isatt(invar_coords, "add_offset")) then + invar = invar + tofloat(invar_coords@add_offset) + end if + + if (VAR(vv).eq."clt") then + invar = 100.0 * invar ; [1] --> [%] + end if + + ; Create output variable + lat = fin->YDim + lon = fin->XDim + output = new((/1, dimsizes(lat), dimsizes(lon)/), float) + output!0 = "time" + output!1 = "lat" + output!2 = "lon" + output&time = cd_inv_calendar(year, month, 15, 0, 0, 0, TUNITS, 0) + output&lat = lat + output&lon = lon + output(0, :, :) = (/invar/) + delete(invar) + delete(invar_coords) + + ; Format coordinates + format_coords(output, year + sprinti("%0.2i", month) + "01", \ + year + sprinti("%0.2i", month) + dm, FREQ(vv)) + + ; Set variable attributes + tmp = format_variable(output, VAR(vv), CMOR_TABLE(vv)) + delete(output) + output = tmp + delete(tmp) + + ; Calculate coordinate bounds + bounds = guess_coord_bounds(output, FREQ(vv)) + + ; Set global attributes + if (VAR(vv).ne."od550aer") then + gAtt = set_global_atts(OBSNAME, TIER, SOURCE, REF1, COMMENT) + else + gAtt = set_global_atts(OBSNAME, TIER, SOURCE, REF2, COMMENT) + end if + + ; Output file + DATESTR = \ + year + sprinti("%0.2i", month) + 
"-" + year + sprinti("%0.2i", month) + fout = output_dir_path + \ + str_join((/"OBS", OBSNAME, TYPE, str_sub_str(VERSION, "_", "-"), \ + MIP(vv), VAR(vv), DATESTR/), "_") + ".nc" + + ; Write variable + write_nc(fout, VAR(vv), output, bounds, gAtt) + delete(gAtt) + delete(output) + delete(bounds) + + end do + + end do + +end diff --git a/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py b/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py new file mode 100644 index 0000000000..f43bdc8d2b --- /dev/null +++ b/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py @@ -0,0 +1,447 @@ +"""ESMValTool CMORizer for MODIS aqua data. + +Tier + Tier 3 + +Source + https://ladsweb.modaps.eosdis.nasa.gov/search/order + +Requirements + python hdf reader, https://pypi.org/project/pyhdf/ + +Modification history + 20260216-Fruttarol_Noah: Written based on the original MODIS CMORization NCL script created by evaldsson_martin, which can be found in at ESMValTool/esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl + + +""" + +import logging +import os +import re +from datetime import datetime, timedelta + +import iris +import numpy as np +from dask import array as da +from esmvalcore.cmor.table import CMOR_TABLES +from pyhdf.SD import SD, SDC + +from esmvaltool.cmorizers.data import utilities as utils + +logger = logging.getLogger(__name__) + + +def collect_files(in_dir: str, cfg) -> list: + """Compose input file list and download if missing.""" + + attrs = cfg["attributes"] + # rootpath / Tier{tier}/{dataset}/{version}/{frequency}/{short_name} + # in_dir = rootpath / Tier{tier}/{dataset} + in_dir = os.path.join(in_dir) + fname_pattern = ( + f"{attrs['version']}\\.A.*\\.hdf$" # e.g. 
"MYD08_M3.A*.hdf" + ) + + files_a = next(os.walk(in_dir), (None, None, []))[ + 2 + ] # get list of files in in_dir + files = [] + for filename in files_a: + if re.search(fname_pattern, filename): + files.append(filename) + + if len(files) == 0: + raise Exception("No files found", in_dir) + + return files + + +def get_year_from_filepath(filepath: str, cfg) -> int: + """get year from first four characters of last section of basename""" + + attrs = cfg["attributes"] + basename = os.path.splitext(os.path.basename(filepath))[0] + offset = len( + f"{attrs['version']}.A" + ) # length of version string + dot + 'A' character + year = int(basename[offset : offset + 4]) + return year + + +def get_month_from_filepath(filepath: str, cfg) -> int: + """get month from timerange section of basename""" + + attrs = cfg["attributes"] + basename = os.path.splitext(os.path.basename(filepath))[0] + offset = len( + f"{attrs['version']}.A" + ) # length of version string + dot + 'A' character + doy = int(basename[offset + 4 : offset + 7]) # day of year + year = int(basename[offset : offset + 4]) + # Convert day-of-year to month + date = datetime(year, 1, 1) + timedelta(days=doy - 1) + month = date.month + logger.debug( + f"Extracted month {month}, doy {doy}, year {year} from filepath {filepath}" + ) + return month + + +def group_files_by_year(filepaths: list, cfg) -> dict: + """group filepaths by year using get_year_from_filepath function""" + years_D = dict() + for filename in filepaths: + year = get_year_from_filepath(filename, cfg) + + if year not in years_D: + years_D[year] = [filename] + else: + years_D[year].append(filename) + + return years_D + + +def read_hdf( + in_dir: str, + filepath: str, + year: int, + month: int, + raw_name: str, + short_name: str, + **extras, +): + """Read HDF file and build iris cube with auxiliary data for special variables.""" + f = SD(os.path.join(in_dir, filepath), SDC.READ) + + data_obj = f.select(raw_name) + data = data_obj.get().astype(np.float32) + + 
# mask fill values + if hasattr(data_obj, "fill_value"): + data[data == data_obj.fill_value] = np.nan + + # mask values outside valid range + if hasattr(data_obj, "valid_range"): + valid_range = data_obj.valid_range + logger.debug(f"Masking {short_name} data outside range {valid_range}") + data[(data < valid_range[0]) | (data > valid_range[1])] = np.nan + + # apply scale factor and add offset if they exist + if hasattr(data_obj, "scale_factor"): + scale_factor = data_obj.scale_factor + logger.debug(f"Scaling {short_name} data by {scale_factor}") + data *= scale_factor + if hasattr(data_obj, "add_offset"): + add_offset = data_obj.add_offset + logger.debug(f"Offsetting {short_name} data by {add_offset}") + data += add_offset + + # for certain variables, we need to read additional datasets to compute the final variable + + # liquid water path in-cloud, need to use cirrus fraction and cloud fraction to estimate liquid fraction, then multiply by in-cloud lwp to get grid-box average lwp + if short_name in ["clwvi"]: + cfirFm_obj = f.select("Cirrus_Fraction_Infrared_FMean") + cfin = cfirFm_obj.get().astype(np.float32) + # unpack with scale_factor and add_offset + cif = (cfin * cfirFm_obj.scale_factor) + cfirFm_obj.add_offset + cif[cif > 0.999] = np.nan + + cfmm_obj = f.select("Cloud_Fraction_Mean_Mean") + cfmm = cfmm_obj.get().astype(np.float32) + ctot = (cfmm * cfmm_obj.scale_factor) + cfmm_obj.add_offset + + # liquid fraction estimated assuming random overlap: + # ctot = 1 - (1 - cif) * (1 - lif) + # --> lif = 1 - (1 - ctot) / (1 - cif) + lif = 1.0 - (1.0 - ctot) / (1.0 - cif) + lif[lif < 0] = 0 + + # ice water path + cwpimm_obj = f.select("Cloud_Water_Path_Ice_Mean_Mean") + cwpimm = cwpimm_obj.get().astype(np.float32) + cwpimm_fv = cwpimm_obj._FillValue + cwpimm[cwpimm == cwpimm_fv] = np.nan # mask all fill values + + # convert iwp in-cloud to gridbox avg using masked cirrus fraction + iwp = cwpimm * cif + + # liquid water path + # convert lwp in-cloud value to 
grid-box avg + lwp = data * lif + + data = lwp + iwp + + # these variables are all in-cloud values, so we need to multiply by cirrus fraction to get grid-box average + elif short_name in ["clivi", "iwpStderr", "lwpStderr"]: + # cirrus fraction (0-1) + cfswirFm_obj = f.select("Cirrus_Fraction_Infrared_FMean") + cfswirFm = cfswirFm_obj.get().astype(np.float32) + # unpack with scale_factor and add_offset + cf = (cfswirFm * cfswirFm_obj.scale_factor) + cfswirFm_obj.add_offset + cf[cf > 0.999] = np.nan + + # lwpStderr is only for liquid water path, so we want to use the liquid fraction to convert to grid-box average + if short_name in ["lwpStderr"]: + cfmm_obj = f.select("Cloud_Fraction_Mean_Mean") + cfmm = cfmm_obj.get().astype(np.float32) + # unpack with scale_factor and add_offset + ctot = (cfmm * cfmm_obj.scale_factor) + cfmm_obj.add_offset + + cif = cf + cif[cf > 0.999] = np.nan + + cf = 1.0 - (1.0 - ctot) / (1.0 - cif) + cf[cf < 0] = 0 + + data *= cf # "grid-box average" lwp/iwp + + # mask all null values (nan, inf) + data = np.ma.masked_invalid(data) + + # Handle units - convert invalid units to '1' + units_str = data_obj.attributes().get("units", "1") + if units_str in ["none", "None", ""]: + units_str = "1" + + # Create time coordinate for the start of the month, with bounds covering the whole month + time_point = datetime( + year=year, month=month, day=15 + ) # use 15th as representative time for the month + time_bounds_lower = datetime(year=year, month=month, day=1) + time_bounds_upper = datetime( + year=year + (month == 12), month=month + 1 - (month == 12) * 12, day=1 + ) - timedelta( + days=1 + ) # end of month is the day before the first day of the next month + + # Convert time to days since 1850-01-01 + time_units = datetime(1850, 1, 1) + delta_1850 = (time_point - time_units).days + # After creating time delta, add bounds: + delta_bounds_lower = (time_bounds_lower - time_units).days + delta_bounds_upper = (time_bounds_upper - time_units).days + + 
time_coord = iris.coords.DimCoord( + points=[delta_1850], + standard_name="time", + units="days since 1850-01-01 00:00:00", + bounds=[[delta_bounds_lower, delta_bounds_upper]], + ) + data = data[np.newaxis, :, :] # Add time dimension at start + + lats = f.select("YDim").get().astype(np.float64) + lat_coord = iris.coords.DimCoord( + lats, + standard_name="latitude", + units="degrees_north", + bounds=[ + ( # calculate bounds as midpoint between adjacent latitudes, assuming regular grid + lats[i] - 0.5 * np.abs(lats[1] - lats[0]), + lats[i] + 0.5 * np.abs(lats[1] - lats[0]), + ) + for i in range(len(lats)) + ], + ) + + lons = f.select("XDim").get().astype(np.float64) + lon_coord = iris.coords.DimCoord( + lons, + standard_name="longitude", + units="degrees_east", + bounds=[ + ( # calculate bounds as midpoint between adjacent longitudes, assuming regular grid + lons[i] - 0.5 * np.abs(lons[1] - lons[0]), + lons[i] + 0.5 * np.abs(lons[1] - lons[0]), + ) + for i in range(len(lons)) + ], + ) + + if short_name == "od550aer": + # add auxiliary coordinate for wavelength (550 nm for this variable, which is AOD at 550 nm) + wavelength_coord = iris.coords.AuxCoord( + [550], + standard_name="radiation_wavelength", + var_name="wavelength", + units="nm", + ) + aux_coords_and_dims = [ + ( + wavelength_coord, + None, + ), # wavelength is a scalar auxiliary coordinate + ] + else: + aux_coords_and_dims = ( + None # no auxiliary coordinates for other variables + ) + + cube = iris.cube.Cube( + data, + long_name=data_obj.attributes().get("long_name", raw_name), + var_name=short_name, + units=units_str, + dim_coords_and_dims=[ + (time_coord, 0), + (lat_coord, 1), + (lon_coord, 2), + ], + aux_coords_and_dims=aux_coords_and_dims, + ) + + return cube + + +def convert( + cube: iris.cube.Cube, + cmor_info, + attrs, +) -> iris.cube.Cube: + """Convert cube to CMOR standards based on data type and cmor_info.""" + + if attrs.get("reference") is None: + attrs["reference"] = 
cmor_info.attributes["reference"] + + cube.convert_units(cmor_info.units) + + if cube.coord("longitude").points.min() < 0: + # convert from [-180, 180] to [0, 360] + cube.coord("longitude").points = cube.coord("longitude").points + 180.0 + # roll the data as part of the longitude conversion to maintain correct order (assuming regular grid and that longitude is the last dimension) + nlon = len(cube.coord("longitude").points) + dim_lon = 2 + cube.data = da.roll(cube.core_data(), int(nlon / 2), axis=dim_lon) + + if np.diff(cube.coord("latitude").points)[0] < 0: + # convert [90,-90] to [-90,90] + cube.coord("latitude").points = cube.coord("latitude").points[::-1] + # flip the data + cube.data = cube.data[:, ::-1, :] # latitude is axis=1 + + utils.set_global_atts(cube, attrs) + utils.fix_var_metadata(cube, cmor_info) + utils.fix_dim_coordnames(cube) + + utils.fix_coords(cube) + + return cube + + +def _extract_variable( + filepaths: list, + var_d, + attrs, + cmor_table, + cfg, + in_dir: str, + out_dir: str, + year: int, +) -> None: + """Extract variable data from input files, convert to CMOR standards, and save.""" + + # add mip to attributes for use in cmor table lookup and global attributes + attrs["mip"] = var_d["mip"] + + # get custom cmor table for the target project if specified + if var_d.get("project_id"): + cmor_table_b = CMOR_TABLES.get(var_d.get("project_id")) + cmor_info = cmor_table_b.get_variable( + var_d.get("mip"), var_d.get("short_name") + ) + else: + cmor_info = cmor_table.get_variable( + var_d.get("mip"), var_d.get("short_name") + ) + + # if this variable has a reference specified in the config, use that, otherwise use the one from the cmor table + if var_d.get("reference"): + attrs["reference"] = var_d.get("reference") + + if not cmor_table: + raise Exception( + f'project_id "{var_d.get("project_id", "None")}" is invalid, no CMOR table found. 
Valid options: {", ".join(list(CMOR_TABLES.keys()))}' + ) + + # check if cmor info was found for the variable, if not log an error and skip + if cmor_info is None: + logger.error( + f"CMOR info not found for variable {var_d.get('short_name')} in mip {var_d.get('mip')}" + ) + return + logger.debug( + f"CMOR info for variable {var_d.get('short_name')}: {cmor_info}" + ) + + logger.info( + f"CMORizing variable {var_d.get('short_name')} from input set {var_d.get('raw_name')}" + ) + + cubes = iris.cube.CubeList() + for filepath in filepaths: + month = get_month_from_filepath(filepath, cfg) + logger.debug( + f"File: {filepath} -> var={var_d.get('short_name')} year={year}, month={month}" + ) + cube = read_hdf(in_dir, filepath, year, month, **var_d) + + logger.debug( + f"Read cube for variable {var_d.get('short_name')}: {cube}, with attributes: {cube.attributes}" + ) + cubes.append(cube) + + if len(cubes) > 0: + cube = cubes.concatenate()[ + 0 + ] # concatenate along time dimension and take the resulting cube + else: + cube = cubes[0] + + cube = convert(cube, cmor_info, attrs) + + # add dataset_id to global attributes for provenance + cube_attrs = cube.attributes.globals + cube_attrs["dataset_id"] = attrs["dataset_id"] + logger.debug(f"Saving variable {cube} with attributes: {cube_attrs}") + + utils.save_variable( + cube, + var_d.get("short_name"), + out_dir, + cube_attrs, + unlimited_dimensions=["time"], + ) + + +def cmorization(in_dir, out_dir, cfg, cfg_user, start_date, end_date): + """Run CMORizer for MODIS.""" + + cmor_table = cfg["cmor_table"] + glob_attrs = cfg["attributes"] + + # get relevant files + filepaths = collect_files(in_dir, cfg) + + # group by year + years_d = group_files_by_year(filepaths, cfg) + years = sorted(list(years_d.keys())) + + # iterate over the variables to be CMORized + for var, var_d in cfg["variables"].items(): + logger.info( + "CMORizing var %s from input set %s", var, var_d["raw_name"] + ) + var_d["short_name"] = var + logger.debug("\n" + 
format(var_d)) + + # cmorize by year, merging all months at the end + for year in years: + _extract_variable( + years_d[year], + var_d, + glob_attrs, + cmor_table, + cfg, + in_dir, + out_dir, + year, + ) diff --git a/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py b/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py new file mode 100644 index 0000000000..d7e2b5f088 --- /dev/null +++ b/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py @@ -0,0 +1,447 @@ +"""ESMValTool CMORizer for MODIS terra data. + +Tier + Tier 3 + +Source + https://ladsweb.modaps.eosdis.nasa.gov/search/order + +Requirements + python hdf reader, https://pypi.org/project/pyhdf/ + +Modification history + 20260216-Fruttarol_Noah: Written based on the original MODIS CMORization NCL script created by evaldsson_martin, which can be found at ESMValTool/esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl + + +""" + +import logging +import os +import re +from datetime import datetime, timedelta + +import iris +import numpy as np +from dask import array as da +from esmvalcore.cmor.table import CMOR_TABLES +from pyhdf.SD import SD, SDC + +from esmvaltool.cmorizers.data import utilities as utils + +logger = logging.getLogger(__name__) + + +def collect_files(in_dir: str, cfg) -> list: + """Compose input file list and download if missing.""" + + attrs = cfg["attributes"] + # rootpath / Tier{tier}/{dataset}/{version}/{frequency}/{short_name} + # in_dir = rootpath / Tier{tier}/{dataset} + in_dir = os.path.join(in_dir) + fname_pattern = ( + f"{attrs['version']}\\.A.*\\.hdf$" # e.g. 
"MOD08_M3.A*.hdf" + ) + + files_a = next(os.walk(in_dir), (None, None, []))[ + 2 + ] # get list of files in in_dir + files = [] + for filename in files_a: + if re.search(fname_pattern, filename): + files.append(filename) + + if len(files) == 0: + raise Exception("No files found", in_dir) + + return files + + +def get_year_from_filepath(filepath: str, cfg) -> int: + """get year from first four characters of last section of basename""" + + attrs = cfg["attributes"] + basename = os.path.splitext(os.path.basename(filepath))[0] + offset = len( + f"{attrs['version']}.A" + ) # length of version string + dot + 'A' character + year = int(basename[offset : offset + 4]) + return year + + +def get_month_from_filepath(filepath: str, cfg) -> int: + """get month from timerange section of basename""" + + attrs = cfg["attributes"] + basename = os.path.splitext(os.path.basename(filepath))[0] + offset = len( + f"{attrs['version']}.A" + ) # length of version string + dot + 'A' character + doy = int(basename[offset + 4 : offset + 7]) # day of year + year = int(basename[offset : offset + 4]) + # Convert day-of-year to month + date = datetime(year, 1, 1) + timedelta(days=doy - 1) + month = date.month + logger.debug( + f"Extracted month {month}, doy {doy}, year {year} from filepath {filepath}" + ) + return month + + +def group_files_by_year(filepaths: list, cfg) -> dict: + """group filepaths by year using get_year_from_filepath function""" + years_D = dict() + for filename in filepaths: + year = get_year_from_filepath(filename, cfg) + + if year not in years_D: + years_D[year] = [filename] + else: + years_D[year].append(filename) + + return years_D + + +def read_hdf( + in_dir: str, + filepath: str, + year: int, + month: int, + raw_name: str, + short_name: str, + **extras, +): + """Read HDF file and build iris cube with auxiliary data for special variables.""" + f = SD(os.path.join(in_dir, filepath), SDC.READ) + + data_obj = f.select(raw_name) + data = data_obj.get().astype(np.float32) + + 
# mask fill values + if hasattr(data_obj, "fill_value"): + data[data == data_obj.fill_value] = np.nan + + # mask values outside valid range + if hasattr(data_obj, "valid_range"): + valid_range = data_obj.valid_range + logger.debug(f"Masking {short_name} data outside range {valid_range}") + data[(data < valid_range[0]) | (data > valid_range[1])] = np.nan + + # apply scale factor and add offset if they exist + if hasattr(data_obj, "scale_factor"): + scale_factor = data_obj.scale_factor + logger.debug(f"Scaling {short_name} data by {scale_factor}") + data *= scale_factor + if hasattr(data_obj, "add_offset"): + add_offset = data_obj.add_offset + logger.debug(f"Offsetting {short_name} data by {add_offset}") + data += add_offset + + # for certain variables, we need to read additional datasets to compute the final variable + + # liquid water path in-cloud, need to use cirrus fraction and cloud fraction to estimate liquid fraction, then multiply by in-cloud lwp to get grid-box average lwp + if short_name in ["clwvi"]: + cfirFm_obj = f.select("Cirrus_Fraction_Infrared_FMean") + cfin = cfirFm_obj.get().astype(np.float32) + # unpack with scale_factor and add_offset + cif = (cfin * cfirFm_obj.scale_factor) + cfirFm_obj.add_offset + cif[cif > 0.999] = np.nan + + cfmm_obj = f.select("Cloud_Fraction_Mean_Mean") + cfmm = cfmm_obj.get().astype(np.float32) + ctot = (cfmm * cfmm_obj.scale_factor) + cfmm_obj.add_offset + + # liquid fraction estimated assuming random overlap: + # ctot = 1 - (1 - cif) * (1 - lif) + # --> lif = 1 - (1 - ctot) / (1 - cif) + lif = 1.0 - (1.0 - ctot) / (1.0 - cif) + lif[lif < 0] = 0 + + # ice water path + cwpimm_obj = f.select("Cloud_Water_Path_Ice_Mean_Mean") + cwpimm = cwpimm_obj.get().astype(np.float32) + cwpimm_fv = cwpimm_obj._FillValue + cwpimm[cwpimm == cwpimm_fv] = np.nan # mask all fill values + + # convert iwp in-cloud to gridbox avg using masked cirrus fraction + iwp = cwpimm * cif + + # liquid water path + # convert lwp in-cloud value to 
grid-box avg + lwp = data * lif + + data = lwp + iwp + + # these variables are all in-cloud values, so we need to multiply by cirrus fraction to get grid-box average + elif short_name in ["clivi", "iwpStderr", "lwpStderr"]: + # cirrus fraction (0-1) + cfswirFm_obj = f.select("Cirrus_Fraction_Infrared_FMean") + cfswirFm = cfswirFm_obj.get().astype(np.float32) + # unpack with scale_factor and add_offset + cf = (cfswirFm * cfswirFm_obj.scale_factor) + cfswirFm_obj.add_offset + cf[cf > 0.999] = np.nan + + # lwpStderr is only for liquid water path, so we want to use the liquid fraction to convert to grid-box average + if short_name in ["lwpStderr"]: + cfmm_obj = f.select("Cloud_Fraction_Mean_Mean") + cfmm = cfmm_obj.get().astype(np.float32) + # unpack with scale_factor and add_offset + ctot = (cfmm * cfmm_obj.scale_factor) + cfmm_obj.add_offset + + cif = cf + cif[cf > 0.999] = np.nan + + cf = 1.0 - (1.0 - ctot) / (1.0 - cif) + cf[cf < 0] = 0 + + data *= cf # "grid-box average" lwp/iwp + + # mask all null values (nan, inf) + data = np.ma.masked_invalid(data) + + # Handle units - convert invalid units to '1' + units_str = data_obj.attributes().get("units", "1") + if units_str in ["none", "None", ""]: + units_str = "1" + + # Create time coordinate for the start of the month, with bounds covering the whole month + time_point = datetime( + year=year, month=month, day=15 + ) # use 15th as representative time for the month + time_bounds_lower = datetime(year=year, month=month, day=1) + time_bounds_upper = datetime( + year=year + (month == 12), month=month + 1 - (month == 12) * 12, day=1 + ) - timedelta( + days=1 + ) # end of month is the day before the first day of the next month + + # Convert time to days since 1850-01-01 + time_units = datetime(1850, 1, 1) + delta_1850 = (time_point - time_units).days + # After creating time delta, add bounds: + delta_bounds_lower = (time_bounds_lower - time_units).days + delta_bounds_upper = (time_bounds_upper - time_units).days + + 
time_coord = iris.coords.DimCoord( + points=[delta_1850], + standard_name="time", + units="days since 1850-01-01 00:00:00", + bounds=[[delta_bounds_lower, delta_bounds_upper]], + ) + data = data[np.newaxis, :, :] # Add time dimension at start + + lats = f.select("YDim").get().astype(np.float64) + lat_coord = iris.coords.DimCoord( + lats, + standard_name="latitude", + units="degrees_north", + bounds=[ + ( # calculate bounds as midpoint between adjacent latitudes, assuming regular grid + lats[i] - 0.5 * np.abs(lats[1] - lats[0]), + lats[i] + 0.5 * np.abs(lats[1] - lats[0]), + ) + for i in range(len(lats)) + ], + ) + + lons = f.select("XDim").get().astype(np.float64) + lon_coord = iris.coords.DimCoord( + lons, + standard_name="longitude", + units="degrees_east", + bounds=[ + ( # calculate bounds as midpoint between adjacent longitudes, assuming regular grid + lons[i] - 0.5 * np.abs(lons[1] - lons[0]), + lons[i] + 0.5 * np.abs(lons[1] - lons[0]), + ) + for i in range(len(lons)) + ], + ) + + if short_name == "od550aer": + # add auxiliary coordinate for wavelength (550 nm for this variable, which is AOD at 550 nm) + wavelength_coord = iris.coords.AuxCoord( + [550], + standard_name="radiation_wavelength", + var_name="wavelength", + units="nm", + ) + aux_coords_and_dims = [ + ( + wavelength_coord, + None, + ), # wavelength is a scalar auxiliary coordinate + ] + else: + aux_coords_and_dims = ( + None # no auxiliary coordinates for other variables + ) + + cube = iris.cube.Cube( + data, + long_name=data_obj.attributes().get("long_name", raw_name), + var_name=short_name, + units=units_str, + dim_coords_and_dims=[ + (time_coord, 0), + (lat_coord, 1), + (lon_coord, 2), + ], + aux_coords_and_dims=aux_coords_and_dims, + ) + + return cube + + +def convert( + cube: iris.cube.Cube, + cmor_info, + attrs, +) -> iris.cube.Cube: + """Convert cube to CMOR standards based on data type and cmor_info.""" + + if attrs.get("reference") is None: + attrs["reference"] = 
cmor_info.attributes["reference"] + + cube.convert_units(cmor_info.units) + + if cube.coord("longitude").points.min() < 0: + # convert from [-180, 180] to [0, 360] + cube.coord("longitude").points = cube.coord("longitude").points + 180.0 + # roll the data as part of the longitude conversion to maintain correct order (assuming regular grid and that longitude is the last dimension) + nlon = len(cube.coord("longitude").points) + dim_lon = 2 + cube.data = da.roll(cube.core_data(), int(nlon / 2), axis=dim_lon) + + if np.diff(cube.coord("latitude").points)[0] < 0: + # convert [90,-90] to [-90,90] + cube.coord("latitude").points = cube.coord("latitude").points[::-1] + # flip the data + cube.data = cube.data[:, ::-1, :] # latitude is axis=1 + + utils.set_global_atts(cube, attrs) + utils.fix_var_metadata(cube, cmor_info) + utils.fix_dim_coordnames(cube) + + utils.fix_coords(cube) + + return cube + + +def _extract_variable( + filepaths: list, + var_d, + attrs, + cmor_table, + cfg, + in_dir: str, + out_dir: str, + year: int, +) -> None: + """Extract variable data from input files, convert to CMOR standards, and save.""" + + # add mip to attributes for use in cmor table lookup and global attributes + attrs["mip"] = var_d["mip"] + + # get custom cmor table for the target project if specified + if var_d.get("project_id"): + cmor_table_b = CMOR_TABLES.get(var_d.get("project_id")) + cmor_info = cmor_table_b.get_variable( + var_d.get("mip"), var_d.get("short_name") + ) + else: + cmor_info = cmor_table.get_variable( + var_d.get("mip"), var_d.get("short_name") + ) + + # if this variable has a reference specified in the config, use that, otherwise use the one from the cmor table + if var_d.get("reference"): + attrs["reference"] = var_d.get("reference") + + if not cmor_table: + raise Exception( + f'project_id "{var_d.get("project_id", "None")}" is invalid, no CMOR table found. 
Valid options: {", ".join(list(CMOR_TABLES.keys()))}' + ) + + # check if cmor info was found for the variable, if not log an error and skip + if cmor_info is None: + logger.error( + f"CMOR info not found for variable {var_d.get('short_name')} in mip {var_d.get('mip')}" + ) + return + logger.debug( + f"CMOR info for variable {var_d.get('short_name')}: {cmor_info}" + ) + + logger.info( + f"CMORizing variable {var_d.get('short_name')} from input set {var_d.get('raw_name')}" + ) + + cubes = iris.cube.CubeList() + for filepath in filepaths: + month = get_month_from_filepath(filepath, cfg) + logger.debug( + f"File: {filepath} -> var={var_d.get('short_name')} year={year}, month={month}" + ) + cube = read_hdf(in_dir, filepath, year, month, **var_d) + + logger.debug( + f"Read cube for variable {var_d.get('short_name')}: {cube}, with attributes: {cube.attributes}" + ) + cubes.append(cube) + + if len(cubes) > 0: + cube = cubes.concatenate()[ + 0 + ] # concatenate along time dimension and take the resulting cube + else: + cube = cubes[0] + + cube = convert(cube, cmor_info, attrs) + + # add dataset_id to global attributes for provenance + cube_attrs = cube.attributes.globals + cube_attrs["dataset_id"] = attrs["dataset_id"] + logger.debug(f"Saving variable {cube} with attributes: {cube_attrs}") + + utils.save_variable( + cube, + var_d.get("short_name"), + out_dir, + cube_attrs, + unlimited_dimensions=["time"], + ) + + +def cmorization(in_dir, out_dir, cfg, cfg_user, start_date, end_date): + """Run CMORizer for MODIS.""" + + cmor_table = cfg["cmor_table"] + glob_attrs = cfg["attributes"] + + # get relevant files + filepaths = collect_files(in_dir, cfg) + + # group by year + years_d = group_files_by_year(filepaths, cfg) + years = sorted(list(years_d.keys())) + + # iterate over the variables to be CMORized + for var, var_d in cfg["variables"].items(): + logger.info( + "CMORizing var %s from input set %s", var, var_d["raw_name"] + ) + var_d["short_name"] = var + logger.debug("\n" + 
format(var_d)) + + # cmorize by year, merging all months at the end + for year in years: + _extract_variable( + years_d[year], + var_d, + glob_attrs, + cmor_table, + cfg, + in_dir, + out_dir, + year, + ) diff --git a/pyproject.toml b/pyproject.toml index f7b907c2c6..28877fe61d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ dependencies = [ "packaging", "pandas", "progressbar2", + "pyhdf", "pyproj>=2.1", "pys2index", "python-dateutil", From 7702714619a59a589fc3e4981d4eee54d0363c0a Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Tue, 17 Feb 2026 19:24:22 +0000 Subject: [PATCH 02/13] Applied ruff format changes --- .../cmorizers/data/cmor_config/MISR.yml | 23 +++ .../data/formatters/datasets/misr.py | 143 ++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 esmvaltool/cmorizers/data/cmor_config/MISR.yml create mode 100644 esmvaltool/cmorizers/data/formatters/datasets/misr.py diff --git a/esmvaltool/cmorizers/data/cmor_config/MISR.yml b/esmvaltool/cmorizers/data/cmor_config/MISR.yml new file mode 100644 index 0000000000..c7348afa5d --- /dev/null +++ b/esmvaltool/cmorizers/data/cmor_config/MISR.yml @@ -0,0 +1,23 @@ +--- +# Global attributes of NetCDF file +attributes: + dataset_id: MISR + project_id: CMOR6 + tier: 2 + version: 'V0032' + modeling_realm: sat + source: 'ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/' + reference: 'misr' + comment: '' + files: 'MISR_AM1_CGAS_*_F15_0032.nc' +# Variables to CMORize +variables: + od550aer: + raw_name: Aerosol_Optical_Depth + mip: AERmon + abs550aer: + raw_name: Absorbing_Optical_Depth + mip: AERmon + # ae: + # raw_name: Angstrom_Exponent_550_860 + # mip: AERmon diff --git a/esmvaltool/cmorizers/data/formatters/datasets/misr.py b/esmvaltool/cmorizers/data/formatters/datasets/misr.py new file mode 100644 index 0000000000..c07c314e3d --- /dev/null +++ b/esmvaltool/cmorizers/data/formatters/datasets/misr.py @@ -0,0 +1,143 @@ +"""ESMValTool CMORizer for MISR data. 
+ +Tier + Tier 2 + +Source + ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/ + +""" + +import copy +import datetime as dt +import logging +import os +from pathlib import Path + +import numpy as np +import xarray as xr +from dask import array as da +from esmvalcore.cmor.table import CMOR_TABLES + +from esmvaltool.cmorizers.data import utilities as utils + +logger = logging.getLogger(__name__) + +band550 = {"name": "green_558nm", "lambda": 558} + + +def _extract_variable(short_name, var, cfg, in_dir, out_dir): + attrs = copy.deepcopy(cfg["attributes"]) + attrs["mip"] = var["mip"] + ver = attrs["version"] + files = attrs["files"] + raw_var = var.get("raw_name", short_name) + + cmor_table = CMOR_TABLES[attrs["project_id"]] + cmor_info = cmor_table.get_variable(var["mip"], short_name) + + logger.info("CMORizing variable '%s' from file(s) '%s'", short_name, files) + + """Extract variable.""" + # load data + for filepath in Path(os.path.join(in_dir, ver)).glob(files): + xrds = xr.open_dataset(filepath, group="Aerosol_Parameter_Average") + xrvar = xrds.sel(Band=band550["name"], Optical_Depth_Range="all")[ + raw_var + ] + + # change order of latitude and longitude coordinates + # xrvar = xrvar.transpose() + + # Add additional coordinates before converting to an iris cube, as this is easier with xarray + + # Time not present in source data, needs to be added manually + # Determine time from filename: + fileparts = str(filepath).split("_") + year = int(fileparts[-3]) + monthstr = fileparts[-4] + month = [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ].index(monthstr) + 1 + days_since_1850 = dt.date(year, month, 15) - dt.date(1850, 1, 1) + lb_since_1850 = dt.date(year, month, 1) - dt.date(1850, 1, 1) + if month == 12: + ub_since_1850 = dt.date(year + 1, 1, 1) - dt.date(1850, 1, 1) + else: + ub_since_1850 = dt.date(year, month + 1, 1) - dt.date(1850, 1, 1) + + xrvar = xrvar.assign_coords(time=days_since_1850.days) 
+ xrvar = xrvar.expand_dims("time", axis=2) + xrvar["time"].attrs["units"] = "days since 1850-01-01 00:00:00" + + if short_name in ["od550aer", "abs550aer"]: + xrvar = xrvar.assign_coords(radiation_wavelength=band550["lambda"]) + xrvar["radiation_wavelength"].attrs["units"] = "nm" + + cube = xrvar.to_iris() + + # Fix metadata + cube.coord("Geodetic Latitude").rename("latitude") + cube.coord("Geodetic Longitude").rename("longitude") + + # add time bounds + cube.coord("time").bounds = np.array( + [ub_since_1850.days, lb_since_1850.days] + ) + + utils.fix_var_metadata(cube, cmor_info) + utils.set_global_atts(cube, attrs) + + utils.fix_dim_coordnames(cube) + + # When Dask tries to roll this cube, it fails because it can't chunk this properly + # So here we replicate the part of fix_coords that does that, except with numpy.roll + # instead of dask.roll. + # However, it seems that those last two lines had no effect on the results... + # cube.data has shape (360, 720, 1) i.e. (lat, lon, time) + # so the original code tried to roll the cube on the time axis + # I could hard-code the lon axis (like they did), but instead I try to autodetect it. + cube_coord = cube.coord("longitude") + logger.info("Fixing longitude...") + if cube_coord.ndim == 1: + if cube_coord.points[0] < 0.0 and cube_coord.points[-1] < 181.0: + cube_coord.points = cube_coord.points + 180.0 + cube.attributes["geospatial_lon_min"] = 0.0 + cube.attributes["geospatial_lon_max"] = 360.0 + nlon = len(cube_coord.points) + axis = cube.coords().index(cube_coord) + shift = nlon // 2 + cube.data = da.roll(cube.core_data(), shift, axis=axis) + + utils.fix_coords(cube) + + # fix the wavelength coordinate information. 
+ if short_name in ["od550aer", "abs550aer"]: + cube.coord("radiation_wavelength").var_name = "wavelength" + cube.coord("wavelength").standard_name = "radiation_wavelength" + + utils.set_global_atts(cube, attrs) + + # Save variable + utils.save_variable( + cube, short_name, out_dir, attrs, unlimited_dimensions=["time"] + ) + + +def cmorization(in_dir, out_dir, cfg, cfg_user, start_date, end_date): + """Run CMORizer for MISR.""" + cfg.pop("cmor_table") + + for short_name, var in cfg["variables"].items(): + _extract_variable(short_name, var, cfg, in_dir, out_dir) From 3f94595855e71c0fd094e6af1bf639d99c0ad365 Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Tue, 17 Feb 2026 22:14:00 +0000 Subject: [PATCH 03/13] ok --- esmvaltool/cmorizers/data/formatters/datasets/misr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/esmvaltool/cmorizers/data/formatters/datasets/misr.py b/esmvaltool/cmorizers/data/formatters/datasets/misr.py index c07c314e3d..09088611b5 100644 --- a/esmvaltool/cmorizers/data/formatters/datasets/misr.py +++ b/esmvaltool/cmorizers/data/formatters/datasets/misr.py @@ -6,6 +6,9 @@ Source ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/ +Modification history + + """ import copy From 2d1228c057ce84d8fea36556ec478e68a9d3b1d2 Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Wed, 18 Feb 2026 10:08:51 +0000 Subject: [PATCH 04/13] 1 --- esmvaltool/cmorizers/data/cmor_config/MISR.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/esmvaltool/cmorizers/data/cmor_config/MISR.yml b/esmvaltool/cmorizers/data/cmor_config/MISR.yml index c7348afa5d..d57d12bc93 100644 --- a/esmvaltool/cmorizers/data/cmor_config/MISR.yml +++ b/esmvaltool/cmorizers/data/cmor_config/MISR.yml @@ -1,14 +1,13 @@ ---- # Global attributes of NetCDF file attributes: - dataset_id: MISR - project_id: CMOR6 + project_id: CMIP6 + dataset_id: MODIS-Terra + modeling_realm: atmos + type: sat tier: 2 version: 'V0032' - modeling_realm: sat source: 
'ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/' reference: 'misr' - comment: '' files: 'MISR_AM1_CGAS_*_F15_0032.nc' # Variables to CMORize variables: @@ -18,6 +17,6 @@ variables: abs550aer: raw_name: Absorbing_Optical_Depth mip: AERmon - # ae: - # raw_name: Angstrom_Exponent_550_860 - # mip: AERmon + ae: + raw_name: Angstrom_Exponent_550_860 + mip: AERmon From 2de95e9651cd0eac2f0800b3eabaa3b60bb5302d Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Thu, 19 Feb 2026 21:21:36 +0000 Subject: [PATCH 05/13] fixed problems running the cmorizer --- esmvaltool/cmorizers/data/cmor_config/MISR.yml | 9 +++------ esmvaltool/cmorizers/data/datasets.yml | 18 ++++++++++++++++++ .../cmorizers/data/formatters/datasets/misr.py | 13 +++++++++---- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/esmvaltool/cmorizers/data/cmor_config/MISR.yml b/esmvaltool/cmorizers/data/cmor_config/MISR.yml index d57d12bc93..596dc07bae 100644 --- a/esmvaltool/cmorizers/data/cmor_config/MISR.yml +++ b/esmvaltool/cmorizers/data/cmor_config/MISR.yml @@ -1,12 +1,12 @@ # Global attributes of NetCDF file attributes: project_id: CMIP6 - dataset_id: MODIS-Terra + dataset_id: MISR modeling_realm: atmos type: sat - tier: 2 + tier: 3 version: 'V0032' - source: 'ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/' + source: 'https://asdc.larc.nasa.gov/project/MISR/MIL3MAEN_004' reference: 'misr' files: 'MISR_AM1_CGAS_*_F15_0032.nc' # Variables to CMORize @@ -17,6 +17,3 @@ variables: abs550aer: raw_name: Absorbing_Optical_Depth mip: AERmon - ae: - raw_name: Angstrom_Exponent_550_860 - mip: AERmon diff --git a/esmvaltool/cmorizers/data/datasets.yml b/esmvaltool/cmorizers/data/datasets.yml index 3d50453bb9..dd4c106014 100644 --- a/esmvaltool/cmorizers/data/datasets.yml +++ b/esmvaltool/cmorizers/data/datasets.yml @@ -1003,6 +1003,24 @@ datasets: yearly granularity. Note that some (most) variables are on the goldsmr4 server, whereas others are on the goldsmr5 server. 
+ MISR: + tier: 3 + source: https://asdc.larc.nasa.gov/project/MISR/MIL3MAEN_004 + last_access: 2026-02-18 + info: | + In Products: select "MISR", "Collection 6.1" and "L3 Atmosphere Product", + click on MISR_AM1_AS_L3_V4.1. + In Time: select from 2000-01-01 to today. + In Location: skip, the global domain will be applied. + In Files: select all. + Submit the order. + A registration is required to download the data. + + Caveats + clwvi and clivi data are in-cloud values whereas CMIP5 models provide + grid-box averages --> multiply MISR clwvi and clivi values with cloud + fraction as a first guess + MLS-AURA: tier: 3 source: https://disc.gsfc.nasa.gov/datasets/ML2RHI_004/summary https://disc.gsfc.nasa.gov/datasets/ML2T_004/summary diff --git a/esmvaltool/cmorizers/data/formatters/datasets/misr.py b/esmvaltool/cmorizers/data/formatters/datasets/misr.py index 09088611b5..0a0a6ac6f1 100644 --- a/esmvaltool/cmorizers/data/formatters/datasets/misr.py +++ b/esmvaltool/cmorizers/data/formatters/datasets/misr.py @@ -1,7 +1,7 @@ """ESMValTool CMORizer for MISR data. 
Tier - Tier 2 + Tier 3 Source ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/ @@ -39,11 +39,15 @@ def _extract_variable(short_name, var, cfg, in_dir, out_dir): cmor_table = CMOR_TABLES[attrs["project_id"]] cmor_info = cmor_table.get_variable(var["mip"], short_name) - logger.info("CMORizing variable '%s' from file(s) '%s'", short_name, files) - """Extract variable.""" # load data - for filepath in Path(os.path.join(in_dir, ver)).glob(files): + logger.info( + "Loading data from file(s) '%s' in directory '%s' with version '%s'", + files, + in_dir, + ver, + ) + for filepath in Path(os.path.join(in_dir)).glob(files): xrds = xr.open_dataset(filepath, group="Aerosol_Parameter_Average") xrvar = xrds.sel(Band=band550["name"], Optical_Depth_Range="all")[ raw_var @@ -133,6 +137,7 @@ def _extract_variable(short_name, var, cfg, in_dir, out_dir): utils.set_global_atts(cube, attrs) # Save variable + logger.info(f"Saving Cube: {cube}, in directory: {out_dir}") utils.save_variable( cube, short_name, out_dir, attrs, unlimited_dimensions=["time"] ) From 4cd85751023efb8bcea51badbdf019cb213575c9 Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Thu, 19 Feb 2026 21:47:58 +0000 Subject: [PATCH 06/13] Bunched cmorized data files by year --- .../data/formatters/datasets/misr.py | 227 +++++++++++------- 1 file changed, 135 insertions(+), 92 deletions(-) diff --git a/esmvaltool/cmorizers/data/formatters/datasets/misr.py b/esmvaltool/cmorizers/data/formatters/datasets/misr.py index 0a0a6ac6f1..2d7b3b767d 100644 --- a/esmvaltool/cmorizers/data/formatters/datasets/misr.py +++ b/esmvaltool/cmorizers/data/formatters/datasets/misr.py @@ -12,13 +12,14 @@ """ import copy -import datetime as dt +import datetime import logging import os from pathlib import Path +import iris import numpy as np -import xarray as xr +import xarray from dask import array as da from esmvalcore.cmor.table import CMOR_TABLES @@ -29,6 +30,18 @@ band550 = {"name": "green_558nm", "lambda": 558} +def bunch_files_by_year(in_dir, 
files_pattern): + """Group files by year.""" + files_by_year = {} + for filepath in Path(os.path.join(in_dir)).glob(files_pattern): + fileparts = str(filepath).split("_") + year = int(fileparts[-3]) + if year not in files_by_year: + files_by_year[year] = [] + files_by_year[year].append(filepath) + return files_by_year + + def _extract_variable(short_name, var, cfg, in_dir, out_dir): attrs = copy.deepcopy(cfg["attributes"]) attrs["mip"] = var["mip"] @@ -47,97 +60,127 @@ def _extract_variable(short_name, var, cfg, in_dir, out_dir): in_dir, ver, ) - for filepath in Path(os.path.join(in_dir)).glob(files): - xrds = xr.open_dataset(filepath, group="Aerosol_Parameter_Average") - xrvar = xrds.sel(Band=band550["name"], Optical_Depth_Range="all")[ - raw_var - ] - - # change order of latitude and longitude coordinates - # xrvar = xrvar.transpose() - - # Add additional coordinates before converting to an iris cube, as this is easier with xarray - - # Time not present in source data, needs to be added manually - # Determine time from filename: - fileparts = str(filepath).split("_") - year = int(fileparts[-3]) - monthstr = fileparts[-4] - month = [ - "JAN", - "FEB", - "MAR", - "APR", - "MAY", - "JUN", - "JUL", - "AUG", - "SEP", - "OCT", - "NOV", - "DEC", - ].index(monthstr) + 1 - days_since_1850 = dt.date(year, month, 15) - dt.date(1850, 1, 1) - lb_since_1850 = dt.date(year, month, 1) - dt.date(1850, 1, 1) - if month == 12: - ub_since_1850 = dt.date(year + 1, 1, 1) - dt.date(1850, 1, 1) + files_by_year = bunch_files_by_year(in_dir, files) + for year, filepaths in files_by_year.items(): + cubes = iris.cube.CubeList() + for filepath in filepaths: + xrds = xarray.open_dataset( + filepath, group="Aerosol_Parameter_Average" + ) + xrvar = xrds.sel(Band=band550["name"], Optical_Depth_Range="all")[ + raw_var + ] + + # change order of latitude and longitude coordinates + # xrvar = xrvar.transpose() + + # Add additional coordinates before converting to an iris cube, as this is easier with 
xarray + + # Time not present in source data, needs to be added manually + # Determine time from filename: + fileparts = str(filepath).split("_") + year = int(fileparts[-3]) + monthstr = fileparts[-4] + month = [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ].index(monthstr) + 1 + days_since_1850 = datetime.date(year, month, 15) - datetime.date( + 1850, 1, 1 + ) + lb_since_1850 = datetime.date(year, month, 1) - datetime.date( + 1850, 1, 1 + ) + if month == 12: + ub_since_1850 = ( + datetime.date(year + 1, 1, 1) + - datetime.date(1850, 1, 1) + - datetime.timedelta(days=1) + ) + else: + ub_since_1850 = ( + datetime.date(year, month + 1, 1) + - datetime.date(1850, 1, 1) + - datetime.timedelta(days=1) + ) + + xrvar = xrvar.assign_coords(time=days_since_1850.days) + xrvar = xrvar.expand_dims("time", axis=2) + xrvar["time"].attrs["units"] = "days since 1850-01-01 00:00:00" + + if short_name in ["od550aer", "abs550aer"]: + xrvar = xrvar.assign_coords( + radiation_wavelength=band550["lambda"] + ) + xrvar["radiation_wavelength"].attrs["units"] = "nm" + + cube = xrvar.to_iris() + + # Fix metadata + cube.coord("Geodetic Latitude").rename("latitude") + cube.coord("Geodetic Longitude").rename("longitude") + + # add time bounds + cube.coord("time").bounds = np.array( + [ub_since_1850.days, lb_since_1850.days] + ) + cube.coord("time").units = "days since 1850-01-01 00:00:00" + + utils.fix_var_metadata(cube, cmor_info) + utils.set_global_atts(cube, attrs) + + utils.fix_dim_coordnames(cube) + + # When Dask tries to roll this cube, it fails because it can't chunk this properly + # So here we replicate the part of fix_coords that does that, except with numpy.roll + # instead of dask.roll. + # However, it seems that those last two lines had no effect on the results... + # cube.data has shape (360, 720, 1) i.e. 
(lat, lon, time) + # so the original code tried to roll the cube on the time axis + # I could hard-code the lon axis (like they did), but instead I try to autodetect it. + cube_coord = cube.coord("longitude") + if cube_coord.ndim == 1: + if ( + cube_coord.points[0] < 0.0 + and cube_coord.points[-1] < 181.0 + ): + cube_coord.points = cube_coord.points + 180.0 + cube.attributes["geospatial_lon_min"] = 0.0 + cube.attributes["geospatial_lon_max"] = 360.0 + nlon = len(cube_coord.points) + axis = cube.coords().index(cube_coord) + shift = nlon // 2 + cube.data = da.roll(cube.core_data(), shift, axis=axis) + + utils.fix_coords(cube) + + # fix the wavelength coordinate information. + if short_name in ["od550aer", "abs550aer"]: + cube.coord("radiation_wavelength").var_name = "wavelength" + cube.coord("wavelength").standard_name = "radiation_wavelength" + + utils.set_global_atts(cube, attrs) + + # Save variable + logger.debug(f"Saving Cube: {cube}, in directory: {out_dir}") + cubes.append(cube) + + if len(cubes) > 0: + cube = cubes.concatenate()[ + 0 + ] # concatenate along time dimension and take the resulting cube else: - ub_since_1850 = dt.date(year, month + 1, 1) - dt.date(1850, 1, 1) - - xrvar = xrvar.assign_coords(time=days_since_1850.days) - xrvar = xrvar.expand_dims("time", axis=2) - xrvar["time"].attrs["units"] = "days since 1850-01-01 00:00:00" - - if short_name in ["od550aer", "abs550aer"]: - xrvar = xrvar.assign_coords(radiation_wavelength=band550["lambda"]) - xrvar["radiation_wavelength"].attrs["units"] = "nm" - - cube = xrvar.to_iris() - - # Fix metadata - cube.coord("Geodetic Latitude").rename("latitude") - cube.coord("Geodetic Longitude").rename("longitude") - - # add time bounds - cube.coord("time").bounds = np.array( - [ub_since_1850.days, lb_since_1850.days] - ) - - utils.fix_var_metadata(cube, cmor_info) - utils.set_global_atts(cube, attrs) - - utils.fix_dim_coordnames(cube) - - # When Dask tries to roll this cube, it fails because it can't chunk this 
properly - # So here we replicate the part of fix_coords that does that, except with numpy.roll - # instead of dask.roll. - # However, it seems that those last two lines had no effect on the results... - # cube.data has shape (360, 720, 1) i.e. (lat, lon, time) - # so the original code tried to roll the cube on the time axis - # I could hard-code the lon axis (like they did), but instead I try to autodetect it. - cube_coord = cube.coord("longitude") - logger.info("Fixing longitude...") - if cube_coord.ndim == 1: - if cube_coord.points[0] < 0.0 and cube_coord.points[-1] < 181.0: - cube_coord.points = cube_coord.points + 180.0 - cube.attributes["geospatial_lon_min"] = 0.0 - cube.attributes["geospatial_lon_max"] = 360.0 - nlon = len(cube_coord.points) - axis = cube.coords().index(cube_coord) - shift = nlon // 2 - cube.data = da.roll(cube.core_data(), shift, axis=axis) - - utils.fix_coords(cube) - - # fix the wavelength coordinate information. - if short_name in ["od550aer", "abs550aer"]: - cube.coord("radiation_wavelength").var_name = "wavelength" - cube.coord("wavelength").standard_name = "radiation_wavelength" - - utils.set_global_atts(cube, attrs) - - # Save variable - logger.info(f"Saving Cube: {cube}, in directory: {out_dir}") + cube = cubes[0] utils.save_variable( cube, short_name, out_dir, attrs, unlimited_dimensions=["time"] ) From b4022043b239f47351c83abb315b0d35965c8143 Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Thu, 19 Feb 2026 22:26:10 +0000 Subject: [PATCH 07/13] reverted esmvaltool/cmorizers/data/formatters/datasets/misr.py --- .../cmorizers/data/cmor_config/MISR.yml | 9 +- esmvaltool/cmorizers/data/datasets.yml | 18 -- .../data/formatters/datasets/misr.py | 227 +++++++----------- 3 files changed, 98 insertions(+), 156 deletions(-) diff --git a/esmvaltool/cmorizers/data/cmor_config/MISR.yml b/esmvaltool/cmorizers/data/cmor_config/MISR.yml index 596dc07bae..d57d12bc93 100644 --- a/esmvaltool/cmorizers/data/cmor_config/MISR.yml +++ 
b/esmvaltool/cmorizers/data/cmor_config/MISR.yml @@ -1,12 +1,12 @@ # Global attributes of NetCDF file attributes: project_id: CMIP6 - dataset_id: MISR + dataset_id: MODIS-Terra modeling_realm: atmos type: sat - tier: 3 + tier: 2 version: 'V0032' - source: 'https://asdc.larc.nasa.gov/project/MISR/MIL3MAEN_004' + source: 'ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/' reference: 'misr' files: 'MISR_AM1_CGAS_*_F15_0032.nc' # Variables to CMORize @@ -17,3 +17,6 @@ variables: abs550aer: raw_name: Absorbing_Optical_Depth mip: AERmon + ae: + raw_name: Angstrom_Exponent_550_860 + mip: AERmon diff --git a/esmvaltool/cmorizers/data/datasets.yml b/esmvaltool/cmorizers/data/datasets.yml index dd4c106014..3d50453bb9 100644 --- a/esmvaltool/cmorizers/data/datasets.yml +++ b/esmvaltool/cmorizers/data/datasets.yml @@ -1003,24 +1003,6 @@ datasets: yearly granularity. Note that some (most) variables are on the goldsmr4 server, whereas others are on the goldsmr5 server. - MISR: - tier: 3 - source: https://asdc.larc.nasa.gov/project/MISR/MIL3MAEN_004 - last_access: 2026-02-18 - info: | - In Products: select "MISR", "Collection 6.1" and "L3 Atmosphere Product", - click on MISR_AM1_AS_L3_V4.1. - In Time: select from 2000-01-01 to today. - In Location: skip, the global domain will be applied. - In Files: select all. - Submit the order. - A registration is required to download the data. 
- - Caveats - clwvi and clivi data are in-cloud values whereas CMIP5 models provide - grid-box averages --> multiply MISR clwvi and clivi values with cloud - fraction as a first guess - MLS-AURA: tier: 3 source: https://disc.gsfc.nasa.gov/datasets/ML2RHI_004/summary https://disc.gsfc.nasa.gov/datasets/ML2T_004/summary diff --git a/esmvaltool/cmorizers/data/formatters/datasets/misr.py b/esmvaltool/cmorizers/data/formatters/datasets/misr.py index 2d7b3b767d..0a0a6ac6f1 100644 --- a/esmvaltool/cmorizers/data/formatters/datasets/misr.py +++ b/esmvaltool/cmorizers/data/formatters/datasets/misr.py @@ -12,14 +12,13 @@ """ import copy -import datetime +import datetime as dt import logging import os from pathlib import Path -import iris import numpy as np -import xarray +import xarray as xr from dask import array as da from esmvalcore.cmor.table import CMOR_TABLES @@ -30,18 +29,6 @@ band550 = {"name": "green_558nm", "lambda": 558} -def bunch_files_by_year(in_dir, files_pattern): - """Group files by year.""" - files_by_year = {} - for filepath in Path(os.path.join(in_dir)).glob(files_pattern): - fileparts = str(filepath).split("_") - year = int(fileparts[-3]) - if year not in files_by_year: - files_by_year[year] = [] - files_by_year[year].append(filepath) - return files_by_year - - def _extract_variable(short_name, var, cfg, in_dir, out_dir): attrs = copy.deepcopy(cfg["attributes"]) attrs["mip"] = var["mip"] @@ -60,127 +47,97 @@ def _extract_variable(short_name, var, cfg, in_dir, out_dir): in_dir, ver, ) - files_by_year = bunch_files_by_year(in_dir, files) - for year, filepaths in files_by_year.items(): - cubes = iris.cube.CubeList() - for filepath in filepaths: - xrds = xarray.open_dataset( - filepath, group="Aerosol_Parameter_Average" - ) - xrvar = xrds.sel(Band=band550["name"], Optical_Depth_Range="all")[ - raw_var - ] - - # change order of latitude and longitude coordinates - # xrvar = xrvar.transpose() - - # Add additional coordinates before converting to an iris cube, 
as this is easier with xarray - - # Time not present in source data, needs to be added manually - # Determine time from filename: - fileparts = str(filepath).split("_") - year = int(fileparts[-3]) - monthstr = fileparts[-4] - month = [ - "JAN", - "FEB", - "MAR", - "APR", - "MAY", - "JUN", - "JUL", - "AUG", - "SEP", - "OCT", - "NOV", - "DEC", - ].index(monthstr) + 1 - days_since_1850 = datetime.date(year, month, 15) - datetime.date( - 1850, 1, 1 - ) - lb_since_1850 = datetime.date(year, month, 1) - datetime.date( - 1850, 1, 1 - ) - if month == 12: - ub_since_1850 = ( - datetime.date(year + 1, 1, 1) - - datetime.date(1850, 1, 1) - - datetime.timedelta(days=1) - ) - else: - ub_since_1850 = ( - datetime.date(year, month + 1, 1) - - datetime.date(1850, 1, 1) - - datetime.timedelta(days=1) - ) - - xrvar = xrvar.assign_coords(time=days_since_1850.days) - xrvar = xrvar.expand_dims("time", axis=2) - xrvar["time"].attrs["units"] = "days since 1850-01-01 00:00:00" - - if short_name in ["od550aer", "abs550aer"]: - xrvar = xrvar.assign_coords( - radiation_wavelength=band550["lambda"] - ) - xrvar["radiation_wavelength"].attrs["units"] = "nm" - - cube = xrvar.to_iris() - - # Fix metadata - cube.coord("Geodetic Latitude").rename("latitude") - cube.coord("Geodetic Longitude").rename("longitude") - - # add time bounds - cube.coord("time").bounds = np.array( - [ub_since_1850.days, lb_since_1850.days] - ) - cube.coord("time").units = "days since 1850-01-01 00:00:00" - - utils.fix_var_metadata(cube, cmor_info) - utils.set_global_atts(cube, attrs) - - utils.fix_dim_coordnames(cube) - - # When Dask tries to roll this cube, it fails because it can't chunk this properly - # So here we replicate the part of fix_coords that does that, except with numpy.roll - # instead of dask.roll. - # However, it seems that those last two lines had no effect on the results... - # cube.data has shape (360, 720, 1) i.e. 
(lat, lon, time) - # so the original code tried to roll the cube on the time axis - # I could hard-code the lon axis (like they did), but instead I try to autodetect it. - cube_coord = cube.coord("longitude") - if cube_coord.ndim == 1: - if ( - cube_coord.points[0] < 0.0 - and cube_coord.points[-1] < 181.0 - ): - cube_coord.points = cube_coord.points + 180.0 - cube.attributes["geospatial_lon_min"] = 0.0 - cube.attributes["geospatial_lon_max"] = 360.0 - nlon = len(cube_coord.points) - axis = cube.coords().index(cube_coord) - shift = nlon // 2 - cube.data = da.roll(cube.core_data(), shift, axis=axis) - - utils.fix_coords(cube) - - # fix the wavelength coordinate information. - if short_name in ["od550aer", "abs550aer"]: - cube.coord("radiation_wavelength").var_name = "wavelength" - cube.coord("wavelength").standard_name = "radiation_wavelength" - - utils.set_global_atts(cube, attrs) - - # Save variable - logger.debug(f"Saving Cube: {cube}, in directory: {out_dir}") - cubes.append(cube) - - if len(cubes) > 0: - cube = cubes.concatenate()[ - 0 - ] # concatenate along time dimension and take the resulting cube + for filepath in Path(os.path.join(in_dir)).glob(files): + xrds = xr.open_dataset(filepath, group="Aerosol_Parameter_Average") + xrvar = xrds.sel(Band=band550["name"], Optical_Depth_Range="all")[ + raw_var + ] + + # change order of latitude and longitude coordinates + # xrvar = xrvar.transpose() + + # Add additional coordinates before converting to an iris cube, as this is easier with xarray + + # Time not present in source data, needs to be added manually + # Determine time from filename: + fileparts = str(filepath).split("_") + year = int(fileparts[-3]) + monthstr = fileparts[-4] + month = [ + "JAN", + "FEB", + "MAR", + "APR", + "MAY", + "JUN", + "JUL", + "AUG", + "SEP", + "OCT", + "NOV", + "DEC", + ].index(monthstr) + 1 + days_since_1850 = dt.date(year, month, 15) - dt.date(1850, 1, 1) + lb_since_1850 = dt.date(year, month, 1) - dt.date(1850, 1, 1) + if month 
== 12: + ub_since_1850 = dt.date(year + 1, 1, 1) - dt.date(1850, 1, 1) else: - cube = cubes[0] + ub_since_1850 = dt.date(year, month + 1, 1) - dt.date(1850, 1, 1) + + xrvar = xrvar.assign_coords(time=days_since_1850.days) + xrvar = xrvar.expand_dims("time", axis=2) + xrvar["time"].attrs["units"] = "days since 1850-01-01 00:00:00" + + if short_name in ["od550aer", "abs550aer"]: + xrvar = xrvar.assign_coords(radiation_wavelength=band550["lambda"]) + xrvar["radiation_wavelength"].attrs["units"] = "nm" + + cube = xrvar.to_iris() + + # Fix metadata + cube.coord("Geodetic Latitude").rename("latitude") + cube.coord("Geodetic Longitude").rename("longitude") + + # add time bounds + cube.coord("time").bounds = np.array( + [ub_since_1850.days, lb_since_1850.days] + ) + + utils.fix_var_metadata(cube, cmor_info) + utils.set_global_atts(cube, attrs) + + utils.fix_dim_coordnames(cube) + + # When Dask tries to roll this cube, it fails because it can't chunk this properly + # So here we replicate the part of fix_coords that does that, except with numpy.roll + # instead of dask.roll. + # However, it seems that those last two lines had no effect on the results... + # cube.data has shape (360, 720, 1) i.e. (lat, lon, time) + # so the original code tried to roll the cube on the time axis + # I could hard-code the lon axis (like they did), but instead I try to autodetect it. + cube_coord = cube.coord("longitude") + logger.info("Fixing longitude...") + if cube_coord.ndim == 1: + if cube_coord.points[0] < 0.0 and cube_coord.points[-1] < 181.0: + cube_coord.points = cube_coord.points + 180.0 + cube.attributes["geospatial_lon_min"] = 0.0 + cube.attributes["geospatial_lon_max"] = 360.0 + nlon = len(cube_coord.points) + axis = cube.coords().index(cube_coord) + shift = nlon // 2 + cube.data = da.roll(cube.core_data(), shift, axis=axis) + + utils.fix_coords(cube) + + # fix the wavelength coordinate information. 
+ if short_name in ["od550aer", "abs550aer"]: + cube.coord("radiation_wavelength").var_name = "wavelength" + cube.coord("wavelength").standard_name = "radiation_wavelength" + + utils.set_global_atts(cube, attrs) + + # Save variable + logger.info(f"Saving Cube: {cube}, in directory: {out_dir}") utils.save_variable( cube, short_name, out_dir, attrs, unlimited_dimensions=["time"] ) From ffd2efe18988b1657e563ec72792891cecf7292b Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Tue, 24 Feb 2026 19:27:42 +0000 Subject: [PATCH 08/13] added exspected data for cmip6 standard --- .../cmorizers/data/cmor_config/MISR.yml | 9 ++---- esmvaltool/cmorizers/data/datasets.yml | 10 +++++++ .../data/formatters/datasets/misr.py | 29 ++++++++++++++----- esmvaltool/config-references.yml | 4 +++ esmvaltool/references/misr.bibtex | 11 +++++++ 5 files changed, 50 insertions(+), 13 deletions(-) create mode 100644 esmvaltool/references/misr.bibtex diff --git a/esmvaltool/cmorizers/data/cmor_config/MISR.yml b/esmvaltool/cmorizers/data/cmor_config/MISR.yml index d57d12bc93..596dc07bae 100644 --- a/esmvaltool/cmorizers/data/cmor_config/MISR.yml +++ b/esmvaltool/cmorizers/data/cmor_config/MISR.yml @@ -1,12 +1,12 @@ # Global attributes of NetCDF file attributes: project_id: CMIP6 - dataset_id: MODIS-Terra + dataset_id: MISR modeling_realm: atmos type: sat - tier: 2 + tier: 3 version: 'V0032' - source: 'ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/' + source: 'https://asdc.larc.nasa.gov/project/MISR/MIL3MAEN_004' reference: 'misr' files: 'MISR_AM1_CGAS_*_F15_0032.nc' # Variables to CMORize @@ -17,6 +17,3 @@ variables: abs550aer: raw_name: Absorbing_Optical_Depth mip: AERmon - ae: - raw_name: Angstrom_Exponent_550_860 - mip: AERmon diff --git a/esmvaltool/cmorizers/data/datasets.yml b/esmvaltool/cmorizers/data/datasets.yml index 3d50453bb9..7b727f1831 100644 --- a/esmvaltool/cmorizers/data/datasets.yml +++ b/esmvaltool/cmorizers/data/datasets.yml @@ -1030,6 +1030,16 @@ datasets: info: | Download 
the file MPI_MOBO-DIC_2004-2019_v2.nc + MISR: + tier: 3 + source: https://asdc.larc.nasa.gov/project/MISR/MIL3MAEN_004 + last_access: 2026-02-17 + info: | + To obtain the data set you will need to go to the Data Distribution + section of the link, create an account/sign in on the earthdata.nasa.gov website + and then follow the instructions for downloading the data. + The data is currently freely available, but a registration is required. + MODIS: tier: 3 source: https://ladsweb.modaps.eosdis.nasa.gov/search/order diff --git a/esmvaltool/cmorizers/data/formatters/datasets/misr.py b/esmvaltool/cmorizers/data/formatters/datasets/misr.py index 0a0a6ac6f1..c72a37d3fb 100644 --- a/esmvaltool/cmorizers/data/formatters/datasets/misr.py +++ b/esmvaltool/cmorizers/data/formatters/datasets/misr.py @@ -5,10 +5,6 @@ Source ftp://l5ftl01.larc.nasa.gov/MISR/MIL3MAEN.004/ - -Modification history - - """ import copy @@ -41,7 +37,7 @@ def _extract_variable(short_name, var, cfg, in_dir, out_dir): """Extract variable.""" # load data - logger.info( + logger.debug( "Loading data from file(s) '%s' in directory '%s' with version '%s'", files, in_dir, @@ -116,7 +112,7 @@ def _extract_variable(short_name, var, cfg, in_dir, out_dir): # so the original code tried to roll the cube on the time axis # I could hard-code the lon axis (like they did), but instead I try to autodetect it. 
cube_coord = cube.coord("longitude") - logger.info("Fixing longitude...") + logger.debug("Fixing longitude...") if cube_coord.ndim == 1: if cube_coord.points[0] < 0.0 and cube_coord.points[-1] < 181.0: cube_coord.points = cube_coord.points + 180.0 @@ -127,6 +123,24 @@ def _extract_variable(short_name, var, cfg, in_dir, out_dir): shift = nlon // 2 cube.data = da.roll(cube.core_data(), shift, axis=axis) + if np.diff(cube.coord("latitude").points)[0] < 0: + # convert [90,-90] to [-90,90] + cube.coord("latitude").points = cube.coord("latitude").points[::-1] + # flip the data + cube.data = cube.data[:, ::-1, :] # latitude is axis=1 + + lat_bounds = [] + for lat in cube.coord("latitude").points: + lat_bounds.append([lat - 0.25, lat + 0.25]) + lat_bounds = np.array(lat_bounds) + cube.coord("latitude").bounds = lat_bounds.reshape(-1, 2) + + lon_bounds = [] + for lon in cube.coord("longitude").points: + lon_bounds.append([lon - 0.25, lon + 0.25]) + lon_bounds = np.array(lon_bounds) + cube.coord("longitude").bounds = lon_bounds.reshape(-1, 2) + utils.fix_coords(cube) # fix the wavelength coordinate information. 
@@ -137,7 +151,7 @@ def _extract_variable(short_name, var, cfg, in_dir, out_dir): utils.set_global_atts(cube, attrs) # Save variable - logger.info(f"Saving Cube: {cube}, in directory: {out_dir}") + logger.debug(f"Saving Cube: {cube}, in directory: {out_dir}") utils.save_variable( cube, short_name, out_dir, attrs, unlimited_dimensions=["time"] ) @@ -148,4 +162,5 @@ def cmorization(in_dir, out_dir, cfg, cfg_user, start_date, end_date): cfg.pop("cmor_table") for short_name, var in cfg["variables"].items(): + logger.info(f"CMORizing variable '{short_name}'") _extract_variable(short_name, var, cfg, in_dir, out_dir) diff --git a/esmvaltool/config-references.yml b/esmvaltool/config-references.yml index 939696123f..f0aed75a6d 100644 --- a/esmvaltool/config-references.yml +++ b/esmvaltool/config-references.yml @@ -229,6 +229,10 @@ authors: name: Winterstein, Franziska institute: DLR, Germany orcid: https://orcid.org/0000-0002-2406-4936 + fruttarol_noah: + name: Fruttarol, Noah + institute: CCCma, ECCC, Canada + github: noahfruttarol fuckar_neven: name: Fuckar, Neven institute: BSC, Spain diff --git a/esmvaltool/references/misr.bibtex b/esmvaltool/references/misr.bibtex new file mode 100644 index 0000000000..577d498694 --- /dev/null +++ b/esmvaltool/references/misr.bibtex @@ -0,0 +1,11 @@ +@misc{misr, + doi = {10.5067/Terra/MISR/MIL3MAEN_L3.004}, + url = {https://doi.org/10.5067/Terra/MISR/MIL3MAEN_L3.004}, + publisher = {NASA Langley Atmospheric Science Data Center DAAC}, + title = {MISR Level 3 Component Global Aerosol product in netCDF format covering a month V004}, + author = {NASA/LARC/SD/ASDC}, + date = {2008-09-26}, + year = 2008, + month = 9, + day = 26, +} From 7037ee76f553883f9e73517e6cbb718fa7d12369 Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Thu, 5 Mar 2026 21:05:07 +0000 Subject: [PATCH 09/13] fixed project_id --- esmvaltool/cmorizers/data/cmor_config/MISR.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/esmvaltool/cmorizers/data/cmor_config/MISR.yml b/esmvaltool/cmorizers/data/cmor_config/MISR.yml index 596dc07bae..351cfbfede 100644 --- a/esmvaltool/cmorizers/data/cmor_config/MISR.yml +++ b/esmvaltool/cmorizers/data/cmor_config/MISR.yml @@ -1,6 +1,6 @@ # Global attributes of NetCDF file attributes: - project_id: CMIP6 + project_id: OBS6 dataset_id: MISR modeling_realm: atmos type: sat From 0a19731f7347b9445e5e9220415294768558e1a1 Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Thu, 12 Mar 2026 17:45:18 +0000 Subject: [PATCH 10/13] Minor changes and added reference --- .../cmorizers/data/cmor_config/MODIS-Aqua.yml | 6 ++--- .../data/cmor_config/MODIS-Terra.yml | 6 ++--- esmvaltool/cmorizers/data/datasets.yml | 22 +++++++++++++++++-- .../data/formatters/datasets/modis_aqua.py | 14 ++++-------- .../data/formatters/datasets/modis_terra.py | 14 ++++-------- esmvaltool/config-references.yml | 4 ++++ 6 files changed, 36 insertions(+), 30 deletions(-) diff --git a/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml b/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml index 29329712c9..812e750d09 100644 --- a/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml +++ b/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml @@ -30,14 +30,12 @@ variables: frequency: mon lwpStderr: raw_name: Cloud_Water_Path_Liquid_Mean_Uncertainty - mip: Amon + mip: custom frequency: mon - project_id: custom iwpStderr: raw_name: Cloud_Water_Path_Ice_Mean_Uncertainty - mip: Amon + mip: custom frequency: mon - project_id: custom od550aer: raw_name: AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean mip: AERmon diff --git a/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml b/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml index 94a3a686cc..e30f90835f 100644 --- a/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml +++ b/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml @@ -30,14 +30,12 @@ variables: frequency: mon lwpStderr: raw_name: Cloud_Water_Path_Liquid_Mean_Uncertainty 
- mip: Amon + mip: custom frequency: mon - project_id: custom iwpStderr: raw_name: Cloud_Water_Path_Ice_Mean_Uncertainty - mip: Amon + mip: custom frequency: mon - project_id: custom od550aer: raw_name: AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean mip: AERmon diff --git a/esmvaltool/cmorizers/data/datasets.yml b/esmvaltool/cmorizers/data/datasets.yml index 3d50453bb9..6c7613a20d 100644 --- a/esmvaltool/cmorizers/data/datasets.yml +++ b/esmvaltool/cmorizers/data/datasets.yml @@ -1030,10 +1030,10 @@ datasets: info: | Download the file MPI_MOBO-DIC_2004-2019_v2.nc - MODIS: + MODIS-Aqua: tier: 3 source: https://ladsweb.modaps.eosdis.nasa.gov/search/order - last_access: 2019-02-09 + last_access: 2026-01-30 info: | In Products: select "MODIS Aqua", "Collection 6.1" and "L3 Atmosphere Product", click on MYD08_M3. @@ -1048,6 +1048,24 @@ datasets: grid-box averages --> multiply MODIS clwvi and clivi values with cloud fraction as a first guess + MODIS-Terra: + tier: 3 + source: https://ladsweb.modaps.eosdis.nasa.gov/search/order + last_access: 2026-01-30 + info: | + In Products: select "MODIS Terra", "Collection 6.1" and + "L3 Atmosphere Product", click on MOD08_M3. + In Time: select from 2000-01-01 to today. + In Location: skip, the global domain will be applied. + In Files: select all. + Submit the order. + A registration is required to download the data. 
+ + Caveats + clwvi and clivi data are in-cloud values whereas CMIP5 models provide + grid-box averages --> multiply MODIS clwvi and clivi values with cloud + fraction as a first guess + MTE: tier: 3 source: http://www.bgc-jena.mpg.de/geodb/BGI/Home diff --git a/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py b/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py index f43bdc8d2b..80859e31f3 100644 --- a/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py +++ b/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py @@ -342,16 +342,10 @@ def _extract_variable( # add mip to attributes for use in cmor table lookup and global attributes attrs["mip"] = var_d["mip"] - # get custom cmor table for the target project if specified - if var_d.get("project_id"): - cmor_table_b = CMOR_TABLES.get(var_d.get("project_id")) - cmor_info = cmor_table_b.get_variable( - var_d.get("mip"), var_d.get("short_name") - ) - else: - cmor_info = cmor_table.get_variable( - var_d.get("mip"), var_d.get("short_name") - ) + # get cmor table for variable + cmor_info = cmor_table.get_variable( + var_d.get("mip"), var_d.get("short_name") + ) # if this variable has a reference specified in the config, use that, otherwise use the one from the cmor table if var_d.get("reference"): diff --git a/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py b/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py index d7e2b5f088..53e942be64 100644 --- a/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py +++ b/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py @@ -342,16 +342,10 @@ def _extract_variable( # add mip to attributes for use in cmor table lookup and global attributes attrs["mip"] = var_d["mip"] - # get custom cmor table for the target project if specified - if var_d.get("project_id"): - cmor_table_b = CMOR_TABLES.get(var_d.get("project_id")) - cmor_info = cmor_table_b.get_variable( - var_d.get("mip"), var_d.get("short_name") - ) - else: - 
cmor_info = cmor_table.get_variable( - var_d.get("mip"), var_d.get("short_name") - ) + # get the cmor table for the variable + cmor_info = cmor_table.get_variable( + var_d.get("mip"), var_d.get("short_name") + ) # if this variable has a reference specified in the config, use that, otherwise use the one from the cmor table if var_d.get("reference"): diff --git a/esmvaltool/config-references.yml b/esmvaltool/config-references.yml index 939696123f..f0aed75a6d 100644 --- a/esmvaltool/config-references.yml +++ b/esmvaltool/config-references.yml @@ -229,6 +229,10 @@ authors: name: Winterstein, Franziska institute: DLR, Germany orcid: https://orcid.org/0000-0002-2406-4936 + fruttarol_noah: + name: Fruttarol, Noah + institute: CCCma, ECCC, Canada + github: noahfruttarol fuckar_neven: name: Fuckar, Neven institute: BSC, Spain From 0a8f443856e4cf5803a48e279298f81d1ff86e1c Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Thu, 12 Mar 2026 18:05:24 +0000 Subject: [PATCH 11/13] added MISR to check recipe --- esmvaltool/recipes/examples/recipe_check_obs.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/esmvaltool/recipes/examples/recipe_check_obs.yml b/esmvaltool/recipes/examples/recipe_check_obs.yml index ea34444d64..9f9a826ae2 100644 --- a/esmvaltool/recipes/examples/recipe_check_obs.yml +++ b/esmvaltool/recipes/examples/recipe_check_obs.yml @@ -796,6 +796,19 @@ diagnostics: scripts: null + MISR: + description: MISR check + variables: + od550aer: + mip: AERmon + abs550aer: + mip: AERmon + additional_datasets: + - {dataset: MISR, project: OBS, mip: AERmon, tier: 3, + type: sat, version: v0032, start_year: 2000, end_year: 2020} + scripts: null + + MOBO-DIC-MPIM: description: MOBO-DIC-MPIM check variables: From 7c9d2a28bcaee7da3b124221ce3087f2eaafd86b Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Mon, 16 Mar 2026 16:40:40 +0000 Subject: [PATCH 12/13] added dataset to ESMValTool/doc/sphinx/source/input.rst --- doc/sphinx/source/input.rst | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/doc/sphinx/source/input.rst b/doc/sphinx/source/input.rst index 23b791149c..906dd2af27 100644 --- a/doc/sphinx/source/input.rst +++ b/doc/sphinx/source/input.rst @@ -424,6 +424,8 @@ A list of the datasets for which a CMORizers is available is provided in the fol +----------------------------------------+------------------------------------------------------------------------------------------------------+------+-----------------+ | MLS-AURA* [#t3]_ | hur, hurStderr (day) | 3 | Python | +----------------------------------------+------------------------------------------------------------------------------------------------------+------+-----------------+ +| MISR | od550aer, abs550aer (AERmon) | 3 | Python | ++----------------------------------------+------------------------------------------------------------------------------------------------------+------+-----------------+ | MOBO-DIC-MPIM | dissic (Omon) | 2 | Python | +----------------------------------------+------------------------------------------------------------------------------------------------------+------+-----------------+ | MOBO-DIC2004-2019 | dissic (Omon) | 2 | Python | From 24529572135b4f9e95407ea7661096c52f422816 Mon Sep 17 00:00:00 2001 From: Noah Fruttarol Date: Tue, 21 Apr 2026 11:18:37 -0700 Subject: [PATCH 13/13] Revert changes made by mistake. 
--- .../cmorizers/data/cmor_config/MODIS-Aqua.yml | 47 -- .../data/cmor_config/MODIS-Terra.yml | 47 -- esmvaltool/cmorizers/data/datasets.yml | 23 +- .../data/formatters/datasets/_modis_aqua.ncl | 253 ---------- .../data/formatters/datasets/_modis_terra.ncl | 253 ---------- .../data/formatters/datasets/modis_aqua.py | 441 ------------------ .../data/formatters/datasets/modis_terra.py | 441 ------------------ pyproject.toml | 1 - 8 files changed, 2 insertions(+), 1504 deletions(-) delete mode 100644 esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml delete mode 100644 esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml delete mode 100644 esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl delete mode 100644 esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl delete mode 100644 esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py delete mode 100644 esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py diff --git a/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml b/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml deleted file mode 100644 index 812e750d09..0000000000 --- a/esmvaltool/cmorizers/data/cmor_config/MODIS-Aqua.yml +++ /dev/null @@ -1,47 +0,0 @@ ---- -# Author: 2026-01-29 Kristi Webb - -# Common global attributes for Cmorizer output -attributes: - - project_id: CMIP6 - dataset_id: MODIS-Aqua - modeling_realm: atmos - type: sat - tier: 3 - version: MYD08_M3 - source: https://ladsweb.modaps.eosdis.nasa.gov/search/order - reference: modis1 - - filename_fmt: "MYD08_M3.A*.hdf" - -variables: - clwvi: - raw_name: Cloud_Water_Path_Liquid_Mean_Mean - mip: Amon - frequency: mon - clivi: - raw_name: Cloud_Water_Path_Ice_Mean_Mean - mip: Amon - frequency: mon - clt: - raw_name: Cloud_Fraction_Mean_Mean - mip: Amon - frequency: mon - lwpStderr: - raw_name: Cloud_Water_Path_Liquid_Mean_Uncertainty - mip: custom - frequency: mon - iwpStderr: - raw_name: Cloud_Water_Path_Ice_Mean_Uncertainty - mip: custom - frequency: mon - 
od550aer: - raw_name: AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean - mip: AERmon - frequency: mon - reference: modis2 - reffclwtop: - raw_name: Cloud_Effective_Radius_Liquid_Mean_Mean - mip: AERmon - frequency: mon diff --git a/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml b/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml deleted file mode 100644 index e30f90835f..0000000000 --- a/esmvaltool/cmorizers/data/cmor_config/MODIS-Terra.yml +++ /dev/null @@ -1,47 +0,0 @@ ---- -# Author: 2026-01-29 Kristi Webb - -# Common global attributes for Cmorizer output -attributes: - - project_id: CMIP6 - dataset_id: MODIS-Terra - modeling_realm: atmos - type: sat - tier: 3 - version: MOD08_M3 - source: https://ladsweb.modaps.eosdis.nasa.gov/search/order - reference: modis1 - - filename_fmt: "MOD08_M3.A*.hdf" - -variables: - clwvi: - raw_name: Cloud_Water_Path_Liquid_Mean_Mean - mip: Amon - frequency: mon - clivi: - raw_name: Cloud_Water_Path_Ice_Mean_Mean - mip: Amon - frequency: mon - clt: - raw_name: Cloud_Fraction_Mean_Mean - mip: Amon - frequency: mon - lwpStderr: - raw_name: Cloud_Water_Path_Liquid_Mean_Uncertainty - mip: custom - frequency: mon - iwpStderr: - raw_name: Cloud_Water_Path_Ice_Mean_Uncertainty - mip: custom - frequency: mon - od550aer: - raw_name: AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean - mip: AERmon - frequency: mon - reference: modis2 - reffclwtop: - raw_name: Cloud_Effective_Radius_Liquid_Mean_Mean - mip: AERmon - frequency: mon diff --git a/esmvaltool/cmorizers/data/datasets.yml b/esmvaltool/cmorizers/data/datasets.yml index a4aa17cb99..4d4852d8a6 100644 --- a/esmvaltool/cmorizers/data/datasets.yml +++ b/esmvaltool/cmorizers/data/datasets.yml @@ -1092,11 +1092,10 @@ datasets: and then follow the instructions for downloading the data. The data is currently freely available, but a registration is required. 
- MODIS-Aqua: - + MODIS: tier: 3 source: https://ladsweb.modaps.eosdis.nasa.gov/search/order - last_access: 2026-01-30 + last_access: 2019-02-09 info: | In Products: select "MODIS Aqua", "Collection 6.1" and "L3 Atmosphere Product", click on MYD08_M3. @@ -1111,24 +1110,6 @@ datasets: grid-box averages --> multiply MODIS clwvi and clivi values with cloud fraction as a first guess - MODIS-Terra: - tier: 3 - source: https://ladsweb.modaps.eosdis.nasa.gov/search/order - last_access: 2026-01-30 - info: | - In Products: select "MODIS Terra", "Collection 6.1" and - "L3 Atmosphere Product", click on MOD08_M3. - In Time: select from 2000-01-01 to today. - In Location: skip, the global domain will be applied. - In Files: select all. - Submit the order. - A registration is required to download the data. - - Caveats - clwvi and clivi data are in-cloud values whereas CMIP5 models provide - grid-box averages --> multiply MODIS clwvi and clivi values with cloud - fraction as a first guess - MTE: tier: 3 source: http://www.bgc-jena.mpg.de/geodb/BGI/Home diff --git a/esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl b/esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl deleted file mode 100644 index f1829bd6b6..0000000000 --- a/esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl +++ /dev/null @@ -1,253 +0,0 @@ -; ############################################################################# -; ESMValTool CMORizer for MODIS data -; ############################################################################# -; -; Tier -; Tier 3: restricted dataset. -; -; Source -; https://ladsweb.modaps.eosdis.nasa.gov/search/order -; -; Last access -; 20190209 -; -; Download and processing instructions -; In Products: select "MODIS Aqua", "Collection 6.1" and -; "L3 Atmosphere Product", click on MYD08_M3. -; In Time: select from 2000-01-01 to today. -; In Location: skip, the global domain will be applied. -; In Files: select all. -; Submit the order. 
-; A registration is required to download the data. -; -; Caveats -; clwvi and clivi data are in-cloud values whereas CMIP5 models provide -; grid-box averages --> multiply MODIS clwvi and clivi values with cloud -; fraction as a first guess -; -; Modification history -; 20180209-righi_mattia: fixed bug in lwpStderr. -; 20180209-hassler_birgit: adapted to v2. -; 20180810-righi_mattia: fix minor calendar issue. -; 20180806-righi_mattia: code cleaning. -; 20170116-lauer_axel: using cirrus fraction to gridbox averages. -; 20160408-lauer_axel: added processing of uncertainties. -; 20151118-lauer_axel: bugfix: added unit conversion. -; 20150430-evaldsson_martin: written. -; -; ############################################################################# -loadscript(getenv("esmvaltool_root") + \ - "/data/formatters/interface.ncl") - -begin - - ; Script name (for logger) - DIAG_SCRIPT = "modis_aqua.ncl" - - ; Source name - OBSNAME = "MODIS-Aqua" - - ; Tier - TIER = 3 - - ; Selected variable (standard name) - VAR = (/"clwvi", \ - "clivi", \ - "clt", \ - "lwpStderr", \ - "iwpStderr", \ - "od550aer", \ - "reffclwtop"/) - - ; Name in the raw data - NAME = (/"Cloud_Water_Path_Liquid_Mean_Mean", \ - "Cloud_Water_Path_Ice_Mean_Mean", \ - "Cloud_Fraction_Mean_Mean", \ - "Cloud_Water_Path_Liquid_Mean_Uncertainty", \ - "Cloud_Water_Path_Ice_Mean_Uncertainty", \ - "AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean", \ - "Cloud_Effective_Radius_Liquid_Mean_Mean"/) - - ; MIP - MIP = (/"Amon", "Amon", "Amon", "Amon", "Amon", "aero", "aero"/) - - ; Frequency - FREQ = (/"mon", "mon", "mon", "mon", "mon", "mon", "mon"/) - - ; Version - VERSION = "MYD08_M3" - - ; CMOR table - CMOR_TABLE = getenv("cmor_tables") + \ - (/"/cmip5/Tables/CMIP5_Amon", \ - "/cmip5/Tables/CMIP5_Amon", \ - "/cmip5/Tables/CMIP5_Amon", \ - "/custom/CMOR_lwpStderr.dat", \ - "/custom/CMOR_iwpStderr.dat", \ - "/cmip5/Tables/CMIP5_aero", \ - "/cmip5/Tables/CMIP5_aero"/) - - ; Type - TYPE = "sat" - - ; Global attributes - 
SOURCE = "https://ladsweb.modaps.eosdis.nasa.gov/search/order" - REF1 = "Platnick et al., IEEE Trans. Geosci. Remote Sens., " + \ - "doi:10.1109/TGRS.2002.808301, 2003." - REF2 = "Levy et al., Atmos. Meas. Tech., " + \ - "doi:10.5194/amt-6-2989-2013, 2013." - COMMENT = "" - -end - -begin - - ; List of files - FILES = systemfunc("ls -1 " + input_dir_path + VERSION + ".A*.hdf") - - do ff = 0, dimsizes(FILES) - 1 - - fin = addfile(FILES(ff), "r") - - ; Get time - infile = systemfunc("basename " + FILES(ff)) - date = yyyyddd_to_yyyymmdd(toint(str_get_cols(infile, 10, 16))) - year = toint(str_get_cols(tostring(date), 0, 3)) - month = toint(str_get_cols(tostring(date), 4, 5)) - dm = days_in_month(year, month) - - ; Loop over variables to fetch from input file - do vv = 0, dimsizes(VAR) - 1 - - invar = fin->$NAME(vv)$ - invar_fv = invar@_FillValue - invar_coords = invar - invar := tofloat(invar) - invar := where(invar.eq.tofloat(invar_fv), \ - default_fillvalue("float"), invar) - - ; Special case clwvi as the sum lwp + iwp - if (VAR(vv).eq."clwvi") then - if (NAME(vv).ne."Cloud_Water_Path_Liquid_Mean_Mean") then - error_msg("f", DIAG_SCRIPT, "", "cannot calculate clwvi") - end if - - ; Read cirrus fraction - ; cfin = fin->Cirrus_Fraction_SWIR_FMean - cfin = fin->Cirrus_Fraction_Infrared_FMean - cif = tofloat(cfin * cfin@scale_factor + cfin@add_offset) - ; liquid fraction is estimated assuming random overlap, i.e. 
- ; ctot = 1 - (1 - cif) * (1 - lif) - ; --> lif = 1 - (1 - ctot) / (1 - cif) - delete(cfin) - cfin = fin->Cloud_Fraction_Mean_Mean - ctot = tofloat(cfin * cfin@scale_factor + cfin@add_offset) - delete(cfin) - cif = where(cif.gt.0.999, cif@_FillValue, cif) - lif = 1.0 - (1.0 - ctot) / (1.0 - cif) - lif = where(lif.lt.0, 0, lif) - tmpvar = fin->Cloud_Water_Path_Ice_Mean_Mean ; read ice water path - tmpvar_fv = tmpvar@_FillValue - tmpvar := tofloat(tmpvar) - tmpvar := where(tmpvar.eq.tofloat(tmpvar_fv), \ - default_fillvalue("float"), \ - tmpvar) - tmpvar = tmpvar * cif ; convert iwp in-cloud value to gridbox avg - invar = invar * lif ; convert lwp in-cloud value to grid-box avg - invar = invar + tmpvar ; clwvi = lwp + iwp - delete(tmpvar) - delete(lif) - delete(cif) - invar = 0.001 * invar ; [g/m2] --> [kg/m2] - end if - - ; lwp and iwp are in-cloud values - ; convert lwp/iwp to grid-box averages by multiplying with - ; average cloud fraction (not optimum but best we can do at the moment) - if (any((/"clivi", "iwpStderr", "lwpStderr"/) .eq. 
VAR(vv))) then - - ; Read cirrus fraction (0-1) - ; cfin = fin->Cirrus_Fraction_SWIR_FMean - cfin = fin->Cirrus_Fraction_Infrared_FMean - cf = tofloat(cfin * cfin@scale_factor + cfin@add_offset) - delete(cfin) - if (VAR(vv).eq."lwpStderr") then - cfin = fin->Cloud_Fraction_Mean_Mean - ctot = tofloat(cfin * cfin@scale_factor + cfin@add_offset) - delete(cfin) - cif = where(cf.gt.0.999, cf@_FillValue, cf) - cf = 1.0 - (1.0 - ctot) / (1.0 - cif) - cf = where(cf.lt.0, 0, cf) - delete(cif) - delete(ctot) - end if - invar = invar * cf ; ; "grid-box average" lwp/iwp - delete(cf) - invar = 0.001 * invar ; [g/m2] --> [kg/m2] - end if - - invar@_FillValue = default_fillvalue("float") - copy_VarCoords(invar_coords, invar) - if (isatt(invar_coords, "scale_factor")) then - invar = invar * tofloat(invar_coords@scale_factor) - end if - if (isatt(invar_coords, "add_offset")) then - invar = invar + tofloat(invar_coords@add_offset) - end if - - if (VAR(vv).eq."clt") then - invar = 100.0 * invar ; [1] --> [%] - end if - - ; Create output variable - lat = fin->YDim - lon = fin->XDim - output = new((/1, dimsizes(lat), dimsizes(lon)/), float) - output!0 = "time" - output!1 = "lat" - output!2 = "lon" - output&time = cd_inv_calendar(year, month, 15, 0, 0, 0, TUNITS, 0) - output&lat = lat - output&lon = lon - output(0, :, :) = (/invar/) - delete(invar) - delete(invar_coords) - - ; Format coordinates - format_coords(output, year + sprinti("%0.2i", month) + "01", \ - year + sprinti("%0.2i", month) + dm, FREQ(vv)) - - ; Set variable attributes - tmp = format_variable(output, VAR(vv), CMOR_TABLE(vv)) - delete(output) - output = tmp - delete(tmp) - - ; Calculate coordinate bounds - bounds = guess_coord_bounds(output, FREQ(vv)) - - ; Set global attributes - if (VAR(vv).ne."od550aer") then - gAtt = set_global_atts(OBSNAME, TIER, SOURCE, REF1, COMMENT) - else - gAtt = set_global_atts(OBSNAME, TIER, SOURCE, REF2, COMMENT) - end if - - ; Output file - DATESTR = \ - year + sprinti("%0.2i", month) + 
"-" + year + sprinti("%0.2i", month) - fout = output_dir_path + \ - str_join((/"OBS", OBSNAME, TYPE, str_sub_str(VERSION, "_", "-"), \ - MIP(vv), VAR(vv), DATESTR/), "_") + ".nc" - - ; Write variable - write_nc(fout, VAR(vv), output, bounds, gAtt) - delete(gAtt) - delete(output) - delete(bounds) - - end do - - end do - -end diff --git a/esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl b/esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl deleted file mode 100644 index 2e02498a88..0000000000 --- a/esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl +++ /dev/null @@ -1,253 +0,0 @@ -; ############################################################################# -; ESMValTool CMORizer for MODIS data -; ############################################################################# -; -; Tier -; Tier 3: restricted dataset. -; -; Source -; https://ladsweb.modaps.eosdis.nasa.gov/search/order -; -; Last access -; 20190209 -; -; Download and processing instructions -; In Products: select "MODIS Aqua", "Collection 6.1" and -; "L3 Atmosphere Product", click on MOD08_M3. -; In Time: select from 2000-01-01 to today. -; In Location: skip, the global domain will be applied. -; In Files: select all. -; Submit the order. -; A registration is required to download the data. -; -; Caveats -; clwvi and clivi data are in-cloud values whereas CMIP5 models provide -; grid-box averages --> multiply MODIS clwvi and clivi values with cloud -; fraction as a first guess -; -; Modification history -; 20180209-righi_mattia: fixed bug in lwpStderr. -; 20180209-hassler_birgit: adapted to v2. -; 20180810-righi_mattia: fix minor calendar issue. -; 20180806-righi_mattia: code cleaning. -; 20170116-lauer_axel: using cirrus fraction to gridbox averages. -; 20160408-lauer_axel: added processing of uncertainties. -; 20151118-lauer_axel: bugfix: added unit conversion. -; 20150430-evaldsson_martin: written. 
-; -; ############################################################################# -loadscript(getenv("esmvaltool_root") + \ - "/data/formatters/interface.ncl") - -begin - - ; Script name (for logger) - DIAG_SCRIPT = "modis_terra.ncl" - - ; Source name - OBSNAME = "MODIS-Terra" - - ; Tier - TIER = 3 - - ; Selected variable (standard name) - VAR = (/"clwvi", \ - "clivi", \ - "clt", \ - "lwpStderr", \ - "iwpStderr", \ - "od550aer", \ - "reffclwtop"/) - - ; Name in the raw data - NAME = (/"Cloud_Water_Path_Liquid_Mean_Mean", \ - "Cloud_Water_Path_Ice_Mean_Mean", \ - "Cloud_Fraction_Mean_Mean", \ - "Cloud_Water_Path_Liquid_Mean_Uncertainty", \ - "Cloud_Water_Path_Ice_Mean_Uncertainty", \ - "AOD_550_Dark_Target_Deep_Blue_Combined_Mean_Mean", \ - "Cloud_Effective_Radius_Liquid_Mean_Mean"/) - - ; MIP - MIP = (/"Amon", "Amon", "Amon", "Amon", "Amon", "aero", "aero"/) - - ; Frequency - FREQ = (/"mon", "mon", "mon", "mon", "mon", "mon", "mon"/) - - ; Version - VERSION = "MOD08_M3" - - ; CMOR table - CMOR_TABLE = getenv("cmor_tables") + \ - (/"/cmip5/Tables/CMIP5_Amon", \ - "/cmip5/Tables/CMIP5_Amon", \ - "/cmip5/Tables/CMIP5_Amon", \ - "/custom/CMOR_lwpStderr.dat", \ - "/custom/CMOR_iwpStderr.dat", \ - "/cmip5/Tables/CMIP5_aero", \ - "/cmip5/Tables/CMIP5_aero"/) - - ; Type - TYPE = "sat" - - ; Global attributes - SOURCE = "https://ladsweb.modaps.eosdis.nasa.gov/search/order" - REF1 = "Platnick et al., IEEE Trans. Geosci. Remote Sens., " + \ - "doi:10.1109/TGRS.2002.808301, 2003." - REF2 = "Levy et al., Atmos. Meas. Tech., " + \ - "doi:10.5194/amt-6-2989-2013, 2013." 
- COMMENT = "" - -end - -begin - - ; List of files - FILES = systemfunc("ls -1 " + input_dir_path + VERSION + ".A*.hdf") - - do ff = 0, dimsizes(FILES) - 1 - - fin = addfile(FILES(ff), "r") - - ; Get time - infile = systemfunc("basename " + FILES(ff)) - date = yyyyddd_to_yyyymmdd(toint(str_get_cols(infile, 10, 16))) - year = toint(str_get_cols(tostring(date), 0, 3)) - month = toint(str_get_cols(tostring(date), 4, 5)) - dm = days_in_month(year, month) - - ; Loop over variables to fetch from input file - do vv = 0, dimsizes(VAR) - 1 - - invar = fin->$NAME(vv)$ - invar_fv = invar@_FillValue - invar_coords = invar - invar := tofloat(invar) - invar := where(invar.eq.tofloat(invar_fv), \ - default_fillvalue("float"), invar) - - ; Special case clwvi as the sum lwp + iwp - if (VAR(vv).eq."clwvi") then - if (NAME(vv).ne."Cloud_Water_Path_Liquid_Mean_Mean") then - error_msg("f", DIAG_SCRIPT, "", "cannot calculate clwvi") - end if - - ; Read cirrus fraction - ; cfin = fin->Cirrus_Fraction_SWIR_FMean - cfin = fin->Cirrus_Fraction_Infrared_FMean - cif = tofloat(cfin * cfin@scale_factor + cfin@add_offset) - ; liquid fraction is estimated assuming random overlap, i.e. 
- ; ctot = 1 - (1 - cif) * (1 - lif) - ; --> lif = 1 - (1 - ctot) / (1 - cif) - delete(cfin) - cfin = fin->Cloud_Fraction_Mean_Mean - ctot = tofloat(cfin * cfin@scale_factor + cfin@add_offset) - delete(cfin) - cif = where(cif.gt.0.999, cif@_FillValue, cif) - lif = 1.0 - (1.0 - ctot) / (1.0 - cif) - lif = where(lif.lt.0, 0, lif) - tmpvar = fin->Cloud_Water_Path_Ice_Mean_Mean ; read ice water path - tmpvar_fv = tmpvar@_FillValue - tmpvar := tofloat(tmpvar) - tmpvar := where(tmpvar.eq.tofloat(tmpvar_fv), \ - default_fillvalue("float"), \ - tmpvar) - tmpvar = tmpvar * cif ; convert iwp in-cloud value to gridbox avg - invar = invar * lif ; convert lwp in-cloud value to grid-box avg - invar = invar + tmpvar ; clwvi = lwp + iwp - delete(tmpvar) - delete(lif) - delete(cif) - invar = 0.001 * invar ; [g/m2] --> [kg/m2] - end if - - ; lwp and iwp are in-cloud values - ; convert lwp/iwp to grid-box averages by multiplying with - ; average cloud fraction (not optimum but best we can do at the moment) - if (any((/"clivi", "iwpStderr", "lwpStderr"/) .eq. 
VAR(vv))) then - - ; Read cirrus fraction (0-1) - ; cfin = fin->Cirrus_Fraction_SWIR_FMean - cfin = fin->Cirrus_Fraction_Infrared_FMean - cf = tofloat(cfin * cfin@scale_factor + cfin@add_offset) - delete(cfin) - if (VAR(vv).eq."lwpStderr") then - cfin = fin->Cloud_Fraction_Mean_Mean - ctot = tofloat(cfin * cfin@scale_factor + cfin@add_offset) - delete(cfin) - cif = where(cf.gt.0.999, cf@_FillValue, cf) - cf = 1.0 - (1.0 - ctot) / (1.0 - cif) - cf = where(cf.lt.0, 0, cf) - delete(cif) - delete(ctot) - end if - invar = invar * cf ; ; "grid-box average" lwp/iwp - delete(cf) - invar = 0.001 * invar ; [g/m2] --> [kg/m2] - end if - - invar@_FillValue = default_fillvalue("float") - copy_VarCoords(invar_coords, invar) - if (isatt(invar_coords, "scale_factor")) then - invar = invar * tofloat(invar_coords@scale_factor) - end if - if (isatt(invar_coords, "add_offset")) then - invar = invar + tofloat(invar_coords@add_offset) - end if - - if (VAR(vv).eq."clt") then - invar = 100.0 * invar ; [1] --> [%] - end if - - ; Create output variable - lat = fin->YDim - lon = fin->XDim - output = new((/1, dimsizes(lat), dimsizes(lon)/), float) - output!0 = "time" - output!1 = "lat" - output!2 = "lon" - output&time = cd_inv_calendar(year, month, 15, 0, 0, 0, TUNITS, 0) - output&lat = lat - output&lon = lon - output(0, :, :) = (/invar/) - delete(invar) - delete(invar_coords) - - ; Format coordinates - format_coords(output, year + sprinti("%0.2i", month) + "01", \ - year + sprinti("%0.2i", month) + dm, FREQ(vv)) - - ; Set variable attributes - tmp = format_variable(output, VAR(vv), CMOR_TABLE(vv)) - delete(output) - output = tmp - delete(tmp) - - ; Calculate coordinate bounds - bounds = guess_coord_bounds(output, FREQ(vv)) - - ; Set global attributes - if (VAR(vv).ne."od550aer") then - gAtt = set_global_atts(OBSNAME, TIER, SOURCE, REF1, COMMENT) - else - gAtt = set_global_atts(OBSNAME, TIER, SOURCE, REF2, COMMENT) - end if - - ; Output file - DATESTR = \ - year + sprinti("%0.2i", month) + 
"-" + year + sprinti("%0.2i", month) - fout = output_dir_path + \ - str_join((/"OBS", OBSNAME, TYPE, str_sub_str(VERSION, "_", "-"), \ - MIP(vv), VAR(vv), DATESTR/), "_") + ".nc" - - ; Write variable - write_nc(fout, VAR(vv), output, bounds, gAtt) - delete(gAtt) - delete(output) - delete(bounds) - - end do - - end do - -end diff --git a/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py b/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py deleted file mode 100644 index 80859e31f3..0000000000 --- a/esmvaltool/cmorizers/data/formatters/datasets/modis_aqua.py +++ /dev/null @@ -1,441 +0,0 @@ -"""ESMValTool CMORizer for MODIS aqua data. - -Tier - Tier 3 - -Source - https://ladsweb.modaps.eosdis.nasa.gov/search/order - -Requirements - python hdf reader, https://pypi.org/project/pyhdf/ - -Modification history - 20260216-Fruttarol_Noah: Written based on the original MODIS CMORization NCL script created by evaldsson_martin, which can be found in at ESMValTool/esmvaltool/cmorizers/data/formatters/datasets/_modis_aqua.ncl - - -""" - -import logging -import os -import re -from datetime import datetime, timedelta - -import iris -import numpy as np -from dask import array as da -from esmvalcore.cmor.table import CMOR_TABLES -from pyhdf.SD import SD, SDC - -from esmvaltool.cmorizers.data import utilities as utils - -logger = logging.getLogger(__name__) - - -def collect_files(in_dir: str, cfg) -> list: - """Compose input file list and download if missing.""" - - attrs = cfg["attributes"] - # rootpath / Tier{tier}/{dataset}/{version}/{frequency}/{short_name} - # in_dir = rootpath / Tier{tier}/{dataset} - in_dir = os.path.join(in_dir) - fname_pattern = ( - f"{attrs['version']}\\.A.*\\.hdf$" # e.g. 
"MYD08_M3.A*.hdf" - ) - - files_a = next(os.walk(in_dir), (None, None, []))[ - 2 - ] # get list of files in in_dir - files = [] - for filename in files_a: - if re.search(fname_pattern, filename): - files.append(filename) - - if len(files) == 0: - raise Exception("No files found", in_dir) - - return files - - -def get_year_from_filepath(filepath: str, cfg) -> int: - """get year from first four characters of last section of basename""" - - attrs = cfg["attributes"] - basename = os.path.splitext(os.path.basename(filepath))[0] - offset = len( - f"{attrs['version']}.A" - ) # length of version string + dot + 'A' character - year = int(basename[offset : offset + 4]) - return year - - -def get_month_from_filepath(filepath: str, cfg) -> int: - """get month from timerange section of basename""" - - attrs = cfg["attributes"] - basename = os.path.splitext(os.path.basename(filepath))[0] - offset = len( - f"{attrs['version']}.A" - ) # length of version string + dot + 'A' character - doy = int(basename[offset + 4 : offset + 7]) # day of year - year = int(basename[offset : offset + 4]) - # Convert day-of-year to month - date = datetime(year, 1, 1) + timedelta(days=doy - 1) - month = date.month - logger.debug( - f"Extracted month {month}, doy {doy}, year {year} from filepath {filepath}" - ) - return month - - -def group_files_by_year(filepaths: list, cfg) -> dict: - """group filepaths by year using get_year_from_filepath function""" - years_D = dict() - for filename in filepaths: - year = get_year_from_filepath(filename, cfg) - - if year not in years_D: - years_D[year] = [filename] - else: - years_D[year].append(filename) - - return years_D - - -def read_hdf( - in_dir: str, - filepath: str, - year: int, - month: int, - raw_name: str, - short_name: str, - **extras, -): - """Read HDF file and build iris cube with auxiliary data for special variables.""" - f = SD(os.path.join(in_dir, filepath), SDC.READ) - - data_obj = f.select(raw_name) - data = data_obj.get().astype(np.float32) - - 
# mask fill values - if hasattr(data_obj, "fill_value"): - data[data == data_obj.fill_value] = np.nan - - # mask values outside valid range - if hasattr(data_obj, "valid_range"): - valid_range = data_obj.valid_range - logger.debug(f"Masking {short_name} data outside range {valid_range}") - data[(data < valid_range[0]) | (data > valid_range[1])] = np.nan - - # apply scale factor and add offset if they exist - if hasattr(data_obj, "scale_factor"): - scale_factor = data_obj.scale_factor - logger.debug(f"Scaling {short_name} data by {scale_factor}") - data *= scale_factor - if hasattr(data_obj, "add_offset"): - add_offset = data_obj.add_offset - logger.debug(f"Offsetting {short_name} data by {add_offset}") - data += add_offset - - # for certain variables, we need to read additional datasets to compute the final variable - - # liquid water path in-cloud, need to use cirrus fraction and cloud fraction to estimate liquid fraction, then multiply by in-cloud lwp to get grid-box average lwp - if short_name in ["clwvi"]: - cfirFm_obj = f.select("Cirrus_Fraction_Infrared_FMean") - cfin = cfirFm_obj.get().astype(np.float32) - # unpack with scale_factor and add_offset - cif = (cfin * cfirFm_obj.scale_factor) + cfirFm_obj.add_offset - cif[cif > 0.999] = np.nan - - cfmm_obj = f.select("Cloud_Fraction_Mean_Mean") - cfmm = cfmm_obj.get().astype(np.float32) - ctot = (cfmm * cfmm_obj.scale_factor) + cfmm_obj.add_offset - - # liquid fraction estimated assuming random overlap: - # ctot = 1 - (1 - cif) * (1 - lif) - # --> lif = 1 - (1 - ctot) / (1 - cif) - lif = 1.0 - (1.0 - ctot) / (1.0 - cif) - lif[lif < 0] = 0 - - # ice water path - cwpimm_obj = f.select("Cloud_Water_Path_Ice_Mean_Mean") - cwpimm = cwpimm_obj.get().astype(np.float32) - cwpimm_fv = cwpimm_obj._FillValue - cwpimm[cwpimm == cwpimm_fv] = np.nan # mask all fill values - - # convert iwp in-cloud to gridbox avg using masked cirrus fraction - iwp = cwpimm * cif - - # liquid water path - # convert lwp in-cloud value to 
grid-box avg - lwp = data * lif - - data = lwp + iwp - - # these variables are all in-cloud values, so we need to multiply by cirrus fraction to get grid-box average - elif short_name in ["clivi", "iwpStderr", "lwpStderr"]: - # cirrus fraction (0-1) - cfswirFm_obj = f.select("Cirrus_Fraction_Infrared_FMean") - cfswirFm = cfswirFm_obj.get().astype(np.float32) - # unpack with scale_factor and add_offset - cf = (cfswirFm * cfswirFm_obj.scale_factor) + cfswirFm_obj.add_offset - cf[cf > 0.999] = np.nan - - # lwpStderr is only for liquid water path, so we want to use the liquid fraction to convert to grid-box average - if short_name in ["lwpStderr"]: - cfmm_obj = f.select("Cloud_Fraction_Mean_Mean") - cfmm = cfmm_obj.get().astype(np.float32) - # unpack with scale_factor and add_offset - ctot = (cfmm * cfmm_obj.scale_factor) + cfmm_obj.add_offset - - cif = cf - cif[cf > 0.999] = np.nan - - cf = 1.0 - (1.0 - ctot) / (1.0 - cif) - cf[cf < 0] = 0 - - data *= cf # "grid-box average" lwp/iwp - - # mask all null values (nan, inf) - data = np.ma.masked_invalid(data) - - # Handle units - convert invalid units to '1' - units_str = data_obj.attributes().get("units", "1") - if units_str in ["none", "None", ""]: - units_str = "1" - - # Create time coordinate for the start of the month, with bounds covering the whole month - time_point = datetime( - year=year, month=month, day=15 - ) # use 15th as representative time for the month - time_bounds_lower = datetime(year=year, month=month, day=1) - time_bounds_upper = datetime( - year=year + (month == 12), month=month + 1 - (month == 12) * 12, day=1 - ) - timedelta( - days=1 - ) # end of month is the day before the first day of the next month - - # Convert time to days since 1850-01-01 - time_units = datetime(1850, 1, 1) - delta_1850 = (time_point - time_units).days - # After creating time delta, add bounds: - delta_bounds_lower = (time_bounds_lower - time_units).days - delta_bounds_upper = (time_bounds_upper - time_units).days - - 
time_coord = iris.coords.DimCoord( - points=[delta_1850], - standard_name="time", - units="days since 1850-01-01 00:00:00", - bounds=[[delta_bounds_lower, delta_bounds_upper]], - ) - data = data[np.newaxis, :, :] # Add time dimension at start - - lats = f.select("YDim").get().astype(np.float64) - lat_coord = iris.coords.DimCoord( - lats, - standard_name="latitude", - units="degrees_north", - bounds=[ - ( # calculate bounds as midpoint between adjacent latitudes, assuming regular grid - lats[i] - 0.5 * np.abs(lats[1] - lats[0]), - lats[i] + 0.5 * np.abs(lats[1] - lats[0]), - ) - for i in range(len(lats)) - ], - ) - - lons = f.select("XDim").get().astype(np.float64) - lon_coord = iris.coords.DimCoord( - lons, - standard_name="longitude", - units="degrees_east", - bounds=[ - ( # calculate bounds as midpoint between adjacent longitudes, assuming regular grid - lons[i] - 0.5 * np.abs(lons[1] - lons[0]), - lons[i] + 0.5 * np.abs(lons[1] - lons[0]), - ) - for i in range(len(lons)) - ], - ) - - if short_name == "od550aer": - # add auxiliary coordinate for wavelength (550 nm for this variable, which is AOD at 550 nm) - wavelength_coord = iris.coords.AuxCoord( - [550], - standard_name="radiation_wavelength", - var_name="wavelength", - units="nm", - ) - aux_coords_and_dims = [ - ( - wavelength_coord, - None, - ), # wavelength is a scalar auxiliary coordinate - ] - else: - aux_coords_and_dims = ( - None # no auxiliary coordinates for other variables - ) - - cube = iris.cube.Cube( - data, - long_name=data_obj.attributes().get("long_name", raw_name), - var_name=short_name, - units=units_str, - dim_coords_and_dims=[ - (time_coord, 0), - (lat_coord, 1), - (lon_coord, 2), - ], - aux_coords_and_dims=aux_coords_and_dims, - ) - - return cube - - -def convert( - cube: iris.cube.Cube, - cmor_info, - attrs, -) -> iris.cube.Cube: - """Convert cube to CMOR standards based on data type and cmor_info.""" - - if attrs.get("reference") is None: - attrs["reference"] = 
cmor_info.attributes["reference"] - - cube.convert_units(cmor_info.units) - - if cube.coord("longitude").points.min() < 0: - # convert from [-180, 180] to [0, 360] - cube.coord("longitude").points = cube.coord("longitude").points + 180.0 - # roll the data as part of the longitude conversion to maintain correct order (assuming regular grid and that longitude is the last dimension) - nlon = len(cube.coord("longitude").points) - dim_lon = 2 - cube.data = da.roll(cube.core_data(), int(nlon / 2), axis=dim_lon) - - if np.diff(cube.coord("latitude").points)[0] < 0: - # convert [90,-90] to [-90,90] - cube.coord("latitude").points = cube.coord("latitude").points[::-1] - # flip the data - cube.data = cube.data[:, ::-1, :] # latitude is axis=1 - - utils.set_global_atts(cube, attrs) - utils.fix_var_metadata(cube, cmor_info) - utils.fix_dim_coordnames(cube) - - utils.fix_coords(cube) - - return cube - - -def _extract_variable( - filepaths: list, - var_d, - attrs, - cmor_table, - cfg, - in_dir: str, - out_dir: str, - year: int, -) -> None: - """Extract variable data from input files, convert to CMOR standards, and save.""" - - # add mip to attributes for use in cmor table lookup and global attributes - attrs["mip"] = var_d["mip"] - - # get cmor table for variable - cmor_info = cmor_table.get_variable( - var_d.get("mip"), var_d.get("short_name") - ) - - # if this variable has a reference specified in the config, use that, otherwise use the one from the cmor table - if var_d.get("reference"): - attrs["reference"] = var_d.get("reference") - - if not cmor_table: - raise Exception( - f'project_id "{var_d.get("project_id", "None")}" is invalid, no CMOR table found. 
Valid options: {", ".join(list(CMOR_TABLES.keys()))}' - ) - - # check if cmor info was found for the variable, if not log an error and skip - if cmor_info is None: - logger.error( - f"CMOR info not found for variable {var_d.get('short_name')} in mip {var_d.get('mip')}" - ) - return - logger.debug( - f"CMOR info for variable {var_d.get('short_name')}: {cmor_info}" - ) - - logger.info( - f"CMORizing variable {var_d.get('short_name')} from input set {var_d.get('raw_name')}" - ) - - cubes = iris.cube.CubeList() - for filepath in filepaths: - month = get_month_from_filepath(filepath, cfg) - logger.debug( - f"File: {filepath} -> var={var_d.get('short_name')} year={year}, month={month}" - ) - cube = read_hdf(in_dir, filepath, year, month, **var_d) - - logger.debug( - f"Read cube for variable {var_d.get('short_name')}: {cube}, with attributes: {cube.attributes}" - ) - cubes.append(cube) - - if len(cubes) > 0: - cube = cubes.concatenate()[ - 0 - ] # concatenate along time dimension and take the resulting cube - else: - cube = cubes[0] - - cube = convert(cube, cmor_info, attrs) - - # add dataset_id to global attributes for provenance - cube_attrs = cube.attributes.globals - cube_attrs["dataset_id"] = attrs["dataset_id"] - logger.debug(f"Saving variable {cube} with attributes: {cube_attrs}") - - utils.save_variable( - cube, - var_d.get("short_name"), - out_dir, - cube_attrs, - unlimited_dimensions=["time"], - ) - - -def cmorization(in_dir, out_dir, cfg, cfg_user, start_date, end_date): - """Run CMORizer for MODIS.""" - - cmor_table = cfg["cmor_table"] - glob_attrs = cfg["attributes"] - - # get relevant files - filepaths = collect_files(in_dir, cfg) - - # group by year - years_d = group_files_by_year(filepaths, cfg) - years = sorted(list(years_d.keys())) - - # iterate over the variables to be CMORized - for var, var_d in cfg["variables"].items(): - logger.info( - "CMORizing var %s from input set %s", var, var_d["raw_name"] - ) - var_d["short_name"] = var - logger.debug("\n" + 
format(var_d)) - - # cmorize by year, merging all months at the end - for year in years: - _extract_variable( - years_d[year], - var_d, - glob_attrs, - cmor_table, - cfg, - in_dir, - out_dir, - year, - ) diff --git a/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py b/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py deleted file mode 100644 index 53e942be64..0000000000 --- a/esmvaltool/cmorizers/data/formatters/datasets/modis_terra.py +++ /dev/null @@ -1,441 +0,0 @@ -"""ESMValTool CMORizer for MODIS terra data. - -Tier - Tier 3 - -Source - https://ladsweb.modaps.eosdis.nasa.gov/search/order - -Requirements - python hdf reader, https://pypi.org/project/pyhdf/ - -Modification history - 20260216-Fruttarol_Noah: Written based on the original MODIS CMORization NCL script created by evaldsson_martin, which can be found in at ESMValTool/esmvaltool/cmorizers/data/formatters/datasets/_modis_terra.ncl - - -""" - -import logging -import os -import re -from datetime import datetime, timedelta - -import iris -import numpy as np -from dask import array as da -from esmvalcore.cmor.table import CMOR_TABLES -from pyhdf.SD import SD, SDC - -from esmvaltool.cmorizers.data import utilities as utils - -logger = logging.getLogger(__name__) - - -def collect_files(in_dir: str, cfg) -> list: - """Compose input file list and download if missing.""" - - attrs = cfg["attributes"] - # rootpath / Tier{tier}/{dataset}/{version}/{frequency}/{short_name} - # in_dir = rootpath / Tier{tier}/{dataset} - in_dir = os.path.join(in_dir) - fname_pattern = ( - f"{attrs['version']}\\.A.*\\.hdf$" # e.g. 
"MOD08_M3.A*.hdf" - ) - - files_a = next(os.walk(in_dir), (None, None, []))[ - 2 - ] # get list of files in in_dir - files = [] - for filename in files_a: - if re.search(fname_pattern, filename): - files.append(filename) - - if len(files) == 0: - raise Exception("No files found", in_dir) - - return files - - -def get_year_from_filepath(filepath: str, cfg) -> int: - """get year from first four characters of last section of basename""" - - attrs = cfg["attributes"] - basename = os.path.splitext(os.path.basename(filepath))[0] - offset = len( - f"{attrs['version']}.A" - ) # length of version string + dot + 'A' character - year = int(basename[offset : offset + 4]) - return year - - -def get_month_from_filepath(filepath: str, cfg) -> int: - """get month from timerange section of basename""" - - attrs = cfg["attributes"] - basename = os.path.splitext(os.path.basename(filepath))[0] - offset = len( - f"{attrs['version']}.A" - ) # length of version string + dot + 'A' character - doy = int(basename[offset + 4 : offset + 7]) # day of year - year = int(basename[offset : offset + 4]) - # Convert day-of-year to month - date = datetime(year, 1, 1) + timedelta(days=doy - 1) - month = date.month - logger.debug( - f"Extracted month {month}, doy {doy}, year {year} from filepath {filepath}" - ) - return month - - -def group_files_by_year(filepaths: list, cfg) -> dict: - """group filepaths by year using get_year_from_filepath function""" - years_D = dict() - for filename in filepaths: - year = get_year_from_filepath(filename, cfg) - - if year not in years_D: - years_D[year] = [filename] - else: - years_D[year].append(filename) - - return years_D - - -def read_hdf( - in_dir: str, - filepath: str, - year: int, - month: int, - raw_name: str, - short_name: str, - **extras, -): - """Read HDF file and build iris cube with auxiliary data for special variables.""" - f = SD(os.path.join(in_dir, filepath), SDC.READ) - - data_obj = f.select(raw_name) - data = data_obj.get().astype(np.float32) - - 
# mask fill values - if hasattr(data_obj, "fill_value"): - data[data == data_obj.fill_value] = np.nan - - # mask values outside valid range - if hasattr(data_obj, "valid_range"): - valid_range = data_obj.valid_range - logger.debug(f"Masking {short_name} data outside range {valid_range}") - data[(data < valid_range[0]) | (data > valid_range[1])] = np.nan - - # apply scale factor and add offset if they exist - if hasattr(data_obj, "scale_factor"): - scale_factor = data_obj.scale_factor - logger.debug(f"Scaling {short_name} data by {scale_factor}") - data *= scale_factor - if hasattr(data_obj, "add_offset"): - add_offset = data_obj.add_offset - logger.debug(f"Offsetting {short_name} data by {add_offset}") - data += add_offset - - # for certain variables, we need to read additional datasets to compute the final variable - - # liquid water path in-cloud, need to use cirrus fraction and cloud fraction to estimate liquid fraction, then multiply by in-cloud lwp to get grid-box average lwp - if short_name in ["clwvi"]: - cfirFm_obj = f.select("Cirrus_Fraction_Infrared_FMean") - cfin = cfirFm_obj.get().astype(np.float32) - # unpack with scale_factor and add_offset - cif = (cfin * cfirFm_obj.scale_factor) + cfirFm_obj.add_offset - cif[cif > 0.999] = np.nan - - cfmm_obj = f.select("Cloud_Fraction_Mean_Mean") - cfmm = cfmm_obj.get().astype(np.float32) - ctot = (cfmm * cfmm_obj.scale_factor) + cfmm_obj.add_offset - - # liquid fraction estimated assuming random overlap: - # ctot = 1 - (1 - cif) * (1 - lif) - # --> lif = 1 - (1 - ctot) / (1 - cif) - lif = 1.0 - (1.0 - ctot) / (1.0 - cif) - lif[lif < 0] = 0 - - # ice water path - cwpimm_obj = f.select("Cloud_Water_Path_Ice_Mean_Mean") - cwpimm = cwpimm_obj.get().astype(np.float32) - cwpimm_fv = cwpimm_obj._FillValue - cwpimm[cwpimm == cwpimm_fv] = np.nan # mask all fill values - - # convert iwp in-cloud to gridbox avg using masked cirrus fraction - iwp = cwpimm * cif - - # liquid water path - # convert lwp in-cloud value to 
grid-box avg - lwp = data * lif - - data = lwp + iwp - - # these variables are all in-cloud values, so we need to multiply by cirrus fraction to get grid-box average - elif short_name in ["clivi", "iwpStderr", "lwpStderr"]: - # cirrus fraction (0-1) - cfswirFm_obj = f.select("Cirrus_Fraction_Infrared_FMean") - cfswirFm = cfswirFm_obj.get().astype(np.float32) - # unpack with scale_factor and add_offset - cf = (cfswirFm * cfswirFm_obj.scale_factor) + cfswirFm_obj.add_offset - cf[cf > 0.999] = np.nan - - # lwpStderr is only for liquid water path, so we want to use the liquid fraction to convert to grid-box average - if short_name in ["lwpStderr"]: - cfmm_obj = f.select("Cloud_Fraction_Mean_Mean") - cfmm = cfmm_obj.get().astype(np.float32) - # unpack with scale_factor and add_offset - ctot = (cfmm * cfmm_obj.scale_factor) + cfmm_obj.add_offset - - cif = cf - cif[cf > 0.999] = np.nan - - cf = 1.0 - (1.0 - ctot) / (1.0 - cif) - cf[cf < 0] = 0 - - data *= cf # "grid-box average" lwp/iwp - - # mask all null values (nan, inf) - data = np.ma.masked_invalid(data) - - # Handle units - convert invalid units to '1' - units_str = data_obj.attributes().get("units", "1") - if units_str in ["none", "None", ""]: - units_str = "1" - - # Create time coordinate for the start of the month, with bounds covering the whole month - time_point = datetime( - year=year, month=month, day=15 - ) # use 15th as representative time for the month - time_bounds_lower = datetime(year=year, month=month, day=1) - time_bounds_upper = datetime( - year=year + (month == 12), month=month + 1 - (month == 12) * 12, day=1 - ) - timedelta( - days=1 - ) # end of month is the day before the first day of the next month - - # Convert time to days since 1850-01-01 - time_units = datetime(1850, 1, 1) - delta_1850 = (time_point - time_units).days - # After creating time delta, add bounds: - delta_bounds_lower = (time_bounds_lower - time_units).days - delta_bounds_upper = (time_bounds_upper - time_units).days - - 
time_coord = iris.coords.DimCoord( - points=[delta_1850], - standard_name="time", - units="days since 1850-01-01 00:00:00", - bounds=[[delta_bounds_lower, delta_bounds_upper]], - ) - data = data[np.newaxis, :, :] # Add time dimension at start - - lats = f.select("YDim").get().astype(np.float64) - lat_coord = iris.coords.DimCoord( - lats, - standard_name="latitude", - units="degrees_north", - bounds=[ - ( # calculate bounds as midpoint between adjacent latitudes, assuming regular grid - lats[i] - 0.5 * np.abs(lats[1] - lats[0]), - lats[i] + 0.5 * np.abs(lats[1] - lats[0]), - ) - for i in range(len(lats)) - ], - ) - - lons = f.select("XDim").get().astype(np.float64) - lon_coord = iris.coords.DimCoord( - lons, - standard_name="longitude", - units="degrees_east", - bounds=[ - ( # calculate bounds as midpoint between adjacent longitudes, assuming regular grid - lons[i] - 0.5 * np.abs(lons[1] - lons[0]), - lons[i] + 0.5 * np.abs(lons[1] - lons[0]), - ) - for i in range(len(lons)) - ], - ) - - if short_name == "od550aer": - # add auxiliary coordinate for wavelength (550 nm for this variable, which is AOD at 550 nm) - wavelength_coord = iris.coords.AuxCoord( - [550], - standard_name="radiation_wavelength", - var_name="wavelength", - units="nm", - ) - aux_coords_and_dims = [ - ( - wavelength_coord, - None, - ), # wavelength is a scalar auxiliary coordinate - ] - else: - aux_coords_and_dims = ( - None # no auxiliary coordinates for other variables - ) - - cube = iris.cube.Cube( - data, - long_name=data_obj.attributes().get("long_name", raw_name), - var_name=short_name, - units=units_str, - dim_coords_and_dims=[ - (time_coord, 0), - (lat_coord, 1), - (lon_coord, 2), - ], - aux_coords_and_dims=aux_coords_and_dims, - ) - - return cube - - -def convert( - cube: iris.cube.Cube, - cmor_info, - attrs, -) -> iris.cube.Cube: - """Convert cube to CMOR standards based on data type and cmor_info.""" - - if attrs.get("reference") is None: - attrs["reference"] = 
cmor_info.attributes["reference"] - - cube.convert_units(cmor_info.units) - - if cube.coord("longitude").points.min() < 0: - # convert from [-180, 180] to [0, 360] - cube.coord("longitude").points = cube.coord("longitude").points + 180.0 - # roll the data as part of the longitude conversion to maintain correct order (assuming regular grid and that longitude is the last dimension) - nlon = len(cube.coord("longitude").points) - dim_lon = 2 - cube.data = da.roll(cube.core_data(), int(nlon / 2), axis=dim_lon) - - if np.diff(cube.coord("latitude").points)[0] < 0: - # convert [90,-90] to [-90,90] - cube.coord("latitude").points = cube.coord("latitude").points[::-1] - # flip the data - cube.data = cube.data[:, ::-1, :] # latitude is axis=1 - - utils.set_global_atts(cube, attrs) - utils.fix_var_metadata(cube, cmor_info) - utils.fix_dim_coordnames(cube) - - utils.fix_coords(cube) - - return cube - - -def _extract_variable( - filepaths: list, - var_d, - attrs, - cmor_table, - cfg, - in_dir: str, - out_dir: str, - year: int, -) -> None: - """Extract variable data from input files, convert to CMOR standards, and save.""" - - # add mip to attributes for use in cmor table lookup and global attributes - attrs["mip"] = var_d["mip"] - - # get the cmor table for the variable - cmor_info = cmor_table.get_variable( - var_d.get("mip"), var_d.get("short_name") - ) - - # if this variable has a reference specified in the config, use that, otherwise use the one from the cmor table - if var_d.get("reference"): - attrs["reference"] = var_d.get("reference") - - if not cmor_table: - raise Exception( - f'project_id "{var_d.get("project_id", "None")}" is invalid, no CMOR table found. 
Valid options: {", ".join(list(CMOR_TABLES.keys()))}' - ) - - # check if cmor info was found for the variable, if not log an error and skip - if cmor_info is None: - logger.error( - f"CMOR info not found for variable {var_d.get('short_name')} in mip {var_d.get('mip')}" - ) - return - logger.debug( - f"CMOR info for variable {var_d.get('short_name')}: {cmor_info}" - ) - - logger.info( - f"CMORizing variable {var_d.get('short_name')} from input set {var_d.get('raw_name')}" - ) - - cubes = iris.cube.CubeList() - for filepath in filepaths: - month = get_month_from_filepath(filepath, cfg) - logger.debug( - f"File: {filepath} -> var={var_d.get('short_name')} year={year}, month={month}" - ) - cube = read_hdf(in_dir, filepath, year, month, **var_d) - - logger.debug( - f"Read cube for variable {var_d.get('short_name')}: {cube}, with attributes: {cube.attributes}" - ) - cubes.append(cube) - - if len(cubes) > 0: - cube = cubes.concatenate()[ - 0 - ] # concatenate along time dimension and take the resulting cube - else: - cube = cubes[0] - - cube = convert(cube, cmor_info, attrs) - - # add dataset_id to global attributes for provenance - cube_attrs = cube.attributes.globals - cube_attrs["dataset_id"] = attrs["dataset_id"] - logger.debug(f"Saving variable {cube} with attributes: {cube_attrs}") - - utils.save_variable( - cube, - var_d.get("short_name"), - out_dir, - cube_attrs, - unlimited_dimensions=["time"], - ) - - -def cmorization(in_dir, out_dir, cfg, cfg_user, start_date, end_date): - """Run CMORizer for MODIS.""" - - cmor_table = cfg["cmor_table"] - glob_attrs = cfg["attributes"] - - # get relevant files - filepaths = collect_files(in_dir, cfg) - - # group by year - years_d = group_files_by_year(filepaths, cfg) - years = sorted(list(years_d.keys())) - - # iterate over the variables to be CMORized - for var, var_d in cfg["variables"].items(): - logger.info( - "CMORizing var %s from input set %s", var, var_d["raw_name"] - ) - var_d["short_name"] = var - logger.debug("\n" + 
format(var_d)) - - # cmorize by year, merging all months at the end - for year in years: - _extract_variable( - years_d[year], - var_d, - glob_attrs, - cmor_table, - cfg, - in_dir, - out_dir, - year, - ) diff --git a/pyproject.toml b/pyproject.toml index 9dcb1d5aca..c31d67b3e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,6 @@ dependencies = [ "packaging", "pandas", "progressbar2", - "pyhdf", "pyproj>=2.1", "pys2index", "python-dateutil",