diff --git a/profiles_eval/configs/facility_pairs.json b/profiles_eval/configs/facility_pairs.json new file mode 100644 index 0000000..b6e7368 --- /dev/null +++ b/profiles_eval/configs/facility_pairs.json @@ -0,0 +1,7 @@ +{ + "sgp": { + "C1": { + "supplementary_facilities": ["E13"] + } + } +} diff --git a/profiles_eval/configs/hsrl.json b/profiles_eval/configs/hsrl.json index f05d063..79dcf24 100644 --- a/profiles_eval/configs/hsrl.json +++ b/profiles_eval/configs/hsrl.json @@ -28,15 +28,15 @@ "name_in_file": "extinction_aerosol", "units": "1/m" }, - "linear_depol_ratio": { + "particulate_linear_depol_ratio": { "name_in_file": "linear_depol", "units": "1" }, - "linear_depol_ratio_1064nm": { + "particulate_linear_depol_ratio_1064nm": { "name_in_file": "linear_depol_1064", "units": "1" }, - "optical_depth": { + "particulate_optical_depth": { "name_in_file": "od_aerosol", "units": "1" }, diff --git a/profiles_eval/configs/hsrl5s.json b/profiles_eval/configs/hsrl5s.json new file mode 100644 index 0000000..40b0e75 --- /dev/null +++ b/profiles_eval/configs/hsrl5s.json @@ -0,0 +1,75 @@ +{ + "instrument_info": { + "name": "High spectral resolution lidar (HSRL) 5-second resolution", + "type": "lidar", + "instrument_class": "hsrl5s", + "level": "a1", + "requires_corrections": false + }, + "variables": { + "time": { + "name_in_file": "time", + "units": "sec" + }, + "range": { + "name_in_file": "range", + "units": "m" + }, + "particulate_backscatter": { + "name_in_file": "beta_a_backscatter", + "units": "1/(m*sr)" + }, + "particulate_backscatter_1064nm": { + "name_in_file": "beta_a_1064_backscatter", + "units": "1/(m*sr)" + }, + "particulate_extinction": { + "name_in_file": "extinction_aerosol", + "units": "1/m" + }, + "particulate_linear_depol_ratio": { + "name_in_file": "linear_depol", + "units": "1" + }, + "particulate_linear_depol_ratio_1064nm": { + "name_in_file": "linear_depol_1064", + "units": "1" + }, + "particulate_optical_depth": { + "name_in_file": "od_aerosol", + "units": "1" + }, + "molecular_attenuated_backscatter": { + "name_in_file": "atten_beta_r_backscatter", + "units": "1/(m*sr)" + }, + "attenuated_backscatter": { + "name_in_file": "atten_beta_a_backscatter", + "units": "1/(m*sr)" + }, + "attenuated_backscatter_1064nm": { + "name_in_file": "atten_beta_a_1064nm_backscatter", + "units": "1/(m*sr)" + }, + "particulate_backscatter_std": { + "name_in_file": "beta_a_backscatter_std", + "units": "1/(m*sr)" + }, + "molecular_signal_to_noise": { + "name_in_file": "molecular_signal_to_noise", + "units": "1" + }, + "particulate_backscatter_signal_to_noise": { + "name_in_file": "particulate_backscatter_signal_to_noise", + "units": "1" + }, + "color_ratio": { + "name_in_file": "color_ratio", + "units": "1" + } + }, + "coordinates": { + "time": "time", + "range": "range" + } +} diff --git a/profiles_eval/real_prof_init.py b/profiles_eval/real_prof_init.py index e533b91..a3cd119 100644 --- a/profiles_eval/real_prof_init.py +++ b/profiles_eval/real_prof_init.py @@ -151,4 +151,72 @@ def find_instrument_type( return cfg_path.stem except Exception: continue - return None \ No newline at end of file + return None + + +def load_facility_pairs(config_dir: str = "./configs") -> Dict: + """ + Load the facility-pairing configuration. + + Reads ``facility_pairs.json`` from *config_dir* to determine which + (site, facility) combinations have supplementary facilities. Returns + an empty dict if the file does not exist. + + Parameters + ---------- + config_dir : str, optional + Directory containing JSON configuration files, by default "./configs" + + Returns + ------- + Dict + Parsed dictionary with structure:: + + { + "site_code": { + "facility_code": { "supplementary_facilities": ["fac1", "fac2"] } + } + } + + Returns ``{}`` if facility_pairs.json does not exist. + """ + pairs_file = Path(config_dir) / "facility_pairs.json" + + if not pairs_file.exists(): + return {} + + try: + with open(pairs_file, "r") as f: + pairs = json.load(f) + return pairs + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in facility pairs file {pairs_file}: {e}") + + +def get_supplementary_facilities( + site: str, facility: str, config_dir: str = "./configs" +) -> list[str]: + """ + Get the list of supplementary facilities for a given (site, facility) pair. + + Consults the facility-pairing configuration (loaded via + :func:`load_facility_pairs`) and returns any supplementary facilities + configured for the given primary (site, facility) combination. + + Parameters + ---------- + site : str + ARM site code, e.g. ``"sgp"``. + facility : str + ARM facility code, e.g. ``"C1"``. + config_dir : str, optional + Directory containing JSON configuration files, by default "./configs" + + Returns + ------- + list[str] + List of supplementary facility codes, or empty list if none are + defined for this (site, facility) pair. + """ + pairs = load_facility_pairs(config_dir) + return pairs.get(site, {}).get(facility, {}).get("supplementary_facilities", []) \ No newline at end of file diff --git a/profiles_eval/real_prof_io.py b/profiles_eval/real_prof_io.py index bd6f03d..7451c07 100644 --- a/profiles_eval/real_prof_io.py +++ b/profiles_eval/real_prof_io.py @@ -30,6 +30,64 @@ ] +def _native_time_res_str(time_arr: np.ndarray) -> str: + """ + Compute median native time resolution from a time array. + + Parameters + ---------- + time_arr : np.ndarray + 1D array of datetime64 values. + + Returns + ------- + str + Median time step formatted as e.g. "30 s". + """ + if len(time_arr) < 2: + return "undetermined (only one time point)" + diffs = np.diff(time_arr) + # Convert to timedelta64 and extract median in seconds + diffs_s = diffs / np.timedelta64(1, "s") + median_s = float(np.median(diffs_s)) + + # Format: prefer integer or fraction seconds if < 60, otherwise round to nearest second. + if median_s < 60: + return f"{int(median_s)} s" if median_s == int(median_s) else f"{median_s:.1f} s" + else: + return f"{int(median_s)} s" + + +def _native_range_res_str(range_da: xr.DataArray, units_str: str) -> str: + """ + Compute median native range resolution from a range array. + + Always returns result in km. + + Parameters + ---------- + range_da : xr.DataArray + 1D array of range/height values. + units_str : str + Unit string (e.g. "m", "km"). + + Returns + ------- + str + Median range spacing in km, formatted as e.g. "0.015 km", "0.5 km". + """ + if len(range_da) < 2: + return "undetermined (only one range gate)" + diffs = np.diff(range_da.values) + median_diff = float(np.median(np.abs(diffs))) + + # Convert to km if needed + scale_to_km = 1.0 / 1000.0 if units_str.lower() == "m" else 1.0 + median_km = median_diff * scale_to_km + + return f"{median_km:.4g} km" + + def _ordered_vars(var_names: list[str]) -> list[str]: """Return *var_names* in canonical order with qc_ vars after their parent. @@ -525,6 +583,16 @@ def load_and_process_arm_data( _range_da.attrs.get("units", "m") if _range_da is not None else "m" ) + # ------------------------------------------------------------------ + # Capture native resolutions from primary dataset + # ------------------------------------------------------------------ + native_time_res = _native_time_res_str(ds.time.values) + range_coord = ds.get(range_nif, ds.coords.get(range_nif)) + native_range_res = ( + _native_range_res_str(range_coord, range_units) + if range_coord is not None else "unknown" + ) + # ------------------------------------------------------------------ # 2. Load high-resolution dataset if specified # ------------------------------------------------------------------ @@ -541,6 +609,19 @@ def load_and_process_arm_data( ) except FileNotFoundError: ds_highres = None + + # Capture native resolutions from high-res dataset if available + native_time_res2 = None + native_range_res2 = None + highres_range_units = None + if ds_highres is not None: + # Derive range units from highres dataset, not primary dataset + _hr_range_da = ds_highres.get(range_nif, ds_highres.coords.get(range_nif)) + highres_range_units = ( + _hr_range_da.attrs.get("units", "m") if _hr_range_da is not None else "m" + ) + native_time_res2 = _native_time_res_str(ds_highres.time.values) + native_range_res2 = _native_range_res_str(ds_highres["range"], highres_range_units) # ------------------------------------------------------------------ # 3. Apply NRB corrections if required @@ -711,11 +792,11 @@ def _nif(key: str) -> str: time_max=time_max, range_km=range_km, time_step=time_step, - range_units=range_units, + range_units=highres_range_units, ) highres_max_range = float( ds_highres["range"].max() - / (1000.0 if range_units.lower() == "m" else 1.0) + / (1000.0 if highres_range_units.lower() == "m" else 1.0) ) out_range = ds["range"].values if "range" in ds.coords else range_km for var in ds.data_vars: @@ -729,17 +810,42 @@ def _nif(key: str) -> str: ds[var], ) blended.attrs = ds[var].attrs - # Update source attributes to reflect both streams + # Add _2 suffixed source attributes from highres data + hr_field = ds_highres_interp[var].attrs.get("source_fieldname", "") + if hr_field: + blended.attrs["source_fieldname_2"] = hr_field hr_src = ds_highres_interp[var].attrs.get("source_datastream", "") - blended.attrs["source_datastream"] = ( - f"{ds[var].attrs.get('source_datastream','')},{hr_src}".strip(",") - ) + if hr_src: + blended.attrs["source_datastream_2"] = hr_src hr_pv = ds_highres_interp[var].attrs.get("source_process_version", "") - blended.attrs["source_process_version"] = ( - f"{ds[var].attrs.get('source_process_version','')},{hr_pv}".strip(",") - ) + if hr_pv: + blended.attrs["source_process_version_2"] = hr_pv ds[var] = blended + # ------------------------------------------------------------------ + # Attach temporal and vertical resolution attributes to all variables + # ------------------------------------------------------------------ + for var in ds.data_vars: + # All variables get temporal_resolution + ds[var].attrs["temporal_resolution"] = native_time_res + + # Range-dimensioned variables get vertical_resolution (in km) + if "range" in ds[var].dims: + ds[var].attrs["vertical_resolution"] = native_range_res + + # Layer-dimensioned variables also get vertical_resolution (1 layer unit in km) + if "layer" in ds[var].dims: + ds[var].attrs["vertical_resolution"] = native_range_res + + # If HSRL high-res blending occurred, add _2 suffixed attrs + if ds_highres is not None: + # Temporal resolution applies to all variables with time dimension + if "time" in ds[var].dims: + ds[var].attrs["temporal_resolution_2"] = native_time_res2 + # Vertical resolution applies to variables with range dimension + if "range" in ds[var].dims: + ds[var].attrs["vertical_resolution_2"] = native_range_res2 + # ------------------------------------------------------------------ # Final step: rename variables from name_in_file to canonical JSON key # ------------------------------------------------------------------ @@ -900,3 +1006,79 @@ def export_dataset( ) return out_file.resolve() + + +# --------------------------------------------------------------------------- +# Geographic metadata extraction +# --------------------------------------------------------------------------- + +def extract_geographic_metadata( + instrument_types: list[str], + site: str, + facility: str, + data_path: str | Path | None, + time_min: np.datetime64, + time_max: np.datetime64, + safety_delta: np.timedelta64 = np.timedelta64(5, "m"), + config_dir: str = "./configs", + data_path_template: str | None = None, +) -> dict[str, xr.DataArray]: + """ + Extract geographic metadata (lat, lon, alt) from the first available data file. + + Attempts to locate and open data files for each instrument in order, extracting + the 0-D geographic coordinate fields (lat, lon, alt) from the first successful + file found. Returns a dictionary mapping field names to 0-D DataArrays with + full attributes preserved. + + Parameters + ---------- + instrument_types : list[str] + List of instrument identifiers to search, in priority order. + site : str + ARM site code. + facility : str + ARM facility code. + data_path : str, Path, or None + Directory containing instrument data files. Pass ``None`` + when ``data_path_template`` is used instead. + time_min : np.datetime64 + Start of time window for file discovery. + time_max : np.datetime64 + End of time window for file discovery. + safety_delta : np.timedelta64, optional + Lookback/ahead margin for file discovery. Default is 5 minutes. + config_dir : str, optional + Directory containing JSON configuration files. Default is ``"./configs"``. + data_path_template : str, optional + Format string for the ARM nested archive layout. Used when ``data_path`` + is ``None``. + + Returns + ------- + dict[str, xr.DataArray] + Dictionary mapping field names ("lat", "lon", "alt") to 0-D DataArrays + with preserved attributes. Returns only fields that were found; missing + fields are omitted from the dictionary. + """ + geo_data: dict[str, xr.DataArray] = {} + + for instr in instrument_types: + try: + files = find_arm_files( + instr, site, facility, data_path, + time_min, time_max, safety_delta, config_dir, + data_path_template=data_path_template, + ) + if files: + with xr.open_dataset(str(files[0])) as raw_ds: + for field in ["lat", "lon", "alt"]: + if field in raw_ds: + # Keep the full 0-D DataArray with attributes + geo_data[field] = raw_ds[field] + if geo_data: + break + except Exception: + continue + + return geo_data diff --git a/profiles_eval/real_prof_main.py b/profiles_eval/real_prof_main.py index 80d02dc..2dca325 100644 --- a/profiles_eval/real_prof_main.py +++ b/profiles_eval/real_prof_main.py @@ -12,10 +12,14 @@ from pathlib import Path import warnings +import argparse +from datetime import datetime import numpy as np +import xarray as xr -from real_prof_io import load_and_process_arm_data, export_dataset +from real_prof_io import load_and_process_arm_data, export_dataset, extract_geographic_metadata +from real_prof_init import get_supplementary_facilities # --------------------------------------------------------------------------- # Configuration @@ -47,31 +51,111 @@ # Main # --------------------------------------------------------------------------- -def main() -> "xr.Dataset": +def main( + time_min: "np.datetime64 | None" = None, + time_max: "np.datetime64 | None" = None, + site: "str | None" = None, + facility: "str | None" = None, + data_path_template: "str | None" = None, +) -> "xr.Dataset": + # Use module-level defaults if not provided + if time_min is None: + time_min = TIME_MIN + if time_max is None: + time_max = TIME_MAX + if site is None: + site = SITE + if facility is None: + facility = FACILITY + if data_path_template is None: + data_path_template = DATA_PATH_TEMPLATE print("=" * 70) - print(f"Site: {SITE} Facility: {FACILITY}") - print(f"Period: {TIME_MIN} → {TIME_MAX}") - print(f"Path template: {DATA_PATH_TEMPLATE}") + print(f"Site: {site} Facility: {facility}") + print(f"Period: {time_min} → {time_max}") + print(f"Path template: {data_path_template}") print("=" * 70) + # Extract geographic metadata (lat/lon/alt) from first available data file + geo_data = extract_geographic_metadata( + ALL_INSTRUMENTS, + site, + facility, + None, + time_min, + time_max, + config_dir=CONFIG_DIR, + data_path_template=data_path_template, + ) + with warnings.catch_warnings(record=True) as caught: warnings.simplefilter("always") ds = load_and_process_arm_data( instrument_type = ALL_INSTRUMENTS, - site = SITE, - facility = FACILITY, + site = site, + facility = facility, data_path = None, - time_min = TIME_MIN, - time_max = TIME_MAX, + time_min = time_min, + time_max = time_max, config_dir = CONFIG_DIR, - data_path_template = DATA_PATH_TEMPLATE, + data_path_template = data_path_template, interpolate = True, ) + # ------------------------------------------------------------------ + # Load supplementary facility data if configured + # ------------------------------------------------------------------ + supp_facilities = get_supplementary_facilities(SITE, FACILITY, CONFIG_DIR) + contributing_supp_facilities = [] + + for supp_facility in supp_facilities: + print(f"\nLoading supplementary facility '{supp_facility}'...") + try: + ds_supp = load_and_process_arm_data( + instrument_type = ALL_INSTRUMENTS, + site = site, + facility = supp_facility, + data_path = None, + time_min = time_min, + time_max = time_max, + config_dir = CONFIG_DIR, + data_path_template = data_path_template, + interpolate = True, + ) + if ds_supp.data_vars: + # Build rename map: for each variable, find its instrument prefix + # and rename {instr}_{rest} -> {instr}_supp_{rest} + rename_map = {} + for var in ds_supp.data_vars: + # Find which instrument this variable belongs to + for instr in ALL_INSTRUMENTS: + prefix = f"{instr}_" + if var.startswith(prefix): + rest = var[len(prefix):] + rename_map[var] = f"{instr}_supp_{rest}" + break + # If no instrument prefix found, keep the variable as-is + if var not in rename_map: + rename_map[var] = var + + ds_supp = ds_supp.rename(rename_map) + ds = xr.merge([ds, ds_supp]) + contributing_supp_facilities.append(supp_facility) + except FileNotFoundError as exc: + warnings.warn( + f"[{supp_facility}] skipped - {exc}", stacklevel=2 + ) + + # Attach supplementary facility info (set to "N/A" if none) + ds.attrs["supplementary_facility"] = ( + ", ".join(contributing_supp_facilities) + if contributing_supp_facilities + else "N/A" + ) + skipped = [str(w.message) for w in caught] if skipped: - print("\nSkipped instruments:") + print("\nSkipped instruments/facilities:") for msg in skipped: print(f" {msg}") @@ -82,11 +166,98 @@ def main() -> "xr.Dataset": print(f" range: [{float(ds.coords['range'].min()):.3f}," f" {float(ds.coords['range'].max()):.3f}] km") + # ------------------------------------------------------------------ + # Group variables by instrument: primary then supplementary + # ------------------------------------------------------------------ + def _instr_group_key(var): + """Sort key: (instr_index, 0_for_primary_1_for_supp, var_name).""" + for idx, instr in enumerate(ALL_INSTRUMENTS): + if var.startswith(f"{instr}_supp_"): + return (idx, 1, var) + if var.startswith(f"{instr}_"): + return (idx, 0, var) + return (len(ALL_INSTRUMENTS), 0, var) + + ds = ds[sorted(ds.data_vars, key=_instr_group_key)] + + # ------------------------------------------------------------------ + # Add lat/lon/alt as the last variables in the dataset + # ------------------------------------------------------------------ + for field in ["lat", "lon", "alt"]: + if field in geo_data: + ds[field] = geo_data[field] + return ds if __name__ == "__main__": - result = main() - if result.data_vars: - out = export_dataset(result, site=SITE, facility=FACILITY) - print(f"\nExported: {out}") + parser = argparse.ArgumentParser( + description="Process lidar profile intercomparison data." + ) + parser.add_argument( + "--period-start", + type=str, + default=None, + help="Start time in yyyymmddTHHMMSS format (default: {TIME_MIN})", + ) + parser.add_argument( + "--period-end", + type=str, + default=None, + help="End time in yyyymmddTHHMMSS format (default: {TIME_MAX})", + ) + parser.add_argument( + "--site", + type=str, + default=None, + help=f"Site code (default: {SITE})", + ) + parser.add_argument( + "--facility", + type=str, + default=None, + help=f"Facility code (default: {FACILITY})", + ) + parser.add_argument( + "--data-path-template", + type=str, + default=None, + help=f"Data path template (default: {DATA_PATH_TEMPLATE})", + ) + args = parser.parse_args() + + # Parse period_start and period_end from CLI or use defaults + if args.period_start: + period_start = np.datetime64( + datetime.strptime(args.period_start, "%Y%m%dT%H%M%S").isoformat(), "ns" + ) + else: + period_start = TIME_MIN + + if args.period_end: + period_end = np.datetime64( + datetime.strptime(args.period_end, "%Y%m%dT%H%M%S").isoformat(), "ns" + ) + else: + period_end = TIME_MAX + + # Process multiple hours of data, exporting 1-hour files + _hour = np.timedelta64(1, "h") + t = period_start + + while t < period_end: + t_next = t + _hour + result = main( + time_min=t, + time_max=t_next, + site=args.site, + facility=args.facility, + data_path_template=args.data_path_template, + ) + if result.data_vars: + # Use resolved site and facility values + resolved_site = args.site if args.site else SITE + resolved_facility = args.facility if args.facility else FACILITY + out = export_dataset(result, site=resolved_site, facility=resolved_facility) + print(f"\nExported: {out}") + t = t_next diff --git a/profiles_eval/real_prof_utils.py b/profiles_eval/real_prof_utils.py index dd3372d..6cbc052 100644 --- a/profiles_eval/real_prof_utils.py +++ b/profiles_eval/real_prof_utils.py @@ -46,7 +46,7 @@ def get_common_variables(instrument_types: List[str], config_dir: str = "./confi # Default output grids -_DEFAULT_RANGE_KM = np.arange(0.0, 20.0 + 0.015, 0.015) # 0–20 km, 15 m steps +_DEFAULT_RANGE_KM = np.arange(0.0, 20.0 + 0.005, 0.010) # 0–20 km, 10 m steps (IS Update 6/16/2026 per REAL meeting feedback) def interpolate_data( @@ -84,7 +84,7 @@ def interpolate_data( End of the output time grid (inclusive). range_km : np.ndarray, optional 1-D array of output range gate centres in **km**. Defaults to - ``np.arange(0, 20.015, 0.015)`` (0–20 km in 15 m increments). + ``np.arange(0, 20.015, 0.015)`` (0-20 km in 15 m increments). Ignored for variables that have no range dimension. time_step : np.timedelta64, optional Spacing between output time steps. Default is 15 s.