diff --git a/pyproject.toml b/pyproject.toml index 65e97cfe..41a39a3d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = ["poetry-core"] [project] name = "MetObs-toolkit" -version = "1.0.0a1" +version = "1.0.0a13" license = "LICENSE" authors = [{name = "Thomas Vergauwen", email = "thomas.vergauwen@ugent.be"}] description = "A Meteorological observations toolkit for scientists" diff --git a/src/metobs_toolkit/dataset.py b/src/metobs_toolkit/dataset.py index 92d5445b..f269ff03 100644 --- a/src/metobs_toolkit/dataset.py +++ b/src/metobs_toolkit/dataset.py @@ -12,9 +12,11 @@ import numpy as np import concurrent.futures + if TYPE_CHECKING: from matplotlib.pyplot import Axes from xarray import Dataset as xrDataset + from matplotlib.pyplot import Figure from metobs_toolkit.backend_collection.df_helpers import ( save_concat, @@ -41,6 +43,9 @@ from metobs_toolkit.gf_collection.overview_df_constructors import ( dataset_gap_status_overview_df, ) +from metobs_toolkit.qc_collection.overview_df_constructor import ( + dataset_qc_overview_df, +) from metobs_toolkit.backend_collection.filter_modeldatadf import filter_modeldatadf from metobs_toolkit.timestampmatcher import simplify_time from metobs_toolkit.obstypes import tlk_obstypes @@ -1816,7 +1821,9 @@ def gross_value_check( whiteset=whiteset, ) if use_mp: - with concurrent.futures.ProcessPoolExecutor() as executor: + num_cores = Settings.get('use_N_cores_for_MP') + logger.debug(f'Distributing gross_value_check computations over {num_cores} cores.') + with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor: stationgenerator = executor.map( _qc_grossvalue_generatorfunc, func_feed_list ) @@ -1851,7 +1858,9 @@ def persistence_check( whiteset=whiteset, ) if use_mp: - with concurrent.futures.ProcessPoolExecutor() as executor: + num_cores = Settings.get('use_N_cores_for_MP') + logger.debug(f'Distributing persistence_check computations over {num_cores} cores.') + with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor: stationgenerator = executor.map( _qc_persistence_generatorfunc, func_feed_list ) @@ -1883,7 +1892,9 @@ def repetitions_check( ) if use_mp: - with concurrent.futures.ProcessPoolExecutor() as executor: + num_cores = Settings.get('use_N_cores_for_MP') + logger.debug(f'Distributing repetitions_check computations over {num_cores} cores.') + with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor: stationgenerator = executor.map( _qc_repetitions_generatorfunc, func_feed_list ) @@ -1917,7 +1928,9 @@ def step_check( ) if use_mp: - with concurrent.futures.ProcessPoolExecutor() as executor: + num_cores = Settings.get('use_N_cores_for_MP') + logger.debug(f'Distributing step_check computations over {num_cores} cores.') + with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor: stationgenerator = executor.map(_qc_step_generatorfunc, func_feed_list) qced_stations = list(stationgenerator) else: @@ -1957,7 +1970,9 @@ def window_variation_check( ) if use_mp: - with concurrent.futures.ProcessPoolExecutor() as executor: + num_cores = Settings.get('use_N_cores_for_MP') + logger.debug(f'Distributing window_variation_check computations over {num_cores} cores.') + with concurrent.futures.ProcessPoolExecutor(max_workers=num_cores) as executor: stationgenerator = executor.map( _qc_window_var_generatorfunc, func_feed_list ) @@ -1973,15 +1988,21 @@ def buddy_check( obstype: str = "temp", spatial_buddy_radius: Union[int, float] = 10000, min_sample_size: int 
= 4, + max_sample_size: Union[int, None] = None, max_alt_diff: Union[int, float, None] = None, - min_std: Union[int, float] = 1.0, + min_sample_spread: Union[int, float] = 1.0, + min_buddy_distance: Union[int, float] = 0.0, spatial_z_threshold: Union[int, float] = 3.1, N_iter: int = 2, instantaneous_tolerance: Union[str, pd.Timedelta] = pd.Timedelta("4min"), lapserate: Union[float, None] = None, # -0.0065 for temperature (in °C) whiteset: WhiteSet = WhiteSet(), + use_z_robust_method: bool = True, use_mp: bool = True, + min_std = None, ): + #TODO: update docstring + """Spatial buddy check. The buddy check compares an observation against its neighbors @@ -1998,16 +2019,18 @@ def buddy_check( #. A distance matrix is constructed for all interdistances between the stations. This is done using the haversine approximation. #. Groups of spatial buddies (neighbours) are created by using the - `spatial_buddy_radius.` These groups are further filtered by: + `spatial_buddy_radius` and `min_buddy_distance`. Only stations within + the distance range [min_buddy_distance, spatial_buddy_radius] are + considered as buddies. These groups are further filtered by: - * removing stations from the groups that differ to much in altitude + * removing stations from the groups that differ too much in altitude (based on the `max_alt_diff`) * removing groups of buddies that are too small (based on the `min_sample_size`) #. Observations per group are synchronized in time (using the - `instantaneous_tolerance` for allignment). - #. If a `lapsrate` is specified, the observations are corrected for + `instantaneous_tolerance` for alignment). + #. If a `lapserate` is specified, the observations are corrected for altitude differences. #. The following steps are repeated for `N-iter` iterations: @@ -2036,10 +2059,23 @@ def buddy_check( The radius to define spatial neighbors in meters. Default is 10000. min_sample_size : int, optional The minimum sample size to calculate statistics on. Default is 4. + max_sample_size : int or None, optional + The maximum number of spatial buddies to use per station. If not + None, the spatial buddies for each station are sorted by distance + and only the nearest ``max_sample_size`` are kept. Must be larger + than ``min_sample_size`` when specified. The default is None + (no limit). max_alt_diff : int | float | None, optional The maximum altitude difference allowed for buddies. Default is None. - min_std : int | float, optional - The minimum standard deviation for sample statistics. Default is 1.0. + min_sample_spread : int | float, optional + The minimum sample spread for sample statistics. When use_z_robust_method is True, + this is equal to the minimum MAD to use (avoids division by near-zero). When + use_z_robust_method is False, this is the standard deviation. This parameter helps + to represent the accuracy of the observations. Default is 1.0. + min_buddy_distance : int | float, optional + The minimum distance (in meters) required between a station and its buddies. + Stations closer than this distance will be excluded from the buddy group. + Default is 0.0 (no minimum distance). spatial_z_threshold : int | float, optional The threshold (std units) for flagging observations as outliers. Default is 3.1. N_iter : int, optional @@ -2068,89 +2104,35 @@ def buddy_check( * The altitude of the stations can be extracted from GEE by using the `Dataset.get_altitude()` method. 
* White-listed records from the WhiteSet participate in all buddy check calculations but are not flagged as outliers in the final results. - """ - - instantaneous_tolerance = fmt_timedelta_arg(instantaneous_tolerance) - if (lapserate is not None) | (max_alt_diff is not None): - if not all([sta.site.flag_has_altitude() for sta in self.stations]): - raise MetObsMetadataNotFound( - "Not all stations have altitude data, lapserate correction and max_alt_diff filtering could not be applied." - ) + See Also + -------- + buddy_check_with_safetynets : Buddy check with configurable safety nets. - qc_kwargs = dict( + """ + if min_std is not None: + raise DeprecationWarning( + "The min_std parameter is deprecated and replaced by the min_sample_spread parameter. The min_sample_spread serves as the minimum STD, if use_z_robust_method is False. Else it acts as the minimum MAD (median absolute deviation from median)." + ) + + # Delegate to buddy_check_with_safetynets with no safety nets + self.buddy_check_with_safetynets( obstype=obstype, spatial_buddy_radius=spatial_buddy_radius, - spatial_min_sample_size=min_sample_size, + safety_net_configs=None, + min_sample_size=min_sample_size, + max_sample_size=max_sample_size, max_alt_diff=max_alt_diff, - min_std=min_std, + min_sample_spread=min_sample_spread, + min_buddy_distance=min_buddy_distance, spatial_z_threshold=spatial_z_threshold, N_iter=N_iter, instantaneous_tolerance=instantaneous_tolerance, lapserate=lapserate, whiteset=whiteset, - safety_net_configs=None, # No safety nets for basic buddy_check - # technical + use_z_robust_method=use_z_robust_method, use_mp=use_mp, ) - # Locate stations with the obstype - target_stations, skip_stations = filter_to_stations_with_target_obstype( - stations=self.stations, obstype=obstype - ) - metadf = self.metadf.loc[[sta.name for sta in target_stations]] - - outlierslist, timestamp_map = toolkit_buddy_check( - target_stations=target_stations, metadf=metadf, **qc_kwargs - ) - # outlierslist is a list of tuples (stationname, datetime, msg) that are outliers - # timestamp_map is a dict with keys the stationname and values a series to map the syncronized - # timestamps to the original timestamps - - # convert to a dataframe - alloutliersdf = pd.DataFrame( - data=outlierslist, columns=["name", "datetime", "detail_msg"] - ) - - # Handle duplicates - # Note: duplicates can occur when a specific record was part of more than one - # outlier group, and is flagged by more than one group. 
If so, keep the - # first row, but concat the detail_msg's (since they describe the outlier group) - - if not alloutliersdf.empty: - # Group by name and datetime, concatenate detail_msg for duplicates - alloutliersdf = ( - alloutliersdf.groupby(["name", "datetime"], as_index=False) - .agg({"detail_msg": lambda x: " | ".join(x)}) - .reset_index(drop=True) - ) - - # update all the sensordata - for station in target_stations: - # Get the sensordata object - sensorddata = station.get_sensor(obstype) - - # get outlier datetimeindex - outldt = pd.DatetimeIndex( - alloutliersdf[alloutliersdf["name"] == station.name]["datetime"] - ) - - if not outldt.empty: - # convert to original timestamps - dtmap = timestamp_map[station.name] - outldt = outldt.map(dtmap) - - # update the sensordata - sensorddata._update_outliers( - qccheckname="buddy_check", - outliertimestamps=outldt, - check_kwargs=qc_kwargs, - extra_columns={ - "detail_msg": alloutliersdf[alloutliersdf["name"] == station.name][ - "detail_msg" - ].to_numpy() - }, - ) - @log_entry def buddy_check_with_LCZ_safety_net(*args): raise DeprecationWarning( @@ -2164,15 +2146,20 @@ def buddy_check_with_safetynets( spatial_buddy_radius: Union[int, float] = 10000, safety_net_configs: List[Dict] = None, min_sample_size: int = 4, + max_sample_size: Union[int, None] = None, max_alt_diff: Union[int, float, None] = None, - min_std: Union[int, float] = 1.0, + min_sample_spread: Union[int, float] = 1.0, + min_buddy_distance: Union[int, float] = 0.0, spatial_z_threshold: Union[int, float] = 3.1, N_iter: int = 2, instantaneous_tolerance: Union[str, pd.Timedelta] = pd.Timedelta("4min"), lapserate: Union[float, None] = None, # -0.0065 for temperature (in °C) whiteset: WhiteSet = WhiteSet(), + use_z_robust_method: bool = True, use_mp: bool = True, + min_std = None, ): + #TODO: update docstring """Spatial buddy check with configurable safety nets. The buddy check compares an observation against its neighbors @@ -2199,16 +2186,18 @@ def buddy_check_with_safetynets( #. A distance matrix is constructed for all interdistances between the stations. This is done using the haversine approximation. #. Groups of spatial buddies (neighbours) are created by using the - `spatial_buddy_radius.` These groups are further filtered by: + `spatial_buddy_radius` and `min_buddy_distance`. Only stations within + the distance range [min_buddy_distance, spatial_buddy_radius] are + considered as buddies. These groups are further filtered by: - * removing stations from the groups that differ to much in altitude + * removing stations from the groups that differ too much in altitude (based on the `max_alt_diff`) * removing groups of buddies that are too small (based on the `min_sample_size`) #. Observations per group are synchronized in time (using the - `instantaneous_tolerance` for allignment). - #. If a `lapsrate` is specified, the observations are corrected for + `instantaneous_tolerance` for alignment). + #. If a `lapserate` is specified, the observations are corrected for altitude differences. #. The following steps are repeated for `N-iter` iterations: @@ -2228,8 +2217,15 @@ def buddy_check_with_safetynets( #. For each safety net in `safety_net_configs` (in order): + * If `only_if_previous_had_no_buddies` is True for this + safety net, only outlier records where the previous safety + net had insufficient buddies are passed to this safety net. + All other records retain their status from the previous + safety net. 
* Category buddies (stations sharing the same category value - within the specified radius) are identified. + within the specified distance range) are identified. Like spatial + buddies, category buddies are filtered by distance range + [min_buddy_distance, buddy_radius]. * The category-buddy sample is tested in size (sample size must be at least `min_sample_size`). If the condition is not met, the safety net test is not applied. @@ -2269,6 +2265,22 @@ def buddy_check_with_safetynets( * 'z_threshold': int or float, z-value threshold for saving outliers * 'min_sample_size': int, minimum number of buddies required for the safety net test + * 'min_buddy_distance': int or float (optional), minimum distance + (in meters) required between a station and its category buddies. + Stations closer than this distance will be excluded from the + buddy group. Defaults to 0 (no minimum distance). + * 'max_sample_size': int or None (optional), maximum number of + category buddies to use per station. If not None, category + buddies are sorted by distance and only the nearest + ``max_sample_size`` are kept. Must be larger than + ``min_sample_size`` when specified. Defaults to None (no limit). + * 'only_if_previous_had_no_buddies': bool (optional), if True + this safety net is only applied to outlier records for which + the **previous** safety net could not be executed due to + insufficient buddies. Records that were successfully tested + by the previous safety net (passed or failed) are not + re-tested. This enables a cascading fallback strategy. + Cannot be True for the first safety net. Defaults to False. Example:: @@ -2283,7 +2295,8 @@ def buddy_check_with_safetynets( "category": "network", "buddy_radius": 50000, "z_threshold": 2.5, - "min_sample_size": 3 + "min_sample_size": 3, + "only_if_previous_had_no_buddies": True } ] @@ -2291,11 +2304,25 @@ def buddy_check_with_safetynets( min_sample_size : int, optional The minimum sample size to calculate statistics on. Used for spatial-buddy samples. Default is 4. + max_sample_size : int or None, optional + The maximum number of spatial buddies to use per station. If not + None, the spatial buddies for each station are sorted by distance + and only the nearest ``max_sample_size`` are kept. Must be larger + than ``min_sample_size`` when specified. The default is None + (no limit). max_alt_diff : int or float or None, optional The maximum altitude difference allowed for buddies. Default is None. - min_std : int or float, optional - The minimum standard deviation for sample statistics. This is used - in spatial and safety net samples. Default is 1.0. + min_sample_spread : int or float, optional + The minimum sample spread for sample statistics. When use_z_robust_method is True, + this is equal to the minimum MAD to use (avoids division by near-zero). When + use_z_robust_method is False, this is the standard deviation. This parameter helps + to represent the accuracy of the observations. This is used in spatial and + safety net samples. Default is 1.0. + min_buddy_distance : int or float, optional + The minimum distance (in meters) required between a station and its spatial buddies. + Stations closer than this distance will be excluded from the buddy group. This also + affects safety net buddy selection unless overridden in the safety_net_configs. + Default is 0.0 (no minimum distance). spatial_z_threshold : int or float, optional The threshold, tested with z-scores, for flagging observations as outliers. Default is 3.1. 
@@ -2359,11 +2386,28 @@ def buddy_check_with_safetynets( ... ] ... ) + Apply cascading safety nets where the second safety net only tests + records that had insufficient buddies in the first: + + >>> dataset.buddy_check_with_safetynets( + ... obstype="temp", + ... safety_net_configs=[ + ... {"category": "LCZ", "buddy_radius": 40000, "z_threshold": 2.1, "min_sample_size": 4}, + ... {"category": "network", "buddy_radius": 50000, "z_threshold": 2.5, "min_sample_size": 3, "only_if_previous_had_no_buddies": True} + ... ] + ... ) + """ + if min_std is not None: + raise DeprecationWarning( + "The min_std parameter is deprecated and replaced by the min_sample_spread parameter. The min_sample_spread serves as the minimum STD, if use_z_robust_method is False. Else it acts as the minimum MAD (median absolute deviation from median)." + ) instantaneous_tolerance = fmt_timedelta_arg(instantaneous_tolerance) - + # Validate that the required metadata columns exist if safety_net_configs: + if not isinstance(safety_net_configs, list): + raise TypeError("safety_net_configs must be a list of dicts.") required_categories = set(cfg["category"] for cfg in safety_net_configs) for category in required_categories: if category == "LCZ": @@ -2390,15 +2434,19 @@ def buddy_check_with_safetynets( obstype=obstype, spatial_buddy_radius=spatial_buddy_radius, spatial_min_sample_size=min_sample_size, + spatial_max_sample_size=max_sample_size, max_alt_diff=max_alt_diff, - min_std=min_std, + min_sample_spread=min_sample_spread, spatial_z_threshold=spatial_z_threshold, N_iter=N_iter, instantaneous_tolerance=instantaneous_tolerance, + min_buddy_distance = min_buddy_distance, lapserate=lapserate, whiteset=whiteset, # Generalized safety net configuration safety_net_configs=safety_net_configs, + #statistical + use_z_robust_method=use_z_robust_method, # technical use_mp=use_mp, ) @@ -2409,82 +2457,110 @@ def buddy_check_with_safetynets( ) metadf = self.metadf.loc[[sta.name for sta in target_stations]] - outlierslist, timestamp_map = toolkit_buddy_check( + qcresuldict, detailsensors = toolkit_buddy_check( target_stations=target_stations, metadf=metadf, **qc_kwargs ) - # outlierslist is a list of tuples (stationname, datetime, msg) that are outliers - # timestamp_map is a dict with keys the stationname and values a series to map the syncronized - # timestamps to the original timestamps + for staname, qcres in qcresuldict.items(): + sensordata = self.get_station(staname).get_sensor(obstype) + sensordata._update_outliers(qcresult=qcres, overwrite=False) + return detailsensors + + @copy_doc(dataset_qc_overview_df) + @log_entry + def qc_overview_df(self, + subset_stations:Union[list[str], None] = None, + subset_obstypes:Union[list[str], None] = None) -> pd.DataFrame: + return dataset_qc_overview_df(self, + subset_stations=subset_stations, + subset_obstypes=subset_obstypes) - # convert to a dataframe - alloutliersdf = pd.DataFrame( - data=outlierslist, columns=["name", "datetime", "detail_msg"] - ) + @log_entry + def get_qc_stats( + self, obstype: str = "temp", make_plot: bool = True + ) -> Union[dict[str, pd.Series], Figure]: + """ + Summarize QC label frequencies across all stations for a given observation type. - # Handle duplicates - # Note: duplicates can occur when a specific record was part of more than one - # outlier group, and is flagged by more than one group. 
If so, keep the - # first row, but concat the detail_msg's (since they describe the outlier group) - - if not alloutliersdf.empty: - # Group by name and datetime, concatenate detail_msg for duplicates - alloutliersdf = ( - alloutliersdf.groupby(["name", "datetime"], as_index=False) - .agg({"detail_msg": lambda x: " | ".join(x)}) - .reset_index(drop=True) - ) + This aggregates three series over every station that has the requested ``obstype``: - # update all the sensordata - for station in target_stations: - # Get the sensordata object - sensorddata = station.get_sensor(obstype) + * final label counts from each ``SensorData.df['label'].value_counts()``; + * outlier-only label counts from ``SensorData.outliersdf['label'].value_counts()``; + * per-check outcome counts from ``SensorData.get_qc_freq_statistics()`` + (MultiIndex ``['checkname', 'flag']``). - # get outlier datetimeindex - outldt = pd.DatetimeIndex( - alloutliersdf[alloutliersdf["name"] == station.name]["datetime"] - ) + When ``make_plot`` is True, the aggregated counts are visualized with + ``plotting.qc_overview_pies``. When False, the aggregated series are returned for + programmatic use. - if not outldt.empty: - # convert to original timestamps - dtmap = timestamp_map[station.name] - outldt = outldt.map(dtmap) - - # update the sensordata - sensorddata._update_outliers( - qccheckname="buddy_check_with_safetynets", - outliertimestamps=outldt, - check_kwargs=qc_kwargs, - extra_columns={ - "detail_msg": alloutliersdf[alloutliersdf["name"] == station.name][ - "detail_msg" - ].to_numpy() - }, - ) + Parameters + ---------- + obstype : str, optional + Observation type to evaluate, by default "temp". + make_plot : bool, optional + If True, return a figure with pie charts; if False, return the aggregated counts. + Default is True. - @copy_doc(Station.get_qc_stats) - @log_entry - def get_qc_stats( - self, obstype: str = "temp", make_plot: bool = True - ) -> Union[pd.DataFrame, None]: - freqdf_list = [ - sta.get_qc_stats(obstype=obstype, make_plot=False) for sta in self.stations + Returns + ------- + matplotlib.figure.Figure or dict[str, pandas.Series] + Figure with QC overview pies when ``make_plot`` is True; otherwise a dictionary with + keys ``all_labels``, ``outlier_labels``, and ``per_check_labels``. Returns None when + no stations provide the requested ``obstype``. 
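+
+        Examples
+        --------
+        A minimal usage sketch, assuming ``dataset`` is a ``metobs_toolkit.Dataset``
+        with imported observations and at least one applied QC check:
+
+        >>> counts = dataset.get_qc_stats(obstype="temp", make_plot=False)
+        >>> counts["all_labels"]        # final label counts over all records
+        >>> counts["outlier_labels"]    # counts restricted to outlier records
+        >>> counts["per_check_labels"]  # MultiIndex (checkname, flag) counts per check
+        >>> fig = dataset.get_qc_stats(obstype="temp", make_plot=True)  # matplotlib Figure
+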
+ """ + + # collect stations that actually have the target obstype + target_stations = [ + sta for sta in self.stations if obstype in sta.obsdata ] - dfagg = ( - pd.concat(freqdf_list) - .reset_index() - .groupby(["qc_check"]) - .sum() - .drop(columns=["name"]) - ) + if not target_stations: + logger.warning("No stations with obstype '%s' found for QC stats.", obstype) + return None + + df, outliersdf = self.df, self.outliersdf + if df.empty: + logger.warning("Dataset is empty, cannot compute QC stats.") + return None + if obstype not in df.index.get_level_values("obstype"): + logger.warning("No data for obstype '%s' in dataset, cannot compute QC stats.", obstype) + return None + + all_label_counts = df.xs(obstype, level="obstype")["label"].value_counts() + if obstype in outliersdf.index.get_level_values("obstype"): + outlier_label_counts = outliersdf.xs(obstype, level="obstype")["label"].value_counts() + else: + outlier_label_counts = pd.Series(index=pd.Index([], dtype=int, name="label"), + dtype=int) + + + + per_check_counts = [] + for sta in target_stations: + sensor_counts = sta.get_sensor(obstype).get_qc_freq_statistics() + #add the name of the station as a index level + sensor_counts.index = pd.MultiIndex.from_frame( + sensor_counts.index.to_frame().assign(name=sta.name)) + + per_check_counts.append(sensor_counts) + + per_check_counts = pd.concat(per_check_counts).groupby(level=["checkname", "flag"]).sum() if make_plot: - fig = plotting.qc_overview_pies(df=dfagg) - fig.suptitle(f"QC frequency statistics of {obstype} on Dataset level.") + fig = plotting.qc_overview_pies( + end_labels_from_df=all_label_counts, + end_labels_from_outliers=outlier_label_counts, + per_check_labels=per_check_counts, + fig_title = f"QC frequency statistics of {obstype} on Dataset level." 
+ ) + return fig - else: - return dfagg + + return { + "all_labels": all_label_counts, + "outlier_labels": outlier_label_counts, + "per_check_labels": per_check_counts, + } # ------------------------------------------ # Other methods diff --git a/src/metobs_toolkit/gf_collection/overview_df_constructors.py b/src/metobs_toolkit/gf_collection/overview_df_constructors.py index 94a32e14..dcb00295 100644 --- a/src/metobs_toolkit/gf_collection/overview_df_constructors.py +++ b/src/metobs_toolkit/gf_collection/overview_df_constructors.py @@ -2,9 +2,13 @@ (sensordata, station, dataset) for overviews and summaries of Gaps.""" import pandas as pd +from typing import Union from metobs_toolkit.backend_collection.dev_collection import copy_doc from metobs_toolkit.backend_collection.df_helpers import save_concat +#=============================== +# Gap overiview +#=============================== def sensordata_gap_status_overview_df(sensordata) -> pd.DataFrame: """ @@ -80,7 +84,6 @@ def sensordata_gap_status_overview_df(sensordata) -> pd.DataFrame: index=pd.Index([], name="gapstart"), ) - @copy_doc(sensordata_gap_status_overview_df) def station_gap_status_overview_df(station) -> pd.DataFrame: concatlist = [] @@ -127,3 +130,4 @@ def dataset_gap_status_overview_df(dataset) -> pd.DataFrame: ), ) return combdf + diff --git a/src/metobs_toolkit/plot_collection/qc_info_pies.py b/src/metobs_toolkit/plot_collection/qc_info_pies.py index 447a29ec..0b986a02 100644 --- a/src/metobs_toolkit/plot_collection/qc_info_pies.py +++ b/src/metobs_toolkit/plot_collection/qc_info_pies.py @@ -13,28 +13,40 @@ logger = logging.getLogger("") +def autopct_format(pct): + return f'{pct:.1f}%' if pct > 0 else '' + @log_entry def qc_overview_pies( - df: pd.DataFrame, + end_labels_from_df: pd.Series, + end_labels_from_outliers: pd.Series, + per_check_labels: pd.Series, + fig_title: str = "" + ) -> plt.Figure: """ - Generate a quality control (QC) overview using pie charts. + Generate a QC overview figure with pie charts for label frequencies and per-check outcomes. Parameters ---------- - df : pandas.DataFrame - DataFrame containing QC data. Must include columns 'N_labeled', 'N_all', and 'N_checked'. + end_labels_from_df : pandas.Series + Counts of final labels for all records (index as labels), typically from + ``SensorData.df['label'].value_counts()``. + end_labels_from_outliers : pandas.Series + Counts limited to outlier records (index as labels), e.g. from + ``SensorData.outliersdf['label'].value_counts()``. If empty, a single slice + "No QC outliers" is drawn. + per_check_labels : pandas.Series + MultiIndex Series with index levels ``['checkname', 'flag']`` containing counts per + QC check outcome (flags such as ``flagged_cond``, ``pass_cond``, ``unmet_cond``, etc.), + as returned by ``SensorData.get_qc_freq_statistics()``. Returns ------- matplotlib.figure.Figure - The generated figure containing the QC overview pie charts. - - Raises - ------ - TypeError - If any of the arguments are not of the expected type. + Figure containing two large pies (all labels and outlier labels) and one small pie per + QC check showing the distribution of its outcomes. 
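+
+    Examples
+    --------
+    A minimal sketch, assuming ``sensordata`` is a ``SensorData`` instance on which
+    QC checks have already been applied; the inputs are built from the sources
+    described above:
+
+    >>> all_labels = sensordata.df["label"].value_counts()
+    >>> outlier_labels = sensordata.outliersdf["label"].value_counts()
+    >>> per_check = sensordata.get_qc_freq_statistics()
+    >>> fig = qc_overview_pies(
+    ...     end_labels_from_df=all_labels,
+    ...     end_labels_from_outliers=outlier_labels,
+    ...     per_check_labels=per_check,
+    ...     fig_title="QC overview",
+    ... )
+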
""" # Define layout @@ -48,97 +60,198 @@ def qc_overview_pies( ax_thr = fig.add_subplot(spec[0, 2:]) # top half right # Frequency with all - plotdf = df - colors = [Settings._get_color_from_label(label) for label in plotdf.index] - plotdf.plot( + colors = [Settings._get_color_from_label(label) for label in end_labels_from_df.index] + end_labels_from_df.plot( ax=ax_thl, kind="pie", - y="N_labeled", - autopct="%1.1f%%", + y="", + autopct=autopct_format, legend=False, colors=colors, radius=Settings.get("plotting_settings.pie_charts.radius_big"), - fontsize=Settings.get("plotting_settings.pie_charts.txt_size_big_pies"), + fontsize=Settings.get("plotting_settings.pie_charts.txt_label_size_big_pies"), ) - ax_thl.set_title("Label frequencies") + ax_thl.set_title("Label frequencies", **Settings.get("plotting_settings.pie_charts.big_pie_title_kwargs")) ax_thl.set_ylabel("") - # Outliers comparison - plotdf = df[ - ~df.index.isin( - [ - Settings.get("label_def.goodrecord.label"), - Settings.get("label_def.regular_gap.label"), - ] - ) - ] - - colors = [Settings._get_color_from_label(label) for label in plotdf.index] + # Only outliers + + colors = [Settings._get_color_from_label(label) for label in end_labels_from_outliers.index] - if plotdf.empty: + if end_labels_from_outliers.empty: # No outliers --> full pie with "No QC outliers" in the color of 'ok' - plotdf = pd.DataFrame( - data={"N_labeled": [100]}, index=pd.Index(data=["No QC outliers"]) - ) - colors = [Settings.get("label_def.goodrecord.color")] + end_labels_from_outliers = pd.Series([100], index=["No QC outliers"]) + colors = [Settings._get_color_from_label('ok')] - plotdf.plot( + end_labels_from_outliers.plot( ax=ax_thr, kind="pie", - y="N_labeled", - autopct="%1.1f%%", + y="", + autopct=autopct_format, legend=False, colors=colors, radius=Settings.get("plotting_settings.pie_charts.radius_big"), - fontsize=Settings.get("plotting_settings.pie_charts.txt_size_big_pies"), + fontsize=Settings.get("plotting_settings.pie_charts.txt_label_size_big_pies"), ) - ax_thr.set_title("Outlier specific frequencies") + ax_thr.set_title("Outlier specific frequencies", **Settings.get("plotting_settings.pie_charts.big_pie_title_kwargs")) ax_thr.set_ylabel("") # Performance per check - plotdf = df[ - ~df.index.isin( - [ - Settings.get("label_def.goodrecord.label"), - Settings.get("label_def.regular_gap.label"), - ] - ) - ] - - # Label to QC check name map - label_too_qcname_map = Settings._label_to_qccheckmap() + per_qc_colmap = {val['label']: val['plotkwargs']['color'] for val in Settings.get('qc_status_labels_per_check').values()} + i = 0 - for idx, row in plotdf.iterrows(): - # Target a specific axes + for checkname in per_check_labels.index.get_level_values('checkname').unique(): subax = fig.add_subplot(spec[math.floor(i / ncol) + 1, i % ncol]) - - # Construct a plot Series - plotseries = pd.Series( - { - Settings.get("label_def.uncheckedrecord.label"): row["N_all"] - - row["N_checked"], - Settings.get("label_def.goodrecord.label"): row["N_checked"] - - row["N_labeled"], - Settings.get("label_def.outlier.label"): row["N_labeled"], - } - ) - # Define colors - colors = [Settings._get_color_from_label(label) for label in plotseries.index] - plotseries.plot( + + checkname_subset = per_check_labels.loc[checkname] + colors = [per_qc_colmap.get(label, 'gray') for label in checkname_subset.index] + + checkname_subset.plot( ax=subax, kind="pie", - autopct="%1.1f%%", + autopct=autopct_format, legend=False, colors=colors, 
radius=Settings.get("plotting_settings.pie_charts.radius_small"), - fontsize=Settings.get("plotting_settings.pie_charts.txt_size_small_pies"), + fontsize=Settings.get("plotting_settings.pie_charts.txt_label_size_small_pies"), ) - subax.set_title(f"Effectiveness of {label_too_qcname_map[idx]}") + subax.set_title(f"{checkname}", **Settings.get("plotting_settings.pie_charts.small_pie_title_kwargs")) subax.set_ylabel("") i += 1 - - logger.debug("Exiting qc_overview_pies function.") + + + fig.suptitle(fig_title, **Settings.get("plotting_settings.pie_charts.fig_title_kwargs")) return fig + + +# @log_entry +# def qc_overview_pies( +# df: pd.DataFrame, +# ) -> plt.Figure: +# """ +# Generate a quality control (QC) overview using pie charts. + +# Parameters +# ---------- +# df : pandas.DataFrame +# DataFrame containing QC data. Must include columns 'N_labeled', 'N_all', and 'N_checked'. + +# Returns +# ------- +# matplotlib.figure.Figure +# The generated figure containing the QC overview pie charts. + +# Raises +# ------ +# TypeError +# If any of the arguments are not of the expected type. +# """ + +# # Define layout +# ax = create_axes(**Settings.get("plotting_settings.pie_charts.figkwargs")) +# ax.set_axis_off() +# fig = ax.get_figure() + +# ncol = Settings.get("plotting_settings.pie_charts.ncols") +# spec = fig.add_gridspec(4, ncol) +# ax_thl = fig.add_subplot(spec[0, :2]) # top half left +# ax_thr = fig.add_subplot(spec[0, 2:]) # top half right + +# # Frequency with all +# plotdf = df +# colors = [Settings._get_color_from_label(label) for label in plotdf.index] +# plotdf.plot( +# ax=ax_thl, +# kind="pie", +# y="N_labeled", +# autopct="%1.1f%%", +# legend=False, +# colors=colors, +# radius=Settings.get("plotting_settings.pie_charts.radius_big"), +# fontsize=Settings.get("plotting_settings.pie_charts.txt_size_big_pies"), +# ) +# ax_thl.set_title("Label frequencies") +# ax_thl.set_ylabel("") + +# # Outliers comparison +# plotdf = df[ +# ~df.index.isin( +# [ +# Settings.get("label_def.goodrecord.label"), +# Settings.get("label_def.regular_gap.label"), +# ] +# ) +# ] + +# colors = [Settings._get_color_from_label(label) for label in plotdf.index] + +# if plotdf.empty: +# # No outliers --> full pie with "No QC outliers" in the color of 'ok' +# plotdf = pd.DataFrame( +# data={"N_labeled": [100]}, index=pd.Index(data=["No QC outliers"]) +# ) +# colors = [Settings.get("label_def.goodrecord.color")] + +# plotdf.plot( +# ax=ax_thr, +# kind="pie", +# y="N_labeled", +# autopct="%1.1f%%", +# legend=False, +# colors=colors, +# radius=Settings.get("plotting_settings.pie_charts.radius_big"), +# fontsize=Settings.get("plotting_settings.pie_charts.txt_size_big_pies"), +# ) +# ax_thr.set_title("Outlier specific frequencies") +# ax_thr.set_ylabel("") + +# # Performance per check +# plotdf = df[ +# ~df.index.isin( +# [ +# Settings.get("label_def.goodrecord.label"), +# Settings.get("label_def.regular_gap.label"), +# ] +# ) +# ] + +# # Label to QC check name map +# label_too_qcname_map = Settings._label_to_qccheckmap() + +# i = 0 +# for idx, row in plotdf.iterrows(): +# # Target a specific axes +# subax = fig.add_subplot(spec[math.floor(i / ncol) + 1, i % ncol]) + +# # Construct a plot Series +# plotseries = pd.Series( +# { +# Settings.get("label_def.uncheckedrecord.label"): row["N_all"] +# - row["N_checked"], +# Settings.get("label_def.goodrecord.label"): row["N_checked"] +# - row["N_labeled"], +# Settings.get("label_def.outlier.label"): row["N_labeled"], +# } +# ) +# # Define colors +# colors = 
[Settings._get_color_from_label(label) for label in plotseries.index] +# plotseries.plot( +# ax=subax, +# kind="pie", +# autopct="%1.1f%%", +# legend=False, +# colors=colors, +# radius=Settings.get("plotting_settings.pie_charts.radius_small"), +# fontsize=Settings.get("plotting_settings.pie_charts.txt_size_small_pies"), +# ) + +# subax.set_title(f"Effectiveness of {label_too_qcname_map[idx]}") +# subax.set_ylabel("") + +# i += 1 + +# logger.debug("Exiting qc_overview_pies function.") +# return fig diff --git a/src/metobs_toolkit/qc_collection/__init__.py b/src/metobs_toolkit/qc_collection/__init__.py index 5e668ee7..1752d14e 100644 --- a/src/metobs_toolkit/qc_collection/__init__.py +++ b/src/metobs_toolkit/qc_collection/__init__.py @@ -1,8 +1,10 @@ # flake8: noqa: F401 +from .duplicated_timestamp import duplicated_timestamp_check +from .invalid_check import drop_invalid_values from .grossvalue_check import gross_value_check from .persistence_check import persistence_check from .repetitions_check import repetitions_check from .step_check import step_check from .window_variation_check import window_variation_check -from .buddy_check import toolkit_buddy_check +from .spatial_checks.buddy_check import toolkit_buddy_check diff --git a/src/metobs_toolkit/qc_collection/buddy_check.py b/src/metobs_toolkit/qc_collection/buddy_check.py deleted file mode 100644 index 4fc3e40d..00000000 --- a/src/metobs_toolkit/qc_collection/buddy_check.py +++ /dev/null @@ -1,1132 +0,0 @@ -from __future__ import annotations - -import os -import logging -import concurrent.futures -from typing import Union, List, Dict, Tuple, TYPE_CHECKING - -import numpy as np -import pandas as pd - -from metobs_toolkit.backend_collection.datetime_collection import to_timedelta -from metobs_toolkit.backend_collection.decorators import log_entry -from metobs_toolkit.qc_collection.distancematrix_func import generate_distance_matrix -from .whitelist import WhiteSet - -if TYPE_CHECKING: - from metobs_toolkit.station import Station - -logger = logging.getLogger("") - - -@log_entry -def synchronize_series( - series_list: List[pd.Series], max_shift: pd.Timedelta -) -> Tuple[pd.DataFrame, Dict]: - """ - Synchronize a list of pandas Series with datetime indexes. - - The target timestamps are defined by: - - - * freq: the highest frequency present in the input series - * origin: the earliest timestamp found, rounded down by the freq - * closing: the latest timestamp found, rounded up by the freq. - - Parameters - ---------- - series_list : list of pandas.Series - List of pandas Series with datetime indexes. - max_shift : pandas.Timedelta - Maximum shift in time that can be applied to each timestamp - in synchronization. - - Returns - ------- - pandas.DataFrame - DataFrame with synchronized Series. - dict - Dictionary mapping each synchronized timestamp to its - original timestamp. 
- """ - - # find highest frequency - frequencies = [to_timedelta(s.index.inferred_freq) for s in series_list] - trg_freq = min(frequencies) - - # find origin and closing timestamp (earliest/latest) - origin = min([s.index.min() for s in series_list]).floor(trg_freq) - closing = max([s.index.max() for s in series_list]).ceil(trg_freq) - - # Create target datetime axes - target_dt = pd.date_range(start=origin, end=closing, freq=trg_freq) - - # Synchronize (merge with tolerance) series to the common index - synchronized_series = [] - timestamp_mapping = {} - for s in series_list: - targetdf = ( - s.to_frame() - .assign(orig_datetime=s.index) - .reindex( - index=pd.DatetimeIndex(target_dt), - method="nearest", - tolerance=max_shift, - limit=1, - ) - ) - - # extract the mapping (new -> original) - orig_timestampseries = targetdf["orig_datetime"] - orig_timestampseries.name = "original_timestamp" - timestamp_mapping[s.name] = orig_timestampseries - - synchronized_series.append(s) - - return pd.concat(synchronized_series, axis=1), timestamp_mapping - - -def _validate_safety_net_configs(safety_net_configs: List[Dict]) -> None: - """ - Validate that all required keys are present in safety_net_configs. - - Parameters - ---------- - safety_net_configs : list of dict - List of safety net configuration dictionaries. - - Raises - ------ - ValueError - If safety_net_configs is not a list or contains non-dict elements. - KeyError - If any required key is missing from a safety net configuration. - """ - if safety_net_configs is None: - return None - - required_keys = {"category", "buddy_radius", "z_threshold", "min_sample_size"} - - if not isinstance(safety_net_configs, list): - raise ValueError( - f"safety_net_configs must be a list, got {type(safety_net_configs).__name__}" - ) - - for i, config in enumerate(safety_net_configs): - if not isinstance(config, dict): - raise ValueError( - f"Each safety net config must be a dict, but config at index {i} " - f"is {type(config).__name__}" - ) - - missing_keys = required_keys - set(config.keys()) - if missing_keys: - raise KeyError( - f"Safety net config at index {i} is missing required key(s): " - f"{', '.join(sorted(missing_keys))}. " - f"Required keys are: {', '.join(sorted(required_keys))}" - ) - - -def _find_buddies_by_distance( - distance_df: pd.DataFrame, buddy_radius: Union[int, float] -) -> Dict: - """ - Get neighbouring stations using buddy radius (distance only). - - This is the core distance-based buddy finding function used internally - by other buddy-finding functions. - - Parameters - ---------- - distance_df : pandas.DataFrame - DataFrame containing distances between stations. - buddy_radius : int or float - Maximum distance (in meters) to consider as a buddy. - - Returns - ------- - dict - Dictionary mapping each station to a list of its buddies within the radius. - """ - - buddies = {} - for refstation, distances in distance_df.iterrows(): - bud_stations = distances[distances <= buddy_radius].index.to_list() - bud_stations.remove(refstation) - buddies[refstation] = bud_stations - - return buddies - - -def _find_category_buddies( - metadf: pd.DataFrame, - category_column: str, - max_dist: Union[int, float], - distance_df: pd.DataFrame, -) -> Dict: - """ - Get neighbouring stations using a categorical column and spatial distance. - - This function identifies buddy stations that share the same categorical - value (e.g., LCZ, network, region) and are within a specified distance. 
- - Parameters - ---------- - metadf : pandas.DataFrame - DataFrame containing metadata for stations. Must include the specified - category column. - category_column : str - The name of the categorical column to group stations by (e.g., 'LCZ', - 'network', 'region'). - max_dist : int or float - Maximum distance (in meters) to consider as a category buddy. - distance_df : pandas.DataFrame - DataFrame containing distances between stations. - - Returns - ------- - dict - Dictionary mapping each station to a list of its category buddies that - are also within the specified distance. - - Notes - ----- - - Category buddies are stations with the same category value as the reference - station. - - The final buddies are the intersection of category buddies and spatial - buddies within `max_dist`. - - Stations with NaN values in the category column are handled: they will not - match with any other station (including other NaN stations). - """ - category_buddies = {} - # Find buddies by category - for refstation in metadf.index: - ref_category = metadf.loc[refstation, category_column] - # Handle NaN values - they should not match with anything - if pd.isna(ref_category): - logger.warning( - "Station %s has NaN value for category '%s' - no category buddies assigned", - refstation, - category_column, - ) - category_buddies[refstation] = [] - else: - ref_buddies = metadf.loc[ - metadf[category_column] == ref_category - ].index.to_list() - category_buddies[refstation] = ref_buddies - - # Find buddies by distance - spatial_buddies = _find_buddies_by_distance(distance_df, max_dist) - - # Intersection of both buddy definitions - final_buddies = {} - for refstation in category_buddies.keys(): - # Sort the result to ensure deterministic ordering - final_buddies[refstation] = sorted( - set(category_buddies[refstation]).intersection( - set(spatial_buddies[refstation]) - ) - ) - - return final_buddies - - -def _find_spatial_buddies( - distance_df: pd.DataFrame, - metadf: pd.DataFrame, - buddy_radius: Union[int, float], -) -> Dict: - """ - Get neighbouring stations using buddy radius (spatial distance only). - - This function is a wrapper around `_find_category_buddies` that finds - buddies based purely on spatial distance, without any categorical - constraints. It works by creating a dummy category column where all - stations have the same value. - - Parameters - ---------- - distance_df : pandas.DataFrame - DataFrame containing distances between stations. - metadf : pandas.DataFrame - DataFrame containing metadata for stations. The index should be - station names. - buddy_radius : int or float - Maximum distance (in meters) to consider as a buddy. - - Returns - ------- - dict - Dictionary mapping each station to a list of its buddies within - the specified radius. - - See Also - -------- - _find_category_buddies : Find buddies by category and distance. - _find_buddies_by_distance : Core distance-based buddy finding function. - """ - - # Create a temporary metadf with a dummy category column where all - # stations have the same value, so _find_category_buddies will not - # filter by category - temp_metadf = metadf.copy() - temp_metadf["_all_same_category"] = 1 - - return _find_category_buddies( - metadf=temp_metadf, - category_column="_all_same_category", - max_dist=buddy_radius, - distance_df=distance_df, - ) - - -def _filter_to_altitude_buddies( - buddies: Dict, altitudes: pd.Series, max_altitude_diff: Union[int, float] -) -> Dict: - """ - Filter neighbours by maximum altitude difference. 
- - Parameters - ---------- - buddies : dict - Dictionary mapping each station to a list of its spatial buddies. - altitudes : pandas.Series - Series containing altitudes for each station. - max_altitude_diff : int or float - Maximum allowed altitude difference. - - Returns - ------- - dict - Dictionary mapping each station to a list of altitude-filtered buddies. - """ - - alt_buddies_dict = {} - for refstation, buddylist in buddies.items(): - alt_diff = abs((altitudes.loc[buddylist]) - altitudes.loc[refstation]) - - alt_buddies = alt_diff[alt_diff <= max_altitude_diff].index.to_list() - alt_buddies_dict[refstation] = alt_buddies - return alt_buddies_dict - - -def _filter_to_minimum_samplesize(buddydict: Dict, min_sample_size: int) -> Dict: - """ - Filter stations that are too isolated using minimum sample size. - - Parameters - ---------- - buddydict : dict - Dictionary mapping each station to a list of its buddies. - min_sample_size : int - Minimum number of buddies required. - - Returns - ------- - dict - Dictionary mapping each station to a list of buddies meeting the - minimum sample size. - """ - - to_check_stations = {} - for refstation, buddies in buddydict.items(): - if len(buddies) < min_sample_size: - # not enough buddies - to_check_stations[refstation] = [] # remove buddies - else: - to_check_stations[refstation] = buddies - return to_check_stations - - -@log_entry -def create_groups_of_buddies(buddydict: Dict) -> List[Tuple]: - """ - Create unique groups of buddies from a buddy dictionary. - - Parameters - ---------- - buddydict : dict - Dictionary mapping each station to a list of its buddies. - - Returns - ------- - list of tuple - List of tuples, each containing a group of station names. - Groups are sorted to ensure deterministic processing order. - """ - - grouped_stations = [] - groups = [] - # Iterate in sorted order for deterministic results - for refstation in sorted(buddydict.keys()): - buddies = buddydict[refstation] - if not bool(buddies): - continue - if refstation in grouped_stations: - continue - # Sort the group members for deterministic column ordering - group = tuple(sorted([refstation, *buddies])) - - grouped_stations.extend([*group]) - groups.append(group) - - return groups - - -@log_entry -def toolkit_buddy_check( - target_stations: list[Station], - metadf: pd.DataFrame, - obstype: str, - spatial_buddy_radius: Union[int, float], - spatial_min_sample_size: int, - max_alt_diff: Union[int, float, None], - min_std: Union[int, float], - spatial_z_threshold: Union[int, float], - N_iter: int, - instantaneous_tolerance: pd.Timedelta, - # Whitelist arguments - whiteset: WhiteSet, - # Safety nets - safety_net_configs: List[Dict] = None, - # Technical - lapserate: Union[float, None] = None, # -0.0065 for temperature - use_mp: bool = True, -) -> Tuple[list, dict]: - """ - Spatial buddy check. - - The buddy check compares an observation against its neighbors - (i.e. spatial buddies). The check loops over all the groups, which are stations - within a radius of each other. For each group, the z-value of the reference - observation is computed given the sample of spatial buddies. If one (or more) - exceeds the `spatial_z_threshold`, the most extreme (=baddest) observation of - that group is labeled as an outlier. - - Multiple iterations of this checks can be done using the N_iter. - - Optionally, one or more safety nets can be applied. A safety net tests - potential outliers against a sample of stations that share a categorical - attribute (e.g., LCZ, network). 
If the z-value computed using the safety - net sample is below the specified threshold, the outlier is "saved" and - removed from the outlier set for the current iteration. - - Safety nets are applied in the order they are specified, allowing for - multi-level filtering (e.g., first test against LCZ buddies, then against - network buddies). - - A schematic step-by-step description of the buddy check: - - #. A distance matrix is constructed for all interdistances between - the stations. This is done using the haversine approximation. - #. Groups of spatial buddies (neighbours) are created by using the - `spatial_buddy_radius.` These groups are further filtered by: - - * removing stations from the groups that differ to much in altitude - (based on the `max_alt_diff`) - * removing groups of buddies that are too small (based on the - `min_sample_size`) - - #. Observations per group are synchronized in time (using the - `instantaneous_tolerance` for allignment). - #. If a `lapsrate` is specified, the observations are corrected for - altitude differences. - #. The following steps are repeated for `N-iter` iterations: - - #. The values of outliers flagged by a previous iteration are converted to - NaN's. Therefore they are not used in any following score or sample. - #. For each buddy group: - - * The mean, standard deviation (std), and sample size are computed. - * If the std is lower than the `minimum_std`, it is replaced by the - minimum std. - * Chi values are calculated for all records. - * For each timestamp the record with the highest Chi is tested if - it is larger then spatial_z_threshold. - If so, that record is flagged as an outlier. It will be ignored - in the next iteration. - - #. If `safety_net_configs` is provided, the following steps are applied - on the outliers flagged by the current iteration, for each safety net - in order: - - * Category buddies (stations sharing the same category value within - the specified radius) are identified. - * The safety net sample is tested in size (sample size must be at - least `min_sample_size`). If the condition is not met, the safety - net test is not applied. - * The safety net test is applied: - - * The mean and std are computed of the category-buddy sample. If - the std is smaller than `min_std`, the latter is used. - * The z-value is computed for the target record (= flagged outlier). - * If the z-value is smaller than the safety net's `z_threshold`, - the tested outlier is "saved" and removed from the set of outliers - for the current iteration. - - #. If `whiteset` contains records, any outliers that match the white-listed - timestamps (and optionally station names) are removed from the outlier set - for the current iteration. White-listed records participate in all buddy - check calculations but are not flagged as outliers in the final results. - - Parameters - ---------- - target_stations : list[Station] - A list of Station objects to apply the buddy check on. These should be - stations that contain the target observation type. - metadf : pandas.DataFrame - DataFrame containing station metadata including coordinates (geometry) - and altitude information for all stations. - obstype : str - The observation type that has to be checked. - spatial_buddy_radius : int or float - The radius to define spatial neighbors in meters. - spatial_min_sample_size : int - The minimum sample size to calculate statistics on used by - spatial-buddy samples. - max_alt_diff : int, float, or None - The maximum altitude difference allowed for buddies. 
If None, - no altitude filter is applied. - min_std : int or float - The minimum standard deviation for sample statistics. This should - represent the accuracy of the observations. - spatial_z_threshold : int or float - The threshold, tested with z-scores, for flagging observations as outliers. - N_iter : int - The number of iterations to perform the buddy check. - instantaneous_tolerance : pandas.Timedelta - The maximum time difference allowed for synchronizing observations. - whiteset : WhiteSet - A WhiteSet instance containing records that should be excluded from - outlier detection. Records in the WhiteSet undergo the buddy check - iterations as regular records but are removed from the outlier set - at the end of each iteration. - safety_net_configs : list of dict, optional - List of safety net configurations to apply in order. Each dict must - contain: - - * 'category': str, the metadata column name to group by (e.g., 'LCZ', - 'network') - * 'buddy_radius': int or float, maximum distance for category buddies - (in meters) - * 'z_threshold': int or float, z-value threshold for saving outliers - * 'min_sample_size': int, minimum number of buddies required for the - safety net test - - The default is None. - lapserate : float or None, optional - Describes how the obstype changes with altitude (in meters). If - None, no altitude correction is applied. For temperature, a - common value is -0.0065. - use_mp : bool, optional - Use multiprocessing to speed up the buddy check. The default is True. - - Returns - ------- - list - A list of tuples containing the outlier station, timestamp, - and detail message. Each tuple is in the form (station_name, - timestamp, message). - dict - A dictionary mapping each synchronized timestamp to its original - timestamp. - - Notes - ----- - - * The altitude of the stations can be extracted from GEE by using the - `Dataset.get_altitude()` method. - * The LCZ of the stations can be extracted from GEE by using the - `Dataset.get_LCZ()` method. 
- - """ - - # Validate safety net configs if provided - _validate_safety_net_configs(safety_net_configs) - - # ----- Part 1: construct buddy groups ------ - # compute distance metric - logger.debug("Calculating distance matrix with Haversine formula") - dist_matrix = generate_distance_matrix(metadf) - - # find potential buddies by distance - logger.debug( - "Finding spatial buddies within radius of %s meters", spatial_buddy_radius - ) - spatial_buddies = _find_spatial_buddies( - distance_df=dist_matrix, metadf=metadf, buddy_radius=spatial_buddy_radius - ) - - # filter buddies by altitude difference - if max_alt_diff is not None: - logger.debug( - "Filtering buddies by maximum altitude difference of %s meters", - max_alt_diff, - ) - if metadf["altitude"].isna().any(): - raise ValueError( - "At least one station has a NaN \ -value for 'altitude'" - ) - # Filter by altitude difference - spatial_buddies = _filter_to_altitude_buddies( - buddies=spatial_buddies, - altitudes=metadf["altitude"], - max_altitude_diff=max_alt_diff, - ) - - # Filter by sample size (based on the number of buddy stations) - logger.debug( - "Filtering buddies by minimum sample size of %s", spatial_min_sample_size - ) - spatial_buddies = _filter_to_minimum_samplesize( - buddydict=spatial_buddies, min_sample_size=spatial_min_sample_size - ) - - # create unique groups of buddies (list of tuples) - logger.debug("Creating groups of buddies") - buddygroups = create_groups_of_buddies(spatial_buddies) - logger.debug("Number of buddy groups created: %s", len(buddygroups)) - - # ---- Part 2: Preparing the records ----- - - # construct a wide observation dataframe - logger.debug("Constructing wide observation DataFrame for obstype: %s", obstype) - concatlist = [] - for sta in target_stations: - if obstype in sta.sensordata.keys(): - records = sta.get_sensor(obstype).series - records.name = sta.name - concatlist.append(records) - - # synchronize the timestamps - logger.debug("Synchronizing timestamps") - combdf, timestamp_map = synchronize_series( - series_list=concatlist, max_shift=instantaneous_tolerance - ) - - # lapse rate correction - if lapserate is not None: - logger.debug("Applying lapse rate correction with rate: %s", lapserate) - # get altitude dataframe - altdict = {sta.name: sta.site.altitude for sta in target_stations} - altseries = pd.Series(altdict) - altcorrectionseries = (altseries - altseries.min()) * lapserate - combdf = combdf - altcorrectionseries # Correct for altitude - - # ---- Part 3 : Apply buddy check on each group, - # rejecting the most extreme outlier - - outliersbin = [] - for i in range(N_iter): - logger.debug("Starting iteration %s of %s", i + 1, N_iter) - # convert values to NaN, if they are labeled as outlier in - # previous iteration - if bool(outliersbin): - logger.debug("Converting previous-iteration outliers to NaN") - for outlier_station, outlier_time, _msg in outliersbin: - if outlier_station in combdf.columns: - combdf.loc[outlier_time, outlier_station] = np.nan - - if use_mp: - # Use multiprocessing generator (parallelization) - num_cpus = os.cpu_count() - # since this check is an instantaneous check --> - # perfect for splitting the dataset in chunks in time - chunks = np.array_split(combdf, num_cpus) - - # create inputargs for each buddygroup, and for each chunk in time - inputargs = [ - ( - buddygroup, - chunk, - spatial_min_sample_size, - min_std, - spatial_z_threshold, - ) - for buddygroup in buddygroups - for chunk in chunks - ] - - with concurrent.futures.ProcessPoolExecutor() as 
executor: - outliers = executor.map(find_buddy_group_outlier, inputargs) - outliers = list(outliers) - - else: - # create inputargs for each buddygroup, and for each chunk in time - inputargs = [ - ( - buddygroup, - combdf, - spatial_min_sample_size, - min_std, - spatial_z_threshold, - ) - for buddygroup in buddygroups - ] - - logger.debug("Finding outliers in each buddy group") - outliers = list(map(find_buddy_group_outlier, inputargs)) - - # unpack double nested list - outliers = [item for sublist in outliers for item in sublist] - - # Apply safety nets (if configured) - if safety_net_configs: - logger.debug( - "Applying %s safety net(s) to %s outliers", - len(safety_net_configs), - len(outliers), - ) - outliers = apply_safetynets( - outliers=outliers, - safety_net_configs=safety_net_configs, - wideobsds=combdf, - metadf=metadf, - distance_df=dist_matrix, - max_alt_diff=max_alt_diff, - min_std=min_std, - ) - # NOTE: Records saved by any safety net will be tested again in - # the following iteration. A different result can occur if the - # spatial/safety net sample changes in the next iteration. - - # Save white-listed records - outliers = save_whitelist_records( - outliers=outliers, - whiteset=whiteset, - obstype=obstype, - ) - # NOTE: The white-listed records are removed from the outliers at the end - # of each iteration, similar to the safety nets. They participate in - # the buddy check calculations but are not flagged as outliers. - - # Save white-listed records - outliers = save_whitelist_records( - outliers=outliers, - whiteset=whiteset, - obstype=obstype, - ) - # NOTE: The white-listed records are removed from the outliers at the end - # of each iteration, similar to the LCZ safety net. They participate in - # the buddy check calculations but are not flagged as outliers. - - # Add reference to the iteration in the msg of the outliers - outliers = [ - (station, timestamp, f"{msg} (iteration {i+1}/{N_iter})") - for station, timestamp, msg in outliers - ] - - outliersbin.extend(outliers) - i += 1 - - return outliersbin, timestamp_map - - -@log_entry -def apply_safety_net( - outliers: list, - category_buddies: dict, - wideobsds: pd.DataFrame, - safety_z_threshold: Union[int, float], - min_sample_size: int, - min_std: Union[int, float], - category_name: str, -) -> list: - """ - Apply a category-based safety net to outliers detected by the spatial buddy check. - - This function works with any categorical grouping (e.g., LCZ, network, region). - - For each outlier, this function checks whether the value can be "saved" by - comparison with its category buddies (stations with the same category value - and within a certain distance). If the outlier's value is within the specified - z-threshold when compared to its category buddies, it is not considered an - outlier for this iteration. - - Parameters - ---------- - outliers : list of tuple - List of detected outliers, each as a tuple (station_name, timestamp, message). - category_buddies : dict - Dictionary mapping each station to a list of its category buddies. - wideobsds : pandas.DataFrame - Wide-format DataFrame with stations as columns and timestamps as index. - safety_z_threshold : int or float - Z-value threshold for saving an outlier using the safety net. - min_sample_size : int - Minimum number of category buddies required to apply the safety net. - min_std : int or float - Minimum standard deviation to use for z-value calculation. - category_name : str - Name of the category being used (for logging and messages). 
- - Returns - ------- - list of tuple - List of outliers that were not saved by the safety net, each as a tuple - (station_name, timestamp, message). Outliers that are "saved" are not - included in the returned list. - - Notes - ----- - - The safety net is only applied if there are enough category buddies and - non-NaN values. - - Outliers from previous iterations are already set to NaN in `wideobsds` - and are not considered. - - The function appends a message to the outlier if the safety net is not - applied or not passed. - """ - checked_outliers = [] - for outl in outliers: - outlstation, outltimestamp, outl_msg = outl - - outl_value = wideobsds.loc[outltimestamp, outlstation] - outl_category_buddies = category_buddies.get(outlstation, []) - - # Check if sample size is sufficient - if len(outl_category_buddies) < min_sample_size: - msg = f"Too few {category_name} buddies to apply safety net ({len(outl_category_buddies)} < {min_sample_size})." - logger.debug( - "Skip %s safety net for %s: too few buddies (%s < %s).", - category_name, - outlstation, - len(outl_category_buddies), - min_sample_size, - ) - checked_outliers.append((outlstation, outltimestamp, outl_msg + msg)) - continue - - # Get safety net samples - # NOTE: The sample is constructed using wideobsds, thus outliers - # from the current iteration are not taken into account! - # Outliers from previous iterations are taken into account since - # wideobsdf is altered (NaNs placed at outlier records) at the beginning - # of each iteration. - safetynet_samples = wideobsds.loc[outltimestamp][outl_category_buddies] - - # Compute scores - sample_mean = safetynet_samples.mean() - sample_std = safetynet_samples.std() - sample_non_nan_count = safetynet_samples.notna().sum() - - # Instantaneous sample size check - if sample_non_nan_count < min_sample_size: - msg = f"Too few non-NaN {category_name} buddies ({sample_non_nan_count} < {min_sample_size})." - logger.debug( - "Skip %s safety net for %s: too few non-NaN buddies (%s < %s).", - category_name, - outlstation, - sample_non_nan_count, - min_sample_size, - ) - checked_outliers.append((outlstation, outltimestamp, outl_msg + msg)) - continue - - # Apply min std - if sample_std < min_std: - sample_std = min_std - - # Check if saved - z_value = abs((outl_value - sample_mean) / sample_std) - if z_value <= safety_z_threshold: - # Is saved - logger.debug( - "%s at %s is saved by %s safety net with z=%.2f.", - outlstation, - outltimestamp, - category_name, - z_value, - ) - # Do not append the current outl to checked (it's saved) - else: - # Not saved by the safety net - msg = f"{category_name} safety net applied but not saved (z={z_value:.2f} > {safety_z_threshold})." - checked_outliers.append((outlstation, outltimestamp, outl_msg + msg)) - continue - - n_saved = len(outliers) - len(checked_outliers) - logger.debug( - "A total of %s records are saved by the %s safety net.", n_saved, category_name - ) - return checked_outliers - - -@log_entry -def apply_safetynets( - outliers: list, - safety_net_configs: List[Dict], - wideobsds: pd.DataFrame, - metadf: pd.DataFrame, - distance_df: pd.DataFrame, - max_alt_diff: Union[int, float, None], - min_std: Union[int, float], -) -> list: - """ - Apply multiple safety nets in sequence to outliers. - - Each safety net is defined by a category column, buddy radius, z-threshold, - and minimum sample size. Outliers are tested against each safety net in order, - and if saved by any of them, they are removed from the outlier list. 
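As a rough standalone illustration of the safety-net "save" test described here (toy numbers and hypothetical thresholds; not a call into the toolkit itself):

import pandas as pd

# Hypothetical sample: category-buddy values at the outlier's timestamp.
safetynet_samples = pd.Series([12.1, 11.8, 12.4, 12.0])
outl_value = 13.0
safety_z_threshold, min_std = 2.0, 0.5

# The sample spread is floored by min_std before computing the z-value.
sample_std = max(safetynet_samples.std(), min_std)
z_value = abs((outl_value - safetynet_samples.mean()) / sample_std)
is_saved = z_value <= safety_z_threshold  # saved records are dropped from the outlier list
print(f"z={z_value:.2f}, saved={is_saved}")  # z=1.85, saved=True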
- - Parameters - ---------- - outliers : list of tuple - List of detected outliers, each as a tuple (station_name, timestamp, message). - safety_net_configs : list of dict - List of safety net configurations. Each dict must contain: - - 'category': str, the metadata column name to group by - - 'buddy_radius': int or float, maximum distance for category buddies - - 'z_threshold': int or float, z-value threshold for saving outliers - - 'min_sample_size': int, minimum number of buddies required - wideobsds : pandas.DataFrame - Wide-format DataFrame with stations as columns and timestamps as index. - metadf : pandas.DataFrame - DataFrame containing station metadata. - distance_df : pandas.DataFrame - DataFrame containing distances between stations. - max_alt_diff : int, float, or None - Maximum altitude difference allowed for buddies. If None, no altitude - filter is applied. - min_std : int or float - Minimum standard deviation for sample statistics. - - Returns - ------- - list of tuple - List of outliers that were not saved by any safety net. - """ - if not safety_net_configs: - return outliers - - current_outliers = outliers - - for config in safety_net_configs: - category = config["category"] - buddy_radius = config["buddy_radius"] - z_threshold = config["z_threshold"] - min_sample_size = config["min_sample_size"] - - logger.debug( - "Applying %s safety net (radius=%s, z_threshold=%s, min_sample=%s)", - category, - buddy_radius, - z_threshold, - min_sample_size, - ) - - # Find category buddies - category_buddies = _find_category_buddies( - metadf=metadf, - category_column=category, - max_dist=buddy_radius, - distance_df=distance_df, - ) - - # Filter by altitude difference if specified - if max_alt_diff is not None: - category_buddies = _filter_to_altitude_buddies( - buddies=category_buddies, - altitudes=metadf["altitude"], - max_altitude_diff=max_alt_diff, - ) - - # Apply the safety net - current_outliers = apply_safety_net( - outliers=current_outliers, - category_buddies=category_buddies, - wideobsds=wideobsds, - safety_z_threshold=z_threshold, - min_sample_size=min_sample_size, - min_std=min_std, - category_name=category, - ) - - return current_outliers - - -@log_entry -def save_whitelist_records( - outliers: list, - whiteset: WhiteSet, - obstype: str, -) -> list: - """Remove whitelisted records from the outlier list. - - This function filters out any outliers that are present in the WhiteSet. - Whitelisted records are known valid observations that should not be flagged - as outliers, even if they are detected by the buddy check. - - Parameters - ---------- - outliers : list of tuple - List of detected outliers, each as a tuple (station_name, timestamp, message). - whiteset : WhiteSet - A WhiteSet instance containing records that should be excluded from outlier - detection. The WhiteSet is converted to station-specific and obstype-specific - SensorWhiteSet instances for each station in the outliers list. - obstype : str - The observation type being checked. Used to filter the whiteset for the - target obstype. - - Returns - ------- - list of tuple - List of outliers excluding those that are whitelisted. Each tuple contains - (station_name, timestamp, message). - - Notes - ----- - * Whitelisted records undergo the buddy check iterations as if they are regular - records. - * Only at the end of each iteration are they filtered out from the outliers list. - * This allows whitelisted records to still influence the statistics of their - buddy groups. 
- * The function processes each station separately by creating a SensorWhiteSet - for each station-obstype combination. - """ - - outldf = pd.DataFrame(outliers, columns=["name", "datetime", "message"]) - - for outlsta in outldf["name"].unique(): - # Create a sensorwhiteset for each station - sensorwhiteset = whiteset.create_sensorwhitelist( - stationname=outlsta, obstype=obstype - ) - # get the white-listed datetimes for the station - outliers_dts = sensorwhiteset.catch_white_records( - outliers_idx=pd.DatetimeIndex( - data=outldf[outldf["name"] == outlsta]["datetime"], name="datetime" - ) - ) - - # subset to the saved outliers - outldf = outldf.drop( - outldf[ - (outldf["name"] == outlsta) & (~outldf["datetime"].isin(outliers_dts)) - ].index - ) - - # convert back to a list of tuples (name, datetime, message) - outliers = list(outldf.itertuples(index=False, name=None)) - return outliers - - -@log_entry -def find_buddy_group_outlier(inputarg: Tuple) -> List[Tuple]: - """ - Apply a buddy check on a group to identify outliers. - - Parameters - ---------- - inputarg : tuple - A tuple containing: - - * buddygroup : list - List of station names that form the buddy group. - * combdf : pandas.DataFrame - DataFrame containing the combined data for all stations. - * min_sample_size : int - Minimum number of non-NaN values required in the buddy group for a - valid comparison. - * min_std : float - Minimum standard deviation to use when calculating z-scores. - * outlier_threshold : float - Threshold for identifying outliers in terms of z-scores. - - Returns - ------- - list of tuple - Each tuple contains: - - * str : The station name of the most extreme outlier. - * pandas.Timestamp : The timestamp of the outlier. - * str : A detailed message describing the outlier. - - Notes - ----- - This function performs the following steps: - - 1. Subsets the data to the buddy group. - 2. Calculates the mean, standard deviation, and count of non-NaN values - for each timestamp. - 3. Filters out timestamps with insufficient data. - 4. Replaces standard deviations below the minimum threshold with the - minimum value. - 5. Converts station values to z-scores. - 6. Identifies timestamps with at least one outlier. - 7. Locates the most extreme outlier for each timestamp. - 8. Generates a detailed message for each outlier. 
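A minimal self-contained sketch of the per-timestamp z-score logic enumerated above, using toy values; the threshold is deliberately low here because, with the candidate included in its own sample, the z-score in a group of n stations cannot exceed (n-1)/sqrt(n):

import pandas as pd

buddygroup = ["sta_a", "sta_b", "sta_c", "sta_d", "sta_e"]
combdf = pd.DataFrame(
    {
        "sta_a": [10.0, 20.0],
        "sta_b": [10.1, 10.1],
        "sta_c": [9.9, 9.9],
        "sta_d": [10.2, 10.2],
        "sta_e": [10.0, 10.0],
    },
    index=pd.date_range("2024-01-01", periods=2, freq="h"),
)
min_std, outlier_threshold = 0.5, 1.5  # toy settings, not the toolkit defaults

group_mean = combdf[buddygroup].mean(axis=1)
group_std = combdf[buddygroup].std(axis=1).clip(lower=min_std)
zscores = combdf[buddygroup].sub(group_mean, axis=0).abs().div(group_std, axis=0)

# Keep only timestamps with at least one exceedance; flag the most extreme station per timestamp.
has_outlier = (zscores > outlier_threshold).any(axis=1)
most_extreme = zscores[has_outlier].idxmax(axis=1)
print(most_extreme)  # sta_a is flagged at the second timestamp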
- """ - - buddygroup, combdf = inputarg[0], inputarg[1] - min_sample_size, min_std, outlier_threshold = inputarg[2:] - - # subset to the buddies - buddydf = combdf[[*buddygroup]] - - # calculate std and mean row wise - buddydf["mean"] = buddydf[[*buddygroup]].mean(axis=1) - buddydf["std"] = buddydf[[*buddygroup]].std(axis=1) - buddydf["non_nan_count"] = buddydf[[*buddygroup]].notna().sum(axis=1) - - # subset to samples with enough members (check for each timestamp - - # specifically) - buddydf = buddydf.loc[buddydf["non_nan_count"] >= min_sample_size] - - # replace std by minimum, if needed - buddydf.loc[buddydf["std"] < min_std, "std"] = np.float32(min_std) - - # Convert values to sigmas - for station in buddygroup: - buddydf[station] = (buddydf[station] - buddydf["mean"]).abs() / buddydf["std"] - - # Drop rows for which all values are smaller than the threshold - # (speed up the last step) - buddydf["timestamp_with_outlier"] = buddydf[[*buddygroup]].apply( - lambda row: any(row > outlier_threshold), axis=1 - ) - buddydf = buddydf.loc[buddydf["timestamp_with_outlier"]] - - # locate the most extreme outlier per timestamp - buddydf["is_the_most_extreme_outlier"] = buddydf[[*buddygroup]].idxmax(axis=1) - - @log_entry - def msgcreator(row): - """ - Create a detailed message describing an outlier. - - Parameters - ---------- - row : pandas.Series - A row from the buddy DataFrame containing outlier information, - including 'is_the_most_extreme_outlier', 'mean', and 'std' columns. - - Returns - ------- - str - Formatted message describing the outlier with its z-score and - buddy group statistics. - """ - retstr = f"Outlier at {row['is_the_most_extreme_outlier']}" - retstr += f" with chi value \ -{row[row['is_the_most_extreme_outlier']]:.2f}," - retstr += ( - f" is part of {sorted(buddygroup)}, with mean: {row['mean']:.2f}, " - f"std: {row['std']:.2f}. " - ) - return retstr - - # detail info string - buddydf["detail_msg"] = buddydf.apply( - lambda row: msgcreator(row), axis=1, result_type="reduce" - ) - - return list( - zip( - buddydf["is_the_most_extreme_outlier"], buddydf.index, buddydf["detail_msg"] - ) - ) diff --git a/src/metobs_toolkit/qc_collection/common_functions.py b/src/metobs_toolkit/qc_collection/common_functions.py index a9fed301..ea8d32f1 100644 --- a/src/metobs_toolkit/qc_collection/common_functions.py +++ b/src/metobs_toolkit/qc_collection/common_functions.py @@ -2,6 +2,13 @@ import logging from metobs_toolkit.backend_collection.decorators import log_entry +from metobs_toolkit.qcresult import ( + pass_cond, + flagged_cond, + saved_cond, + unchecked_cond, + unmet_cond +) logger = logging.getLogger("") @@ -50,3 +57,44 @@ def test_moving_window_condition( ismet = (windowsize / freq) >= min_records_per_window logger.debug("Exiting function test_moving_window_condition.") return ismet + + +def create_qcresult_flags( + all_input_records: pd.Series, + unmet_cond_idx: pd.DatetimeIndex, + outliers_before_white_idx: pd.DatetimeIndex, + outliers_after_white_idx: pd.DatetimeIndex, +) -> pd.Series: + """Create quality control flags series for all input records. + + This function generates a pandas Series containing QC flags for all timestamps + in the input records. Records are categorized as: unchecked (NaN values), + passed (valid non-outliers), unmet condition, saved (whitelisted outliers), + or flagged (detected outliers). + + Parameters + ---------- + all_input_records : pd.Series + Complete series of records with datetime index to flag. 
+ unmet_cond_idx : pd.DatetimeIndex + Timestamps where QC check conditions were not met. + outliers_before_white_idx : pd.DatetimeIndex + Timestamps of all detected outliers before whitelist filtering. + outliers_after_white_idx : pd.DatetimeIndex + Timestamps of outliers remaining after whitelist filtering. + + Returns + ------- + pd.Series + Series with same index as all_input_records containing QC flag strings: + 'unchecked', 'passed', 'condition_unmet', 'saved', or 'flagged'. + """ + flags = pd.Series(data=unchecked_cond, index=all_input_records.index) + flags.loc[all_input_records.dropna().index] = pass_cond + flags.loc[unmet_cond_idx] = unmet_cond + + saved_records_idx = outliers_before_white_idx.difference(outliers_after_white_idx) + flags.loc[saved_records_idx] = saved_cond + flags.loc[outliers_after_white_idx] = flagged_cond + + return flags \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/duplicated_timestamp.py b/src/metobs_toolkit/qc_collection/duplicated_timestamp.py new file mode 100644 index 00000000..2ab91874 --- /dev/null +++ b/src/metobs_toolkit/qc_collection/duplicated_timestamp.py @@ -0,0 +1,80 @@ +import logging + +import pandas as pd + + +from .common_functions import create_qcresult_flags +from metobs_toolkit.backend_collection.decorators import log_entry +from metobs_toolkit.qcresult import ( + QCresult, + pass_cond, + flagged_cond, + +) + +logger = logging.getLogger("") + + +@log_entry +def duplicated_timestamp_check(records: pd.Series) -> QCresult: + """Check for duplicated timestamps in a time series. + + Identifies all records that share the same timestamp. All occurrences of + duplicated timestamps are flagged as outliers (not just subsequent ones). + This check is performed before invalid value checking. + + Parameters + ---------- + records : pd.Series + Series with a datetime-like index to check for duplicates. + + Returns + ------- + QCresult + Quality control result object containing flags, outliers, and details + for the duplicated timestamp check. + + Notes + ----- + * All records with duplicated timestamps are flagged, including the first occurrence. + * Values are coerced to numeric during this check to ensure compatibility with + downstream processing. + * Details include a list of all values sharing each duplicated timestamp. + """ + # find all duplicates + duplicates = records[records.index.duplicated(keep=False)] + + #Drop dulicates from series, they are a mess to take along + no_dup_records = records[~records.index.duplicated(keep='first')] + + #create flags (no duplicates in the index!) 
+ flags = create_qcresult_flags( + all_input_records=no_dup_records, #NO duplicates here + unmet_cond_idx = pd.DatetimeIndex([]), + outliers_before_white_idx=duplicates.index.unique(), + outliers_after_white_idx=duplicates.index.unique(), + ) + + + + #Special case: this check is performed before the invalid check, so values + #must be cast to numeric to avoid issues when combining them in the outliersdf + duplicates = pd.to_numeric(duplicates, errors='coerce') + + qcresult = QCresult( + checkname="duplicated_timestamp", + checksettings={}, + flags=flags, + detail='no details') + + #Create and add details + if not duplicates.empty: + # For each duplicated timestamp, join all values as a comma-separated string + details = ( + duplicates.groupby(duplicates.index) + .apply(lambda x:"duplicated timestamp with: " + ", ".join(map(str, x.values))) + ) + qcresult.add_details_by_series(detail_series = details) + + + return qcresult \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/grossvalue_check.py b/src/metobs_toolkit/qc_collection/grossvalue_check.py index ba22d412..c6586d07 100644 --- a/src/metobs_toolkit/qc_collection/grossvalue_check.py +++ b/src/metobs_toolkit/qc_collection/grossvalue_check.py @@ -2,9 +2,10 @@ from typing import Union import pandas as pd - +from .common_functions import create_qcresult_flags from .whitelist import SensorWhiteSet from metobs_toolkit.backend_collection.decorators import log_entry +from metobs_toolkit.qcresult import QCresult logger = logging.getLogger("") @@ -15,7 +16,7 @@ def gross_value_check( lower_threshold: Union[int, float], upper_threshold: Union[int, float], sensorwhiteset: SensorWhiteSet, -) -> pd.DatetimeIndex: +) -> QCresult: """ Identify outliers in a time series based on lower and upper thresholds. @@ -34,21 +35,47 @@ def gross_value_check( Returns ------- - pd.DatetimeIndex - Timestamps of outlier records. - - + QCresult + Quality control result object containing flags, outliers, and details + for the gross value check. 
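For reference, a tiny standalone sketch of the thresholding step behind the gross value check (toy series and hypothetical thresholds, outside the toolkit's classes):

import pandas as pd

records = pd.Series(
    [5.0, 21.3, -40.2, 18.7],
    index=pd.date_range("2024-01-01", periods=4, freq="h"),
)
lower_threshold, upper_threshold = -30.0, 50.0

to_check = records.dropna()
outliers_idx = to_check[
    (to_check < lower_threshold) | (to_check > upper_threshold)
].index
print(list(outliers_idx))  # only the -40.2 record falls outside [-30.0, 50.0]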
""" # Drop NaN values - records = records.dropna() + to_check_records = records.dropna() # Identify outliers - outliers_idx = records[ - (records < lower_threshold) | (records > upper_threshold) + outliers_idx = to_check_records[ + (to_check_records < lower_threshold) | (to_check_records > upper_threshold) ].index # Exclude white records if provided - outliers_idx = sensorwhiteset.catch_white_records(outliers_idx=outliers_idx) + outliers_after_white_idx = sensorwhiteset.catch_white_records(outliers_idx=outliers_idx) + + # Create QCresult flags + flags = create_qcresult_flags( + all_input_records=records, + unmet_cond_idx = pd.DatetimeIndex([]), + outliers_before_white_idx=outliers_idx, + outliers_after_white_idx=outliers_after_white_idx, + ) - logger.debug("Exiting function gross_value_check.") - return outliers_idx + checksettings = { + "lower_threshold": lower_threshold, + "upper_threshold": upper_threshold, + "sensorwhiteset": sensorwhiteset, + } + + qcresult = QCresult( + checkname="gross_value", + checksettings=checksettings, + flags=flags, + detail='no details' + ) + + #Create and add details + if not outliers_after_white_idx.empty: + detailseries = pd.Series( + data = 'value outside gross value thresholds [' + str(lower_threshold) + ', ' + str(upper_threshold) + ']', + index = outliers_after_white_idx + ) + qcresult.add_details_by_series(detail_series = detailseries) + return qcresult diff --git a/src/metobs_toolkit/qc_collection/invalid_check.py b/src/metobs_toolkit/qc_collection/invalid_check.py new file mode 100644 index 00000000..01d086dc --- /dev/null +++ b/src/metobs_toolkit/qc_collection/invalid_check.py @@ -0,0 +1,72 @@ +import logging + +import pandas as pd + + +from .whitelist import SensorWhiteSet +from metobs_toolkit.backend_collection.decorators import log_entry +from metobs_toolkit.qcresult import ( + QCresult, + pass_cond, + flagged_cond, + +) + +logger = logging.getLogger("") + + +@log_entry +def drop_invalid_values(records: pd.Series, skip_records: pd.DatetimeIndex) -> pd.Series: + """Remove invalid (non-numeric) values from a time series. + + Filters out values that could not be cast to numeric types. Invalid timestamps + are treated as gaps and removed from the series rather than being flagged as + outliers. This allows the gap detection mechanism to handle them appropriately. + + Parameters + ---------- + records : pd.Series + Series with a datetime-like index containing values to validate. + skip_records : pd.DatetimeIndex + Records to temporarily exclude from the check (typically duplicated timestamps). + These records are preserved regardless of validity and added back after filtering. + + Returns + ------- + pd.Series + Filtered series containing only records with valid numeric values, + plus all skipped records. + + Notes + ----- + * Invalid values are interpreted as missing data (gaps) rather than outliers. + * Skipped records are preserved to avoid interfering with prior QC checks. + * This function does not raise an error if the check was previously applied. 
+ """ + skipped_data = records.loc[skip_records] + targets = records.drop(skip_records) + + # Option 1: Create a outlier label for these invalid inputs, + # and treath them as outliers + # outlier_timestamps = targets[~targets.notnull()] + + # self._update_outliers( + # qccheckname="invalid_input", + # outliertimestamps=outlier_timestamps.index, + # check_kwargs={}, + # extra_columns={}, + # overwrite=False, + # ) + + # Option 2: Since there is not numeric value present, these timestamps are + # interpreted as gaps --> remove the timestamp, so that it is captured by the + # gap finder. + + # Note: do not treat the first/last timestamps differently. That is + # a philosiphycal choice. + + validrecords = targets[targets.notnull()] # subset to numerical casted values + # add the skipped records back + validrecords = pd.concat([validrecords, skipped_data]).sort_index() + return validrecords + \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/overview_df_constructor.py b/src/metobs_toolkit/qc_collection/overview_df_constructor.py new file mode 100644 index 00000000..ebf6e458 --- /dev/null +++ b/src/metobs_toolkit/qc_collection/overview_df_constructor.py @@ -0,0 +1,110 @@ +"""Collection of DF constructing functions on various levels +(sensordata, station, dataset) for overviews and summaries of QC checks.""" + +import pandas as pd +from typing import Union +from metobs_toolkit.backend_collection.dev_collection import copy_doc +from metobs_toolkit.backend_collection.df_helpers import save_concat + + +def sensordata_qc_overview_df(sensor) -> pd.DataFrame: + #TODO: docstring + + #TODO rearange the order of qc columns to reflect the executeion order + possible_timestamps = sensor.series.index + qc_before_timecoarsening = ['duplicated_timestamp'] + + + to_concat = [] + for qcresult in sensor.outliers: + checkdf = qcresult.create_outliersdf( + map_to_basic_labels=False, #get all flags (ok, outl, notchecked, unmet, saved) + subset_to_outliers=False) #Get all flags + #add checkname to the index + checkdf["checkname"] = qcresult.checkname + if qcresult.checkname in qc_before_timecoarsening: + #Subset to coarsende timestmaps only + checkdf = checkdf.reindex(possible_timestamps) + + checkdf.set_index("checkname", append=True, inplace=True) + to_concat.append(checkdf) + + totaldf = save_concat(to_concat) + + if totaldf.empty: + return pd.DataFrame(columns=['value', 'label', 'details'], + index=pd.DatetimeIndex([], name='datetime')) + + #Unstack + totaldf = totaldf.unstack(level='checkname') + totaldf.fillna('Not applied', inplace=True) + + #add values + allvals = save_concat([sensor.series, sensor.outliers_values_bin]) #do not sort before removing the duplicates ! + allvals = allvals[~allvals.index.duplicated(keep='last')].sort_index() + totaldf['value'] = allvals.loc[totaldf.index] + + return totaldf[['value', 'label', 'details']] + + +def station_qc_overview_df(station, subset_obstypes:Union[list[str], None] = None) -> pd.DataFrame: + #TODO: docstring + + if subset_obstypes is None: + sensortargets = station.sensordata.values() + else: + sensortargets = [] + for obstype in subset_obstypes: + if obstype in station.sensordata: + sensortargets.append(station.get_sensor(obstype)) + else: + #Log a warning? 
+ pass + + to_concat = [] + for sensordata in sensortargets: + stadf = sensordata_qc_overview_df(sensordata).reset_index() + #add obstype to the index + if not stadf.empty: + stadf["obstype"] = sensordata.obstype.name + stadf = stadf.reset_index().set_index(['datetime', "obstype"]) + to_concat.append(stadf) + + + + totaldf = save_concat(to_concat) + totaldf.sort_index(inplace=True) + + if totaldf.empty: + return pd.DataFrame(columns=['value', 'label', 'details'], + index=pd.MultiIndex( + levels=[[], []], codes=[[], []], names=["datetime", "obstype"])) + + return totaldf[['value', 'label', 'details']] + +def dataset_qc_overview_df(dataset, subset_stations:Union[list[str], None] = None, + subset_obstypes:Union[list[str], None] = None) -> pd.DataFrame: + #TODO: docstring + if subset_stations is None: + stationtargets = dataset.stations + else: + stationtargets = [dataset.get_station(station_name) for station_name in subset_stations] + + to_concat = [] + for station in stationtargets: + stadf = station_qc_overview_df(station, subset_obstypes=subset_obstypes).reset_index() + #add obstype to the index + if not stadf.empty: + stadf["name"] = station.name + stadf = stadf.reset_index().set_index(['datetime', "obstype", "name"]) + to_concat.append(stadf) + + totaldf = save_concat(to_concat) + totaldf.sort_index(inplace=True) + + if totaldf.empty: + return pd.DataFrame(columns=['value', 'label', 'details'], + index=pd.MultiIndex( + levels=[[], [], []], codes=[[], [], []], names=["datetime", "obstype", "name"])) + + return totaldf[['value', 'label', 'details']] \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/persistence_check.py b/src/metobs_toolkit/qc_collection/persistence_check.py index 315c168d..b7917324 100644 --- a/src/metobs_toolkit/qc_collection/persistence_check.py +++ b/src/metobs_toolkit/qc_collection/persistence_check.py @@ -3,8 +3,14 @@ import numpy as np import pandas as pd -from .common_functions import test_moving_window_condition +from .common_functions import test_moving_window_condition, create_qcresult_flags from .whitelist import SensorWhiteSet +from metobs_toolkit.qcresult import ( + QCresult, + pass_cond, + flagged_cond, + unmet_cond, +) from metobs_toolkit.backend_collection.decorators import log_entry from metobs_toolkit.backend_collection.datetime_collection import ( timestamps_to_datetimeindex, @@ -13,13 +19,32 @@ logger = logging.getLogger("") +def _has_window_unique_values(window: pd.Series) -> bool: + """ + Check if all non-NaN values in the window are identical. + + Parameters + ---------- + window : pd.Series + A pandas Series representing the rolling window. + + Returns + ------- + bool + True if all non-NaN values are identical, False otherwise. + """ + a = window.values + a = a[~np.isnan(a)] + return (a[0] == a).all() if len(a) > 0 else False + + @log_entry def persistence_check( records: pd.Series, timewindow: pd.Timedelta, min_records_per_window: int, sensorwhiteset: SensorWhiteSet, -) -> pd.DatetimeIndex: +) -> QCresult: """ Check if values are not constant in a moving time window. @@ -35,15 +60,16 @@ def persistence_check( The size of the rolling time window to check for persistence. min_records_per_window : int The minimum number of non-NaN records required within the time window for the check to be valid. - sensorwhiteset : SensorWhiteSet, optional + sensorwhiteset : SensorWhiteSet A SensorWhiteSet instance containing timestamps that should be excluded from outlier detection. 
Records matching the whiteset criteria will not be flagged as outliers even if they meet the persistence criteria. Returns ------- - pd.DatetimeIndex - Timestamps of outlier records. + QCresult + Quality control result object containing flags, outliers, and details + for the persistence check. Notes ----- @@ -59,10 +85,16 @@ def persistence_check( If the minimum number of records per window is not met over the full time series, a warning is logged, and the function returns an empty DatetimeIndex. """ + checksettings = { + "timewindow": timewindow, + "min_records_per_window": min_records_per_window, + "sensorwhiteset": sensorwhiteset, + } + to_check_records = records.dropna() # Exclude outliers and gaps # Test if the conditions for the moving window are met by the records frequency is_met = test_moving_window_condition( - records=records, + records=records, #pass records, because freq is estimated windowsize=timewindow, min_records_per_window=min_records_per_window, ) @@ -70,48 +102,64 @@ def persistence_check( logger.warning( "The minimum number of window members for the persistence check is not met!" ) - return timestamps_to_datetimeindex( - name="datetime", timestamps=[], current_tz=None + flags = create_qcresult_flags( + all_input_records=records, + unmet_cond_idx=to_check_records.index, + outliers_before_white_idx=pd.DatetimeIndex([]), + outliers_after_white_idx=pd.DatetimeIndex([]), ) - - # Apply persistence - @log_entry - def is_unique(window: pd.Series) -> bool: - """ - Check if all non-NaN values in the window are identical. - - Parameters - ---------- - window : pd.Series - A pandas Series representing the rolling window. - - Returns - ------- - bool - True if all non-NaN values are identical, False otherwise. - """ - a = window.values - a = a[~np.isnan(a)] - return (a[0] == a).all() if len(a) > 0 else False + qcresult = QCresult( + checkname="persistence", + checksettings=checksettings, + flags=flags, + detail=f"Minimum number of records ({min_records_per_window}) per window ({timewindow}) not met.", + ) + return qcresult + # This is very expensive if no coarsening is applied! Can we speed this up? 
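A standalone sketch of the rolling-window uniqueness test used by this check, on toy hourly data (it mirrors the rolling call below, but outside the toolkit):

import pandas as pd

def all_equal(window: pd.Series) -> bool:
    vals = window.dropna().values
    return bool(len(vals)) and (vals[0] == vals).all()

records = pd.Series(
    [1.0, 1.0, 1.0, 1.0, 2.0, 3.0],
    index=pd.date_range("2024-01-01", periods=6, freq="h"),
)
window_is_constant = records.rolling(
    window=pd.Timedelta("3h"), closed="both", center=True, min_periods=3
).apply(all_equal)
print(window_is_constant)  # 1.0 marks timestamps whose centered window holds a single repeated value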
- window_is_constant = ( - records.dropna() # Exclude outliers and gaps + + window_flags = ( + to_check_records .rolling( window=timewindow, closed="both", center=True, min_periods=min_records_per_window, ) - .apply(is_unique) + .apply(_has_window_unique_values) ) - # The returns are numeric values (0 --> False, NaN --> not checked (members/window condition not met), 1 --> outlier) - window_is_constant = window_is_constant.map({0.0: False, np.nan: False, 1.0: True}) - - outliers_idx = window_is_constant[window_is_constant].index + # The returns are numeric values (0 --> oke, NaN --> not checked (members/window condition not met), 1 --> outlier) + window_flags = window_flags.map( + {0.0: pass_cond, + np.nan: unmet_cond, + 1.0: flagged_cond}) + + outliers_idx = window_flags[window_flags == flagged_cond].index # Catch the white records - outliers_idx = sensorwhiteset.catch_white_records(outliers_idx=outliers_idx) - - logger.debug("Exiting function persistence_check") - return outliers_idx + outliers_after_white_idx = sensorwhiteset.catch_white_records(outliers_idx=outliers_idx) + + #Create flags + flags = create_qcresult_flags( + all_input_records=records, + unmet_cond_idx=window_flags[window_flags == unmet_cond].index, + outliers_before_white_idx=outliers_idx, + outliers_after_white_idx=outliers_after_white_idx) + + qcresult = QCresult( + checkname="persistence", + checksettings=checksettings, + flags=flags, + detail='no details' + ) + + #Create and add details + if not outliers_after_white_idx.empty: + detailseries = pd.Series( + data = 'constant values in timewindow of ' + str(timewindow), + index = outliers_after_white_idx + ) + qcresult.add_details_by_series(detail_series = detailseries) + + return qcresult diff --git a/src/metobs_toolkit/qc_collection/repetitions_check.py b/src/metobs_toolkit/qc_collection/repetitions_check.py index f511a840..031bf70e 100644 --- a/src/metobs_toolkit/qc_collection/repetitions_check.py +++ b/src/metobs_toolkit/qc_collection/repetitions_check.py @@ -1,6 +1,8 @@ import logging import pandas as pd +from metobs_toolkit.qcresult import QCresult +from .common_functions import create_qcresult_flags from metobs_toolkit.backend_collection.decorators import log_entry from metobs_toolkit.backend_collection.datetime_collection import ( timestamps_to_datetimeindex, @@ -15,7 +17,7 @@ def repetitions_check( records: pd.Series, max_N_repetitions: int, sensorwhiteset: SensorWhiteSet, -) -> pd.DatetimeIndex: +) -> QCresult: """ Test if an observation changes after a number of repetitions. @@ -33,15 +35,16 @@ def repetitions_check( max_N_repetitions : int The maximum number of repetitions allowed before the records are flagged as outliers. If the number of repetitions exceeds this value, all repeated records are flagged as outliers. - sensorwhiteset : SensorWhiteSet, optional + sensorwhiteset : SensorWhiteSet A SensorWhiteSet instance containing timestamps that should be excluded from outlier detection. Records matching the whiteset criteria will not be flagged as outliers even if they exceed the max_N_repetitions threshold. Returns ------- - pd.DatetimeIndex - Timestamps of outlier records. + QCresult + Quality control result object containing flags, outliers, and details + for the repetitions check. 
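The grouping trick used in the function body below can be illustrated in isolation: consecutive identical values share a group id, and groups longer than the allowed repetition count are flagged (toy data, hypothetical limit):

import pandas as pd

s = pd.Series(
    [5.0, 7.0, 7.0, 7.0, 7.0, 6.0],
    index=pd.date_range("2024-01-01", periods=6, freq="h"),
)
max_N_repetitions = 3

group_id = (s.shift() != s).cumsum()       # the id increments whenever the value changes
group_sizes = s.groupby(group_id).size()
outlier_groups = group_sizes[group_sizes > max_N_repetitions]
outlier_idx = s[group_id.isin(outlier_groups.index)].index
print(outlier_idx)  # the four consecutive 7.0 records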
Notes ----- @@ -49,15 +52,19 @@ def repetitions_check( The persistence check uses thresholds that are meteorologically based (e.g., the moving window is defined by a duration), in contrast to the repetitions check whose thresholds are instrumentally based (e.g., the "window" is defined by a number of records). """ - + + checksettings = { + "max_N_repetitions": max_N_repetitions, + "sensorwhiteset": sensorwhiteset, + } # Drop outliers from the series (these are NaNs) - input_series = records.dropna() + to_check_records = records.dropna() # Create group definitions for repeating values that do not change - persistence_filter = ((input_series.shift() != input_series)).cumsum() + persistence_filter = ((to_check_records.shift() != to_check_records)).cumsum() persdf = pd.DataFrame( - data={"value": input_series, "persistgroup": persistence_filter}, - index=input_series.index, + data={"value": to_check_records, "persistgroup": persistence_filter}, + index=to_check_records.index, ) # Find outlier groups @@ -66,23 +73,49 @@ def repetitions_check( group_sizes = groups.size() outlier_groups = group_sizes[group_sizes > max_N_repetitions] - # Combine all outlier groups + if outlier_groups.empty: logger.debug("No outliers detected. Exiting repetitions_check function.") - return timestamps_to_datetimeindex( - timestamps=[], name="datetime", current_tz=None + outliers_idx = timestamps_to_datetimeindex( + timestamps=[], name="datetime", current_tz=None + ) + outliers = pd.Series(index=outliers_idx) + else: + + # Combine all outlier groups + outliers = pd.concat( + [ + groups.get_group( + outlgroup, + ) + for outlgroup in outlier_groups.index + ] ) + - outliers = pd.concat( - [ - groups.get_group( - outlgroup, - ) - for outlgroup in outlier_groups.index - ] + # Catch the white records + outliers_after_white_idx = sensorwhiteset.catch_white_records(outliers.index) + + # Create flags + flags = create_qcresult_flags( + all_input_records=records, + unmet_cond_idx = pd.DatetimeIndex([]), + outliers_before_white_idx=outliers.index, + outliers_after_white_idx=outliers_after_white_idx, ) - logger.debug("Outliers detected. 
Exiting repetitions_check function.") - # Catch the white records - outliers_idx = sensorwhiteset.catch_white_records(outliers.index) - return outliers_idx + qcresult = QCresult( + checkname="repetitions", + checksettings=checksettings, + flags=flags, + ) + + #Create and add details + if not outliers_after_white_idx.empty: + detailseries = pd.Series( + data = 'More than ' + str(max_N_repetitions) + ' repeated values', + index = outliers_after_white_idx + ) + qcresult.add_details_by_series(detail_series = detailseries) + + return qcresult \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/buddy_check.py b/src/metobs_toolkit/qc_collection/spatial_checks/buddy_check.py new file mode 100644 index 00000000..e57520e9 --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/buddy_check.py @@ -0,0 +1,557 @@ +from __future__ import annotations + +import logging +from typing import Union, List, Dict, TYPE_CHECKING + +import numpy as np +from concurrent.futures import ProcessPoolExecutor +import pandas as pd + + + +from metobs_toolkit.backend_collection.decorators import log_entry +from metobs_toolkit.qc_collection.distancematrix_func import generate_distance_matrix + +from metobs_toolkit.qcresult import QCresult, flagged_cond +from .buddywrapsensor import BuddyWrapSensor, to_qc_labels_map +from metobs_toolkit.settings_collection import Settings +from ..whitelist import WhiteSet +# Import methods +from . import methods as buddymethods + + + +if TYPE_CHECKING: + from metobs_toolkit.station import Station + +logger = logging.getLogger("") + + +def _run_buddy_test(kwargs): + """Executor for multiprocessing - runs buddy test for a single station.""" + return buddymethods.buddy_test_a_station(**kwargs) + + +def _build_station_buddy_kwargs( + station: 'BuddyWrapSensor', + widedf: pd.DataFrame, + buddygroupname: str, + min_sample_size: int, + min_sample_spread: float, + outlier_threshold: float, + iteration: int, + check_type: str, + use_z_robust_method: bool, +) -> dict: + """ + Build kwargs dictionary for buddy_test_a_station with a minimal widedf subset. + + This function creates a dictionary of keyword arguments to pass to + buddymethods.buddy_test_a_station, including a view of the widedf that + contains only the target station and its buddy stations. This enables + efficient parallelization by station rather than by time chunks. + + Parameters + ---------- + station : BuddyWrapSensor + The wrapped station (center station) to build kwargs for. + widedf : pd.DataFrame + The full wide observation DataFrame with stations as columns. + buddygroupname : str + Name of the buddy group to use for extracting buddies. + min_sample_size : int + Minimum sample size for statistics. + min_sample_spread : float + Minimum sample spread (MAD or std). + outlier_threshold : float + Z-score threshold for flagging outliers. + iteration : int + Current iteration number. + check_type : str + Type of check being performed ('spatial_check', 'safetynet_check:groupname'). + use_z_robust_method : bool + Whether to use robust z-score method. + + Returns + ------- + dict + Dictionary of kwargs to pass to buddymethods.buddy_test_a_station. + The 'widedf' key contains only the columns for the center station + and its buddies. 
+ """ + # Get the center station name and its buddies + center_name = station.name + buddies = station.get_buddies(groupname=buddygroupname) + + # Build list of required columns: center station + all its buddies + required_columns = [center_name] + buddies + + # Create a subset (view) of widedf with only required columns + subset_widedf = widedf[required_columns] + + return { + 'centerwrapstation': station, + 'buddygroupname': buddygroupname, + 'widedf': subset_widedf, + 'min_sample_size': min_sample_size, + 'min_sample_spread': min_sample_spread, + 'outlier_threshold': outlier_threshold, + 'iteration': iteration, + 'check_type': check_type, + 'use_z_robust_method': use_z_robust_method, + } + +#TODO: Trough all modules related to the buddy check, there is often the reference to wrappedbuddystation or buddywrapstation. Replace these to buddywrapsensor + +@log_entry +def toolkit_buddy_check( + target_stations: list[Station], + metadf: pd.DataFrame, + obstype: str, + spatial_buddy_radius: Union[int, float], + spatial_min_sample_size: int, + spatial_max_sample_size: Union[int, None] = None, + max_alt_diff: Union[int, float, None] = None, + min_sample_spread: Union[int, float] = 1.0, + min_buddy_distance: Union[int, float]=0, + spatial_z_threshold: Union[int, float] = 3.1, + N_iter: int = 2, + instantaneous_tolerance: pd.Timedelta = pd.Timedelta("4min"), + # Whitelist arguments + whiteset: WhiteSet = WhiteSet(), + # Safety nets + safety_net_configs: List[Dict] = None, + #Statistical + use_z_robust_method: bool = True, + # Technical + lapserate: Union[float, None] = None, # -0.0065 for temperature + use_mp: bool = True, +) -> List[QCresult]: + """ + #TODO update the docstring accordingly + Spatial buddy check. + + The buddy check compares an observation against its neighbors + (i.e. spatial buddies). The check loops over all the groups, which are stations + within a radius of each other. For each group, the z-value of the reference + observation is computed given the sample of spatial buddies. If one (or more) + exceeds the `spatial_z_threshold`, the most extreme (=baddest) observation of + that group is labeled as an outlier. + + Multiple iterations of this checks can be done using the N_iter. + + Optionally, one or more safety nets can be applied. A safety net tests + potential outliers against a sample of stations that share a categorical + attribute (e.g., LCZ, network). If the z-value computed using the safety + net sample is below the specified threshold, the outlier is "saved" and + removed from the outlier set for the current iteration. + + Safety nets are applied in the order they are specified, allowing for + multi-level filtering (e.g., first test against LCZ buddies, then against + network buddies). + + A schematic step-by-step description of the buddy check: + + #. A distance matrix is constructed for all interdistances between + the stations. This is done using the haversine approximation. + #. Groups of spatial buddies (neighbours) are created by using the + `spatial_buddy_radius` and `min_buddy_distance`. Only stations within + the distance range [min_buddy_distance, spatial_buddy_radius] are + considered as buddies. These groups are further filtered by: + + * removing stations from the groups that differ too much in altitude + (based on the `max_alt_diff`) + * removing groups of buddies that are too small (based on the + `min_sample_size`) + + #. Observations per group are synchronized in time (using the + `instantaneous_tolerance` for alignment). + #. 
If a `lapserate` is specified, the observations are corrected for + altitude differences. + #. The following steps are repeated for `N-iter` iterations: + + #. The values of outliers flagged by a previous iteration are converted to + NaN's. Therefore they are not used in any following score or sample. + #. For each buddy group: + + * The mean, standard deviation (std), and sample size are computed. + * If the std is lower than the `minimum_std`, it is replaced by the + minimum std. + * Chi values are calculated for all records. + * For each timestamp the record with the highest Chi is tested if + it is larger then spatial_z_threshold. + If so, that record is flagged as an outlier. It will be ignored + in the next iteration. + + #. If `safety_net_configs` is provided, the following steps are applied + on the outliers flagged by the current iteration, for each safety net + in order: + + * If `only_if_previous_had_no_buddies` is True for this safety net, + only outlier records where the previous safety net had insufficient + buddies (``BC_NO_BUDDIES`` flag) are passed to this safety net. + All other records retain their status from the previous safety net. + * Category buddies (stations sharing the same category value within + the specified distance range) are identified. Like spatial buddies, + category buddies are filtered by distance range [min_buddy_distance, + buddy_radius]. + * The safety net sample is tested in size (sample size must be at + least `min_sample_size`). If the condition is not met, the safety + net test is not applied. + * The safety net test is applied: + + * If use_z_robust_method is True: + + * The mean and std are computed of the category-buddy sample. If + the std is smaller than `min_sample_spread`, the latter is used. + * The z-value is computed for the target record (= flagged outlier). + * If the z-value is smaller than the safety net's `z_threshold`, + the tested outlier is "saved" and removed from the set of outliers + for the current iteration. + + #. If `whiteset` contains records, any outliers that match the white-listed + timestamps (and optionally station names) are removed from the outlier set + for the current iteration. White-listed records participate in all buddy + check calculations but are not flagged as outliers in the final results. + + Parameters + ---------- + target_stations : list[Station] + A list of Station objects to apply the buddy check on. These should be + stations that contain the target observation type. + metadf : pandas.DataFrame + DataFrame containing station metadata including coordinates (geometry) + and altitude information for all stations. + obstype : str + The observation type that has to be checked. + spatial_buddy_radius : int or float + The radius to define spatial neighbors in meters. + spatial_min_sample_size : int + The minimum sample size to calculate statistics on used by + spatial-buddy samples. + spatial_max_sample_size : int or None, optional + The maximum number of spatial buddies to use per station. If not + None, the spatial buddies for each station are sorted by distance + and only the nearest ``spatial_max_sample_size`` buddies are kept. + Must be larger than ``spatial_min_sample_size`` when specified. + The default is None (no limit). + max_alt_diff : int, float, or None + The maximum altitude difference allowed for buddies. If None, + no altitude filter is applied. + min_sample_spread : int or float + The minimum sample spread for sample statistics. 
When use_z_robust_method is True, + this is the equal to the minimum MAD to use (avoids division by near-zero). When + use_z_robust_method is False, this is the standard deviation. This parameter helps + to represent the accuracy of the observations. + min_buddy_distance : int or float, optional + The minimum distance (in meters) required between a station and its spatial buddies. + Stations closer than this distance will be excluded from the buddy group. This also + affects safety net buddy selection unless overridden in the safety_net_configs. + Default is 0.0 (no minimum distance). + spatial_z_threshold : int or float + The threshold, tested with z-scores, for flagging observations as outliers. + N_iter : int + The number of iterations to perform the buddy check. + instantaneous_tolerance : pandas.Timedelta + The maximum time difference allowed for synchronizing observations. + whiteset : WhiteSet + A WhiteSet instance containing records that should be excluded from + outlier detection. Records in the WhiteSet undergo the buddy check + iterations as regular records but are removed from the outlier set + at the end of each iteration. + safety_net_configs : list of dict, optional + List of safety net configurations to apply in order. Each dict must + contain: + + * 'category': str, the metadata column name to group by (e.g., 'LCZ', + 'network') + * 'buddy_radius': int or float, maximum distance for category buddies + (in meters) + * 'z_threshold': int or float, z-value threshold for saving outliers + * 'min_sample_size': int, minimum number of buddies required for the + safety net test + * 'min_buddy_distance': int or float (optional), minimum distance + (in meters) required between a station and its category buddies. + Stations closer than this distance will be excluded from the + buddy group. Defaults to 0 (no minimum distance). + * 'max_sample_size': int or None (optional), maximum number of category + buddies to use per station. If not None, category buddies are sorted + by distance and only the nearest ``max_sample_size`` are kept. Must + be larger than ``min_sample_size`` when specified. Defaults to None + (no limit). + * 'only_if_previous_had_no_buddies': bool (optional), if True this + safety net is only applied to outlier records for which the + **previous** safety net could not be executed due to insufficient + buddies (``BC_NO_BUDDIES`` flag). Records that were successfully + tested by the previous safety net (whether they passed or failed) + are not re-tested by this one. This enables a cascading fallback + strategy, e.g. first try LCZ buddies, then fall back to network + buddies only for records that had no LCZ buddies. Cannot be True + for the first safety net. Defaults to False. + + The default is None. + lapserate : float or None, optional + Describes how the obstype changes with altitude (in meters). If + None, no altitude correction is applied. For temperature, a + common value is -0.0065. + use_mp : bool, optional + Use multiprocessing to speed up the buddy check. The default is True. + + Returns + ------- + list + A list of tuples containing the outlier station, timestamp, + and detail message. Each tuple is in the form (station_name, + timestamp, message). + dict + A dictionary mapping each synchronized timestamp to its original + timestamp. + A dictionary mapping sensor names to BuddyCheckSensorDetails objects + containing detailed tracking information for each timestamp. 
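For orientation, a hypothetical `safety_net_configs` value using the keys documented above (the category names, radii and thresholds are made up):

safety_net_configs = [
    {
        "category": "LCZ",
        "buddy_radius": 20000,        # meters
        "z_threshold": 2.5,
        "min_sample_size": 3,
    },
    {
        "category": "network",
        "buddy_radius": 50000,        # meters
        "z_threshold": 2.5,
        "min_sample_size": 3,
        # fall back to network buddies only for records that had no LCZ buddies
        "only_if_previous_had_no_buddies": True,
    },
]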
+ + + Notes + ----- + + * The altitude of the stations can be extracted from GEE by using the + `Dataset.get_altitude()` method. + * The LCZ of the stations can be extracted from GEE by using the + `Dataset.get_LCZ()` method. + + """ + # Validate spatial_max_sample_size + if spatial_max_sample_size is not None: + if spatial_max_sample_size <= spatial_min_sample_size: + raise ValueError( + f"spatial_max_sample_size ({spatial_max_sample_size}) must be " + f"larger than spatial_min_sample_size ({spatial_min_sample_size})." + ) + + checksettings = { + "obstype": obstype, + "spatial_buddy_radius": spatial_buddy_radius, + "spatial_min_sample_size": spatial_min_sample_size, + "spatial_max_sample_size": spatial_max_sample_size, + "max_alt_diff": max_alt_diff, + "min_sample_spread": min_sample_spread, + "spatial_z_threshold": spatial_z_threshold, + "N_iter": N_iter, + "instantaneous_tolerance": instantaneous_tolerance, + "whiteset": whiteset, + "safety_net_configs": safety_net_configs, + "lapserate": lapserate, + } + + targets = [BuddyWrapSensor(sensor=sta.get_sensor(obstype), + site=sta.site) for sta in target_stations] + + # Validate safety net configs if provided + buddymethods.validate_safety_net_configs(safety_net_configs) + + # ----- Part 1: construct buddy groups ------ + # compute distance metric + logger.debug("Calculating distance matrix with Haversine formula") + dist_matrix = generate_distance_matrix(metadf) + + # find potential buddies by distance + logger.debug( + "Finding spatial buddies within radius of %s meters", spatial_buddy_radius + ) + + buddymethods.assign_spatial_buddies( + distance_df=dist_matrix, + metadf = metadf, + max_alt_diff=max_alt_diff, + buddy_radius=spatial_buddy_radius, + min_buddy_distance = min_buddy_distance, + wrappedstations=targets, + ) + + # Subset spatial buddies to nearest N if spatial_max_sample_size is set + if spatial_max_sample_size is not None: + logger.debug( + "Subsetting spatial buddies to nearest %s stations", + spatial_max_sample_size, + ) + buddymethods.subset_buddies_to_nearest( + wrappedstations=targets, + distance_df=dist_matrix, + max_sample_size=spatial_max_sample_size, + groupname='spatial', + ) + + # ---- Part 2: Preparing the records ----- + + # construct a wide observation dataframe + + widedf, timestamp_map = buddymethods.create_wide_obs_df( + wrappedsensors=targets, + instantaneous_tolerance=instantaneous_tolerance, + ) + + # lapse rate correction + widedf = buddymethods.correct_lapse_rate(widedf=widedf, + wrappedsensors=targets, + lapserate=lapserate) + + + + # ---- Part 3 : Apply buddy check per stationcenter, + + # valid_targets = [budsta for budsta in targets if budsta.has_enough_buddies( + # groupname='spatial', min_buddies = spatial_min_sample_size)] + + outliersbin = [] + for i in range(N_iter): + logger.debug("Starting iteration %s of %s", i + 1, N_iter) + # convert values to NaN, if they are labeled as outlier in + # previous iteration + if bool(outliersbin): + logger.debug("Converting previous-iteration outliers to NaN") + for outlier_station, outlier_time in outliersbin: + if outlier_station in widedf.columns: + widedf.loc[outlier_time, outlier_station] = np.nan + + if use_mp: + num_cores = Settings.get('use_N_cores_for_MP') + num_workers = min(len(targets), num_cores) + + logger.info(f"Running spatial buddy check with multiprocessing on {num_workers} cores") + logger.info( + f"Parallelizing by station: {len(targets)} stations to process" + ) + + # Build kwargs for each station with subset widedf containing only + # the 
center station and its buddies + station_kwargs = [ + _build_station_buddy_kwargs( + station=sta, + widedf=widedf, + buddygroupname='spatial', + min_sample_size=spatial_min_sample_size, + min_sample_spread=min_sample_spread, + outlier_threshold=spatial_z_threshold, + iteration=i, + check_type='spatial_check', + use_z_robust_method=use_z_robust_method, + ) + for sta in targets + ] + + # Run in parallel - each task processes one center station + with ProcessPoolExecutor(max_workers=num_workers) as executor: + buddy_output = list(executor.map(_run_buddy_test, station_kwargs)) + + else: + # create inputargs for each buddygroup, and for each chunk in time + inputargs = [ + { + 'centerwrapstation': sta, + 'buddygroupname': 'spatial', + 'widedf': widedf, + 'min_sample_size': spatial_min_sample_size, + 'min_sample_spread': min_sample_spread, + 'outlier_threshold': spatial_z_threshold, + 'iteration': i, + 'check_type': 'spatial_check', + 'use_z_robust_method': use_z_robust_method, + } + for sta in targets + ] + + + logger.debug("Finding outliers in each buddy group") + buddy_output = list(map(lambda kwargs: buddymethods.buddy_test_a_station(**kwargs), inputargs)) + + #buddy output is [(MultiIndex, BuddyCheckStation), ...], that needs to be unpacked + outlier_indices, updated_stations = zip(*buddy_output) + #overload the Buddycheckstation + targets = list(updated_stations) + + # Concatenate all outlier MultiIndices + # Each element is a MultiIndex with (name, datetime) + spatial_outliers = buddymethods.concat_multiindices(list(outlier_indices)) + + # Start with spatial outliers for further processing + current_outliers_idx = spatial_outliers + + # Apply safety nets (if configured) + if safety_net_configs: + logger.debug( + "Applying %s safety net(s) to %s outliers", + len(safety_net_configs), + len(current_outliers_idx), + ) + previous_safetynet_category = None + for safety_net_config in safety_net_configs: + logger.info(f"Applying safety net on category {safety_net_config['category']}") + + current_outliers_idx = buddymethods.apply_safety_net( + outliers=current_outliers_idx, + buddycheckstations = targets, + buddygroupname=safety_net_config['category'], + metadf = metadf, + distance_df = dist_matrix, + max_distance=safety_net_config['buddy_radius'], + min_distance=safety_net_config.get('min_buddy_distance', 0), + max_alt_diff=max_alt_diff, #make this configurable? + wideobsds=widedf, + safety_z_threshold=safety_net_config['z_threshold'], + min_sample_size=safety_net_config['min_sample_size'], + min_sample_spread=min_sample_spread, #make this configurable? + use_z_robust_method=use_z_robust_method, + iteration=i, + max_sample_size=safety_net_config.get('max_sample_size', None), + only_if_previous_had_no_buddies=safety_net_config.get('only_if_previous_had_no_buddies', False), + previous_safetynet_category=previous_safetynet_category, + ) + previous_safetynet_category = safety_net_config['category'] + + # NOTE: Records saved by any safety net will be tested again in + # the following iteration. A different result can occur if the + # spatial/safety net sample changes in the next iteration. + + # Apply whitelist filtering + current_outliers_idx = buddymethods.save_whitelist_records( + outliers=current_outliers_idx, + wrappedstations=targets, + whiteset=whiteset, + obstype=obstype, + iteration=i, + ) + + # NOTE: The white-listed records are removed from the outliers at the end + # of each iteration, similar to the safety nets. 
They participate in + # the buddy check calculations but are not flagged as outliers. + + # Convert MultiIndex to list of tuples for outliersbin + # Format: (station_name, timestamp, message) + for name, dt in current_outliers_idx: + outliersbin.append((name, dt)) + + #Prepare for output + return_results = {} + for wrapsta in targets: + logger.debug(f"Preparing QCresult for station {wrapsta.name}") + #1. Map timestamps back to original timestamps + wrapsta.map_timestamps(timestamp_map=timestamp_map[wrapsta.name]) + + #2. Create final QC labels (specific for buddy check) + final_labels = wrapsta.get_final_labels() + + #3 Convert these flags to default qc flags + qcflags = final_labels.map(to_qc_labels_map) + + #4 Create QCresult object + qcres = QCresult( + checkname='buddy_check', + checksettings=checksettings, + flags=qcflags, + detail='', + ) + + qcres.add_details_by_series(detail_series = wrapsta.get_final_details()) + return_results[wrapsta.name] = qcres + + return return_results, targets + diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/buddywrapsensor.py b/src/metobs_toolkit/qc_collection/spatial_checks/buddywrapsensor.py new file mode 100644 index 00000000..7ac049e3 --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/buddywrapsensor.py @@ -0,0 +1,725 @@ +from __future__ import annotations +import os +import logging + +from typing import Union, List, Dict, TYPE_CHECKING +from collections import defaultdict + +from metobs_toolkit.qcresult import ( + unchecked_cond, + unmet_cond, + pass_cond, + flagged_cond, + saved_cond, +) + +import pandas as pd +if TYPE_CHECKING: + from metobs_toolkit.sensordata import SensorData + from metobs_toolkit.site import Site + +logger = logging.getLogger("") + +#=============================== +# Labels +#=============================== +# Constants for buddy check status labels +BC_NOT_TESTED = "not_tested" # Value was NaN, not tested +BC_NO_BUDDIES = "no_buddies" # Not enough buddies to test +BC_PASSED = "passed" # Tested and passed +BC_FLAGGED = "flagged" # Tested and flagged as outlier +BC_SAFETYNET_SAVED = "safetynet_saved" # Flagged but saved by safetynet +BC_SAFETYNET_OUTLIER = "safetynet_outlier" # Flagged but not saved by safetynet +BC_WHITELIST_SAVED = "whitelist_saved" # Flagged but saved by whitelist +BC_WHITELIST_NOT_SAVED = "whitelist_not_saved" # Flagged but not saved by whitelist +BC_CHECK_SKIPPED = "skipped" #This check was skipped, e.g. due to arugments of the user (not whitelist, not safetynets etc) + + +to_qc_labels_map = { + BC_NOT_TESTED: unchecked_cond, # Value was NaN, not tested + BC_NO_BUDDIES: unmet_cond, # Not enough buddies to test + BC_PASSED : pass_cond, # Tested and passed + BC_FLAGGED : flagged_cond, # Tested and flagged as outlier + BC_SAFETYNET_SAVED : pass_cond, # IMPORTANT !!! + # BC_SAFETYNET_OUTLIER : flagged_cond # Flagged but not saved by safetynet + BC_WHITELIST_SAVED : saved_cond, # Flagged but saved by whitelist + # BC_WHITELIST_NOT_SAVED : flagged_cond +} + +#=============================== +# Buddy wrap sensor class +#=============================== + +class BuddyWrapSensor: + """Wrapper for a Sensor with buddy check-specific details. + + This class wraps a sensor object and adds information about how it is + handled during the buddy check process, including buddy assignment, + filtering steps, and participation in buddy groups. + + Attributes + ---------- + sensor : SensorData + The wrapped Sensor object. 
+ _buddy_groups : dict + Dictionary mapping group names to lists of buddy sensor names. + flag_lapsrate_corrections : bool + Whether lapse rate corrections have been applied. + cor_term : float + The correction term applied for lapse rate. + flags : pandas.DataFrame + DataFrame with MultiIndex (datetime, iteration) containing flag values. + Columns are added via `add_flags` method for different check types. + details : dict + Dictionary storing iteration-wise detail information. Structure: + { + 'spatial_check': { + iteration_int: Series(index=DatetimeIndex, data=detail_strings), + ... + }, + 'safetynet_check': { + groupname_str: { + iteration_int: Series(index=DatetimeIndex, data=detail_strings), + ... + }, + ... + }, + 'whitelist_check': { + iteration_int: Series(index=DatetimeIndex, data=detail_strings), + ... + }, + } + """ + # station: Station + + def __init__(self, sensor: SensorData, site:Site): + self._sensor = sensor + self._site = site + # Initialize instance-specific attributes (NOT class attributes!) + self._buddy_groups: Dict[str, List[str]] = { + 'spatial': [], + } + + # Value corrections + self.flag_lapsrate_corrections: bool = False + self.cor_term: float = 0. + + # Flags DataFrame with MultiIndex (datetime, iteration) + self._flags: pd.DataFrame = pd.DataFrame() + + # Details dictionary structure + self.details: Dict[str, Union[Dict[int, pd.Series], Dict[str, Dict[int, pd.Series]]]] = { + 'spatial_check': {}, + 'safetynet_check': {}, # Dict of groupname -> Dict of iteration -> Series + 'whitelist_check': {}, + } + + @property + def sensor(self) -> SensorData: + """Get the wrapped SensorData object.""" + return self._sensor + + @property + def site(self) -> Site: + """Get the wrapped Site object.""" + return self._site + + @property + def name(self) -> str: + """Get the station name.""" + return self.sensor.stationname + + @property + def flags(self) -> pd.DataFrame: + """Get the flags DataFrame.""" + if self._flags.empty: + return pd.DataFrame(index=pd.MultiIndex(levels=[[], []], codes=[[], []], names=['datetime', 'iteration'])) + + return self._flags + + @flags.setter + def flags(self, flags: pd.DataFrame) -> None: + + if not isinstance(flags, pd.DataFrame): + raise ValueError("flags must be a pandas DataFrame") + + if not flags.empty: + if not isinstance(flags.index, pd.MultiIndex): + raise ValueError("flags DataFrame must have a MultiIndex") + if flags.index.names != ['datetime', 'iteration']: + raise ValueError("flags DataFrame MultiIndex must have levels ['datetime', 'iteration']") + + # Preserve column order: existing columns first, new columns at the end + if not self._flags.empty and not flags.empty: + existing_cols = [col for col in self._flags.columns if col in flags.columns] + new_cols = [col for col in flags.columns if col not in self._flags.columns] + ordered_cols = existing_cols + new_cols + flags = flags[ordered_cols] + + self._flags = flags + + + + def add_flags(self, iteration: int, flag_series: pd.Series, column_name: str) -> None: + """Add flags to the flags DataFrame for a specific iteration. + + Parameters + ---------- + iteration : int + The iteration number. + flag_series : pd.Series + Series with DatetimeIndex containing flag values. + column_name : str + The name of the column to add/update (e.g., 'spatial_check', + 'safetynet_check:groupname', 'whitelist_check'). 
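+
+        Notes
+        -----
+        A minimal, illustrative sketch (not part of the toolkit's tests); it
+        assumes an existing ``BuddyWrapSensor`` instance ``bws`` whose sensor
+        already holds observations at the given timestamps::
+
+            import pandas as pd
+            flags = pd.Series(
+                BC_PASSED,
+                index=pd.DatetimeIndex(["2022-01-01 00:00", "2022-01-01 01:00"]),
+            )
+            bws.add_flags(iteration=0, flag_series=flags, column_name="spatial_check")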
+ """ + if flag_series.empty: + return + + + # Subset flag_series to only include indices present in self.sensor.series.index + #edgcase: this can be caused because of the wideobds desing, that creates empty rows + # for timestamps where no observations are present. + valid_index = flag_series.index.intersection(self.sensor.series.index) + flag_series = flag_series.loc[valid_index] + # Remove duplicates (keep first occurrence) + flag_series = flag_series[~flag_series.index.duplicated(keep='first')] + + + + # Create a DataFrame with MultiIndex for the new flags + new_flags = pd.DataFrame({ + column_name: flag_series.values + }, index=pd.MultiIndex.from_arrays( + [flag_series.index, [iteration] * len(flag_series)], + names=['datetime', 'iteration'] + )) + + if self.flags.empty: + self.flags = new_flags + else: + # Merge new flags with existing flags + # Use combine_first to keep existing values and add new ones + self.flags = self.flags.combine_first(new_flags) + + # If the column already exists, update with new values (not NaN) + if column_name in self.flags.columns: + # For indices that exist in both, update from new_flags + common_idx = self.flags.index.intersection(new_flags.index) + if not common_idx.empty: + self.flags.loc[common_idx, column_name] = new_flags.loc[common_idx, column_name] + + self.flags = self.flags.sort_index() + + def filter_buddies(self, filteredbuddies: List[str], groupname: str) -> None: + self.set_buddies(filteredbuddies, groupname=f'{groupname}_filtered') + + def set_buddies(self, buddies: List[str], groupname: str) -> None: + self._buddy_groups.update({groupname: buddies}) + + def get_buddies(self, groupname: str) -> List[str]: + if f'{groupname}_filtered' in self._buddy_groups: + return self._buddy_groups[f'{groupname}_filtered'] + if groupname in self._buddy_groups: + return self._buddy_groups[groupname] + raise ValueError(f"Unknown buddy group: {groupname}") + + def has_enough_buddies(self, groupname: str, min_buddies: int) -> bool: + """Check if the station has enough final buddies.""" + enough = len(self.get_buddies(groupname=groupname)) >= min_buddies + return enough + + + def _update_details(self, iteration: int, detail_series: pd.Series, groupname: str) -> None: + # Handle empty input - still store an empty entry to ensure the iteration key exists + if detail_series.empty: + if iteration not in self.details[groupname]: + self.details[groupname][iteration] = pd.Series(dtype=str, index=pd.DatetimeIndex([], name='datetime')) + return + + # Subset detail_series to only include indices present in self.sensor.series.index + #edgcase: this can be caused because of the wideobds desing, that creates empty rows + # for timestamps where no observations are present. 
+ valid_index = detail_series.index.intersection(self.sensor.series.index) + detail_series = detail_series.loc[valid_index] + # Remove duplicates (keep first occurrence) + detail_series = detail_series[~detail_series.index.duplicated(keep='first')] + + # Store details in the details dictionary (even if now empty after filtering) + if iteration not in self.details[groupname]: + self.details[groupname][iteration] = detail_series + else: + # Append to existing series for this iteration + existing = self.details[groupname][iteration] + if existing.empty: + self.details[groupname][iteration] = detail_series + elif not detail_series.empty: + combined = pd.concat([existing, detail_series]) + # Remove duplicates keeping first + self.details[groupname][iteration] = combined[~combined.index.duplicated(keep='first')].sort_index() + + def add_spatial_details(self, iteration: int, detail_series: pd.Series) -> None: + """Add spatial check detail information for an iteration. + + Parameters + ---------- + iteration : int + The iteration number. + detail_series : pd.Series + Series with DatetimeIndex containing detail messages. + """ + self._update_details( + iteration=iteration, + detail_series=detail_series, + groupname='spatial_check' + ) + + def add_safetynet_details(self, iteration: int, safetynetname: str, detail_series: pd.Series) -> None: + if detail_series.empty: + return + # Remove duplicates (keep first occurrence) + detail_series = detail_series[~detail_series.index.duplicated(keep='first')] + + # check if the groupname entry exists + if safetynetname not in self.details['safetynet_check']: + self.details['safetynet_check'][safetynetname] = {} + + + # Store details in the details dictionary + if iteration not in self.details['safetynet_check'][safetynetname]: + self.details['safetynet_check'][safetynetname][iteration] = detail_series + else: + # Append to existing series for this iteration + existing = self.details['safetynet_check'][safetynetname][iteration] + combined = pd.concat([existing, detail_series]) + # Remove duplicates keeping first + self.details['safetynet_check'][safetynetname][iteration] = combined[~combined.index.duplicated(keep='first')].sort_index() + + + + def update_whitelist_details(self, whitelistseries: pd.Series, + iteration: int, + is_saved: bool = True) -> None: + """Update whitelist check saved information for an iteration. + + Parameters + ---------- + whitelistseries : pd.Series + Series with DatetimeIndex containing detail messages for whitelisted records. + iteration : int + The iteration number. + is_saved : bool, optional + Whether the records were saved by the whitelist (True) or not (False). + Default is True. 
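+
+        Notes
+        -----
+        Besides storing the detail messages, this also writes the
+        ``BC_WHITELIST_SAVED`` (or ``BC_WHITELIST_NOT_SAVED``) flag into the
+        ``'whitelist_check'`` column of ``flags`` for the given iteration.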
+ """ + # Remove duplicates (keep first occurrence) + whitelistseries = whitelistseries[~whitelistseries.index.duplicated(keep='first')] + + if whitelistseries.empty: + return + + if is_saved: + flag = BC_WHITELIST_SAVED + else: + flag = BC_WHITELIST_NOT_SAVED + # Add flags to the flags DataFrame + flag_series = pd.Series(flag, index=whitelistseries.index) + self.add_flags(iteration=iteration, flag_series=flag_series, column_name='whitelist_check') + + # Store details in the details dictionary + if iteration not in self.details['whitelist_check']: + self.details['whitelist_check'][iteration] = whitelistseries + else: + # Append to existing series for this iteration + existing = self.details['whitelist_check'][iteration] + combined = pd.concat([existing, whitelistseries]) + # Remove duplicates keeping first + self.details['whitelist_check'][iteration] = combined[~combined.index.duplicated(keep='first')].sort_index() + + + + def _get_iterations(self) -> List[int]: + """Get all iterations that have been processed.""" + iterations = set() + + # From spatial_check + iterations.update(self.details['spatial_check'].keys()) + + # From safetynet_check (nested) + for groupname, iter_dict in self.details['safetynet_check'].items(): + iterations.update(iter_dict.keys()) + + # From whitelist_check + iterations.update(self.details['whitelist_check'].keys()) + + return sorted(iterations) + + def get_final_labels(self) -> pd.Series: + flags = self.flags + + #if no whitelist check has been performed, create an empyt column + if 'whitelist_check' not in flags.columns: + flags['whitelist_check'] = BC_CHECK_SKIPPED #'skipped' + + final_labels = flags.groupby('datetime').apply(final_label_logic) + final_labels.name = 'final_label' + return final_labels + + def get_final_details(self) -> pd.Series: + """Get detailed description strings for each timestamp based on final label logic. + + This method returns a Series with detailed description strings that illustrate + how the final label was determined. The details are extracted from the details + attribute and combined based on the check pipeline. + + Returns + ------- + pd.Series + Series with DatetimeIndex containing detailed description strings. + The series name is 'final_details'. + """ + # Handle edge case: if flags are empty, return empty series + if self.flags.empty: + return pd.Series(dtype=str, name='final_details') + + # The flags have a MultiIndex (datetime, iteration), but the detail + # Series stored in self.details use a plain DatetimeIndex. We must + # build the detail strings per-iteration using the DatetimeIndex so + # that reindexing can match properly, then combine the results back + # into a Series indexed by the original MultiIndex. 
+ + # Get all datetimes (unique, from flags) + all_datetimes = self.flags.index.get_level_values('datetime').unique() + + def reindex_details(detail_series: pd.Series, dt_index: pd.DatetimeIndex) -> pd.Series: + """Reindex a detail Series to match the given DatetimeIndex.""" + return detail_series.reindex(dt_index).fillna('NA').astype(str) + + # Get all iterations that have been processed (using the helper method) + iterations = self._get_iterations() + + # If no iterations found, try to infer from flags index + if not iterations: + # Fallback: get unique iterations from the flags MultiIndex + if isinstance(self.flags.index, pd.MultiIndex): + iterations = sorted(self.flags.index.get_level_values('iteration').unique()) + else: + iterations = [0] # Default to iteration 0 + + # Build detail strings per datetime using all_datetimes as the index + detailstr = pd.Series('', index=all_datetimes) + + for iter_num in iterations: + # Check if this iteration exists in spatial_check details + if iter_num not in self.details['spatial_check']: + # If iteration doesn't exist in spatial_check, add a placeholder message + detailstr = detailstr + f'iteration {iter_num}:[No spatial check details available' + else: + detailstr = detailstr + f'iteration {iter_num}:[' + + #add spatial check details + if iter_num in self.details['spatial_check']: + spatial_details = reindex_details(self.details['spatial_check'][iter_num], all_datetimes) + detailstr = detailstr.str.cat(spatial_details, sep='') + # else: placeholder already added above + + detailstr = detailstr + " --> " + + #add safetynet details + if bool(self.details['safetynet_check']): + + for safetynetkey in self.details['safetynet_check'].keys(): + if iter_num in self.details['safetynet_check'][safetynetkey].keys(): + + savedetails = reindex_details(self.details['safetynet_check'][safetynetkey][iter_num], all_datetimes) + detailstr = detailstr.str.cat(savedetails, sep=f'{safetynetkey}:') + + else: + detailstr = detailstr + f'{safetynetkey}: NA ' + + detailstr = detailstr + " --> " + else: + detailstr = detailstr + "NA " + " --> " + + #add whitelist details + if iter_num in self.details['whitelist_check'].keys(): + savedetails = reindex_details(self.details['whitelist_check'][iter_num], all_datetimes) + detailstr = detailstr.str.cat(savedetails, sep='') + else: + detailstr = detailstr + 'NA' + + detailstr = detailstr + "] \n" + + detailstr.name = 'final_details' + return detailstr + + def get_iteration_summary(self, iteration: int) -> Dict[str, int]: + """Get a summary of record counts for a specific iteration. + + Parameters + ---------- + iteration : int + The iteration number to summarize. + + Returns + ------- + dict + Dictionary with check types as keys and record counts as values. + For safetynet_check, includes both total and per-group counts. 
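+
+        Illustrative shape of the returned dictionary (values are assumed)::
+
+            {'spatial_check': 3,
+             'safetynet_check': {'LCZ': 1},
+             'safetynet_check_total': 1,
+             'whitelist_check': 0}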
+ """ + # Count spatial_check + spatial_count = 0 + if iteration in self.details['spatial_check']: + spatial_count = len(self.details['spatial_check'][iteration]) + + # Count safetynet_check per group + safetynet_per_group = {} + for groupname, iter_dict in self.details['safetynet_check'].items(): + if iteration in iter_dict: + safetynet_per_group[groupname] = len(iter_dict[iteration]) + safetynet_total = sum(safetynet_per_group.values()) + + # Count whitelist_check + whitelist_count = 0 + if iteration in self.details['whitelist_check']: + whitelist_count = len(self.details['whitelist_check'][iteration]) + + return { + 'spatial_check': spatial_count, + 'safetynet_check': safetynet_per_group, + 'safetynet_check_total': safetynet_total, + 'whitelist_check': whitelist_count + } + + def get_info(self) -> str: + """Get a summary of the BuddyCheckStation status and attributes. + + Returns + ------- + str + Formatted string with overview of the station's buddy check status. + """ + lines = [] + lines.append("=" * 60) + lines.append(f"BuddyCheckStation: {self.name}") + lines.append("=" * 60) + + # Buddy groups + lines.append("\n--- Buddy Groups ---") + if self._buddy_groups: + for groupname, buddies in self._buddy_groups.items(): + n_buddies = len(buddies) + buddies_str = ", ".join(buddies[:5]) + if n_buddies > 5: + buddies_str += f", ... (+{n_buddies - 5} more)" + lines.append(f" {groupname}: {n_buddies} buddies") + if buddies: + lines.append(f" [{buddies_str}]") + else: + lines.append(" No buddy groups assigned") + + # Corrections + lines.append("\n--- Value Corrections ---") + lines.append(f" Lapse rate correction: {self.flag_lapsrate_corrections}") + if self.flag_lapsrate_corrections: + lines.append(f" Correction term: {self.cor_term:.4f}") + + # Flags summary + lines.append("\n--- Flags ---") + if not self.flags.empty: + lines.append(f" Total flag entries: {len(self.flags)}") + lines.append(f" Flag columns: {list(self.flags.columns)}") + else: + lines.append(" No flags recorded") + + # Iteration status + iterations = self._get_iterations() + lines.append("\n--- Iteration Status ---") + if iterations: + lines.append(f" Iterations processed: {len(iterations)}") + + # Totals across all iterations + total_spatial = 0 + total_safetynet = 0 + total_whitelist = 0 + safetynet_groups_total: Dict[str, int] = {} + + for iteration in iterations: + summary = self.get_iteration_summary(iteration) + total_spatial += summary['spatial_check'] + total_safetynet += summary['safetynet_check_total'] + total_whitelist += summary['whitelist_check'] + + # Accumulate per-group safetynet totals + for groupname, count in summary['safetynet_check'].items(): + safetynet_groups_total[groupname] = safetynet_groups_total.get(groupname, 0) + count + + lines.append(f"\n Iteration {iteration}:") + lines.append(f" Spatial outliers: {summary['spatial_check']}") + if summary['safetynet_check']: + lines.append(f" Safetynet saved (total): {summary['safetynet_check_total']}") + for groupname, count in summary['safetynet_check'].items(): + lines.append(f" - {groupname}: {count}") + else: + lines.append(f" Safetynet saved: 0") + lines.append(f" Whitelist saved: {summary['whitelist_check']}") + + lines.append(f"\n --- Totals ---") + lines.append(f" Total spatial outliers: {total_spatial}") + lines.append(f" Total safetynet saved: {total_safetynet}") + if safetynet_groups_total: + for groupname, count in safetynet_groups_total.items(): + lines.append(f" - {groupname}: {count}") + lines.append(f" Total whitelist saved: 
{total_whitelist}") + else: + lines.append(" No iterations processed") + + lines.append("=" * 60) + + info_str = "\n".join(lines) + print(info_str) + return info_str + + def map_timestamps(self, + timestamp_map: Dict[str, pd.Series]) -> None: + """Map synchronized timestamps to original timestamps for this station. + + This function maps the synchronized timestamps (used during buddy check + processing) back to the original timestamps for this station's flags and + details attributes. + + Parameters + ---------- + timestamp_map : dict + Dictionary mapping station names to Series where index is synchronized + timestamp and value is original timestamp. + + Returns + ------- + None + Modifies the station in-place. + """ + + # ts_map = timestamp_map[self.name] + + # Revert flags DataFrame timestamps (MultiIndex: datetime, iteration) + if not self.flags.empty: + self.flags = _map_dt_index( + pdobj=self.flags, + ts_map=timestamp_map, + datetime_level='datetime' + ) + + # Revert details timestamps + # spatial_check: {iteration: Series} + for iteration, detail_series in self.details['spatial_check'].items(): + self.details['spatial_check'][iteration] = _map_dt_index( + pdobj=detail_series, + ts_map=timestamp_map + ) + + # safetynet_check: {groupname: {iteration: Series}} + for groupname, iter_dict in self.details['safetynet_check'].items(): + for iteration, detail_series in iter_dict.items(): + self.details['safetynet_check'][groupname][iteration] = _map_dt_index( + pdobj=detail_series, + ts_map=timestamp_map + ) + + # whitelist_check: {iteration: Series} + for iteration, detail_series in self.details['whitelist_check'].items(): + self.details['whitelist_check'][iteration] = _map_dt_index( + pdobj=detail_series, + ts_map=timestamp_map + ) + +#=============================== +# helpers for BCS +#=============================== + +def final_label_logic(subset: pd.DataFrame) -> str: + #the flag not tested is present on ALL iterations ! + if subset['spatial_check'].apply(lambda x: x=='not_tested').all(): + return BC_NOT_TESTED + + # --- passed condition ---- + #1a perfect pass (pass on last iteration of spatial check) + if subset['spatial_check'].iloc[-1] == BC_PASSED: + return BC_PASSED + + if subset['spatial_check'].iloc[-1] == BC_NO_BUDDIES: + #Choice made: it can happen that a record passed a previous iteration, + #but has not enough buddies in the last iteration. 
Applying the 'save' logic, + #it is best to label this as no_buddies + return BC_NO_BUDDIES + + #catched by safetynet + #if there is at least (there can only be one), pass in the last iteration of saftynets + saftynet_cols = [col for col in subset.columns if col.startswith('safetynet_check:')] + if any(subset.iloc[-1][saftynet_cols] == BC_PASSED): #TODO not shure of this string + return BC_SAFETYNET_SAVED + + + #catched by whitelist + if subset['whitelist_check'].iloc[-1] == BC_WHITELIST_SAVED: + return BC_WHITELIST_SAVED + + # --- failed condition ---- + + #fail in last iteration of spatial check + if ((subset['spatial_check'].iloc[-1] == BC_FLAGGED) and + all(subset.iloc[-1][saftynet_cols] != BC_PASSED) and #not passed is [nan, flagged, no-buddies] + (subset['whitelist_check'].iloc[-1] != BC_WHITELIST_SAVED)): #not saved is [nan, skipped, not-saved] + return BC_FLAGGED + + #fail in any previous iteration of spatial check + if ((any(subset['spatial_check'] == BC_FLAGGED)) and + (subset['spatial_check'].iloc[-1] == BC_NOT_TESTED)): + return BC_FLAGGED + + + raise ValueError(f"Unforeseen situartion encountered in final label logic: \n {subset}") + +def _map_dt_index(pdobj: pd.Series | pd.DataFrame, + ts_map: pd.Series, + datetime_level: str = 'datetime' +) -> pd.DataFrame: + """Revert timestamps in a DataFrame with MultiIndex containing datetime level. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with MultiIndex containing a datetime level. + ts_map : pd.Series + Series mapping synchronized timestamps (index) to original timestamps (values). + datetime_level : str + Name of the datetime level in the MultiIndex. + + Returns + ------- + pd.DataFrame + DataFrame with reverted timestamps in the MultiIndex. + """ + + if isinstance(pdobj, pd.Series): + df = pdobj.to_frame() + returnseries = True + else: + df = pdobj + returnseries = False + + # Get the current index + old_index = df.index + level_names = old_index.names + + df = df.reset_index() + df['_mapped_datetime'] = df[datetime_level].map(lambda x: ts_map.get(x, x) if pd.notna(ts_map.get(x, pd.NaT)) else x) + df = df.drop(columns=[datetime_level]) + df = df.rename(columns={'_mapped_datetime': datetime_level}) + df = df.set_index(level_names) + df = df.sort_index() + if returnseries: + return df.iloc[:,0] + return df + + diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/methods/__init__.py b/src/metobs_toolkit/qc_collection/spatial_checks/methods/__init__.py new file mode 100644 index 00000000..c1e892b9 --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/methods/__init__.py @@ -0,0 +1,7 @@ +from .findbuddies import assign_spatial_buddies, filter_buddygroup_by_altitude, subset_buddies_to_nearest +from .pdmethods import create_wide_obs_df, concat_multiindices +from .lapsratecorrection import correct_lapse_rate +from .samplechecks import buddy_test_a_station + +from .safetynets import validate_safety_net_configs, apply_safety_net +from .whitesaving import save_whitelist_records \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/methods/findbuddies.py b/src/metobs_toolkit/qc_collection/spatial_checks/methods/findbuddies.py new file mode 100644 index 00000000..9c189f3f --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/methods/findbuddies.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import logging +from typing import Union, List, Dict, TYPE_CHECKING + +import pandas as pd + + +logger = logging.getLogger("") + + +if TYPE_CHECKING: + from 
...buddystation import BuddyCheckStation + +# ------------------------------------------ +# Callables by buddy check +# ------------------------------------------ + +def assign_spatial_buddies( + distance_df: pd.DataFrame, + metadf: pd.DataFrame, + buddy_radius: Union[int, float], + min_buddy_distance: Union[int, float], + wrappedstations: List[BuddyCheckStation], + max_alt_diff: Union[int, float, None]=None, +) -> None: + + + spatial_buddies = _find_buddies_by_distance(distance_df=distance_df, + buddy_radius=buddy_radius, + min_buddy_distance=min_buddy_distance) + + #update the wrapstations + for wrapsta in wrappedstations: + wrapsta.set_buddies(spatial_buddies[wrapsta.name], groupname='spatial') + + if max_alt_diff is not None: + for wrapsta in wrappedstations: + filter_buddygroup_by_altitude( + wrappedstation=wrapsta, + groupname='spatial', + altitudes=metadf['altitude'], + max_altitude_diff=max_alt_diff + ) + + + +def filter_buddygroup_by_altitude( + wrappedstation: BuddyCheckStation, + groupname: str, + altitudes: pd.Series, + max_altitude_diff: Union[int, float] +) : + + if altitudes.isnull().any(): + raise ValueError("Altitude series contains NaN values. All stations must have valid altitude data for altitude filtering.") + + #update the filter flag + station_altitude = altitudes.loc[wrappedstation.name] + alt_buddies = [] + for buddy_name in wrappedstation.get_buddies(groupname=groupname): + buddy_altitude = altitudes.loc[buddy_name] + if abs(station_altitude - buddy_altitude) <= max_altitude_diff: + alt_buddies.append(buddy_name) + wrappedstation.filter_buddies(groupname=groupname, filteredbuddies=alt_buddies) + +def subset_buddies_to_nearest( + wrappedstations: List, + distance_df: pd.DataFrame, + max_sample_size: int, + groupname: str, +) -> None: + """Subset buddy groups to the nearest N stations. + + For each wrapped station, the buddies in the specified group are sorted + by distance and only the closest ``max_sample_size`` buddies are kept. + + Parameters + ---------- + wrappedstations : list of BuddyWrapSensor + The wrapped stations whose buddy groups should be subsetted. + distance_df : pd.DataFrame + Symmetric distance matrix with station names as index and columns. + max_sample_size : int + Maximum number of buddies to keep per station. + groupname : str + The name of the buddy group to subset (e.g., 'spatial'). 
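+
+    Notes
+    -----
+    The trimmed list is stored under ``'{groupname}_filtered'`` via
+    ``filter_buddies``; ``get_buddies`` transparently prefers that filtered
+    group when it exists, so callers keep using the original group name.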
+ """ + for wrapsta in wrappedstations: + buddies = wrapsta.get_buddies(groupname=groupname) + if len(buddies) <= max_sample_size: + continue + # Sort buddies by distance and keep only the nearest N + buddy_distances = distance_df.loc[wrapsta.name, buddies] + nearest = buddy_distances.nsmallest(max_sample_size).index.to_list() + wrapsta.filter_buddies(groupname=groupname, filteredbuddies=nearest) + + +# ------------------------------------------ +# Help functions to find buddies +# ------------------------------------------ +def _find_buddies_by_distance( + distance_df: pd.DataFrame, buddy_radius: Union[int, float], min_buddy_distance: Union[int, float]=0 +) -> Dict: + + buddies = {} + for refstation, distances in distance_df.iterrows(): + bud_stations = distances[(distances <= buddy_radius) & (distances >= min_buddy_distance)].index.to_list() + if refstation in bud_stations: + bud_stations.remove(refstation) + buddies[refstation] = bud_stations + + return buddies + diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/methods/lapsratecorrection.py b/src/metobs_toolkit/qc_collection/spatial_checks/methods/lapsratecorrection.py new file mode 100644 index 00000000..87ec2553 --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/methods/lapsratecorrection.py @@ -0,0 +1,45 @@ + +from __future__ import annotations + +import logging +from typing import List, TYPE_CHECKING + +import pandas as pd + + +logger = logging.getLogger("") + + +if TYPE_CHECKING: + from metobs_toolkit.qc_collection.spatial_checks.buddywrapsensor import BuddyWrapSensor + + + +def correct_lapse_rate(widedf: pd.DataFrame, + wrappedsensors: List[BuddyWrapSensor], + lapserate: float|None = None) -> pd.DataFrame: + + if lapserate is None: + logger.debug("No lapse rate correction applied") + for wrapsens in wrappedsensors: wrapsens.flag_lapsrate_corrections = False + else: + logger.debug("Applying lapse rate correction with rate: %s", lapserate) + #Test if all stations have altitude + has_alts = [budsta.site.flag_has_altitude() for budsta in wrappedsensors] + + if not all(has_alts): + raise ValueError( + "At least one station has a NaN value for 'altitude', not lapse rate correction possible" + ) + for budsta in wrappedsensors: + budsta.flag_lapsrate_corrections = True + + # Since buddy check works with relative differences, correct all + # stations to the 0m altitude + correction_term = budsta.site.altitude * (-1) * lapserate + budsta.cor_term = correction_term #update it in the buddy station + + #apply the correction on the wide dataframe + widedf[budsta.name] = widedf[budsta.name] + correction_term + + return widedf \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/methods/pdmethods.py b/src/metobs_toolkit/qc_collection/spatial_checks/methods/pdmethods.py new file mode 100644 index 00000000..68d64636 --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/methods/pdmethods.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import logging +from typing import Union, List, Dict, TYPE_CHECKING, Tuple + +import pandas as pd +from metobs_toolkit.backend_collection.datetime_collection import to_timedelta + +logger = logging.getLogger("") + + +if TYPE_CHECKING: + from ..buddywrapsensor import BuddyWrapSensor + + +def create_wide_obs_df(wrappedsensors: List[BuddyWrapSensor], + instantaneous_tolerance: pd.Timedelta + ) -> Tuple[pd.DataFrame, Dict]: + + concatlist = [] + for wrapsens in wrappedsensors: + records = wrapsens.sensor.series + records.name = 
wrapsens.name + concatlist.append(records) + + # synchronize the timestamps + logger.debug("Synchronizing timestamps") + combdf, timestamp_map = _synchronize_series( + series_list=concatlist, max_shift=instantaneous_tolerance + ) + + return (combdf, timestamp_map) + +def _synchronize_series( + series_list: List[pd.Series], max_shift: pd.Timedelta +) -> Tuple[pd.DataFrame, Dict]: + """ + Synchronize a list of pandas Series with datetime indexes. + + The target timestamps are defined by: + + + * freq: the highest frequency present in the input series + * origin: the earliest timestamp found, rounded down by the freq + * closing: the latest timestamp found, rounded up by the freq. + + Parameters + ---------- + series_list : list of pandas.Series + List of pandas Series with datetime indexes. + max_shift : pandas.Timedelta + Maximum shift in time that can be applied to each timestamp + in synchronization. + + Returns + ------- + pandas.DataFrame + DataFrame with synchronized Series. + dict + Dictionary mapping each synchronized timestamp to its + original timestamp. + """ + + # find highest frequency + frequencies = [to_timedelta(s.index.inferred_freq) for s in series_list] + trg_freq = min(frequencies) + + # find origin and closing timestamp (earliest/latest) + origin = min([s.index.min() for s in series_list]).floor(trg_freq) + closing = max([s.index.max() for s in series_list]).ceil(trg_freq) + + # Create target datetime axes + target_dt = pd.date_range(start=origin, end=closing, freq=trg_freq) + + # Synchronize (merge with tolerance) series to the common index + synchronized_series = [] + timestamp_mapping = {} + for s in series_list: + targetdf = ( + s.to_frame() + .assign(orig_datetime=s.index) + .reindex( + index=pd.DatetimeIndex(target_dt), + method="nearest", + tolerance=max_shift, + limit=1, + ) + ) + + # extract the mapping (new -> original) + orig_timestampseries = targetdf["orig_datetime"] + orig_timestampseries.name = "original_timestamp" + timestamp_mapping[s.name] = orig_timestampseries + + synchronized_series.append(s) + + return pd.concat(synchronized_series, axis=1), timestamp_mapping + + + +def concat_multiindices( + indices: List[pd.MultiIndex] +) -> pd.MultiIndex: + """Concatenate a list of MultiIndex objects into a single MultiIndex. + + Parameters + ---------- + indices : list of pd.MultiIndex + List of MultiIndex objects to concatenate. + + Returns + ------- + pd.MultiIndex + Concatenated MultiIndex. 
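+
+    Examples
+    --------
+    Illustrative only (station names and timestamps are made up)::
+
+        a = pd.MultiIndex.from_tuples([("sta_A", "2022-01-01 00:00")],
+                                      names=["name", "datetime"])
+        b = pd.MultiIndex.from_tuples([("sta_B", "2022-01-01 00:00")],
+                                      names=["name", "datetime"])
+        combined = concat_multiindices([a, b])  # contains both tuples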
+ """ + if not indices: + return pd.MultiIndex.from_tuples([], names=['name', 'datetime']) + + concatenated = pd.MultiIndex.from_tuples( + [tup for idx in indices for tup in idx], + names=indices[0].names + ) + + + # non_empty_indices = [idx for idx in outlier_indices if len(idx) > 0] + # if non_empty_indices: + # spatial_outliers = non_empty_indices[0] + # for idx in non_empty_indices[1:]: + # spatial_outliers = spatial_outliers.union(idx) + # spatial_outliers = spatial_outliers.drop_duplicates() + # else: + # spatial_outliers = pd.MultiIndex.from_tuples([], names=['name', 'datetime']) + + return concatenated \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/methods/safetynets.py b/src/metobs_toolkit/qc_collection/spatial_checks/methods/safetynets.py new file mode 100644 index 00000000..42080e01 --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/methods/safetynets.py @@ -0,0 +1,370 @@ +from __future__ import annotations + +import logging +from typing import Union, List, Dict, TYPE_CHECKING + +import pandas as pd + + +logger = logging.getLogger("") + +from .findbuddies import filter_buddygroup_by_altitude, subset_buddies_to_nearest +from .samplechecks import buddy_test_a_station +from ..buddywrapsensor import BC_PASSED, BC_NO_BUDDIES + +if TYPE_CHECKING: + from ..buddywrapsensor import BuddyWrapSensor + + +def validate_safety_net_configs(safety_net_configs: List[Dict]) -> None: + """ + Validate that all required keys are present in safety_net_configs. + + Parameters + ---------- + safety_net_configs : list of dict + List of safety net configuration dictionaries. + + Raises + ------ + ValueError + If safety_net_configs is not a list or contains non-dict elements. + KeyError + If any required key is missing from a safety net configuration. + """ + if safety_net_configs is None: + return None + + required_keys = {"category", "buddy_radius", "z_threshold", "min_sample_size"} + optional_keys = {"max_sample_size", "only_if_previous_had_no_buddies"} + + if not isinstance(safety_net_configs, list): + raise ValueError( + f"safety_net_configs must be a list, got {type(safety_net_configs).__name__}" + ) + + for i, config in enumerate(safety_net_configs): + if not isinstance(config, dict): + raise ValueError( + f"Each safety net config must be a dict, but config at index {i} " + f"is {type(config).__name__}" + ) + + missing_keys = required_keys - set(config.keys()) + if missing_keys: + raise KeyError( + f"Safety net config at index {i} is missing required key(s): " + f"{', '.join(sorted(missing_keys))}. " + f"Required keys are: {', '.join(sorted(required_keys))}" + ) + + # Validate optional max_sample_size + if "max_sample_size" in config: + max_ss = config["max_sample_size"] + if max_ss is not None: + min_ss = config["min_sample_size"] + if max_ss <= min_ss: + raise ValueError( + f"Safety net config at index {i}: 'max_sample_size' " + f"({max_ss}) must be larger than 'min_sample_size' " + f"({min_ss})." + ) + + # Validate optional only_if_previous_had_no_buddies + if "only_if_previous_had_no_buddies" in config: + val = config["only_if_previous_had_no_buddies"] + if not isinstance(val, bool): + raise ValueError( + f"Safety net config at index {i}: " + f"'only_if_previous_had_no_buddies' must be a bool, " + f"got {type(val).__name__}." 
+ ) + if val and i == 0: + raise ValueError( + f"Safety net config at index {i}: " + f"'only_if_previous_had_no_buddies' cannot be True for " + f"the first safety net because there is no previous " + f"safety net to fall back from." + ) + + return None + + +def assign_safety_net_buddies(wrapsta: BuddyWrapSensor, + metadf: pd.DataFrame, + distance_df: pd.DataFrame, + buddygroupname: str, + max_distance: Union[int, float], + min_distance: Union[int, float], + max_alt_diff: Union[int, float, None], + max_sample_size: Union[int, None]) -> None: + """ + Assign category buddies to a wrapped station for safety net buddy checks. + + This function identifies buddies that share the same categorical attribute + (e.g., LCZ, network) with the reference station, within specified distance + constraints and altitude difference limits. + + The assigned buddy group is stored on the wrapped station and can be accessed + using ``wrapsta.get_buddies(groupname=buddygroupname)``. + + Parameters + ---------- + wrapsta : BuddyWrapSensor + The wrapped station for which buddies should be assigned. + metadf : pd.DataFrame + DataFrame containing station metadata. Must have the category column + specified by ``buddygroupname`` (e.g., 'LCZ', 'network') and an + 'altitude' column if ``max_alt_diff`` is not None. + distance_df : pd.DataFrame + Symmetric distance matrix with station names as index and columns. + Distances should be in meters. + buddygroupname : str + The name of the metadata column to group by (e.g., 'LCZ', 'network'). + This is also used as the buddy group identifier on the wrapped station. + max_distance : int or float + Maximum distance (in meters) for category buddies. Stations farther + than this distance will be excluded. + min_distance : int or float + Minimum distance (in meters) required between the station and its + category buddies. Stations closer than this distance will be excluded. + max_alt_diff : int, float, or None + Maximum altitude difference (in meters) allowed for buddies. If None, + no altitude filtering is applied. + max_sample_size : int or None + Maximum number of category buddies to keep per station. If not None, + buddies are sorted by distance and only the nearest ``max_sample_size`` + are retained. If None, no limit is applied. + + Returns + ------- + None + The function modifies ``wrapsta`` in place by assigning the buddy group. + + Notes + ----- + The function applies filters in the following order: + + 1. Category matching: Only stations with the same category value as the + reference station are considered. + 2. Distance filtering: Only stations within [min_distance, max_distance] + are kept. + 3. Altitude filtering (if max_alt_diff is not None): Only stations with + altitude difference <= max_alt_diff are kept. + 4. Sample size limiting (if max_sample_size is not None): Only the nearest + max_sample_size buddies are kept. + + If the reference station has a NaN value for the category, an empty buddy + list is assigned and a warning is logged. 
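+
+    Example call (illustrative; all parameter values are assumptions)::
+
+        assign_safety_net_buddies(
+            wrapsta=wrapsta,
+            metadf=metadf,
+            distance_df=distance_df,
+            buddygroupname='LCZ',
+            max_distance=30000,
+            min_distance=0,
+            max_alt_diff=150,
+            max_sample_size=10,
+        )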
+ + """ + + + ref_category = metadf.loc[wrapsta.name, buddygroupname] + # Handle NaN values - they should not match with anything + if pd.isna(ref_category): + logger.warning( + "Station %s has NaN value for category '%s' - no category buddies assigned", + wrapsta.name, + buddygroupname, + ) + # Assign empty buddy list + wrapsta.set_buddies([], groupname=buddygroupname) + else: + #find potential candidates + buddy_candidates = metadf.loc[ + metadf[buddygroupname] == ref_category + ].index.to_list() + + #remove self from buddy candidates + buddy_candidates.remove(wrapsta.name) + + target_distances = distance_df.loc[wrapsta.name, buddy_candidates] + #filter by distance + ref_buddies = target_distances[(target_distances <= max_distance) & (target_distances >= min_distance)].index.to_list() + + # Assign the found buddies + wrapsta.set_buddies(ref_buddies, groupname=buddygroupname) + + #filter by altitude difference if needed + if max_alt_diff is not None: + filter_buddygroup_by_altitude( + wrappedstation=wrapsta, + groupname=buddygroupname, + altitudes=metadf['altitude'], + max_altitude_diff=max_alt_diff + ) + + # Subset category buddies to nearest N if max_sample_size is set + if max_sample_size is not None: + subset_buddies_to_nearest( + wrappedstations=[wrapsta], + distance_df=distance_df, + max_sample_size=max_sample_size, + groupname=buddygroupname, + ) + + + +def apply_safety_net( + outliers: pd.Index, + buddycheckstations: List[BuddyWrapSensor], + buddygroupname: str, + metadf: pd.DataFrame, + distance_df: pd.DataFrame, + max_distance: Union[int, float], + min_distance: Union[int, float], + max_alt_diff: Union[int, float, None], + wideobsds: pd.DataFrame, + safety_z_threshold: Union[int, float], + min_sample_size: int, + min_sample_spread: Union[int, float], + use_z_robust_method: bool, + iteration: int, + max_sample_size: Union[int, None] = None, + only_if_previous_had_no_buddies: bool = False, + previous_safetynet_category: Union[str, None] = None, +) -> pd.MultiIndex: + + # Track records that were saved (passed the safety net test) + saved_records = pd.MultiIndex.from_tuples([], names=['name', 'datetime']) + + #create a name map of the wrappedstations + name_map = {wrapsta.name: wrapsta for wrapsta in buddycheckstations} + + # If only_if_previous_had_no_buddies is True, restrict outliers to only + # those records where the previous safety net had insufficient buddies + # (BC_NO_BUDDIES flag). This is determined by inspecting the flags + # already stored on each BuddyWrapSensor for the current iteration. + if only_if_previous_had_no_buddies: + if previous_safetynet_category is None: + raise ValueError( + "only_if_previous_had_no_buddies is True but " + "previous_safetynet_category is None. This should not " + "happen -- the first safety net cannot use this option." 
+ ) + + prev_check_col = f'safetynet_check:{previous_safetynet_category}' + previous_no_buddies = pd.MultiIndex.from_tuples( + [], names=['name', 'datetime'] + ) + + for station_name in outliers.get_level_values('name').unique(): + wrapsta = name_map[station_name] + if ( + not wrapsta.flags.empty + and prev_check_col in wrapsta.flags.columns + ): + iter_mask = ( + wrapsta.flags.index.get_level_values('iteration') + == iteration + ) + iter_flags = wrapsta.flags.loc[iter_mask, prev_check_col] + nb_mask = iter_flags == BC_NO_BUDDIES + if nb_mask.any(): + nb_timestamps = ( + iter_flags[nb_mask] + .index.get_level_values('datetime') + ) + station_nb = pd.MultiIndex.from_arrays( + [ + [station_name] * len(nb_timestamps), + nb_timestamps, + ], + names=['name', 'datetime'], + ) + previous_no_buddies = previous_no_buddies.union( + station_nb + ) + + if previous_no_buddies.empty: + logger.info( + "only_if_previous_had_no_buddies is True but no records " + "from the previous safety net ('%s') had insufficient " + "buddies. Skipping safety net '%s' entirely.", + previous_safetynet_category, + buddygroupname, + ) + return outliers + + outliers = outliers.intersection(previous_no_buddies) + logger.info( + "Filtering to %s outlier records that had insufficient " + "buddies in the previous safety net ('%s').", + len(outliers), + previous_safetynet_category, + ) + + if outliers.empty: + return outliers + + + #find the categorical buddies (only for the outlier stations) + for outlstation in outliers.get_level_values('name').unique(): + wrapsta = name_map[outlstation] + assign_safety_net_buddies( + wrapsta=wrapsta, + metadf = metadf, + distance_df = distance_df, + buddygroupname = buddygroupname, + max_distance = max_distance, + min_distance = min_distance, + max_alt_diff = max_alt_diff, + max_sample_size = max_sample_size) + + + #find outliers in the new categorical group + # The buddy_test_a_station function updates flags/details directly + # and returns only the outlier MultiIndex (BC_FLAGGED records) + # We need to track BC_PASSED records to remove them from outliers + for outlstation in outliers.get_level_values('name').unique(): + wrapsta = name_map[outlstation] + + # Get the timestamps for this station from the original outliers + station_outlier_timestamps = outliers[ + outliers.get_level_values('name') == outlstation + ].get_level_values('datetime') + + # Subset wideobsds to only the outlier timestamps for this station + widedf_subset = wideobsds.loc[station_outlier_timestamps] + + # Run the buddy test - this updates flags/details directly on wrapsta + # and returns outliers (BC_FLAGGED records) + station_flagged = buddy_test_a_station( + centerwrapstation=wrapsta, + buddygroupname=buddygroupname, + widedf=widedf_subset, + min_sample_size=min_sample_size, + min_sample_spread=min_sample_spread, + outlier_threshold=safety_z_threshold, + iteration=iteration, + check_type=f'safetynet_check:{buddygroupname}', + use_z_robust_method=use_z_robust_method, + ) + + # Get passed timestamps from the flags DataFrame + # These are records where the safetynet check passed (BC_PASSED) + check_col = f'safetynet_check:{buddygroupname}' + if not wrapsta.flags.empty and check_col in wrapsta.flags.columns: + # Get flags for this iteration + iter_mask = wrapsta.flags.index.get_level_values('iteration') == iteration + iter_flags = wrapsta.flags.loc[iter_mask, check_col] + + # Find passed timestamps + passed_mask = iter_flags == BC_PASSED + if passed_mask.any(): + passed_timestamps = 
iter_flags[passed_mask].index.get_level_values('datetime') + + # Create MultiIndex for saved records + station_saved = pd.MultiIndex.from_arrays( + [[outlstation] * len(passed_timestamps), passed_timestamps], + names=['name', 'datetime'] + ) + saved_records = saved_records.union(station_saved) + + # Return original outliers minus the saved records + remaining_outliers = outliers.difference(saved_records) + + return remaining_outliers.sort_values().unique() + + \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/methods/samplechecks.py b/src/metobs_toolkit/qc_collection/spatial_checks/methods/samplechecks.py new file mode 100644 index 00000000..156ebfdc --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/methods/samplechecks.py @@ -0,0 +1,318 @@ +from __future__ import annotations + +import logging +from typing import Union, List, Dict, TYPE_CHECKING, Tuple +import numpy as np +import pandas as pd +from metobs_toolkit.backend_collection.datetime_collection import to_timedelta + +logger = logging.getLogger("") + + +if TYPE_CHECKING: + from ..buddywrapsensor import BuddyWrapSensor + +# Import constants from buddywrapstation +from ..buddywrapsensor import BC_NO_BUDDIES, BC_PASSED, BC_FLAGGED, BC_NOT_TESTED + + +def buddy_test_a_station( + centerwrapstation: BuddyWrapSensor, + buddygroupname: str, + widedf: pd.DataFrame, + min_sample_size: int, + min_sample_spread: float, + outlier_threshold: float, + iteration: int, + check_type: str = 'spatial_check', + use_z_robust_method: bool = True, +) -> Tuple[pd.MultiIndex, BuddyWrapSensor]: + + #TODO update docstring + """Find outliers in a buddy group and update station flags/details. + + This function tests whether the center station is an outlier compared to + its buddy stations using z-score analysis. The z-score is computed using + the mean and standard deviation of the buddy stations only (the center + station's values are excluded from the sample distribution). + + Parameters + ---------- + centerwrapstation : BuddyCheckStation + The wrapped station at the center of the buddy group to be tested. + buddygroupname : str + The name of the buddy group to use. + widedf : pd.DataFrame + Wide-format DataFrame with stations as columns and timestamps as index. + min_sample_size : int + Minimum number of valid buddy samples required for z-score calculation. + min_sample_spread : float + when use_z_robust_method is True, this is the equal to the minimum MAD to use (avoids division by near-zero). + when use_z_robust_method is False, this is the standard deviation. + outlier_threshold : float + Z-score threshold above which a record is flagged as outlier. + iteration : int + The current iteration number. + check_type : str, optional + The type of check being performed ('spatial_check', 'safetynet_check:groupname'). + Default is 'spatial_check'. + + Returns + ------- + pd.MultiIndex + MultiIndex with levels ('name', 'datetime') containing only the outlier + records for the center station. 
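+    BuddyWrapSensor
+        The (updated) center station wrapper. It is returned together with
+        the outlier index as a 2-tuple, so the updated object can be
+        collected when the check runs in a multiprocessing pool.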
+ """ + + # Get buddies (excluding center station) + buddies = centerwrapstation.get_buddies(groupname=buddygroupname) + center_name = centerwrapstation.name + + # Subset to buddies only (for sample distribution) and center station + buddydf = widedf[buddies] + center_series = widedf[center_name] + + # Count valid buddy samples per timestamp (center station NOT included) + buddy_sample_sizes = buddydf.notna().sum(axis=1) + + # Mark timestamps where center station has no data as NOT_TESTED + + #Edgecaase: If a station has fewer records than others, they are present as NaN in widedf + #But these timestamps do not ex + no_data = pd.Series(BC_NOT_TESTED, index=center_series[center_series.isna()].index) + centerwrapstation.add_flags( + iteration=iteration, + flag_series=no_data, + column_name=check_type + ) + + + # Find timestamps where center station has data + center_has_data = center_series.notna() + #TODO: pass the flag BC_NOT_TESTED for timestamps where center has no data + + # Separate timestamps by sample size condition (only where center has data) + sufficient_samples_mask = (buddy_sample_sizes >= min_sample_size) & center_has_data + insufficient_samples_mask = (buddy_sample_sizes < min_sample_size) & center_has_data + + timestamps_with_sufficient = widedf.index[sufficient_samples_mask] + timestamps_insufficient = widedf.index[insufficient_samples_mask] + + # ---- Handle timestamps with insufficient buddy samples (BC_NO_BUDDIES) ---- + if not timestamps_insufficient.empty: + # Create flags for NO_BUDDIES + no_buddies_flags = pd.Series(BC_NO_BUDDIES, index=timestamps_insufficient) + centerwrapstation.add_flags( + iteration=iteration, + flag_series=no_buddies_flags, + column_name=check_type + ) + + # Create detail messages + no_buddies_details = pd.Series( + [f"Insufficient buddy sample size (n={int(buddy_sample_sizes.loc[ts])}, " + f"required={min_sample_size}) in {buddygroupname} buddy group " + f"centered on {center_name}" + for ts in timestamps_insufficient], + index=timestamps_insufficient + ) + if check_type == 'spatial_check': + centerwrapstation.add_spatial_details( + iteration=iteration, + detail_series=no_buddies_details + ) + else: + centerwrapstation.add_safetynet_details(iteration=iteration, + safetynetname=buddygroupname, + detail_series=no_buddies_details) + + # ---- Handle timestamps with sufficient samples ---- + if timestamps_with_sufficient.empty: + # No timestamps to process, return empty MultiIndex + return (pd.MultiIndex.from_tuples([], names=['name', 'datetime']), centerwrapstation) + + # Filter to rows with enough valid buddy samples + buddydf_filtered = buddydf.loc[timestamps_with_sufficient] + center_filtered = center_series.loc[timestamps_with_sufficient] + buddy_sample_sizes_filtered = buddy_sample_sizes.loc[timestamps_with_sufficient] + + # Compute z-scores for center station using buddy distribution + if use_z_robust_method: + results_df = _compute_robust_z_scores( + buddydf=buddydf_filtered, + center_values=center_filtered, + min_mad=min_sample_spread, + outlier_threshold=outlier_threshold + ) + + else: + results_df = _compute_center_z_scores( + buddydf=buddydf_filtered, + center_values=center_filtered, + min_std=min_sample_spread, + outlier_threshold=outlier_threshold + ) + + # Separate flagged (outliers) and passed + outlier_timestamps = results_df.index[results_df['flagged']] + passed_timestamps = results_df.index[~results_df['flagged']] + + # ---- Update PASSED flags and details ---- + if not passed_timestamps.empty: + passed_flags = pd.Series(BC_PASSED, 
index=passed_timestamps) + centerwrapstation.add_flags( + iteration=iteration, + flag_series=passed_flags, + column_name=check_type + ) + + # Create detail messages for passed + passed_details = f"Passed {buddygroupname} check: " + results_df.loc[passed_timestamps, 'details'] + + if check_type == 'spatial_check': + centerwrapstation.add_spatial_details( + iteration=iteration, + detail_series=passed_details + ) + else: + centerwrapstation.add_safetynet_details(iteration=iteration, + safetynetname=buddygroupname, + detail_series=passed_details) + + # ---- Update FLAGGED (outlier) flags and details ---- + if not outlier_timestamps.empty: + flagged_flags = pd.Series(BC_FLAGGED, index=outlier_timestamps) + centerwrapstation.add_flags( + iteration=iteration, + flag_series=flagged_flags, + column_name=check_type + ) + + # Create detail messages for outliers + outlier_details = f"Outlier in {buddygroupname} buddy group centered on {center_name}: " + results_df.loc[outlier_timestamps, 'details'] + + + + if check_type == 'spatial_check': + centerwrapstation.add_spatial_details( + iteration=iteration, + detail_series=outlier_details + ) + else: + centerwrapstation.add_safetynet_details(iteration=iteration, + safetynetname=buddygroupname, + detail_series=outlier_details) + + # ---- Return outliers as MultiIndex ---- + if not outlier_timestamps.empty: + outlier_multiindex = pd.MultiIndex.from_arrays( + [[center_name] * len(outlier_timestamps), outlier_timestamps], + names=['name', 'datetime'] + ) + #Return the updated stations, this is needed when runned in multiprocessing + return (outlier_multiindex, centerwrapstation) + else: + return (pd.MultiIndex.from_tuples([], names=['name', 'datetime']), centerwrapstation) + + + +# ------------------------------------------ +# Statistical sample scoring +# ------------------------------------------ + +def _compute_robust_z_scores( + buddydf: pd.DataFrame, + center_values: pd.Series, + min_mad: float, + outlier_threshold: float + ) -> pd.DataFrame: + #TODO:Docstring + + buddy_not_na_counts = buddydf.notna().sum(axis=1) + #Calculate MADFM (Median Absolute Deviation From Median) + def MAD(x): + "MEDIAN absolute deviation from median" + return (x - x.median()).abs().median() + + mad_series = buddydf.apply(MAD, axis=1) + # Replace std below minimum with the minimum (avoid division by near-zero) + mad_series.loc[mad_series < min_mad] = np.float32(min_mad) + + # Calculate robust z-score for center station + + robust_z_scores = (center_values - buddydf.median(axis=1)).abs() / (1.4826 * mad_series) + + details = ('z (robust)=' + robust_z_scores.map('{:.2f}'.format) + + ', threshold=' + str(outlier_threshold) + + ', n=' + buddy_not_na_counts.map(str) + + ', MAD=' + mad_series.map('{:.2f}'.format) + + ', median=' + buddydf.median(axis=1).map('{:.2f}'.format)) + + # Build result DataFrame + result_df = pd.DataFrame( + index=buddydf.index, + data={ + 'z_score': robust_z_scores, + 'flagged': robust_z_scores > outlier_threshold, + 'details': details, + } + ) + return result_df + + +def _compute_center_z_scores( + buddydf: pd.DataFrame, + center_values: pd.Series, + min_std: float, + outlier_threshold: float +) -> pd.DataFrame: + """Compute z-scores for center station using buddy distribution. + + The z-score is computed as the absolute deviation of the center station's + value from the mean of the buddy stations, divided by the standard + deviation of the buddy stations. The center station's values are NOT + included in the mean/std calculation. 
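+
+    In symbols, per timestamp ``t`` (mirroring the implementation below)::
+
+        z(t) = |x_center(t) - mean(buddies, t)| / max(std(buddies, t), min_std)
+
+    and a record is flagged when ``z(t) > outlier_threshold``.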
+ + Parameters + ---------- + buddydf : pd.DataFrame + DataFrame with buddy stations as columns (center station excluded). + center_values : pd.Series + Series with the center station's values to test. + min_std : float + Minimum standard deviation to use (avoids division by near-zero). + outlier_threshold : float + Z-score threshold above which a record is flagged as outlier. + + Returns + ------- + pd.DataFrame + DataFrame with columns: 'z_score', 'flagged', 'buddy_mean', 'buddy_std'. + """ + # Compute mean and std from buddies only (center station excluded) + buddy_mean_series = buddydf.mean(axis=1) + buddy_std_series = buddydf.std(axis=1) + buddy_not_na_counts = buddydf.notna().sum(axis=1) + # Replace std below minimum with the minimum (avoid division by near-zero) + buddy_std_series.loc[buddy_std_series < min_std] = np.float32(min_std) + + # Calculate z-score for center station + z_scores = (center_values - buddy_mean_series).abs() / buddy_std_series + + + details = ('z=' + z_scores.map('{:.2f}'.format) + + ', threshold=' + str(outlier_threshold) + + ', n=' + buddy_not_na_counts.map(str) + + ', mean=' + buddy_mean_series.map('{:.2f}'.format) + + ', std=' + buddy_std_series.map('{:.2f}'.format)) + + # Build result DataFrame + result_df = pd.DataFrame( + index=buddydf.index, + data={ + 'z_score': z_scores, + 'flagged': z_scores > outlier_threshold, + 'details': details, + } + ) + return result_df \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/spatial_checks/methods/whitesaving.py b/src/metobs_toolkit/qc_collection/spatial_checks/methods/whitesaving.py new file mode 100644 index 00000000..818a537d --- /dev/null +++ b/src/metobs_toolkit/qc_collection/spatial_checks/methods/whitesaving.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import logging +from typing import Union, List, Dict, TYPE_CHECKING + +import pandas as pd + + +logger = logging.getLogger("") + +from .findbuddies import filter_buddygroup_by_altitude +from .samplechecks import buddy_test_a_station +if TYPE_CHECKING: + from ..buddywrapsensor import BuddyWrapSensor + from ...whitelist import WhiteSet + + +def save_whitelist_records( + outliers: pd.MultiIndex, + wrappedstations: List[BuddyWrapSensor], + whiteset: WhiteSet, + obstype: str, + iteration: int, +) -> pd.MultiIndex: + """Apply whitelist filtering to outliers and update station details. + + This function filters outlier records against a whitelist. Records that + match the whitelist are marked as 'saved' and removed from the outlier set. + Records that don't match the whitelist remain as outliers. + + Parameters + ---------- + outliers : pd.MultiIndex + MultiIndex with levels ('name', 'datetime') containing the current + outlier records to filter. + wrappedstations : list of BuddyCheckStation + List of wrapped station objects to update with whitelist details. + whiteset : WhiteSet + A WhiteSet instance containing records that should be excluded from + outlier detection. + obstype : str + The observation type being checked. + iteration : int + The current iteration number. + + Returns + ------- + pd.MultiIndex + MultiIndex with levels ('name', 'datetime') containing only the + outliers that were NOT saved by the whitelist. 
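+
+    Notes
+    -----
+    The wrapped stations are updated in place as a side effect: whitelisted
+    records receive the ``BC_WHITELIST_SAVED`` flag and the remaining
+    outliers the ``BC_WHITELIST_NOT_SAVED`` flag in the ``'whitelist_check'``
+    column, each with a short detail message for the given iteration.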
+ """ + if outliers.empty: + logger.debug("No outliers to filter with whitelist") + return outliers + + if whiteset._flag_is_empty(): + logger.debug("Whitelist is empty, no records saved") + return outliers + + # Track which records are not saved + # saved_records = pd.MultiIndex.from_tuples([], names=['name', 'datetime']) + remaining_outliers = pd.MultiIndex.from_tuples([], names=['name', 'datetime']) + + # Process each station's outliers + for wrapsta in wrappedstations: + if wrapsta.name not in outliers.get_level_values("name").unique(): + continue # No outliers for this station + + else: + + # Get the outlier datetimes for this station + sta_outlier_dts = pd.DatetimeIndex( + outliers[outliers.get_level_values("name") == wrapsta.name].get_level_values("datetime"), + name="datetime" + ) + + # Create a SensorWhiteSet for this station + sensorwhiteset = whiteset.create_sensorwhitelist( + stationname=wrapsta.name, obstype=obstype + ) + + # Filter to get remaining outliers (not whitelisted) + remaining_dts = sensorwhiteset.catch_white_records(outliers_idx=sta_outlier_dts) + + # Saved records are those that were filtered out + saved_dts = sta_outlier_dts.difference(remaining_dts) + + # Build MultiIndex for saved and remaining + # if len(saved_dts) > 0: + # saved_idx = pd.MultiIndex.from_arrays( + # [[wrapsta.name] * len(saved_dts), saved_dts], + # names=['name', 'datetime'] + # ) + # saved_records = saved_records.union(saved_idx) + + # Update the wrapped station with saved details + + # Create detail messages for saved records + detail_series = pd.Series( + [f"Saved by whitelist at iteration {iteration}" for _ in saved_dts], + index=saved_dts + ) + wrapsta.update_whitelist_details( + whitelistseries=detail_series, + iteration=iteration, + is_saved=True + ) + + # Create detail messages for records not saved by whitelist + detail_series = pd.Series( + [f"Not saved by whitelist at iteration {iteration}" for _ in remaining_dts], + index=remaining_dts + ) + wrapsta.update_whitelist_details( + whitelistseries=detail_series, + iteration=iteration, + is_saved=False + ) + + if len(remaining_dts) > 0: + remaining_idx = pd.MultiIndex.from_arrays( + [[wrapsta.name] * len(remaining_dts), remaining_dts], + names=['name', 'datetime'] + ) + remaining_outliers = remaining_outliers.union(remaining_idx) + + + + + return remaining_outliers.sort_values() + diff --git a/src/metobs_toolkit/qc_collection/step_check.py b/src/metobs_toolkit/qc_collection/step_check.py index 9c563cab..6a750f2d 100644 --- a/src/metobs_toolkit/qc_collection/step_check.py +++ b/src/metobs_toolkit/qc_collection/step_check.py @@ -3,7 +3,9 @@ import pandas as pd from .whitelist import SensorWhiteSet +from .common_functions import create_qcresult_flags from metobs_toolkit.backend_collection.decorators import log_entry +from metobs_toolkit.qcresult import QCresult logger = logging.getLogger("") @@ -14,7 +16,7 @@ def step_check( max_increase_per_second: Union[int, float], max_decrease_per_second: Union[int, float], sensorwhiteset: SensorWhiteSet, -) -> pd.DatetimeIndex: +) -> QCresult: """ Check for 'spikes' and 'dips' in a time series. @@ -35,15 +37,16 @@ def step_check( max_decrease_per_second : int or float The maximum allowed decrease (per second). This value is extrapolated to the time resolution of records. This value must be negative. - sensorwhiteset : SensorWhiteSet, optional + sensorwhiteset : SensorWhiteSet A SensorWhiteSet instance containing timestamps that should be excluded from outlier detection. 
Records matching the whiteset criteria will not be flagged as outliers even if they meet the step check criteria. Returns ------- - pd.DatetimeIndex - Timestamps of outlier records. + QCresult + Quality control result object containing flags, outliers, and details + for the step check. Notes ----- @@ -51,7 +54,12 @@ def step_check( threshold. This is because a temperature drop is meteorologically more common than a sudden increase, which is often the result of a radiation error. """ - + checksettings = { + "max_increase_per_second": max_increase_per_second, + "max_decrease_per_second": max_decrease_per_second, + "sensorwhiteset": sensorwhiteset, + } + # Validate argument values if max_decrease_per_second > 0: raise ValueError("max_decrease_per_second must be negative!") @@ -59,22 +67,22 @@ def step_check( raise ValueError("max_increase_per_second must be positive!") # Drop outliers from the series (these are NaNs) - input_series = records.dropna() + to_check_records = records.dropna() # Calculate timedelta between rows - time_diff = input_series.index.to_series().diff() + time_diff = to_check_records.index.to_series().diff() # Define filter step_filter = ( # Step increase ( - (input_series - input_series.shift(1)) + (to_check_records - to_check_records.shift(1)) > (float(max_increase_per_second) * time_diff.dt.total_seconds()) ) # or | # Step decrease ( - (input_series - input_series.shift(1)) + (to_check_records - to_check_records.shift(1)) < (max_decrease_per_second * time_diff.dt.total_seconds()) ) ) @@ -82,7 +90,30 @@ def step_check( outliers_idx = step_filter[step_filter].index # Catch the white records - outliers_idx = sensorwhiteset.catch_white_records(outliers_idx) + outliers_after_white_idx = sensorwhiteset.catch_white_records(outliers_idx) + + flags = create_qcresult_flags( + all_input_records=records, + unmet_cond_idx = pd.DatetimeIndex([]), + outliers_before_white_idx=outliers_idx, + outliers_after_white_idx=outliers_after_white_idx, + ) + - logger.debug("Exiting function step_check") - return outliers_idx + qcresult = QCresult( + checkname="step", + checksettings=checksettings, + flags=flags, + detail='no details' + ) + + #Create and add details + if not outliers_after_white_idx.empty: + detailseries = pd.Series( + data = f'step > {max_increase_per_second:.4g} per second or step < {max_decrease_per_second:.4g} per second', + index = outliers_after_white_idx + ) + qcresult.add_details_by_series(detail_series = detailseries) + return qcresult + + \ No newline at end of file diff --git a/src/metobs_toolkit/qc_collection/window_variation_check.py b/src/metobs_toolkit/qc_collection/window_variation_check.py index 93f36b1c..e1ba22af 100644 --- a/src/metobs_toolkit/qc_collection/window_variation_check.py +++ b/src/metobs_toolkit/qc_collection/window_variation_check.py @@ -1,9 +1,11 @@ import logging from typing import Union import pandas as pd +from numpy import nan -from .common_functions import test_moving_window_condition +from .common_functions import test_moving_window_condition, create_qcresult_flags from .whitelist import SensorWhiteSet +from metobs_toolkit.qcresult import QCresult from metobs_toolkit.backend_collection.decorators import log_entry from metobs_toolkit.backend_collection.datetime_collection import ( timestamps_to_datetimeindex, @@ -20,11 +22,11 @@ def window_variation_check( max_increase_per_second: Union[int, float], max_decrease_per_second: Union[int, float], sensorwhiteset: SensorWhiteSet, -) -> pd.DatetimeIndex: +) -> QCresult: """ Test if the increase or 
decrease in a time window exceeds a threshold. - This function checks if the variation of observations in time does not exceed a threshold. + This function checks whether the variation of observations in time exceeds a threshold. It applies a moving window over the time series, defined by a duration (`timewindow`), and tests if the window contains at least a minimum number of records (`min_records_per_window`). @@ -74,9 +76,20 @@ if max_increase_per_second < 0: raise ValueError("max_increase_per_second must be positive!") + checksettings = { + "timewindow": timewindow, + "min_records_per_window": min_records_per_window, + "max_increase_per_second": max_increase_per_second, + "max_decrease_per_second": max_decrease_per_second, + "sensorwhiteset": sensorwhiteset, + } + + # Drop outliers from the series (these are NaNs) + to_check_records = records.dropna() + # Test if the conditions for the moving window are met by the records frequency is_met = test_moving_window_condition( - records=records, + records=records, # pass the full records, because the frequency is estimated from them windowsize=timewindow, min_records_per_window=min_records_per_window, ) @@ -84,10 +97,22 @@ logger.warning( "The minimum number of window members for the window variation check is not met!" ) - return timestamps_to_datetimeindex(timestamps=[], name="datetime") + flags = create_qcresult_flags( + all_input_records=records, + unmet_cond_idx=to_check_records.index, + outliers_before_white_idx=pd.DatetimeIndex([]), + outliers_after_white_idx=pd.DatetimeIndex([]), + ) + + qcresult = QCresult( + checkname="window_variation", + checksettings=checksettings, + flags=flags, + detail=f"Minimum number of records ({min_records_per_window}) per window ({timewindow}) not met.", + ) + return qcresult - # Drop outliers from the series (these are NaNs) - input_series = records.dropna() + # Calculate window thresholds (by linear extrapolation) max_window_increase = ( @@ -126,17 +151,44 @@ def variation_test(window: pd.Series) -> int: return 0 # Apply rolling window - window_outliers = input_series.rolling( + window_flags = to_check_records.rolling( window=timewindow, closed="both", center=True, min_periods=min_records_per_window, ).apply(variation_test) - outliers_idx = window_outliers.loc[window_outliers == 1].index - + # The returns are numeric values (0 --> ok, NaN --> not checked (members/window condition not met), 1 --> outlier) + window_flags = window_flags.map( + {0.0: 'pass', #Dummy label + nan: 'unmet', #Dummy label + 1.0: 'flagged'}) #Dummy label + + # Filter outliers + outliers_idx = window_flags.loc[window_flags == 'flagged'].index # Catch the white records - outliers_idx = sensorwhiteset.catch_white_records(outliers_idx=outliers_idx) - - logger.debug("Exiting function window_variation_check") - return outliers_idx + outliers_after_white_idx = sensorwhiteset.catch_white_records(outliers_idx=outliers_idx) + + #Create flags + flags = create_qcresult_flags( + all_input_records=records, + unmet_cond_idx=window_flags[window_flags == 'unmet'].index, + outliers_before_white_idx=outliers_idx, + outliers_after_white_idx=outliers_after_white_idx) + + qcresult = QCresult( + checkname="window_variation", + checksettings=checksettings, + flags=flags, + detail='no details' + ) + + #Create and add details + if not outliers_after_white_idx.empty: + detailseries = pd.Series( + data = f'Variation in {timewindow} window exceeds max increase of {max_window_increase} or max decrease of {max_window_decrease}.',
index = outliers_after_white_idx + ) + qcresult.add_details_by_series(detail_series = detailseries) + + return qcresult diff --git a/src/metobs_toolkit/qcresult.py b/src/metobs_toolkit/qcresult.py new file mode 100644 index 00000000..8d63e609 --- /dev/null +++ b/src/metobs_toolkit/qcresult.py @@ -0,0 +1,203 @@ +from __future__ import annotations +import logging +from typing import Literal, Union, TYPE_CHECKING + +import numpy as np +import pandas as pd + +from metobs_toolkit.backend_collection.decorators import log_entry +from metobs_toolkit.settings_collection.settings import Settings +logger = logging.getLogger("") + + + +#Basic labels +pass_cond = Settings.get("qc_status_labels_per_check.pass.label") # checked and passed successfully +flagged_cond = Settings.get("qc_status_labels_per_check.flagged.label") # checked and flagged as outlier +unmet_cond = Settings.get("qc_status_labels_per_check.condition_unmet.label") #not checked due to unmet specific conditions +saved_cond = Settings.get("qc_status_labels_per_check.saved_whitelist.label") #checked and flagged but saved due to whitelist +unchecked_cond = Settings.get("qc_status_labels_per_check.unchecked.label") #not checked (was nan/gap before check) + + + + +class QCresult: + """Store results of a quality control check. + + This class encapsulates the results of a single QC check: the flags and + detailed information about the check outcome for all timestamps. The + outlier timestamps are derived from the flags. + + Parameters + ---------- + checkname : str + Name identifying the quality control check (e.g., 'gross_value', 'persistence'). + checksettings : dict + Dictionary of parameters and settings used for this QC check. + flags : pd.Series + Series with datetime index containing QC flag strings for all timestamps: + 'passed', 'flagged', 'condition_unmet', 'saved', or 'unchecked'. + detail : str, optional + Default detail string for all timestamps. Can be updated for specific + timestamps using add_details_by_series. Default is an empty string. + + Attributes + ---------- + checkname : str + Name of the QC check. + checksettings : dict + Settings used for the check. + flags : pd.Series + QC flags for all timestamps. + details : pd.Series + Detailed information for each timestamp. + """ + + def __init__( + self, + checkname: str, + checksettings: dict, + flags: pd.Series, # index: timestamps, values: 'passed', 'flagged', 'condition_unmet', 'saved', 'unchecked' + # outliers: pd.Series, # index: timestamps, values: outlier values + detail: str = "", + ): + self.checkname = checkname + self.checksettings = checksettings + + if not isinstance(flags.index, pd.DatetimeIndex): + raise TypeError("The index of 'flags' must be a pandas.DatetimeIndex.") + self.flags = flags + + + + # if not isinstance(outliers.index, pd.DatetimeIndex): + # raise TypeError("The index of 'outliers' must be a pandas.DatetimeIndex.") + # self.outliers = outliers + + #Set details (index equals the flags index, thus includes all timestamps!) + self.details = pd.Series([detail] * len(flags), + index=flags.index) + + + def __repr__(self) -> str: + return f"QCresult(checkname={self.checkname})" + + + + @log_entry + def add_details_by_series(self, detail_series: pd.Series) -> None: + """Update the details attribute with values from a detail_series.
+ + This method updates the details attribute (a pandas Series with datetime + index) using index-value pairs from the provided detail_series. The + detail_series index must be a subset of the details attribute index. + + Parameters + ---------- + detail_series : pd.Series + A pandas Series with datetime index containing detail values to + update. The index should be a subset of the details attribute index. + + Raises + ------ + TypeError + If detail_series is not a pandas Series or if its index is not a + pandas DatetimeIndex. + """ + if not isinstance(detail_series, pd.Series): + raise TypeError("detail_series must be a pandas.Series.") + if not isinstance(detail_series.index, pd.DatetimeIndex): + raise TypeError("The index of 'detail_series' must be a pandas.DatetimeIndex.") + + # Update details using the index-value pairs from detail_series + self.details.update(detail_series) + + def get_outlier_timestamps(self) -> pd.DatetimeIndex: + """Return the timestamps of the outliers (records flagged by this check).""" + + return self.flags[self.flags == flagged_cond].index + # return self.outliers.index + + + def remap_timestamps(self, mapping: dict) -> None: + """Remap the timestamps of the flags and details using a mapping dictionary. + + Parameters + ---------- + mapping : dict + A dictionary where keys are original timestamps and values are the + new timestamps to map to. + """ + self.flags.index = self.flags.index.map(lambda ts: mapping.get(ts, ts)) + # self.outliers.index = self.outliers.index.map(lambda ts: mapping.get(ts, ts)) + self.details.index = self.details.index.map(lambda ts: mapping.get(ts, ts)) + + def _flags_to_basic_labels(self) -> dict: + """Create mapping from QC flag values to display labels. + + Constructs a dictionary mapping internal flag values ('passed', 'flagged', etc.) + to user-facing label strings defined in Settings. Flagged records use the + check-specific label, while all other statuses use the 'goodrecord' label. + + Returns + ------- + dict + Mapping from flag strings to label strings. + """ + label_mapping = { + pass_cond: Settings.get('label_def.goodrecord.label'), + flagged_cond: Settings.get(f'label_def.{self.checkname}.label'), + unmet_cond: Settings.get('label_def.goodrecord.label'), + saved_cond: Settings.get('label_def.goodrecord.label'), + unchecked_cond: Settings.get('label_def.goodrecord.label') + } + return label_mapping + + def create_outliersdf(self, + map_to_basic_labels=True, + subset_to_outliers=True) -> pd.DataFrame: + """Create a DataFrame summarizing detected outliers. + + Constructs a DataFrame containing the label and detailed information for + each outlier timestamp. This format is compatible with the + Dataset.outliersdf property; the observation values themselves are not + stored on the QCresult and are re-attached at the SensorData level. + + Parameters + ---------- + map_to_basic_labels : bool, optional + If True, map the internal flag values to the display labels defined in Settings. + Default is True. + subset_to_outliers : bool, optional + If True, restrict the DataFrame to the flagged timestamps only. + Default is True. + + Returns + ------- + pd.DataFrame + DataFrame with datetime index and columns: + - 'label': human-readable QC check labels + - 'details': descriptive information about each outlier + Returns empty DataFrame with correct structure if no outliers exist.
+ """ + outl_timestamps = self.get_outlier_timestamps() + + + if subset_to_outliers: + if outl_timestamps.empty: + # return empty dataframe + return pd.DataFrame( + columns=["label", "details"], + index=pd.DatetimeIndex([], name="datetime"), + ) + targets = self.flags.loc[outl_timestamps] + else: + targets = self.flags + + if map_to_basic_labels: + labels = targets.map(self._flags_to_basic_labels()) + else: + labels = targets + + + outliers_df = pd.DataFrame({ + 'datetime': targets.index, + # 'value': self.outliers.values, + 'label': labels.values, + 'details': self.details.loc[targets.index].values, + + }) + outliers_df.set_index('datetime', inplace=True) + return outliers_df + \ No newline at end of file diff --git a/src/metobs_toolkit/sensordata.py b/src/metobs_toolkit/sensordata.py index 0718f57e..29c4350a 100644 --- a/src/metobs_toolkit/sensordata.py +++ b/src/metobs_toolkit/sensordata.py @@ -22,6 +22,9 @@ from metobs_toolkit.gf_collection.overview_df_constructors import ( sensordata_gap_status_overview_df, ) +from metobs_toolkit.qc_collection.overview_df_constructor import ( + sensordata_qc_overview_df, +) from metobs_toolkit.settings_collection import Settings from metobs_toolkit.xrconversions import sensordata_to_xr from metobs_toolkit.timestampmatcher import TimestampMatcher @@ -33,6 +36,14 @@ MetObsAdditionError, MetObsInternalError, ) +from metobs_toolkit.qcresult import ( + QCresult, + pass_cond, + flagged_cond, + unmet_cond, + saved_cond, + unchecked_cond +) from metobs_toolkit.plot_collection import sensordata_simple_pd_plot import metobs_toolkit.backend_collection.printing_collection as printing @@ -108,7 +119,8 @@ def __init__( self.series = data # datetime as index # outliers - self.outliers = [] # List of {'checkname': ..., 'df': ....., 'settings': } + self.outliers = [] # List of QCresult + self.outliers_values_bin = pd.Series(dtype=datadtype) # Series of outlier values # gaps self.gaps = [] # list of Gap's @@ -143,6 +155,7 @@ def __str__(self) -> str: """Return a string representation of the SensorData object.""" return f"{self.obstype.name} data of station {self.stationname}." + #TODO: update this method to handle QCresult outliers + outliers_values_bin def __add__(self, other: "SensorData") -> "SensorData": """ Combine two SensorData objects for the same station and obstype. 
@@ -245,39 +258,19 @@ def df(self) -> pd.DataFrame: def to_xr(self) -> xrDataset: return sensordata_to_xr(self, fmt_datetime_coordinate=True) + #TODO: update this method to handle QCresult outliers @property def outliersdf(self) -> pd.DataFrame: """Return a DataFrame of the outlier records.""" logger.debug("Creating outliers DataFrame for %s", self.stationname) to_concat = [] - for outlierinfo in self.outliers: - checkname = outlierinfo["checkname"] - checkdf = outlierinfo["df"].copy() - checkdf["label"] = Settings.get(f"label_def.{checkname}.label") - - # Create details column from all columns except 'value' and 'label' - detail_cols = [ - col for col in checkdf.columns if col not in ["value", "label"] - ] - if detail_cols: - # Build details string for each row - details_list = [] - for _, row in checkdf.iterrows(): - parts = [ - f"{col}: {row[col]}" - for col in detail_cols - if pd.notna(row[col]) - ] - details_list.append(", ".join(parts)) - checkdf["details"] = details_list - # Drop the original detail columns - checkdf = checkdf.drop(columns=detail_cols) - else: - checkdf["details"] = "" - + for qcresult in self.outliers: + checkdf = qcresult.create_outliersdf(subset_to_outliers=True) to_concat.append(checkdf) totaldf = save_concat(to_concat) + #add the values column (values not stored in qcresult, only labels and details) + totaldf["value"] = self.outliers_values_bin.loc[totaldf.index] if totaldf.empty: # return empty dataframe @@ -285,12 +278,12 @@ def outliersdf(self) -> pd.DataFrame: columns=["value", "label", "details"], index=pd.DatetimeIndex([], name="datetime"), ) - else: - totaldf.sort_index(inplace=True) + + totaldf.sort_index(inplace=True) - logger.debug("Outliers DataFrame created successfully for %s", self.stationname) - return totaldf + return totaldf[['value', 'label', 'details']] #fixed column order + @property def gapsdf(self) -> pd.DataFrame: """Return a DataFrame of the gap records.""" @@ -309,6 +302,10 @@ def gapsdf(self) -> pd.DataFrame: def gap_overview_df(self) -> pd.DataFrame: return sensordata_gap_status_overview_df(self) + @copy_doc(sensordata_qc_overview_df) + def qc_overview_df(self) -> pd.DataFrame: + return sensordata_qc_overview_df(self) + @property def stationname(self) -> str: """Return the name of the station this SensorData belongs to.""" @@ -401,10 +398,15 @@ def _setup( self.duplicated_timestamp_check() if apply_invalid_check: - # invalid check - self.invalid_value_check( - skip_records=self.outliers[0]["df"].index - ) # skip the records already labeled as duplicates + # get the records that are flagged by the + if self.outliers[0].checkname =='duplicated_timestamp': + dup_outl_ti = self.outliers[0].get_outlier_timestamps() + else: + dup_outl_ti = pd.DatetimeIndex([]) + # invalid check (no qcresult, these timestamps are removed, and catched by gapcheck) + valid_records = qc.drop_invalid_values(records=self.series, + skip_records=dup_outl_ti) + self.series = valid_records if apply_unit_conv: # convert units to standard units @@ -427,16 +429,8 @@ def _setup( # update the outliers (replace the raw timestamps with the new) outl_datetime_map = timestamp_matcher.get_outlier_map() - for outlinfo in self.outliers: - outlinfo["df"]["new_datetime"] = outlinfo["df"].index.map(outl_datetime_map) - outlinfo["df"] = ( - outlinfo["df"] - .reset_index() - .rename( - columns={"datetime": "raw_timestamp", "new_datetime": "datetime"} - ) - .set_index("datetime") - ) + for qcresult in self.outliers: + qcresult.remap_timestamps(mapping=outl_datetime_map) # create gaps 
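The reworked SensorData.outliersdf above concatenates the per-check overview frames and re-attaches the observation values from outliers_values_bin. Below is a small plain-pandas sketch of that assembly with toy data; the station values, label strings and detail texts are placeholders, not toolkit output.

import numpy as np
import pandas as pd

idx = pd.date_range("2024-01-01", periods=4, freq="1h", name="datetime")
series = pd.Series([12.0, 35.0, 12.5, -40.0], index=idx)  # toy sensor values

# Per-check overviews (label + details only), as returned by QCresult.create_outliersdf
gross = pd.DataFrame({"label": "gross value outlier", "details": "value > 30.0"}, index=idx[[1]])
step = pd.DataFrame({"label": "step outlier", "details": "step < -10 per hour"}, index=idx[[3]])

# Flagged values are stashed in a separate bin before the series entries are set to NaN
outliers_values_bin = pd.concat([series.loc[idx[[1]]], series.loc[idx[[3]]]])
series.loc[idx[[1, 3]]] = np.nan

# Assemble the outliers overview and re-attach the stored values
totaldf = pd.concat([gross, step]).sort_index()
totaldf["value"] = outliers_values_bin.loc[totaldf.index]
print(totaldf[["value", "label", "details"]])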
@@ -451,62 +445,23 @@ def _setup( missingrecords=timestamp_matcher.gap_records, target_freq=pd.to_timedelta(timestamp_matcher.target_freq), ) - - def _update_outliers( - self, - qccheckname: str, - outliertimestamps: pd.DatetimeIndex, - check_kwargs: dict, - extra_columns: dict = {}, - overwrite: bool = False, - ) -> None: - """ - Update the outliers attribute. - - Parameters - ---------- - qccheckname : str - Name of the quality control check. - outliertimestamps : pd.DatetimeIndex - Datetime index of the outliers. - check_kwargs : dict - Additional arguments for the check. - extra_columns : dict, optional - Extra columns to add to the outliers DataFrame, by default {}. - overwrite : bool, optional - Whether to overwrite existing outliers, by default False. - - Raises - ------ - MetobsQualityControlError - If the check is already applied and overwrite is False. - """ - logger.debug( - "Entering _update_outliers for %s with check %s", self, qccheckname - ) - - for applied_qc_info in self.outliers: - if qccheckname == applied_qc_info.keys(): - if overwrite: - self.outliers.remove(applied_qc_info) - else: - raise MetObsQualityControlError( - f"The {qccheckname} is already applied on {self}. Fix error or set overwrite=True" - ) - - outlier_values = self.series.loc[outliertimestamps] - outlier_values = outlier_values[~outlier_values.index.duplicated(keep="first")] - - datadict = {"value": outlier_values.to_numpy()} - datadict.update(extra_columns) - df = pd.DataFrame(data=datadict, index=outlier_values.index) - - self.outliers.append( - {"checkname": qccheckname, "df": df, "settings": check_kwargs} - ) - - self.series.loc[outliertimestamps] = np.nan - + + def _update_outliers(self, + qcresult: QCresult, + overwrite: bool = False) -> None: + + #add the results to the outliers list + self.outliers.append(qcresult) + + #Fill the outliers value bin + self.outliers_values_bin = pd.concat([self.outliers_values_bin, + self.series.loc[qcresult.get_outlier_timestamps()]]) + + + #convert the outlier timestamps to NaN in the series + self.series.loc[qcresult.get_outlier_timestamps()] = np.nan + + def _find_gaps(self, missingrecords: pd.Series, target_freq: pd.Timedelta) -> list: """ Identify gaps in the missing records based on the target frequency. @@ -553,6 +508,7 @@ def _rename(self, trgname: str) -> None: gap.name = str(trgname) @log_entry + #TODO: update this method to handle QCresult outliers def convert_outliers_to_gaps(self) -> None: """ Convert all outliers to gaps. 
@@ -646,19 +602,20 @@ def resample( # update the outliers (replace the raw timestamps with the new) outl_datetime_map = timestampmatcher.get_outlier_map() for outlinfo in self.outliers: - # add mapped timestamps - outlinfo["df"]["new_datetime"] = outlinfo["df"].index.map(outl_datetime_map) - # reformat the dataframe - outlinfo["df"] = ( - outlinfo["df"] - .reset_index() - .rename( - columns={"datetime": "raw_timestamp", "new_datetime": "datetime"} - ) - .set_index("datetime") - ) - # Drop references to NaT datetimes (when qc is applied before resampling) - outlinfo["df"] = outlinfo["df"].loc[outlinfo["df"].index.notnull()] + outlinfo.remap_timestamps(mapping=outl_datetime_map) + # # add mapped timestamps + # outlinfo["df"]["new_datetime"] = outlinfo["df"].index.map(outl_datetime_map) + # # reformat the dataframe + # outlinfo["df"] = ( + # outlinfo["df"] + # .reset_index() + # .rename( + # columns={"datetime": "raw_timestamp", "new_datetime": "datetime"} + # ) + # .set_index("datetime") + # ) + # # Drop references to NaT datetimes (when qc is applied before resampling) + # outlinfo["df"] = outlinfo["df"].loc[outlinfo["df"].index.notnull()] # create gaps orig_gapsdf = self.gapsdf @@ -784,49 +741,7 @@ def pd_plot(self, show_labels: list = ["ok"], **pdplotkwargs) -> Axes: # Quality Control (technical qc + value-based qc) # ------------------------------------------ - @log_entry - def invalid_value_check(self, skip_records: pd.DatetimeIndex) -> None: - """ - Check for invalid values in the series. - - Invalid values are those that could not be cast to numeric. - - Parameters - ---------- - skip_records : pd.DatetimeIndex - Records to skip during the check. - - Raises - ------ - MetobsQualityControlError - If the check is already applied. - """ - - skipped_data = self.series.loc[skip_records] - targets = self.series.drop(skip_records) - - # Option 1: Create a outlier label for these invalid inputs, - # and treath them as outliers - # outlier_timestamps = targets[~targets.notnull()] - - # self._update_outliers( - # qccheckname="invalid_input", - # outliertimestamps=outlier_timestamps.index, - # check_kwargs={}, - # extra_columns={}, - # overwrite=False, - # ) - - # Option 2: Since there is not numeric value present, these timestamps are - # interpreted as gaps --> remove the timestamp, so that it is captured by the - # gap finder. - - # Note: do not treat the first/last timestamps differently. That is - # a philosiphycal choice. - - self.series = targets[targets.notnull()] # subset to numerical casted values - # add the skipped records back - self.series = pd.concat([self.series, skipped_data]).sort_index() + @log_entry def duplicated_timestamp_check(self) -> None: @@ -838,22 +753,17 @@ def duplicated_timestamp_check(self) -> None: MetobsQualityControlError If the check is already applied. 
""" - - duplicates = pd.Series( - data=self.series.index.duplicated(keep=False), index=self.series.index - ) - duplicates = duplicates.loc[duplicates] - duplicates = duplicates[duplicates.index.duplicated(keep="first")] - - self._update_outliers( - qccheckname="duplicated_timestamp", - outliertimestamps=duplicates.index, - check_kwargs={}, - extra_columns={}, - overwrite=False, - ) - + qcresult = qc.duplicated_timestamp_check(records=self.series) + + #drop duplicates for the series, keep only the first occurrence + #(These values will then be put to Nan in the update) self.series = self.series[~self.series.index.duplicated(keep="first")] + + #Update the outliers + self._update_outliers(qcresult=qcresult, overwrite=False) + + + @log_entry def gross_value_check(self, **qckwargs) -> None: @@ -866,12 +776,9 @@ def gross_value_check(self, **qckwargs) -> None: Additional keyword arguments for the check. """ - outlier_timestamps = qc.gross_value_check(records=self.series, **qckwargs) + qcresult = qc.gross_value_check(records=self.series, **qckwargs) self._update_outliers( - qccheckname="gross_value", - outliertimestamps=outlier_timestamps, - check_kwargs={**qckwargs}, - extra_columns={}, + qcresult=qcresult, overwrite=False, ) @@ -886,13 +793,9 @@ def persistence_check(self, **qckwargs) -> None: Additional keyword arguments for the check. """ - outlier_timestamps = qc.persistence_check(records=self.series, **qckwargs) - + qcresult = qc.persistence_check(records=self.series, **qckwargs) self._update_outliers( - qccheckname="persistence", - outliertimestamps=outlier_timestamps, - check_kwargs={**qckwargs}, - extra_columns={}, + qcresult=qcresult, overwrite=False, ) @@ -907,13 +810,9 @@ def repetitions_check(self, **qckwargs) -> None: Additional keyword arguments for the check. """ - outlier_timestamps = qc.repetitions_check(records=self.series, **qckwargs) - + qcresult = qc.repetitions_check(records=self.series, **qckwargs) self._update_outliers( - qccheckname="repetitions", - outliertimestamps=outlier_timestamps, - check_kwargs={**qckwargs}, - extra_columns={}, + qcresult=qcresult, overwrite=False, ) @@ -928,13 +827,9 @@ def step_check(self, **qckwargs) -> None: Additional keyword arguments for the check. """ - outlier_timestamps = qc.step_check(records=self.series, **qckwargs) - + qcresult = qc.step_check(records=self.series, **qckwargs) self._update_outliers( - qccheckname="step", - outliertimestamps=outlier_timestamps, - check_kwargs={**qckwargs}, - extra_columns={}, + qcresult=qcresult, overwrite=False, ) @@ -949,13 +844,9 @@ def window_variation_check(self, **qckwargs) -> None: Additional keyword arguments for the check. """ - outlier_timestamps = qc.window_variation_check(records=self.series, **qckwargs) - + qcresult = qc.window_variation_check(records=self.series, **qckwargs) self._update_outliers( - qccheckname="window_variation", - outliertimestamps=outlier_timestamps, - check_kwargs={**qckwargs}, - extra_columns={}, + qcresult=qcresult, overwrite=False, ) @@ -983,43 +874,29 @@ def get_qc_freq_statistics(self) -> pd.DataFrame: excluded from the check due to previous QC checks. """ - - infodict = {} # checkname : details - ntotal = self.series.shape[0] # gaps included !! 
- already_rejected = self.gapsdf.shape[0] # initial gap records - # add the 'ok' labels - infodict[Settings.get("label_def.goodrecord.label")] = { - "N_all": ntotal, - "N_labeled": self.series[self.series.notnull()].shape[0], - } - # add the 'gap' labels - # TODO: I think the filled and failed labels must be included as well - infodict[Settings.get("label_def.regular_gap.label")] = { - "N_all": ntotal, - "N_labeled": already_rejected, - } - - # add the qc check labels - for check in self.outliers: - n_outliers = check["df"].shape[0] - n_checked = ntotal - already_rejected - outlierlabel = Settings.get(f"label_def.{check['checkname']}.label") - infodict[outlierlabel] = { - "N_labeled": n_outliers, - "N_checked": n_checked, - "N_all": ntotal, - } - - # remove the outliers of the previous check - already_rejected = already_rejected + n_outliers - - # Convert to a dataframe - checkdf = pd.DataFrame(infodict).transpose() - checkdf.index.name = "qc_check" - checkdf["name"] = self.stationname - checkdf = checkdf.reset_index().set_index(["name", "qc_check"]) - - return checkdf + + + empty_flags = { + flagged_cond: 0, + pass_cond: 0, + unmet_cond: 0, + saved_cond: 0, + unchecked_cond: 0} + + qcdict = {} + for qcres in self.outliers: + qcdict[qcres.checkname] = empty_flags | qcres.flags.value_counts().to_dict() + + #Convert to a pandas series with multiindex ['checkname', 'flag'] and the name is 'counts' + + qcdf = pd.DataFrame.from_dict(qcdict, orient='index') + qcdf.index.name = 'checkname' + qcseries = qcdf.stack(future_stack=True) + qcseries.index = qcseries.index.set_names('flag', level=-1) + qcseries.name = 'counts' + return qcseries + + # ------------------------------------------ # Gaps related diff --git a/src/metobs_toolkit/settings_collection/label_defenitions.py b/src/metobs_toolkit/settings_collection/label_defenitions.py index 8b2d0607..13877ae0 100644 --- a/src/metobs_toolkit/settings_collection/label_defenitions.py +++ b/src/metobs_toolkit/settings_collection/label_defenitions.py @@ -193,3 +193,26 @@ "buddy_check", "buddy_check_with_safetynets", ] + + +# ============================================================================ +# labels per QC check +# ============================================================================ + +per_check_possible_labels = { + 'pass': {'label':'passed', + "plotkwargs": {"color": "#00a824"} + }, + 'flagged': {'label':'flagged', + "plotkwargs": {"color": "#f0051c"} + }, + 'condition_unmet': {'label':'condition_unmet', + "plotkwargs": {"color": "#808080"} + }, + 'saved_whitelist': {'label':'saved', + "plotkwargs": {"color": "#0000ff"} + }, + 'unchecked': {'label':'unchecked', + "plotkwargs": {"color": "#f7cf07"} + }, +} \ No newline at end of file diff --git a/src/metobs_toolkit/settings_collection/plotting_defaults.py b/src/metobs_toolkit/settings_collection/plotting_defaults.py index ba0e7aa4..baadf72e 100644 --- a/src/metobs_toolkit/settings_collection/plotting_defaults.py +++ b/src/metobs_toolkit/settings_collection/plotting_defaults.py @@ -39,8 +39,12 @@ # "anchor_legend_small": (-3.5, 2.2), "radius_big": 1.0, "radius_small": 0.7, - "txt_size_big_pies": 7, - "txt_size_small_pies": 5, + "txt_label_size_big_pies": 7, + "txt_label_size_small_pies": 5, + "fig_title_kwargs": {"fontsize": 16}, + "big_pie_title_kwargs": {"fontsize": 14}, + "small_pie_title_kwargs": {"fontsize": 10}, + } # ============================================================================= diff --git a/src/metobs_toolkit/settings_collection/settings.py 
b/src/metobs_toolkit/settings_collection/settings.py index 06bb6776..6f4f6dd1 100644 --- a/src/metobs_toolkit/settings_collection/settings.py +++ b/src/metobs_toolkit/settings_collection/settings.py @@ -14,6 +14,7 @@ import logging from typing import Dict, Any, Optional, Union from copy import deepcopy +import os # import settings modules @@ -26,7 +27,9 @@ scatter, line, vline, + per_check_possible_labels, ) + from metobs_toolkit.settings_collection.plotting_defaults import default_plot_settings logger = logging.getLogger("") @@ -74,6 +77,8 @@ class Settings: "gapfill_label_group": gapfill_label_group, "failed_gapfill_label_group": failed_gapfill_label_group, "qc_label_group": qc_label_group, + "qc_status_labels_per_check": per_check_possible_labels, + # Logging defaults "log_level": "WARNING", "log_format": "LOG:: %(levelname)s - %(message)s", @@ -87,6 +92,9 @@ class Settings: }, # Plotting defaults "plotting_settings": default_plot_settings, + + # Technical settings + "use_N_cores_for_MP": os.cpu_count() - 1 if os.cpu_count() > 1 else 1, } _config: Dict[str, Any] = {} diff --git a/src/metobs_toolkit/settings_collection/version.py b/src/metobs_toolkit/settings_collection/version.py index 6e45a91d..2978f753 100644 --- a/src/metobs_toolkit/settings_collection/version.py +++ b/src/metobs_toolkit/settings_collection/version.py @@ -1 +1 @@ -__version__ = "1.0.0a1" +__version__ = "1.0.0a13" diff --git a/src/metobs_toolkit/station.py b/src/metobs_toolkit/station.py index f3feb61d..4674cf82 100644 --- a/src/metobs_toolkit/station.py +++ b/src/metobs_toolkit/station.py @@ -36,6 +36,9 @@ from metobs_toolkit.gf_collection.overview_df_constructors import ( station_gap_status_overview_df, ) +from metobs_toolkit.qc_collection.overview_df_constructor import ( + station_qc_overview_df, +) from metobs_toolkit.backend_collection.filter_modeldatadf import filter_modeldatadf from metobs_toolkit.geedatasetmanagers import default_datasets as default_gee_datasets from metobs_toolkit.sensordata import SensorData @@ -46,7 +49,7 @@ from metobs_toolkit.io_collection.filewriters import fmt_output_filepath if TYPE_CHECKING: - from matplotlib.pyplot import Axes + from matplotlib.pyplot import Axes, Figure from os import PathLike from xarray import Dataset as xrDataset from metobs_toolkit.sensordata import SensorData @@ -1612,59 +1615,60 @@ def window_variation_check( # apply check on the sensordata self.get_sensor(obstype).window_variation_check(**qc_kwargs) + @copy_doc(station_qc_overview_df) + @log_entry + def qc_overview_df(self, subset_obstypes:Union[list[str], None] = None) -> pd.DataFrame: + return station_qc_overview_df(self, subset_obstypes=subset_obstypes) + @log_entry def get_qc_stats( self, obstype: str = "temp", make_plot: bool = True - ) -> pd.DataFrame: + ) -> Union[dict[str, pd.Series], Figure]: """ - Generate quality control (QC) frequency statistics. - - This method calculates the frequency statistics for various QC checks - applied, and other labels. The order of checks is taken into - account. + Summarize QC label frequencies for one station and optionally plot pies. - Frequency of labels is computed based on the set of all labels (for all - records including gaps). The effectiveness of a check is shown by - the frequency of outliers with respect to the number of records that were given - to the check (thus taking into account the order of checks). - - The frequencies are returned in a dataframe, and can be plotted - as pie charts. 
+ The method gathers: + * final label counts across all records (including gaps) from ``SensorData.df``; + * outlier-only label counts from ``SensorData.outliersdf``; + * per-check outcome counts (flags per QC check) via ``SensorData.get_qc_freq_statistics``. + When ``make_plot`` is True, these counts are visualized with + ``plotting.qc_overview_pies``. Parameters ---------- obstype : str, optional - The target observation type for which to compute frequency statistics, by default "temp". + Observation type to evaluate, by default "temp". make_plot : bool, optional - If True, a figure with pie charts representing the frequencies is generated. The default is True. + If True, return a figure with pie charts; if False, no figure is created. Default is True. Returns ------- - pandas.DataFrame - A DataFrame containing the QC frequency statistics. The DataFrame - has a multi-index with the station name and QC check label, and - includes the following columns: - - * `N_all`: Total number of records in the dataset (including gaps). - * `N_labeled`: Number of records with the specific label. - * `N_checked`: Number of records checked for the specific QC check. - + matplotlib.figure.Figure or dict[str, pandas.Series] + Figure with QC overview pies when ``make_plot`` is True; otherwise a dictionary with + keys ``all_labels``, ``outlier_labels``, and ``per_check_labels``. """ # argument checks self._obstype_is_known_check(obstype) # get freq statistics - qc_df = self.get_sensor(obstype).get_qc_freq_statistics() - + qc_specific_counts = self.get_sensor(obstype).get_qc_freq_statistics() + qc_labels_from_df = self.get_sensor(obstype).df['label'].value_counts() + qc_labels_from_outliers = self.get_sensor(obstype).outliersdf['label'].value_counts() + if make_plot: - plotdf = qc_df.reset_index().drop(columns=["name"]).set_index("qc_check") - - fig = plotting.qc_overview_pies(df=plotdf) - fig.suptitle( - f"QC frequency statistics of {obstype} on Station level: {self.name}." 
- ) + fig = plotting.qc_overview_pies( + end_labels_from_df=qc_labels_from_df, + end_labels_from_outliers=qc_labels_from_outliers, + per_check_labels=qc_specific_counts, + fig_title = f"QC frequency statistics of {obstype} on Sensor level: {self.name}.") + return fig - else: - return qc_df + + return { + "all_labels": qc_labels_from_df, + "outlier_labels": qc_labels_from_outliers, + "per_check_labels": qc_specific_counts, + } + @log_entry def make_plot_of_modeldata( diff --git a/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/all_labels/datatype.json b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/all_labels/datatype.json new file mode 100644 index 00000000..bbf7c5bc --- /dev/null +++ b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/all_labels/datatype.json @@ -0,0 +1 @@ +{"class": "Series"} \ No newline at end of file diff --git a/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/all_labels/solution_series.parquet b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/all_labels/solution_series.parquet new file mode 100644 index 00000000..99756aa4 Binary files /dev/null and b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/all_labels/solution_series.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/datatype.json b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/datatype.json index 79b62692..f73490f0 100644 --- a/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/datatype.json +++ b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/datatype.json @@ -1 +1 @@ -{"class": "DataFrame"} \ No newline at end of file +{"class": "Dict"} \ No newline at end of file diff --git a/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/outlier_labels/datatype.json b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/outlier_labels/datatype.json new file mode 100644 index 00000000..bbf7c5bc --- /dev/null +++ b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/outlier_labels/datatype.json @@ -0,0 +1 @@ +{"class": "Series"} \ No newline at end of file diff --git a/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/outlier_labels/solution_series.parquet b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/outlier_labels/solution_series.parquet new file mode 100644 index 00000000..6ce975be Binary files /dev/null and b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/outlier_labels/solution_series.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/per_check_labels/datatype.json b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/per_check_labels/datatype.json new file mode 100644 index 00000000..bbf7c5bc --- /dev/null +++ b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/per_check_labels/datatype.json @@ -0,0 +1 @@ +{"class": "Series"} \ No newline at end of file diff --git a/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/per_check_labels/solution_series.parquet b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/per_check_labels/solution_series.parquet new file mode 100644 index 
00000000..9fa1603f Binary files /dev/null and b/tests/pkled_solutions/test_qc_solutions/testbreakingdata/test_qc_stats_check/per_check_labels/solution_series.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_df.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_df.parquet index 5dafe410..5c7db4b2 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_df.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_df.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_gapsdf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_gapsdf.parquet index b802b21c..5ec3fd78 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_gapsdf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_gapsdf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_metadf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_metadf.parquet index 7956d4ed..58731c43 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_metadf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_metadf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_modeldatadf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_modeldatadf.parquet index 5b68f8b5..57dae890 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_modeldatadf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_modeldatadf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_outliersdf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_outliersdf.parquet index e6411ee6..b765ec67 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_outliersdf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_more_iterations/solution_outliersdf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_df.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_df.parquet index 1bb14962..9fe8ac73 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_df.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_df.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_gapsdf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_gapsdf.parquet index b802b21c..5ec3fd78 100644 Binary files 
a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_gapsdf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_gapsdf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_metadf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_metadf.parquet index 7956d4ed..58731c43 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_metadf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_metadf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_modeldatadf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_modeldatadf.parquet index 5b68f8b5..57dae890 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_modeldatadf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_modeldatadf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_outliersdf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_outliersdf.parquet index b28f51d1..f7d6f4b3 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_outliersdf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_one_iteration/solution_outliersdf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_df.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_df.parquet index 8a8f0f13..56ecb056 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_df.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_df.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_gapsdf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_gapsdf.parquet index b802b21c..5ec3fd78 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_gapsdf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_gapsdf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_metadf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_metadf.parquet index 7956d4ed..58731c43 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_metadf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_metadf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_modeldatadf.parquet 
b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_modeldatadf.parquet index 5b68f8b5..57dae890 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_modeldatadf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_modeldatadf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_outliersdf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_outliersdf.parquet index f468f358..193bb52a 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_outliersdf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_big_radius/solution_outliersdf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_df.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_df.parquet index 53e6317a..c78c7e2c 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_df.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_df.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_gapsdf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_gapsdf.parquet index b802b21c..5ec3fd78 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_gapsdf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_gapsdf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_metadf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_metadf.parquet index 7956d4ed..58731c43 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_metadf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_metadf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_modeldatadf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_modeldatadf.parquet index 5b68f8b5..57dae890 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_modeldatadf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_modeldatadf.parquet differ diff --git a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_outliersdf.parquet b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_outliersdf.parquet index 12ecd1d0..8b65b52d 100644 Binary files a/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_outliersdf.parquet and b/tests/pkled_solutions/test_qc_solutions/testdemodata/test_buddy_check_with_safety_nets/solution_outliersdf.parquet differ 
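The pickled solutions added for test_qc_stats_check store the three Series that get_qc_stats(make_plot=False) now returns. The sketch below illustrates that return structure with made-up counts; the per_check_labels Series mirrors the stacking done in get_qc_freq_statistics (future_stack follows the implementation and assumes pandas >= 2.1), and the label strings are placeholders.

import pandas as pd

possible_flags = ["passed", "flagged", "condition_unmet", "saved", "unchecked"]
empty_counts = {flag: 0 for flag in possible_flags}

# Made-up per-check raw counts, merged onto the zero-initialized template
per_check = {
    "gross_value": empty_counts | {"passed": 140, "flagged": 4},
    "step": empty_counts | {"passed": 130, "flagged": 2, "condition_unmet": 12},
}
qcdf = pd.DataFrame.from_dict(per_check, orient="index")
qcdf.index.name = "checkname"
per_check_labels = qcdf.stack(future_stack=True)
per_check_labels.index = per_check_labels.index.set_names("flag", level=-1)
per_check_labels.name = "counts"

# Dict-of-Series structure returned when make_plot=False (placeholder label counts)
qc_stats = {
    "all_labels": pd.Series({"ok": 150, "gross value outlier": 4, "gap": 2}),
    "outlier_labels": pd.Series({"gross value outlier": 4, "step outlier": 2}),
    "per_check_labels": per_check_labels,
}
print(qc_stats["per_check_labels"])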
diff --git a/tests/solutionclass.py b/tests/solutionclass.py index 06574a2a..a6f050c1 100644 --- a/tests/solutionclass.py +++ b/tests/solutionclass.py @@ -178,6 +178,11 @@ def _store_dataframe(df: pd.DataFrame, dir_path: Path) -> None: _write_datatype(dir_path, "DataFrame") df.to_parquet(dir_path / "solution_df.parquet") +def _store_series(series: pd.Series, dir_path: Path) -> None: + """Store a Series solution.""" + _write_datatype(dir_path, "Series") + series.to_frame().to_parquet(dir_path / "solution_series.parquet") + def _store_dataset(dataset, dir_path: Path) -> None: """Store a Dataset solution.""" @@ -221,6 +226,8 @@ def _store_dict(data_dict: dict, dir_path: Path) -> None: _store_station(val, key_dir) elif isinstance(val, pd.DataFrame): _store_dataframe(val, key_dir) + elif isinstance(val, pd.Series): + _store_series(val, key_dir) else: raise NotImplementedError( f"store_dict does not support {obj_classname} objects." @@ -251,6 +258,10 @@ def _read_dataframe(dir_path: Path) -> pd.DataFrame: """Read a DataFrame solution.""" return pd.read_parquet(dir_path / "solution_df.parquet") +def _read_series(dir_path: Path) -> pd.Series: + """Read a Series solution.""" + return pd.read_parquet(dir_path / "solution_series.parquet").iloc[:,0] + def _read_dataset(dir_path: Path) -> SerializedDataset: """Read a Dataset solution.""" @@ -285,6 +296,8 @@ def _read_dict(dir_path: Path) -> dict: result[subdir.stem] = _read_station(subdir) elif solution_class == "DataFrame": result[subdir.stem] = _read_dataframe(subdir) + elif solution_class == "Series": + result[subdir.stem] = _read_series(subdir) else: raise NotImplementedError( f"read_dict does not support {solution_class} objects." diff --git a/tests/test_qc.py b/tests/test_qc.py index 23c382e5..2410bcb5 100644 --- a/tests/test_qc.py +++ b/tests/test_qc.py @@ -129,17 +129,17 @@ def test_qc_with_solution(self, regular_qc_on_dataset, overwrite_solution=False) **TestBreakingDataset.solkwargs, methodname=method_name ) - assert_equality(dataset, solutionobj) # dataset comparison + assert_equality(dataset, solutionobj, exclude_columns=['details'] ) # dataset comparison def test_qc_stats_check(self, regular_qc_on_dataset, overwrite_solution=False): method_name = "test_qc_stats_check" dataset = copy.deepcopy(regular_qc_on_dataset) - df = dataset.get_qc_stats(obstype="temp", make_plot=False) + countdicts = dataset.get_qc_stats(obstype="temp", make_plot=False) if overwrite_solution: TestBreakingDataset.solutionfixer.create_solution( - solution=df, + solution=countdicts, methodname=method_name, **TestBreakingDataset.solkwargs, ) @@ -147,7 +147,9 @@ def test_qc_stats_check(self, regular_qc_on_dataset, overwrite_solution=False): solutiondf = TestBreakingDataset.solutionfixer.get_solution( methodname=method_name, **TestBreakingDataset.solkwargs ) - assert_equality(df, solutiondf) + + for key, val in countdicts.items(): + assert_equality(val, solutiondf[key]) @pytest.mark.mpl_image_compare def test_make_plot_by_label_with_outliers(self, regular_qc_on_dataset): @@ -202,6 +204,16 @@ def test_import_data(self, import_dataset, overwrite_solution=False): ) assert_equality(dataset, solutionobj) + + + def test_buddy_check_with_paralelism(self, import_dataset): + dataset = copy.deepcopy(import_dataset) + obstype = "temp" + + dataset.buddy_check(obstype=obstype, use_mp=True) + assert not dataset.outliersdf.empty + + def test_qc_when_some_stations_missing_obs(self, import_dataset): # 1. 
get_startpoint data @@ -224,6 +236,9 @@ def test_qc_when_some_stations_missing_obs(self, import_dataset): dataset.buddy_check(obstype=obstype) assert orig_count == len(dataset.stations) + + # test that get_qc_stats still works when some stations lack the obstype + dataset.get_qc_stats(obstype=obstype) def test_buddy_check_raise_errors(self, import_dataset): # 1. get_startpoint data @@ -232,6 +247,22 @@ def test_buddy_check_raise_errors(self, import_dataset): from metobs_toolkit.backend_collection.errorclasses import ( MetObsMetadataNotFound, ) + + with pytest.raises(DeprecationWarning): + # Should raise a DeprecationWarning because the deprecated min_std argument is passed + dataset.buddy_check( + obstype="temp", + spatial_buddy_radius=17000, + min_sample_size=3, + max_alt_diff=150, + min_std=1.0, # deprecated, triggers the DeprecationWarning + spatial_z_threshold=2.4, + N_iter=1, + instantaneous_tolerance=pd.Timedelta("4min"), + lapserate=-0.0065, + use_mp=False, + ) + with pytest.raises(MetObsMetadataNotFound): # Should raise error because no altitude info is available and lapsrate is not none @@ -240,7 +271,7 @@ def test_buddy_check_raise_errors(self, import_dataset): spatial_buddy_radius=17000, min_sample_size=3, max_alt_diff=150, - min_std=1.0, + min_sample_spread=1.0, spatial_z_threshold=2.4, N_iter=1, instantaneous_tolerance=pd.Timedelta("4min"), @@ -259,7 +290,7 @@ def test_buddy_check_raise_errors(self, import_dataset): spatial_buddy_radius=17000, min_sample_size=3, max_alt_diff=150, - min_std=1.0, + min_sample_spread=1.0, spatial_z_threshold=2.4, N_iter=1, instantaneous_tolerance=pd.Timedelta("4min"), @@ -278,7 +309,7 @@ def test_buddy_check_one_iteration(self, import_dataset, overwrite_solution=Fals spatial_buddy_radius=25000, min_sample_size=3, max_alt_diff=None, - min_std=1.0, + min_sample_spread=1.0, spatial_z_threshold=2.1, N_iter=1, # one iteration test instantaneous_tolerance=pd.Timedelta("4min"), @@ -308,26 +339,52 @@ def test_buddy_check_more_iterations( ): # 0. Get info of the current check _method_name = sys._getframe().f_code.co_name - dataset = copy.deepcopy(import_dataset) + # test one iteration - dataset.buddy_check( + dataset_1iter = copy.deepcopy(import_dataset) + dataset_1iter.buddy_check( obstype="temp", spatial_buddy_radius=25000, min_sample_size=3, max_alt_diff=None, - min_std=1.0, + min_sample_spread=1.0, spatial_z_threshold=2.1, - N_iter=2, # one iteration test + N_iter=1, # one iteration test instantaneous_tolerance=pd.Timedelta("4min"), lapserate=None, # -0.0065 - use_mp=False, + use_mp=True, + ) + + # Test two iterations + dataset_2iter = copy.deepcopy(import_dataset) + dataset_2iter.buddy_check( + obstype="temp", + spatial_buddy_radius=25000, + min_sample_size=3, + max_alt_diff=None, + min_sample_spread=1.0, + spatial_z_threshold=2.1, + N_iter=2, # two-iteration test + instantaneous_tolerance=pd.Timedelta("4min"), + lapserate=None, # -0.0065 + use_mp=True, ) + + # Apply relative tests + outl2 = dataset_2iter.outliersdf + outl1 = dataset_1iter.outliersdf + + # test that all outl1 indexes are also in outl2 + assert outl1.index.isin(outl2.index).all() + assert outl2.shape[0] > outl1.shape[0] + + # Absolute testing # overwrite solution?
if overwrite_solution: TestDemoDataset.solutionfixer.create_solution( - solution=dataset, + solution=dataset_2iter, **TestDemoDataset.solkwargs, methodname=_method_name, ) @@ -338,7 +395,7 @@ def test_buddy_check_more_iterations( # validate expression assert_equality( - dataset, solutionobj, exclude_columns=["details"] + dataset_2iter, solutionobj, exclude_columns=["details"] ) # dataset comparison def test_buddy_check_no_outliers(self, import_dataset): @@ -351,8 +408,9 @@ def test_buddy_check_no_outliers(self, import_dataset): spatial_buddy_radius=25000, min_sample_size=3, max_alt_diff=None, - min_std=1.0, - spatial_z_threshold=5.9, # this does noet create outliers + min_sample_spread=1.0, + spatial_z_threshold=7.4, # this does not create outliers + use_z_robust_method=True, N_iter=1, instantaneous_tolerance=pd.Timedelta("4min"), lapserate=None, # -0.0065 @@ -360,6 +418,24 @@ def test_buddy_check_no_outliers(self, import_dataset): ) assert dataset.outliersdf.empty + + # Extra test: the same settings with the non-robust z-method do create outliers + dataset = copy.deepcopy(import_dataset) + dataset.buddy_check( + obstype="temp", + spatial_buddy_radius=25000, + min_sample_size=3, + max_alt_diff=None, + min_sample_spread=1.0, + spatial_z_threshold=7.4, # same threshold as above + N_iter=1, + instantaneous_tolerance=pd.Timedelta("4min"), + use_z_robust_method=False, + lapserate=None, # -0.0065 + use_mp=False, + ) + + assert not dataset.outliersdf.empty def test_buddy_check_with_big_radius( self, import_dataset, overwrite_solution=False @@ -403,11 +479,10 @@ def test_buddy_check_with_safety_nets( _method_name = sys._getframe().f_code.co_name # 1. get_startpoint data - dataset = copy.deepcopy(import_dataset) - + dataset_with_safetynet = copy.deepcopy(import_dataset) # Test 1: Using safety_net_configs with LCZ should match the LCZ safety net method - dataset = copy.deepcopy(dataset) - dataset.buddy_check_with_safetynets( + dataset_with_safetynet = copy.deepcopy(dataset_with_safetynet) + dataset_with_safetynet.buddy_check_with_safetynets( obstype="temp", spatial_buddy_radius=25000, safety_net_configs=[ @@ -420,21 +495,44 @@ def test_buddy_check_with_safety_nets( ], min_sample_size=3, max_alt_diff=None, - min_std=1.0, + min_sample_spread=1.0, spatial_z_threshold=2.1, N_iter=1, instantaneous_tolerance=pd.Timedelta("4min"), lapserate=None, - use_mp=False, + use_mp=True, ) - assert ( - dataset.outliersdf.shape[0] == 74 - ), f"Expected 74 outliers, got {dataset.outliersdf.shape[0]}" + + # Use the same settings without safety nets to make a relative comparison + dataset_without_safetynet = copy.deepcopy(import_dataset) + dataset_without_safetynet.buddy_check_with_safetynets( + obstype="temp", + spatial_buddy_radius=25000, + safety_net_configs=[], # No safety nets + min_sample_size=3, + max_alt_diff=None, + min_sample_spread=1.0, + spatial_z_threshold=2.1, + N_iter=1, + instantaneous_tolerance=pd.Timedelta("4min"), + lapserate=None, + use_mp=True, ) + + # Relative tests + outl_safetynet = dataset_with_safetynet.outliersdf + outl_without_safetynet = dataset_without_safetynet.outliersdf + + + assert outl_safetynet.index.isin(outl_without_safetynet.index).all() + assert outl_without_safetynet.shape[0] > outl_safetynet.shape[0] + + # overwrite solution?
if overwrite_solution: TestDemoDataset.solutionfixer.create_solution( - solution=dataset, + solution=dataset_with_safetynet, **TestDemoDataset.solkwargs, methodname=_method_name, ) @@ -445,7 +543,7 @@ def test_buddy_check_with_safety_nets( ) # validate expression - assert_equality(dataset, solutionobj, exclude_columns=["details"]) + assert_equality(dataset_with_safetynet, solutionobj, exclude_columns=["details"]) def test_buddy_check_with_safety_nets_missing_min_sample_size(self, import_dataset): """Test that an error is raised when min_sample_size is missing from safety_net_configs.""" @@ -604,7 +702,7 @@ def test_whiteset_datetime_only( methodname=_method_name, **TestWhiteRecords.solkwargs ) - assert_equality(dataset, solutionobj) + assert_equality(dataset, solutionobj, exclude_columns=['details']) def test_whiteset_name_only(self, dataset_with_outliers, overwrite_solution=False): """Test white_records with Index containing only station names.""" @@ -634,7 +732,7 @@ def test_whiteset_name_only(self, dataset_with_outliers, overwrite_solution=Fals ) - assert_equality(dataset, solutionobj) + assert_equality(dataset, solutionobj, exclude_columns=['details']) def test_whiteset_name_only_on_station(self, dataset_with_outliers): """Test white_records with name-only Index on Station object.""" @@ -685,7 +783,7 @@ def test_whiteset_name_and_datetime( methodname=_method_name, **TestWhiteRecords.solkwargs ) - assert_equality(dataset, solutionobj) + assert_equality(dataset, solutionobj, exclude_columns=['details']) def test_whiteset_full_multiindex( self, dataset_with_outliers, overwrite_solution=False @@ -716,7 +814,7 @@ def test_whiteset_full_multiindex( methodname=_method_name, **TestWhiteRecords.solkwargs ) - assert_equality(dataset, solutionobj) + assert_equality(dataset, solutionobj, exclude_columns=['details']) def test_white_dt_only_records_buddy_check( self, import_dataset, overwrite_solution=False @@ -1254,7 +1352,7 @@ def test_all_qc_methods_with_whiteset(self, import_dataset): # test_breaking_dataset.test_qc_labels(qc_dataset) # test_breaking_dataset.test_qc_with_solution(qc_dataset, overwrite_solution=False) - # test_breaking_dataset.test_qc_stats_check(qc_dataset, overwrite_solution=False) + # test_breaking_dataset.test_qc_stats_check(qc_dataset, overwrite_solution=OVERWRITE) # test_breaking_dataset.test_make_plot_by_label_with_outliers(qc_dataset) # test_breaking_dataset.test_get_info(qc_dataset) @@ -1269,10 +1367,10 @@ def test_all_qc_methods_with_whiteset(self, import_dataset): # test_demo_dataset.test_buddy_check_one_iteration(imported_demo_dataset, overwrite_solution=OVERWRITE) # test_demo_dataset.test_buddy_check_more_iterations(imported_demo_dataset, overwrite_solution=OVERWRITE) # test_demo_dataset.test_buddy_check_no_outliers(imported_demo_dataset) - test_demo_dataset.test_buddy_check_with_big_radius( - imported_demo_dataset, overwrite_solution=OVERWRITE - ) - # test_demo_dataset.test_buddy_check_with_safety_nets(imported_demo_dataset, overwrite_solution=OVERWRITE) + # test_demo_dataset.test_buddy_check_with_big_radius( + # imported_demo_dataset, overwrite_solution=OVERWRITE + # ) + test_demo_dataset.test_buddy_check_with_safety_nets(imported_demo_dataset, overwrite_solution=OVERWRITE) # test_demo_dataset.test_buddy_check_with_safety_nets_missing_min_sample_size(imported_demo_dataset) # Run white_records tests
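For context on the new _store_series/_read_series helpers in tests/solutionclass.py: parquet has no native Series representation, so a Series is written as a one-column DataFrame and the first column is taken back out on read. Below is a minimal sketch of that round trip; the directory, file name, and example data are illustrative, and only the to_frame()/.iloc[:, 0] pattern is taken from the diff.

```python
from pathlib import Path

import pandas as pd

# Illustrative round trip mirroring _store_series/_read_series; the directory,
# file name and example counts are made up for this sketch.
out_dir = Path("example_solution_dir")
out_dir.mkdir(exist_ok=True)

counts = pd.Series({"ok": 120, "outlier": 4}, name="temp_qc_counts")

# Store: write the Series as a one-column DataFrame.
counts.to_frame().to_parquet(out_dir / "solution_series.parquet")

# Read: take the first (and only) column back out as a Series.
restored = pd.read_parquet(out_dir / "solution_series.parquet").iloc[:, 0]

assert restored.equals(counts)
```

This appears to be the mechanism the updated test_qc_stats_check relies on when it compares each Series in the dict returned by get_qc_stats against its stored counterpart.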
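The rename of min_std to min_sample_spread is the buddy-check API change that most of these test edits track, and the new test expects pytest.raises(DeprecationWarning) when the old keyword is passed. How the toolkit handles this is not shown in this diff; the sketch below is only one plausible shim that would satisfy that expectation, with a hypothetical function name and a signature trimmed to the relevant arguments.

```python
from typing import Optional, Union


def buddy_check_shim(
    min_sample_spread: Union[int, float] = 1.0,
    min_std: Optional[Union[int, float]] = None,
    **other_kwargs,
) -> Union[int, float]:
    """Hypothetical shim: reject the deprecated min_std keyword loudly."""
    if min_std is not None:
        # pytest.raises(DeprecationWarning) only passes if the warning is
        # actually raised as an exception, not emitted via warnings.warn().
        raise DeprecationWarning(
            "'min_std' is deprecated; use 'min_sample_spread' instead."
        )
    return min_sample_spread
```

If the toolkit instead emitted the warning with warnings.warn(), the test would need pytest.warns(DeprecationWarning) rather than pytest.raises.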