From 1152a30ad7c522ced9be64460f7f8cc37761f98d Mon Sep 17 00:00:00 2001 From: Daniel Eduardo Villarreal Jaime Date: Tue, 2 Dec 2025 19:52:35 +0100 Subject: [PATCH 1/7] Added Precision-REcall curve in probscores, similar to ROC curve but better for imbalanced distribution --- pysteps/verification/probscores.py | 149 +++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/pysteps/verification/probscores.py b/pysteps/verification/probscores.py index f1eb398ba..96425a684 100644 --- a/pysteps/verification/probscores.py +++ b/pysteps/verification/probscores.py @@ -20,6 +20,10 @@ ROC_curve_init ROC_curve_accum ROC_curve_compute + PR_curve + PR_curve_init + PR_curve_accum + PR_curve_compute """ import numpy as np @@ -421,3 +425,148 @@ def ROC_curve_compute(ROC, compute_area=False): return POFD_vals, POD_vals, area else: return POFD_vals, POD_vals + + +def PR_curve(P_f, X_o, X_min, n_prob_thrs=10, compute_area=False, compute_prevalence=False): + """ + Compute the Precision-Recall curve, its area, and optionally the prevalence baseline. + + Parameters + ---------- + P_f: array_like + Forecasted probabilities for exceeding the threshold specified in the PR object. + Non-finite values are ignored. + X_o: array_like + Observed values. Non-finite values are ignored. + X_min: float + Precipitation intensity threshold for yes/no prediction. + n_prob_thrs: int + The number of probability thresholds to use. + The interval [0,1] is divided into n_prob_thrs evenly spaced values. + compute_area: bool, optional + If True, compute the area under the Precision-Recall curve. + compute_prevalence: bool, optional + If True, compute and return the prevalence baseline (fraction of positives). + + Returns + ------- + out: tuple + A two-element tuple containing precision and recall for the probability thresholds. + If compute_area is True, return the area under the PR curve as an extra element. + If compute_prevalence is True, return the prevalence baseline as an extra element. + """ + P_f = P_f.copy() + X_o = X_o.copy() + pr = PR_curve_init(X_min, n_prob_thrs) + PR_curve_accum(pr, P_f, X_o) + return PR_curve_compute(pr, X_o, X_min, compute_area, compute_prevalence) + + +def PR_curve_init(X_min, n_prob_thrs=10): + """ + Initialize a Precision-Recall curve object. + + Parameters + ---------- + X_min: float + Precipitation intensity threshold for yes/no prediction. + n_prob_thrs: int + The number of probability thresholds to use. + The interval [0,1] is divided into n_prob_thrs evenly spaced values. + + Returns + ------- + out: dict + The PR curve object containing counters for hits, misses, false alarms, + correct negatives, and the probability thresholds. + """ + PR = {} + PR["X_min"] = X_min + PR["hits"] = np.zeros(n_prob_thrs, dtype=int) + PR["misses"] = np.zeros(n_prob_thrs, dtype=int) + PR["false_alarms"] = np.zeros(n_prob_thrs, dtype=int) + PR["corr_neg"] = np.zeros(n_prob_thrs, dtype=int) + PR["prob_thrs"] = np.linspace(0.0, 1.0, int(n_prob_thrs)) + return PR + + +def PR_curve_accum(PR, P_f, X_o): + """ + Accumulate the given probability-observation pairs into the given PR object. + + Parameters + ---------- + PR: dict + A PR curve object created with PR_curve_init. + P_f: array_like + Forecasted probabilities for exceeding the threshold specified in the PR object. + Non-finite values are ignored. + X_o: array_like + Observed values. Non-finite values are ignored. + + Notes + ----- + Updates the counters (hits, misses, false alarms, correct negatives) for each + probability threshold. + """ + mask = np.logical_and(np.isfinite(P_f), np.isfinite(X_o)) + P_f = P_f[mask] + X_o = X_o[mask] + for i, p in enumerate(PR["prob_thrs"]): + forecast_yes = P_f >= p + obs_yes = X_o >= PR["X_min"] + PR["hits"][i]+=np.sum(np.logical_and(forecast_yes, obs_yes)) + PR["misses"][i]+=np.sum(np.logical_and(~forecast_yes, obs_yes)) + PR["false_alarms"][i]+=np.sum(np.logical_and(forecast_yes, ~obs_yes)) + PR["corr_neg"][i]+=np.sum(np.logical_and(~forecast_yes, ~obs_yes)) + + +def PR_curve_compute(PR, X_o, X_min, compute_area=False, compute_prevalence=False): + """ + Compute the Precision-Recall curve, its area, and optionally prevalence baseline. + + Parameters + ---------- + PR: dict + A PR curve object created with PR_curve_init. + X_o: array_like + Observed values used to compute prevalence. + X_min: float + Precipitation intensity threshold for yes/no prediction. + compute_area: bool, optional + If True, compute the area under the Precision-Recall curve. + compute_prevalence: bool, optional + If True, compute and return the prevalence baseline (fraction of positives). + + Returns + ------- + out: tuple + Precision values and recall values for the probability thresholds. + If compute_area is True, return the area under the PR curve as an extra element. + If compute_prevalence is True, return the prevalence baseline as an extra element. + """ + precision_vals = [] + recall_vals = [] + for i in range(len(PR["prob_thrs"])): + hits = PR["hits"][i] + misses = PR["misses"][i] + false_alarms = PR["false_alarms"][i] + recall = hits/(hits+misses) if (hits+misses)>0 else 0.0 + precision = hits/(hits+false_alarms) if (hits+false_alarms)>0 else 1.0 + recall_vals.append(recall) + precision_vals.append(precision) + + results = (precision_vals, recall_vals) + + if compute_area: + area = 0.0 + for i in range(len(PR["prob_thrs"])-1): + area += (recall_vals[i+1]-recall_vals[i])*(precision_vals[i+1]+precision_vals[i])/2.0 + results = results + (area,) + + if compute_prevalence: + prevalence = np.sum(X_o >= X_min) / len(X_o) + results = results + (prevalence,) + + return results + From 140f04b1cd3c4751ca824ae3b9331c18cfa5112d Mon Sep 17 00:00:00 2001 From: Daniel Eduardo Villarreal Jaime Date: Mon, 8 Dec 2025 16:42:14 +0100 Subject: [PATCH 2/7] Corrected area computation, needed sorting first --- pysteps/verification/probscores.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pysteps/verification/probscores.py b/pysteps/verification/probscores.py index 96425a684..37a1a54c2 100644 --- a/pysteps/verification/probscores.py +++ b/pysteps/verification/probscores.py @@ -547,26 +547,29 @@ def PR_curve_compute(PR, X_o, X_min, compute_area=False, compute_prevalence=Fals """ precision_vals = [] recall_vals = [] + for i in range(len(PR["prob_thrs"])): hits = PR["hits"][i] misses = PR["misses"][i] false_alarms = PR["false_alarms"][i] - recall = hits/(hits+misses) if (hits+misses)>0 else 0.0 - precision = hits/(hits+false_alarms) if (hits+false_alarms)>0 else 1.0 + + recall = hits / (hits + misses) if (hits + misses) > 0 else 0.0 + precision = hits / (hits + false_alarms) if (hits + false_alarms) > 0 else 1.0 + recall_vals.append(recall) precision_vals.append(precision) + # Add endpoints: (0,1) and (1, prevalence) + prevalence = np.sum(X_o >= X_min) / len(X_o) + recall_vals = [0.0] + recall_vals + [1.0] + precision_vals = [1.0] + precision_vals + [prevalence] + results = (precision_vals, recall_vals) if compute_area: - area = 0.0 - for i in range(len(PR["prob_thrs"])-1): - area += (recall_vals[i+1]-recall_vals[i])*(precision_vals[i+1]+precision_vals[i])/2.0 + # Sort by recall before integration + recall_sorted, precision_sorted = zip(*sorted(zip(recall_vals, precision_vals))) + area = np.trapz(precision_sorted, recall_sorted) results = results + (area,) - if compute_prevalence: - prevalence = np.sum(X_o >= X_min) / len(X_o) - results = results + (prevalence,) - - return results - + return results \ No newline at end of file From 22e1a741a6799d1ef97b6814945d4e4e99fd7a1c Mon Sep 17 00:00:00 2001 From: Daniel Eduardo Villarreal Jaime Date: Mon, 8 Dec 2025 16:44:24 +0100 Subject: [PATCH 3/7] Corrected description --- pysteps/verification/probscores.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pysteps/verification/probscores.py b/pysteps/verification/probscores.py index 37a1a54c2..6b5f0a741 100644 --- a/pysteps/verification/probscores.py +++ b/pysteps/verification/probscores.py @@ -521,7 +521,7 @@ def PR_curve_accum(PR, P_f, X_o): PR["corr_neg"][i]+=np.sum(np.logical_and(~forecast_yes, ~obs_yes)) -def PR_curve_compute(PR, X_o, X_min, compute_area=False, compute_prevalence=False): +def PR_curve_compute(PR, X_o, X_min, compute_area=False): """ Compute the Precision-Recall curve, its area, and optionally prevalence baseline. @@ -535,15 +535,12 @@ def PR_curve_compute(PR, X_o, X_min, compute_area=False, compute_prevalence=Fals Precipitation intensity threshold for yes/no prediction. compute_area: bool, optional If True, compute the area under the Precision-Recall curve. - compute_prevalence: bool, optional - If True, compute and return the prevalence baseline (fraction of positives). - + Returns ------- out: tuple Precision values and recall values for the probability thresholds. If compute_area is True, return the area under the PR curve as an extra element. - If compute_prevalence is True, return the prevalence baseline as an extra element. """ precision_vals = [] recall_vals = [] From 47542cdafd6d23877855d95bd8ea9a1789744e6e Mon Sep 17 00:00:00 2001 From: Daniel Eduardo Villarreal Jaime Date: Mon, 8 Dec 2025 16:45:17 +0100 Subject: [PATCH 4/7] Corrected description --- pysteps/verification/probscores.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pysteps/verification/probscores.py b/pysteps/verification/probscores.py index 6b5f0a741..0cb2c9ba4 100644 --- a/pysteps/verification/probscores.py +++ b/pysteps/verification/probscores.py @@ -427,7 +427,7 @@ def ROC_curve_compute(ROC, compute_area=False): return POFD_vals, POD_vals -def PR_curve(P_f, X_o, X_min, n_prob_thrs=10, compute_area=False, compute_prevalence=False): +def PR_curve(P_f, X_o, X_min, n_prob_thrs=10, compute_area=False): """ Compute the Precision-Recall curve, its area, and optionally the prevalence baseline. @@ -445,15 +445,12 @@ def PR_curve(P_f, X_o, X_min, n_prob_thrs=10, compute_area=False, compute_preval The interval [0,1] is divided into n_prob_thrs evenly spaced values. compute_area: bool, optional If True, compute the area under the Precision-Recall curve. - compute_prevalence: bool, optional - If True, compute and return the prevalence baseline (fraction of positives). - + Returns ------- out: tuple A two-element tuple containing precision and recall for the probability thresholds. If compute_area is True, return the area under the PR curve as an extra element. - If compute_prevalence is True, return the prevalence baseline as an extra element. """ P_f = P_f.copy() X_o = X_o.copy() From bb30bad57bf9e243d6c5d372980ec9ded4b01a4e Mon Sep 17 00:00:00 2001 From: Daniel Eduardo Villarreal Jaime Date: Mon, 8 Dec 2025 16:46:26 +0100 Subject: [PATCH 5/7] Removed prevalence option --- pysteps/verification/probscores.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysteps/verification/probscores.py b/pysteps/verification/probscores.py index 0cb2c9ba4..d2e9e1a36 100644 --- a/pysteps/verification/probscores.py +++ b/pysteps/verification/probscores.py @@ -456,7 +456,7 @@ def PR_curve(P_f, X_o, X_min, n_prob_thrs=10, compute_area=False): X_o = X_o.copy() pr = PR_curve_init(X_min, n_prob_thrs) PR_curve_accum(pr, P_f, X_o) - return PR_curve_compute(pr, X_o, X_min, compute_area, compute_prevalence) + return PR_curve_compute(pr, X_o, X_min, compute_area) def PR_curve_init(X_min, n_prob_thrs=10): From 19640651671f4e5d59a3d50d63ef695bb2d8fc3f Mon Sep 17 00:00:00 2001 From: Daniel Eduardo Villarreal Jaime Date: Tue, 9 Dec 2025 12:59:41 +0100 Subject: [PATCH 6/7] Removed prevalence completely --- pysteps/verification/probscores.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/pysteps/verification/probscores.py b/pysteps/verification/probscores.py index d2e9e1a36..2ceceb1b6 100644 --- a/pysteps/verification/probscores.py +++ b/pysteps/verification/probscores.py @@ -512,10 +512,10 @@ def PR_curve_accum(PR, P_f, X_o): for i, p in enumerate(PR["prob_thrs"]): forecast_yes = P_f >= p obs_yes = X_o >= PR["X_min"] - PR["hits"][i]+=np.sum(np.logical_and(forecast_yes, obs_yes)) - PR["misses"][i]+=np.sum(np.logical_and(~forecast_yes, obs_yes)) - PR["false_alarms"][i]+=np.sum(np.logical_and(forecast_yes, ~obs_yes)) - PR["corr_neg"][i]+=np.sum(np.logical_and(~forecast_yes, ~obs_yes)) + PR["hits"][i] += np.sum(np.logical_and(forecast_yes, obs_yes)) + PR["misses"][i] += np.sum(np.logical_and(~forecast_yes, obs_yes)) + PR["false_alarms"][i] += np.sum(np.logical_and(forecast_yes, ~obs_yes)) + PR["corr_neg"][i] += np.sum(np.logical_and(~forecast_yes, ~obs_yes)) def PR_curve_compute(PR, X_o, X_min, compute_area=False): @@ -553,17 +553,10 @@ def PR_curve_compute(PR, X_o, X_min, compute_area=False): recall_vals.append(recall) precision_vals.append(precision) - # Add endpoints: (0,1) and (1, prevalence) - prevalence = np.sum(X_o >= X_min) / len(X_o) - recall_vals = [0.0] + recall_vals + [1.0] - precision_vals = [1.0] + precision_vals + [prevalence] - - results = (precision_vals, recall_vals) - if compute_area: # Sort by recall before integration recall_sorted, precision_sorted = zip(*sorted(zip(recall_vals, precision_vals))) area = np.trapz(precision_sorted, recall_sorted) - results = results + (area,) - - return results \ No newline at end of file + return precision_vals, recall_vals, area + else: + return precision_vals, recall_vals \ No newline at end of file From a4a44db3db1fe682fcece9004d19c3228a0637e5 Mon Sep 17 00:00:00 2001 From: Daniel Eduardo Villarreal Jaime Date: Tue, 9 Dec 2025 13:02:25 +0100 Subject: [PATCH 7/7] Updated description --- pysteps/verification/probscores.py | 102 +++++++++++++++-------------- 1 file changed, 53 insertions(+), 49 deletions(-) diff --git a/pysteps/verification/probscores.py b/pysteps/verification/probscores.py index 2ceceb1b6..8108b8673 100644 --- a/pysteps/verification/probscores.py +++ b/pysteps/verification/probscores.py @@ -429,28 +429,29 @@ def ROC_curve_compute(ROC, compute_area=False): def PR_curve(P_f, X_o, X_min, n_prob_thrs=10, compute_area=False): """ - Compute the Precision-Recall curve, its area, and optionally the prevalence baseline. + Compute the Precision–Recall (PR) curve and optionally its area. Parameters ---------- - P_f: array_like - Forecasted probabilities for exceeding the threshold specified in the PR object. - Non-finite values are ignored. - X_o: array_like - Observed values. Non-finite values are ignored. - X_min: float - Precipitation intensity threshold for yes/no prediction. - n_prob_thrs: int - The number of probability thresholds to use. - The interval [0,1] is divided into n_prob_thrs evenly spaced values. - compute_area: bool, optional - If True, compute the area under the Precision-Recall curve. - + P_f : array_like + Forecasted probabilities for exceeding the threshold X_min. + Non-finite values are ignored. + X_o : array_like + Observed values. Non-finite values are ignored. + X_min : float + Precipitation intensity threshold for yes/no prediction. + n_prob_thrs : int, optional + Number of probability thresholds to evaluate. + The interval [0, 1] is divided into n_prob_thrs evenly spaced values. + compute_area : bool, optional + If True, compute the area under the PR curve using trapezoidal integration. + Returns ------- - out: tuple - A two-element tuple containing precision and recall for the probability thresholds. - If compute_area is True, return the area under the PR curve as an extra element. + out : tuple + (precision_vals, recall_vals) for each probability threshold. + If compute_area is True, return (precision_vals, recall_vals, area), + where area is the trapezoidal estimate of the PR curve area. """ P_f = P_f.copy() X_o = X_o.copy() @@ -461,21 +462,24 @@ def PR_curve(P_f, X_o, X_min, n_prob_thrs=10, compute_area=False): def PR_curve_init(X_min, n_prob_thrs=10): """ - Initialize a Precision-Recall curve object. + Initialize a Precision–Recall curve object. Parameters ---------- - X_min: float - Precipitation intensity threshold for yes/no prediction. - n_prob_thrs: int - The number of probability thresholds to use. - The interval [0,1] is divided into n_prob_thrs evenly spaced values. + X_min : float + Precipitation intensity threshold for yes/no prediction. + n_prob_thrs : int, optional + Number of probability thresholds to evaluate. Returns ------- - out: dict - The PR curve object containing counters for hits, misses, false alarms, - correct negatives, and the probability thresholds. + PR : dict + Dictionary containing counters for hits, misses, false alarms, + correct negatives, and the probability thresholds. + Keys: + - "X_min" : threshold value + - "hits", "misses", "false_alarms", "corr_neg" : arrays of counts + - "prob_thrs" : array of evenly spaced thresholds in [0, 1] """ PR = {} PR["X_min"] = X_min @@ -489,22 +493,21 @@ def PR_curve_init(X_min, n_prob_thrs=10): def PR_curve_accum(PR, P_f, X_o): """ - Accumulate the given probability-observation pairs into the given PR object. + Accumulate forecast–observation pairs into the PR object. Parameters ---------- - PR: dict - A PR curve object created with PR_curve_init. - P_f: array_like - Forecasted probabilities for exceeding the threshold specified in the PR object. - Non-finite values are ignored. - X_o: array_like - Observed values. Non-finite values are ignored. + PR : dict + A PR curve object created with PR_curve_init. + P_f : array_like + Forecasted probabilities for exceeding X_min. + X_o : array_like + Observed values. Notes ----- - Updates the counters (hits, misses, false alarms, correct negatives) for each - probability threshold. + Updates the counters (hits, misses, false alarms, correct negatives) + for each probability threshold in PR["prob_thrs"]. """ mask = np.logical_and(np.isfinite(P_f), np.isfinite(X_o)) P_f = P_f[mask] @@ -520,24 +523,25 @@ def PR_curve_accum(PR, P_f, X_o): def PR_curve_compute(PR, X_o, X_min, compute_area=False): """ - Compute the Precision-Recall curve, its area, and optionally prevalence baseline. + Compute precision and recall values from the PR object. Parameters ---------- - PR: dict - A PR curve object created with PR_curve_init. - X_o: array_like - Observed values used to compute prevalence. - X_min: float - Precipitation intensity threshold for yes/no prediction. - compute_area: bool, optional - If True, compute the area under the Precision-Recall curve. - + PR : dict + A PR curve object created with PR_curve_init. + X_o : array_like + Observed values (used only if prevalence or area is computed). + X_min : float + Precipitation intensity threshold for yes/no prediction. + compute_area : bool, optional + If True, compute the area under the PR curve. + Returns ------- - out: tuple - Precision values and recall values for the probability thresholds. - If compute_area is True, return the area under the PR curve as an extra element. + out : tuple + (precision_vals, recall_vals) for each probability threshold. + If compute_area is True, return (precision_vals, recall_vals, area), + where area is the trapezoidal estimate of the PR curve area. """ precision_vals = [] recall_vals = []