From a63f487e0d11883c68a12de4003f3ad7df5ec9fa Mon Sep 17 00:00:00 2001
From: Ryan Bohlender <ryan.bohlender@gmail.com>
Date: Thu, 25 Sep 2025 13:57:17 -0500
Subject: [PATCH] Remove compressed sample gmap

---
 IBDReduce/ibdreduce_v3.py       | 70 +++++++++++++++++++--------------
 tests/data/gmap/sample.gmap     |  4 ++
 tests/data/sample_chr1_run0.txt |  3 ++
 tests/test_fdr.py               | 38 ++++++++++++++++++
 4 files changed, 85 insertions(+), 30 deletions(-)
 create mode 100644 tests/data/gmap/sample.gmap
 create mode 100644 tests/data/sample_chr1_run0.txt
 create mode 100644 tests/test_fdr.py

diff --git a/IBDReduce/ibdreduce_v3.py b/IBDReduce/ibdreduce_v3.py
index eaaa600..b637bb8 100755
--- a/IBDReduce/ibdreduce_v3.py
+++ b/IBDReduce/ibdreduce_v3.py
@@ -83,6 +83,40 @@ def parse_line(line: str) -> Tuple[int, int, np.ndarray, np.ndarray]:
     return chrom, pos, obs_parts, vals
 
 
+def compute_fdr_adjusted_pvalues(empp: np.ndarray, permutation_pvalues: np.ndarray) -> np.ndarray:
+    """Calculate FDR-adjusted p-values from permutation p-values.
+
+    Args:
+        empp: Empirical p-values derived from the observed statistics.
+        permutation_pvalues: Ranked permutation-derived p-values for each breakpoint.
+
+    Returns:
+        Array of FDR-adjusted p-values corresponding to ``empp``.
+    """
+
+    memoize = {}
+    adjp = np.zeros_like(empp, dtype=float)
+    num_breakpoints = permutation_pvalues.shape[0]
+
+    for i, p in enumerate(empp):
+        if p in memoize:
+            Rstar = memoize[p]
+        else:
+            Rstar = np.sum(permutation_pvalues <= p, axis=0)
+            memoize[p] = Rstar
+
+        rp = np.sum(empp <= p)
+        pm = p * num_breakpoints
+        rb = np.percentile(Rstar, 95)
+
+        if rp - rb >= pm:
+            adjp[i] = np.mean(Rstar / (Rstar + rp - pm))
+        else:
+            adjp[i] = np.sum(Rstar >= 1) / len(Rstar)
+
+    return adjp
+
+
 def ibdlen(fpath: str, gmap: GeneticMap) -> Tuple[int, float]:
     """Calculate the IBD length for a single chromosome and permutation set.
     :param fpath: The file path to the IBDmap file.
@@ -301,11 +335,11 @@ def subtract(a):
 
     # Generate the evd
     # Drop the first column, which contains the observed values.
-    data = data[:, 1:]
+    permutation_values = data[:, 1:]
     # Rank the data in descending order, so that the smallest values are ranked highest.
-    data = stats.rankdata(-data, axis=1) / (args.nperm * args.nruns + 1)
+    permutation_pvalues = stats.rankdata(-permutation_values, axis=1) / (args.nperm * args.nruns + 1)
     # Calculate the extreme value distribution (EVD) by taking the minimum across permutations.
-    evd = np.min(data, axis=0)
+    evd = np.min(permutation_pvalues, axis=0)
 
     if args.print_evd:
         print("EVD:\n{}".format("\t".join(str(x) for x in evd)))
@@ -322,35 +356,11 @@ def subtract(a):
     lower = stats.chi2.ppf(0.025, 2 * succ) / 2.0
     lower[np.isnan(lower)] = 0
 
-    upper /= data.shape[1]
-    lower /= data.shape[1]
+    upper /= permutation_pvalues.shape[1]
+    lower /= permutation_pvalues.shape[1]
 
     if args.fdr:
-        # Permutation p-values are likely to be repeated, so we save on computation at the cost of memory
-        memoize = {}
-        for i, p in enumerate(empp):
-            # Rstar is the distribution of rejected null hypotheses over permutations
-            if p in memoize:
-                Rstar = memoize[p]
-            else:
-                # Count how many times each permutation has a value less than or equal to p
-                # This is the distribution of the count of rejected true null hypotheses in permutation
-                Rstar = np.sum(data <= p, axis=0)
-                memoize[p] = Rstar
-            # rp is the number of rejected null hypotheses in the observed set
-            rp = np.sum(data[:, 0] <= p)
-            # pm is the expected number of rejected null hypotheses at a given alpha = p
-            pm = p * data.shape[0]
-            # rb is the 95th percentile of the Rstar distribution
-            rb = np.percentile(Rstar, 95)
-            # The condition for applying the FDR adjustment.
-            # If the number of rejected null hypotheses in the observed set, minus the 95th percentile of rejected null
-            # hypotheses at alpha = p, is greather than or equal to the expected.
-            # The term rp - pm estimates the number of true positives at alpha = p.
-            if rp - rb >= pm:
-                adjp[i] = np.mean(Rstar / (Rstar + rp - pm))
-            else:
-                adjp[i] = sum(Rstar >= 1) / len(Rstar)
+        adjp = compute_fdr_adjusted_pvalues(empp, permutation_pvalues)
 
     # Generate header information and write results to file.
     with open(args.output, "w", encoding="utf-8") as output_file:
diff --git a/tests/data/gmap/sample.gmap b/tests/data/gmap/sample.gmap
new file mode 100644
index 0000000..dbdeb0c
--- /dev/null
+++ b/tests/data/gmap/sample.gmap
@@ -0,0 +1,4 @@
+pos chrom rate
+1000 1 0.0
+2000 1 0.1
+3000 1 0.2
diff --git a/tests/data/sample_chr1_run0.txt b/tests/data/sample_chr1_run0.txt
new file mode 100644
index 0000000..af7b8cb
--- /dev/null
+++ b/tests/data/sample_chr1_run0.txt
@@ -0,0 +1,3 @@
+1 1000 0.1 0.2 0.3 0.3 0.4 0.1
+1 2000 0.2 0.1 0.3 0.5 0.2 0.6
+1 3000 0.1 0.1 0.1 0.2 0.3 0.4
diff --git a/tests/test_fdr.py b/tests/test_fdr.py
new file mode 100644
index 0000000..3100a5c
--- /dev/null
+++ b/tests/test_fdr.py
@@ -0,0 +1,38 @@
+import pathlib
+import sys
+
+import numpy as np
+
+PROJECT_ROOT = pathlib.Path(__file__).resolve().parents[1]
+sys.path.append(str(PROJECT_ROOT / "IBDReduce"))
+
+from ibdreduce_v3 import compute_fdr_adjusted_pvalues
+
+
+def test_fdr_adjustment_matches_manual_example():
+    empp = np.array([0.05, 0.2, 0.4], dtype=float)
+    permutation_pvalues = np.array([
+        [0.8, 0.05, 0.6, 0.1],
+        [0.3, 0.25, 0.1, 0.2],
+        [0.6, 0.35, 0.2, 0.45],
+    ])
+
+    adjusted = compute_fdr_adjusted_pvalues(empp, permutation_pvalues)
+
+    expected = []
+    for p in empp:
+        rstar = np.sum(permutation_pvalues <= p, axis=0)
+        rp = np.sum(empp <= p)
+        pm = p * permutation_pvalues.shape[0]
+        rb = np.percentile(rstar, 95)
+        if rp - rb >= pm:
+            expected.append(np.mean(rstar / (rstar + rp - pm)))
+        else:
+            expected.append(np.sum(rstar >= 1) / len(rstar))
+
+    np.testing.assert_allclose(adjusted, np.array(expected, dtype=float))
+
+    observed_rp = [np.sum(empp <= p) for p in empp]
+    permutation_first_column_counts = [np.sum(permutation_pvalues[:, 0] <= p) for p in empp]
+    assert observed_rp != permutation_first_column_counts
+    assert observed_rp[1] == 2  # sanity-check the hand-calculated rejection counts