From b6418266976cfaae2bdd34cca22c4b4717bdb30f Mon Sep 17 00:00:00 2001
From: hannahbaumann <hbaumann@uci.edu>
Date: Fri, 23 Jan 2026 11:19:47 +0100
Subject: [PATCH 1/2] Update doc strings

---
 src/openfe_analysis/reader.py           | 51 ++++++++++++---
 src/openfe_analysis/rmsd.py             | 86 +++++++++++++++++++------
 src/openfe_analysis/transformations.py  | 70 ++++++++++++++++----
 src/openfe_analysis/utils/multistate.py | 10 ++-
 4 files changed, 172 insertions(+), 45 deletions(-)

diff --git a/src/openfe_analysis/reader.py b/src/openfe_analysis/reader.py
index b7677bc..cd6008f 100644
--- a/src/openfe_analysis/reader.py
+++ b/src/openfe_analysis/reader.py
@@ -12,17 +12,24 @@
 
 def _determine_iteration_dt(dataset) -> float:
     """
-    Find out the timestep between each frame in the trajectory.
+    Determine the time increment between successive iterations
+    in a MultiStateReporter trajectory.
+
+    The timestep is inferred from the serialized MCMC move stored in the
+    ``mcmc_moves`` group of the NetCDF file. Specifically, this assumes the
+    move defines both a ``timestep`` and ``n_steps`` parameter, such that
+
+        dt_iteration = n_steps * timestep
 
     Parameters
     ----------
     dataset : nc.Dataset
-      Dataset holding the MultiStateReporter generated NetCDF file.
+      NetCDF dataset produced by ``openmmtools.multistate.MultiStateReporter``.
 
     Returns
     -------
     float
-      The timestep in units of picoseconds.
+      The time between successive iterations, in picoseconds.
 
     Raises
     ------
@@ -52,12 +59,14 @@ def _determine_iteration_dt(dataset) -> float:
 
 
 class FEReader(ReaderBase):
-    """A MDAnalysis Reader for NetCDF files created by
-    `openmmtools.multistate.MultiStateReporter`
+    """
+    MDAnalysis trajectory reader for NetCDF files written by
+    ``openmmtools.multistate.MultiStateReporter``.
 
     Looks along a multistate NetCDF file along one of two axes:
       - constant state/lambda (varying replica)
       - constant replica (varying lambda)
+    Exactly one of ``state_id`` or ``replica_id`` must be specified.
     """
 
     _state_id: Optional[int]
@@ -75,9 +84,11 @@ def __init__(self, filename, convert_units=True, state_id=None, replica_id=None,
         Parameters
         ----------
         filename : pathlike or nc.Dataset
-          path to the .nc file
+          Path to a MultiStateReporter NetCDF file, or an already-open
+          ``netCDF4.Dataset`` instance.
         convert_units : bool
-          convert positions to Angstrom
+          If ``True`` (default), positions are converted to Angstroms.
+          Otherwise, raw OpenMM units (nanometers) are returned.
         state_id : Optional[int]
           The Hamiltonian state index to extract. Must be defined if
           ``replica_id`` is not defined. May be negative (see notes below).
@@ -85,12 +96,17 @@ def __init__(self, filename, convert_units=True, state_id=None, replica_id=None,
           The replica index to extract. Must be defined if ``state_id``
           is not defined. May be negative (see notes below).
 
+        Raises
+        ------
+        ValueError
+            If neither or both of ``state_id`` and ``replica_id`` are specified.
+
         Notes
         -----
         A negative index may be passed to either ``state_id`` or
         ``replica_id``. This will be interpreted as indexing in reverse
-        starting from the last state/replica. For example, passing a
-        value of -2 for ``replica_id`` will select the before last replica.
+        starting from the last state/replica. For example, ``replica_id=-2``
+        will select the before last replica.
         """
         if not ((state_id is None) ^ (replica_id is None)):
             raise ValueError(
@@ -141,16 +157,31 @@ def n_frames(self) -> int:
 
     @staticmethod
     def parse_n_atoms(filename, **kwargs) -> int:
+        """
+        Determine the number of atoms stored in a MultiStateReporter NetCDF file.
+
+        Parameters
+        ----------
+        filename : path-like
+            Path to the NetCDF file.
+
+        Returns
+        -------
+        int
+            Number of atoms in the system.
+        """
         with nc.Dataset(filename) as ds:
             n_atoms = ds.dimensions["atom"].size
         return n_atoms
 
     def _read_next_timestep(self, ts=None) -> Timestep:
+        # Advance the trajectory by one frame.
         if (self._frame_index + 1) >= len(self):
             raise EOFError
         return self._read_frame(self._frame_index + 1)
 
     def _read_frame(self, frame: int) -> Timestep:
+        # Read a single trajectory frame.
         self._frame_index = frame
 
         if self._state_id is not None:
@@ -187,11 +218,13 @@ def _read_frame(self, frame: int) -> Timestep:
 
     @property
     def dt(self) -> float:
+        # Time difference between successive trajectory frames.
         return self._dt
 
     def _reopen(self):
         self._frame_index = -1
 
     def close(self):
+        # Close the underlying NetCDF dataset if owned by this reader.
         if self._dataset_owner:
             self._dataset.close()
diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py
index 7d2af13..3dfabaa 100644
--- a/src/openfe_analysis/rmsd.py
+++ b/src/openfe_analysis/rmsd.py
@@ -14,22 +14,46 @@
 
 
 def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Universe:
-    """Makes a Universe and applies some transformations
+    """
+    Construct an MDAnalysis Universe from a MultiState NetCDF trajectory
+    and apply standard analysis transformations.
+
+    The Universe is created using the custom ``FEReader`` to extract a
+    single state from a multistate simulation.
 
     Identifies two AtomGroups:
     - protein, defined as having standard amino acid names, then filtered
       down to CA
     - ligand, defined as resname UNK
 
-    Then applies some transformations.
+    Depending on whether a protein is present, a sequence of trajectory
+    transformations is applied:
 
     If a protein is present:
     - prevents the protein from jumping between periodic images
-    - moves the ligand to the image closest to the protein
-    - aligns the entire system to minimise the protein RMSD
+      (class:`NoJump`)
+    - moves the ligand to the image closest to the protein (:class:`Minimiser`)
+    - aligns the entire system to minimise the protein RMSD (:class:`Aligner`)
 
-    If only a ligand:
+    If only a ligand is present:
     - prevents the ligand from jumping between periodic images
+    - Aligns the ligand to minimize its RMSD
+
+    Parameters
+    ----------
+    top : pathlib.Path or Topology
+        Path to a topology file (e.g. PDB) or an already-loaded MDAnalysis
+        topology object.
+    trj : netCDF4.Dataset
+        Open NetCDF dataset produced by
+        ``openmmtools.multistate.MultiStateReporter``.
+    state : int
+        Thermodynamic state index to extract from the multistate trajectory.
+
+    Returns
+    -------
+    MDAnalysis.Universe
+        A Universe with trajectory transformations applied.
     """
     u = mda.Universe(
         top,
@@ -72,23 +96,37 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers
 def gather_rms_data(
     pdb_topology: pathlib.Path, dataset: pathlib.Path, skip: Optional[int] = None
 ) -> dict[str, list[float]]:
-    """Generate structural analysis of RBFE simulation
+    """
+    Compute structural RMSD-based metrics for a multistate RBFE simulation.
+
+    For each thermodynamic state (lambda), this function:
+      - Loads the trajectory using ``FEReader``
+      - Applies standard PBC-handling and alignment transformations
+      - Computes protein and ligand structural metrics over time
+
+    The following analyses are produced per state:
+      - 1D protein CA RMSD time series
+      - 1D ligand RMSD time series
+      - Ligand center-of-mass displacement from its initial position
+        (``ligand_wander``)
+      - Flattened 2D protein RMSD matrix (pairwise RMSD between frames)
 
     Parameters
     ----------
     pdb_topology : pathlib.Path
-      path to pdb topology
+      Path to the PDB file defining system topology.
     dataset : pathlib.Path
-      path to nc trajectory
+      Path to the NetCDF trajectory file produced by a multistate simulation.
     skip : int, optional
-      step at which to progress through the trajectory.  by default, selects a
-      step that produces roughly 500 frames of analysis per replicate
-
-    Produces, for each lambda state:
-    - 1D protein RMSD timeseries 'protein_RMSD'
-    - ligand RMSD timeseries
-    - ligand COM motion 'ligand_wander'
-    - 2D protein RMSD plot
+      Frame stride for analysis. If ``None``, a stride is chosen such that
+      approximately 500 frames are analyzed per state.
+
+    Returns
+    -------
+    dict[str, list]
+        Dictionary containing per-state analysis results with keys:
+        ``protein_RMSD``, ``ligand_RMSD``, ``ligand_wander``,
+        ``protein_2D_RMSD``, and ``time(ps)``.
     """
     output = {
         "protein_RMSD": [],
@@ -183,19 +221,25 @@ def gather_rms_data(
 
 
 def twoD_RMSD(positions, w: Optional[npt.NDArray]) -> list[float]:
-    """2 dimensions RMSD
+    """
+    Compute a flattened 2D RMSD matrix from a trajectory.
+
+    For all unique frame pairs ``(i, j)`` with ``i < j``, this function
+    computes the RMSD between atomic coordinates after optimal alignment.
 
     Parameters
     ----------
     positions : np.ndarray
-      the protein positions for the entire trajectory
+      Atomic coordinates for all frames in the trajectory.
     w : np.ndarray, optional
-      weights array
+      Per-atom weights to use in the RMSD calculation. If ``None``,
+      all atoms are weighted equally.
 
     Returns
     -------
-    rmsd_matrix : list
-      a flattened version of the 2d
+    list of float
+      Flattened list of RMSD values corresponding to all frame pairs
+      ``(i, j)`` with ``i < j``.
     """
     nframes, _, _ = positions.shape
 
diff --git a/src/openfe_analysis/transformations.py b/src/openfe_analysis/transformations.py
index 1a4ef34..49dfc66 100644
--- a/src/openfe_analysis/transformations.py
+++ b/src/openfe_analysis/transformations.py
@@ -13,11 +13,32 @@
 
 
 class NoJump(TransformationBase):
-    """Stops an AtomGroup from moving more than half a box length between frames
-
-    This transformation prevents an AtomGroup "teleporting" across the box
-    border between two subsequent frames.  This then simplifies the calculation
-    of motion over time.
+    """
+    Prevent an AtomGroup from jumping between periodic images.
+
+    This on-the-fly trajectory transformation removes large apparent
+    center-of-mass displacements caused by periodic boundary conditions.
+    If the AtomGroup moves by more than half a box length between
+    consecutive frames, it is translated by an integer number of box
+    vectors to keep its motion continuous.
+
+    The transformation operates in-place on the AtomGroup coordinates
+    and is intended to be applied before analyses that rely on smooth
+    time evolution (e.g. RMSD, COM motion).
+
+    Parameters
+    ----------
+    ag : MDAnalysis.AtomGroup
+        AtomGroup whose center-of-mass motion should be made continuous.
+
+    Notes
+    -----
+    - This transformation assumes an orthorhombic unit cell.
+    - Only translations are applied; no rotations or scaling.
+    - The correction is based on center-of-mass motion and is therefore
+      most appropriate for compact groups (e.g. proteins, ligands).
+    - Must be applied before any alignment transformations to avoid
+      mixing reference frames.
     """
 
     ag: mda.AtomGroup
@@ -45,11 +66,31 @@ def _transform(self, ts):
 
 
 class Minimiser(TransformationBase):
-    """Minimises the difference from ags to central_ag by choosing image
-
-    This transformation will translate any AtomGroup in *ags* in multiples of
-    the box vectors in order to minimise the distance between the center of mass
-    to the center of mass of each ag.
+    """
+    Translate AtomGroups to the nearest periodic image relative to a reference.
+
+    This transformation shifts one or more AtomGroups by integer multiples
+    of the simulation box vectors such that their center of mass is as close
+    as possible to the center of mass of a reference AtomGroup.
+
+    It is commonly used to keep ligands in the same periodic image as a
+    protein during alchemical or replica-exchange simulations.
+
+    Parameters
+    ----------
+    central_ag : MDAnalysis.AtomGroup
+        Reference AtomGroup whose center of mass defines the target image.
+    *ags : MDAnalysis.AtomGroup
+        One or more AtomGroups to be translated into the closest periodic
+        image relative to ``central_ag``.
+
+    Notes
+    -----
+    - This transformation assumes an orthorhombic simulation box.
+    - Translations are applied independently for each AtomGroup.
+    - Coordinates are modified in-place.
+    - This transformation does not prevent inter-frame jumps by itself
+      and is typically used in combination with :class:`NoJump`.
     """
 
     central_ag: mda.AtomGroup
@@ -74,10 +115,13 @@ def _transform(self, ts):
 
 
 class Aligner(TransformationBase):
-    """On-the-fly transformation to align a trajectory to minimise RMSD
+    """
+    Align a trajectory to a reference AtomGroup by minimizing RMSD.
 
-    centers all coordinates onto origin
-    rotates **entire universe** to minimise rmsd relative to **ref_ag**
+    This transformation performs an on-the-fly least-squares alignment
+    of the entire universe to a reference AtomGroup.
+    At each frame, the coordinates are translated and rotated to minimize the
+    RMSD of the atoms relative to their positions in the reference.
     """
 
     ref_pos: npt.NDArray
diff --git a/src/openfe_analysis/utils/multistate.py b/src/openfe_analysis/utils/multistate.py
index f885e2b..50bd438 100644
--- a/src/openfe_analysis/utils/multistate.py
+++ b/src/openfe_analysis/utils/multistate.py
@@ -24,6 +24,11 @@ def _determine_position_indices(dataset: nc.Dataset) -> NDArray:
     indices : NDArray[int]
       An ordered array of iteration indices which hold positions.
 
+    Raises
+    ------
+    ValueError
+        If positions are not written at a consistent interval.
+
     Note
     ----
     This assumes that the indices are equally spaced by a given
@@ -55,7 +60,8 @@ def _determine_position_indices(dataset: nc.Dataset) -> NDArray:
 
 
 def _state_to_replica(dataset: nc.Dataset, state_num: int, frame_num: int) -> int:
-    """Convert a state index to replica index at a given Dataset frame
+    """
+    Map a thermodynamic state index to the corresponding replica index.
 
     Parameters
     ----------
@@ -170,7 +176,7 @@ def _get_unitcell(
     dataset: nc.Dataset, replica_index: int, frame_num: int
 ) -> Optional[Tuple[unit.Quantity]]:
     """
-    Helper method to extract a unit cell from the stored
+    Helper method to extract unit cell dimensions from the stored
     box vectors in a MultiState reporter generated NetCDF file
     at a given state and Dataset frame.
 

From 552d6f88766bc7bd2050035540e02a2dab276c28 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 29 Jan 2026 09:26:54 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/openfe_analysis/tests/conftest.py | 28 +++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/openfe_analysis/tests/conftest.py b/src/openfe_analysis/tests/conftest.py
index 5fa2db4..19961eb 100644
--- a/src/openfe_analysis/tests/conftest.py
+++ b/src/openfe_analysis/tests/conftest.py
@@ -1,6 +1,6 @@
+import pathlib
 from importlib import resources
 
-import pathlib
 import pooch
 import pytest
 
@@ -9,41 +9,49 @@
     path=POOCH_CACHE,
     base_url="doi:10.5281/zenodo.17916322",
     registry={
-       "openfe_analysis_simulation_output.tar.gz":"md5:09752f2c4e5b7744d8afdee66dbd1414",
-       "openfe_analysis_skipped.tar.gz": "md5:3840d044299caacc4ccd50e6b22c0880",
+        "openfe_analysis_simulation_output.tar.gz": "md5:09752f2c4e5b7744d8afdee66dbd1414",
+        "openfe_analysis_skipped.tar.gz": "md5:3840d044299caacc4ccd50e6b22c0880",
     },
 )
 
+
 @pytest.fixture(scope="session")
 def rbfe_output_data_dir() -> pathlib.Path:
     ZENODO_RBFE_DATA.fetch("openfe_analysis_simulation_output.tar.gz", processor=pooch.Untar())
-    result_dir = pathlib.Path(POOCH_CACHE) / "openfe_analysis_simulation_output.tar.gz.untar/openfe_analysis_simulation_output/"
+    result_dir = (
+        pathlib.Path(POOCH_CACHE)
+        / "openfe_analysis_simulation_output.tar.gz.untar/openfe_analysis_simulation_output/"
+    )
     return result_dir
 
+
 @pytest.fixture(scope="session")
 def rbfe_skipped_data_dir() -> pathlib.Path:
     ZENODO_RBFE_DATA.fetch("openfe_analysis_skipped.tar.gz", processor=pooch.Untar())
-    result_dir = pathlib.Path(POOCH_CACHE) / "openfe_analysis_skipped.tar.gz.untar/openfe_analysis_skipped/"
+    result_dir = (
+        pathlib.Path(POOCH_CACHE) / "openfe_analysis_skipped.tar.gz.untar/openfe_analysis_skipped/"
+    )
     return result_dir
 
+
 @pytest.fixture(scope="session")
 def simulation_nc(rbfe_output_data_dir) -> pathlib.Path:
-    return rbfe_output_data_dir/"simulation.nc"
+    return rbfe_output_data_dir / "simulation.nc"
 
 
 @pytest.fixture(scope="session")
 def simulation_skipped_nc(rbfe_skipped_data_dir) -> pathlib.Path:
-    return rbfe_skipped_data_dir/"simulation.nc"
+    return rbfe_skipped_data_dir / "simulation.nc"
 
 
 @pytest.fixture(scope="session")
 def hybrid_system_pdb(rbfe_output_data_dir) -> pathlib.Path:
-    return rbfe_output_data_dir/"hybrid_system.pdb"
+    return rbfe_output_data_dir / "hybrid_system.pdb"
 
 
 @pytest.fixture(scope="session")
-def hybrid_system_skipped_pdb(rbfe_skipped_data_dir)->pathlib.Path:
-    return rbfe_skipped_data_dir/"hybrid_system.pdb"
+def hybrid_system_skipped_pdb(rbfe_skipped_data_dir) -> pathlib.Path:
+    return rbfe_skipped_data_dir / "hybrid_system.pdb"
 
 
 @pytest.fixture(scope="session")