From b6418266976cfaae2bdd34cca22c4b4717bdb30f Mon Sep 17 00:00:00 2001 From: hannahbaumann Date: Fri, 23 Jan 2026 11:19:47 +0100 Subject: [PATCH 1/2] Update doc strings --- src/openfe_analysis/reader.py | 51 ++++++++++++--- src/openfe_analysis/rmsd.py | 86 +++++++++++++++++++------ src/openfe_analysis/transformations.py | 70 ++++++++++++++++---- src/openfe_analysis/utils/multistate.py | 10 ++- 4 files changed, 172 insertions(+), 45 deletions(-) diff --git a/src/openfe_analysis/reader.py b/src/openfe_analysis/reader.py index b7677bc..cd6008f 100644 --- a/src/openfe_analysis/reader.py +++ b/src/openfe_analysis/reader.py @@ -12,17 +12,24 @@ def _determine_iteration_dt(dataset) -> float: """ - Find out the timestep between each frame in the trajectory. + Determine the time increment between successive iterations + in a MultiStateReporter trajectory. + + The timestep is inferred from the serialized MCMC move stored in the + ``mcmc_moves`` group of the NetCDF file. Specifically, this assumes the + move defines both a ``timestep`` and ``n_steps`` parameter, such that + + dt_iteration = n_steps * timestep Parameters ---------- dataset : nc.Dataset - Dataset holding the MultiStateReporter generated NetCDF file. + NetCDF dataset produced by ``openmmtools.multistate.MultiStateReporter``. Returns ------- float - The timestep in units of picoseconds. + The time between successive iterations, in picoseconds. Raises ------ @@ -52,12 +59,14 @@ def _determine_iteration_dt(dataset) -> float: class FEReader(ReaderBase): - """A MDAnalysis Reader for NetCDF files created by - `openmmtools.multistate.MultiStateReporter` + """ + MDAnalysis trajectory reader for NetCDF files written by + ``openmmtools.multistate.MultiStateReporter``. Looks along a multistate NetCDF file along one of two axes: - constant state/lambda (varying replica) - constant replica (varying lambda) + Exactly one of ``state_id`` or ``replica_id`` must be specified. """ _state_id: Optional[int] @@ -75,9 +84,11 @@ def __init__(self, filename, convert_units=True, state_id=None, replica_id=None, Parameters ---------- filename : pathlike or nc.Dataset - path to the .nc file + Path to a MultiStateReporter NetCDF file, or an already-open + ``netCDF4.Dataset`` instance. convert_units : bool - convert positions to Angstrom + If ``True`` (default), positions are converted to Angstroms. + Otherwise, raw OpenMM units (nanometers) are returned. state_id : Optional[int] The Hamiltonian state index to extract. Must be defined if ``replica_id`` is not defined. May be negative (see notes below). @@ -85,12 +96,17 @@ def __init__(self, filename, convert_units=True, state_id=None, replica_id=None, The replica index to extract. Must be defined if ``state_id`` is not defined. May be negative (see notes below). + Raises + ------ + ValueError + If neither or both of ``state_id`` and ``replica_id`` are specified. + Notes ----- A negative index may be passed to either ``state_id`` or ``replica_id``. This will be interpreted as indexing in reverse - starting from the last state/replica. For example, passing a - value of -2 for ``replica_id`` will select the before last replica. + starting from the last state/replica. For example, ``replica_id=-2`` + will select the before last replica. """ if not ((state_id is None) ^ (replica_id is None)): raise ValueError( @@ -141,16 +157,31 @@ def n_frames(self) -> int: @staticmethod def parse_n_atoms(filename, **kwargs) -> int: + """ + Determine the number of atoms stored in a MultiStateReporter NetCDF file. + + Parameters + ---------- + filename : path-like + Path to the NetCDF file. + + Returns + ------- + int + Number of atoms in the system. + """ with nc.Dataset(filename) as ds: n_atoms = ds.dimensions["atom"].size return n_atoms def _read_next_timestep(self, ts=None) -> Timestep: + # Advance the trajectory by one frame. if (self._frame_index + 1) >= len(self): raise EOFError return self._read_frame(self._frame_index + 1) def _read_frame(self, frame: int) -> Timestep: + # Read a single trajectory frame. self._frame_index = frame if self._state_id is not None: @@ -187,11 +218,13 @@ def _read_frame(self, frame: int) -> Timestep: @property def dt(self) -> float: + # Time difference between successive trajectory frames. return self._dt def _reopen(self): self._frame_index = -1 def close(self): + # Close the underlying NetCDF dataset if owned by this reader. if self._dataset_owner: self._dataset.close() diff --git a/src/openfe_analysis/rmsd.py b/src/openfe_analysis/rmsd.py index 7d2af13..3dfabaa 100644 --- a/src/openfe_analysis/rmsd.py +++ b/src/openfe_analysis/rmsd.py @@ -14,22 +14,46 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Universe: - """Makes a Universe and applies some transformations + """ + Construct an MDAnalysis Universe from a MultiState NetCDF trajectory + and apply standard analysis transformations. + + The Universe is created using the custom ``FEReader`` to extract a + single state from a multistate simulation. Identifies two AtomGroups: - protein, defined as having standard amino acid names, then filtered down to CA - ligand, defined as resname UNK - Then applies some transformations. + Depending on whether a protein is present, a sequence of trajectory + transformations is applied: If a protein is present: - prevents the protein from jumping between periodic images - - moves the ligand to the image closest to the protein - - aligns the entire system to minimise the protein RMSD + (class:`NoJump`) + - moves the ligand to the image closest to the protein (:class:`Minimiser`) + - aligns the entire system to minimise the protein RMSD (:class:`Aligner`) - If only a ligand: + If only a ligand is present: - prevents the ligand from jumping between periodic images + - Aligns the ligand to minimize its RMSD + + Parameters + ---------- + top : pathlib.Path or Topology + Path to a topology file (e.g. PDB) or an already-loaded MDAnalysis + topology object. + trj : netCDF4.Dataset + Open NetCDF dataset produced by + ``openmmtools.multistate.MultiStateReporter``. + state : int + Thermodynamic state index to extract from the multistate trajectory. + + Returns + ------- + MDAnalysis.Universe + A Universe with trajectory transformations applied. """ u = mda.Universe( top, @@ -72,23 +96,37 @@ def make_Universe(top: pathlib.Path, trj: nc.Dataset, state: int) -> mda.Univers def gather_rms_data( pdb_topology: pathlib.Path, dataset: pathlib.Path, skip: Optional[int] = None ) -> dict[str, list[float]]: - """Generate structural analysis of RBFE simulation + """ + Compute structural RMSD-based metrics for a multistate RBFE simulation. + + For each thermodynamic state (lambda), this function: + - Loads the trajectory using ``FEReader`` + - Applies standard PBC-handling and alignment transformations + - Computes protein and ligand structural metrics over time + + The following analyses are produced per state: + - 1D protein CA RMSD time series + - 1D ligand RMSD time series + - Ligand center-of-mass displacement from its initial position + (``ligand_wander``) + - Flattened 2D protein RMSD matrix (pairwise RMSD between frames) Parameters ---------- pdb_topology : pathlib.Path - path to pdb topology + Path to the PDB file defining system topology. dataset : pathlib.Path - path to nc trajectory + Path to the NetCDF trajectory file produced by a multistate simulation. skip : int, optional - step at which to progress through the trajectory. by default, selects a - step that produces roughly 500 frames of analysis per replicate - - Produces, for each lambda state: - - 1D protein RMSD timeseries 'protein_RMSD' - - ligand RMSD timeseries - - ligand COM motion 'ligand_wander' - - 2D protein RMSD plot + Frame stride for analysis. If ``None``, a stride is chosen such that + approximately 500 frames are analyzed per state. + + Returns + ------- + dict[str, list] + Dictionary containing per-state analysis results with keys: + ``protein_RMSD``, ``ligand_RMSD``, ``ligand_wander``, + ``protein_2D_RMSD``, and ``time(ps)``. """ output = { "protein_RMSD": [], @@ -183,19 +221,25 @@ def gather_rms_data( def twoD_RMSD(positions, w: Optional[npt.NDArray]) -> list[float]: - """2 dimensions RMSD + """ + Compute a flattened 2D RMSD matrix from a trajectory. + + For all unique frame pairs ``(i, j)`` with ``i < j``, this function + computes the RMSD between atomic coordinates after optimal alignment. Parameters ---------- positions : np.ndarray - the protein positions for the entire trajectory + Atomic coordinates for all frames in the trajectory. w : np.ndarray, optional - weights array + Per-atom weights to use in the RMSD calculation. If ``None``, + all atoms are weighted equally. Returns ------- - rmsd_matrix : list - a flattened version of the 2d + list of float + Flattened list of RMSD values corresponding to all frame pairs + ``(i, j)`` with ``i < j``. """ nframes, _, _ = positions.shape diff --git a/src/openfe_analysis/transformations.py b/src/openfe_analysis/transformations.py index 1a4ef34..49dfc66 100644 --- a/src/openfe_analysis/transformations.py +++ b/src/openfe_analysis/transformations.py @@ -13,11 +13,32 @@ class NoJump(TransformationBase): - """Stops an AtomGroup from moving more than half a box length between frames - - This transformation prevents an AtomGroup "teleporting" across the box - border between two subsequent frames. This then simplifies the calculation - of motion over time. + """ + Prevent an AtomGroup from jumping between periodic images. + + This on-the-fly trajectory transformation removes large apparent + center-of-mass displacements caused by periodic boundary conditions. + If the AtomGroup moves by more than half a box length between + consecutive frames, it is translated by an integer number of box + vectors to keep its motion continuous. + + The transformation operates in-place on the AtomGroup coordinates + and is intended to be applied before analyses that rely on smooth + time evolution (e.g. RMSD, COM motion). + + Parameters + ---------- + ag : MDAnalysis.AtomGroup + AtomGroup whose center-of-mass motion should be made continuous. + + Notes + ----- + - This transformation assumes an orthorhombic unit cell. + - Only translations are applied; no rotations or scaling. + - The correction is based on center-of-mass motion and is therefore + most appropriate for compact groups (e.g. proteins, ligands). + - Must be applied before any alignment transformations to avoid + mixing reference frames. """ ag: mda.AtomGroup @@ -45,11 +66,31 @@ def _transform(self, ts): class Minimiser(TransformationBase): - """Minimises the difference from ags to central_ag by choosing image - - This transformation will translate any AtomGroup in *ags* in multiples of - the box vectors in order to minimise the distance between the center of mass - to the center of mass of each ag. + """ + Translate AtomGroups to the nearest periodic image relative to a reference. + + This transformation shifts one or more AtomGroups by integer multiples + of the simulation box vectors such that their center of mass is as close + as possible to the center of mass of a reference AtomGroup. + + It is commonly used to keep ligands in the same periodic image as a + protein during alchemical or replica-exchange simulations. + + Parameters + ---------- + central_ag : MDAnalysis.AtomGroup + Reference AtomGroup whose center of mass defines the target image. + *ags : MDAnalysis.AtomGroup + One or more AtomGroups to be translated into the closest periodic + image relative to ``central_ag``. + + Notes + ----- + - This transformation assumes an orthorhombic simulation box. + - Translations are applied independently for each AtomGroup. + - Coordinates are modified in-place. + - This transformation does not prevent inter-frame jumps by itself + and is typically used in combination with :class:`NoJump`. """ central_ag: mda.AtomGroup @@ -74,10 +115,13 @@ def _transform(self, ts): class Aligner(TransformationBase): - """On-the-fly transformation to align a trajectory to minimise RMSD + """ + Align a trajectory to a reference AtomGroup by minimizing RMSD. - centers all coordinates onto origin - rotates **entire universe** to minimise rmsd relative to **ref_ag** + This transformation performs an on-the-fly least-squares alignment + of the entire universe to a reference AtomGroup. + At each frame, the coordinates are translated and rotated to minimize the + RMSD of the atoms relative to their positions in the reference. """ ref_pos: npt.NDArray diff --git a/src/openfe_analysis/utils/multistate.py b/src/openfe_analysis/utils/multistate.py index f885e2b..50bd438 100644 --- a/src/openfe_analysis/utils/multistate.py +++ b/src/openfe_analysis/utils/multistate.py @@ -24,6 +24,11 @@ def _determine_position_indices(dataset: nc.Dataset) -> NDArray: indices : NDArray[int] An ordered array of iteration indices which hold positions. + Raises + ------ + ValueError + If positions are not written at a consistent interval. + Note ---- This assumes that the indices are equally spaced by a given @@ -55,7 +60,8 @@ def _determine_position_indices(dataset: nc.Dataset) -> NDArray: def _state_to_replica(dataset: nc.Dataset, state_num: int, frame_num: int) -> int: - """Convert a state index to replica index at a given Dataset frame + """ + Map a thermodynamic state index to the corresponding replica index. Parameters ---------- @@ -170,7 +176,7 @@ def _get_unitcell( dataset: nc.Dataset, replica_index: int, frame_num: int ) -> Optional[Tuple[unit.Quantity]]: """ - Helper method to extract a unit cell from the stored + Helper method to extract unit cell dimensions from the stored box vectors in a MultiState reporter generated NetCDF file at a given state and Dataset frame. From 552d6f88766bc7bd2050035540e02a2dab276c28 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Jan 2026 09:26:54 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/openfe_analysis/tests/conftest.py | 28 +++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/openfe_analysis/tests/conftest.py b/src/openfe_analysis/tests/conftest.py index 5fa2db4..19961eb 100644 --- a/src/openfe_analysis/tests/conftest.py +++ b/src/openfe_analysis/tests/conftest.py @@ -1,6 +1,6 @@ +import pathlib from importlib import resources -import pathlib import pooch import pytest @@ -9,41 +9,49 @@ path=POOCH_CACHE, base_url="doi:10.5281/zenodo.17916322", registry={ - "openfe_analysis_simulation_output.tar.gz":"md5:09752f2c4e5b7744d8afdee66dbd1414", - "openfe_analysis_skipped.tar.gz": "md5:3840d044299caacc4ccd50e6b22c0880", + "openfe_analysis_simulation_output.tar.gz": "md5:09752f2c4e5b7744d8afdee66dbd1414", + "openfe_analysis_skipped.tar.gz": "md5:3840d044299caacc4ccd50e6b22c0880", }, ) + @pytest.fixture(scope="session") def rbfe_output_data_dir() -> pathlib.Path: ZENODO_RBFE_DATA.fetch("openfe_analysis_simulation_output.tar.gz", processor=pooch.Untar()) - result_dir = pathlib.Path(POOCH_CACHE) / "openfe_analysis_simulation_output.tar.gz.untar/openfe_analysis_simulation_output/" + result_dir = ( + pathlib.Path(POOCH_CACHE) + / "openfe_analysis_simulation_output.tar.gz.untar/openfe_analysis_simulation_output/" + ) return result_dir + @pytest.fixture(scope="session") def rbfe_skipped_data_dir() -> pathlib.Path: ZENODO_RBFE_DATA.fetch("openfe_analysis_skipped.tar.gz", processor=pooch.Untar()) - result_dir = pathlib.Path(POOCH_CACHE) / "openfe_analysis_skipped.tar.gz.untar/openfe_analysis_skipped/" + result_dir = ( + pathlib.Path(POOCH_CACHE) / "openfe_analysis_skipped.tar.gz.untar/openfe_analysis_skipped/" + ) return result_dir + @pytest.fixture(scope="session") def simulation_nc(rbfe_output_data_dir) -> pathlib.Path: - return rbfe_output_data_dir/"simulation.nc" + return rbfe_output_data_dir / "simulation.nc" @pytest.fixture(scope="session") def simulation_skipped_nc(rbfe_skipped_data_dir) -> pathlib.Path: - return rbfe_skipped_data_dir/"simulation.nc" + return rbfe_skipped_data_dir / "simulation.nc" @pytest.fixture(scope="session") def hybrid_system_pdb(rbfe_output_data_dir) -> pathlib.Path: - return rbfe_output_data_dir/"hybrid_system.pdb" + return rbfe_output_data_dir / "hybrid_system.pdb" @pytest.fixture(scope="session") -def hybrid_system_skipped_pdb(rbfe_skipped_data_dir)->pathlib.Path: - return rbfe_skipped_data_dir/"hybrid_system.pdb" +def hybrid_system_skipped_pdb(rbfe_skipped_data_dir) -> pathlib.Path: + return rbfe_skipped_data_dir / "hybrid_system.pdb" @pytest.fixture(scope="session")