DataFrame: Add Attribute Columns (#1814)

ax3l · web-flow · commit 9b90ba018a75 · 2025-12-17T16:18:44.000+01:00
* DataFrame: Add Attribute Columns

Optionally add particle species attributes
as extra columns. This is super useful when dealing with
openPMD extensions or custom attributes, e.g., for accelerator
physics. In the latter case, we store reference particle
information on the particle species group (changes per
iteration/snapshot).

* `to_df()` backwards compatibility

Specifically for `to_df(np.s_[...])` calls that
did not name an argument
diff --git a/docs/source/analysis/pandas.rst b/docs/source/analysis/pandas.rst
@@ -50,6 +50,17 @@ One can also combine all iterations in a single dataframe like this:
    # like before but with a new column "iteration" and all particles
    print(df)
 
+Additionally, one can add openPMD particle species attributes, e.g.,
+from the `ED-PIC <https://github.com/openPMD/openPMD-standard/blob/1.1.0/EXT_ED-PIC.md#particle-records-macroparticles>`__ extension
+or `custom code properties <https://impactx.readthedocs.io/en/25.11/dataanalysis/dataanalysis.html#additional-beam-attributes>`__
+as extra dataframe columns:
+
+.. code-block:: python
+
+   df = s.to_df("electrons", attributes=["s_ref"])
+
+   # like before but with a new column "s_ref"
+   print(df)
 
 .. _analysis-pandas-ascii:
 
diff --git a/examples/11_particle_dataframe.py b/examples/11_particle_dataframe.py
@@ -38,26 +38,27 @@
     s = io.Series("../samples/git-sample/data%T.h5", io.Access.read_only)
     electrons = s.snapshots()[400].particles["electrons"]
 
-    # all particles
-    df = electrons.to_df()
+    # all particles, extra column for "particleShape" attribute
+    #                (from ED-PIC extension)
+    df = electrons.to_df(attributes=["particleShape"])
     print(type(df) is pd.DataFrame)
     print(df)
 
     # only first 100 particles
-    df = electrons.to_df(np.s_[:100])
+    df = electrons.to_df(slice=np.s_[:100])
     print(df)
 
     # all particles over all steps
-    df = s.to_df("electrons")
+    df = s.to_df("electrons", attributes=["particleShape"])
     print(df)
 
     if found_cudf:
         # all particles - to GPU
-        cdf = cudf.from_pandas(electrons.to_df())
+        cdf = cudf.from_pandas(electrons.to_df(attributes=["particleShape"]))
         print(cdf)
 
         # all particles over all steps - to GPU
-        cdf = s.to_cudf("electrons")
+        cdf = s.to_cudf("electrons", attributes=["particleShape"])
         print(cdf)
 
     # Particles
@@ -67,7 +68,7 @@
         # pickle capabilities, so we test this here:
         dask.config.set(scheduler='processes')
 
-        df = electrons.to_dask()
+        df = electrons.to_dask(attributes=["particleShape"])
         print(df)
 
         # check chunking of a variable
diff --git a/src/binding/python/openpmd_api/DaskDataFrame.py b/src/binding/python/openpmd_api/DaskDataFrame.py
@@ -8,19 +8,22 @@
 import numpy as np
 
 
-def read_chunk_to_df(species, chunk):
+def read_chunk_to_df(species, chunk, attributes=None):
     stride = np.s_[chunk.offset[0]:chunk.offset[0]+chunk.extent[0]]
-    return species.to_df(stride)
+    return species.to_df(attributes=attributes, slice=stride)
 
 
-def particles_to_daskdataframe(particle_species):
+def particles_to_daskdataframe(particle_species, attributes=None):
     """
     Load all records of a particle species into a Dask DataFrame.
 
     Parameters
     ----------
     particle_species : openpmd_api.ParticleSpecies
         A ParticleSpecies class in openPMD-api.
+    attributes : list of strings, optional
+        A list of attributes of the particle_species that should be read and
+        added as extra columns.
 
     Returns
     -------
@@ -83,7 +86,9 @@ def particles_to_daskdataframe(particle_species):
 
     # merge DataFrames
     dfs = [
-        delayed(read_chunk_to_df)(particle_species, chunk) for chunk in chunks
+        delayed(read_chunk_to_df)(
+            particle_species, chunk=chunk, attributes=attributes
+        ) for chunk in chunks
     ]
     df = dd.from_delayed(dfs)
 
diff --git a/src/binding/python/openpmd_api/DataFrame.py b/src/binding/python/openpmd_api/DataFrame.py
@@ -10,14 +10,22 @@
 import numpy as np
 
 
-def particles_to_dataframe(particle_species, slice=None):
+def particles_to_dataframe(particle_species,
+                           *legacy_args,
+                           attributes=None,
+                           slice=None):
     """
     Load all records of a particle species into a Pandas DataFrame.
 
     Parameters
     ----------
     particle_species : openpmd_api.ParticleSpecies
         A ParticleSpecies class in openPMD-api.
+    legacy_args : tuple
+        DO NOT USE. Catch-all for legacy, unnamed arguments.
+    attributes : list of strings, optional
+        A list of attributes of the particle_species that should be read and
+        added as extra columns.
     slice : np.s_, optional
         A numpy slice that can be used to load only a sub-selection of
         particles.
@@ -40,6 +48,20 @@ def particles_to_dataframe(particle_species, slice=None):
         are optimal arguments for the slice parameter
     pandas.DataFrame : the central dataframe object created here
     """
+    # backwards compatibility: in openPMD-api 0.17+, we added the
+    # additional "attributes" argument and moved slice= to the end.
+    if legacy_args:
+        if attributes is None and slice is None and len(legacy_args) == 1:
+            slice = legacy_args[0]
+            import warnings
+            warnings.warn("The to_df() argument order changed in "
+                          "openPMD-api 0.17.0!\nThe slice "
+                          "argument must be passed as a named argument.",
+                          DeprecationWarning
+                          )
+        else:
+            raise RuntimeError("to_df() does not support unnamed arguments!")
+
     # import pandas here for a lazy import
     try:
         import pandas as pd
@@ -69,14 +91,18 @@ def particles_to_dataframe(particle_species, slice=None):
 
     df = pd.DataFrame(columns)
 
+    if attributes is not None:
+        for attribute in attributes:
+            df[attribute] = particle_species.get_attribute(attribute)
+
     # set a header for the first column (row index)
     #   note: this is NOT the particle id
     df.index.name = "row"
 
     return df
 
 
-def iterations_to_dataframe(series, species_name):
+def iterations_to_dataframe(series, species_name, attributes=None):
     """
     Load all iterations of a particle species into a Pandas DataFrame.
 
@@ -86,6 +112,9 @@ def iterations_to_dataframe(series, species_name):
         A Series class in openPMD-api.
     species_name : string
         The name of a particle species.
+    attributes : list of strings, optional
+        A list of attributes of the particle species that should be read and
+        added as extra columns.
 
     Returns
     -------
@@ -115,7 +144,7 @@ def iterations_to_dataframe(series, species_name):
         (
             iteration
             .particles[species_name]
-            .to_df()
+            .to_df(attributes=attributes)
             .assign(iteration=i)
             for i, iteration in series.snapshots().items()
         ),
@@ -126,7 +155,7 @@ def iterations_to_dataframe(series, species_name):
     return df
 
 
-def iterations_to_cudf(series, species_name):
+def iterations_to_cudf(series, species_name, attributes=None):
     """
     Load all iterations of a particle species into a cuDF DataFrame.
 
@@ -136,6 +165,9 @@ def iterations_to_cudf(series, species_name):
         A Series class in openPMD-api.
     species_name : string
         The name of a particle species.
+    attributes : list of strings, optional
+        A list of attributes of the particle species that should be read and
+        added as extra columns.
 
     Returns
     -------
@@ -172,7 +204,7 @@ def iterations_to_cudf(series, species_name):
             cudf.from_pandas(
                 iteration
                 .particles[species_name]
-                .to_df()
+                .to_df(attributes=attributes)
                 .assign(iteration=i)
             )
             for i, iteration in series.snapshots().items()