From 3a3f96069493f216c12cc97e3e534d3b98ec8d8d Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 19 Jan 2026 16:30:43 +0000 Subject: [PATCH 1/4] Fix tests for tskit 1.0 --- tests/test_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index 2836d58..6273b0e 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -189,7 +189,7 @@ def test_small_msprime_complex_mutations(self): def test_ref_seq(self): ts = msprime.simulate(10, recombination_rate=1, mutation_rate=2, random_seed=2) - tables = ts.tables + tables = ts.dump_tables() tables.reference_sequence.metadata_schema = ( tskit.MetadataSchema.permissive_json() ) From c77b43941523d8eea7a85e5acd16875f007ade99 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 19 Jan 2026 17:01:28 +0000 Subject: [PATCH 2/4] Add chunk size to the Python API and test --- tests/test_compression.py | 36 +++++++++++++++++++++++++++++++ tszip/compression.py | 45 ++++++++++++++++++++++++++------------- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index 6273b0e..609036a 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -526,3 +526,39 @@ def test_issue95_metadata_dtype_regression(self): assert len(ts_decompressed.metadata["reverse_node_map"]) == len( ts_original.metadata["reverse_node_map"] ) + + +class TestChunkSize: + @pytest.mark.parametrize( + "chunk_size", [1, 2, 1000, 2**21, np.array([100], dtype=int)[0]] + ) + def test_good_chunks(self, tmpdir, chunk_size): + files = pathlib.Path(__file__).parent / "files" + ts1 = tskit.load(files / "1.0.0.trees") + path = tmpdir / "out.trees.tsz" + tszip.compress(ts1, path, chunk_size=chunk_size) + ts2 = tszip.decompress(path) + assert ts1 == ts2 + + store = compat.create_zip_store(path, mode="r") + root = compat.create_zarr_group(store=store) + for _, g in root.groups(): + for _, a in g.arrays(): + assert a.chunks == (chunk_size,) + + @pytest.mark.parametrize( + ["chunk_size", "exception"], + [ + (0, ValueError), + (-1, ValueError), + (1.1, TypeError), + ("x", TypeError), + ("10", TypeError), + ], + ) + def test_bad_chunks(self, tmpdir, chunk_size, exception): + files = pathlib.Path(__file__).parent / "files" + ts = tskit.load(files / "1.0.0.trees") + path = tmpdir / "out.trees.tsz" + with pytest.raises(exception): + tszip.compress(ts, path, chunk_size=chunk_size) diff --git a/tszip/compression.py b/tszip/compression.py index 380b65f..b7b3851 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2019 Tskit Developers +# Copyright (c) 2019-2026 Tskit Developers # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,7 @@ import tempfile import warnings import zipfile +import numbers import humanize import numcodecs @@ -73,7 +74,7 @@ def minimal_dtype(array): return dtype -def compress(ts, destination, variants_only=False): +def compress(ts, destination, variants_only=False, *, chunk_size=None): """ Compresses the specified tree sequence and writes it to the specified path or file-like object. By default, fully lossless compression is used so that @@ -87,6 +88,9 @@ def compress(ts, destination, variants_only=False): we should write the compressed file to. 
     :param bool variants_only: If True, discard all information not
         necessary to represent the variants in the input file.
+    :param int chunk_size: The number of array elements per chunk in the
+        Zarr encoding. Defaults to 2**20 (1_048_576), resulting in
+        each encoded chunk of 4-byte integer data being 4MiB.
     """
     try:
         destination = pathlib.Path(destination).resolve()
@@ -100,15 +104,15 @@
         logging.debug(f"Writing to temporary file {filename}")
         with compat.create_zip_store(filename, mode="w") as store:
             root = compat.create_zarr_group(store=store)
-            compress_zarr(ts, root, variants_only=variants_only)
+            compress_zarr(ts, root, variants_only=variants_only, chunk_size=chunk_size)
         if is_path:
             os.replace(filename, destination)
             logging.info(f"Wrote {destination}")
         else:
             # Assume that destination is a file-like object open in "wb" mode.
             with open(filename, "rb") as source:
-                chunk_size = 2**10  # 1MiB
-                for chunk in iter(functools.partial(source.read, chunk_size), b""):
+                read_chunk_size = 2**10  # 1KiB
+                for chunk in iter(functools.partial(source.read, read_chunk_size), b""):
                     destination.write(chunk)
 
 
@@ -131,16 +135,14 @@ class Column:
     """
     A single column that is stored in the compressed output.
     """
-    def __init__(self, name, array, delta_filter=False):
+    def __init__(self, name, array, chunk_size, delta_filter=False):
         self.name = name
         self.array = array
         self.delta_filter = delta_filter
+        self.chunks = (chunk_size,)
 
     def compress(self, root, compressor):
         shape = self.array.shape
-        chunks = shape
-        if shape[0] == 0:
-            chunks = (1,)
         dtype = minimal_dtype(self.array)
         filters = None
         if self.delta_filter:
@@ -150,7 +152,7 @@ class Column:
             self.name,
             shape=shape,
             dtype=dtype,
-            chunks=chunks,
+            chunks=self.chunks,
             filters=filters,
             compressor=compressor,
         )
@@ -170,8 +172,17 @@
         )
 
 
-def compress_zarr(ts, root, variants_only=False):
-    provenance_dict = provenance.get_provenance_dict({"variants_only": variants_only})
+def compress_zarr(ts, root, variants_only=False, chunk_size=None):
+    provenance_dict = provenance.get_provenance_dict(
+        {"variants_only": variants_only, "chunk_size": chunk_size}
+    )
+
+    if chunk_size is None:
+        chunk_size = 2**20
+    if not isinstance(chunk_size, numbers.Integral):
+        raise TypeError("Chunk size must be an integer")
+    if chunk_size < 1:
+        raise ValueError("Storage chunk size must be >= 1")
 
     if variants_only:
         logging.info("Using lossy variants-only compression")
@@ -254,9 +265,13 @@
         cname="zstd", clevel=9, shuffle=numcodecs.Blosc.SHUFFLE
     )
     for name, data in columns.items():
-        Column(
-            name, data, delta_filter="_offset" in name or name in delta_filter_cols
-        ).compress(root, compressor)
+        col = Column(
+            name,
+            data,
+            chunk_size,
+            delta_filter="_offset" in name or name in delta_filter_cols,
+        )
+        col.compress(root, compressor)
 
 
 def check_format(root):

From 6ee78e70da82ee5b5ff7f2152c5d5fe6d16b0a28 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher
Date: Tue, 20 Jan 2026 09:40:52 +0000
Subject: [PATCH 3/4] Added chunking to CLI

Also switch to 32MiB chunks

Closes #69
Closes #118
---
 CHANGELOG.rst             |  7 +++++++
 tests/test_cli.py         | 26 +++++++++++++++++++++++++-
 tests/test_compression.py | 11 ++++++++---
 tszip/__init__.py         |  1 +
 tszip/cli.py              | 12 +++++++++++-
 tszip/compression.py      | 21 +++++++++++++--------
 6 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 006ba13..1597712 100644
---
a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,3 +1,10 @@
+--------------------
+[0.2.7] - 2026-XX-XX
+--------------------
+
+- Add support for very large columns and add the ``chunk_size`` parameter.
+  (jeromekelleher, #119).
+
 --------------------
 [0.2.6] - 2025-09-18
 --------------------
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d5c1734..0d8ab0b 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2019 Tskit Developers
+# Copyright (c) 2019-2026 Tskit Developers
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -34,6 +34,7 @@
 
 import tszip
 import tszip.cli as cli
+from tszip import compat
 
 
 def get_stdout_for_pytest():
@@ -98,6 +99,7 @@ def test_default_values(self):
         self.assertEqual(args.decompress, False)
         self.assertEqual(args.list, False)
         self.assertEqual(args.stdout, False)
+        self.assertEqual(args.chunk_size, tszip.DEFAULT_CHUNK_SIZE)
         self.assertEqual(args.variants_only, False)
         self.assertEqual(args.suffix, ".tsz")
 
@@ -123,6 +125,14 @@ def test_decompress(self):
         args = parser.parse_args([infile, "--decompress"])
         self.assertTrue(args.decompress)
 
+    def test_chunk_size(self):
+        parser = cli.tszip_cli_parser()
+        infile = "tmp.trees.tsz"
+        args = parser.parse_args([infile, "-C", "1234"])
+        self.assertEqual(args.chunk_size, 1234)
+        args = parser.parse_args([infile, "--chunk-size=1234"])
+        self.assertEqual(args.chunk_size, 1234)
+
 
 class TestCli(unittest.TestCase):
     """
@@ -248,6 +258,20 @@ def test_variants_only(self):
         G2 = self.ts.genotype_matrix()
         self.assertTrue(np.array_equal(G1, G2))
 
+    def test_chunk_size(self):
+        self.assertTrue(self.trees_path.exists())
+        self.run_tszip([str(self.trees_path), "--chunk-size=20"])
+        self.assertFalse(self.trees_path.exists())
+        outpath = pathlib.Path(str(self.trees_path) + ".tsz")
+        self.assertTrue(outpath.exists())
+        ts = tszip.decompress(outpath)
+        self.assertEqual(ts.tables, self.ts.tables)
+        store = compat.create_zip_store(str(outpath), mode="r")
+        root = compat.create_zarr_group(store=store)
+        for _, g in root.groups():
+            for _, a in g.arrays():
+                assert a.chunks == (20,)
+
     def test_keep(self):
         self.assertTrue(self.trees_path.exists())
         self.run_tszip([str(self.trees_path), "--keep"])
diff --git a/tests/test_compression.py b/tests/test_compression.py
index 609036a..a3b401c 100644
--- a/tests/test_compression.py
+++ b/tests/test_compression.py
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2021 Tskit Developers
+# Copyright (c) 2021-2026 Tskit Developers
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -307,7 +307,12 @@ def test_provenance(self):
         root = compat.create_zarr_group(store=store)
         self.assertEqual(
             root.attrs["provenance"],
-            provenance.get_provenance_dict({"variants_only": variants_only}),
+            provenance.get_provenance_dict(
+                {
+                    "variants_only": variants_only,
+                    "chunk_size": compression.DEFAULT_CHUNK_SIZE,
+                }
+            ),
         )
 
     def write_file(self, attrs, path):
@@ -540,7 +545,7 @@ def test_good_chunks(self, tmpdir, chunk_size):
         ts2 = tszip.decompress(path)
         assert ts1 == ts2
 
-        store = compat.create_zip_store(path, mode="r")
+        store = compat.create_zip_store(str(path), mode="r")
         root = compat.create_zarr_group(store=store)
         for _, g in root.groups():
             for _, a in g.arrays():
diff --git a/tszip/__init__.py b/tszip/__init__.py
index 9d8feec..f144c70 100644
---
a/tszip/__init__.py +++ b/tszip/__init__.py @@ -21,6 +21,7 @@ # SOFTWARE. from .compression import compress # NOQA from .compression import decompress # NOQA +from .compression import DEFAULT_CHUNK_SIZE # NOQA from .compression import load # NOQA from .compression import print_summary # NOQA from .provenance import __version__ # NOQA diff --git a/tszip/cli.py b/tszip/cli.py index c979a6f..1173782 100644 --- a/tszip/cli.py +++ b/tszip/cli.py @@ -64,6 +64,14 @@ def tszip_cli_parser(): "-v", "--verbosity", action="count", default=0, help="Increase the verbosity" ) parser.add_argument("files", nargs="+", help="The files to compress/decompress.") + parser.add_argument( + "-C", + "--chunk-size", + type=int, + default=tszip.DEFAULT_CHUNK_SIZE, + help="Sets the size of array chunks to be compressed to the specified " + f"number of elements. Default={tszip.DEFAULT_CHUNK_SIZE}", + ) parser.add_argument( "--variants-only", action="store_true", @@ -125,7 +133,9 @@ def run_compress(args): check_output(outfile, args) if args.stdout: outfile = get_stdout() - tszip.compress(ts, outfile, variants_only=args.variants_only) + tszip.compress( + ts, outfile, variants_only=args.variants_only, chunk_size=args.chunk_size + ) remove_input(infile, args) diff --git a/tszip/compression.py b/tszip/compression.py index b7b3851..82e8f10 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -26,12 +26,12 @@ import functools import json import logging +import numbers import os import pathlib import tempfile import warnings import zipfile -import numbers import humanize import numcodecs @@ -48,6 +48,10 @@ FORMAT_NAME = "tszip" FORMAT_VERSION = [1, 0] +# ~8 million elements, giving 32MiB chunks to be compressed +# for most columns +DEFAULT_CHUNK_SIZE = 8 * 2**20 + def minimal_dtype(array): """ @@ -89,8 +93,8 @@ def compress(ts, destination, variants_only=False, *, chunk_size=None): :param bool variants_only: If True, discard all information not necessary to represent the variants in the input file. :param int chunk_size: The number of array elements per chunk in the - Zarr encoding. Defaults to 2**20 (1_048_576), resulting in - each encoded chunk of 4-byte integer data being 4MiB. + Zarr encoding. Defaults to 8_388_608, resulting in + each encoded chunk of 4-byte integer data being 32MiB. 
""" try: destination = pathlib.Path(destination).resolve() @@ -173,16 +177,17 @@ def compress(self, root, compressor): def compress_zarr(ts, root, variants_only=False, chunk_size=None): - provenance_dict = provenance.get_provenance_dict( - {"variants_only": variants_only, "chunk_size": chunk_size} - ) - if chunk_size is None: - chunk_size = 2**20 + chunk_size = DEFAULT_CHUNK_SIZE if not isinstance(chunk_size, numbers.Integral): raise TypeError("Chunk size must be an integer") if chunk_size < 1: raise ValueError("Storage chunk size must be >= 1") + chunk_size = int(chunk_size) # Avoid issues with JSON serialisation + + provenance_dict = provenance.get_provenance_dict( + {"variants_only": variants_only, "chunk_size": chunk_size} + ) if variants_only: logging.info("Using lossy variants-only compression") From ae0178ab755897ad40459f8234a89adbd6e49cac Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 4 Feb 2026 17:02:40 +0000 Subject: [PATCH 4/4] Simplify decode logic Also improve performance on Zarr v3 and large metadata --- tests/test_compression.py | 4 ++ tszip/compat.py | 6 --- tszip/compression.py | 91 ++++++++++++++++++++++++--------------- 3 files changed, 60 insertions(+), 41 deletions(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index a3b401c..fade286 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -104,6 +104,10 @@ class RoundTripMixin: Set of example tree sequences that we should be able to round trip. """ + def test_minimal(self): + ts = tskit.Tree.generate_balanced(2).tree_sequence + self.verify(ts) + def test_small_msprime_no_recomb(self): ts = msprime.simulate(10, mutation_rate=2, random_seed=2) self.assertGreater(ts.num_sites, 2) diff --git a/tszip/compat.py b/tszip/compat.py index 62e859f..750a880 100644 --- a/tszip/compat.py +++ b/tszip/compat.py @@ -56,9 +56,6 @@ def create_empty_array( def get_nbytes_stored(array): return array.nbytes_stored() - def group_items(group): - return group.members() - def visit_arrays(group, visitor): for array in group.array_values(): visitor(array) @@ -86,8 +83,5 @@ def create_empty_array( def get_nbytes_stored(array): return array.nbytes_stored - def group_items(group): - return group.items() - def visit_arrays(group, visitor): group.visitvalues(visitor) diff --git a/tszip/compression.py b/tszip/compression.py index 82e8f10..c9b92d3 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -322,48 +322,69 @@ def load_zarr(path): store.close() +def _convert_string_field(array): + value = array[:] + return value.tobytes().decode("utf-8") + + +def _is_string_field(group_name, array_name): + string_fields = [ + (None, "time_units"), + ("reference_sequence", "data"), + ("reference_sequence", "url"), + ] + return array_name == "metadata_schema" or (group_name, array_name) in string_fields + + +def _is_metadata_field(group_name, array_name): + return group_name in (None, "reference_sequence") and array_name == "metadata" + + +def _convert_metadata_field(array): + # Handle backward compatibility: <=0.2.5 versions stored metadata as int8 + # which can have negative values outside the valid byte range (0-255) + value = array[:].astype("uint8") + return value.tobytes() + + def decompress_zarr(root): coordinates = root["coordinates"][:] dict_repr = {"sequence_length": root.attrs["sequence_length"]} quantised_arrays = [ - "edges/left", - "edges/right", - "migrations/left", - "migrations/right", - "sites/position", + ("edges", "left"), + ("edges", "right"), + ("migrations", "left"), + 
("migrations", "right"), + ("sites", "position"), ] - for key, value in compat.group_items(root): - if hasattr(value, "members") or hasattr(value, "items"): - # This is a zarr Group, iterate over its contents - for sub_key, sub_value in compat.group_items(value): - if f"{key}/{sub_key}" in quantised_arrays: - dict_repr.setdefault(key, {})[sub_key] = coordinates[sub_value] - elif sub_key.endswith("metadata_schema") or (key, sub_key) in [ - ("reference_sequence", "data"), - ("reference_sequence", "url"), - ]: - dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value).decode( - "utf-8" - ) - elif (key, sub_key) == ("reference_sequence", "metadata"): - dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value) - else: - dict_repr.setdefault(key, {})[sub_key] = sub_value - else: - # This is an array - if key.endswith("metadata_schema") or key == "time_units": - dict_repr[key] = bytes(value).decode("utf-8") - elif key.endswith("metadata"): - # Handle backward compatibility: <=0.2.5 versions stored metadata as int8 - # which can have negative values outside the valid byte range (0-255) - try: - dict_repr[key] = bytes(value) - except ValueError: - uint8_value = np.array(value, dtype=np.int8).astype(np.uint8) - dict_repr[key] = bytes(uint8_value) + for group_name, group in root.groups(): + group_dict = {} + for array_name, array in group.arrays(): + if (group_name, array_name) in quantised_arrays: + value = coordinates[array[:]] + elif _is_string_field(group_name, array_name): + value = _convert_string_field(array) + elif _is_metadata_field(group_name, array_name): + value = _convert_metadata_field(array) else: - dict_repr[key] = value + # Otherwise, pass the zarr array through directly to + # tskit which will convert to appropriate numpy array + value = array + group_dict[array_name] = value + dict_repr[group_name] = group_dict + + for array_name, array in root.arrays(): + if array_name == "coordinates": + continue + if _is_string_field(None, array_name): + value = _convert_string_field(array) + elif _is_metadata_field(None, array_name): + value = _convert_metadata_field(array) + else: + value = array + dict_repr[array_name] = value + return tskit.TableCollection.fromdict(dict_repr).tree_sequence()