From 3a3f96069493f216c12cc97e3e534d3b98ec8d8d Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 19 Jan 2026 16:30:43 +0000 Subject: [PATCH 1/4] Fix tests for tskit 1.0 --- tests/test_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index 2836d58..6273b0e 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -189,7 +189,7 @@ def test_small_msprime_complex_mutations(self): def test_ref_seq(self): ts = msprime.simulate(10, recombination_rate=1, mutation_rate=2, random_seed=2) - tables = ts.tables + tables = ts.dump_tables() tables.reference_sequence.metadata_schema = ( tskit.MetadataSchema.permissive_json() ) From c77b43941523d8eea7a85e5acd16875f007ade99 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 19 Jan 2026 17:01:28 +0000 Subject: [PATCH 2/4] Add chunk size to the Python API and test --- tests/test_compression.py | 36 +++++++++++++++++++++++++++++++ tszip/compression.py | 45 ++++++++++++++++++++++++++------------- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index 6273b0e..609036a 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -526,3 +526,39 @@ def test_issue95_metadata_dtype_regression(self): assert len(ts_decompressed.metadata["reverse_node_map"]) == len( ts_original.metadata["reverse_node_map"] ) + + +class TestChunkSize: + @pytest.mark.parametrize( + "chunk_size", [1, 2, 1000, 2**21, np.array([100], dtype=int)[0]] + ) + def test_good_chunks(self, tmpdir, chunk_size): + files = pathlib.Path(__file__).parent / "files" + ts1 = tskit.load(files / "1.0.0.trees") + path = tmpdir / "out.trees.tsz" + tszip.compress(ts1, path, chunk_size=chunk_size) + ts2 = tszip.decompress(path) + assert ts1 == ts2 + + store = compat.create_zip_store(path, mode="r") + root = compat.create_zarr_group(store=store) + for _, g in root.groups(): + for _, a in g.arrays(): + assert a.chunks == (chunk_size,) + + @pytest.mark.parametrize( + ["chunk_size", "exception"], + [ + (0, ValueError), + (-1, ValueError), + (1.1, TypeError), + ("x", TypeError), + ("10", TypeError), + ], + ) + def test_bad_chunks(self, tmpdir, chunk_size, exception): + files = pathlib.Path(__file__).parent / "files" + ts = tskit.load(files / "1.0.0.trees") + path = tmpdir / "out.trees.tsz" + with pytest.raises(exception): + tszip.compress(ts, path, chunk_size=chunk_size) diff --git a/tszip/compression.py b/tszip/compression.py index 380b65f..b7b3851 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2019 Tskit Developers +# Copyright (c) 2019-2026 Tskit Developers # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,7 @@ import tempfile import warnings import zipfile +import numbers import humanize import numcodecs @@ -73,7 +74,7 @@ def minimal_dtype(array): return dtype -def compress(ts, destination, variants_only=False): +def compress(ts, destination, variants_only=False, *, chunk_size=None): """ Compresses the specified tree sequence and writes it to the specified path or file-like object. By default, fully lossless compression is used so that @@ -87,6 +88,9 @@ def compress(ts, destination, variants_only=False): we should write the compressed file to. 
     :param bool variants_only: If True, discard all information not
         necessary to represent the variants in the input file.
+    :param int chunk_size: The number of array elements per chunk in the
+        Zarr encoding. Defaults to 2**20 (1_048_576), resulting in
+        each encoded chunk of 4-byte integer data being 4MiB.
     """
     try:
         destination = pathlib.Path(destination).resolve()
@@ -100,15 +104,15 @@
         logging.debug(f"Writing to temporary file {filename}")
         with compat.create_zip_store(filename, mode="w") as store:
             root = compat.create_zarr_group(store=store)
-            compress_zarr(ts, root, variants_only=variants_only)
+            compress_zarr(ts, root, variants_only=variants_only, chunk_size=chunk_size)
         if is_path:
             os.replace(filename, destination)
             logging.info(f"Wrote {destination}")
         else:
             # Assume that destination is a file-like object open in "wb" mode.
             with open(filename, "rb") as source:
-                chunk_size = 2**10  # 1MiB
-                for chunk in iter(functools.partial(source.read, chunk_size), b""):
+                read_chunk_size = 2**10  # 1KiB
+                for chunk in iter(functools.partial(source.read, read_chunk_size), b""):
                     destination.write(chunk)
 
 
@@ -131,16 +135,14 @@ class Column:
     """
     A single column that is stored in the compressed output.
     """
-    def __init__(self, name, array, delta_filter=False):
+    def __init__(self, name, array, chunk_size, delta_filter=False):
         self.name = name
         self.array = array
         self.delta_filter = delta_filter
+        self.chunks = (chunk_size,)
 
     def compress(self, root, compressor):
         shape = self.array.shape
-        chunks = shape
-        if shape[0] == 0:
-            chunks = (1,)
         dtype = minimal_dtype(self.array)
         filters = None
         if self.delta_filter:
@@ -150,7 +152,7 @@ class Column:
             self.name,
             shape=shape,
             dtype=dtype,
-            chunks=chunks,
+            chunks=self.chunks,
             filters=filters,
             compressor=compressor,
         )
@@ -170,8 +172,17 @@
         )
 
 
-def compress_zarr(ts, root, variants_only=False):
-    provenance_dict = provenance.get_provenance_dict({"variants_only": variants_only})
+def compress_zarr(ts, root, variants_only=False, chunk_size=None):
+    provenance_dict = provenance.get_provenance_dict(
+        {"variants_only": variants_only, "chunk_size": chunk_size}
+    )
+
+    if chunk_size is None:
+        chunk_size = 2**20
+    if not isinstance(chunk_size, numbers.Integral):
+        raise TypeError("Chunk size must be an integer")
+    if chunk_size < 1:
+        raise ValueError("Storage chunk size must be >= 1")
 
     if variants_only:
         logging.info("Using lossy variants-only compression")
@@ -254,9 +265,13 @@
         cname="zstd", clevel=9, shuffle=numcodecs.Blosc.SHUFFLE
     )
     for name, data in columns.items():
-        Column(
-            name, data, delta_filter="_offset" in name or name in delta_filter_cols
-        ).compress(root, compressor)
+        col = Column(
+            name,
+            data,
+            chunk_size,
+            delta_filter="_offset" in name or name in delta_filter_cols,
+        )
+        col.compress(root, compressor)
 
 
 def check_format(root):

From 6ee78e70da82ee5b5ff7f2152c5d5fe6d16b0a28 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher
Date: Tue, 20 Jan 2026 09:40:52 +0000
Subject: [PATCH 3/4] Added chunking to CLI

Also switch to 32MiB chunks

Closes #69
Closes #118
---
 CHANGELOG.rst             |  7 +++++++
 tests/test_cli.py         | 26 +++++++++++++++++++++++++-
 tests/test_compression.py | 11 ++++++++---
 tszip/__init__.py         |  1 +
 tszip/cli.py              | 12 +++++++++++-
 tszip/compression.py      | 21 +++++++++++++--------
 6 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 006ba13..1597712 100644
---
a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,3 +1,10 @@
+--------------------
+[0.2.7] - 2026-XX-XX
+--------------------
+
+- Add support for very large columns and add the ``chunk_size`` parameter.
+  (jeromekelleher, #119).
+
 --------------------
 [0.2.6] - 2025-09-18
 --------------------
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d5c1734..0d8ab0b 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2019 Tskit Developers
+# Copyright (c) 2019-2026 Tskit Developers
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -34,6 +34,7 @@
 
 import tszip
 import tszip.cli as cli
+from tszip import compat
 
 
 def get_stdout_for_pytest():
@@ -98,6 +99,7 @@ def test_default_values(self):
         self.assertEqual(args.decompress, False)
         self.assertEqual(args.list, False)
         self.assertEqual(args.stdout, False)
+        self.assertEqual(args.chunk_size, tszip.DEFAULT_CHUNK_SIZE)
         self.assertEqual(args.variants_only, False)
         self.assertEqual(args.suffix, ".tsz")
 
@@ -123,6 +125,14 @@ def test_decompress(self):
         args = parser.parse_args([infile, "--decompress"])
         self.assertTrue(args.decompress)
 
+    def test_chunk_size(self):
+        parser = cli.tszip_cli_parser()
+        infile = "tmp.trees.tsz"
+        args = parser.parse_args([infile, "-C", "1234"])
+        self.assertEqual(args.chunk_size, 1234)
+        args = parser.parse_args([infile, "--chunk-size=1234"])
+        self.assertEqual(args.chunk_size, 1234)
+
 
 class TestCli(unittest.TestCase):
     """
@@ -248,6 +258,20 @@ def test_variants_only(self):
         G2 = self.ts.genotype_matrix()
         self.assertTrue(np.array_equal(G1, G2))
 
+    def test_chunk_size(self):
+        self.assertTrue(self.trees_path.exists())
+        self.run_tszip([str(self.trees_path), "--chunk-size=20"])
+        self.assertFalse(self.trees_path.exists())
+        outpath = pathlib.Path(str(self.trees_path) + ".tsz")
+        self.assertTrue(outpath.exists())
+        ts = tszip.decompress(outpath)
+        self.assertEqual(ts.tables, self.ts.tables)
+        store = compat.create_zip_store(str(outpath), mode="r")
+        root = compat.create_zarr_group(store=store)
+        for _, g in root.groups():
+            for _, a in g.arrays():
+                assert a.chunks == (20,)
+
     def test_keep(self):
         self.assertTrue(self.trees_path.exists())
         self.run_tszip([str(self.trees_path), "--keep"])
diff --git a/tests/test_compression.py b/tests/test_compression.py
index 609036a..a3b401c 100644
--- a/tests/test_compression.py
+++ b/tests/test_compression.py
@@ -1,6 +1,6 @@
 # MIT License
 #
-# Copyright (c) 2021 Tskit Developers
+# Copyright (c) 2021-2026 Tskit Developers
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -307,7 +307,12 @@ def test_provenance(self):
         root = compat.create_zarr_group(store=store)
         self.assertEqual(
             root.attrs["provenance"],
-            provenance.get_provenance_dict({"variants_only": variants_only}),
+            provenance.get_provenance_dict(
+                {
+                    "variants_only": variants_only,
+                    "chunk_size": compression.DEFAULT_CHUNK_SIZE,
+                }
+            ),
         )
 
     def write_file(self, attrs, path):
@@ -540,7 +545,7 @@ def test_good_chunks(self, tmpdir, chunk_size):
         ts2 = tszip.decompress(path)
         assert ts1 == ts2
 
-        store = compat.create_zip_store(path, mode="r")
+        store = compat.create_zip_store(str(path), mode="r")
         root = compat.create_zarr_group(store=store)
         for _, g in root.groups():
             for _, a in g.arrays():
diff --git a/tszip/__init__.py b/tszip/__init__.py
index 9d8feec..f144c70 100644
---
a/tszip/__init__.py +++ b/tszip/__init__.py @@ -21,6 +21,7 @@ # SOFTWARE. from .compression import compress # NOQA from .compression import decompress # NOQA +from .compression import DEFAULT_CHUNK_SIZE # NOQA from .compression import load # NOQA from .compression import print_summary # NOQA from .provenance import __version__ # NOQA diff --git a/tszip/cli.py b/tszip/cli.py index c979a6f..1173782 100644 --- a/tszip/cli.py +++ b/tszip/cli.py @@ -64,6 +64,14 @@ def tszip_cli_parser(): "-v", "--verbosity", action="count", default=0, help="Increase the verbosity" ) parser.add_argument("files", nargs="+", help="The files to compress/decompress.") + parser.add_argument( + "-C", + "--chunk-size", + type=int, + default=tszip.DEFAULT_CHUNK_SIZE, + help="Sets the size of array chunks to be compressed to the specified " + f"number of elements. Default={tszip.DEFAULT_CHUNK_SIZE}", + ) parser.add_argument( "--variants-only", action="store_true", @@ -125,7 +133,9 @@ def run_compress(args): check_output(outfile, args) if args.stdout: outfile = get_stdout() - tszip.compress(ts, outfile, variants_only=args.variants_only) + tszip.compress( + ts, outfile, variants_only=args.variants_only, chunk_size=args.chunk_size + ) remove_input(infile, args) diff --git a/tszip/compression.py b/tszip/compression.py index b7b3851..82e8f10 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -26,12 +26,12 @@ import functools import json import logging +import numbers import os import pathlib import tempfile import warnings import zipfile -import numbers import humanize import numcodecs @@ -48,6 +48,10 @@ FORMAT_NAME = "tszip" FORMAT_VERSION = [1, 0] +# ~8 million elements, giving 32MiB chunks to be compressed +# for most columns +DEFAULT_CHUNK_SIZE = 8 * 2**20 + def minimal_dtype(array): """ @@ -89,8 +93,8 @@ def compress(ts, destination, variants_only=False, *, chunk_size=None): :param bool variants_only: If True, discard all information not necessary to represent the variants in the input file. :param int chunk_size: The number of array elements per chunk in the - Zarr encoding. Defaults to 2**20 (1_048_576), resulting in - each encoded chunk of 4-byte integer data being 4MiB. + Zarr encoding. Defaults to 8_388_608, resulting in + each encoded chunk of 4-byte integer data being 32MiB. 
""" try: destination = pathlib.Path(destination).resolve() @@ -173,16 +177,17 @@ def compress(self, root, compressor): def compress_zarr(ts, root, variants_only=False, chunk_size=None): - provenance_dict = provenance.get_provenance_dict( - {"variants_only": variants_only, "chunk_size": chunk_size} - ) - if chunk_size is None: - chunk_size = 2**20 + chunk_size = DEFAULT_CHUNK_SIZE if not isinstance(chunk_size, numbers.Integral): raise TypeError("Chunk size must be an integer") if chunk_size < 1: raise ValueError("Storage chunk size must be >= 1") + chunk_size = int(chunk_size) # Avoid issues with JSON serialisation + + provenance_dict = provenance.get_provenance_dict( + {"variants_only": variants_only, "chunk_size": chunk_size} + ) if variants_only: logging.info("Using lossy variants-only compression") From ae0178ab755897ad40459f8234a89adbd6e49cac Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 4 Feb 2026 17:02:40 +0000 Subject: [PATCH 4/4] Simplify decode logic Also improve performance on Zarr v3 and large metadata --- tests/test_compression.py | 4 ++ tszip/compat.py | 6 --- tszip/compression.py | 91 ++++++++++++++++++++++++--------------- 3 files changed, 60 insertions(+), 41 deletions(-) diff --git a/tests/test_compression.py b/tests/test_compression.py index a3b401c..fade286 100644 --- a/tests/test_compression.py +++ b/tests/test_compression.py @@ -104,6 +104,10 @@ class RoundTripMixin: Set of example tree sequences that we should be able to round trip. """ + def test_minimal(self): + ts = tskit.Tree.generate_balanced(2).tree_sequence + self.verify(ts) + def test_small_msprime_no_recomb(self): ts = msprime.simulate(10, mutation_rate=2, random_seed=2) self.assertGreater(ts.num_sites, 2) diff --git a/tszip/compat.py b/tszip/compat.py index 62e859f..750a880 100644 --- a/tszip/compat.py +++ b/tszip/compat.py @@ -56,9 +56,6 @@ def create_empty_array( def get_nbytes_stored(array): return array.nbytes_stored() - def group_items(group): - return group.members() - def visit_arrays(group, visitor): for array in group.array_values(): visitor(array) @@ -86,8 +83,5 @@ def create_empty_array( def get_nbytes_stored(array): return array.nbytes_stored - def group_items(group): - return group.items() - def visit_arrays(group, visitor): group.visitvalues(visitor) diff --git a/tszip/compression.py b/tszip/compression.py index 82e8f10..c9b92d3 100644 --- a/tszip/compression.py +++ b/tszip/compression.py @@ -322,48 +322,69 @@ def load_zarr(path): store.close() +def _convert_string_field(array): + value = array[:] + return value.tobytes().decode("utf-8") + + +def _is_string_field(group_name, array_name): + string_fields = [ + (None, "time_units"), + ("reference_sequence", "data"), + ("reference_sequence", "url"), + ] + return array_name == "metadata_schema" or (group_name, array_name) in string_fields + + +def _is_metadata_field(group_name, array_name): + return group_name in (None, "reference_sequence") and array_name == "metadata" + + +def _convert_metadata_field(array): + # Handle backward compatibility: <=0.2.5 versions stored metadata as int8 + # which can have negative values outside the valid byte range (0-255) + value = array[:].astype("uint8") + return value.tobytes() + + def decompress_zarr(root): coordinates = root["coordinates"][:] dict_repr = {"sequence_length": root.attrs["sequence_length"]} quantised_arrays = [ - "edges/left", - "edges/right", - "migrations/left", - "migrations/right", - "sites/position", + ("edges", "left"), + ("edges", "right"), + ("migrations", "left"), + 
("migrations", "right"), + ("sites", "position"), ] - for key, value in compat.group_items(root): - if hasattr(value, "members") or hasattr(value, "items"): - # This is a zarr Group, iterate over its contents - for sub_key, sub_value in compat.group_items(value): - if f"{key}/{sub_key}" in quantised_arrays: - dict_repr.setdefault(key, {})[sub_key] = coordinates[sub_value] - elif sub_key.endswith("metadata_schema") or (key, sub_key) in [ - ("reference_sequence", "data"), - ("reference_sequence", "url"), - ]: - dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value).decode( - "utf-8" - ) - elif (key, sub_key) == ("reference_sequence", "metadata"): - dict_repr.setdefault(key, {})[sub_key] = bytes(sub_value) - else: - dict_repr.setdefault(key, {})[sub_key] = sub_value - else: - # This is an array - if key.endswith("metadata_schema") or key == "time_units": - dict_repr[key] = bytes(value).decode("utf-8") - elif key.endswith("metadata"): - # Handle backward compatibility: <=0.2.5 versions stored metadata as int8 - # which can have negative values outside the valid byte range (0-255) - try: - dict_repr[key] = bytes(value) - except ValueError: - uint8_value = np.array(value, dtype=np.int8).astype(np.uint8) - dict_repr[key] = bytes(uint8_value) + for group_name, group in root.groups(): + group_dict = {} + for array_name, array in group.arrays(): + if (group_name, array_name) in quantised_arrays: + value = coordinates[array[:]] + elif _is_string_field(group_name, array_name): + value = _convert_string_field(array) + elif _is_metadata_field(group_name, array_name): + value = _convert_metadata_field(array) else: - dict_repr[key] = value + # Otherwise, pass the zarr array through directly to + # tskit which will convert to appropriate numpy array + value = array + group_dict[array_name] = value + dict_repr[group_name] = group_dict + + for array_name, array in root.arrays(): + if array_name == "coordinates": + continue + if _is_string_field(None, array_name): + value = _convert_string_field(array) + elif _is_metadata_field(None, array_name): + value = _convert_metadata_field(array) + else: + value = array + dict_repr[array_name] = value + return tskit.TableCollection.fromdict(dict_repr).tree_sequence()