Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
--------------------
[0.2.7] - 2026-XX-XX
--------------------

- Add support for very large columns and add the ``chunk_size`` parameter.
(jeromekelleher, #119).

--------------------
[0.2.6] - 2025-09-18
--------------------
Expand Down
26 changes: 25 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2019 Tskit Developers
# Copyright (c) 2019-2026 Tskit Developers
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -34,6 +34,7 @@

import tszip
import tszip.cli as cli
from tszip import compat


def get_stdout_for_pytest():
Expand Down Expand Up @@ -98,6 +99,7 @@ def test_default_values(self):
self.assertEqual(args.decompress, False)
self.assertEqual(args.list, False)
self.assertEqual(args.stdout, False)
self.assertEqual(args.chunk_size, tszip.DEFAULT_CHUNK_SIZE)
self.assertEqual(args.variants_only, False)
self.assertEqual(args.suffix, ".tsz")

Expand All @@ -123,6 +125,14 @@ def test_decompress(self):
args = parser.parse_args([infile, "--decompress"])
self.assertTrue(args.decompress)

def test_chunk_size(self):
    """Check that the short and long chunk-size options both parse to 1234.

    The parser is built with ``cli.tszip_cli_parser()`` and the value is
    read back from ``args.chunk_size`` after each ``parse_args`` call.
    """
    parser = cli.tszip_cli_parser()
    infile = "tmp.trees.tsz"
    args = parser.parse_args([infile, "-C", "1234"])
    self.assertEqual(args.chunk_size, 1234)
    args = parser.parse_args([infile, "--chunk-size=1234"])
    # assertTrue(value, msg) would treat 1234 as the failure message and
    # pass for any truthy value, so the comparison must be assertEqual.
    self.assertEqual(args.chunk_size, 1234)


class TestCli(unittest.TestCase):
"""
Expand Down Expand Up @@ -248,6 +258,20 @@ def test_variants_only(self):
G2 = self.ts.genotype_matrix()
self.assertTrue(np.array_equal(G1, G2))

def test_chunk_size(self):
    """Compress through the CLI with ``--chunk-size=20`` and check the result.

    Verifies that the input file is replaced by a ``.tsz`` file, that the
    decompressed tables equal those of ``self.ts``, and that every array
    found under the root's groups reports chunks of ``(20,)``.
    """
    self.assertTrue(self.trees_path.exists())
    self.run_tszip([str(self.trees_path), "--chunk-size=20"])
    self.assertFalse(self.trees_path.exists())

    compressed_path = pathlib.Path(str(self.trees_path) + ".tsz")
    self.assertTrue(compressed_path.exists())

    round_tripped = tszip.decompress(compressed_path)
    self.assertEqual(round_tripped.tables, self.ts.tables)

    # Re-open the written file directly to inspect the stored chunking.
    zip_store = compat.create_zip_store(str(compressed_path), mode="r")
    zarr_root = compat.create_zarr_group(store=zip_store)
    for _, group in zarr_root.groups():
        for _, array in group.arrays():
            assert array.chunks == (20,)

def test_keep(self):
self.assertTrue(self.trees_path.exists())
self.run_tszip([str(self.trees_path), "--keep"])
Expand Down
51 changes: 48 additions & 3 deletions tests/test_compression.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2021 Tskit Developers
# Copyright (c) 2021-2026 Tskit Developers
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -104,6 +104,10 @@ class RoundTripMixin:
Set of example tree sequences that we should be able to round trip.
"""

def test_minimal(self):
    """Round-trip the tree sequence from ``tskit.Tree.generate_balanced(2)``."""
    balanced_tree = tskit.Tree.generate_balanced(2)
    self.verify(balanced_tree.tree_sequence)

def test_small_msprime_no_recomb(self):
ts = msprime.simulate(10, mutation_rate=2, random_seed=2)
self.assertGreater(ts.num_sites, 2)
Expand Down Expand Up @@ -189,7 +193,7 @@ def test_small_msprime_complex_mutations(self):

def test_ref_seq(self):
ts = msprime.simulate(10, recombination_rate=1, mutation_rate=2, random_seed=2)
tables = ts.tables
tables = ts.dump_tables()
tables.reference_sequence.metadata_schema = (
tskit.MetadataSchema.permissive_json()
)
Expand Down Expand Up @@ -307,7 +311,12 @@ def test_provenance(self):
root = compat.create_zarr_group(store=store)
self.assertEqual(
root.attrs["provenance"],
provenance.get_provenance_dict({"variants_only": variants_only}),
provenance.get_provenance_dict(
{
"variants_only": variants_only,
"chunk_size": compression.DEFAULT_CHUNK_SIZE,
}
),
)

def write_file(self, attrs, path):
Expand Down Expand Up @@ -526,3 +535,39 @@ def test_issue95_metadata_dtype_regression(self):
assert len(ts_decompressed.metadata["reverse_node_map"]) == len(
ts_original.metadata["reverse_node_map"]
)


class TestChunkSize:
    """Tests for the ``chunk_size`` argument of ``tszip.compress``."""

    @pytest.mark.parametrize(
        "chunk_size", [1, 2, 1000, 2**21, np.array([100], dtype=int)[0]]
    )
    def test_good_chunks(self, tmpdir, chunk_size):
        """Accepted chunk sizes round-trip and are recorded on every array."""
        data_dir = pathlib.Path(__file__).parent / "files"
        original = tskit.load(data_dir / "1.0.0.trees")
        out_path = tmpdir / "out.trees.tsz"
        tszip.compress(original, out_path, chunk_size=chunk_size)
        restored = tszip.decompress(out_path)
        assert original == restored

        # Open the written file directly to inspect the stored chunking.
        zip_store = compat.create_zip_store(str(out_path), mode="r")
        zarr_root = compat.create_zarr_group(store=zip_store)
        for _, group in zarr_root.groups():
            for _, array in group.arrays():
                assert array.chunks == (chunk_size,)

    @pytest.mark.parametrize(
        ["chunk_size", "exception"],
        [
            (0, ValueError),
            (-1, ValueError),
            (1.1, TypeError),
            ("x", TypeError),
            ("10", TypeError),
        ],
    )
    def test_bad_chunks(self, tmpdir, chunk_size, exception):
        """Rejected chunk sizes raise the exception type paired with them."""
        data_dir = pathlib.Path(__file__).parent / "files"
        tree_sequence = tskit.load(data_dir / "1.0.0.trees")
        out_path = tmpdir / "out.trees.tsz"
        with pytest.raises(exception):
            tszip.compress(tree_sequence, out_path, chunk_size=chunk_size)
1 change: 1 addition & 0 deletions tszip/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# SOFTWARE.
from .compression import compress # NOQA
from .compression import decompress # NOQA
from .compression import DEFAULT_CHUNK_SIZE # NOQA
from .compression import load # NOQA
from .compression import print_summary # NOQA
from .provenance import __version__ # NOQA
12 changes: 11 additions & 1 deletion tszip/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ def tszip_cli_parser():
"-v", "--verbosity", action="count", default=0, help="Increase the verbosity"
)
parser.add_argument("files", nargs="+", help="The files to compress/decompress.")
parser.add_argument(
"-C",
"--chunk-size",
type=int,
default=tszip.DEFAULT_CHUNK_SIZE,
help="Sets the size of array chunks to be compressed to the specified "
f"number of elements. Default={tszip.DEFAULT_CHUNK_SIZE}",
)
parser.add_argument(
"--variants-only",
action="store_true",
Expand Down Expand Up @@ -125,7 +133,9 @@ def run_compress(args):
check_output(outfile, args)
if args.stdout:
outfile = get_stdout()
tszip.compress(ts, outfile, variants_only=args.variants_only)
tszip.compress(
ts, outfile, variants_only=args.variants_only, chunk_size=args.chunk_size
)
remove_input(infile, args)


Expand Down
6 changes: 0 additions & 6 deletions tszip/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,6 @@ def create_empty_array(
def get_nbytes_stored(array):
    """Return the result of calling ``array.nbytes_stored()``."""
    stored = array.nbytes_stored()
    return stored

def group_items(group):
return group.members()

def visit_arrays(group, visitor):
for array in group.array_values():
visitor(array)
Expand Down Expand Up @@ -86,8 +83,5 @@ def create_empty_array(
def get_nbytes_stored(array):
    """Return the value of the ``nbytes_stored`` attribute of ``array``."""
    stored = array.nbytes_stored
    return stored

def group_items(group):
return group.items()

def visit_arrays(group, visitor):
    """Hand ``visitor`` to ``group.visitvalues`` and discard its result."""
    walk = group.visitvalues
    walk(visitor)
Loading