From 0fe44b91271004c2f74dba3829c6eece749291ef Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 16:25:53 +0000 Subject: [PATCH 1/9] move environment file move from essd/ to main directory, add gitpython as a dependency --- essd/environment.yml => environment.yml | 1 + 1 file changed, 1 insertion(+) rename essd/environment.yml => environment.yml (91%) diff --git a/essd/environment.yml b/environment.yml similarity index 91% rename from essd/environment.yml rename to environment.yml index f3be0f3..01fb5e0 100644 --- a/essd/environment.yml +++ b/environment.yml @@ -10,3 +10,4 @@ dependencies: - squarify - geoutils - cartopy + - gitpython From 4146fbdca4e11b4f951c335a81f9c3fa4809895d Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 16:51:57 +0000 Subject: [PATCH 2/9] add delayed-assert --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 01fb5e0..d10e680 100644 --- a/environment.yml +++ b/environment.yml @@ -11,3 +11,4 @@ dependencies: - geoutils - cartopy - gitpython + - delayed-assert From b5cda57100ed914f040aaf3e75f163c1827fc362 Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 17:17:12 +0000 Subject: [PATCH 3/9] add test for new csv files and gpkg files - test whether new/updated csv files have correct columns - test whether all geopackage files (a) exist, and (b) have the correct columns --- scripts/test_submission.py | 69 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 scripts/test_submission.py diff --git a/scripts/test_submission.py b/scripts/test_submission.py new file mode 100644 index 0000000..313c06d --- /dev/null +++ b/scripts/test_submission.py @@ -0,0 +1,69 @@ +from pathlib import Path +from glob import glob +from delayed_assert import expect, assert_expectations +from git import Repo +import pandas as pd +import geopandas as gpd + + +def test_column_names(): + """ + Checks that each new/updated file in dataset/csv and dataset/contributor_files has the correct columns, based on + what is found in lake_term_data_template.csv: + + rgi_id,lake_cat,image_id,image_date,inventory_doi,contributor + + Raises an AssertionError if one or more files is missing one or more columns. + + """ + # check all files in dataset/contributor_files/ + contribs = [f"dataset/csv/{fn}" for fn in glob('**/*.csv', + root_dir='dataset/csv', + recursive=True)] \ + + [f"dataset/contributor_files/{fn}" for fn in glob('**/*.csv', + root_dir='dataset/contributor_files', + recursive=True)] + + # get a list of "new" or changed files from the current branch + repo = Repo('.') + diff = [item.a_path for item in repo.index.diff('main')] + + new_contribs = list(set(contribs) & set(diff)) + + print(f"Found {len(new_contribs)} new or changed submissions: ") + for fn in new_contribs: + print(fn) + + # required columns + req_cols = pd.read_csv('lake_term_data_template.csv').columns + + for fn_csv in new_contribs: + csv = pd.read_csv(fn_csv) + # first, check that columns are all there. + for col in req_cols: + expect(col in csv.columns, f"{col} not found in {fn_csv}: \n{list(csv.columns)}") + + assert_expectations() + + +def test_geopackage(): + """ + Tests whether all geopackage files (a) exist for each region, and (b) have the correct column names. + """ + req_cols = pd.read_csv('lake_term_data_template.csv').columns + regions = [fn.split('_lakeflag.csv')[0] for fn in sorted(glob('*lakeflag.csv', root_dir='dataset/csv'))] + + for reg in regions: + expect(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg").exists(), + f"geopackage file not found in dataset/lakeflags/ for {reg}") + expect(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg").exists(), + f"geopackage file not found in dataset/outlines/ for {reg}") + + lakeflag = gpd.read_file(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg")) + outlines = gpd.read_file(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg")) + + for col in req_cols: + expect(col in lakeflag.columns, f"{col} not found in {reg} lakeflag file: \n{list(lakeflag.columns)}") + expect(col in outlines.columns, f"{col} not found in {reg} outlines file: \n{list(outlines.columns)}") + + assert_expectations() From f8c22fdc301f88ae85559fb471fb1ae020bf267e Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 19:39:18 +0000 Subject: [PATCH 4/9] test for lake_cat values - re-name test_column_names -> test_columns - add list of rgi regions rather than parsing filenames - add test to compare lake_cat values in csv files and geopackage files --- scripts/test_submission.py | 49 +++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/scripts/test_submission.py b/scripts/test_submission.py index 313c06d..6359329 100644 --- a/scripts/test_submission.py +++ b/scripts/test_submission.py @@ -6,7 +6,28 @@ import geopandas as gpd -def test_column_names(): +rgi_regions = ['RGI2000-v7.0-G-01_alaska', + 'RGI2000-v7.0-G-02_western_canada_usa', + 'RGI2000-v7.0-G-03_arctic_canada_north', + 'RGI2000-v7.0-G-04_arctic_canada_south', + 'RGI2000-v7.0-G-05_greenland_periphery', + 'RGI2000-v7.0-G-06_iceland', + 'RGI2000-v7.0-G-07_svalbard_jan_mayen', + 'RGI2000-v7.0-G-08_scandinavia', + 'RGI2000-v7.0-G-09_russian_arctic', + 'RGI2000-v7.0-G-10_north_asia', + 'RGI2000-v7.0-G-11_central_europe', + 'RGI2000-v7.0-G-12_caucasus_middle_east', + 'RGI2000-v7.0-G-13_central_asia', + 'RGI2000-v7.0-G-14_south_asia_west', + 'RGI2000-v7.0-G-15_south_asia_east', + 'RGI2000-v7.0-G-16_low_latitudes', + 'RGI2000-v7.0-G-17_southern_andes', + 'RGI2000-v7.0-G-18_new_zealand', + 'RGI2000-v7.0-G-19_subantarctic_antarctic_islands' +] + +def test_columns(): """ Checks that each new/updated file in dataset/csv and dataset/contributor_files has the correct columns, based on what is found in lake_term_data_template.csv: @@ -51,9 +72,8 @@ def test_geopackage(): Tests whether all geopackage files (a) exist for each region, and (b) have the correct column names. """ req_cols = pd.read_csv('lake_term_data_template.csv').columns - regions = [fn.split('_lakeflag.csv')[0] for fn in sorted(glob('*lakeflag.csv', root_dir='dataset/csv'))] - for reg in regions: + for reg in rgi_regions: expect(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg").exists(), f"geopackage file not found in dataset/lakeflags/ for {reg}") expect(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg").exists(), @@ -67,3 +87,26 @@ def test_geopackage(): expect(col in outlines.columns, f"{col} not found in {reg} outlines file: \n{list(outlines.columns)}") assert_expectations() + + +def test_lake_cat(): + """ + Tests whether the lake_cat value is the same in the csv tables and the geopackage files for all regions. + """ + + for reg in rgi_regions: + attributes = pd.read_csv(Path('dataset', 'csv', f"{reg}_lakeflag.csv")).set_index('rgi_id') + + lakeflag = gpd.read_file(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg")).set_index('rgi_id') + outlines = gpd.read_file(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg")).set_index('rgi_id') + + # can compare these directly, as they should be identically indexed (and if not, it's an error) + expect((attributes['lake_cat'] == lakeflag['lake_cat']).all(), + f"lake_cat doesn't match for {reg} points file.") + + # have to first select from attributes where index is also in outlines + same_index = attributes.index[attributes.index.isin(outlines.index)] + expect((attributes.loc[same_index, 'lake_cat'] == outlines['lake_cat']).all(), + f"lake_cat doesn't match for {reg} outlines.") + + assert_expectations() From aa1061cd8d25dd025c7d54a4c01ad319615eb9af Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 20:37:33 +0000 Subject: [PATCH 5/9] add pytest dependency --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index d10e680..b321715 100644 --- a/environment.yml +++ b/environment.yml @@ -10,5 +10,6 @@ dependencies: - squarify - geoutils - cartopy + - pytest - gitpython - delayed-assert From 3d8de119bea603012670b83223ce7fdce1c9dfd1 Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 20:49:54 +0000 Subject: [PATCH 6/9] add workflow action for running pytest --- .github/workflows/dataset_checks.yml | 33 ++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/dataset_checks.yml diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml new file mode 100644 index 0000000..e3ff9d3 --- /dev/null +++ b/.github/workflows/dataset_checks.yml @@ -0,0 +1,33 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: dataset checks + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main", "essd_review" ] + +permissions: + contents: read + +jobs: + test: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pandas geopandas delayed-assert gitpython + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Run Dataset Checks + run: | + pytest From 4403d94c346e681dc7c4dc12b4b7212542654ee2 Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 21:00:05 +0000 Subject: [PATCH 7/9] use v5, checkout all tags, branches --- .github/workflows/dataset_checks.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml index e3ff9d3..5305e98 100644 --- a/.github/workflows/dataset_checks.yml +++ b/.github/workflows/dataset_checks.yml @@ -18,7 +18,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 + with: + fetch-depth: 0 - name: Set up Python 3.10 uses: actions/setup-python@v3 with: From 8b5be26319ca2556fb0c804a76cdc4845122f598 Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 21:17:11 +0000 Subject: [PATCH 8/9] undo checkout all tags, branches, compare to origin/main --- .github/workflows/dataset_checks.yml | 2 -- scripts/test_submission.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml index 5305e98..d422c8e 100644 --- a/.github/workflows/dataset_checks.yml +++ b/.github/workflows/dataset_checks.yml @@ -19,8 +19,6 @@ jobs: steps: - uses: actions/checkout@v5 - with: - fetch-depth: 0 - name: Set up Python 3.10 uses: actions/setup-python@v3 with: diff --git a/scripts/test_submission.py b/scripts/test_submission.py index 6359329..a9ad057 100644 --- a/scripts/test_submission.py +++ b/scripts/test_submission.py @@ -47,7 +47,7 @@ def test_columns(): # get a list of "new" or changed files from the current branch repo = Repo('.') - diff = [item.a_path for item in repo.index.diff('main')] + diff = [item.a_path for item in repo.index.diff('origin/main')] new_contribs = list(set(contribs) & set(diff)) From d6d4848cfe606aab69d96f034013d3c4fab1d857 Mon Sep 17 00:00:00 2001 From: Bob McNabb Date: Sun, 2 Nov 2025 21:18:19 +0000 Subject: [PATCH 9/9] re-do checkout all tags/branches --- .github/workflows/dataset_checks.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml index d422c8e..5305e98 100644 --- a/.github/workflows/dataset_checks.yml +++ b/.github/workflows/dataset_checks.yml @@ -19,6 +19,8 @@ jobs: steps: - uses: actions/checkout@v5 + with: + fetch-depth: 0 - name: Set up Python 3.10 uses: actions/setup-python@v3 with: