From 0fe44b91271004c2f74dba3829c6eece749291ef Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 16:25:53 +0000
Subject: [PATCH 1/9] move environment file

move from essd/ to main directory, add gitpython as a dependency
---
 essd/environment.yml => environment.yml | 1 +
 1 file changed, 1 insertion(+)
 rename essd/environment.yml => environment.yml (91%)

diff --git a/essd/environment.yml b/environment.yml
similarity index 91%
rename from essd/environment.yml
rename to environment.yml
index f3be0f3..01fb5e0 100644
--- a/essd/environment.yml
+++ b/environment.yml
@@ -10,3 +10,4 @@ dependencies:
   - squarify
   - geoutils
   - cartopy
+  - gitpython

From 4146fbdca4e11b4f951c335a81f9c3fa4809895d Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 16:51:57 +0000
Subject: [PATCH 2/9] add delayed-assert

---
 environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/environment.yml b/environment.yml
index 01fb5e0..d10e680 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,3 +11,4 @@ dependencies:
   - geoutils
   - cartopy
   - gitpython
+  - delayed-assert

From b5cda57100ed914f040aaf3e75f163c1827fc362 Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 17:17:12 +0000
Subject: [PATCH 3/9] add test for new csv files and gpkg files

- test whether new/updated csv files have correct columns
- test whether all geopackage files (a) exist, and (b) have the correct columns
---
 scripts/test_submission.py | 69 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 scripts/test_submission.py

diff --git a/scripts/test_submission.py b/scripts/test_submission.py
new file mode 100644
index 0000000..313c06d
--- /dev/null
+++ b/scripts/test_submission.py
@@ -0,0 +1,69 @@
+from pathlib import Path
+from glob import glob
+from delayed_assert import expect, assert_expectations
+from git import Repo
+import pandas as pd
+import geopandas as gpd
+
+
+def test_column_names():
+    """
+    Checks that each new/updated file in dataset/csv and dataset/contributor_files has the correct columns, based on
+    what is found in lake_term_data_template.csv:
+
+    rgi_id,lake_cat,image_id,image_date,inventory_doi,contributor
+
+    Raises an AssertionError if one or more files is missing one or more columns.
+
+    """
+    # list every csv file (searched recursively) under dataset/csv/ and dataset/contributor_files/
+    contribs = [f"dataset/csv/{fn}" for fn in glob('**/*.csv',
+                                                   root_dir='dataset/csv',
+                                                   recursive=True)] \
+               + [f"dataset/contributor_files/{fn}" for fn in glob('**/*.csv',
+                                                                   root_dir='dataset/contributor_files',
+                                                                   recursive=True)]
+
+    # get a list of "new" or changed files from the current branch
+    repo = Repo('.')
+    diff = [item.a_path for item in repo.index.diff('main')]
+
+    new_contribs = list(set(contribs) & set(diff))
+
+    print(f"Found {len(new_contribs)} new or changed submissions: ")
+    for fn in new_contribs:
+        print(fn)
+
+    # required columns: the header row of the template csv file
+    req_cols = pd.read_csv('lake_term_data_template.csv').columns
+
+    for fn_csv in new_contribs:
+        csv = pd.read_csv(fn_csv)
+        # record an expectation per required column; failures are raised together by assert_expectations() below
+        for col in req_cols:
+            expect(col in csv.columns, f"{col} not found in {fn_csv}: \n{list(csv.columns)}")
+
+    assert_expectations()
+
+
+def test_geopackage():
+    """
+    Tests whether all geopackage files (a) exist for each region, and (b) have the correct column names.
+    """
+    req_cols = pd.read_csv('lake_term_data_template.csv').columns
+    regions = [fn.split('_lakeflag.csv')[0] for fn in sorted(glob('*lakeflag.csv', root_dir='dataset/csv'))]
+
+    for reg in regions:
+        expect(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg").exists(),
+               f"geopackage file not found in dataset/lakeflags/ for {reg}")
+        expect(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg").exists(),
+               f"geopackage file not found in dataset/outlines/ for {reg}")
+        # NOTE(review): read_file presumably raises for a missing file, before assert_expectations() runs - confirm
+        lakeflag = gpd.read_file(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg"))
+        outlines = gpd.read_file(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg"))
+        # every column of the template file must be present in both geopackage files
+        for col in req_cols:
+            expect(col in lakeflag.columns, f"{col} not found in {reg} lakeflag file: \n{list(lakeflag.columns)}")
+            expect(col in outlines.columns, f"{col} not found in {reg} outlines file: \n{list(outlines.columns)}")
+
+    assert_expectations()

From f8c22fdc301f88ae85559fb471fb1ae020bf267e Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 19:39:18 +0000
Subject: [PATCH 4/9] test for lake_cat values

- re-name test_column_names -> test_columns
- add list of rgi regions rather than parsing filenames
- add test to compare lake_cat values in csv files and geopackage files
---
 scripts/test_submission.py | 49 +++++++++++++++++++++++++++++++++++---
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/scripts/test_submission.py b/scripts/test_submission.py
index 313c06d..6359329 100644
--- a/scripts/test_submission.py
+++ b/scripts/test_submission.py
@@ -6,7 +6,28 @@
 import geopandas as gpd
 
 
-def test_column_names():
+rgi_regions = ['RGI2000-v7.0-G-01_alaska',  # file name prefix of each region, e.g. {prefix}_lakeflag.csv
+    'RGI2000-v7.0-G-02_western_canada_usa',
+    'RGI2000-v7.0-G-03_arctic_canada_north',
+    'RGI2000-v7.0-G-04_arctic_canada_south',
+    'RGI2000-v7.0-G-05_greenland_periphery',
+    'RGI2000-v7.0-G-06_iceland',
+    'RGI2000-v7.0-G-07_svalbard_jan_mayen',
+    'RGI2000-v7.0-G-08_scandinavia',
+    'RGI2000-v7.0-G-09_russian_arctic',
+    'RGI2000-v7.0-G-10_north_asia',
+    'RGI2000-v7.0-G-11_central_europe',
+    'RGI2000-v7.0-G-12_caucasus_middle_east',
+    'RGI2000-v7.0-G-13_central_asia',
+    'RGI2000-v7.0-G-14_south_asia_west',
+    'RGI2000-v7.0-G-15_south_asia_east',
+    'RGI2000-v7.0-G-16_low_latitudes',
+    'RGI2000-v7.0-G-17_southern_andes',
+    'RGI2000-v7.0-G-18_new_zealand',
+    'RGI2000-v7.0-G-19_subantarctic_antarctic_islands'
+]
+
+def test_columns():
     """
     Checks that each new/updated file in dataset/csv and dataset/contributor_files has the correct columns, based on
     what is found in lake_term_data_template.csv:
@@ -51,9 +72,8 @@ def test_geopackage():
     Tests whether all geopackage files (a) exist for each region, and (b) have the correct column names.
     """
     req_cols = pd.read_csv('lake_term_data_template.csv').columns
-    regions = [fn.split('_lakeflag.csv')[0] for fn in sorted(glob('*lakeflag.csv', root_dir='dataset/csv'))]
 
-    for reg in regions:
+    for reg in rgi_regions:
         expect(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg").exists(),
                f"geopackage file not found in dataset/lakeflags/ for {reg}")
         expect(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg").exists(),
@@ -67,3 +87,26 @@ def test_geopackage():
             expect(col in outlines.columns, f"{col} not found in {reg} outlines file: \n{list(outlines.columns)}")
 
     assert_expectations()
+
+
+def test_lake_cat():
+    """
+    Tests whether the lake_cat value is the same in the csv tables and the geopackage files for all regions.
+    For each region in rgi_regions, the csv table is compared with the lakeflag file and with the outlines file.
+    """
+    for reg in rgi_regions:
+        attributes = pd.read_csv(Path('dataset', 'csv', f"{reg}_lakeflag.csv")).set_index('rgi_id')
+
+        lakeflag = gpd.read_file(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg")).set_index('rgi_id')
+        outlines = gpd.read_file(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg")).set_index('rgi_id')
+
+        # can compare these directly, as they should be identically indexed (and if not, it's an error)
+        expect((attributes['lake_cat'] == lakeflag['lake_cat']).all(),
+               f"lake_cat doesn't match for {reg} points file.")
+        # NOTE(review): == on two Series presumably raises unless both indexes match, order included - confirm
+        # have to first select from attributes where index is also in outlines
+        same_index = attributes.index[attributes.index.isin(outlines.index)]
+        expect((attributes.loc[same_index, 'lake_cat'] == outlines['lake_cat']).all(),
+               f"lake_cat doesn't match for {reg} outlines.")
+
+    assert_expectations()

From aa1061cd8d25dd025c7d54a4c01ad319615eb9af Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 20:37:33 +0000
Subject: [PATCH 5/9] add pytest dependency

---
 environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/environment.yml b/environment.yml
index d10e680..b321715 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,5 +10,6 @@ dependencies:
   - squarify
   - geoutils
   - cartopy
+  - pytest
   - gitpython
   - delayed-assert

From 3d8de119bea603012670b83223ce7fdce1c9dfd1 Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 20:49:54 +0000
Subject: [PATCH 6/9] add workflow action for running pytest

---
 .github/workflows/dataset_checks.yml | 33 ++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 .github/workflows/dataset_checks.yml

diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml
new file mode 100644
index 0000000..e3ff9d3
--- /dev/null
+++ b/.github/workflows/dataset_checks.yml
@@ -0,0 +1,33 @@
+# This workflow will install Python dependencies and run the dataset checks (pytest) with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: dataset checks
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main", "essd_review" ]
+
+permissions:
+  contents: read
+
+jobs:
+  test:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pytest pandas geopandas delayed-assert gitpython
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Run Dataset Checks
+      run: |
+        pytest 

From 4403d94c346e681dc7c4dc12b4b7212542654ee2 Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 21:00:05 +0000
Subject: [PATCH 7/9] use v5, checkout all tags, branches

---
 .github/workflows/dataset_checks.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml
index e3ff9d3..5305e98 100644
--- a/.github/workflows/dataset_checks.yml
+++ b/.github/workflows/dataset_checks.yml
@@ -18,7 +18,9 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v5
+      with:
+        fetch-depth: 0
     - name: Set up Python 3.10
       uses: actions/setup-python@v3
       with:

From 8b5be26319ca2556fb0c804a76cdc4845122f598 Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 21:17:11 +0000
Subject: [PATCH 8/9] undo checkout all tags, branches, compare to origin/main

---
 .github/workflows/dataset_checks.yml | 2 --
 scripts/test_submission.py           | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml
index 5305e98..d422c8e 100644
--- a/.github/workflows/dataset_checks.yml
+++ b/.github/workflows/dataset_checks.yml
@@ -19,8 +19,6 @@ jobs:
 
     steps:
     - uses: actions/checkout@v5
-      with:
-        fetch-depth: 0
     - name: Set up Python 3.10
       uses: actions/setup-python@v3
       with:
diff --git a/scripts/test_submission.py b/scripts/test_submission.py
index 6359329..a9ad057 100644
--- a/scripts/test_submission.py
+++ b/scripts/test_submission.py
@@ -47,7 +47,7 @@ def test_columns():
 
     # get a list of "new" or changed files from the current branch
     repo = Repo('.')
-    diff = [item.a_path for item in repo.index.diff('main')]
+    diff = [item.a_path for item in repo.index.diff('origin/main')]
 
     new_contribs = list(set(contribs) & set(diff))
 

From d6d4848cfe606aab69d96f034013d3c4fab1d857 Mon Sep 17 00:00:00 2001
From: Bob McNabb <robertmcnabb@gmail.com>
Date: Sun, 2 Nov 2025 21:18:19 +0000
Subject: [PATCH 9/9] re-do checkout all tags/branches

---
 .github/workflows/dataset_checks.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/dataset_checks.yml b/.github/workflows/dataset_checks.yml
index d422c8e..5305e98 100644
--- a/.github/workflows/dataset_checks.yml
+++ b/.github/workflows/dataset_checks.yml
@@ -19,6 +19,8 @@ jobs:
 
     steps:
     - uses: actions/checkout@v5
+      with:
+        fetch-depth: 0
     - name: Set up Python 3.10
       uses: actions/setup-python@v3
       with: