35 changes: 35 additions & 0 deletions .github/workflows/dataset_checks.yml
@@ -0,0 +1,35 @@
# This workflow will install Python dependencies and run the dataset checks with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: dataset checks

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main", "essd_review" ]

permissions:
contents: read

jobs:
test:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v5
with:
fetch-depth: 0
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest pandas geopandas delayed-assert gitpython
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Run Dataset Checks
run: |
pytest
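To reproduce the CI check before pushing, a minimal sketch (not part of this PR), assuming the dependencies installed in the workflow above are available in the local environment. pytest.main() is the programmatic equivalent of the bare pytest call in the last step:

import sys
import pytest

# pytest.main() returns the same exit code the pytest CLI would;
# non-zero means at least one dataset check failed
sys.exit(pytest.main(["-v"]))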
3 changes: 3 additions & 0 deletions essd/environment.yml → environment.yml
@@ -10,3 +10,6 @@ dependencies:
- squarify
- geoutils
- cartopy
- pytest
- gitpython
- delayed-assert
112 changes: 112 additions & 0 deletions scripts/test_submission.py
@@ -0,0 +1,112 @@
from pathlib import Path
from glob import glob
from delayed_assert import expect, assert_expectations
from git import Repo
import pandas as pd
import geopandas as gpd


rgi_regions = ['RGI2000-v7.0-G-01_alaska',
               'RGI2000-v7.0-G-02_western_canada_usa',
               'RGI2000-v7.0-G-03_arctic_canada_north',
               'RGI2000-v7.0-G-04_arctic_canada_south',
               'RGI2000-v7.0-G-05_greenland_periphery',
               'RGI2000-v7.0-G-06_iceland',
               'RGI2000-v7.0-G-07_svalbard_jan_mayen',
               'RGI2000-v7.0-G-08_scandinavia',
               'RGI2000-v7.0-G-09_russian_arctic',
               'RGI2000-v7.0-G-10_north_asia',
               'RGI2000-v7.0-G-11_central_europe',
               'RGI2000-v7.0-G-12_caucasus_middle_east',
               'RGI2000-v7.0-G-13_central_asia',
               'RGI2000-v7.0-G-14_south_asia_west',
               'RGI2000-v7.0-G-15_south_asia_east',
               'RGI2000-v7.0-G-16_low_latitudes',
               'RGI2000-v7.0-G-17_southern_andes',
               'RGI2000-v7.0-G-18_new_zealand',
               'RGI2000-v7.0-G-19_subantarctic_antarctic_islands'
               ]
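
# for illustration (note added for clarity, not in the submitted file): each
# region string above expands into the file names used by the tests below,
# e.g. for region 01:
#   dataset/csv/RGI2000-v7.0-G-01_alaska_lakeflag.csv
#   dataset/lakeflags/RGI2000-v7.0-G-01_alaska_lakeflag.gpkg
#   dataset/outlines/RGI2000-v7.0-G-01_alaska_laketerminating.gpkg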

def test_columns():
"""
Checks that each new/updated file in dataset/csv and dataset/contributor_files has the correct columns, based on
what is found in lake_term_data_template.csv:

rgi_id,lake_cat,image_id,image_date,inventory_doi,contributor

Raises an AssertionError if one or more files are missing one or more columns.

"""
# gather all csv files in dataset/csv/ and dataset/contributor_files/
contribs = [f"dataset/csv/{fn}" for fn in glob('**/*.csv',
root_dir='dataset/csv',
recursive=True)] \
+ [f"dataset/contributor_files/{fn}" for fn in glob('**/*.csv',
root_dir='dataset/contributor_files',
recursive=True)]

# get the list of new or changed files by diffing the current branch against origin/main
repo = Repo('.')
diff = [item.a_path for item in repo.index.diff('origin/main')]

new_contribs = list(set(contribs) & set(diff))

print(f"Found {len(new_contribs)} new or changed submissions: ")
for fn in new_contribs:
print(fn)

# required columns
req_cols = pd.read_csv('lake_term_data_template.csv').columns

for fn_csv in new_contribs:
csv = pd.read_csv(fn_csv)
# first, check that columns are all there.
for col in req_cols:
expect(col in csv.columns, f"{col} not found in {fn_csv}: \n{list(csv.columns)}")

assert_expectations()
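
# note on the delayed-assert pattern used throughout this file (comment added
# for clarity): expect() records a failing condition without stopping the
# test, and assert_expectations() raises a single AssertionError at the end
# listing every recorded failure, so one run reports all problem files and
# columns rather than stopping at the first.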


def test_geopackage():
"""
Tests whether all geopackage files (a) exist for each region, and (b) have the correct column names.
"""
req_cols = pd.read_csv('lake_term_data_template.csv').columns

for reg in rgi_regions:
expect(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg").exists(),
f"geopackage file not found in dataset/lakeflags/ for {reg}")
expect(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg").exists(),
f"geopackage file not found in dataset/outlines/ for {reg}")

lakeflag = gpd.read_file(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg"))
outlines = gpd.read_file(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg"))

for col in req_cols:
expect(col in lakeflag.columns, f"{col} not found in {reg} lakeflag file: \n{list(lakeflag.columns)}")
expect(col in outlines.columns, f"{col} not found in {reg} outlines file: \n{list(outlines.columns)}")

assert_expectations()
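
# note (added): gpd.read_file() loads the default (first) layer of each
# GeoPackage; the dataset files are assumed here to be single-layer, so no
# layer= argument is needed.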


def test_lake_cat():
"""
Tests whether the lake_cat value is the same in the csv tables and the geopackage files for all regions.
"""

for reg in rgi_regions:
attributes = pd.read_csv(Path('dataset', 'csv', f"{reg}_lakeflag.csv")).set_index('rgi_id')

lakeflag = gpd.read_file(Path('dataset', 'lakeflags', f"{reg}_lakeflag.gpkg")).set_index('rgi_id')
outlines = gpd.read_file(Path('dataset', 'outlines', f"{reg}_laketerminating.gpkg")).set_index('rgi_id')

# can compare these directly, as they should be identically indexed (and if not, it's an error)
expect((attributes['lake_cat'] == lakeflag['lake_cat']).all(),
f"lake_cat doesn't match for {reg} points file.")

# have to first select from attributes where index is also in outlines
same_index = attributes.index[attributes.index.isin(outlines.index)]
expect((attributes.loc[same_index, 'lake_cat'] == outlines['lake_cat']).all(),
f"lake_cat doesn't match for {reg} outlines.")

assert_expectations()
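
A minimal sketch (not part of the PR) of why the outlines comparison above first restricts to the shared index: pandas refuses to compare Series with different index labels, so comparing the full attribute table directly against the smaller outlines table would raise a ValueError rather than return False:

import pandas as pd

attributes = pd.Series([1, 2, 3], index=['g1', 'g2', 'g3'], name='lake_cat')
outlines = pd.Series([1, 2], index=['g1', 'g2'], name='lake_cat')

# attributes == outlines  # ValueError: can only compare identically-labeled Series
same_index = attributes.index[attributes.index.isin(outlines.index)]
print((attributes.loc[same_index] == outlines).all())  # True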