From 3256f9c32d67d0abf0efe4fd53a41b3feef51e07 Mon Sep 17 00:00:00 2001
From: kena vyas
Date: Wed, 14 May 2025 15:13:06 +0100
Subject: [PATCH 01/17] generate provision quality dataset

---
 digital_land/utils/functions_core.py         |  73 ++++++++
 .../utils/generate_provision_quality.py      | 175 ++++++++++++++++++
 2 files changed, 248 insertions(+)
 create mode 100644 digital_land/utils/functions_core.py
 create mode 100644 digital_land/utils/generate_provision_quality.py

diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py
new file mode 100644
index 000000000..887fa5c09
--- /dev/null
+++ b/digital_land/utils/functions_core.py
@@ -0,0 +1,73 @@
+import urllib
+import os
+import sqlite3
+import pandas as pd
+import geopandas as gpd
+import shapely.wkt
+
+
+global FILES_URL
+
+FILES_URL = "https://datasette.planning.data.gov.uk/"
+
+
+def download_dataset(dataset, output_dir_path, overwrite=False):
+    dataset_file_name = f"{dataset}.db"
+
+    if not os.path.exists(output_dir_path):
+        os.makedirs(output_dir_path)
+
+    output_file_path = os.path.join(output_dir_path, dataset_file_name)
+
+    if overwrite is False and os.path.exists(output_file_path):
+        return
+
+    final_url = os.path.join(FILES_URL, dataset_file_name)
+    print(f"downloading data from {final_url}")
+    print(f"to: {output_file_path}")
+    urllib.request.urlretrieve(
+        final_url, os.path.join(output_dir_path, dataset_file_name)
+    )
+    print("download complete")
+
+
+def get_pdp_dataset(
+    dataset, geometry_field="geometry", crs_out=4326, underscore_cols=True
+):
+
+    df = pd.read_csv(
+        f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype="str"
+    )
+    df.columns = [x.replace("-", "_") for x in df.columns]
+
+    df_valid_geom = df[df[geometry_field].notnull()].copy()
+
+    # load geometry and create GDF
+    df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply(
+        shapely.wkt.loads
+    )
+    gdf = gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field)
+
+    # Transform to crs_out (EPSG:27700 gives more interpretable area units)
+    gdf.set_crs(epsg=4326, inplace=True)
+    gdf.to_crs(epsg=crs_out, inplace=True)
+
+    return gdf
+
+
+def query_sqlite(db_path, query_string):
+
+    with sqlite3.connect(db_path) as con:
+
+        cursor = con.execute(query_string)
+        cols = [column[0] for column in cursor.description]
+        results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
+
+    return results_df
+
+
+def datasette_query(db, sql_string):
+    params = urllib.parse.urlencode({"sql": sql_string, "_size": "max"})
+    url = f"https://datasette.planning.data.gov.uk/{db}.csv?{params}"
+    df = pd.read_csv(url)
+    return df
diff --git a/digital_land/utils/generate_provision_quality.py b/digital_land/utils/generate_provision_quality.py
new file mode 100644
index 000000000..7ea8f6f73
--- /dev/null
+++ b/digital_land/utils/generate_provision_quality.py
@@ -0,0 +1,175 @@
+import os
+import pandas as pd
+import numpy as np
+import json
+from datetime import datetime
+from digital_land.utils import functions_core as fc
+
+
+def generate_provision_quality():
+    """
+    Generates a provision quality dataset and saves it as a parquet file.
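+
+    A sketch of the flow, as implemented below: download the performance
+    database, count data issues and failed expectations against each
+    organisation/dataset provision, roll these up to a 0-4 quality level,
+    and write the result to a dated parquet partition.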
+    """
+    td = datetime.today().strftime("%Y-%m-%d")
+
+    # Create the temporary download directory
+    db_dir = os.path.join("/tmp", "db_downloads")
+    os.makedirs(db_dir, exist_ok=True)
+
+    # Download the performance db
+    fc.download_dataset("performance", db_dir, overwrite=False)
+    path_perf_db = os.path.join(db_dir, "performance.db")
+
+    # Issue quality criteria lookup
+    lookup_issue_qual = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT
+        description,
+        issue_type,
+        name,
+        severity,
+        responsibility,
+        quality_criteria_level || " - " || quality_criteria as quality_criteria,
+        quality_criteria_level as quality_level
+        FROM issue_type
+        WHERE quality_criteria_level != ''
+        AND quality_criteria != ''
+        """,
+    )
+
+    # Provision summary: active endpoint counts per organisation and dataset
+    provision = fc.query_sqlite(
+        path_perf_db,
+        """
+        SELECT organisation, dataset, active_endpoint_count
+        FROM provision_summary
+        """,
+    )
+
+    # Extract issue count by provision from endpoint_dataset_issue_type_summary
+    qual_issue = fc.query_sqlite(
+        path_perf_db,
+        """
+        SELECT
+        organisation, dataset,
+        'issue' as problem_source,
+        issue_type as problem_type,
+        sum(count_issues) as count
+        FROM endpoint_dataset_issue_type_summary
+        WHERE resource_end_date is not NULL
+        AND issue_type is not NULL
+        GROUP BY organisation, dataset, issue_type
+        """,
+    )
+
+    # Join on quality criteria and level from the issue_type lookup (this restricts the set to issues linked to a quality criterion)
+    qual_issue = qual_issue.merge(
+        lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]],
+        how="inner",
+        left_on="problem_type",
+        right_on="issue_type",
+    )
+    qual_issue.drop("issue_type", axis=1, inplace=True)
+
+    # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
+    qual_expectation_bounds = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT organisation, dataset, details
+        FROM expectation
+        WHERE 1=1
+        AND name = 'Check no entities are outside of the local planning authority boundary'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """,
+    )
+
+    qual_expectation_bounds["problem_source"] = "expectation"
+    qual_expectation_bounds["problem_type"] = (
+        "entity outside of the local planning authority boundary"
+    )
+    qual_expectation_bounds["count"] = [
+        json.loads(v)["actual"] for v in qual_expectation_bounds["details"]
+    ]
+    qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary"
+    qual_expectation_bounds["quality_level"] = 3
+    qual_expectation_bounds.drop("details", axis=1, inplace=True)
+
+    # IDENTIFY PROBLEMS - expectations - entity count doesn't match manual count
+    qual_expectation_count = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT organisation, dataset, details
+        FROM expectation
+        WHERE 1=1
+        AND name = 'Check number of entities inside the local planning authority boundary matches the manual count'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """,
+    )
+
+    qual_expectation_count["problem_source"] = "expectation"
+    qual_expectation_count["problem_type"] = "entity count doesn't match manual count"
+    qual_expectation_count["count"] = [
+        json.loads(v)["actual"] for v in qual_expectation_count["details"]
+    ]
+    qual_expectation_count["quality_criteria"] = (
+        "3 - conservation area entity count matches LPA"
+    )
+    qual_expectation_count["quality_level"] = 3
+    qual_expectation_count.drop("details", axis=1, inplace=True)
+
+    # Combine all problem source tables, and aggregate to criteria level
+    qual_all_criteria = (
+        pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count])
+        .groupby(
["organisation", "dataset", "quality_criteria", "quality_level"], + as_index=False, + ) + .agg(count_failures=("count", "sum")) + ) + + # Merge issues with the provision data + prov_qual_all = provision.merge( + qual_all_criteria, how="left", on=["organisation", "dataset"] + ) + + prov_qual_all["quality_level_for_sort"] = np.select( + [ + (prov_qual_all["active_endpoint_count"] == 0), + (prov_qual_all["quality_level"].notnull()), + (prov_qual_all["active_endpoint_count"] > 0) + & (prov_qual_all["quality_level"].isnull()), + ], + [0, prov_qual_all["quality_level"], 4], + ) + + level_map = { + 4: "4. data that is trustworthy", + 3: "3. data that is good for ODP", + 2: "2. authoritative data from the LPA", + 1: "1. some data", + 0: "0. no score", + } + + prov_quality = prov_qual_all.groupby( + ["organisation", "dataset"], as_index=False, dropna=False + ).agg(quality_level=("quality_level_for_sort", "min")) + + prov_quality["quality"] = prov_quality["quality_level"].map(level_map) + prov_quality["notes"] = "" + prov_quality["end-date"] = "" + prov_quality["start-date"] = td + prov_quality["entry-date"] = td + + # Output the results as a Parquet file + output_dir = os.path.join( + "/tmp", "performance", "provision-quality", f"entry-date={td}" + ) + os.makedirs(output_dir, exist_ok=True) + + output_file = os.path.join(output_dir, "provision-quality.parquet") + prov_quality.to_parquet(output_file, engine="pyarrow", index=False) + + print(f"Provision quality dataset saved to: {output_file}") From a6e85a5d688ae8fa96ff11a82bf82b643a15aa20 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Thu, 15 May 2025 11:57:29 +0100 Subject: [PATCH 02/17] generate-provision-quality cli command --- digital_land/cli.py | 6 + digital_land/commands.py | 167 +++++++++++++++++ digital_land/utils/functions_core.py | 21 +-- .../utils/generate_provision_quality.py | 175 ------------------ setup.py | 1 + 5 files changed, 183 insertions(+), 187 deletions(-) delete mode 100644 digital_land/utils/generate_provision_quality.py diff --git a/digital_land/cli.py b/digital_land/cli.py index b65aef331..a99a950ea 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -31,6 +31,7 @@ organisation_check, save_state, add_data, + generate_provision_quality, ) from digital_land.command_arguments import ( @@ -825,3 +826,8 @@ def check_state_cmd( if diffs: print(f"State differs from {state_path} - {', '.join(diffs)}") sys.exit(1) + + +@cli.command("generate-provision-quality") +def generate_provision_quality_cmd(): + generate_provision_quality() diff --git a/digital_land/commands.py b/digital_land/commands.py index ad16e6edb..02e3a95e9 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -14,6 +14,7 @@ import geojson from requests import HTTPError import shapely +import numpy as np from digital_land.package.organisation import OrganisationPackage from digital_land.check import duplicate_reference_check @@ -73,6 +74,7 @@ is_url_valid, get_user_response, ) +from digital_land.utils import functions_core as fc from .register import hash_value from .utils.gdal_utils import get_gdal_version @@ -1665,3 +1667,168 @@ def check_and_assign_entities( ): return False return True + + +def generate_provision_quality(): + """Generates a provision quality dataset and saves it as a parquet file""" + td = datetime.today().strftime("%Y-%m-%d") + + # Create the temporary download directory + db_dir = Path("/tmp") / "db_downloads" + os.makedirs(db_dir, exist_ok=True) + + # Download the performance db + fc.download_dataset("performance", 
db_dir, overwrite=False)
+    path_perf_db = db_dir / "performance.db"
+
+    # Issue quality criteria lookup
+    lookup_issue_qual = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT
+        description,
+        issue_type,
+        name,
+        severity,
+        responsibility,
+        quality_criteria_level || " - " || quality_criteria as quality_criteria,
+        quality_criteria_level as quality_level
+        FROM issue_type
+        WHERE quality_criteria_level != ''
+        AND quality_criteria != ''
+        """,
+    )
+
+    # Provision summary: active endpoint counts per organisation and dataset
+    provision = fc.query_sqlite(
+        path_perf_db,
+        """
+        SELECT organisation, dataset, active_endpoint_count
+        FROM provision_summary
+        """,
+    )
+
+    # Extract issue count by provision from endpoint_dataset_issue_type_summary
+    qual_issue = fc.query_sqlite(
+        path_perf_db,
+        """
+        SELECT
+        organisation, dataset,
+        'issue' as problem_source,
+        issue_type as problem_type,
+        sum(count_issues) as count
+        FROM endpoint_dataset_issue_type_summary
+        WHERE resource_end_date is not NULL
+        AND issue_type is not NULL
+        GROUP BY organisation, dataset, issue_type
+        """,
+    )
+
+    # Join on quality criteria and level from the issue_type lookup (this restricts the set to issues linked to a quality criterion)
+    qual_issue = qual_issue.merge(
+        lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]],
+        how="inner",
+        left_on="problem_type",
+        right_on="issue_type",
+    )
+    qual_issue.drop("issue_type", axis=1, inplace=True)
+
+    # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
+    qual_expectation_bounds = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT organisation, dataset, details
+        FROM expectation
+        WHERE 1=1
+        AND name = 'Check no entities are outside of the local planning authority boundary'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """,
+    )
+
+    qual_expectation_bounds["problem_source"] = "expectation"
+    qual_expectation_bounds["problem_type"] = (
+        "entity outside of the local planning authority boundary"
+    )
+    qual_expectation_bounds["count"] = [
+        json.loads(v)["actual"] for v in qual_expectation_bounds["details"]
+    ]
+    qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary"
+    qual_expectation_bounds["quality_level"] = 3
+    qual_expectation_bounds.drop("details", axis=1, inplace=True)
+
+    # IDENTIFY PROBLEMS - expectations - entity count doesn't match manual count
+    qual_expectation_count = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT organisation, dataset, details
+        FROM expectation
+        WHERE 1=1
+        AND name = 'Check number of entities inside the local planning authority boundary matches the manual count'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """,
+    )
+
+    qual_expectation_count["problem_source"] = "expectation"
+    qual_expectation_count["problem_type"] = "entity count doesn't match manual count"
+    qual_expectation_count["count"] = [
+        json.loads(v)["actual"] for v in qual_expectation_count["details"]
+    ]
+    qual_expectation_count["quality_criteria"] = (
+        "3 - conservation area entity count matches LPA"
+    )
+    qual_expectation_count["quality_level"] = 3
+    qual_expectation_count.drop("details", axis=1, inplace=True)
+
+    # Combine all problem source tables, and aggregate to criteria level
+    qual_all_criteria = (
+        pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count])
+        .groupby(
+            ["organisation", "dataset", "quality_criteria", "quality_level"],
+            as_index=False,
+        )
+        .agg(count_failures=("count", "sum"))
+    )
+
+    # Merge issues with the provision data
+    prov_qual_all = provision.merge(
+        qual_all_criteria, how="left", on=["organisation", 
"dataset"] + ) + + prov_qual_all["quality_level_for_sort"] = np.select( + [ + (prov_qual_all["active_endpoint_count"] == 0), + (prov_qual_all["quality_level"].notnull()), + (prov_qual_all["active_endpoint_count"] > 0) + & (prov_qual_all["quality_level"].isnull()), + ], + [0, prov_qual_all["quality_level"], 4], + ) + + level_map = { + 4: "4. data that is trustworthy", + 3: "3. data that is good for ODP", + 2: "2. authoritative data from the LPA", + 1: "1. some data", + 0: "0. no score", + } + + prov_quality = prov_qual_all.groupby( + ["organisation", "dataset"], as_index=False, dropna=False + ).agg(quality_level=("quality_level_for_sort", "min")) + + prov_quality["quality"] = prov_quality["quality_level"].map(level_map) + prov_quality["notes"] = "" + prov_quality["end-date"] = "" + prov_quality["start-date"] = td + prov_quality["entry-date"] = td + + # Output the results as a Parquet file + output_dir = Path("/tmp") / "performance" / "provision-quality" / f"entry-date={td}" + os.makedirs(output_dir, exist_ok=True) + + output_file = output_dir / "provision-quality.parquet" + prov_quality.to_parquet(output_file, engine="pyarrow", index=False) + + print(f"Provision quality dataset saved to: {output_file}") diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py index 887fa5c09..b891ef405 100644 --- a/digital_land/utils/functions_core.py +++ b/digital_land/utils/functions_core.py @@ -1,10 +1,9 @@ import urllib -import os import sqlite3 import pandas as pd import geopandas as gpd import shapely.wkt - +from pathlib import Path global FILES_URL @@ -12,22 +11,20 @@ def download_dataset(dataset, output_dir_path, overwrite=False): - dataset_file_name = f"{dataset}.db" - - if not os.path.exists(output_dir_path): - os.makedirs(output_dir_path) + output_dir = Path(output_dir_path) + output_dir.mkdir(parents=True, exist_ok=True) - output_file_path = os.path.join(output_dir_path, dataset_file_name) + dataset_file_name = f"{dataset}.db" + output_file_path = output_dir / dataset_file_name - if overwrite is False and os.path.exists(output_file_path): + if not overwrite and output_file_path.exists(): return - final_url = os.path.join(FILES_URL, dataset_file_name) + final_url = f"{FILES_URL}{dataset_file_name}" print(f"downloading data from {final_url}") print(f"to: {output_file_path}") - urllib.request.urlretrieve( - final_url, os.path.join(output_dir_path, dataset_file_name) - ) + + urllib.request.urlretrieve(final_url, output_file_path) print("download complete") diff --git a/digital_land/utils/generate_provision_quality.py b/digital_land/utils/generate_provision_quality.py deleted file mode 100644 index 7ea8f6f73..000000000 --- a/digital_land/utils/generate_provision_quality.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import pandas as pd -import numpy as np -import json -from datetime import datetime -from digital_land.utils import functions_core as fc - - -def generate_provision_quality(): - """ - Generates a provision quality dataset and saves it as a parquet file. 
-    """
-    td = datetime.today().strftime("%Y-%m-%d")
-
-    # Create the temporary download directory
-    db_dir = os.path.join("/tmp", "db_downloads")
-    os.makedirs(db_dir, exist_ok=True)
-
-    # Download the performance db
-    fc.download_dataset("performance", db_dir, overwrite=False)
-    path_perf_db = os.path.join(db_dir, "performance.db")
-
-    # Issue quality criteria lookup
-    lookup_issue_qual = fc.datasette_query(
-        "digital-land",
-        """
-        SELECT
-        description,
-        issue_type,
-        name,
-        severity,
-        responsibility,
-        quality_criteria_level || " - " || quality_criteria as quality_criteria,
-        quality_criteria_level as quality_level
-        FROM issue_type
-        WHERE quality_criteria_level != ''
-        AND quality_criteria != ''
-        """,
-    )
-
-    # Provision summary: active endpoint counts per organisation and dataset
-    provision = fc.query_sqlite(
-        path_perf_db,
-        """
-        SELECT organisation, dataset, active_endpoint_count
-        FROM provision_summary
-        """,
-    )
-
-    # Extract issue count by provision from endpoint_dataset_issue_type_summary
-    qual_issue = fc.query_sqlite(
-        path_perf_db,
-        """
-        SELECT
-        organisation, dataset,
-        'issue' as problem_source,
-        issue_type as problem_type,
-        sum(count_issues) as count
-        FROM endpoint_dataset_issue_type_summary
-        WHERE resource_end_date is not NULL
-        AND issue_type is not NULL
-        GROUP BY organisation, dataset, issue_type
-        """,
-    )
-
-    # Join on quality criteria and level from the issue_type lookup (this restricts the set to issues linked to a quality criterion)
-    qual_issue = qual_issue.merge(
-        lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]],
-        how="inner",
-        left_on="problem_type",
-        right_on="issue_type",
-    )
-    qual_issue.drop("issue_type", axis=1, inplace=True)
-
-    # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
-    qual_expectation_bounds = fc.datasette_query(
-        "digital-land",
-        """
-        SELECT organisation, dataset, details
-        FROM expectation
-        WHERE 1=1
-        AND name = 'Check no entities are outside of the local planning authority boundary'
-        AND passed = 'False'
-        AND message not like '%error%'
-        """,
-    )
-
-    qual_expectation_bounds["problem_source"] = "expectation"
-    qual_expectation_bounds["problem_type"] = (
-        "entity outside of the local planning authority boundary"
-    )
-    qual_expectation_bounds["count"] = [
-        json.loads(v)["actual"] for v in qual_expectation_bounds["details"]
-    ]
-    qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary"
-    qual_expectation_bounds["quality_level"] = 3
-    qual_expectation_bounds.drop("details", axis=1, inplace=True)
-
-    # IDENTIFY PROBLEMS - expectations - entity count doesn't match manual count
-    qual_expectation_count = fc.datasette_query(
-        "digital-land",
-        """
-        SELECT organisation, dataset, details
-        FROM expectation
-        WHERE 1=1
-        AND name = 'Check number of entities inside the local planning authority boundary matches the manual count'
-        AND passed = 'False'
-        AND message not like '%error%'
-        """,
-    )
-
-    qual_expectation_count["problem_source"] = "expectation"
-    qual_expectation_count["problem_type"] = "entity count doesn't match manual count"
-    qual_expectation_count["count"] = [
-        json.loads(v)["actual"] for v in qual_expectation_count["details"]
-    ]
-    qual_expectation_count["quality_criteria"] = (
-        "3 - conservation area entity count matches LPA"
-    )
-    qual_expectation_count["quality_level"] = 3
-    qual_expectation_count.drop("details", axis=1, inplace=True)
-
-    # Combine all problem source tables, and aggregate to criteria level
-    qual_all_criteria = (
-        pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count])
-        .groupby(
["organisation", "dataset", "quality_criteria", "quality_level"], - as_index=False, - ) - .agg(count_failures=("count", "sum")) - ) - - # Merge issues with the provision data - prov_qual_all = provision.merge( - qual_all_criteria, how="left", on=["organisation", "dataset"] - ) - - prov_qual_all["quality_level_for_sort"] = np.select( - [ - (prov_qual_all["active_endpoint_count"] == 0), - (prov_qual_all["quality_level"].notnull()), - (prov_qual_all["active_endpoint_count"] > 0) - & (prov_qual_all["quality_level"].isnull()), - ], - [0, prov_qual_all["quality_level"], 4], - ) - - level_map = { - 4: "4. data that is trustworthy", - 3: "3. data that is good for ODP", - 2: "2. authoritative data from the LPA", - 1: "1. some data", - 0: "0. no score", - } - - prov_quality = prov_qual_all.groupby( - ["organisation", "dataset"], as_index=False, dropna=False - ).agg(quality_level=("quality_level_for_sort", "min")) - - prov_quality["quality"] = prov_quality["quality_level"].map(level_map) - prov_quality["notes"] = "" - prov_quality["end-date"] = "" - prov_quality["start-date"] = td - prov_quality["entry-date"] = td - - # Output the results as a Parquet file - output_dir = os.path.join( - "/tmp", "performance", "provision-quality", f"entry-date={td}" - ) - os.makedirs(output_dir, exist_ok=True) - - output_file = os.path.join(output_dir, "provision-quality.parquet") - prov_quality.to_parquet(output_file, engine="pyarrow", index=False) - - print(f"Provision quality dataset saved to: {output_file}") diff --git a/setup.py b/setup.py index a051b7356..685cc8ac9 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,7 @@ def get_long_description(): "boto3", "moto", "psutil", + "geopandas", ], entry_points={"console_scripts": ["digital-land=digital_land.cli:cli"]}, setup_requires=["pytest-runner"], From 1975e667fe775031047acb1e6f0b1af6d1ef4a03 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Thu, 15 May 2025 13:42:58 +0100 Subject: [PATCH 03/17] update cli --- digital_land/cli.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/digital_land/cli.py b/digital_land/cli.py index a99a950ea..b65aef331 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -31,7 +31,6 @@ organisation_check, save_state, add_data, - generate_provision_quality, ) from digital_land.command_arguments import ( @@ -826,8 +825,3 @@ def check_state_cmd( if diffs: print(f"State differs from {state_path} - {', '.join(diffs)}") sys.exit(1) - - -@cli.command("generate-provision-quality") -def generate_provision_quality_cmd(): - generate_provision_quality() From 4cff1b8d96968fa07349dbb34dacba32967454b9 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Fri, 16 May 2025 12:32:42 +0100 Subject: [PATCH 04/17] utilise api to download performance.sqlite3 --- digital_land/api.py | 14 ++- digital_land/commands.py | 17 ++-- digital_land/utils/functions_core.py | 52 ----------- .../test_generate_provision_quality.py | 90 +++++++++++++++++++ tests/unit/test_functions_core_utils.py | 32 +++++++ 5 files changed, 145 insertions(+), 60 deletions(-) create mode 100644 tests/integration/test_generate_provision_quality.py create mode 100644 tests/unit/test_functions_core_utils.py diff --git a/digital_land/api.py b/digital_land/api.py index ef0262153..03a480e01 100644 --- a/digital_land/api.py +++ b/digital_land/api.py @@ -36,6 +36,8 @@ def download_dataset( overwrite: bool = False, path: str = None, extension: Extension = Extension.CSV, + builder: bool = False, + builder_name: str = None, ): """ Downloads a dataset in CSV or SQLite3 format. 
@@ -43,6 +45,8 @@ def download_dataset( - overwrite: overwrite file is it already exists (otherwise will just return). - path: file to download to (otherwise /dataset/.). - extension: 'csv' or 'sqlite3', 'csv' by default. + - builder: downloads the dataset from the builder path + - builder_name: name to use for accessing the builder path - Returns: None. The file will be downloaded to the given path or cache, unless an exception occurs. @@ -56,8 +60,14 @@ def download_dataset( # different extensions require different urls and reading modes if extension == self.Extension.SQLITE3: - collection = self.specification.dataset[dataset]["collection"] - url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3" + # performance.sqlite requires digital-land-builder path + if builder: + if not builder_name: + raise ValueError("Builder name must be provided when builder=True") + url = f"{self.url}/{builder_name}-builder/dataset/{dataset}.sqlite3" + else: + collection = self.specification.dataset[dataset]["collection"] + url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3" mode = "wb" def get_content(response): diff --git a/digital_land/commands.py b/digital_land/commands.py index 02e3a95e9..7eb7db3c8 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1673,13 +1673,18 @@ def generate_provision_quality(): """Generates a provision quality dataset and saves it as a parquet file""" td = datetime.today().strftime("%Y-%m-%d") - # Create the temporary download directory - db_dir = Path("/tmp") / "db_downloads" - os.makedirs(db_dir, exist_ok=True) + specification = Specification("specification/") + api = API(specification) + + # Download the performance db using api + api.download_dataset( + "performance", + extension=api.Extension.SQLITE3, + builder=True, + builder_name="digital-land", + ) - # Download the performance db - fc.download_dataset("performance", db_dir, overwrite=False) - path_perf_db = db_dir / "performance.db" + path_perf_db = Path(api.cache_dir) / "dataset" / "performance.sqlite3" # Issue quality criteria lookup lookup_issue_qual = fc.datasette_query( diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py index b891ef405..c3a62836f 100644 --- a/digital_land/utils/functions_core.py +++ b/digital_land/utils/functions_core.py @@ -1,65 +1,13 @@ import urllib import sqlite3 import pandas as pd -import geopandas as gpd -import shapely.wkt -from pathlib import Path - -global FILES_URL - -FILES_URL = "https://datasette.planning.data.gov.uk/" - - -def download_dataset(dataset, output_dir_path, overwrite=False): - output_dir = Path(output_dir_path) - output_dir.mkdir(parents=True, exist_ok=True) - - dataset_file_name = f"{dataset}.db" - output_file_path = output_dir / dataset_file_name - - if not overwrite and output_file_path.exists(): - return - - final_url = f"{FILES_URL}{dataset_file_name}" - print(f"downloading data from {final_url}") - print(f"to: {output_file_path}") - - urllib.request.urlretrieve(final_url, output_file_path) - print("download complete") - - -def get_pdp_dataset( - dataset, geometry_field="geometry", crs_out=4326, underscore_cols=True -): - - df = pd.read_csv( - f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype="str" - ) - df.columns = [x.replace("-", "_") for x in df.columns] - - df_valid_geom = df[df[geometry_field].notnull()].copy() - - # load geometry and create GDF - df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply( - shapely.wkt.loads - ) - gdf = 
gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field)
-
-    # Transform to crs_out (EPSG:27700 gives more interpretable area units)
-    gdf.set_crs(epsg=4326, inplace=True)
-    gdf.to_crs(epsg=crs_out, inplace=True)
-
-    return gdf


 def query_sqlite(db_path, query_string):
-
     with sqlite3.connect(db_path) as con:
-
         cursor = con.execute(query_string)
         cols = [column[0] for column in cursor.description]
         results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
-
     return results_df
diff --git a/tests/integration/test_generate_provision_quality.py b/tests/integration/test_generate_provision_quality.py
new file mode 100644
index 000000000..33dbf202b
--- /dev/null
+++ b/tests/integration/test_generate_provision_quality.py
@@ -0,0 +1,90 @@
+import pandas as pd
+from unittest.mock import patch
+from pathlib import Path
+from datetime import datetime
+from digital_land.commands import generate_provision_quality
+
+
+@patch("digital_land.commands.fc.datasette_query")
+@patch("digital_land.commands.fc.query_sqlite")
+def test_generate_provision_quality(
+    mock_query_sqlite,
+    mock_datasette_query,
+):
+    # mock issue_type
+    mock_datasette_query.side_effect = [
+        pd.DataFrame(
+            [
+                {
+                    "description": "desc",
+                    "issue_type": "missing-value",
+                    "name": "Missing Value",
+                    "severity": "error",
+                    "responsibility": "external",
+                    "quality_criteria": "any other validity error",
+                    "quality_level": 3,
+                }
+            ]
+        ),
+        # mock LPA boundary check
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "details": '{"actual": 2}',
+                }
+            ]
+        ),
+        # mock count value
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "details": '{"actual": 1}',
+                }
+            ]
+        ),
+    ]
+
+    # mock sqlite queries
+    mock_query_sqlite.side_effect = [
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "active_endpoint_count": 5,
+                }
+            ]
+        ),
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "problem_source": "issue",
+                    "problem_type": "missing-value",
+                    "count": 1,
+                }
+            ]
+        ),
+    ]
+
+    generate_provision_quality()
+
+    td = datetime.today().strftime("%Y-%m-%d")
+    output_file = Path(
+        f"/tmp/performance/provision-quality/entry-date={td}/provision-quality.parquet"
+    )
+    assert output_file.exists(), "Parquet file not found"
+
+    df = pd.read_parquet(output_file)
+    assert "organisation" in df.columns
+    assert "dataset" in df.columns
+    assert "quality" in df.columns
+
+    assert not df.empty, "Dataframe loaded from Parquet is empty"
+    assert len(df) == 1
+    assert df.iloc[0]["organisation"] == "org1"
diff --git a/tests/unit/test_functions_core_utils.py b/tests/unit/test_functions_core_utils.py
new file mode 100644
index 000000000..1df56b006
--- /dev/null
+++ b/tests/unit/test_functions_core_utils.py
@@ -0,0 +1,32 @@
+import pandas as pd
+from unittest.mock import patch, Mock
+from digital_land.utils.functions_core import datasette_query, query_sqlite
+
+
+@patch("digital_land.utils.functions_core.sqlite3.connect")
+def test_query_sqlite(mock_connect):
+    mock_data = Mock()
+    mock_data.description = [("organisation",), ("dataset",)]
+    mock_data.fetchall.return_value = [("org1", "dataset1"), ("org2", "dataset2")]
+
+    mock_con = Mock()
+    mock_con.execute.return_value = mock_data
+    mock_connect.return_value.__enter__.return_value = mock_con
+
+    df = query_sqlite("db_path", "SELECT * FROM table")
+
+    assert isinstance(df, pd.DataFrame)
+    assert list(df.columns) == ["organisation", "dataset"]
+    assert len(df) == 2
+    assert df.iloc[0]["organisation"] == "org1"
+
+
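+# datasette_query builds a datasette CSV-export URL with the SQL statement in
+# the query string and reads it with pandas, so patching read_csv keeps the
+# test offline; the returned frame is whatever read_csv produced.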
+@patch("digital_land.utils.functions_core.pd.read_csv") +def test_datasette_query(mock_read_csv): + df_mock = pd.DataFrame({"organisation": ["org1", "org2"]}) + mock_read_csv.return_value = df_mock + + df = datasette_query("db", "SELECT organisation FROM table") + assert isinstance(df, pd.DataFrame) + assert "organisation" in df.columns + assert df.equals(df_mock) From 982fd3fbd1a5260f09086bc6a2836e02ee189997 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Fri, 16 May 2025 14:34:29 +0100 Subject: [PATCH 05/17] make specification parameter optional --- digital_land/api.py | 4 +++- digital_land/commands.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/digital_land/api.py b/digital_land/api.py index 03a480e01..e99ced5bc 100644 --- a/digital_land/api.py +++ b/digital_land/api.py @@ -14,7 +14,7 @@ class API: def __init__( self, - specification: Specification, + specification: Specification = None, url: str = DEFAULT_URL, cache_dir: str = "var/cache", ): @@ -66,6 +66,8 @@ def download_dataset( raise ValueError("Builder name must be provided when builder=True") url = f"{self.url}/{builder_name}-builder/dataset/{dataset}.sqlite3" else: + if self.specification is None: + raise ValueError("Specification must be provided") collection = self.specification.dataset[dataset]["collection"] url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3" mode = "wb" diff --git a/digital_land/commands.py b/digital_land/commands.py index 7eb7db3c8..36e38c774 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1673,8 +1673,7 @@ def generate_provision_quality(): """Generates a provision quality dataset and saves it as a parquet file""" td = datetime.today().strftime("%Y-%m-%d") - specification = Specification("specification/") - api = API(specification) + api = API() # Download the performance db using api api.download_dataset( From 3ce99b625c3900da6689772fd448ab5f99665afb Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Mon, 19 May 2025 09:42:33 +0100 Subject: [PATCH 06/17] Feat/combine config text (#405) * Only run duplicate reference check when needed --- digital_land/commands.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index ad16e6edb..46fe18295 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -376,7 +376,11 @@ def pipeline_run( ), ) - issue_log = duplicate_reference_check(issues=issue_log, csv_path=output_path) + # In the FactCombinePhase, when combine_fields has some values, we check for duplicates and combine values. 
+ # If we have done this then we will not call duplicate_reference_check as we have already carried out a + # duplicate check and stop messages appearing in issues about reference values not being unique + if combine_fields == {}: + issue_log = duplicate_reference_check(issues=issue_log, csv_path=output_path) issue_log.apply_entity_map() issue_log.save(os.path.join(issue_dir, resource + ".csv")) From 5338f5d46fa9b7e867ecfadd2f51f40ba22cbe56 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Tue, 20 May 2025 11:01:15 +0100 Subject: [PATCH 07/17] New gml.csv file for updated GDAL (#407) --- tests/data/resource_examples/gml.csv | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/data/resource_examples/gml.csv b/tests/data/resource_examples/gml.csv index a7c36099e..7a134e4ee 100644 --- a/tests/data/resource_examples/gml.csv +++ b/tests/data/resource_examples/gml.csv @@ -1,11 +1,11 @@ -WKT,gml_id,Name,Ref -"MULTIPOLYGON (((-2.632272212 53.34206401,-2.632000357 53.34224581,-2.630564119 53.3422481,-2.631009551 53.34186124,-2.631269515 53.34167052,-2.631587148 53.34143591,-2.631736085 53.34132451,-2.630234541 53.34106187,-2.630381899 53.34083419,-2.630430719 53.34083671,-2.630473829 53.3408045,-2.630570696 53.34078062,-2.63062419 53.34078034,-2.630642956 53.3407513,-2.630594227 53.34072236,-2.630564563 53.34068459,-2.630568851 53.34065563,-2.63063132 53.34056742,-2.630716303 53.34044359,-2.630861659 53.34027486,-2.630946484 53.34019947,-2.631384962 53.34029905,-2.63171138 53.3398375,-2.632213184 53.33997422,-2.632657969 53.34008985,-2.632914151 53.33957816,-2.632885686 53.33957175,-2.63293775 53.33946589,-2.632883467 53.3394554,-2.632917069 53.33939897,-2.63292785 53.33935829,-2.632925513 53.33917516,-2.63294476 53.33914532,-2.633046457 53.33865511,-2.633644084 53.33866515,-2.633816397 53.33841963,-2.634026323 53.33798843,-2.634336105 53.33736035,-2.634500904 53.33711002,-2.634540774 53.33710981,-2.634559936 53.33707439,-2.634913572 53.33709021,-2.635470784 53.33712481,-2.636015216 53.33662362,-2.637044871 53.33681168,-2.636935924 53.33706199,-2.636748823 53.33748571,-2.636502934 53.33801909,-2.636344157 53.33835863,-2.636267224 53.33851333,-2.635995827 53.33894819,-2.635870308 53.3391595,-2.635738612 53.33938773,-2.63565651 53.33952134,-2.635579679 53.33962563,-2.635375517 53.3399026,-2.6351317 53.34017833,-2.634725706 53.34052297,-2.634310927 53.34081048,-2.63398409 53.34102574,-2.633800358 53.34113598,-2.633595926 53.34124607,-2.633165035 53.34146691,-2.633066806 53.34151541,-2.632769196 53.34173949,-2.632272212 53.34206401)))",conservation_areas_polygon.1,Daresbury,1 -"MULTIPOLYGON (((-2.636788444 53.35518027,-2.636565647 53.35505107,-2.636258261 53.354965,-2.635477454 53.35471351,-2.635008016 53.35456728,-2.634656133 53.35444461,-2.634469454 53.35431979,-2.634403116 53.35440731,-2.63425788 53.35474516,-2.634113641 53.35494191,-2.633950352 53.35528255,-2.632575891 53.35490245,-2.632233555 53.35520719,-2.632071694 53.35536926,-2.63184661 53.35557804,-2.631608214 53.35579209,-2.630836185 53.35638612,-2.630249779 53.35685884,-2.629762455 53.3572382,-2.629696966 53.35721609,-2.629671084 53.35719106,-2.629914256 53.35673741,-2.630007817 53.35658883,-2.62982389 53.35654846,-2.629577281 53.35644192,-2.629684466 53.35636848,-2.629877509 53.35621111,-2.63027752 53.35582609,-2.630229262 53.35581286,-2.629983626 53.35574659,-2.629927443 53.35573394,-2.62980002 53.35571143,-2.629189378 53.355599,-2.629323648 53.35532862,-2.629638798 53.35540513,-2.629853419 
53.35509029,-2.630029088 53.35487279,-2.630113139 53.35489455,-2.630322542 53.35460427,-2.630274311 53.35459284,-2.630385887 53.35441423,-2.630503172 53.35432016,-2.630238926 53.35423529,-2.630147221 53.35412335,-2.630084971 53.35397631,-2.630229717 53.35391264,-2.630356307 53.35383729,-2.630576983 53.3537238,-2.630792318 53.35365976,-2.630942067 53.35362572,-2.631139727 53.35358055,-2.631196349 53.3535479,-2.631225404 53.35347766,-2.631282835 53.35339917,-2.631387062 53.35333571,-2.631354942 53.35319956,-2.631342389 53.35306178,-2.631411528 53.35286273,-2.63121451 53.35284131,-2.63134322 53.35250625,-2.632175723 53.35262316,-2.632232647 53.35251052,-2.632445365 53.35226219,-2.632692377 53.35200468,-2.633013939 53.3521136,-2.635172302 53.3526864,-2.635545184 53.35280573,-2.635979617 53.35238314,-2.636600521 53.35179005,-2.637117499 53.35125666,-2.637200141 53.35124166,-2.637489241 53.35149829,-2.637706892 53.35169042,-2.637759235 53.35176203,-2.637890637 53.35191858,-2.637933025 53.35194532,-2.637997827 53.35195486,-2.638008 53.35203038,-2.638046643 53.3522036,-2.63810637 53.35238822,-2.638371036 53.3530064,-2.638376316 53.3530531,-2.638432739 53.35321456,-2.638441974 53.35332782,-2.638406463 53.3534628,-2.638263964 53.35368292,-2.63826113 53.35372553,-2.638780194 53.35384461,-2.638675041 53.353945,-2.638514619 53.35405163,-2.637348667 53.3548279,-2.636788444 53.35518027)))",conservation_areas_polygon.2,Moore,2 -"MULTIPOLYGON (((-2.818057261 53.33653314,-2.818072201 53.33662793,-2.818048504 53.33670959,-2.818016747 53.33681657,-2.818011453 53.3369213,-2.817737254 53.3368805,-2.81740636 53.33681357,-2.817138523 53.33674881,-2.816795325 53.33665331,-2.816509739 53.33657285,-2.816313127 53.33654149,-2.816037071 53.33651957,-2.815695268 53.3365166,-2.815706931 53.3365832,-2.815623603 53.33708233,-2.815504485 53.33706293,-2.815359707 53.33704855,-2.814933572 53.3370006,-2.814830927 53.33698108,-2.814745702 53.33696621,-2.814819917 53.33655332,-2.814533705 53.33653748,-2.814024696 53.33647392,-2.813709761 53.3364323,-2.813043392 53.33634321,-2.812550771 53.33628195,-2.812208893 53.33626227,-2.810991874 53.336185,-2.810380068 53.3361462,-2.810200041 53.33613305,-2.809900268 53.33611378,-2.809097205 53.33608328,-2.809040959 53.33628451,-2.808997658 53.3365286,-2.80864196 53.33651826,-2.808642644 53.33655437,-2.808291097 53.33654041,-2.807848924 53.33658636,-2.807358971 53.33663272,-2.806916306 53.33667894,-2.806211092 53.33676233,-2.805577301 53.33682422,-2.805276572 53.33675436,-2.805194226 53.33681953,-2.804935377 53.33707946,-2.804730496 53.33701623,-2.804929974 53.33679265,-2.805105693 53.33657632,-2.805498734 53.33646602,-2.805701722 53.33642897,-2.805692708 53.33553112,-2.805850845 53.33524932,-2.806388786 53.33538479,-2.806593808 53.33545502,-2.806980072 53.33560293,-2.807896916 53.33594818,-2.807856331 53.33570709,-2.807852626 53.33551112,-2.807866672 53.33541604,-2.807658952 53.33530089,-2.807403283 53.33517699,-2.807113138 53.33505009,-2.807877939 53.33494436,-2.808524394 53.33485263,-2.808904452 53.33475111,-2.808891381 53.33470797,-2.809165076 53.33461311,-2.809675582 53.33435857,-2.809973991 53.33430649,-2.810225419 53.33430478,-2.810485018 53.33430868,-2.810477659 53.33433875,-2.81064544 53.33435198,-2.810873522 53.33437901,-2.811078266 53.33443521,-2.811330918 53.33449084,-2.811523901 53.33454685,-2.811692226 53.33458866,-2.811848647 53.33462354,-2.812044833 53.33464575,-2.812156644 53.33463987,-2.812363002 53.33459838,-2.812546442 53.33454968,-2.812729951 53.33446594,-2.812743027 
53.33442388,-2.812825443 53.33433714,-2.812968369 53.33430049,-2.813718721 53.33408015,-2.814112523 53.33400584,-2.81491254 53.33387844,-2.815307393 53.33383979,-2.815929168 53.33379259,-2.816312301 53.33377568,-2.816700177 53.33389066,-2.816768075 53.3338085,-2.816839463 53.33378636,-2.816922881 53.33377851,-2.816995457 53.33379257,-2.817066814 53.33384663,-2.817121283 53.33392371,-2.817093601 53.33399723,-2.817034104 53.33402631,-2.81698645 53.33403392,-2.816962441 53.33403408,-2.817011177 53.33407679,-2.817035613 53.33410529,-2.816954309 53.3341727,-2.816884428 53.33427374,-2.816778237 53.33443766,-2.816690504 53.33478405,-2.816552321 53.33538637,-2.816597831 53.33555886,-2.81663733 53.33575944,-2.816713246 53.33597405,-2.816750638 53.3360384,-2.816836199 53.33611645,-2.816992823 53.33618736,-2.817528781 53.3363839,-2.818057261 53.33653314)))",conservation_areas_polygon.3,Hale Road,3 -"MULTIPOLYGON (((-2.800712286 53.33601316,-2.800577236 53.33614724,-2.799749745 53.33588959,-2.799243757 53.33570813,-2.798774901 53.33600899,-2.798493827 53.33593565,-2.798361589 53.33591596,-2.797758862 53.33575932,-2.797173054 53.33559896,-2.796880148 53.3355222,-2.796489482 53.33540853,-2.796420519 53.33538176,-2.796076406 53.33530174,-2.796001131 53.33526827,-2.795915513 53.33525842,-2.7955957 53.33519289,-2.79515818 53.33510953,-2.795402822 53.33474216,-2.795725393 53.3342712,-2.79503064 53.33421041,-2.795071289 53.33407309,-2.794656544 53.33402832,-2.794810306 53.3334883,-2.794191919 53.33337128,-2.794526347 53.33308562,-2.794547885 53.33269898,-2.794534379 53.3322786,-2.795293212 53.33223247,-2.795829977 53.3322014,-2.796290185 53.33206856,-2.796387655 53.33238926,-2.796449104 53.332611,-2.79649222 53.33278522,-2.796498685 53.33283317,-2.796488063 53.33286756,-2.796832456 53.33296438,-2.797056179 53.3330106,-2.797251185 53.33304363,-2.797446243 53.33308671,-2.797584887 53.33317817,-2.797718285 53.33327981,-2.797764792 53.33331041,-2.797804782 53.33331689,-2.797868811 53.33337792,-2.79792064 53.33341523,-2.798002396 53.33349673,-2.798094967 53.33355425,-2.798170663 53.33363606,-2.798240142 53.33368331,-2.798303602 53.3337138,-2.798539757 53.33382149,-2.799334956 53.33418892,-2.799433033 53.33423634,-2.799479411 53.33425993,-2.799519853 53.33429048,-2.79956099 53.33435177,-2.79957961 53.33441993,-2.799609737 53.33450196,-2.799599453 53.33458074,-2.799600283 53.33462514,-2.799669319 53.33464848,-2.799781593 53.33453487,-2.799826702 53.33451065,-2.799877956 53.33448982,-2.799957351 53.33447213,-2.800192027 53.33447396,-2.800432272 53.33449644,-2.800793067 53.33454182,-2.800891014 53.33458215,-2.801012327 53.33464648,-2.801098771 53.33470045,-2.801157376 53.33477536,-2.801204342 53.33485026,-2.801267948 53.33488749,-2.801400199 53.33493441,-2.801400903 53.33497196,-2.801224772 53.33531194,-2.801046528 53.33557301,-2.800846135 53.33584791,-2.800712286 53.33601316)))",conservation_areas_polygon.4,Hale Village,4 -"MULTIPOLYGON (((-2.790712725 53.35446721,-2.790377376 53.35438631,-2.790051664 53.35431622,-2.789762662 53.35422935,-2.789528745 53.35417365,-2.789326418 53.35411766,-2.789053861 53.3540287,-2.788806058 53.35393203,-2.788689254 53.35386486,-2.788376538 53.3544992,-2.788069551 53.35445127,-2.788086977 53.35440146,-2.787445568 53.35429472,-2.787508212 53.35415834,-2.787561505 53.35406642,-2.787643683 53.35398806,-2.788128491 53.35362414,-2.787136543 53.35314813,-2.786643729 53.35356206,-2.786259603 53.35342754,-2.785835856 53.353287,-2.785653206 53.3532368,-2.785347185 53.35316558,-2.785197592 
53.35309432,-2.785075711 53.35303419,-2.784712362 53.35293099,-2.78481744 53.35205639,-2.78647051 53.35272531,-2.786655432 53.35269093,-2.787728074 53.35309704,-2.78782969 53.3530048,-2.788677774 53.35324327,-2.789265133 53.35342252,-2.791441846 53.35408621,-2.791311306 53.35424128,-2.790910579 53.35412739,-2.790712725 53.35446721)))",conservation_areas_polygon.5,Halebank,5 -"MULTIPOLYGON (((-2.693226824 53.33533116,-2.693296915 53.33535465,-2.693447308 53.33538298,-2.693766008 53.33503902,-2.694326867 53.33435119,-2.69390786 53.33424273,-2.694022089 53.33413334,-2.694088557 53.33400965,-2.694132135 53.33386445,-2.694716469 53.33363999,-2.694568925 53.33356581,-2.69451442 53.33353378,-2.694467318 53.33349254,-2.694419837 53.33342793,-2.694403322 53.33336342,-2.694402418 53.3333078,-2.694398869 53.33323449,-2.694420663 53.33299084,-2.694459534 53.33255604,-2.694531237 53.3321994,-2.694563584 53.33199531,-2.694349961 53.33199234,-2.693956893 53.33203173,-2.693920008 53.33215444,-2.693834283 53.33247924,-2.693506996 53.33245293,-2.693322018 53.33243766,-2.693299499 53.33246115,-2.693113412 53.33242287,-2.693115212 53.33253384,-2.69304052 53.33270052,-2.692916968 53.33292203,-2.692839717 53.33313653,-2.69263046 53.3331355,-2.692543481 53.33350597,-2.692492472 53.33377397,-2.692769627 53.33379285,-2.692706555 53.3341559,-2.69233724 53.33412686,-2.692233176 53.33413079,-2.69211744 53.33411771,-2.692029153 53.33417044,-2.691922085 53.33424941,-2.691884559 53.33430975,-2.691425994 53.33415991,-2.691262495 53.33406363,-2.691214873 53.33399004,-2.691131358 53.33390236,-2.690997526 53.33375739,-2.690902181 53.33363456,-2.690863406 53.33355642,-2.690879591 53.33339368,-2.690942942 53.33325312,-2.691182788 53.33296354,-2.691120654 53.33294989,-2.69114324 53.33289889,-2.691187855 53.33278658,-2.691362354 53.33254482,-2.691731171 53.3319938,-2.69203961 53.3314833,-2.692127515 53.33117743,-2.69215062 53.3311818,-2.692235619 53.33118607,-2.69232458 53.33093807,-2.692406731 53.33076658,-2.692480081 53.33054869,-2.692572756 53.3305069,-2.692703046 53.33046427,-2.692731291 53.3302975,-2.692970509 53.33029135,-2.693449371 53.33027481,-2.69373605 53.33033327,-2.693876931 53.33033263,-2.694347794 53.33041058,-2.694499625 53.3302235,-2.695198031 53.33043798,-2.695849884 53.3306148,-2.696052812 53.33069081,-2.696712289 53.33081816,-2.697119621 53.33101608,-2.69754263 53.33115038,-2.697769665 53.33131566,-2.697973701 53.33151333,-2.698127929 53.33150318,-2.698359611 53.33149031,-2.698730019 53.33147439,-2.698730428 53.33190079,-2.69751652 53.33190224,-2.697527219 53.33210329,-2.697929993 53.33214253,-2.697834892 53.33246696,-2.697672655 53.3324724,-2.697487477 53.33248274,-2.697505374 53.33263073,-2.697476316 53.33274655,-2.697871201 53.3328182,-2.697818959 53.3329249,-2.697751629 53.33305479,-2.69776913 53.33317941,-2.697809158 53.33327192,-2.69769693 53.33349669,-2.697651997 53.33348177,-2.697502943 53.33364197,-2.697260503 53.33386131,-2.697386613 53.33389624,-2.696965018 53.33428305,-2.696937879 53.33431969,-2.696886094 53.33439386,-2.696464994 53.33472782,-2.696350312 53.33477873,-2.696269107 53.33474972,-2.696030994 53.3351138,-2.695978741 53.33522024,-2.69588599 53.33531127,-2.695391548 53.33520991,-2.695300796 53.33533993,-2.69490457 53.33524789,-2.694533773 53.33567725,-2.693661056 53.33540007,-2.693468982 53.33564975,-2.693408188 53.33584115,-2.693394225 53.33593388,-2.693405456 53.33601613,-2.692886454 53.33601915,-2.692800244 53.33600194,-2.693086706 53.33554836,-2.693226824 
53.33533116)))",conservation_areas_polygon.6,Halton,6 -"MULTIPOLYGON (((-2.740832295 53.33468831,-2.740774378 53.33470412,-2.740729413 53.33473163,-2.740576448 53.33489972,-2.740471575 53.33485769,-2.740163367 53.33467673,-2.739776702 53.33448476,-2.739396747 53.3343081,-2.739100317 53.33403487,-2.738614277 53.33433964,-2.738126537 53.33405869,-2.738344515 53.33391833,-2.738193911 53.33383748,-2.73783865 53.33359085,-2.738315965 53.3334053,-2.738347899 53.33335451,-2.738366415 53.33331539,-2.738397973 53.33326461,-2.73842355 53.33322571,-2.738494385 53.33319796,-2.738546168 53.33317831,-2.738597114 53.33313891,-2.738602132 53.33304956,-2.738653109 53.33298328,-2.738716925 53.33292834,-2.738749262 53.33290091,-2.738806842 53.33286568,-2.738929794 53.33283769,-2.739387243 53.33262081,-2.739445829 53.33264381,-2.739493371 53.33237914,-2.739292084 53.33238039,-2.739292685 53.33206442,-2.739294682 53.33201579,-2.739416824 53.33200453,-2.739566003 53.33196855,-2.739619462 53.33183064,-2.739657438 53.33174306,-2.739690331 53.33169056,-2.73977611 53.33147498,-2.739877962 53.33148702,-2.739858283 53.33157332,-2.739910045 53.33162368,-2.739933578 53.33164555,-2.739981816 53.33167194,-2.74002462 53.33170537,-2.74011294 53.33176494,-2.740183349 53.33179173,-2.740245867 53.33182675,-2.740299769 53.33187234,-2.740368275 53.33191756,-2.740509567 53.33200538,-2.740569578 53.33204563,-2.740590792 53.33204801,-2.740721173 53.3319914,-2.740861324 53.33201335,-2.74108604 53.33202715,-2.741272575 53.33203865,-2.741213768 53.33231032,-2.741190153 53.33241192,-2.741186286 53.33243225,-2.741065766 53.33256483,-2.740965493 53.33264399,-2.74145198 53.33258867,-2.741847763 53.33254343,-2.742106692 53.33251064,-2.74230605 53.33240435,-2.7425218 53.33247328,-2.742725714 53.33253671,-2.742628679 53.33263185,-2.742533022 53.33274146,-2.742820511 53.33284075,-2.742552625 53.33311435,-2.742179645 53.33296139,-2.741687054 53.33339984,-2.741373897 53.33331247,-2.74119488 53.33348458,-2.741358187 53.33354171,-2.741567543 53.33362614,-2.741849133 53.3337643,-2.741997656 53.3340743,-2.741961483 53.33423017,-2.741578133 53.33423642,-2.741298276 53.33419916,-2.741346737 53.33436627,-2.741365882 53.33448477,-2.741251892 53.33453006,-2.740884141 53.33467225,-2.740832295 53.33468831)))",conservation_areas_polygon.7,Higher Runcorn,7 -"MULTIPOLYGON (((-2.731365138 53.36304468,-2.731052514 53.36289105,-2.730753708 53.36285019,-2.730775291 53.36271436,-2.730528121 53.36267796,-2.730293235 53.36264372,-2.7301736 53.3623781,-2.730115326 53.36223836,-2.730214908 53.36223658,-2.730306619 53.36224554,-2.730462919 53.36227047,-2.730911737 53.36231768,-2.730834842 53.36202349,-2.731276574 53.3619905,-2.731209877 53.36171279,-2.731165039 53.36153298,-2.731115774 53.36131842,-2.731169578 53.36130578,-2.731816175 53.36125455,-2.73230487 53.36121453,-2.732349418 53.36137799,-2.732394289 53.36158133,-2.732426347 53.36175251,-2.732679175 53.36173612,-2.732669859 53.36169153,-2.733237626 53.36163825,-2.73318875 53.36143995,-2.733375316 53.36142398,-2.733416563 53.36141609,-2.733436861 53.36140105,-2.733445155 53.36138617,-2.733440486 53.36136625,-2.733444475 53.36134654,-2.73378871 53.36132079,-2.733841011 53.36153812,-2.734018676 53.36155248,-2.734215568 53.36161094,-2.734334009 53.36168354,-2.73436375 53.36171957,-2.734385347 53.36175035,-2.734398249 53.36178756,-2.734403008 53.36181961,-2.734395235 53.36185695,-2.734387608 53.3618818,-2.73437118 53.36190688,-2.734350959 53.36193424,-2.734314069 53.36197185,-2.734248386 53.36200676,-2.733942146 
53.3623042,-2.733913597 53.36233664,-2.733897916 53.36237637,-2.733871264 53.36251923,-2.733854417 53.36274291,-2.7338336 53.36298126,-2.733808387 53.36323672,-2.73378582 53.36353152,-2.733264628 53.36350606,-2.73238955 53.36347126,-2.732449797 53.36325684,-2.732489069 53.36300409,-2.73217143 53.36302095,-2.731920554 53.36344781,-2.731483464 53.36335614,-2.731344315 53.36359404,-2.731089517 53.36353989,-2.731045976 53.36352982,-2.731182425 53.36329301,-2.731365138 53.36304468)))",conservation_areas_polygon.8,Victoria Square,8 -"MULTIPOLYGON (((-2.736877605 53.34817381,-2.736795079 53.34829419,-2.736459974 53.34821773,-2.736395546 53.34832407,-2.735848013 53.34820379,-2.735440525 53.34809623,-2.735485553 53.34803556,-2.735479503 53.34801485,-2.735450219 53.34800487,-2.735188767 53.34800504,-2.734754161 53.34803414,-2.734437979 53.34807149,-2.733871621 53.34814552,-2.733629276 53.34819113,-2.733567172 53.34821021,-2.73349168 53.34832875,-2.733391322 53.34849256,-2.73353946 53.34853577,-2.733709633 53.34857885,-2.733775993 53.34859156,-2.73351788 53.34900328,-2.733088655 53.34891327,-2.732974132 53.34910358,-2.732848424 53.34908243,-2.732720228 53.34927004,-2.732693717 53.34926229,-2.7326477 53.34925862,-2.732252867 53.34917424,-2.732239008 53.34921845,-2.731575118 53.34908791,-2.731755394 53.34896135,-2.731930051 53.34883214,-2.732023572 53.34869507,-2.732139047 53.34853117,-2.732239789 53.34841139,-2.732378294 53.34830468,-2.732553371 53.34819325,-2.732714155 53.34812173,-2.732823009 53.34804152,-2.733399399 53.34792449,-2.73372664 53.34787502,-2.733989311 53.34783746,-2.733987646 53.34781923,-2.734271922 53.34777758,-2.734368324 53.34776576,-2.734484079 53.34775606,-2.735450016 53.34770517,-2.73553026 53.34769723,-2.73558289 53.3476969,-2.735678721 53.34770952,-2.73582312 53.3477302,-2.735844805 53.34770193,-2.735969788 53.34771689,-2.735973808 53.34770608,-2.736173686 53.34773306,-2.736164597 53.34775217,-2.736445193 53.34779869,-2.736644898 53.34783736,-2.73675611 53.3478586,-2.736837767 53.34788901,-2.736948943 53.34793694,-2.736933718 53.34805925,-2.736877605 53.34817381)))",conservation_areas_polygon.9,West Bank Promenade,9 -"MULTIPOLYGON (((-2.737815014 53.31762619,-2.737724903 53.31751747,-2.737662642 53.3174386,-2.737645776 53.31741543,-2.737606452 53.31739572,-2.737695017 53.3173853,-2.73786617 53.31738424,-2.738225421 53.31738201,-2.738556706 53.31738669,-2.738988829 53.31744665,-2.739000602 53.31747641,-2.73944373 53.31754331,-2.739620426 53.31756243,-2.739914589 53.31759655,-2.740269854 53.31771343,-2.740556057 53.31782352,-2.740972025 53.31803014,-2.741211848 53.31817756,-2.741425596 53.31832738,-2.741620439 53.31847165,-2.741439807 53.31857558,-2.74115851 53.31869873,-2.741171574 53.31871716,-2.740875946 53.31887797,-2.740591704 53.31903843,-2.740524197 53.31899249,-2.740355129 53.31908573,-2.740471219 53.3191285,-2.740566161 53.31916449,-2.740654744 53.31918353,-2.740726563 53.31919324,-2.740743614 53.31927716,-2.740612847 53.31931616,-2.740061315 53.31958756,-2.74017288 53.31966891,-2.740072953 53.31972399,-2.739981088 53.31977991,-2.739731568 53.31983762,-2.739669639 53.31984169,-2.739632507 53.31983518,-2.739593064 53.31980874,-2.739518854 53.31982573,-2.739510522 53.31980925,-2.739493738 53.31981268,-2.739485546 53.31980428,-2.739364488 53.31982498,-2.739364289 53.31981348,-2.739169057 53.3198394,-2.739025833 53.31983184,-2.739059173 53.31969801,-2.738730595 53.31965727,-2.738799409 53.3194852,-2.7388649 53.3193146,-2.73851779 53.31931846,-2.738477073 
53.31916082,-2.738241064 53.31918367,-2.737988366 53.31921552,-2.7379235 53.31905021,-2.737875111 53.31885668,-2.737544883 53.31893483,-2.737646818 53.31878116,-2.73734971 53.31882821,-2.737353134 53.31878298,-2.737165674 53.31846279,-2.736631508 53.31850033,-2.736615351 53.31846079,-2.736371293 53.31851209,-2.73629566 53.3182808,-2.736237196 53.31810584,-2.736822256 53.31805585,-2.737114813 53.31804731,-2.737441848 53.31804896,-2.737563381 53.31804821,-2.737866683 53.31800986,-2.738075154 53.31796867,-2.737815014 53.31762619)))",conservation_areas_polygon.10,Weston Village,10 +WKT,gml_id,lowerCorner,upperCorner,Name,Ref +"MULTIPOLYGON (((-2.632272212 53.34206401,-2.632000357 53.34224581,-2.630564119 53.3422481,-2.631009551 53.34186124,-2.631269515 53.34167052,-2.631587148 53.34143591,-2.631736085 53.34132451,-2.630234541 53.34106187,-2.630381899 53.34083419,-2.630430719 53.34083671,-2.630473829 53.3408045,-2.630570696 53.34078062,-2.63062419 53.34078034,-2.630642956 53.3407513,-2.630594227 53.34072236,-2.630564563 53.34068459,-2.630568851 53.34065563,-2.63063132 53.34056742,-2.630716303 53.34044359,-2.630861659 53.34027486,-2.630946484 53.34019947,-2.631384962 53.34029905,-2.63171138 53.3398375,-2.632213184 53.33997422,-2.632657969 53.34008985,-2.632914151 53.33957816,-2.632885686 53.33957175,-2.63293775 53.33946589,-2.632883467 53.3394554,-2.632917069 53.33939897,-2.63292785 53.33935829,-2.632925513 53.33917516,-2.63294476 53.33914532,-2.633046457 53.33865511,-2.633644084 53.33866515,-2.633816397 53.33841963,-2.634026323 53.33798843,-2.634336105 53.33736035,-2.634500904 53.33711002,-2.634540774 53.33710981,-2.634559936 53.33707439,-2.634913572 53.33709021,-2.635470784 53.33712481,-2.636015216 53.33662362,-2.637044871 53.33681168,-2.636935924 53.33706199,-2.636748823 53.33748571,-2.636502934 53.33801909,-2.636344157 53.33835863,-2.636267224 53.33851333,-2.635995827 53.33894819,-2.635870308 53.3391595,-2.635738612 53.33938773,-2.63565651 53.33952134,-2.635579679 53.33962563,-2.635375517 53.3399026,-2.6351317 53.34017833,-2.634725706 53.34052297,-2.634310927 53.34081048,-2.63398409 53.34102574,-2.633800358 53.34113598,-2.633595926 53.34124607,-2.633165035 53.34146691,-2.633066806 53.34151541,-2.632769196 53.34173949,-2.632272212 53.34206401)))",conservation_areas_polygon.1,53.336623621792555 -2.6370448708677467,53.34224810480465 -2.630234540634663,Daresbury,1 +"MULTIPOLYGON (((-2.636788444 53.35518027,-2.636565647 53.35505107,-2.636258261 53.354965,-2.635477454 53.35471351,-2.635008016 53.35456728,-2.634656133 53.35444461,-2.634469454 53.35431979,-2.634403116 53.35440731,-2.63425788 53.35474516,-2.634113641 53.35494191,-2.633950352 53.35528255,-2.632575891 53.35490245,-2.632233555 53.35520719,-2.632071694 53.35536926,-2.63184661 53.35557804,-2.631608214 53.35579209,-2.630836185 53.35638612,-2.630249779 53.35685884,-2.629762455 53.3572382,-2.629696966 53.35721609,-2.629671084 53.35719106,-2.629914256 53.35673741,-2.630007817 53.35658883,-2.62982389 53.35654846,-2.629577281 53.35644192,-2.629684466 53.35636848,-2.629877509 53.35621111,-2.63027752 53.35582609,-2.630229262 53.35581286,-2.629983626 53.35574659,-2.629927443 53.35573394,-2.62980002 53.35571143,-2.629189378 53.355599,-2.629323648 53.35532862,-2.629638798 53.35540513,-2.629853419 53.35509029,-2.630029088 53.35487279,-2.630113139 53.35489455,-2.630322542 53.35460427,-2.630274311 53.35459284,-2.630385887 53.35441423,-2.630503172 53.35432016,-2.630238926 53.35423529,-2.630147221 53.35412335,-2.630084971 53.35397631,-2.630229717 
53.35391264,-2.630356307 53.35383729,-2.630576983 53.3537238,-2.630792318 53.35365976,-2.630942067 53.35362572,-2.631139727 53.35358055,-2.631196349 53.3535479,-2.631225404 53.35347766,-2.631282835 53.35339917,-2.631387062 53.35333571,-2.631354942 53.35319956,-2.631342389 53.35306178,-2.631411528 53.35286273,-2.63121451 53.35284131,-2.63134322 53.35250625,-2.632175723 53.35262316,-2.632232647 53.35251052,-2.632445365 53.35226219,-2.632692377 53.35200468,-2.633013939 53.3521136,-2.635172302 53.3526864,-2.635545184 53.35280573,-2.635979617 53.35238314,-2.636600521 53.35179005,-2.637117499 53.35125666,-2.637200141 53.35124166,-2.637489241 53.35149829,-2.637706892 53.35169042,-2.637759235 53.35176203,-2.637890637 53.35191858,-2.637933025 53.35194532,-2.637997827 53.35195486,-2.638008 53.35203038,-2.638046643 53.3522036,-2.63810637 53.35238822,-2.638371036 53.3530064,-2.638376316 53.3530531,-2.638432739 53.35321456,-2.638441974 53.35332782,-2.638406463 53.3534628,-2.638263964 53.35368292,-2.63826113 53.35372553,-2.638780194 53.35384461,-2.638675041 53.353945,-2.638514619 53.35405163,-2.637348667 53.3548279,-2.636788444 53.35518027)))",conservation_areas_polygon.2,53.351241662102865 -2.6387801935178166,53.35723820484838 -2.629189378252195,Moore,2 +"MULTIPOLYGON (((-2.818057261 53.33653314,-2.818072201 53.33662793,-2.818048504 53.33670959,-2.818016747 53.33681657,-2.818011453 53.3369213,-2.817737254 53.3368805,-2.81740636 53.33681357,-2.817138523 53.33674881,-2.816795325 53.33665331,-2.816509739 53.33657285,-2.816313127 53.33654149,-2.816037071 53.33651957,-2.815695268 53.3365166,-2.815706931 53.3365832,-2.815623603 53.33708233,-2.815504485 53.33706293,-2.815359707 53.33704855,-2.814933572 53.3370006,-2.814830927 53.33698108,-2.814745702 53.33696621,-2.814819917 53.33655332,-2.814533705 53.33653748,-2.814024696 53.33647392,-2.813709761 53.3364323,-2.813043392 53.33634321,-2.812550771 53.33628195,-2.812208893 53.33626227,-2.810991874 53.336185,-2.810380068 53.3361462,-2.810200041 53.33613305,-2.809900268 53.33611378,-2.809097205 53.33608328,-2.809040959 53.33628451,-2.808997658 53.3365286,-2.80864196 53.33651826,-2.808642644 53.33655437,-2.808291097 53.33654041,-2.807848924 53.33658636,-2.807358971 53.33663272,-2.806916306 53.33667894,-2.806211092 53.33676233,-2.805577301 53.33682422,-2.805276572 53.33675436,-2.805194226 53.33681953,-2.804935377 53.33707946,-2.804730496 53.33701623,-2.804929974 53.33679265,-2.805105693 53.33657632,-2.805498734 53.33646602,-2.805701722 53.33642897,-2.805692708 53.33553112,-2.805850845 53.33524932,-2.806388786 53.33538479,-2.806593808 53.33545502,-2.806980072 53.33560293,-2.807896916 53.33594818,-2.807856331 53.33570709,-2.807852626 53.33551112,-2.807866672 53.33541604,-2.807658952 53.33530089,-2.807403283 53.33517699,-2.807113138 53.33505009,-2.807877939 53.33494436,-2.808524394 53.33485263,-2.808904452 53.33475111,-2.808891381 53.33470797,-2.809165076 53.33461311,-2.809675582 53.33435857,-2.809973991 53.33430649,-2.810225419 53.33430478,-2.810485018 53.33430868,-2.810477659 53.33433875,-2.81064544 53.33435198,-2.810873522 53.33437901,-2.811078266 53.33443521,-2.811330918 53.33449084,-2.811523901 53.33454685,-2.811692226 53.33458866,-2.811848647 53.33462354,-2.812044833 53.33464575,-2.812156644 53.33463987,-2.812363002 53.33459838,-2.812546442 53.33454968,-2.812729951 53.33446594,-2.812743027 53.33442388,-2.812825443 53.33433714,-2.812968369 53.33430049,-2.813718721 53.33408015,-2.814112523 53.33400584,-2.81491254 53.33387844,-2.815307393 53.33383979,-2.815929168 
53.33379259,-2.816312301 53.33377568,-2.816700177 53.33389066,-2.816768075 53.3338085,-2.816839463 53.33378636,-2.816922881 53.33377851,-2.816995457 53.33379257,-2.817066814 53.33384663,-2.817121283 53.33392371,-2.817093601 53.33399723,-2.817034104 53.33402631,-2.81698645 53.33403392,-2.816962441 53.33403408,-2.817011177 53.33407679,-2.817035613 53.33410529,-2.816954309 53.3341727,-2.816884428 53.33427374,-2.816778237 53.33443766,-2.816690504 53.33478405,-2.816552321 53.33538637,-2.816597831 53.33555886,-2.81663733 53.33575944,-2.816713246 53.33597405,-2.816750638 53.3360384,-2.816836199 53.33611645,-2.816992823 53.33618736,-2.817528781 53.3363839,-2.818057261 53.33653314)))",conservation_areas_polygon.3,53.33377567729678 -2.81807220104152,53.3370823316775 -2.8047304963029385,Hale Road,3 +"MULTIPOLYGON (((-2.800712286 53.33601316,-2.800577236 53.33614724,-2.799749745 53.33588959,-2.799243757 53.33570813,-2.798774901 53.33600899,-2.798493827 53.33593565,-2.798361589 53.33591596,-2.797758862 53.33575932,-2.797173054 53.33559896,-2.796880148 53.3355222,-2.796489482 53.33540853,-2.796420519 53.33538176,-2.796076406 53.33530174,-2.796001131 53.33526827,-2.795915513 53.33525842,-2.7955957 53.33519289,-2.79515818 53.33510953,-2.795402822 53.33474216,-2.795725393 53.3342712,-2.79503064 53.33421041,-2.795071289 53.33407309,-2.794656544 53.33402832,-2.794810306 53.3334883,-2.794191919 53.33337128,-2.794526347 53.33308562,-2.794547885 53.33269898,-2.794534379 53.3322786,-2.795293212 53.33223247,-2.795829977 53.3322014,-2.796290185 53.33206856,-2.796387655 53.33238926,-2.796449104 53.332611,-2.79649222 53.33278522,-2.796498685 53.33283317,-2.796488063 53.33286756,-2.796832456 53.33296438,-2.797056179 53.3330106,-2.797251185 53.33304363,-2.797446243 53.33308671,-2.797584887 53.33317817,-2.797718285 53.33327981,-2.797764792 53.33331041,-2.797804782 53.33331689,-2.797868811 53.33337792,-2.79792064 53.33341523,-2.798002396 53.33349673,-2.798094967 53.33355425,-2.798170663 53.33363606,-2.798240142 53.33368331,-2.798303602 53.3337138,-2.798539757 53.33382149,-2.799334956 53.33418892,-2.799433033 53.33423634,-2.799479411 53.33425993,-2.799519853 53.33429048,-2.79956099 53.33435177,-2.79957961 53.33441993,-2.799609737 53.33450196,-2.799599453 53.33458074,-2.799600283 53.33462514,-2.799669319 53.33464848,-2.799781593 53.33453487,-2.799826702 53.33451065,-2.799877956 53.33448982,-2.799957351 53.33447213,-2.800192027 53.33447396,-2.800432272 53.33449644,-2.800793067 53.33454182,-2.800891014 53.33458215,-2.801012327 53.33464648,-2.801098771 53.33470045,-2.801157376 53.33477536,-2.801204342 53.33485026,-2.801267948 53.33488749,-2.801400199 53.33493441,-2.801400903 53.33497196,-2.801224772 53.33531194,-2.801046528 53.33557301,-2.800846135 53.33584791,-2.800712286 53.33601316)))",conservation_areas_polygon.4,53.33206856009858 -2.8014009026934446,53.33614724444181 -2.794191918978806,Hale Village,4 +"MULTIPOLYGON (((-2.790712725 53.35446721,-2.790377376 53.35438631,-2.790051664 53.35431622,-2.789762662 53.35422935,-2.789528745 53.35417365,-2.789326418 53.35411766,-2.789053861 53.3540287,-2.788806058 53.35393203,-2.788689254 53.35386486,-2.788376538 53.3544992,-2.788069551 53.35445127,-2.788086977 53.35440146,-2.787445568 53.35429472,-2.787508212 53.35415834,-2.787561505 53.35406642,-2.787643683 53.35398806,-2.788128491 53.35362414,-2.787136543 53.35314813,-2.786643729 53.35356206,-2.786259603 53.35342754,-2.785835856 53.353287,-2.785653206 53.3532368,-2.785347185 53.35316558,-2.785197592 53.35309432,-2.785075711 
53.35303419,-2.784712362 53.35293099,-2.78481744 53.35205639,-2.78647051 53.35272531,-2.786655432 53.35269093,-2.787728074 53.35309704,-2.78782969 53.3530048,-2.788677774 53.35324327,-2.789265133 53.35342252,-2.791441846 53.35408621,-2.791311306 53.35424128,-2.790910579 53.35412739,-2.790712725 53.35446721)))",conservation_areas_polygon.5,53.35205638586151 -2.7914418459304846,53.35449920453541 -2.7847123619887992,Halebank,5 +"MULTIPOLYGON (((-2.693226824 53.33533116,-2.693296915 53.33535465,-2.693447308 53.33538298,-2.693766008 53.33503902,-2.694326867 53.33435119,-2.69390786 53.33424273,-2.694022089 53.33413334,-2.694088557 53.33400965,-2.694132135 53.33386445,-2.694716469 53.33363999,-2.694568925 53.33356581,-2.69451442 53.33353378,-2.694467318 53.33349254,-2.694419837 53.33342793,-2.694403322 53.33336342,-2.694402418 53.3333078,-2.694398869 53.33323449,-2.694420663 53.33299084,-2.694459534 53.33255604,-2.694531237 53.3321994,-2.694563584 53.33199531,-2.694349961 53.33199234,-2.693956893 53.33203173,-2.693920008 53.33215444,-2.693834283 53.33247924,-2.693506996 53.33245293,-2.693322018 53.33243766,-2.693299499 53.33246115,-2.693113412 53.33242287,-2.693115212 53.33253384,-2.69304052 53.33270052,-2.692916968 53.33292203,-2.692839717 53.33313653,-2.69263046 53.3331355,-2.692543481 53.33350597,-2.692492472 53.33377397,-2.692769627 53.33379285,-2.692706555 53.3341559,-2.69233724 53.33412686,-2.692233176 53.33413079,-2.69211744 53.33411771,-2.692029153 53.33417044,-2.691922085 53.33424941,-2.691884559 53.33430975,-2.691425994 53.33415991,-2.691262495 53.33406363,-2.691214873 53.33399004,-2.691131358 53.33390236,-2.690997526 53.33375739,-2.690902181 53.33363456,-2.690863406 53.33355642,-2.690879591 53.33339368,-2.690942942 53.33325312,-2.691182788 53.33296354,-2.691120654 53.33294989,-2.69114324 53.33289889,-2.691187855 53.33278658,-2.691362354 53.33254482,-2.691731171 53.3319938,-2.69203961 53.3314833,-2.692127515 53.33117743,-2.69215062 53.3311818,-2.692235619 53.33118607,-2.69232458 53.33093807,-2.692406731 53.33076658,-2.692480081 53.33054869,-2.692572756 53.3305069,-2.692703046 53.33046427,-2.692731291 53.3302975,-2.692970509 53.33029135,-2.693449371 53.33027481,-2.69373605 53.33033327,-2.693876931 53.33033263,-2.694347794 53.33041058,-2.694499625 53.3302235,-2.695198031 53.33043798,-2.695849884 53.3306148,-2.696052812 53.33069081,-2.696712289 53.33081816,-2.697119621 53.33101608,-2.69754263 53.33115038,-2.697769665 53.33131566,-2.697973701 53.33151333,-2.698127929 53.33150318,-2.698359611 53.33149031,-2.698730019 53.33147439,-2.698730428 53.33190079,-2.69751652 53.33190224,-2.697527219 53.33210329,-2.697929993 53.33214253,-2.697834892 53.33246696,-2.697672655 53.3324724,-2.697487477 53.33248274,-2.697505374 53.33263073,-2.697476316 53.33274655,-2.697871201 53.3328182,-2.697818959 53.3329249,-2.697751629 53.33305479,-2.69776913 53.33317941,-2.697809158 53.33327192,-2.69769693 53.33349669,-2.697651997 53.33348177,-2.697502943 53.33364197,-2.697260503 53.33386131,-2.697386613 53.33389624,-2.696965018 53.33428305,-2.696937879 53.33431969,-2.696886094 53.33439386,-2.696464994 53.33472782,-2.696350312 53.33477873,-2.696269107 53.33474972,-2.696030994 53.3351138,-2.695978741 53.33522024,-2.69588599 53.33531127,-2.695391548 53.33520991,-2.695300796 53.33533993,-2.69490457 53.33524789,-2.694533773 53.33567725,-2.693661056 53.33540007,-2.693468982 53.33564975,-2.693408188 53.33584115,-2.693394225 53.33593388,-2.693405456 53.33601613,-2.692886454 53.33601915,-2.692800244 53.33600194,-2.693086706 
53.33554836,-2.693226824 53.33533116)))",conservation_areas_polygon.6,53.33022350189056 -2.6987304284150504,53.336019148385034 -2.6908634063722134,Halton,6 +"MULTIPOLYGON (((-2.740832295 53.33468831,-2.740774378 53.33470412,-2.740729413 53.33473163,-2.740576448 53.33489972,-2.740471575 53.33485769,-2.740163367 53.33467673,-2.739776702 53.33448476,-2.739396747 53.3343081,-2.739100317 53.33403487,-2.738614277 53.33433964,-2.738126537 53.33405869,-2.738344515 53.33391833,-2.738193911 53.33383748,-2.73783865 53.33359085,-2.738315965 53.3334053,-2.738347899 53.33335451,-2.738366415 53.33331539,-2.738397973 53.33326461,-2.73842355 53.33322571,-2.738494385 53.33319796,-2.738546168 53.33317831,-2.738597114 53.33313891,-2.738602132 53.33304956,-2.738653109 53.33298328,-2.738716925 53.33292834,-2.738749262 53.33290091,-2.738806842 53.33286568,-2.738929794 53.33283769,-2.739387243 53.33262081,-2.739445829 53.33264381,-2.739493371 53.33237914,-2.739292084 53.33238039,-2.739292685 53.33206442,-2.739294682 53.33201579,-2.739416824 53.33200453,-2.739566003 53.33196855,-2.739619462 53.33183064,-2.739657438 53.33174306,-2.739690331 53.33169056,-2.73977611 53.33147498,-2.739877962 53.33148702,-2.739858283 53.33157332,-2.739910045 53.33162368,-2.739933578 53.33164555,-2.739981816 53.33167194,-2.74002462 53.33170537,-2.74011294 53.33176494,-2.740183349 53.33179173,-2.740245867 53.33182675,-2.740299769 53.33187234,-2.740368275 53.33191756,-2.740509567 53.33200538,-2.740569578 53.33204563,-2.740590792 53.33204801,-2.740721173 53.3319914,-2.740861324 53.33201335,-2.74108604 53.33202715,-2.741272575 53.33203865,-2.741213768 53.33231032,-2.741190153 53.33241192,-2.741186286 53.33243225,-2.741065766 53.33256483,-2.740965493 53.33264399,-2.74145198 53.33258867,-2.741847763 53.33254343,-2.742106692 53.33251064,-2.74230605 53.33240435,-2.7425218 53.33247328,-2.742725714 53.33253671,-2.742628679 53.33263185,-2.742533022 53.33274146,-2.742820511 53.33284075,-2.742552625 53.33311435,-2.742179645 53.33296139,-2.741687054 53.33339984,-2.741373897 53.33331247,-2.74119488 53.33348458,-2.741358187 53.33354171,-2.741567543 53.33362614,-2.741849133 53.3337643,-2.741997656 53.3340743,-2.741961483 53.33423017,-2.741578133 53.33423642,-2.741298276 53.33419916,-2.741346737 53.33436627,-2.741365882 53.33448477,-2.741251892 53.33453006,-2.740884141 53.33467225,-2.740832295 53.33468831)))",conservation_areas_polygon.7,53.331474975214284 -2.742820510669728,53.334899721691386 -2.737838650202599,Higher Runcorn,7 +"MULTIPOLYGON (((-2.731365138 53.36304468,-2.731052514 53.36289105,-2.730753708 53.36285019,-2.730775291 53.36271436,-2.730528121 53.36267796,-2.730293235 53.36264372,-2.7301736 53.3623781,-2.730115326 53.36223836,-2.730214908 53.36223658,-2.730306619 53.36224554,-2.730462919 53.36227047,-2.730911737 53.36231768,-2.730834842 53.36202349,-2.731276574 53.3619905,-2.731209877 53.36171279,-2.731165039 53.36153298,-2.731115774 53.36131842,-2.731169578 53.36130578,-2.731816175 53.36125455,-2.73230487 53.36121453,-2.732349418 53.36137799,-2.732394289 53.36158133,-2.732426347 53.36175251,-2.732679175 53.36173612,-2.732669859 53.36169153,-2.733237626 53.36163825,-2.73318875 53.36143995,-2.733375316 53.36142398,-2.733416563 53.36141609,-2.733436861 53.36140105,-2.733445155 53.36138617,-2.733440486 53.36136625,-2.733444475 53.36134654,-2.73378871 53.36132079,-2.733841011 53.36153812,-2.734018676 53.36155248,-2.734215568 53.36161094,-2.734334009 53.36168354,-2.73436375 53.36171957,-2.734385347 53.36175035,-2.734398249 
53.36178756,-2.734403008 53.36181961,-2.734395235 53.36185695,-2.734387608 53.3618818,-2.73437118 53.36190688,-2.734350959 53.36193424,-2.734314069 53.36197185,-2.734248386 53.36200676,-2.733942146 53.3623042,-2.733913597 53.36233664,-2.733897916 53.36237637,-2.733871264 53.36251923,-2.733854417 53.36274291,-2.7338336 53.36298126,-2.733808387 53.36323672,-2.73378582 53.36353152,-2.733264628 53.36350606,-2.73238955 53.36347126,-2.732449797 53.36325684,-2.732489069 53.36300409,-2.73217143 53.36302095,-2.731920554 53.36344781,-2.731483464 53.36335614,-2.731344315 53.36359404,-2.731089517 53.36353989,-2.731045976 53.36352982,-2.731182425 53.36329301,-2.731365138 53.36304468)))",conservation_areas_polygon.8,53.36121452724432 -2.7344030078801715,53.36359404388266 -2.730115325658145,Victoria Square,8 +"MULTIPOLYGON (((-2.736877605 53.34817381,-2.736795079 53.34829419,-2.736459974 53.34821773,-2.736395546 53.34832407,-2.735848013 53.34820379,-2.735440525 53.34809623,-2.735485553 53.34803556,-2.735479503 53.34801485,-2.735450219 53.34800487,-2.735188767 53.34800504,-2.734754161 53.34803414,-2.734437979 53.34807149,-2.733871621 53.34814552,-2.733629276 53.34819113,-2.733567172 53.34821021,-2.73349168 53.34832875,-2.733391322 53.34849256,-2.73353946 53.34853577,-2.733709633 53.34857885,-2.733775993 53.34859156,-2.73351788 53.34900328,-2.733088655 53.34891327,-2.732974132 53.34910358,-2.732848424 53.34908243,-2.732720228 53.34927004,-2.732693717 53.34926229,-2.7326477 53.34925862,-2.732252867 53.34917424,-2.732239008 53.34921845,-2.731575118 53.34908791,-2.731755394 53.34896135,-2.731930051 53.34883214,-2.732023572 53.34869507,-2.732139047 53.34853117,-2.732239789 53.34841139,-2.732378294 53.34830468,-2.732553371 53.34819325,-2.732714155 53.34812173,-2.732823009 53.34804152,-2.733399399 53.34792449,-2.73372664 53.34787502,-2.733989311 53.34783746,-2.733987646 53.34781923,-2.734271922 53.34777758,-2.734368324 53.34776576,-2.734484079 53.34775606,-2.735450016 53.34770517,-2.73553026 53.34769723,-2.73558289 53.3476969,-2.735678721 53.34770952,-2.73582312 53.3477302,-2.735844805 53.34770193,-2.735969788 53.34771689,-2.735973808 53.34770608,-2.736173686 53.34773306,-2.736164597 53.34775217,-2.736445193 53.34779869,-2.736644898 53.34783736,-2.73675611 53.3478586,-2.736837767 53.34788901,-2.736948943 53.34793694,-2.736933718 53.34805925,-2.736877605 53.34817381)))",conservation_areas_polygon.9,53.347696902912006 -2.7369489434384215,53.3492700401932 -2.7315751180850665,West Bank Promenade,9 +"MULTIPOLYGON (((-2.737815014 53.31762619,-2.737724903 53.31751747,-2.737662642 53.3174386,-2.737645776 53.31741543,-2.737606452 53.31739572,-2.737695017 53.3173853,-2.73786617 53.31738424,-2.738225421 53.31738201,-2.738556706 53.31738669,-2.738988829 53.31744665,-2.739000602 53.31747641,-2.73944373 53.31754331,-2.739620426 53.31756243,-2.739914589 53.31759655,-2.740269854 53.31771343,-2.740556057 53.31782352,-2.740972025 53.31803014,-2.741211848 53.31817756,-2.741425596 53.31832738,-2.741620439 53.31847165,-2.741439807 53.31857558,-2.74115851 53.31869873,-2.741171574 53.31871716,-2.740875946 53.31887797,-2.740591704 53.31903843,-2.740524197 53.31899249,-2.740355129 53.31908573,-2.740471219 53.3191285,-2.740566161 53.31916449,-2.740654744 53.31918353,-2.740726563 53.31919324,-2.740743614 53.31927716,-2.740612847 53.31931616,-2.740061315 53.31958756,-2.74017288 53.31966891,-2.740072953 53.31972399,-2.739981088 53.31977991,-2.739731568 53.31983762,-2.739669639 53.31984169,-2.739632507 53.31983518,-2.739593064 
53.31980874,-2.739518854 53.31982573,-2.739510522 53.31980925,-2.739493738 53.31981268,-2.739485546 53.31980428,-2.739364488 53.31982498,-2.739364289 53.31981348,-2.739169057 53.3198394,-2.739025833 53.31983184,-2.739059173 53.31969801,-2.738730595 53.31965727,-2.738799409 53.3194852,-2.7388649 53.3193146,-2.73851779 53.31931846,-2.738477073 53.31916082,-2.738241064 53.31918367,-2.737988366 53.31921552,-2.7379235 53.31905021,-2.737875111 53.31885668,-2.737544883 53.31893483,-2.737646818 53.31878116,-2.73734971 53.31882821,-2.737353134 53.31878298,-2.737165674 53.31846279,-2.736631508 53.31850033,-2.736615351 53.31846079,-2.736371293 53.31851209,-2.73629566 53.3182808,-2.736237196 53.31810584,-2.736822256 53.31805585,-2.737114813 53.31804731,-2.737441848 53.31804896,-2.737563381 53.31804821,-2.737866683 53.31800986,-2.738075154 53.31796867,-2.737815014 53.31762619)))",conservation_areas_polygon.10,53.31738201292983 -2.7416204385738534,53.31984169271952 -2.736237195970133,Weston Village,10 From f1167ef4fb5ceda8601e87d39b65ce37534c5895 Mon Sep 17 00:00:00 2001 From: CarlosCoelhoSL <110818364+CarlosCoelhoSL@users.noreply.github.com> Date: Tue, 20 May 2025 15:25:22 +0100 Subject: [PATCH 08/17] Add data dataset update (#408) * initial commit * adds tests * adds acceptance test * updated mask fixes tests fixes output path * updates diff calculation for new entities * adds function comments * restructures get user response * test * test --- digital_land/commands.py | 102 ++++++-- digital_land/utils/add_data_utils.py | 128 ++++++++++ tests/acceptance/test_add_data.py | 81 ++++++- tests/data/specification/dataset.csv | 4 +- tests/integration/test_add_data_utils.py | 293 +++++++++++++++++++++++ tests/unit/test_add_data_utils.py | 18 +- 6 files changed, 598 insertions(+), 28 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 46fe18295..f88559c48 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -65,10 +65,13 @@ from digital_land.state import State from digital_land.utils.add_data_utils import ( clear_log, + download_dataset, get_column_field_summary, + get_transformed_entities, get_entity_summary, get_existing_endpoints_summary, get_issue_summary, + get_updated_entities_summary, is_date_valid, is_url_valid, get_user_response, @@ -932,38 +935,42 @@ def add_data( add_data_cache_dir = cache_dir / "add_data" - output_path = ( - add_data_cache_dir - / "transformed/" - / (endpoint_resource_info["resource"] + ".csv") - ) - - issue_dir = add_data_cache_dir / "issue/" - column_field_dir = add_data_cache_dir / "column_field/" - dataset_resource_dir = add_data_cache_dir / "dataset_resource/" - converted_resource_dir = add_data_cache_dir / "converted_resource/" - converted_dir = add_data_cache_dir / "converted/" - output_log_dir = add_data_cache_dir / "log/" - operational_issue_dir = add_data_cache_dir / "performance/ " / "operational_issue/" - - output_path.parent.mkdir(parents=True, exist_ok=True) - issue_dir.mkdir(parents=True, exist_ok=True) - column_field_dir.mkdir(parents=True, exist_ok=True) - dataset_resource_dir.mkdir(parents=True, exist_ok=True) - converted_resource_dir.mkdir(parents=True, exist_ok=True) - converted_dir.mkdir(parents=True, exist_ok=True) - output_log_dir.mkdir(parents=True, exist_ok=True) - operational_issue_dir.mkdir(parents=True, exist_ok=True) - collection.load_log_items() for dataset in endpoint_resource_info["pipelines"]: + pipeline = Pipeline(pipeline_dir, dataset) + specification = Specification(specification_dir) + + 
issue_dir = add_data_cache_dir / "issue/" / dataset + column_field_dir = add_data_cache_dir / "column_field/" / dataset + dataset_resource_dir = add_data_cache_dir / "dataset_resource/" / dataset + converted_resource_dir = add_data_cache_dir / "converted_resource/" + converted_dir = add_data_cache_dir / "converted/" + output_log_dir = add_data_cache_dir / "log/" + operational_issue_dir = ( + add_data_cache_dir / "performance/ " / "operational_issue/" + ) + output_path = ( + add_data_cache_dir + / "transformed/" + / dataset + / (endpoint_resource_info["resource"] + ".csv") + ) + + output_path.parent.mkdir(parents=True, exist_ok=True) + issue_dir.mkdir(parents=True, exist_ok=True) + column_field_dir.mkdir(parents=True, exist_ok=True) + dataset_resource_dir.mkdir(parents=True, exist_ok=True) + converted_resource_dir.mkdir(parents=True, exist_ok=True) + converted_dir.mkdir(parents=True, exist_ok=True) + output_log_dir.mkdir(parents=True, exist_ok=True) + operational_issue_dir.mkdir(parents=True, exist_ok=True) print("======================================================================") print("Run pipeline") print("======================================================================") try: pipeline_run( dataset, - Pipeline(pipeline_dir, dataset), + pipeline, Specification(specification_dir), endpoint_resource_info["resource_path"], output_path=output_path, @@ -1113,6 +1120,11 @@ def add_data( shutil.copy(cache_pipeline_dir / "lookup.csv", pipeline_dir / "lookup.csv") # Now check for existing endpoints for this provision/organisation + print( + "\n======================================================================" + ) + print("Retire old endpoints/sources") + print("======================================================================") existing_endpoints_summary, existing_sources = get_existing_endpoints_summary( endpoint_resource_info, collection, dataset ) @@ -1132,6 +1144,48 @@ def add_data( pd.DataFrame.from_records(sources_to_retire) ) + # Update dataset and view newly updated dataset + print( + "\n======================================================================" + ) + print("Update dataset") + print("======================================================================") + if get_user_response( + f"""\nDo you want to view an updated {dataset} dataset with the newly added data? 
+ \nNote this requires downloading the dataset if not already done so - + for some datasets this can take a while \n\n(yes/no): """ + ): + dataset_path = download_dataset(dataset, specification, cache_dir) + original_entities = get_transformed_entities(dataset_path, output_path) + print(f"Updating {dataset}.sqlite3 with new data...") + dataset_update( + input_paths=[output_path], + output_path=None, + organisation_path=organisation_path, + pipeline=pipeline, + dataset=dataset, + specification=specification, + issue_dir=os.path.split(issue_dir)[0], + column_field_dir=os.path.split(column_field_dir)[0], + dataset_resource_dir=os.path.split(dataset_resource_dir)[0], + dataset_path=dataset_path, + ) + updated_entities = get_transformed_entities(dataset_path, output_path) + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entities, updated_entities + ) + print(updated_entities_summary) + if diffs_df is not None: + diffs_path = ( + add_data_cache_dir + / dataset + / "diffs" + / f"{endpoint_resource_info['resource']}.csv" + ) + os.makedirs(os.path.dirname(diffs_path)) + diffs_df.to_csv(diffs_path) + print(f"\nDetailed breakdown found in file: {diffs_path}") + def add_endpoints_and_lookups( csv_file_path, diff --git a/digital_land/utils/add_data_utils.py b/digital_land/utils/add_data_utils.py index eb6b75c88..0d6dac1f8 100644 --- a/digital_land/utils/add_data_utils.py +++ b/digital_land/utils/add_data_utils.py @@ -1,11 +1,14 @@ import csv +import json import os import duckdb +import sqlite3 from datetime import datetime from urllib.parse import urlparse import pandas as pd +from digital_land.api import API from digital_land.collect import Collector from digital_land.pipeline.main import Pipeline from digital_land.specification import Specification @@ -295,3 +298,128 @@ def get_existing_endpoints_summary(endpoint_resource_info, collection, dataset): ) return existing_endpoints_summary, retirable_sources + + +def download_dataset(dataset, specification, cache_dir): + # Download existing dataset + api = API(specification=specification, cache_dir=cache_dir) + dataset_path = os.path.join(cache_dir, "dataset", f"{dataset}.sqlite3") + # Determine whether to download new copy of dataset or use cached version + download = True + if os.path.exists(dataset_path): + print(f"\nExisting dataset at {dataset_path} detected") + if get_user_response( + "Do you want to use the existing dataset (otherwise download a fresh version)? (yes/no): " + ): + download = False + if download: + print(f"Downloading {dataset}.sqlite3...") + api.download_dataset( + dataset=dataset, + overwrite=True, + path=dataset_path, + extension=api.Extension.SQLITE3, + ) + + return dataset_path + + +def get_transformed_entities(dataset_path, transformed_path): + """ + Returns a Dataframe of entities from a dataset. 
+ It returns entities that have facts in the transformed file at `transformed_path` + """ + entities = pd.read_csv(transformed_path)["entity"].unique().tolist() + entity_list_str = ", ".join(str(e) for e in entities) + sql = f"SELECT * FROM entity WHERE entity IN ({entity_list_str})" + + with sqlite3.connect(dataset_path) as conn: + entities_df = pd.read_sql_query(sql, conn) + + return entities_df + + +def normalise_json(val): + """ + Returns a sorted stringified json + """ + # This function accepts a stringified json + # It returns a sorted stringified json of the input + try: + return json.dumps(json.loads(val), sort_keys=True) + except Exception: + return val # if failure to pass just return original string + + +def get_updated_entities_summary(original_entity_df, updated_entity_df): + """ + This will return a summary of the differences between two dataframes of the same entities + """ + # replace None/nan with "" for consistent comparison + original_entity_df = original_entity_df.fillna("") + updated_entity_df = updated_entity_df.fillna("") + + original_entity_df = original_entity_df.set_index("entity").sort_index() + updated_entity_df = updated_entity_df.set_index("entity").sort_index() + + # filter out newly added entities, store them in a separate df + new_entities_df = updated_entity_df.loc[ + ~updated_entity_df.index.isin(original_entity_df.index) + ] + updated_entity_df = updated_entity_df.loc[ + updated_entity_df.index.isin(original_entity_df.index) + ] + + # the json column can get reordered in the update dataset process + # load json into dict and sort keys to ensure comparison is correct + if "json" in original_entity_df.columns: + original_entity_df["json"] = original_entity_df["json"].apply(normalise_json) + updated_entity_df["json"] = updated_entity_df["json"].apply(normalise_json) + new_entities_df["json"] = new_entities_df["json"].apply(normalise_json) + + # find differences + mask = ~( + (original_entity_df == updated_entity_df) + | (original_entity_df.isna() & updated_entity_df.isna()) + ) + diff_positions = mask.stack() + # dataframe of which values have changed. 
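+    # mask.stack() yields a boolean Series indexed by (entity, field); keeping only the True entries leaves one row per changed cell, whose index is then used to look up the before and after values from the stacked frames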
+ changed = diff_positions[diff_positions] + diffs = pd.DataFrame( + { + "entity": changed.index.get_level_values(0), + "field": changed.index.get_level_values(1), + "original_value": original_entity_df.stack()[changed.index], + "updated_value": updated_entity_df.stack()[changed.index], + "new_entity": False, + } + ).reset_index(drop=True) + + # add diffs for new entities + if not new_entities_df.empty: + new_diffs = new_entities_df.reset_index().melt( + id_vars=["entity"], var_name="field", value_name="updated_value" + ) + new_diffs["original_value"] = None + new_diffs["new_entity"] = True + # Reorder columns to match + new_diffs = new_diffs[ + ["entity", "field", "original_value", "updated_value", "new_entity"] + ] + + # Concatenate with existing diffs + diffs = pd.concat([diffs, new_diffs], ignore_index=True) + + updated_entities_summary = "" + if len(diffs) > 0: + diffs_df = pd.DataFrame(diffs) + grouped_diffs = diffs_df.groupby("entity")["field"].apply(list).reset_index() + updated_entities_summary += "\nChanged fields by entity:\n" + for _, row in grouped_diffs.iterrows(): + updated_entities_summary += ( + f"\nEntity: {row['entity']}, Fields changed: {', '.join(row['field'])}" + ) + return updated_entities_summary, diffs_df + else: + updated_entities_summary += "\nNo differences found in updated dataset" + return updated_entities_summary, None diff --git a/tests/acceptance/test_add_data.py b/tests/acceptance/test_add_data.py index 9dd9a0ee3..d17ca6d71 100644 --- a/tests/acceptance/test_add_data.py +++ b/tests/acceptance/test_add_data.py @@ -1,9 +1,11 @@ import csv from datetime import datetime import os +from pathlib import Path +import shutil import tempfile from unittest import mock -from unittest.mock import Mock +from unittest.mock import Mock, patch from click.testing import CliRunner import pandas as pd import pytest @@ -152,6 +154,19 @@ def mock_request_get_no_reference(mocker): ) +@pytest.fixture +def mock_download_dataset(): + original_dataset_path = Path("tests/data/dataset/central-activities-zone.sqlite3") + updated_dataset_path = tempfile.NamedTemporaryFile(suffix=".sqlite3").name + # copy so we can update a version to compare to original + shutil.copy(original_dataset_path, updated_dataset_path) + with patch( + "digital_land.commands.download_dataset", + return_value=Path(updated_dataset_path), + ) as mock: + yield mock + + def create_input_csv( data, fieldnames=[ @@ -181,6 +196,7 @@ def test_cli_add_data( cache_dir, organisation_csv, mock_request_get, + mock_download_dataset, monkeypatch, ): no_error_input_data = { @@ -243,6 +259,7 @@ def test_cli_add_data_incorrect_input_data( pipeline_dir, organisation_csv, mock_request_get, + mock_download_dataset, cache_dir, ): incorrect_input_data = { @@ -287,6 +304,7 @@ def test_cli_add_data_consecutive_runs( pipeline_dir, organisation_csv, mock_request_get, + mock_download_dataset, monkeypatch, cache_dir, ): @@ -365,6 +383,7 @@ def test_cli_add_data_pipeline_fail( cache_dir, organisation_csv, mock_request_get, + mock_download_dataset, monkeypatch, ): no_error_input_data = { @@ -417,6 +436,7 @@ def test_cli_add_data_remaining_unassigned_entities( cache_dir, organisation_csv, mock_request_get_no_reference, + mock_download_dataset, monkeypatch, ): no_error_input_data = { @@ -464,6 +484,7 @@ def test_cli_add_data_old_endpoints_retired( cache_dir, organisation_csv, mock_request_get, + mock_download_dataset, monkeypatch, ): no_error_input_data = { @@ -552,3 +573,61 @@ def test_cli_add_data_old_endpoints_retired( source_df = 
pd.read_csv(os.path.join(collection_dir, "source.csv")) assert source_df["end-date"].values[0] == datetime.utcnow().isoformat()[:10] + + +# Add acceptance test +# @patch("digital_land.commands.API.download_dataset", return_value=Path("tests/data/dataset/central-activities-zone.sqlite3")) +def test_cli_add_data_update_dataset( + collection_dir, + specification_dir, + pipeline_dir, + cache_dir, + organisation_csv, + mock_request_get, + mock_download_dataset, + monkeypatch, +): + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + csv_path = create_input_csv(no_error_input_data) + + # Mock in user input + monkeypatch.setattr("builtins.input", lambda _: "yes") + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--pipeline-dir", + str(pipeline_dir), + "--organisation-path", + str(organisation_csv), + "--cache-dir", + str(cache_dir), + ], + ) + if result.exit_code != 0: + # Print the command output if the test fails, gives more detail on what's gone wrong + print("Command failed with exit code:", result.exit_code) + print("Command output:") + print(result.output) + print("Command error output:") + print(result.exception) + + assert result.exit_code == 0 + print("result std out", result.stdout) + assert "Entity: 44000000, Fields changed:" in result.stdout diff --git a/tests/data/specification/dataset.csv b/tests/data/specification/dataset.csv index aeee29c91..39eab0c86 100644 --- a/tests/data/specification/dataset.csv +++ b/tests/data/specification/dataset.csv @@ -1,5 +1,5 @@ -dataset,name,text,typology,prefix -dataset-one,"First Dataset","Text of first dataset",, +dataset,name,text,typology,prefix,collection +dataset-one,"First Dataset","Text of first dataset",,,collection-one dataset-two,"Second Dataset","Text of first dataset",, dataset-three,"Third Dataset","Text of third dataset",, tree-preservation-zone-type,"Types of zone covered by the tree preservation order","Tree preservation zone type",category, diff --git a/tests/integration/test_add_data_utils.py b/tests/integration/test_add_data_utils.py index c76221e98..4f7f520f7 100644 --- a/tests/integration/test_add_data_utils.py +++ b/tests/integration/test_add_data_utils.py @@ -1,15 +1,23 @@ import csv from datetime import datetime import os +import shutil +import tempfile +from unittest.mock import Mock +import pandas as pd import pytest from digital_land.collection import Collection +from digital_land.specification import Specification from digital_land.utils.add_data_utils import ( clear_log, + download_dataset, get_column_field_summary, get_entity_summary, get_existing_endpoints_summary, get_issue_summary, + get_transformed_entities, + get_updated_entities_summary, ) @@ -1037,3 +1045,288 @@ def test_get_existing_endpoints_ended_source_with_no_endpoint(tmp_path): assert not existing_endpoints_summary assert len(existing_sources) == 0 + + +def test_download_dataset(tmp_path_factory, mocker): + dataset = "dataset-one" + specification_dir = "tests/data/specification" + specification = Specification(specification_dir) + # 
create temp cache dir + cache_dir = tmp_path_factory.mktemp("cache") + + # mock api download url + sqlite_file_path = "tests/data/dataset/central-activities-zone.sqlite3" + with open(sqlite_file_path, "rb") as f: + data = f.read() + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = data + mocker.patch("requests.get", return_value=mock_response) + + download_dataset(dataset, specification, cache_dir) + + path = os.path.join(cache_dir, "dataset", f"{dataset}.sqlite3") + assert os.path.exists(path) + + +def test_download_dataset_use_cache_dataset(tmp_path_factory, mocker): + dataset = "dataset-one" + specification_dir = "tests/data/specification" + specification = Specification(specification_dir) + # create temp cache dir + cache_dir = tmp_path_factory.mktemp("cache") + + path = os.path.join(cache_dir, "dataset", f"{dataset}.sqlite3") + # put db file in cache dir + sqlite_file_path = "tests/data/dataset/central-activities-zone.sqlite3" + os.makedirs(os.path.dirname(path)) + shutil.copy(sqlite_file_path, path) + + # mock user response + mocker.patch( + "digital_land.utils.add_data_utils.get_user_response", return_value=True + ) + mock_get = mocker.patch("requests.get") + + download_dataset(dataset, specification, cache_dir) + + # assert requests.get was NOT called + mock_get.assert_not_called() + + +def test_get_transformed_entities(): + output_path = tempfile.NamedTemporaryFile().name + dataset_path = "tests/data/dataset/central-activities-zone.sqlite3" + + transformed_headers = [ + "end-date", + "entity", + "entry-date", + "entry-number", + "fact", + "field", + "priority", + "reference-entity", + "resource", + "start-date", + "value", + ] + transformed_rows = [ + { + "end-date": "", + "entity": 2200001, + "entry-date": "", + "entry-number": 1, + "fact": "fact1", + "field": "field1", + "priority": "", + "reference-entity": "", + "resource": "resource", + "start-date": "", + "value": "value1", + }, + { + "end-date": "", + "entity": 2200002, + "entry-date": "", + "entry-number": 2, + "fact": "fact2", + "field": "field1", + "priority": "", + "reference-entity": "", + "resource": "resource", + "start-date": "", + "value": "value1", + }, + ] + with open(output_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=transformed_headers) + writer.writeheader() + writer.writerows(transformed_rows) + + entities = get_transformed_entities(dataset_path, output_path) + + assert len(entities) == 2 + assert entities.iloc[0]["entity"] == 2200001 + assert entities.iloc[0]["reference"] == "CAZ00000001" + assert entities.iloc[1]["entity"] == 2200002 + assert entities.iloc[1]["reference"] == "CAZ00000002" + + +def test_get_updated_entities_summary_new_entity(): + original_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + } + ] + ) + updated_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + }, + { + "end-date": "", + "entity": 2200002, + "dataset": "", + "json": "json", + "name": "name2", + "reference": "ref2", + }, + ] + ) + + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entity_df, updated_entity_df + ) + + assert len(diffs_df) == 5 + assert "end-date" in diffs_df["field"].values + assert "dataset" in diffs_df["field"].values + 
assert "name" in diffs_df["field"].values + assert "reference" in diffs_df["field"].values + assert "json" in diffs_df["field"].values + + assert "original_value" in diffs_df.columns + assert all(not value for value in diffs_df["original_value"].values) + + assert "updated_value" in diffs_df.columns + assert diffs_df[diffs_df["field"] == "name"]["updated_value"].values[0] == "name2" + + assert all(value for value in diffs_df["new_entity"].values) + + assert ( + "Entity: 2200002, Fields changed: end-date, dataset, json, name, reference" + in updated_entities_summary + ) + assert "Entity: 2200001" not in updated_entities_summary + + +def test_get_updated_entities_summary_updated_entity(): + original_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + } + ] + ) + updated_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "updated end date", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "updated name", + "reference": "ref1", + } + ] + ) + + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entity_df, updated_entity_df + ) + + assert len(diffs_df) == 2 + + assert diffs_df[diffs_df["field"] == "name"]["original_value"].values[0] == "name1" + assert ( + diffs_df[diffs_df["field"] == "name"]["updated_value"].values[0] + == "updated name" + ) + + assert diffs_df[diffs_df["field"] == "end-date"]["original_value"].values[0] == "" + assert ( + diffs_df[diffs_df["field"] == "end-date"]["updated_value"].values[0] + == "updated end date" + ) + + assert not all(value == "" for value in diffs_df["new_entity"].values) + + assert "Entity: 2200001, Fields changed: end-date, name" in updated_entities_summary + + +def test_get_updated_entities_summary_no_updates(): + original_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + } + ] + ) + updated_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + } + ] + ) + + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entity_df, updated_entity_df + ) + + assert "No differences found in updated dataset" in updated_entities_summary + assert not diffs_df + + +def test_get_updated_entities_summary_updated_entity_none_agnostic(): + original_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": None, + "reference": "ref1", + } + ] + ) + updated_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": None, + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "", + "reference": "ref1", + } + ] + ) + + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entity_df, updated_entity_df + ) + + assert "No differences found in updated dataset" in updated_entities_summary + assert not diffs_df diff --git a/tests/unit/test_add_data_utils.py b/tests/unit/test_add_data_utils.py index 4e89129fd..124c9c6e5 100644 --- a/tests/unit/test_add_data_utils.py +++ b/tests/unit/test_add_data_utils.py @@ -1,7 +1,11 @@ import pytest from digital_land.commands import is_url_valid -from digital_land.utils.add_data_utils import get_user_response, is_date_valid +from digital_land.utils.add_data_utils import ( + get_user_response, + is_date_valid, + normalise_json, +) 
def test_is_url_valid(): @@ -88,3 +92,15 @@ def test_get_user_response_fail(monkeypatch): result = get_user_response("message") assert not result + + +def test_normalise_json(): + json_string = '{"secondproperty": "secondvalue", "firstproperty": "firstvalue"}' + + sorted_json_string = normalise_json(json_string) + + # ensure json is sorted + assert isinstance(sorted_json_string, str) + assert sorted_json_string.find("firstproperty") < sorted_json_string.find( + "secondproperty" + ) From 86f6766f0986489f56e5156311c4ccc07e960993 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Wed, 14 May 2025 15:13:06 +0100 Subject: [PATCH 09/17] generate provision quality dataset --- digital_land/utils/functions_core.py | 73 ++++++++ .../utils/generate_provision_quality.py | 175 ++++++++++++++++++ 2 files changed, 248 insertions(+) create mode 100644 digital_land/utils/functions_core.py create mode 100644 digital_land/utils/generate_provision_quality.py diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py new file mode 100644 index 000000000..887fa5c09 --- /dev/null +++ b/digital_land/utils/functions_core.py @@ -0,0 +1,73 @@ +import urllib +import os +import sqlite3 +import pandas as pd +import geopandas as gpd +import shapely.wkt + + +global FILES_URL + +FILES_URL = "https://datasette.planning.data.gov.uk/" + + +def download_dataset(dataset, output_dir_path, overwrite=False): + dataset_file_name = f"{dataset}.db" + + if not os.path.exists(output_dir_path): + os.makedirs(output_dir_path) + + output_file_path = os.path.join(output_dir_path, dataset_file_name) + + if overwrite is False and os.path.exists(output_file_path): + return + + final_url = os.path.join(FILES_URL, dataset_file_name) + print(f"downloading data from {final_url}") + print(f"to: {output_file_path}") + urllib.request.urlretrieve( + final_url, os.path.join(output_dir_path, dataset_file_name) + ) + print("download complete") + + +def get_pdp_dataset( + dataset, geometry_field="geometry", crs_out=4326, underscore_cols=True +): + + df = pd.read_csv( + f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype="str" + ) + df.columns = [x.replace("-", "_") for x in df.columns] + + df_valid_geom = df[df[geometry_field].notnull()].copy() + + # load geometry and create GDF + df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply( + shapely.wkt.loads + ) + gdf = gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field) + + # Transform to ESPG:27700 for more interpretable area units + gdf.set_crs(epsg=4326, inplace=True) + gdf.to_crs(epsg=crs_out, inplace=True) + + return gdf + + +def query_sqlite(db_path, query_string): + + with sqlite3.connect(db_path) as con: + + cursor = con.execute(query_string) + cols = [column[0] for column in cursor.description] + results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols) + + return results_df + + +def datasette_query(db, sql_string): + params = urllib.parse.urlencode({"sql": sql_string, "_size": "max"}) + url = f"https://datasette.planning.data.gov.uk/{db}.csv?{params}" + df = pd.read_csv(url) + return df diff --git a/digital_land/utils/generate_provision_quality.py b/digital_land/utils/generate_provision_quality.py new file mode 100644 index 000000000..7ea8f6f73 --- /dev/null +++ b/digital_land/utils/generate_provision_quality.py @@ -0,0 +1,175 @@ +import os +import pandas as pd +import numpy as np +import json +from datetime import datetime +from digital_land.utils import functions_core as fc + + +def generate_provision_quality(): + 
""" + Generates a provision quality dataset and saves it as a parquet file. + """ + td = datetime.today().strftime("%Y-%m-%d") + + # Create the temporary download directory + db_dir = os.path.join("/tmp", "db_downloads") + os.makedirs(db_dir, exist_ok=True) + + # Download the performance db + fc.download_dataset("performance", db_dir, overwrite=False) + path_perf_db = os.path.join(db_dir, "performance.db") + + # Issue quality criteria lookup + lookup_issue_qual = fc.datasette_query( + "digital-land", + """ + SELECT + description, + issue_type, + name, + severity, + responsibility, + quality_criteria_level || " - " || quality_criteria as quality_criteria, + quality_criteria_level as quality_level + FROM issue_type + WHERE quality_criteria_level != '' + AND quality_criteria != '' + """, + ) + + # Transform data + provision = fc.query_sqlite( + path_perf_db, + """ + SELECT organisation, dataset, active_endpoint_count + FROM provision_summary + """, + ) + + # Extract issue count by provision from endpoint_dataset_issue_type_summary + qual_issue = fc.query_sqlite( + path_perf_db, + """ + SELECT + organisation, dataset, + 'issue' as problem_source, + issue_type as problem_type, + sum(count_issues) as count + FROM endpoint_dataset_issue_type_summary + WHERE resource_end_date is not NULL + AND issue_type is not NULL + GROUP BY organisation, dataset, issue_type + """, + ) + + # Join on quality criteria and level from issue_type lookup (this restricts to only issues linked to a quality criteria) + qual_issue = qual_issue.merge( + lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]], + how="inner", + left_on="problem_type", + right_on="issue_type", + ) + qual_issue.drop("issue_type", axis=1, inplace=True) + + # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds + qual_expectation_bounds = fc.datasette_query( + "digital-land", + """ + SELECT organisation, dataset, details + FROM expectation + WHERE 1=1 + AND name = 'Check no entities are outside of the local planning authority boundary' + AND passed = 'False' + AND message not like '%error%' + """, + ) + + qual_expectation_bounds["problem_source"] = "expectation" + qual_expectation_bounds["problem_type"] = ( + "entity outside of the local planning authority boundary" + ) + qual_expectation_bounds["count"] = [ + json.loads(v)["actual"] for v in qual_expectation_bounds["details"] + ] + qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary" + qual_expectation_bounds["quality_level"] = 3 + qual_expectation_bounds.drop("details", axis=1, inplace=True) + + # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds + qual_expectation_count = fc.datasette_query( + "digital-land", + """ + SELECT organisation, dataset, details + FROM expectation + WHERE 1=1 + AND name = 'Check number of entities inside the local planning authority boundary matches the manual count' + AND passed = 'False' + AND message not like '%error%' + """, + ) + + qual_expectation_count["problem_source"] = "expectation" + qual_expectation_count["problem_type"] = "entity count doesn't match manual count" + qual_expectation_count["count"] = [ + json.loads(v)["actual"] for v in qual_expectation_count["details"] + ] + qual_expectation_count["quality_criteria"] = ( + "3 - conservation area entity count matches LPA" + ) + qual_expectation_count["quality_level"] = 3 + qual_expectation_count.drop("details", axis=1, inplace=True) + + # Combine all problem source tables, and aggregate to criteria level + qual_all_criteria = ( + 
pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count]) + .groupby( + ["organisation", "dataset", "quality_criteria", "quality_level"], + as_index=False, + ) + .agg(count_failures=("count", "sum")) + ) + + # Merge issues with the provision data + prov_qual_all = provision.merge( + qual_all_criteria, how="left", on=["organisation", "dataset"] + ) + + prov_qual_all["quality_level_for_sort"] = np.select( + [ + (prov_qual_all["active_endpoint_count"] == 0), + (prov_qual_all["quality_level"].notnull()), + (prov_qual_all["active_endpoint_count"] > 0) + & (prov_qual_all["quality_level"].isnull()), + ], + [0, prov_qual_all["quality_level"], 4], + ) + + level_map = { + 4: "4. data that is trustworthy", + 3: "3. data that is good for ODP", + 2: "2. authoritative data from the LPA", + 1: "1. some data", + 0: "0. no score", + } + + prov_quality = prov_qual_all.groupby( + ["organisation", "dataset"], as_index=False, dropna=False + ).agg(quality_level=("quality_level_for_sort", "min")) + + prov_quality["quality"] = prov_quality["quality_level"].map(level_map) + prov_quality["notes"] = "" + prov_quality["end-date"] = "" + prov_quality["start-date"] = td + prov_quality["entry-date"] = td + + # Output the results as a Parquet file + output_dir = os.path.join( + "/tmp", "performance", "provision-quality", f"entry-date={td}" + ) + os.makedirs(output_dir, exist_ok=True) + + output_file = os.path.join(output_dir, "provision-quality.parquet") + prov_quality.to_parquet(output_file, engine="pyarrow", index=False) + + print(f"Provision quality dataset saved to: {output_file}") From 0de99250afa42f7cfbceb06328d717ec3bacae21 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Thu, 15 May 2025 11:57:29 +0100 Subject: [PATCH 10/17] generate-provision-quality cli command --- digital_land/cli.py | 6 + digital_land/commands.py | 167 +++++++++++++++++ digital_land/utils/functions_core.py | 21 +-- .../utils/generate_provision_quality.py | 175 ------------------ setup.py | 1 + 5 files changed, 183 insertions(+), 187 deletions(-) delete mode 100644 digital_land/utils/generate_provision_quality.py diff --git a/digital_land/cli.py b/digital_land/cli.py index b65aef331..a99a950ea 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -31,6 +31,7 @@ organisation_check, save_state, add_data, + generate_provision_quality, ) from digital_land.command_arguments import ( @@ -825,3 +826,8 @@ def check_state_cmd( if diffs: print(f"State differs from {state_path} - {', '.join(diffs)}") sys.exit(1) + + +@cli.command("generate-provision-quality") +def generate_provision_quality_cmd(): + generate_provision_quality() diff --git a/digital_land/commands.py b/digital_land/commands.py index f88559c48..c8207c2b8 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -14,6 +14,7 @@ import geojson from requests import HTTPError import shapely +import numpy as np from digital_land.package.organisation import OrganisationPackage from digital_land.check import duplicate_reference_check @@ -76,6 +77,7 @@ is_url_valid, get_user_response, ) +from digital_land.utils import functions_core as fc from .register import hash_value from .utils.gdal_utils import get_gdal_version @@ -1723,3 +1725,168 @@ def check_and_assign_entities( ): return False return True + + +def generate_provision_quality(): + """Generates a provision quality dataset and saves it as a parquet file""" + td = datetime.today().strftime("%Y-%m-%d") + + # Create the temporary download directory + db_dir = Path("/tmp") / "db_downloads" + os.makedirs(db_dir, 
exist_ok=True) + + # Download the performance db + fc.download_dataset("performance", db_dir, overwrite=False) + path_perf_db = db_dir / "performance.db" + + # Issue quality criteria lookup + lookup_issue_qual = fc.datasette_query( + "digital-land", + """ + SELECT + description, + issue_type, + name, + severity, + responsibility, + quality_criteria_level || " - " || quality_criteria as quality_criteria, + quality_criteria_level as quality_level + FROM issue_type + WHERE quality_criteria_level != '' + AND quality_criteria != '' + """, + ) + + # Transform data + provision = fc.query_sqlite( + path_perf_db, + """ + SELECT organisation, dataset, active_endpoint_count + FROM provision_summary + """, + ) + + # Extract issue count by provision from endpoint_dataset_issue_type_summary + qual_issue = fc.query_sqlite( + path_perf_db, + """ + SELECT + organisation, dataset, + 'issue' as problem_source, + issue_type as problem_type, + sum(count_issues) as count + FROM endpoint_dataset_issue_type_summary + WHERE resource_end_date is not NULL + AND issue_type is not NULL + GROUP BY organisation, dataset, issue_type + """, + ) + + # Join on quality criteria and level from issue_type lookup (this restricts to only issues linked to a quality criteria) + qual_issue = qual_issue.merge( + lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]], + how="inner", + left_on="problem_type", + right_on="issue_type", + ) + qual_issue.drop("issue_type", axis=1, inplace=True) + + # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds + qual_expectation_bounds = fc.datasette_query( + "digital-land", + """ + SELECT organisation, dataset, details + FROM expectation + WHERE 1=1 + AND name = 'Check no entities are outside of the local planning authority boundary' + AND passed = 'False' + AND message not like '%error%' + """, + ) + + qual_expectation_bounds["problem_source"] = "expectation" + qual_expectation_bounds["problem_type"] = ( + "entity outside of the local planning authority boundary" + ) + qual_expectation_bounds["count"] = [ + json.loads(v)["actual"] for v in qual_expectation_bounds["details"] + ] + qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary" + qual_expectation_bounds["quality_level"] = 3 + qual_expectation_bounds.drop("details", axis=1, inplace=True) + + # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds + qual_expectation_count = fc.datasette_query( + "digital-land", + """ + SELECT organisation, dataset, details + FROM expectation + WHERE 1=1 + AND name = 'Check number of entities inside the local planning authority boundary matches the manual count' + AND passed = 'False' + AND message not like '%error%' + """, + ) + + qual_expectation_count["problem_source"] = "expectation" + qual_expectation_count["problem_type"] = "entity count doesn't match manual count" + qual_expectation_count["count"] = [ + json.loads(v)["actual"] for v in qual_expectation_count["details"] + ] + qual_expectation_count["quality_criteria"] = ( + "3 - conservation area entity count matches LPA" + ) + qual_expectation_count["quality_level"] = 3 + qual_expectation_count.drop("details", axis=1, inplace=True) + + # Combine all problem source tables, and aggregate to criteria level + qual_all_criteria = ( + pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count]) + .groupby( + ["organisation", "dataset", "quality_criteria", "quality_level"], + as_index=False, + ) + .agg(count_failures=("count", "sum")) + ) + + # Merge issues with the provision data + 
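+    # Left-join so every (organisation, dataset) provision row survives: rows with no recorded quality failures carry a null quality_level, which np.select below resolves to level 4 when the provision has an active endpoint and to 0 when it has none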
+    prov_qual_all = provision.merge(
+        qual_all_criteria, how="left", on=["organisation", "dataset"]
+    )
+
+    prov_qual_all["quality_level_for_sort"] = np.select(
+        [
+            (prov_qual_all["active_endpoint_count"] == 0),
+            (prov_qual_all["quality_level"].notnull()),
+            (prov_qual_all["active_endpoint_count"] > 0)
+            & (prov_qual_all["quality_level"].isnull()),
+        ],
+        [0, prov_qual_all["quality_level"], 4],
+    )
+
+    level_map = {
+        4: "4. data that is trustworthy",
+        3: "3. data that is good for ODP",
+        2: "2. authoritative data from the LPA",
+        1: "1. some data",
+        0: "0. no score",
+    }
+
+    prov_quality = prov_qual_all.groupby(
+        ["organisation", "dataset"], as_index=False, dropna=False
+    ).agg(quality_level=("quality_level_for_sort", "min"))
+
+    prov_quality["quality"] = prov_quality["quality_level"].map(level_map)
+    prov_quality["notes"] = ""
+    prov_quality["end-date"] = ""
+    prov_quality["start-date"] = td
+    prov_quality["entry-date"] = td
+
+    # Output the results as a Parquet file
+    output_dir = Path("/tmp") / "performance" / "provision-quality" / f"entry-date={td}"
+    os.makedirs(output_dir, exist_ok=True)
+
+    output_file = output_dir / "provision-quality.parquet"
+    prov_quality.to_parquet(output_file, engine="pyarrow", index=False)
+
+    print(f"Provision quality dataset saved to: {output_file}")
diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py
index 887fa5c09..b891ef405 100644
--- a/digital_land/utils/functions_core.py
+++ b/digital_land/utils/functions_core.py
@@ -1,10 +1,9 @@
 import urllib
-import os
 import sqlite3
 import pandas as pd
 import geopandas as gpd
 import shapely.wkt
-
+from pathlib import Path
 
 global FILES_URL
 
@@ -12,22 +11,20 @@
 
 
 def download_dataset(dataset, output_dir_path, overwrite=False):
-    dataset_file_name = f"{dataset}.db"
-
-    if not os.path.exists(output_dir_path):
-        os.makedirs(output_dir_path)
+    output_dir = Path(output_dir_path)
+    output_dir.mkdir(parents=True, exist_ok=True)
 
-    output_file_path = os.path.join(output_dir_path, dataset_file_name)
+    dataset_file_name = f"{dataset}.db"
+    output_file_path = output_dir / dataset_file_name
 
-    if overwrite is False and os.path.exists(output_file_path):
+    if not overwrite and output_file_path.exists():
         return
 
-    final_url = os.path.join(FILES_URL, dataset_file_name)
+    final_url = f"{FILES_URL}{dataset_file_name}"
     print(f"downloading data from {final_url}")
     print(f"to: {output_file_path}")
-    urllib.request.urlretrieve(
-        final_url, os.path.join(output_dir_path, dataset_file_name)
-    )
+
+    urllib.request.urlretrieve(final_url, output_file_path)
     print("download complete")
 
 
diff --git a/digital_land/utils/generate_provision_quality.py b/digital_land/utils/generate_provision_quality.py
deleted file mode 100644
index 7ea8f6f73..000000000
--- a/digital_land/utils/generate_provision_quality.py
+++ /dev/null
@@ -1,175 +0,0 @@
-import os
-import pandas as pd
-import numpy as np
-import json
-from datetime import datetime
-from digital_land.utils import functions_core as fc
-
-
-def generate_provision_quality():
-    """
-    Generates a provision quality dataset and saves it as a parquet file.
- """ - td = datetime.today().strftime("%Y-%m-%d") - - # Create the temporary download directory - db_dir = os.path.join("/tmp", "db_downloads") - os.makedirs(db_dir, exist_ok=True) - - # Download the performance db - fc.download_dataset("performance", db_dir, overwrite=False) - path_perf_db = os.path.join(db_dir, "performance.db") - - # Issue quality criteria lookup - lookup_issue_qual = fc.datasette_query( - "digital-land", - """ - SELECT - description, - issue_type, - name, - severity, - responsibility, - quality_criteria_level || " - " || quality_criteria as quality_criteria, - quality_criteria_level as quality_level - FROM issue_type - WHERE quality_criteria_level != '' - AND quality_criteria != '' - """, - ) - - # Transform data - provision = fc.query_sqlite( - path_perf_db, - """ - SELECT organisation, dataset, active_endpoint_count - FROM provision_summary - """, - ) - - # Extract issue count by provision from endpoint_dataset_issue_type_summary - qual_issue = fc.query_sqlite( - path_perf_db, - """ - SELECT - organisation, dataset, - 'issue' as problem_source, - issue_type as problem_type, - sum(count_issues) as count - FROM endpoint_dataset_issue_type_summary - WHERE resource_end_date is not NULL - AND issue_type is not NULL - GROUP BY organisation, dataset, issue_type - """, - ) - - # Join on quality criteria and level from issue_type lookup (this restricts to only issues linked to a quality criteria) - qual_issue = qual_issue.merge( - lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]], - how="inner", - left_on="problem_type", - right_on="issue_type", - ) - qual_issue.drop("issue_type", axis=1, inplace=True) - - # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds - qual_expectation_bounds = fc.datasette_query( - "digital-land", - """ - SELECT organisation, dataset, details - FROM expectation - WHERE 1=1 - AND name = 'Check no entities are outside of the local planning authority boundary' - AND passed = 'False' - AND message not like '%error%' - """, - ) - - qual_expectation_bounds["problem_source"] = "expectation" - qual_expectation_bounds["problem_type"] = ( - "entity outside of the local planning authority boundary" - ) - qual_expectation_bounds["count"] = [ - json.loads(v)["actual"] for v in qual_expectation_bounds["details"] - ] - qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary" - qual_expectation_bounds["quality_level"] = 3 - qual_expectation_bounds.drop("details", axis=1, inplace=True) - - # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds - qual_expectation_count = fc.datasette_query( - "digital-land", - """ - SELECT organisation, dataset, details - FROM expectation - WHERE 1=1 - AND name = 'Check number of entities inside the local planning authority boundary matches the manual count' - AND passed = 'False' - AND message not like '%error%' - """, - ) - - qual_expectation_count["problem_source"] = "expectation" - qual_expectation_count["problem_type"] = "entity count doesn't match manual count" - qual_expectation_count["count"] = [ - json.loads(v)["actual"] for v in qual_expectation_count["details"] - ] - qual_expectation_count["quality_criteria"] = ( - "3 - conservation area entity count matches LPA" - ) - qual_expectation_count["quality_level"] = 3 - qual_expectation_count.drop("details", axis=1, inplace=True) - - # Combine all problem source tables, and aggregate to criteria level - qual_all_criteria = ( - pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count]) - .groupby( - 
["organisation", "dataset", "quality_criteria", "quality_level"], - as_index=False, - ) - .agg(count_failures=("count", "sum")) - ) - - # Merge issues with the provision data - prov_qual_all = provision.merge( - qual_all_criteria, how="left", on=["organisation", "dataset"] - ) - - prov_qual_all["quality_level_for_sort"] = np.select( - [ - (prov_qual_all["active_endpoint_count"] == 0), - (prov_qual_all["quality_level"].notnull()), - (prov_qual_all["active_endpoint_count"] > 0) - & (prov_qual_all["quality_level"].isnull()), - ], - [0, prov_qual_all["quality_level"], 4], - ) - - level_map = { - 4: "4. data that is trustworthy", - 3: "3. data that is good for ODP", - 2: "2. authoritative data from the LPA", - 1: "1. some data", - 0: "0. no score", - } - - prov_quality = prov_qual_all.groupby( - ["organisation", "dataset"], as_index=False, dropna=False - ).agg(quality_level=("quality_level_for_sort", "min")) - - prov_quality["quality"] = prov_quality["quality_level"].map(level_map) - prov_quality["notes"] = "" - prov_quality["end-date"] = "" - prov_quality["start-date"] = td - prov_quality["entry-date"] = td - - # Output the results as a Parquet file - output_dir = os.path.join( - "/tmp", "performance", "provision-quality", f"entry-date={td}" - ) - os.makedirs(output_dir, exist_ok=True) - - output_file = os.path.join(output_dir, "provision-quality.parquet") - prov_quality.to_parquet(output_file, engine="pyarrow", index=False) - - print(f"Provision quality dataset saved to: {output_file}") diff --git a/setup.py b/setup.py index a051b7356..685cc8ac9 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,7 @@ def get_long_description(): "boto3", "moto", "psutil", + "geopandas", ], entry_points={"console_scripts": ["digital-land=digital_land.cli:cli"]}, setup_requires=["pytest-runner"], From 95775254ed5d35378395ec7e7d3aac15d37eb084 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Thu, 15 May 2025 13:42:58 +0100 Subject: [PATCH 11/17] update cli --- digital_land/cli.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/digital_land/cli.py b/digital_land/cli.py index a99a950ea..b65aef331 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -31,7 +31,6 @@ organisation_check, save_state, add_data, - generate_provision_quality, ) from digital_land.command_arguments import ( @@ -826,8 +825,3 @@ def check_state_cmd( if diffs: print(f"State differs from {state_path} - {', '.join(diffs)}") sys.exit(1) - - -@cli.command("generate-provision-quality") -def generate_provision_quality_cmd(): - generate_provision_quality() From 698b00fa92b4646d61e4361524e0b7750e18d120 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Fri, 16 May 2025 12:32:42 +0100 Subject: [PATCH 12/17] utilise api to download performance.sqlite3 --- digital_land/api.py | 14 ++- digital_land/commands.py | 17 ++-- digital_land/utils/functions_core.py | 52 ----------- .../test_generate_provision_quality.py | 90 +++++++++++++++++++ tests/unit/test_functions_core_utils.py | 32 +++++++ 5 files changed, 145 insertions(+), 60 deletions(-) create mode 100644 tests/integration/test_generate_provision_quality.py create mode 100644 tests/unit/test_functions_core_utils.py diff --git a/digital_land/api.py b/digital_land/api.py index ef0262153..03a480e01 100644 --- a/digital_land/api.py +++ b/digital_land/api.py @@ -36,6 +36,8 @@ def download_dataset( overwrite: bool = False, path: str = None, extension: Extension = Extension.CSV, + builder: bool = False, + builder_name: str = None, ): """ Downloads a dataset in CSV or SQLite3 format. 
@@ -43,6 +45,8 @@
         - overwrite: overwrite the file if it already exists (otherwise will just return).
         - path: file to download to (otherwise <cache_dir>/dataset/<dataset>.<extension>).
         - extension: 'csv' or 'sqlite3', 'csv' by default.
+        - builder: downloads the dataset from the builder path
+        - builder_name: name to use for accessing the builder path
 
         - Returns: None. The file will be downloaded to the given path or cache,
           unless an exception occurs.
@@ -56,8 +60,14 @@
 
         # different extensions require different urls and reading modes
         if extension == self.Extension.SQLITE3:
-            collection = self.specification.dataset[dataset]["collection"]
-            url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3"
+            # performance.sqlite requires digital-land-builder path
+            if builder:
+                if not builder_name:
+                    raise ValueError("Builder name must be provided when builder=True")
+                url = f"{self.url}/{builder_name}-builder/dataset/{dataset}.sqlite3"
+            else:
+                collection = self.specification.dataset[dataset]["collection"]
+                url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3"
             mode = "wb"
 
             def get_content(response):
diff --git a/digital_land/commands.py b/digital_land/commands.py
index c8207c2b8..9b5dd4986 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -1731,13 +1731,18 @@ def generate_provision_quality():
     """Generates a provision quality dataset and saves it as a parquet file"""
     td = datetime.today().strftime("%Y-%m-%d")
 
-    # Create the temporary download directory
-    db_dir = Path("/tmp") / "db_downloads"
-    os.makedirs(db_dir, exist_ok=True)
+    specification = Specification("specification/")
+    api = API(specification)
+
+    # Download the performance db using api
+    api.download_dataset(
+        "performance",
+        extension=api.Extension.SQLITE3,
+        builder=True,
+        builder_name="digital-land",
+    )
 
-    # Download the performance db
-    fc.download_dataset("performance", db_dir, overwrite=False)
-    path_perf_db = db_dir / "performance.db"
+    path_perf_db = Path(api.cache_dir) / "dataset" / "performance.sqlite3"
 
     # Issue quality criteria lookup
     lookup_issue_qual = fc.datasette_query(
diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py
index b891ef405..c3a62836f 100644
--- a/digital_land/utils/functions_core.py
+++ b/digital_land/utils/functions_core.py
@@ -1,65 +1,13 @@
 import urllib
 import sqlite3
 import pandas as pd
-import geopandas as gpd
-import shapely.wkt
-from pathlib import Path
-
-global FILES_URL
-
-FILES_URL = "https://datasette.planning.data.gov.uk/"
-
-
-def download_dataset(dataset, output_dir_path, overwrite=False):
-    output_dir = Path(output_dir_path)
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    dataset_file_name = f"{dataset}.db"
-    output_file_path = output_dir / dataset_file_name
-
-    if not overwrite and output_file_path.exists():
-        return
-
-    final_url = f"{FILES_URL}{dataset_file_name}"
-    print(f"downloading data from {final_url}")
-    print(f"to: {output_file_path}")
-
-    urllib.request.urlretrieve(final_url, output_file_path)
-    print("download complete")
-
-
-def get_pdp_dataset(
-    dataset, geometry_field="geometry", crs_out=4326, underscore_cols=True
-):
-
-    df = pd.read_csv(
-        f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype="str"
-    )
-    df.columns = [x.replace("-", "_") for x in df.columns]
-
-    df_valid_geom = df[df[geometry_field].notnull()].copy()
-
-    # load geometry and create GDF
-    df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply(
-        shapely.wkt.loads
-    )
-    gdf = gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field)
-
-    # Transform to ESPG:27700 for more interpretable area units
-    gdf.set_crs(epsg=4326, inplace=True)
-    gdf.to_crs(epsg=crs_out, inplace=True)
-
-    return gdf
 
 
 def query_sqlite(db_path, query_string):
-
     with sqlite3.connect(db_path) as con:
-
         cursor = con.execute(query_string)
         cols = [column[0] for column in cursor.description]
         results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
-
     return results_df
diff --git a/tests/integration/test_generate_provision_quality.py b/tests/integration/test_generate_provision_quality.py
new file mode 100644
index 000000000..33dbf202b
--- /dev/null
+++ b/tests/integration/test_generate_provision_quality.py
@@ -0,0 +1,90 @@
+import pandas as pd
+from unittest.mock import patch
+from pathlib import Path
+from datetime import datetime
+from digital_land.commands import generate_provision_quality
+
+
+@patch("digital_land.commands.fc.datasette_query")
+@patch("digital_land.commands.fc.query_sqlite")
+def test_generate_provision_quality(
+    mock_query_sqlite,
+    mock_datasette_query,
+):
+    # mock issue_type
+    mock_datasette_query.side_effect = [
+        pd.DataFrame(
+            [
+                {
+                    "description": "desc",
+                    "issue_type": "missing-value",
+                    "name": "Missing Value",
+                    "severity": "error",
+                    "responsibility": "external",
+                    "quality_criteria": "any other validity error",
+                    "quality_level": 3,
+                }
+            ]
+        ),
+        # mock LPA boundary check
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "details": '{"actual": 2}',
+                }
+            ]
+        ),
+        # mock count value
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "details": '{"actual": 1}',
+                }
+            ]
+        ),
+    ]
+
+    # mock sqlite queries
+    mock_query_sqlite.side_effect = [
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "active_endpoint_count": 5,
+                }
+            ]
+        ),
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "problem_source": "issue",
+                    "problem_type": "missing-value",
+                    "count": 1,
+                }
+            ]
+        ),
+    ]
+
+    generate_provision_quality()
+
+    td = datetime.today().strftime("%Y-%m-%d")
+    output_file = Path(
+        f"/tmp/performance/provision-quality/entry-date={td}/provision-quality.parquet"
+    )
+    assert output_file.exists(), "Parquet file not found"
+
+    df = pd.read_parquet(output_file)
+    assert "organisation" in df.columns
+    assert "dataset" in df.columns
+    assert "quality" in df.columns
+
+    assert not df.empty, "Dataframe loaded from Parquet is empty"
+    assert len(df) == 1
+    assert df.iloc[0]["organisation"] == "org1"
diff --git a/tests/unit/test_functions_core_utils.py b/tests/unit/test_functions_core_utils.py
new file mode 100644
index 000000000..1df56b006
--- /dev/null
+++ b/tests/unit/test_functions_core_utils.py
@@ -0,0 +1,32 @@
+import pandas as pd
+from unittest.mock import patch, Mock
+from digital_land.utils.functions_core import datasette_query, query_sqlite
+
+
+@patch("digital_land.utils.functions_core.sqlite3.connect")
+def test_query_sqlite(mock_connect):
+    mock_data = Mock()
+    mock_data.description = [("organisation",), ("dataset",)]
+    mock_data.fetchall.return_value = [("org1", "dataset1"), ("org2", "dataset2")]
+
+    mock_con = Mock()
+    mock_con.execute.return_value = mock_data
+    mock_connect.return_value.__enter__.return_value = mock_con
+
+    df = query_sqlite("db_path", "SELECT * FROM table")
+
+    assert isinstance(df, pd.DataFrame)
+    assert list(df.columns) == ["organisation", "dataset"]
+    assert len(df) == 2
+    assert df.iloc[0]["organisation"] == "org1"
+
+
+@patch("digital_land.utils.functions_core.pd.read_csv")
+def test_datasette_query(mock_read_csv):
+    df_mock = pd.DataFrame({"organisation": ["org1", "org2"]})
+    mock_read_csv.return_value = df_mock
+
+    df = datasette_query("db", "SELECT organisation FROM table")
+    assert isinstance(df, pd.DataFrame)
+    assert "organisation" in df.columns
+    assert df.equals(df_mock)

From d03a6556571d349f2014c279e8dd342f0bccb13f Mon Sep 17 00:00:00 2001
From: kena vyas
Date: Fri, 16 May 2025 14:34:29 +0100
Subject: [PATCH 13/17] make specification parameter optional

---
 digital_land/api.py | 4 +++-
 digital_land/commands.py | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/digital_land/api.py b/digital_land/api.py
index 03a480e01..e99ced5bc 100644
--- a/digital_land/api.py
+++ b/digital_land/api.py
@@ -14,7 +14,7 @@ class API:
     def __init__(
         self,
-        specification: Specification,
+        specification: Specification = None,
         url: str = DEFAULT_URL,
         cache_dir: str = "var/cache",
     ):
@@ -66,6 +66,8 @@ def download_dataset(
                 raise ValueError("Builder name must be provided when builder=True")
                 url = f"{self.url}/{builder_name}-builder/dataset/{dataset}.sqlite3"
             else:
+                if self.specification is None:
+                    raise ValueError("Specification must be provided")
                 collection = self.specification.dataset[dataset]["collection"]
                 url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3"
             mode = "wb"
diff --git a/digital_land/commands.py b/digital_land/commands.py
index 9b5dd4986..6d5bbd272 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -1731,8 +1731,7 @@ def generate_provision_quality():
     """Generates a provision quality dataset and saves it as a parquet file"""
     td = datetime.today().strftime("%Y-%m-%d")
 
-    specification = Specification("specification/")
-    api = API(specification)
+    api = API()
 
     # Download the performance db using api
     api.download_dataset(

From 7f236fa50f683384527c3df4d71b70d9f95a70fd Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Fri, 30 May 2025 11:22:38 +0100
Subject: [PATCH 14/17] Change datasette_query to duckdb.query

---
 digital_land/commands.py | 96 ++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 49 deletions(-)

diff --git a/digital_land/commands.py b/digital_land/commands.py
index 6d5bbd272..5afe18d62 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -15,6 +15,7 @@
 from requests import HTTPError
 import shapely
 import numpy as np
+import duckdb
 
 from digital_land.package.organisation import OrganisationPackage
 from digital_land.check import duplicate_reference_check
@@ -1744,22 +1745,26 @@ def generate_provision_quality():
     path_perf_db = Path(api.cache_dir) / "dataset" / "performance.sqlite3"
 
     # Issue quality criteria lookup
-    lookup_issue_qual = fc.datasette_query(
-        "digital-land",
-        """
+    specification_repo_url = (
+        "https://raw.githubusercontent.com/digital-land/specification/refs/heads/"
+    )
+    issue_type_url = f"{specification_repo_url}main/content/issue-type.csv"
+
+    lookup_issue_qual = duckdb.query(
+        f"""
         SELECT
         description,
-        issue_type,
+        "issue-type" AS issue_type,
         name,
         severity,
         responsibility,
-        quality_criteria_level || " - " || quality_criteria as quality_criteria,
+        quality_criteria_level || ' - ' || quality_criteria as quality_criteria,
         quality_criteria_level as quality_level
-        FROM issue_type
-        WHERE quality_criteria_level != ''
+        FROM read_csv('{issue_type_url}')
+        WHERE CAST(quality_criteria_level AS string) != ''
         AND quality_criteria != ''
-        """,
-    )
+        """
+    ).to_df()
 
     # Transform data
     provision = fc.query_sqlite(
@@ -1793,54 +1798,47 @@ def generate_provision_quality():
         left_on="problem_type",
         right_on="issue_type",
     )
-    qual_issue.drop("issue_type", axis=1, inplace=True)
+    qual_issue = qual_issue.drop(columns="issue_type")
 
     # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
-    qual_expectation_bounds = fc.datasette_query(
-        "digital-land",
-        """
-        SELECT organisation, dataset, details
-        FROM expectation
-        WHERE 1=1
-        AND name = 'Check no entities are outside of the local planning authority boundary'
-        AND passed = 'False'
-        AND message not like '%error%'
-        """,
-    )
+    s3_uri = f"s3://development-collection-data/log/expectation/dataset=*/*.parquet"
 
-    qual_expectation_bounds["problem_source"] = "expectation"
-    qual_expectation_bounds["problem_type"] = (
-        "entity outside of the local planning authority boundary"
+    qual_expectation_bounds = duckdb.query(
+        f"""
+        SELECT organisation, dataset, details
+        FROM read_parquet('{s3_uri}')
+        WHERE name = 'Check no entities are outside of the local planning authority boundary'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """
+    ).to_df()
+    qual_expectation_bounds = qual_expectation_bounds.assign(
+        problem_source="expectation",
+        problem_type="entity outside of the local planning authority boundary",
+        count=[json.loads(v)["actual"] for v in qual_expectation_bounds["details"]],
+        quality_criteria="3 - entities within LPA boundary",
+        quality_level=3,
     )
-    qual_expectation_bounds["count"] = [
-        json.loads(v)["actual"] for v in qual_expectation_bounds["details"]
-    ]
-    qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary"
-    qual_expectation_bounds["quality_level"] = 3
-    qual_expectation_bounds.drop("details", axis=1, inplace=True)
+    qual_expectation_bounds = qual_expectation_bounds.drop(columns="details")
 
     # IDENTIFY PROBLEMS - expectations - entity count doesn't match manual count
-    qual_expectation_count = fc.datasette_query(
-        "digital-land",
-        """
+    qual_expectation_count = duckdb.query(
+        f"""
         SELECT organisation, dataset, details
-        FROM expectation
-        WHERE 1=1
-        AND name = 'Check number of entities inside the local planning authority boundary matches the manual count'
-        AND passed = 'False'
-        AND message not like '%error%'
-        """,
-    )
-
-    qual_expectation_count["problem_source"] = "expectation"
-    qual_expectation_count["problem_type"] = "entity count doesn't match manual count"
-    qual_expectation_count["count"] = [
-        json.loads(v)["actual"] for v in qual_expectation_count["details"]
-    ]
-    qual_expectation_count["quality_criteria"] = (
-        "3 - conservation area entity count matches LPA"
+        FROM read_parquet('{s3_uri}')
+        WHERE name = 'Check number of entities inside the local planning authority boundary matches the manual count'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """
+    ).to_df()
+
+    qual_expectation_count = qual_expectation_count.assign(
+        problem_source="expectation",
+        problem_type="entity count doesn't match manual count",
+        count=[json.loads(v)["actual"] for v in qual_expectation_count["details"]],
+        quality_criteria="3 - conservation area entity count matches LPA",
+        quality_level=3,
     )
-    qual_expectation_count["quality_level"] = 3
     qual_expectation_count.drop("details", axis=1, inplace=True)
 
     # Combine all problem source tables, and aggregate to criteria level

From 83210b73404581361a06bcf8ef76e5c1152c8fa2 Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Fri, 30 May 2025 11:27:39 +0100
Subject: [PATCH 15/17] Change datasette_query to duckdb.query

---
 digital_land/commands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/digital_land/commands.py b/digital_land/commands.py
index 5afe18d62..2417e5f63 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -1801,7 +1801,7 @@ def generate_provision_quality():
     qual_issue = qual_issue.drop(columns="issue_type")
 
     # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
-    s3_uri = f"s3://development-collection-data/log/expectation/dataset=*/*.parquet"
+    s3_uri = "s3://development-collection-data/log/expectation/dataset=*/*.parquet"
 
     qual_expectation_bounds = duckdb.query(
         f"""

From bf32e96f27a952a9bb613238fbb262a6dabe3d11 Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Fri, 30 May 2025 12:08:11 +0100
Subject: [PATCH 16/17] Added test for duckdb instead of datasette

---
 .../test_generate_provision_quality.py | 46 ++++++++++++-------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/tests/integration/test_generate_provision_quality.py b/tests/integration/test_generate_provision_quality.py
index 33dbf202b..897325286 100644
--- a/tests/integration/test_generate_provision_quality.py
+++ b/tests/integration/test_generate_provision_quality.py
@@ -1,19 +1,18 @@
 import pandas as pd
-from unittest.mock import patch
+from unittest.mock import patch, Mock
 from pathlib import Path
 from datetime import datetime
 from digital_land.commands import generate_provision_quality
 
 
-@patch("digital_land.commands.fc.datasette_query")
+@patch("digital_land.commands.duckdb.query")
 @patch("digital_land.commands.fc.query_sqlite")
 def test_generate_provision_quality(
     mock_query_sqlite,
-    mock_datasette_query,
+    mock_duckdb_query,
 ):
     # mock issue_type
-    mock_datasette_query.side_effect = [
-        pd.DataFrame(
+    df1 = pd.DataFrame(
             [
                 {
                     "description": "desc",
                     "issue_type": "missing-value",
                     "name": "Missing Value",
                     "severity": "error",
                     "responsibility": "external",
                     "quality_criteria": "any other validity error",
                     "quality_level": 3,
                 }
             ]
-        ),
-        # mock LPA boundary check
-        pd.DataFrame(
+    )
+    # mock LPA boundary check
+    df2 =pd.DataFrame(
             [
                 {
                     "organisation": "org1",
                     "dataset": "dataset1",
                     "details": '{"actual": 2}',
                 }
             ]
-        ),
-        # mock count value
-        pd.DataFrame(
+    )
+    # mock count value
+    df3 = pd.DataFrame(
             [
                 {
                     "organisation": "org1",
                     "dataset": "dataset1",
                     "details": '{"actual": 1}',
                 }
             ]
-        ),
-    ]
+    )
+
+    # Wrap each in a mock with .to_df()
+    rel1 = Mock()
+    rel1.to_df.return_value = df1
+
+    rel2 = Mock()
+    rel2.to_df.return_value = df2
+
+    rel3 = Mock()
+    rel3.to_df.return_value = df3
+
+    mock_duckdb_query.side_effect = [rel1, rel2, rel3]
 
     # mock sqlite queries
     mock_query_sqlite.side_effect = [
@@ -81,10 +91,12 @@
     assert output_file.exists(), "Parquet file not found"
 
     df = pd.read_parquet(output_file)
-    assert "organisation" in df.columns
-    assert "dataset" in df.columns
-    assert "quality" in df.columns
-
     assert not df.empty, "Dataframe loaded from Parquet is empty"
+    assert set(["organisation", "dataset", "quality"]).issubset(df.columns)
     assert len(df) == 1
     assert df.iloc[0]["organisation"] == "org1"
+    assert df.iloc[0]["dataset"] == "dataset1"
+    assert df["quality"].iloc[0] in [
+        "3. data that is good for ODP",
+        "4. data that is trustworthy",
+    ]

From ba74c084135cc4b90ab5eefe5f1f5bf65b466b75 Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Fri, 30 May 2025 12:14:44 +0100
Subject: [PATCH 17/17] Added test for duckdb instead of datasette

---
 .../test_generate_provision_quality.py | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/tests/integration/test_generate_provision_quality.py b/tests/integration/test_generate_provision_quality.py
index 897325286..edb291f58 100644
--- a/tests/integration/test_generate_provision_quality.py
+++ b/tests/integration/test_generate_provision_quality.py
@@ -13,38 +13,38 @@ def test_generate_provision_quality(
 ):
     # mock issue_type
     df1 = pd.DataFrame(
-            [
-                {
-                    "description": "desc",
-                    "issue_type": "missing-value",
-                    "name": "Missing Value",
-                    "severity": "error",
-                    "responsibility": "external",
-                    "quality_criteria": "any other validity error",
-                    "quality_level": 3,
-                }
-            ]
-    )
+        [
+            {
+                "description": "desc",
+                "issue_type": "missing-value",
+                "name": "Missing Value",
+                "severity": "error",
+                "responsibility": "external",
+                "quality_criteria": "any other validity error",
+                "quality_level": 3,
+            }
+        ]
+    )
     # mock LPA boundary check
-    df2 =pd.DataFrame(
-            [
-                {
-                    "organisation": "org1",
-                    "dataset": "dataset1",
-                    "details": '{"actual": 2}',
-                }
-            ]
-    )
+    df2 = pd.DataFrame(
+        [
+            {
+                "organisation": "org1",
+                "dataset": "dataset1",
+                "details": '{"actual": 2}',
+            }
+        ]
+    )
     # mock count value
     df3 = pd.DataFrame(
-            [
-                {
-                    "organisation": "org1",
-                    "dataset": "dataset1",
-                    "details": '{"actual": 1}',
-                }
-            ]
-    )
+        [
+            {
+                "organisation": "org1",
+                "dataset": "dataset1",
+                "details": '{"actual": 1}',
+            }
+        ]
+    )
 
     # Wrap each in a mock with .to_df()
     rel1 = Mock()