From 3256f9c32d67d0abf0efe4fd53a41b3feef51e07 Mon Sep 17 00:00:00 2001
From: kena vyas
Date: Wed, 14 May 2025 15:13:06 +0100
Subject: [PATCH 01/17] generate provision quality dataset

---
 digital_land/utils/functions_core.py         |  73 ++++++++
 .../utils/generate_provision_quality.py      | 175 ++++++++++++++++++
 2 files changed, 248 insertions(+)
 create mode 100644 digital_land/utils/functions_core.py
 create mode 100644 digital_land/utils/generate_provision_quality.py

diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py
new file mode 100644
index 000000000..887fa5c09
--- /dev/null
+++ b/digital_land/utils/functions_core.py
@@ -0,0 +1,73 @@
+import urllib
+import os
+import sqlite3
+import pandas as pd
+import geopandas as gpd
+import shapely.wkt
+
+
+global FILES_URL
+
+FILES_URL = "https://datasette.planning.data.gov.uk/"
+
+
+def download_dataset(dataset, output_dir_path, overwrite=False):
+    dataset_file_name = f"{dataset}.db"
+
+    if not os.path.exists(output_dir_path):
+        os.makedirs(output_dir_path)
+
+    output_file_path = os.path.join(output_dir_path, dataset_file_name)
+
+    if overwrite is False and os.path.exists(output_file_path):
+        return
+
+    final_url = os.path.join(FILES_URL, dataset_file_name)
+    print(f"downloading data from {final_url}")
+    print(f"to: {output_file_path}")
+    urllib.request.urlretrieve(
+        final_url, os.path.join(output_dir_path, dataset_file_name)
+    )
+    print("download complete")
+
+
+def get_pdp_dataset(
+    dataset, geometry_field="geometry", crs_out=4326, underscore_cols=True
+):
+
+    df = pd.read_csv(
+        f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype="str"
+    )
+    df.columns = [x.replace("-", "_") for x in df.columns]
+
+    df_valid_geom = df[df[geometry_field].notnull()].copy()
+
+    # load geometry and create GDF
+    df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply(
+        shapely.wkt.loads
+    )
+    gdf = gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field)
+
+    # Transform to crs_out (EPSG:27700 gives more interpretable area units)
+    gdf.set_crs(epsg=4326, inplace=True)
+    gdf.to_crs(epsg=crs_out, inplace=True)
+
+    return gdf
+
+
+def query_sqlite(db_path, query_string):
+
+    with sqlite3.connect(db_path) as con:
+
+        cursor = con.execute(query_string)
+        cols = [column[0] for column in cursor.description]
+        results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
+
+    return results_df
+
+
+def datasette_query(db, sql_string):
+    params = urllib.parse.urlencode({"sql": sql_string, "_size": "max"})
+    url = f"https://datasette.planning.data.gov.uk/{db}.csv?{params}"
+    df = pd.read_csv(url)
+    return df
diff --git a/digital_land/utils/generate_provision_quality.py b/digital_land/utils/generate_provision_quality.py
new file mode 100644
index 000000000..7ea8f6f73
--- /dev/null
+++ b/digital_land/utils/generate_provision_quality.py
@@ -0,0 +1,175 @@
+import os
+import pandas as pd
+import numpy as np
+import json
+from datetime import datetime
+from digital_land.utils import functions_core as fc
+
+
+def generate_provision_quality():
+    """
+    Generates a provision quality dataset and saves it as a parquet file.
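+
+    A sketch of the flow, as implemented below: download the performance
+    database, count data issues and failed expectations against each
+    organisation/dataset provision, roll these up to a 0-4 quality level,
+    and write the result to a dated parquet partition.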
+    """
+    td = datetime.today().strftime("%Y-%m-%d")
+
+    # Create the temporary download directory
+    db_dir = os.path.join("/tmp", "db_downloads")
+    os.makedirs(db_dir, exist_ok=True)
+
+    # Download the performance db
+    fc.download_dataset("performance", db_dir, overwrite=False)
+    path_perf_db = os.path.join(db_dir, "performance.db")
+
+    # Issue quality criteria lookup
+    lookup_issue_qual = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT
+        description,
+        issue_type,
+        name,
+        severity,
+        responsibility,
+        quality_criteria_level || " - " || quality_criteria as quality_criteria,
+        quality_criteria_level as quality_level
+        FROM issue_type
+        WHERE quality_criteria_level != ''
+        AND quality_criteria != ''
+        """,
+    )
+
+    # Provision summary: active endpoint counts per organisation and dataset
+    provision = fc.query_sqlite(
+        path_perf_db,
+        """
+        SELECT organisation, dataset, active_endpoint_count
+        FROM provision_summary
+        """,
+    )
+
+    # Extract issue count by provision from endpoint_dataset_issue_type_summary
+    qual_issue = fc.query_sqlite(
+        path_perf_db,
+        """
+        SELECT
+        organisation, dataset,
+        'issue' as problem_source,
+        issue_type as problem_type,
+        sum(count_issues) as count
+        FROM endpoint_dataset_issue_type_summary
+        WHERE resource_end_date is not NULL
+        AND issue_type is not NULL
+        GROUP BY organisation, dataset, issue_type
+        """,
+    )
+
+    # Join on quality criteria and level from the issue_type lookup (this restricts the set to issues linked to a quality criterion)
+    qual_issue = qual_issue.merge(
+        lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]],
+        how="inner",
+        left_on="problem_type",
+        right_on="issue_type",
+    )
+    qual_issue.drop("issue_type", axis=1, inplace=True)
+
+    # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
+    qual_expectation_bounds = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT organisation, dataset, details
+        FROM expectation
+        WHERE 1=1
+        AND name = 'Check no entities are outside of the local planning authority boundary'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """,
+    )
+
+    qual_expectation_bounds["problem_source"] = "expectation"
+    qual_expectation_bounds["problem_type"] = (
+        "entity outside of the local planning authority boundary"
+    )
+    qual_expectation_bounds["count"] = [
+        json.loads(v)["actual"] for v in qual_expectation_bounds["details"]
+    ]
+    qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary"
+    qual_expectation_bounds["quality_level"] = 3
+    qual_expectation_bounds.drop("details", axis=1, inplace=True)
+
+    # IDENTIFY PROBLEMS - expectations - entity count doesn't match manual count
+    qual_expectation_count = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT organisation, dataset, details
+        FROM expectation
+        WHERE 1=1
+        AND name = 'Check number of entities inside the local planning authority boundary matches the manual count'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """,
+    )
+
+    qual_expectation_count["problem_source"] = "expectation"
+    qual_expectation_count["problem_type"] = "entity count doesn't match manual count"
+    qual_expectation_count["count"] = [
+        json.loads(v)["actual"] for v in qual_expectation_count["details"]
+    ]
+    qual_expectation_count["quality_criteria"] = (
+        "3 - conservation area entity count matches LPA"
+    )
+    qual_expectation_count["quality_level"] = 3
+    qual_expectation_count.drop("details", axis=1, inplace=True)
+
+    # Combine all problem source tables, and aggregate to criteria level
+    qual_all_criteria = (
+        pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count])
+        .groupby(
["organisation", "dataset", "quality_criteria", "quality_level"], + as_index=False, + ) + .agg(count_failures=("count", "sum")) + ) + + # Merge issues with the provision data + prov_qual_all = provision.merge( + qual_all_criteria, how="left", on=["organisation", "dataset"] + ) + + prov_qual_all["quality_level_for_sort"] = np.select( + [ + (prov_qual_all["active_endpoint_count"] == 0), + (prov_qual_all["quality_level"].notnull()), + (prov_qual_all["active_endpoint_count"] > 0) + & (prov_qual_all["quality_level"].isnull()), + ], + [0, prov_qual_all["quality_level"], 4], + ) + + level_map = { + 4: "4. data that is trustworthy", + 3: "3. data that is good for ODP", + 2: "2. authoritative data from the LPA", + 1: "1. some data", + 0: "0. no score", + } + + prov_quality = prov_qual_all.groupby( + ["organisation", "dataset"], as_index=False, dropna=False + ).agg(quality_level=("quality_level_for_sort", "min")) + + prov_quality["quality"] = prov_quality["quality_level"].map(level_map) + prov_quality["notes"] = "" + prov_quality["end-date"] = "" + prov_quality["start-date"] = td + prov_quality["entry-date"] = td + + # Output the results as a Parquet file + output_dir = os.path.join( + "/tmp", "performance", "provision-quality", f"entry-date={td}" + ) + os.makedirs(output_dir, exist_ok=True) + + output_file = os.path.join(output_dir, "provision-quality.parquet") + prov_quality.to_parquet(output_file, engine="pyarrow", index=False) + + print(f"Provision quality dataset saved to: {output_file}") From a6e85a5d688ae8fa96ff11a82bf82b643a15aa20 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Thu, 15 May 2025 11:57:29 +0100 Subject: [PATCH 02/17] generate-provision-quality cli command --- digital_land/cli.py | 6 + digital_land/commands.py | 167 +++++++++++++++++ digital_land/utils/functions_core.py | 21 +-- .../utils/generate_provision_quality.py | 175 ------------------ setup.py | 1 + 5 files changed, 183 insertions(+), 187 deletions(-) delete mode 100644 digital_land/utils/generate_provision_quality.py diff --git a/digital_land/cli.py b/digital_land/cli.py index b65aef331..a99a950ea 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -31,6 +31,7 @@ organisation_check, save_state, add_data, + generate_provision_quality, ) from digital_land.command_arguments import ( @@ -825,3 +826,8 @@ def check_state_cmd( if diffs: print(f"State differs from {state_path} - {', '.join(diffs)}") sys.exit(1) + + +@cli.command("generate-provision-quality") +def generate_provision_quality_cmd(): + generate_provision_quality() diff --git a/digital_land/commands.py b/digital_land/commands.py index ad16e6edb..02e3a95e9 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -14,6 +14,7 @@ import geojson from requests import HTTPError import shapely +import numpy as np from digital_land.package.organisation import OrganisationPackage from digital_land.check import duplicate_reference_check @@ -73,6 +74,7 @@ is_url_valid, get_user_response, ) +from digital_land.utils import functions_core as fc from .register import hash_value from .utils.gdal_utils import get_gdal_version @@ -1665,3 +1667,168 @@ def check_and_assign_entities( ): return False return True + + +def generate_provision_quality(): + """Generates a provision quality dataset and saves it as a parquet file""" + td = datetime.today().strftime("%Y-%m-%d") + + # Create the temporary download directory + db_dir = Path("/tmp") / "db_downloads" + os.makedirs(db_dir, exist_ok=True) + + # Download the performance db + fc.download_dataset("performance", 
db_dir, overwrite=False)
+    path_perf_db = db_dir / "performance.db"
+
+    # Issue quality criteria lookup
+    lookup_issue_qual = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT
+        description,
+        issue_type,
+        name,
+        severity,
+        responsibility,
+        quality_criteria_level || " - " || quality_criteria as quality_criteria,
+        quality_criteria_level as quality_level
+        FROM issue_type
+        WHERE quality_criteria_level != ''
+        AND quality_criteria != ''
+        """,
+    )
+
+    # Provision summary: active endpoint counts per organisation and dataset
+    provision = fc.query_sqlite(
+        path_perf_db,
+        """
+        SELECT organisation, dataset, active_endpoint_count
+        FROM provision_summary
+        """,
+    )
+
+    # Extract issue count by provision from endpoint_dataset_issue_type_summary
+    qual_issue = fc.query_sqlite(
+        path_perf_db,
+        """
+        SELECT
+        organisation, dataset,
+        'issue' as problem_source,
+        issue_type as problem_type,
+        sum(count_issues) as count
+        FROM endpoint_dataset_issue_type_summary
+        WHERE resource_end_date is not NULL
+        AND issue_type is not NULL
+        GROUP BY organisation, dataset, issue_type
+        """,
+    )
+
+    # Join on quality criteria and level from the issue_type lookup (this restricts the set to issues linked to a quality criterion)
+    qual_issue = qual_issue.merge(
+        lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]],
+        how="inner",
+        left_on="problem_type",
+        right_on="issue_type",
+    )
+    qual_issue.drop("issue_type", axis=1, inplace=True)
+
+    # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
+    qual_expectation_bounds = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT organisation, dataset, details
+        FROM expectation
+        WHERE 1=1
+        AND name = 'Check no entities are outside of the local planning authority boundary'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """,
+    )
+
+    qual_expectation_bounds["problem_source"] = "expectation"
+    qual_expectation_bounds["problem_type"] = (
+        "entity outside of the local planning authority boundary"
+    )
+    qual_expectation_bounds["count"] = [
+        json.loads(v)["actual"] for v in qual_expectation_bounds["details"]
+    ]
+    qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary"
+    qual_expectation_bounds["quality_level"] = 3
+    qual_expectation_bounds.drop("details", axis=1, inplace=True)
+
+    # IDENTIFY PROBLEMS - expectations - entity count doesn't match manual count
+    qual_expectation_count = fc.datasette_query(
+        "digital-land",
+        """
+        SELECT organisation, dataset, details
+        FROM expectation
+        WHERE 1=1
+        AND name = 'Check number of entities inside the local planning authority boundary matches the manual count'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """,
+    )
+
+    qual_expectation_count["problem_source"] = "expectation"
+    qual_expectation_count["problem_type"] = "entity count doesn't match manual count"
+    qual_expectation_count["count"] = [
+        json.loads(v)["actual"] for v in qual_expectation_count["details"]
+    ]
+    qual_expectation_count["quality_criteria"] = (
+        "3 - conservation area entity count matches LPA"
+    )
+    qual_expectation_count["quality_level"] = 3
+    qual_expectation_count.drop("details", axis=1, inplace=True)
+
+    # Combine all problem source tables, and aggregate to criteria level
+    qual_all_criteria = (
+        pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count])
+        .groupby(
+            ["organisation", "dataset", "quality_criteria", "quality_level"],
+            as_index=False,
+        )
+        .agg(count_failures=("count", "sum"))
+    )
+
+    # Merge issues with the provision data
+    prov_qual_all = provision.merge(
+        qual_all_criteria, how="left", on=["organisation", 
"dataset"] + ) + + prov_qual_all["quality_level_for_sort"] = np.select( + [ + (prov_qual_all["active_endpoint_count"] == 0), + (prov_qual_all["quality_level"].notnull()), + (prov_qual_all["active_endpoint_count"] > 0) + & (prov_qual_all["quality_level"].isnull()), + ], + [0, prov_qual_all["quality_level"], 4], + ) + + level_map = { + 4: "4. data that is trustworthy", + 3: "3. data that is good for ODP", + 2: "2. authoritative data from the LPA", + 1: "1. some data", + 0: "0. no score", + } + + prov_quality = prov_qual_all.groupby( + ["organisation", "dataset"], as_index=False, dropna=False + ).agg(quality_level=("quality_level_for_sort", "min")) + + prov_quality["quality"] = prov_quality["quality_level"].map(level_map) + prov_quality["notes"] = "" + prov_quality["end-date"] = "" + prov_quality["start-date"] = td + prov_quality["entry-date"] = td + + # Output the results as a Parquet file + output_dir = Path("/tmp") / "performance" / "provision-quality" / f"entry-date={td}" + os.makedirs(output_dir, exist_ok=True) + + output_file = output_dir / "provision-quality.parquet" + prov_quality.to_parquet(output_file, engine="pyarrow", index=False) + + print(f"Provision quality dataset saved to: {output_file}") diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py index 887fa5c09..b891ef405 100644 --- a/digital_land/utils/functions_core.py +++ b/digital_land/utils/functions_core.py @@ -1,10 +1,9 @@ import urllib -import os import sqlite3 import pandas as pd import geopandas as gpd import shapely.wkt - +from pathlib import Path global FILES_URL @@ -12,22 +11,20 @@ def download_dataset(dataset, output_dir_path, overwrite=False): - dataset_file_name = f"{dataset}.db" - - if not os.path.exists(output_dir_path): - os.makedirs(output_dir_path) + output_dir = Path(output_dir_path) + output_dir.mkdir(parents=True, exist_ok=True) - output_file_path = os.path.join(output_dir_path, dataset_file_name) + dataset_file_name = f"{dataset}.db" + output_file_path = output_dir / dataset_file_name - if overwrite is False and os.path.exists(output_file_path): + if not overwrite and output_file_path.exists(): return - final_url = os.path.join(FILES_URL, dataset_file_name) + final_url = f"{FILES_URL}{dataset_file_name}" print(f"downloading data from {final_url}") print(f"to: {output_file_path}") - urllib.request.urlretrieve( - final_url, os.path.join(output_dir_path, dataset_file_name) - ) + + urllib.request.urlretrieve(final_url, output_file_path) print("download complete") diff --git a/digital_land/utils/generate_provision_quality.py b/digital_land/utils/generate_provision_quality.py deleted file mode 100644 index 7ea8f6f73..000000000 --- a/digital_land/utils/generate_provision_quality.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import pandas as pd -import numpy as np -import json -from datetime import datetime -from digital_land.utils import functions_core as fc - - -def generate_provision_quality(): - """ - Generates a provision quality dataset and saves it as a parquet file. 
-    """
-    td = datetime.today().strftime("%Y-%m-%d")
-
-    # Create the temporary download directory
-    db_dir = os.path.join("/tmp", "db_downloads")
-    os.makedirs(db_dir, exist_ok=True)
-
-    # Download the performance db
-    fc.download_dataset("performance", db_dir, overwrite=False)
-    path_perf_db = os.path.join(db_dir, "performance.db")
-
-    # Issue quality criteria lookup
-    lookup_issue_qual = fc.datasette_query(
-        "digital-land",
-        """
-        SELECT
-        description,
-        issue_type,
-        name,
-        severity,
-        responsibility,
-        quality_criteria_level || " - " || quality_criteria as quality_criteria,
-        quality_criteria_level as quality_level
-        FROM issue_type
-        WHERE quality_criteria_level != ''
-        AND quality_criteria != ''
-        """,
-    )
-
-    # Provision summary: active endpoint counts per organisation and dataset
-    provision = fc.query_sqlite(
-        path_perf_db,
-        """
-        SELECT organisation, dataset, active_endpoint_count
-        FROM provision_summary
-        """,
-    )
-
-    # Extract issue count by provision from endpoint_dataset_issue_type_summary
-    qual_issue = fc.query_sqlite(
-        path_perf_db,
-        """
-        SELECT
-        organisation, dataset,
-        'issue' as problem_source,
-        issue_type as problem_type,
-        sum(count_issues) as count
-        FROM endpoint_dataset_issue_type_summary
-        WHERE resource_end_date is not NULL
-        AND issue_type is not NULL
-        GROUP BY organisation, dataset, issue_type
-        """,
-    )
-
-    # Join on quality criteria and level from the issue_type lookup (this restricts the set to issues linked to a quality criterion)
-    qual_issue = qual_issue.merge(
-        lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]],
-        how="inner",
-        left_on="problem_type",
-        right_on="issue_type",
-    )
-    qual_issue.drop("issue_type", axis=1, inplace=True)
-
-    # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
-    qual_expectation_bounds = fc.datasette_query(
-        "digital-land",
-        """
-        SELECT organisation, dataset, details
-        FROM expectation
-        WHERE 1=1
-        AND name = 'Check no entities are outside of the local planning authority boundary'
-        AND passed = 'False'
-        AND message not like '%error%'
-        """,
-    )
-
-    qual_expectation_bounds["problem_source"] = "expectation"
-    qual_expectation_bounds["problem_type"] = (
-        "entity outside of the local planning authority boundary"
-    )
-    qual_expectation_bounds["count"] = [
-        json.loads(v)["actual"] for v in qual_expectation_bounds["details"]
-    ]
-    qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary"
-    qual_expectation_bounds["quality_level"] = 3
-    qual_expectation_bounds.drop("details", axis=1, inplace=True)
-
-    # IDENTIFY PROBLEMS - expectations - entity count doesn't match manual count
-    qual_expectation_count = fc.datasette_query(
-        "digital-land",
-        """
-        SELECT organisation, dataset, details
-        FROM expectation
-        WHERE 1=1
-        AND name = 'Check number of entities inside the local planning authority boundary matches the manual count'
-        AND passed = 'False'
-        AND message not like '%error%'
-        """,
-    )
-
-    qual_expectation_count["problem_source"] = "expectation"
-    qual_expectation_count["problem_type"] = "entity count doesn't match manual count"
-    qual_expectation_count["count"] = [
-        json.loads(v)["actual"] for v in qual_expectation_count["details"]
-    ]
-    qual_expectation_count["quality_criteria"] = (
-        "3 - conservation area entity count matches LPA"
-    )
-    qual_expectation_count["quality_level"] = 3
-    qual_expectation_count.drop("details", axis=1, inplace=True)
-
-    # Combine all problem source tables, and aggregate to criteria level
-    qual_all_criteria = (
-        pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count])
-        .groupby(
["organisation", "dataset", "quality_criteria", "quality_level"], - as_index=False, - ) - .agg(count_failures=("count", "sum")) - ) - - # Merge issues with the provision data - prov_qual_all = provision.merge( - qual_all_criteria, how="left", on=["organisation", "dataset"] - ) - - prov_qual_all["quality_level_for_sort"] = np.select( - [ - (prov_qual_all["active_endpoint_count"] == 0), - (prov_qual_all["quality_level"].notnull()), - (prov_qual_all["active_endpoint_count"] > 0) - & (prov_qual_all["quality_level"].isnull()), - ], - [0, prov_qual_all["quality_level"], 4], - ) - - level_map = { - 4: "4. data that is trustworthy", - 3: "3. data that is good for ODP", - 2: "2. authoritative data from the LPA", - 1: "1. some data", - 0: "0. no score", - } - - prov_quality = prov_qual_all.groupby( - ["organisation", "dataset"], as_index=False, dropna=False - ).agg(quality_level=("quality_level_for_sort", "min")) - - prov_quality["quality"] = prov_quality["quality_level"].map(level_map) - prov_quality["notes"] = "" - prov_quality["end-date"] = "" - prov_quality["start-date"] = td - prov_quality["entry-date"] = td - - # Output the results as a Parquet file - output_dir = os.path.join( - "/tmp", "performance", "provision-quality", f"entry-date={td}" - ) - os.makedirs(output_dir, exist_ok=True) - - output_file = os.path.join(output_dir, "provision-quality.parquet") - prov_quality.to_parquet(output_file, engine="pyarrow", index=False) - - print(f"Provision quality dataset saved to: {output_file}") diff --git a/setup.py b/setup.py index a051b7356..685cc8ac9 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,7 @@ def get_long_description(): "boto3", "moto", "psutil", + "geopandas", ], entry_points={"console_scripts": ["digital-land=digital_land.cli:cli"]}, setup_requires=["pytest-runner"], From 1975e667fe775031047acb1e6f0b1af6d1ef4a03 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Thu, 15 May 2025 13:42:58 +0100 Subject: [PATCH 03/17] update cli --- digital_land/cli.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/digital_land/cli.py b/digital_land/cli.py index a99a950ea..b65aef331 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -31,7 +31,6 @@ organisation_check, save_state, add_data, - generate_provision_quality, ) from digital_land.command_arguments import ( @@ -826,8 +825,3 @@ def check_state_cmd( if diffs: print(f"State differs from {state_path} - {', '.join(diffs)}") sys.exit(1) - - -@cli.command("generate-provision-quality") -def generate_provision_quality_cmd(): - generate_provision_quality() From 4cff1b8d96968fa07349dbb34dacba32967454b9 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Fri, 16 May 2025 12:32:42 +0100 Subject: [PATCH 04/17] utilise api to download performance.sqlite3 --- digital_land/api.py | 14 ++- digital_land/commands.py | 17 ++-- digital_land/utils/functions_core.py | 52 ----------- .../test_generate_provision_quality.py | 90 +++++++++++++++++++ tests/unit/test_functions_core_utils.py | 32 +++++++ 5 files changed, 145 insertions(+), 60 deletions(-) create mode 100644 tests/integration/test_generate_provision_quality.py create mode 100644 tests/unit/test_functions_core_utils.py diff --git a/digital_land/api.py b/digital_land/api.py index ef0262153..03a480e01 100644 --- a/digital_land/api.py +++ b/digital_land/api.py @@ -36,6 +36,8 @@ def download_dataset( overwrite: bool = False, path: str = None, extension: Extension = Extension.CSV, + builder: bool = False, + builder_name: str = None, ): """ Downloads a dataset in CSV or SQLite3 format. 
@@ -43,6 +45,8 @@ def download_dataset( - overwrite: overwrite file is it already exists (otherwise will just return). - path: file to download to (otherwise /dataset/.). - extension: 'csv' or 'sqlite3', 'csv' by default. + - builder: downloads the dataset from the builder path + - builder_name: name to use for accessing the builder path - Returns: None. The file will be downloaded to the given path or cache, unless an exception occurs. @@ -56,8 +60,14 @@ def download_dataset( # different extensions require different urls and reading modes if extension == self.Extension.SQLITE3: - collection = self.specification.dataset[dataset]["collection"] - url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3" + # performance.sqlite requires digital-land-builder path + if builder: + if not builder_name: + raise ValueError("Builder name must be provided when builder=True") + url = f"{self.url}/{builder_name}-builder/dataset/{dataset}.sqlite3" + else: + collection = self.specification.dataset[dataset]["collection"] + url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3" mode = "wb" def get_content(response): diff --git a/digital_land/commands.py b/digital_land/commands.py index 02e3a95e9..7eb7db3c8 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1673,13 +1673,18 @@ def generate_provision_quality(): """Generates a provision quality dataset and saves it as a parquet file""" td = datetime.today().strftime("%Y-%m-%d") - # Create the temporary download directory - db_dir = Path("/tmp") / "db_downloads" - os.makedirs(db_dir, exist_ok=True) + specification = Specification("specification/") + api = API(specification) + + # Download the performance db using api + api.download_dataset( + "performance", + extension=api.Extension.SQLITE3, + builder=True, + builder_name="digital-land", + ) - # Download the performance db - fc.download_dataset("performance", db_dir, overwrite=False) - path_perf_db = db_dir / "performance.db" + path_perf_db = Path(api.cache_dir) / "dataset" / "performance.sqlite3" # Issue quality criteria lookup lookup_issue_qual = fc.datasette_query( diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py index b891ef405..c3a62836f 100644 --- a/digital_land/utils/functions_core.py +++ b/digital_land/utils/functions_core.py @@ -1,65 +1,13 @@ import urllib import sqlite3 import pandas as pd -import geopandas as gpd -import shapely.wkt -from pathlib import Path - -global FILES_URL - -FILES_URL = "https://datasette.planning.data.gov.uk/" - - -def download_dataset(dataset, output_dir_path, overwrite=False): - output_dir = Path(output_dir_path) - output_dir.mkdir(parents=True, exist_ok=True) - - dataset_file_name = f"{dataset}.db" - output_file_path = output_dir / dataset_file_name - - if not overwrite and output_file_path.exists(): - return - - final_url = f"{FILES_URL}{dataset_file_name}" - print(f"downloading data from {final_url}") - print(f"to: {output_file_path}") - - urllib.request.urlretrieve(final_url, output_file_path) - print("download complete") - - -def get_pdp_dataset( - dataset, geometry_field="geometry", crs_out=4326, underscore_cols=True -): - - df = pd.read_csv( - f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype="str" - ) - df.columns = [x.replace("-", "_") for x in df.columns] - - df_valid_geom = df[df[geometry_field].notnull()].copy() - - # load geometry and create GDF - df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply( - shapely.wkt.loads - ) - gdf = 
gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field)
-
-    # Transform to crs_out (EPSG:27700 gives more interpretable area units)
-    gdf.set_crs(epsg=4326, inplace=True)
-    gdf.to_crs(epsg=crs_out, inplace=True)
-
-    return gdf


 def query_sqlite(db_path, query_string):
-
     with sqlite3.connect(db_path) as con:
-
         cursor = con.execute(query_string)
         cols = [column[0] for column in cursor.description]
         results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
-
     return results_df
diff --git a/tests/integration/test_generate_provision_quality.py b/tests/integration/test_generate_provision_quality.py
new file mode 100644
index 000000000..33dbf202b
--- /dev/null
+++ b/tests/integration/test_generate_provision_quality.py
@@ -0,0 +1,90 @@
+import pandas as pd
+from unittest.mock import patch
+from pathlib import Path
+from datetime import datetime
+from digital_land.commands import generate_provision_quality
+
+
+@patch("digital_land.commands.fc.datasette_query")
+@patch("digital_land.commands.fc.query_sqlite")
+def test_generate_provision_quality(
+    mock_query_sqlite,
+    mock_datasette_query,
+):
+    # mock issue_type
+    mock_datasette_query.side_effect = [
+        pd.DataFrame(
+            [
+                {
+                    "description": "desc",
+                    "issue_type": "missing-value",
+                    "name": "Missing Value",
+                    "severity": "error",
+                    "responsibility": "external",
+                    "quality_criteria": "any other validity error",
+                    "quality_level": 3,
+                }
+            ]
+        ),
+        # mock LPA boundary check
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "details": '{"actual": 2}',
+                }
+            ]
+        ),
+        # mock count value
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "details": '{"actual": 1}',
+                }
+            ]
+        ),
+    ]
+
+    # mock sqlite queries
+    mock_query_sqlite.side_effect = [
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "active_endpoint_count": 5,
+                }
+            ]
+        ),
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "problem_source": "issue",
+                    "problem_type": "missing-value",
+                    "count": 1,
+                }
+            ]
+        ),
+    ]
+
+    generate_provision_quality()
+
+    td = datetime.today().strftime("%Y-%m-%d")
+    output_file = Path(
+        f"/tmp/performance/provision-quality/entry-date={td}/provision-quality.parquet"
+    )
+    assert output_file.exists(), "Parquet file not found"
+
+    df = pd.read_parquet(output_file)
+    assert "organisation" in df.columns
+    assert "dataset" in df.columns
+    assert "quality" in df.columns
+
+    assert not df.empty, "Dataframe loaded from Parquet is empty"
+    assert len(df) == 1
+    assert df.iloc[0]["organisation"] == "org1"
diff --git a/tests/unit/test_functions_core_utils.py b/tests/unit/test_functions_core_utils.py
new file mode 100644
index 000000000..1df56b006
--- /dev/null
+++ b/tests/unit/test_functions_core_utils.py
@@ -0,0 +1,32 @@
+import pandas as pd
+from unittest.mock import patch, Mock
+from digital_land.utils.functions_core import datasette_query, query_sqlite
+
+
+@patch("digital_land.utils.functions_core.sqlite3.connect")
+def test_query_sqlite(mock_connect):
+    mock_data = Mock()
+    mock_data.description = [("organisation",), ("dataset",)]
+    mock_data.fetchall.return_value = [("org1", "dataset1"), ("org2", "dataset2")]
+
+    mock_con = Mock()
+    mock_con.execute.return_value = mock_data
+    mock_connect.return_value.__enter__.return_value = mock_con
+
+    df = query_sqlite("db_path", "SELECT * FROM table")
+
+    assert isinstance(df, pd.DataFrame)
+    assert list(df.columns) == ["organisation", "dataset"]
+    assert len(df) == 2
+    assert df.iloc[0]["organisation"] == "org1"
+
+
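+# datasette_query builds a datasette CSV-export URL with the SQL statement in
+# the query string and reads it with pandas, so patching read_csv keeps the
+# test offline; the returned frame is whatever read_csv produced.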
+@patch("digital_land.utils.functions_core.pd.read_csv") +def test_datasette_query(mock_read_csv): + df_mock = pd.DataFrame({"organisation": ["org1", "org2"]}) + mock_read_csv.return_value = df_mock + + df = datasette_query("db", "SELECT organisation FROM table") + assert isinstance(df, pd.DataFrame) + assert "organisation" in df.columns + assert df.equals(df_mock) From 982fd3fbd1a5260f09086bc6a2836e02ee189997 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Fri, 16 May 2025 14:34:29 +0100 Subject: [PATCH 05/17] make specification parameter optional --- digital_land/api.py | 4 +++- digital_land/commands.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/digital_land/api.py b/digital_land/api.py index 03a480e01..e99ced5bc 100644 --- a/digital_land/api.py +++ b/digital_land/api.py @@ -14,7 +14,7 @@ class API: def __init__( self, - specification: Specification, + specification: Specification = None, url: str = DEFAULT_URL, cache_dir: str = "var/cache", ): @@ -66,6 +66,8 @@ def download_dataset( raise ValueError("Builder name must be provided when builder=True") url = f"{self.url}/{builder_name}-builder/dataset/{dataset}.sqlite3" else: + if self.specification is None: + raise ValueError("Specification must be provided") collection = self.specification.dataset[dataset]["collection"] url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3" mode = "wb" diff --git a/digital_land/commands.py b/digital_land/commands.py index 7eb7db3c8..36e38c774 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -1673,8 +1673,7 @@ def generate_provision_quality(): """Generates a provision quality dataset and saves it as a parquet file""" td = datetime.today().strftime("%Y-%m-%d") - specification = Specification("specification/") - api = API(specification) + api = API() # Download the performance db using api api.download_dataset( From 3ce99b625c3900da6689772fd448ab5f99665afb Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Mon, 19 May 2025 09:42:33 +0100 Subject: [PATCH 06/17] Feat/combine config text (#405) * Only run duplicate reference check when needed --- digital_land/commands.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index ad16e6edb..46fe18295 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -376,7 +376,11 @@ def pipeline_run( ), ) - issue_log = duplicate_reference_check(issues=issue_log, csv_path=output_path) + # In the FactCombinePhase, when combine_fields has some values, we check for duplicates and combine values. 
+ # If we have done this then we will not call duplicate_reference_check as we have already carried out a + # duplicate check and stop messages appearing in issues about reference values not being unique + if combine_fields == {}: + issue_log = duplicate_reference_check(issues=issue_log, csv_path=output_path) issue_log.apply_entity_map() issue_log.save(os.path.join(issue_dir, resource + ".csv")) From 5338f5d46fa9b7e867ecfadd2f51f40ba22cbe56 Mon Sep 17 00:00:00 2001 From: alexglasertpx Date: Tue, 20 May 2025 11:01:15 +0100 Subject: [PATCH 07/17] New gml.csv file for updated GDAL (#407) --- tests/data/resource_examples/gml.csv | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/data/resource_examples/gml.csv b/tests/data/resource_examples/gml.csv index a7c36099e..7a134e4ee 100644 --- a/tests/data/resource_examples/gml.csv +++ b/tests/data/resource_examples/gml.csv @@ -1,11 +1,11 @@ -WKT,gml_id,Name,Ref -"MULTIPOLYGON (((-2.632272212 53.34206401,-2.632000357 53.34224581,-2.630564119 53.3422481,-2.631009551 53.34186124,-2.631269515 53.34167052,-2.631587148 53.34143591,-2.631736085 53.34132451,-2.630234541 53.34106187,-2.630381899 53.34083419,-2.630430719 53.34083671,-2.630473829 53.3408045,-2.630570696 53.34078062,-2.63062419 53.34078034,-2.630642956 53.3407513,-2.630594227 53.34072236,-2.630564563 53.34068459,-2.630568851 53.34065563,-2.63063132 53.34056742,-2.630716303 53.34044359,-2.630861659 53.34027486,-2.630946484 53.34019947,-2.631384962 53.34029905,-2.63171138 53.3398375,-2.632213184 53.33997422,-2.632657969 53.34008985,-2.632914151 53.33957816,-2.632885686 53.33957175,-2.63293775 53.33946589,-2.632883467 53.3394554,-2.632917069 53.33939897,-2.63292785 53.33935829,-2.632925513 53.33917516,-2.63294476 53.33914532,-2.633046457 53.33865511,-2.633644084 53.33866515,-2.633816397 53.33841963,-2.634026323 53.33798843,-2.634336105 53.33736035,-2.634500904 53.33711002,-2.634540774 53.33710981,-2.634559936 53.33707439,-2.634913572 53.33709021,-2.635470784 53.33712481,-2.636015216 53.33662362,-2.637044871 53.33681168,-2.636935924 53.33706199,-2.636748823 53.33748571,-2.636502934 53.33801909,-2.636344157 53.33835863,-2.636267224 53.33851333,-2.635995827 53.33894819,-2.635870308 53.3391595,-2.635738612 53.33938773,-2.63565651 53.33952134,-2.635579679 53.33962563,-2.635375517 53.3399026,-2.6351317 53.34017833,-2.634725706 53.34052297,-2.634310927 53.34081048,-2.63398409 53.34102574,-2.633800358 53.34113598,-2.633595926 53.34124607,-2.633165035 53.34146691,-2.633066806 53.34151541,-2.632769196 53.34173949,-2.632272212 53.34206401)))",conservation_areas_polygon.1,Daresbury,1 -"MULTIPOLYGON (((-2.636788444 53.35518027,-2.636565647 53.35505107,-2.636258261 53.354965,-2.635477454 53.35471351,-2.635008016 53.35456728,-2.634656133 53.35444461,-2.634469454 53.35431979,-2.634403116 53.35440731,-2.63425788 53.35474516,-2.634113641 53.35494191,-2.633950352 53.35528255,-2.632575891 53.35490245,-2.632233555 53.35520719,-2.632071694 53.35536926,-2.63184661 53.35557804,-2.631608214 53.35579209,-2.630836185 53.35638612,-2.630249779 53.35685884,-2.629762455 53.3572382,-2.629696966 53.35721609,-2.629671084 53.35719106,-2.629914256 53.35673741,-2.630007817 53.35658883,-2.62982389 53.35654846,-2.629577281 53.35644192,-2.629684466 53.35636848,-2.629877509 53.35621111,-2.63027752 53.35582609,-2.630229262 53.35581286,-2.629983626 53.35574659,-2.629927443 53.35573394,-2.62980002 53.35571143,-2.629189378 53.355599,-2.629323648 53.35532862,-2.629638798 53.35540513,-2.629853419 
53.35509029,-2.630029088 53.35487279,-2.630113139 53.35489455,-2.630322542 53.35460427,-2.630274311 53.35459284,-2.630385887 53.35441423,-2.630503172 53.35432016,-2.630238926 53.35423529,-2.630147221 53.35412335,-2.630084971 53.35397631,-2.630229717 53.35391264,-2.630356307 53.35383729,-2.630576983 53.3537238,-2.630792318 53.35365976,-2.630942067 53.35362572,-2.631139727 53.35358055,-2.631196349 53.3535479,-2.631225404 53.35347766,-2.631282835 53.35339917,-2.631387062 53.35333571,-2.631354942 53.35319956,-2.631342389 53.35306178,-2.631411528 53.35286273,-2.63121451 53.35284131,-2.63134322 53.35250625,-2.632175723 53.35262316,-2.632232647 53.35251052,-2.632445365 53.35226219,-2.632692377 53.35200468,-2.633013939 53.3521136,-2.635172302 53.3526864,-2.635545184 53.35280573,-2.635979617 53.35238314,-2.636600521 53.35179005,-2.637117499 53.35125666,-2.637200141 53.35124166,-2.637489241 53.35149829,-2.637706892 53.35169042,-2.637759235 53.35176203,-2.637890637 53.35191858,-2.637933025 53.35194532,-2.637997827 53.35195486,-2.638008 53.35203038,-2.638046643 53.3522036,-2.63810637 53.35238822,-2.638371036 53.3530064,-2.638376316 53.3530531,-2.638432739 53.35321456,-2.638441974 53.35332782,-2.638406463 53.3534628,-2.638263964 53.35368292,-2.63826113 53.35372553,-2.638780194 53.35384461,-2.638675041 53.353945,-2.638514619 53.35405163,-2.637348667 53.3548279,-2.636788444 53.35518027)))",conservation_areas_polygon.2,Moore,2 -"MULTIPOLYGON (((-2.818057261 53.33653314,-2.818072201 53.33662793,-2.818048504 53.33670959,-2.818016747 53.33681657,-2.818011453 53.3369213,-2.817737254 53.3368805,-2.81740636 53.33681357,-2.817138523 53.33674881,-2.816795325 53.33665331,-2.816509739 53.33657285,-2.816313127 53.33654149,-2.816037071 53.33651957,-2.815695268 53.3365166,-2.815706931 53.3365832,-2.815623603 53.33708233,-2.815504485 53.33706293,-2.815359707 53.33704855,-2.814933572 53.3370006,-2.814830927 53.33698108,-2.814745702 53.33696621,-2.814819917 53.33655332,-2.814533705 53.33653748,-2.814024696 53.33647392,-2.813709761 53.3364323,-2.813043392 53.33634321,-2.812550771 53.33628195,-2.812208893 53.33626227,-2.810991874 53.336185,-2.810380068 53.3361462,-2.810200041 53.33613305,-2.809900268 53.33611378,-2.809097205 53.33608328,-2.809040959 53.33628451,-2.808997658 53.3365286,-2.80864196 53.33651826,-2.808642644 53.33655437,-2.808291097 53.33654041,-2.807848924 53.33658636,-2.807358971 53.33663272,-2.806916306 53.33667894,-2.806211092 53.33676233,-2.805577301 53.33682422,-2.805276572 53.33675436,-2.805194226 53.33681953,-2.804935377 53.33707946,-2.804730496 53.33701623,-2.804929974 53.33679265,-2.805105693 53.33657632,-2.805498734 53.33646602,-2.805701722 53.33642897,-2.805692708 53.33553112,-2.805850845 53.33524932,-2.806388786 53.33538479,-2.806593808 53.33545502,-2.806980072 53.33560293,-2.807896916 53.33594818,-2.807856331 53.33570709,-2.807852626 53.33551112,-2.807866672 53.33541604,-2.807658952 53.33530089,-2.807403283 53.33517699,-2.807113138 53.33505009,-2.807877939 53.33494436,-2.808524394 53.33485263,-2.808904452 53.33475111,-2.808891381 53.33470797,-2.809165076 53.33461311,-2.809675582 53.33435857,-2.809973991 53.33430649,-2.810225419 53.33430478,-2.810485018 53.33430868,-2.810477659 53.33433875,-2.81064544 53.33435198,-2.810873522 53.33437901,-2.811078266 53.33443521,-2.811330918 53.33449084,-2.811523901 53.33454685,-2.811692226 53.33458866,-2.811848647 53.33462354,-2.812044833 53.33464575,-2.812156644 53.33463987,-2.812363002 53.33459838,-2.812546442 53.33454968,-2.812729951 53.33446594,-2.812743027 
53.33442388,-2.812825443 53.33433714,-2.812968369 53.33430049,-2.813718721 53.33408015,-2.814112523 53.33400584,-2.81491254 53.33387844,-2.815307393 53.33383979,-2.815929168 53.33379259,-2.816312301 53.33377568,-2.816700177 53.33389066,-2.816768075 53.3338085,-2.816839463 53.33378636,-2.816922881 53.33377851,-2.816995457 53.33379257,-2.817066814 53.33384663,-2.817121283 53.33392371,-2.817093601 53.33399723,-2.817034104 53.33402631,-2.81698645 53.33403392,-2.816962441 53.33403408,-2.817011177 53.33407679,-2.817035613 53.33410529,-2.816954309 53.3341727,-2.816884428 53.33427374,-2.816778237 53.33443766,-2.816690504 53.33478405,-2.816552321 53.33538637,-2.816597831 53.33555886,-2.81663733 53.33575944,-2.816713246 53.33597405,-2.816750638 53.3360384,-2.816836199 53.33611645,-2.816992823 53.33618736,-2.817528781 53.3363839,-2.818057261 53.33653314)))",conservation_areas_polygon.3,Hale Road,3 -"MULTIPOLYGON (((-2.800712286 53.33601316,-2.800577236 53.33614724,-2.799749745 53.33588959,-2.799243757 53.33570813,-2.798774901 53.33600899,-2.798493827 53.33593565,-2.798361589 53.33591596,-2.797758862 53.33575932,-2.797173054 53.33559896,-2.796880148 53.3355222,-2.796489482 53.33540853,-2.796420519 53.33538176,-2.796076406 53.33530174,-2.796001131 53.33526827,-2.795915513 53.33525842,-2.7955957 53.33519289,-2.79515818 53.33510953,-2.795402822 53.33474216,-2.795725393 53.3342712,-2.79503064 53.33421041,-2.795071289 53.33407309,-2.794656544 53.33402832,-2.794810306 53.3334883,-2.794191919 53.33337128,-2.794526347 53.33308562,-2.794547885 53.33269898,-2.794534379 53.3322786,-2.795293212 53.33223247,-2.795829977 53.3322014,-2.796290185 53.33206856,-2.796387655 53.33238926,-2.796449104 53.332611,-2.79649222 53.33278522,-2.796498685 53.33283317,-2.796488063 53.33286756,-2.796832456 53.33296438,-2.797056179 53.3330106,-2.797251185 53.33304363,-2.797446243 53.33308671,-2.797584887 53.33317817,-2.797718285 53.33327981,-2.797764792 53.33331041,-2.797804782 53.33331689,-2.797868811 53.33337792,-2.79792064 53.33341523,-2.798002396 53.33349673,-2.798094967 53.33355425,-2.798170663 53.33363606,-2.798240142 53.33368331,-2.798303602 53.3337138,-2.798539757 53.33382149,-2.799334956 53.33418892,-2.799433033 53.33423634,-2.799479411 53.33425993,-2.799519853 53.33429048,-2.79956099 53.33435177,-2.79957961 53.33441993,-2.799609737 53.33450196,-2.799599453 53.33458074,-2.799600283 53.33462514,-2.799669319 53.33464848,-2.799781593 53.33453487,-2.799826702 53.33451065,-2.799877956 53.33448982,-2.799957351 53.33447213,-2.800192027 53.33447396,-2.800432272 53.33449644,-2.800793067 53.33454182,-2.800891014 53.33458215,-2.801012327 53.33464648,-2.801098771 53.33470045,-2.801157376 53.33477536,-2.801204342 53.33485026,-2.801267948 53.33488749,-2.801400199 53.33493441,-2.801400903 53.33497196,-2.801224772 53.33531194,-2.801046528 53.33557301,-2.800846135 53.33584791,-2.800712286 53.33601316)))",conservation_areas_polygon.4,Hale Village,4 -"MULTIPOLYGON (((-2.790712725 53.35446721,-2.790377376 53.35438631,-2.790051664 53.35431622,-2.789762662 53.35422935,-2.789528745 53.35417365,-2.789326418 53.35411766,-2.789053861 53.3540287,-2.788806058 53.35393203,-2.788689254 53.35386486,-2.788376538 53.3544992,-2.788069551 53.35445127,-2.788086977 53.35440146,-2.787445568 53.35429472,-2.787508212 53.35415834,-2.787561505 53.35406642,-2.787643683 53.35398806,-2.788128491 53.35362414,-2.787136543 53.35314813,-2.786643729 53.35356206,-2.786259603 53.35342754,-2.785835856 53.353287,-2.785653206 53.3532368,-2.785347185 53.35316558,-2.785197592 
53.35309432,-2.785075711 53.35303419,-2.784712362 53.35293099,-2.78481744 53.35205639,-2.78647051 53.35272531,-2.786655432 53.35269093,-2.787728074 53.35309704,-2.78782969 53.3530048,-2.788677774 53.35324327,-2.789265133 53.35342252,-2.791441846 53.35408621,-2.791311306 53.35424128,-2.790910579 53.35412739,-2.790712725 53.35446721)))",conservation_areas_polygon.5,Halebank,5 -"MULTIPOLYGON (((-2.693226824 53.33533116,-2.693296915 53.33535465,-2.693447308 53.33538298,-2.693766008 53.33503902,-2.694326867 53.33435119,-2.69390786 53.33424273,-2.694022089 53.33413334,-2.694088557 53.33400965,-2.694132135 53.33386445,-2.694716469 53.33363999,-2.694568925 53.33356581,-2.69451442 53.33353378,-2.694467318 53.33349254,-2.694419837 53.33342793,-2.694403322 53.33336342,-2.694402418 53.3333078,-2.694398869 53.33323449,-2.694420663 53.33299084,-2.694459534 53.33255604,-2.694531237 53.3321994,-2.694563584 53.33199531,-2.694349961 53.33199234,-2.693956893 53.33203173,-2.693920008 53.33215444,-2.693834283 53.33247924,-2.693506996 53.33245293,-2.693322018 53.33243766,-2.693299499 53.33246115,-2.693113412 53.33242287,-2.693115212 53.33253384,-2.69304052 53.33270052,-2.692916968 53.33292203,-2.692839717 53.33313653,-2.69263046 53.3331355,-2.692543481 53.33350597,-2.692492472 53.33377397,-2.692769627 53.33379285,-2.692706555 53.3341559,-2.69233724 53.33412686,-2.692233176 53.33413079,-2.69211744 53.33411771,-2.692029153 53.33417044,-2.691922085 53.33424941,-2.691884559 53.33430975,-2.691425994 53.33415991,-2.691262495 53.33406363,-2.691214873 53.33399004,-2.691131358 53.33390236,-2.690997526 53.33375739,-2.690902181 53.33363456,-2.690863406 53.33355642,-2.690879591 53.33339368,-2.690942942 53.33325312,-2.691182788 53.33296354,-2.691120654 53.33294989,-2.69114324 53.33289889,-2.691187855 53.33278658,-2.691362354 53.33254482,-2.691731171 53.3319938,-2.69203961 53.3314833,-2.692127515 53.33117743,-2.69215062 53.3311818,-2.692235619 53.33118607,-2.69232458 53.33093807,-2.692406731 53.33076658,-2.692480081 53.33054869,-2.692572756 53.3305069,-2.692703046 53.33046427,-2.692731291 53.3302975,-2.692970509 53.33029135,-2.693449371 53.33027481,-2.69373605 53.33033327,-2.693876931 53.33033263,-2.694347794 53.33041058,-2.694499625 53.3302235,-2.695198031 53.33043798,-2.695849884 53.3306148,-2.696052812 53.33069081,-2.696712289 53.33081816,-2.697119621 53.33101608,-2.69754263 53.33115038,-2.697769665 53.33131566,-2.697973701 53.33151333,-2.698127929 53.33150318,-2.698359611 53.33149031,-2.698730019 53.33147439,-2.698730428 53.33190079,-2.69751652 53.33190224,-2.697527219 53.33210329,-2.697929993 53.33214253,-2.697834892 53.33246696,-2.697672655 53.3324724,-2.697487477 53.33248274,-2.697505374 53.33263073,-2.697476316 53.33274655,-2.697871201 53.3328182,-2.697818959 53.3329249,-2.697751629 53.33305479,-2.69776913 53.33317941,-2.697809158 53.33327192,-2.69769693 53.33349669,-2.697651997 53.33348177,-2.697502943 53.33364197,-2.697260503 53.33386131,-2.697386613 53.33389624,-2.696965018 53.33428305,-2.696937879 53.33431969,-2.696886094 53.33439386,-2.696464994 53.33472782,-2.696350312 53.33477873,-2.696269107 53.33474972,-2.696030994 53.3351138,-2.695978741 53.33522024,-2.69588599 53.33531127,-2.695391548 53.33520991,-2.695300796 53.33533993,-2.69490457 53.33524789,-2.694533773 53.33567725,-2.693661056 53.33540007,-2.693468982 53.33564975,-2.693408188 53.33584115,-2.693394225 53.33593388,-2.693405456 53.33601613,-2.692886454 53.33601915,-2.692800244 53.33600194,-2.693086706 53.33554836,-2.693226824 
53.33533116)))",conservation_areas_polygon.6,Halton,6 -"MULTIPOLYGON (((-2.740832295 53.33468831,-2.740774378 53.33470412,-2.740729413 53.33473163,-2.740576448 53.33489972,-2.740471575 53.33485769,-2.740163367 53.33467673,-2.739776702 53.33448476,-2.739396747 53.3343081,-2.739100317 53.33403487,-2.738614277 53.33433964,-2.738126537 53.33405869,-2.738344515 53.33391833,-2.738193911 53.33383748,-2.73783865 53.33359085,-2.738315965 53.3334053,-2.738347899 53.33335451,-2.738366415 53.33331539,-2.738397973 53.33326461,-2.73842355 53.33322571,-2.738494385 53.33319796,-2.738546168 53.33317831,-2.738597114 53.33313891,-2.738602132 53.33304956,-2.738653109 53.33298328,-2.738716925 53.33292834,-2.738749262 53.33290091,-2.738806842 53.33286568,-2.738929794 53.33283769,-2.739387243 53.33262081,-2.739445829 53.33264381,-2.739493371 53.33237914,-2.739292084 53.33238039,-2.739292685 53.33206442,-2.739294682 53.33201579,-2.739416824 53.33200453,-2.739566003 53.33196855,-2.739619462 53.33183064,-2.739657438 53.33174306,-2.739690331 53.33169056,-2.73977611 53.33147498,-2.739877962 53.33148702,-2.739858283 53.33157332,-2.739910045 53.33162368,-2.739933578 53.33164555,-2.739981816 53.33167194,-2.74002462 53.33170537,-2.74011294 53.33176494,-2.740183349 53.33179173,-2.740245867 53.33182675,-2.740299769 53.33187234,-2.740368275 53.33191756,-2.740509567 53.33200538,-2.740569578 53.33204563,-2.740590792 53.33204801,-2.740721173 53.3319914,-2.740861324 53.33201335,-2.74108604 53.33202715,-2.741272575 53.33203865,-2.741213768 53.33231032,-2.741190153 53.33241192,-2.741186286 53.33243225,-2.741065766 53.33256483,-2.740965493 53.33264399,-2.74145198 53.33258867,-2.741847763 53.33254343,-2.742106692 53.33251064,-2.74230605 53.33240435,-2.7425218 53.33247328,-2.742725714 53.33253671,-2.742628679 53.33263185,-2.742533022 53.33274146,-2.742820511 53.33284075,-2.742552625 53.33311435,-2.742179645 53.33296139,-2.741687054 53.33339984,-2.741373897 53.33331247,-2.74119488 53.33348458,-2.741358187 53.33354171,-2.741567543 53.33362614,-2.741849133 53.3337643,-2.741997656 53.3340743,-2.741961483 53.33423017,-2.741578133 53.33423642,-2.741298276 53.33419916,-2.741346737 53.33436627,-2.741365882 53.33448477,-2.741251892 53.33453006,-2.740884141 53.33467225,-2.740832295 53.33468831)))",conservation_areas_polygon.7,Higher Runcorn,7 -"MULTIPOLYGON (((-2.731365138 53.36304468,-2.731052514 53.36289105,-2.730753708 53.36285019,-2.730775291 53.36271436,-2.730528121 53.36267796,-2.730293235 53.36264372,-2.7301736 53.3623781,-2.730115326 53.36223836,-2.730214908 53.36223658,-2.730306619 53.36224554,-2.730462919 53.36227047,-2.730911737 53.36231768,-2.730834842 53.36202349,-2.731276574 53.3619905,-2.731209877 53.36171279,-2.731165039 53.36153298,-2.731115774 53.36131842,-2.731169578 53.36130578,-2.731816175 53.36125455,-2.73230487 53.36121453,-2.732349418 53.36137799,-2.732394289 53.36158133,-2.732426347 53.36175251,-2.732679175 53.36173612,-2.732669859 53.36169153,-2.733237626 53.36163825,-2.73318875 53.36143995,-2.733375316 53.36142398,-2.733416563 53.36141609,-2.733436861 53.36140105,-2.733445155 53.36138617,-2.733440486 53.36136625,-2.733444475 53.36134654,-2.73378871 53.36132079,-2.733841011 53.36153812,-2.734018676 53.36155248,-2.734215568 53.36161094,-2.734334009 53.36168354,-2.73436375 53.36171957,-2.734385347 53.36175035,-2.734398249 53.36178756,-2.734403008 53.36181961,-2.734395235 53.36185695,-2.734387608 53.3618818,-2.73437118 53.36190688,-2.734350959 53.36193424,-2.734314069 53.36197185,-2.734248386 53.36200676,-2.733942146 
53.3623042,-2.733913597 53.36233664,-2.733897916 53.36237637,-2.733871264 53.36251923,-2.733854417 53.36274291,-2.7338336 53.36298126,-2.733808387 53.36323672,-2.73378582 53.36353152,-2.733264628 53.36350606,-2.73238955 53.36347126,-2.732449797 53.36325684,-2.732489069 53.36300409,-2.73217143 53.36302095,-2.731920554 53.36344781,-2.731483464 53.36335614,-2.731344315 53.36359404,-2.731089517 53.36353989,-2.731045976 53.36352982,-2.731182425 53.36329301,-2.731365138 53.36304468)))",conservation_areas_polygon.8,Victoria Square,8 -"MULTIPOLYGON (((-2.736877605 53.34817381,-2.736795079 53.34829419,-2.736459974 53.34821773,-2.736395546 53.34832407,-2.735848013 53.34820379,-2.735440525 53.34809623,-2.735485553 53.34803556,-2.735479503 53.34801485,-2.735450219 53.34800487,-2.735188767 53.34800504,-2.734754161 53.34803414,-2.734437979 53.34807149,-2.733871621 53.34814552,-2.733629276 53.34819113,-2.733567172 53.34821021,-2.73349168 53.34832875,-2.733391322 53.34849256,-2.73353946 53.34853577,-2.733709633 53.34857885,-2.733775993 53.34859156,-2.73351788 53.34900328,-2.733088655 53.34891327,-2.732974132 53.34910358,-2.732848424 53.34908243,-2.732720228 53.34927004,-2.732693717 53.34926229,-2.7326477 53.34925862,-2.732252867 53.34917424,-2.732239008 53.34921845,-2.731575118 53.34908791,-2.731755394 53.34896135,-2.731930051 53.34883214,-2.732023572 53.34869507,-2.732139047 53.34853117,-2.732239789 53.34841139,-2.732378294 53.34830468,-2.732553371 53.34819325,-2.732714155 53.34812173,-2.732823009 53.34804152,-2.733399399 53.34792449,-2.73372664 53.34787502,-2.733989311 53.34783746,-2.733987646 53.34781923,-2.734271922 53.34777758,-2.734368324 53.34776576,-2.734484079 53.34775606,-2.735450016 53.34770517,-2.73553026 53.34769723,-2.73558289 53.3476969,-2.735678721 53.34770952,-2.73582312 53.3477302,-2.735844805 53.34770193,-2.735969788 53.34771689,-2.735973808 53.34770608,-2.736173686 53.34773306,-2.736164597 53.34775217,-2.736445193 53.34779869,-2.736644898 53.34783736,-2.73675611 53.3478586,-2.736837767 53.34788901,-2.736948943 53.34793694,-2.736933718 53.34805925,-2.736877605 53.34817381)))",conservation_areas_polygon.9,West Bank Promenade,9 -"MULTIPOLYGON (((-2.737815014 53.31762619,-2.737724903 53.31751747,-2.737662642 53.3174386,-2.737645776 53.31741543,-2.737606452 53.31739572,-2.737695017 53.3173853,-2.73786617 53.31738424,-2.738225421 53.31738201,-2.738556706 53.31738669,-2.738988829 53.31744665,-2.739000602 53.31747641,-2.73944373 53.31754331,-2.739620426 53.31756243,-2.739914589 53.31759655,-2.740269854 53.31771343,-2.740556057 53.31782352,-2.740972025 53.31803014,-2.741211848 53.31817756,-2.741425596 53.31832738,-2.741620439 53.31847165,-2.741439807 53.31857558,-2.74115851 53.31869873,-2.741171574 53.31871716,-2.740875946 53.31887797,-2.740591704 53.31903843,-2.740524197 53.31899249,-2.740355129 53.31908573,-2.740471219 53.3191285,-2.740566161 53.31916449,-2.740654744 53.31918353,-2.740726563 53.31919324,-2.740743614 53.31927716,-2.740612847 53.31931616,-2.740061315 53.31958756,-2.74017288 53.31966891,-2.740072953 53.31972399,-2.739981088 53.31977991,-2.739731568 53.31983762,-2.739669639 53.31984169,-2.739632507 53.31983518,-2.739593064 53.31980874,-2.739518854 53.31982573,-2.739510522 53.31980925,-2.739493738 53.31981268,-2.739485546 53.31980428,-2.739364488 53.31982498,-2.739364289 53.31981348,-2.739169057 53.3198394,-2.739025833 53.31983184,-2.739059173 53.31969801,-2.738730595 53.31965727,-2.738799409 53.3194852,-2.7388649 53.3193146,-2.73851779 53.31931846,-2.738477073 
53.31916082,-2.738241064 53.31918367,-2.737988366 53.31921552,-2.7379235 53.31905021,-2.737875111 53.31885668,-2.737544883 53.31893483,-2.737646818 53.31878116,-2.73734971 53.31882821,-2.737353134 53.31878298,-2.737165674 53.31846279,-2.736631508 53.31850033,-2.736615351 53.31846079,-2.736371293 53.31851209,-2.73629566 53.3182808,-2.736237196 53.31810584,-2.736822256 53.31805585,-2.737114813 53.31804731,-2.737441848 53.31804896,-2.737563381 53.31804821,-2.737866683 53.31800986,-2.738075154 53.31796867,-2.737815014 53.31762619)))",conservation_areas_polygon.10,Weston Village,10 +WKT,gml_id,lowerCorner,upperCorner,Name,Ref +"MULTIPOLYGON (((-2.632272212 53.34206401,-2.632000357 53.34224581,-2.630564119 53.3422481,-2.631009551 53.34186124,-2.631269515 53.34167052,-2.631587148 53.34143591,-2.631736085 53.34132451,-2.630234541 53.34106187,-2.630381899 53.34083419,-2.630430719 53.34083671,-2.630473829 53.3408045,-2.630570696 53.34078062,-2.63062419 53.34078034,-2.630642956 53.3407513,-2.630594227 53.34072236,-2.630564563 53.34068459,-2.630568851 53.34065563,-2.63063132 53.34056742,-2.630716303 53.34044359,-2.630861659 53.34027486,-2.630946484 53.34019947,-2.631384962 53.34029905,-2.63171138 53.3398375,-2.632213184 53.33997422,-2.632657969 53.34008985,-2.632914151 53.33957816,-2.632885686 53.33957175,-2.63293775 53.33946589,-2.632883467 53.3394554,-2.632917069 53.33939897,-2.63292785 53.33935829,-2.632925513 53.33917516,-2.63294476 53.33914532,-2.633046457 53.33865511,-2.633644084 53.33866515,-2.633816397 53.33841963,-2.634026323 53.33798843,-2.634336105 53.33736035,-2.634500904 53.33711002,-2.634540774 53.33710981,-2.634559936 53.33707439,-2.634913572 53.33709021,-2.635470784 53.33712481,-2.636015216 53.33662362,-2.637044871 53.33681168,-2.636935924 53.33706199,-2.636748823 53.33748571,-2.636502934 53.33801909,-2.636344157 53.33835863,-2.636267224 53.33851333,-2.635995827 53.33894819,-2.635870308 53.3391595,-2.635738612 53.33938773,-2.63565651 53.33952134,-2.635579679 53.33962563,-2.635375517 53.3399026,-2.6351317 53.34017833,-2.634725706 53.34052297,-2.634310927 53.34081048,-2.63398409 53.34102574,-2.633800358 53.34113598,-2.633595926 53.34124607,-2.633165035 53.34146691,-2.633066806 53.34151541,-2.632769196 53.34173949,-2.632272212 53.34206401)))",conservation_areas_polygon.1,53.336623621792555 -2.6370448708677467,53.34224810480465 -2.630234540634663,Daresbury,1 +"MULTIPOLYGON (((-2.636788444 53.35518027,-2.636565647 53.35505107,-2.636258261 53.354965,-2.635477454 53.35471351,-2.635008016 53.35456728,-2.634656133 53.35444461,-2.634469454 53.35431979,-2.634403116 53.35440731,-2.63425788 53.35474516,-2.634113641 53.35494191,-2.633950352 53.35528255,-2.632575891 53.35490245,-2.632233555 53.35520719,-2.632071694 53.35536926,-2.63184661 53.35557804,-2.631608214 53.35579209,-2.630836185 53.35638612,-2.630249779 53.35685884,-2.629762455 53.3572382,-2.629696966 53.35721609,-2.629671084 53.35719106,-2.629914256 53.35673741,-2.630007817 53.35658883,-2.62982389 53.35654846,-2.629577281 53.35644192,-2.629684466 53.35636848,-2.629877509 53.35621111,-2.63027752 53.35582609,-2.630229262 53.35581286,-2.629983626 53.35574659,-2.629927443 53.35573394,-2.62980002 53.35571143,-2.629189378 53.355599,-2.629323648 53.35532862,-2.629638798 53.35540513,-2.629853419 53.35509029,-2.630029088 53.35487279,-2.630113139 53.35489455,-2.630322542 53.35460427,-2.630274311 53.35459284,-2.630385887 53.35441423,-2.630503172 53.35432016,-2.630238926 53.35423529,-2.630147221 53.35412335,-2.630084971 53.35397631,-2.630229717 
53.35391264,-2.630356307 53.35383729,-2.630576983 53.3537238,-2.630792318 53.35365976,-2.630942067 53.35362572,-2.631139727 53.35358055,-2.631196349 53.3535479,-2.631225404 53.35347766,-2.631282835 53.35339917,-2.631387062 53.35333571,-2.631354942 53.35319956,-2.631342389 53.35306178,-2.631411528 53.35286273,-2.63121451 53.35284131,-2.63134322 53.35250625,-2.632175723 53.35262316,-2.632232647 53.35251052,-2.632445365 53.35226219,-2.632692377 53.35200468,-2.633013939 53.3521136,-2.635172302 53.3526864,-2.635545184 53.35280573,-2.635979617 53.35238314,-2.636600521 53.35179005,-2.637117499 53.35125666,-2.637200141 53.35124166,-2.637489241 53.35149829,-2.637706892 53.35169042,-2.637759235 53.35176203,-2.637890637 53.35191858,-2.637933025 53.35194532,-2.637997827 53.35195486,-2.638008 53.35203038,-2.638046643 53.3522036,-2.63810637 53.35238822,-2.638371036 53.3530064,-2.638376316 53.3530531,-2.638432739 53.35321456,-2.638441974 53.35332782,-2.638406463 53.3534628,-2.638263964 53.35368292,-2.63826113 53.35372553,-2.638780194 53.35384461,-2.638675041 53.353945,-2.638514619 53.35405163,-2.637348667 53.3548279,-2.636788444 53.35518027)))",conservation_areas_polygon.2,53.351241662102865 -2.6387801935178166,53.35723820484838 -2.629189378252195,Moore,2 +"MULTIPOLYGON (((-2.818057261 53.33653314,-2.818072201 53.33662793,-2.818048504 53.33670959,-2.818016747 53.33681657,-2.818011453 53.3369213,-2.817737254 53.3368805,-2.81740636 53.33681357,-2.817138523 53.33674881,-2.816795325 53.33665331,-2.816509739 53.33657285,-2.816313127 53.33654149,-2.816037071 53.33651957,-2.815695268 53.3365166,-2.815706931 53.3365832,-2.815623603 53.33708233,-2.815504485 53.33706293,-2.815359707 53.33704855,-2.814933572 53.3370006,-2.814830927 53.33698108,-2.814745702 53.33696621,-2.814819917 53.33655332,-2.814533705 53.33653748,-2.814024696 53.33647392,-2.813709761 53.3364323,-2.813043392 53.33634321,-2.812550771 53.33628195,-2.812208893 53.33626227,-2.810991874 53.336185,-2.810380068 53.3361462,-2.810200041 53.33613305,-2.809900268 53.33611378,-2.809097205 53.33608328,-2.809040959 53.33628451,-2.808997658 53.3365286,-2.80864196 53.33651826,-2.808642644 53.33655437,-2.808291097 53.33654041,-2.807848924 53.33658636,-2.807358971 53.33663272,-2.806916306 53.33667894,-2.806211092 53.33676233,-2.805577301 53.33682422,-2.805276572 53.33675436,-2.805194226 53.33681953,-2.804935377 53.33707946,-2.804730496 53.33701623,-2.804929974 53.33679265,-2.805105693 53.33657632,-2.805498734 53.33646602,-2.805701722 53.33642897,-2.805692708 53.33553112,-2.805850845 53.33524932,-2.806388786 53.33538479,-2.806593808 53.33545502,-2.806980072 53.33560293,-2.807896916 53.33594818,-2.807856331 53.33570709,-2.807852626 53.33551112,-2.807866672 53.33541604,-2.807658952 53.33530089,-2.807403283 53.33517699,-2.807113138 53.33505009,-2.807877939 53.33494436,-2.808524394 53.33485263,-2.808904452 53.33475111,-2.808891381 53.33470797,-2.809165076 53.33461311,-2.809675582 53.33435857,-2.809973991 53.33430649,-2.810225419 53.33430478,-2.810485018 53.33430868,-2.810477659 53.33433875,-2.81064544 53.33435198,-2.810873522 53.33437901,-2.811078266 53.33443521,-2.811330918 53.33449084,-2.811523901 53.33454685,-2.811692226 53.33458866,-2.811848647 53.33462354,-2.812044833 53.33464575,-2.812156644 53.33463987,-2.812363002 53.33459838,-2.812546442 53.33454968,-2.812729951 53.33446594,-2.812743027 53.33442388,-2.812825443 53.33433714,-2.812968369 53.33430049,-2.813718721 53.33408015,-2.814112523 53.33400584,-2.81491254 53.33387844,-2.815307393 53.33383979,-2.815929168 
53.33379259,-2.816312301 53.33377568,-2.816700177 53.33389066,-2.816768075 53.3338085,-2.816839463 53.33378636,-2.816922881 53.33377851,-2.816995457 53.33379257,-2.817066814 53.33384663,-2.817121283 53.33392371,-2.817093601 53.33399723,-2.817034104 53.33402631,-2.81698645 53.33403392,-2.816962441 53.33403408,-2.817011177 53.33407679,-2.817035613 53.33410529,-2.816954309 53.3341727,-2.816884428 53.33427374,-2.816778237 53.33443766,-2.816690504 53.33478405,-2.816552321 53.33538637,-2.816597831 53.33555886,-2.81663733 53.33575944,-2.816713246 53.33597405,-2.816750638 53.3360384,-2.816836199 53.33611645,-2.816992823 53.33618736,-2.817528781 53.3363839,-2.818057261 53.33653314)))",conservation_areas_polygon.3,53.33377567729678 -2.81807220104152,53.3370823316775 -2.8047304963029385,Hale Road,3 +"MULTIPOLYGON (((-2.800712286 53.33601316,-2.800577236 53.33614724,-2.799749745 53.33588959,-2.799243757 53.33570813,-2.798774901 53.33600899,-2.798493827 53.33593565,-2.798361589 53.33591596,-2.797758862 53.33575932,-2.797173054 53.33559896,-2.796880148 53.3355222,-2.796489482 53.33540853,-2.796420519 53.33538176,-2.796076406 53.33530174,-2.796001131 53.33526827,-2.795915513 53.33525842,-2.7955957 53.33519289,-2.79515818 53.33510953,-2.795402822 53.33474216,-2.795725393 53.3342712,-2.79503064 53.33421041,-2.795071289 53.33407309,-2.794656544 53.33402832,-2.794810306 53.3334883,-2.794191919 53.33337128,-2.794526347 53.33308562,-2.794547885 53.33269898,-2.794534379 53.3322786,-2.795293212 53.33223247,-2.795829977 53.3322014,-2.796290185 53.33206856,-2.796387655 53.33238926,-2.796449104 53.332611,-2.79649222 53.33278522,-2.796498685 53.33283317,-2.796488063 53.33286756,-2.796832456 53.33296438,-2.797056179 53.3330106,-2.797251185 53.33304363,-2.797446243 53.33308671,-2.797584887 53.33317817,-2.797718285 53.33327981,-2.797764792 53.33331041,-2.797804782 53.33331689,-2.797868811 53.33337792,-2.79792064 53.33341523,-2.798002396 53.33349673,-2.798094967 53.33355425,-2.798170663 53.33363606,-2.798240142 53.33368331,-2.798303602 53.3337138,-2.798539757 53.33382149,-2.799334956 53.33418892,-2.799433033 53.33423634,-2.799479411 53.33425993,-2.799519853 53.33429048,-2.79956099 53.33435177,-2.79957961 53.33441993,-2.799609737 53.33450196,-2.799599453 53.33458074,-2.799600283 53.33462514,-2.799669319 53.33464848,-2.799781593 53.33453487,-2.799826702 53.33451065,-2.799877956 53.33448982,-2.799957351 53.33447213,-2.800192027 53.33447396,-2.800432272 53.33449644,-2.800793067 53.33454182,-2.800891014 53.33458215,-2.801012327 53.33464648,-2.801098771 53.33470045,-2.801157376 53.33477536,-2.801204342 53.33485026,-2.801267948 53.33488749,-2.801400199 53.33493441,-2.801400903 53.33497196,-2.801224772 53.33531194,-2.801046528 53.33557301,-2.800846135 53.33584791,-2.800712286 53.33601316)))",conservation_areas_polygon.4,53.33206856009858 -2.8014009026934446,53.33614724444181 -2.794191918978806,Hale Village,4 +"MULTIPOLYGON (((-2.790712725 53.35446721,-2.790377376 53.35438631,-2.790051664 53.35431622,-2.789762662 53.35422935,-2.789528745 53.35417365,-2.789326418 53.35411766,-2.789053861 53.3540287,-2.788806058 53.35393203,-2.788689254 53.35386486,-2.788376538 53.3544992,-2.788069551 53.35445127,-2.788086977 53.35440146,-2.787445568 53.35429472,-2.787508212 53.35415834,-2.787561505 53.35406642,-2.787643683 53.35398806,-2.788128491 53.35362414,-2.787136543 53.35314813,-2.786643729 53.35356206,-2.786259603 53.35342754,-2.785835856 53.353287,-2.785653206 53.3532368,-2.785347185 53.35316558,-2.785197592 53.35309432,-2.785075711 
53.35303419,-2.784712362 53.35293099,-2.78481744 53.35205639,-2.78647051 53.35272531,-2.786655432 53.35269093,-2.787728074 53.35309704,-2.78782969 53.3530048,-2.788677774 53.35324327,-2.789265133 53.35342252,-2.791441846 53.35408621,-2.791311306 53.35424128,-2.790910579 53.35412739,-2.790712725 53.35446721)))",conservation_areas_polygon.5,53.35205638586151 -2.7914418459304846,53.35449920453541 -2.7847123619887992,Halebank,5 +"MULTIPOLYGON (((-2.693226824 53.33533116,-2.693296915 53.33535465,-2.693447308 53.33538298,-2.693766008 53.33503902,-2.694326867 53.33435119,-2.69390786 53.33424273,-2.694022089 53.33413334,-2.694088557 53.33400965,-2.694132135 53.33386445,-2.694716469 53.33363999,-2.694568925 53.33356581,-2.69451442 53.33353378,-2.694467318 53.33349254,-2.694419837 53.33342793,-2.694403322 53.33336342,-2.694402418 53.3333078,-2.694398869 53.33323449,-2.694420663 53.33299084,-2.694459534 53.33255604,-2.694531237 53.3321994,-2.694563584 53.33199531,-2.694349961 53.33199234,-2.693956893 53.33203173,-2.693920008 53.33215444,-2.693834283 53.33247924,-2.693506996 53.33245293,-2.693322018 53.33243766,-2.693299499 53.33246115,-2.693113412 53.33242287,-2.693115212 53.33253384,-2.69304052 53.33270052,-2.692916968 53.33292203,-2.692839717 53.33313653,-2.69263046 53.3331355,-2.692543481 53.33350597,-2.692492472 53.33377397,-2.692769627 53.33379285,-2.692706555 53.3341559,-2.69233724 53.33412686,-2.692233176 53.33413079,-2.69211744 53.33411771,-2.692029153 53.33417044,-2.691922085 53.33424941,-2.691884559 53.33430975,-2.691425994 53.33415991,-2.691262495 53.33406363,-2.691214873 53.33399004,-2.691131358 53.33390236,-2.690997526 53.33375739,-2.690902181 53.33363456,-2.690863406 53.33355642,-2.690879591 53.33339368,-2.690942942 53.33325312,-2.691182788 53.33296354,-2.691120654 53.33294989,-2.69114324 53.33289889,-2.691187855 53.33278658,-2.691362354 53.33254482,-2.691731171 53.3319938,-2.69203961 53.3314833,-2.692127515 53.33117743,-2.69215062 53.3311818,-2.692235619 53.33118607,-2.69232458 53.33093807,-2.692406731 53.33076658,-2.692480081 53.33054869,-2.692572756 53.3305069,-2.692703046 53.33046427,-2.692731291 53.3302975,-2.692970509 53.33029135,-2.693449371 53.33027481,-2.69373605 53.33033327,-2.693876931 53.33033263,-2.694347794 53.33041058,-2.694499625 53.3302235,-2.695198031 53.33043798,-2.695849884 53.3306148,-2.696052812 53.33069081,-2.696712289 53.33081816,-2.697119621 53.33101608,-2.69754263 53.33115038,-2.697769665 53.33131566,-2.697973701 53.33151333,-2.698127929 53.33150318,-2.698359611 53.33149031,-2.698730019 53.33147439,-2.698730428 53.33190079,-2.69751652 53.33190224,-2.697527219 53.33210329,-2.697929993 53.33214253,-2.697834892 53.33246696,-2.697672655 53.3324724,-2.697487477 53.33248274,-2.697505374 53.33263073,-2.697476316 53.33274655,-2.697871201 53.3328182,-2.697818959 53.3329249,-2.697751629 53.33305479,-2.69776913 53.33317941,-2.697809158 53.33327192,-2.69769693 53.33349669,-2.697651997 53.33348177,-2.697502943 53.33364197,-2.697260503 53.33386131,-2.697386613 53.33389624,-2.696965018 53.33428305,-2.696937879 53.33431969,-2.696886094 53.33439386,-2.696464994 53.33472782,-2.696350312 53.33477873,-2.696269107 53.33474972,-2.696030994 53.3351138,-2.695978741 53.33522024,-2.69588599 53.33531127,-2.695391548 53.33520991,-2.695300796 53.33533993,-2.69490457 53.33524789,-2.694533773 53.33567725,-2.693661056 53.33540007,-2.693468982 53.33564975,-2.693408188 53.33584115,-2.693394225 53.33593388,-2.693405456 53.33601613,-2.692886454 53.33601915,-2.692800244 53.33600194,-2.693086706 
53.33554836,-2.693226824 53.33533116)))",conservation_areas_polygon.6,53.33022350189056 -2.6987304284150504,53.336019148385034 -2.6908634063722134,Halton,6 +"MULTIPOLYGON (((-2.740832295 53.33468831,-2.740774378 53.33470412,-2.740729413 53.33473163,-2.740576448 53.33489972,-2.740471575 53.33485769,-2.740163367 53.33467673,-2.739776702 53.33448476,-2.739396747 53.3343081,-2.739100317 53.33403487,-2.738614277 53.33433964,-2.738126537 53.33405869,-2.738344515 53.33391833,-2.738193911 53.33383748,-2.73783865 53.33359085,-2.738315965 53.3334053,-2.738347899 53.33335451,-2.738366415 53.33331539,-2.738397973 53.33326461,-2.73842355 53.33322571,-2.738494385 53.33319796,-2.738546168 53.33317831,-2.738597114 53.33313891,-2.738602132 53.33304956,-2.738653109 53.33298328,-2.738716925 53.33292834,-2.738749262 53.33290091,-2.738806842 53.33286568,-2.738929794 53.33283769,-2.739387243 53.33262081,-2.739445829 53.33264381,-2.739493371 53.33237914,-2.739292084 53.33238039,-2.739292685 53.33206442,-2.739294682 53.33201579,-2.739416824 53.33200453,-2.739566003 53.33196855,-2.739619462 53.33183064,-2.739657438 53.33174306,-2.739690331 53.33169056,-2.73977611 53.33147498,-2.739877962 53.33148702,-2.739858283 53.33157332,-2.739910045 53.33162368,-2.739933578 53.33164555,-2.739981816 53.33167194,-2.74002462 53.33170537,-2.74011294 53.33176494,-2.740183349 53.33179173,-2.740245867 53.33182675,-2.740299769 53.33187234,-2.740368275 53.33191756,-2.740509567 53.33200538,-2.740569578 53.33204563,-2.740590792 53.33204801,-2.740721173 53.3319914,-2.740861324 53.33201335,-2.74108604 53.33202715,-2.741272575 53.33203865,-2.741213768 53.33231032,-2.741190153 53.33241192,-2.741186286 53.33243225,-2.741065766 53.33256483,-2.740965493 53.33264399,-2.74145198 53.33258867,-2.741847763 53.33254343,-2.742106692 53.33251064,-2.74230605 53.33240435,-2.7425218 53.33247328,-2.742725714 53.33253671,-2.742628679 53.33263185,-2.742533022 53.33274146,-2.742820511 53.33284075,-2.742552625 53.33311435,-2.742179645 53.33296139,-2.741687054 53.33339984,-2.741373897 53.33331247,-2.74119488 53.33348458,-2.741358187 53.33354171,-2.741567543 53.33362614,-2.741849133 53.3337643,-2.741997656 53.3340743,-2.741961483 53.33423017,-2.741578133 53.33423642,-2.741298276 53.33419916,-2.741346737 53.33436627,-2.741365882 53.33448477,-2.741251892 53.33453006,-2.740884141 53.33467225,-2.740832295 53.33468831)))",conservation_areas_polygon.7,53.331474975214284 -2.742820510669728,53.334899721691386 -2.737838650202599,Higher Runcorn,7 +"MULTIPOLYGON (((-2.731365138 53.36304468,-2.731052514 53.36289105,-2.730753708 53.36285019,-2.730775291 53.36271436,-2.730528121 53.36267796,-2.730293235 53.36264372,-2.7301736 53.3623781,-2.730115326 53.36223836,-2.730214908 53.36223658,-2.730306619 53.36224554,-2.730462919 53.36227047,-2.730911737 53.36231768,-2.730834842 53.36202349,-2.731276574 53.3619905,-2.731209877 53.36171279,-2.731165039 53.36153298,-2.731115774 53.36131842,-2.731169578 53.36130578,-2.731816175 53.36125455,-2.73230487 53.36121453,-2.732349418 53.36137799,-2.732394289 53.36158133,-2.732426347 53.36175251,-2.732679175 53.36173612,-2.732669859 53.36169153,-2.733237626 53.36163825,-2.73318875 53.36143995,-2.733375316 53.36142398,-2.733416563 53.36141609,-2.733436861 53.36140105,-2.733445155 53.36138617,-2.733440486 53.36136625,-2.733444475 53.36134654,-2.73378871 53.36132079,-2.733841011 53.36153812,-2.734018676 53.36155248,-2.734215568 53.36161094,-2.734334009 53.36168354,-2.73436375 53.36171957,-2.734385347 53.36175035,-2.734398249 
53.36178756,-2.734403008 53.36181961,-2.734395235 53.36185695,-2.734387608 53.3618818,-2.73437118 53.36190688,-2.734350959 53.36193424,-2.734314069 53.36197185,-2.734248386 53.36200676,-2.733942146 53.3623042,-2.733913597 53.36233664,-2.733897916 53.36237637,-2.733871264 53.36251923,-2.733854417 53.36274291,-2.7338336 53.36298126,-2.733808387 53.36323672,-2.73378582 53.36353152,-2.733264628 53.36350606,-2.73238955 53.36347126,-2.732449797 53.36325684,-2.732489069 53.36300409,-2.73217143 53.36302095,-2.731920554 53.36344781,-2.731483464 53.36335614,-2.731344315 53.36359404,-2.731089517 53.36353989,-2.731045976 53.36352982,-2.731182425 53.36329301,-2.731365138 53.36304468)))",conservation_areas_polygon.8,53.36121452724432 -2.7344030078801715,53.36359404388266 -2.730115325658145,Victoria Square,8 +"MULTIPOLYGON (((-2.736877605 53.34817381,-2.736795079 53.34829419,-2.736459974 53.34821773,-2.736395546 53.34832407,-2.735848013 53.34820379,-2.735440525 53.34809623,-2.735485553 53.34803556,-2.735479503 53.34801485,-2.735450219 53.34800487,-2.735188767 53.34800504,-2.734754161 53.34803414,-2.734437979 53.34807149,-2.733871621 53.34814552,-2.733629276 53.34819113,-2.733567172 53.34821021,-2.73349168 53.34832875,-2.733391322 53.34849256,-2.73353946 53.34853577,-2.733709633 53.34857885,-2.733775993 53.34859156,-2.73351788 53.34900328,-2.733088655 53.34891327,-2.732974132 53.34910358,-2.732848424 53.34908243,-2.732720228 53.34927004,-2.732693717 53.34926229,-2.7326477 53.34925862,-2.732252867 53.34917424,-2.732239008 53.34921845,-2.731575118 53.34908791,-2.731755394 53.34896135,-2.731930051 53.34883214,-2.732023572 53.34869507,-2.732139047 53.34853117,-2.732239789 53.34841139,-2.732378294 53.34830468,-2.732553371 53.34819325,-2.732714155 53.34812173,-2.732823009 53.34804152,-2.733399399 53.34792449,-2.73372664 53.34787502,-2.733989311 53.34783746,-2.733987646 53.34781923,-2.734271922 53.34777758,-2.734368324 53.34776576,-2.734484079 53.34775606,-2.735450016 53.34770517,-2.73553026 53.34769723,-2.73558289 53.3476969,-2.735678721 53.34770952,-2.73582312 53.3477302,-2.735844805 53.34770193,-2.735969788 53.34771689,-2.735973808 53.34770608,-2.736173686 53.34773306,-2.736164597 53.34775217,-2.736445193 53.34779869,-2.736644898 53.34783736,-2.73675611 53.3478586,-2.736837767 53.34788901,-2.736948943 53.34793694,-2.736933718 53.34805925,-2.736877605 53.34817381)))",conservation_areas_polygon.9,53.347696902912006 -2.7369489434384215,53.3492700401932 -2.7315751180850665,West Bank Promenade,9 +"MULTIPOLYGON (((-2.737815014 53.31762619,-2.737724903 53.31751747,-2.737662642 53.3174386,-2.737645776 53.31741543,-2.737606452 53.31739572,-2.737695017 53.3173853,-2.73786617 53.31738424,-2.738225421 53.31738201,-2.738556706 53.31738669,-2.738988829 53.31744665,-2.739000602 53.31747641,-2.73944373 53.31754331,-2.739620426 53.31756243,-2.739914589 53.31759655,-2.740269854 53.31771343,-2.740556057 53.31782352,-2.740972025 53.31803014,-2.741211848 53.31817756,-2.741425596 53.31832738,-2.741620439 53.31847165,-2.741439807 53.31857558,-2.74115851 53.31869873,-2.741171574 53.31871716,-2.740875946 53.31887797,-2.740591704 53.31903843,-2.740524197 53.31899249,-2.740355129 53.31908573,-2.740471219 53.3191285,-2.740566161 53.31916449,-2.740654744 53.31918353,-2.740726563 53.31919324,-2.740743614 53.31927716,-2.740612847 53.31931616,-2.740061315 53.31958756,-2.74017288 53.31966891,-2.740072953 53.31972399,-2.739981088 53.31977991,-2.739731568 53.31983762,-2.739669639 53.31984169,-2.739632507 53.31983518,-2.739593064 
53.31980874,-2.739518854 53.31982573,-2.739510522 53.31980925,-2.739493738 53.31981268,-2.739485546 53.31980428,-2.739364488 53.31982498,-2.739364289 53.31981348,-2.739169057 53.3198394,-2.739025833 53.31983184,-2.739059173 53.31969801,-2.738730595 53.31965727,-2.738799409 53.3194852,-2.7388649 53.3193146,-2.73851779 53.31931846,-2.738477073 53.31916082,-2.738241064 53.31918367,-2.737988366 53.31921552,-2.7379235 53.31905021,-2.737875111 53.31885668,-2.737544883 53.31893483,-2.737646818 53.31878116,-2.73734971 53.31882821,-2.737353134 53.31878298,-2.737165674 53.31846279,-2.736631508 53.31850033,-2.736615351 53.31846079,-2.736371293 53.31851209,-2.73629566 53.3182808,-2.736237196 53.31810584,-2.736822256 53.31805585,-2.737114813 53.31804731,-2.737441848 53.31804896,-2.737563381 53.31804821,-2.737866683 53.31800986,-2.738075154 53.31796867,-2.737815014 53.31762619)))",conservation_areas_polygon.10,53.31738201292983 -2.7416204385738534,53.31984169271952 -2.736237195970133,Weston Village,10 From f1167ef4fb5ceda8601e87d39b65ce37534c5895 Mon Sep 17 00:00:00 2001 From: CarlosCoelhoSL <110818364+CarlosCoelhoSL@users.noreply.github.com> Date: Tue, 20 May 2025 15:25:22 +0100 Subject: [PATCH 08/17] Add data dataset update (#408) * initial commit * adds tests * adds acceptance test * updated mask fixes tests fixes output path * updates diff calculation for new entities * adds function comments * restructures get user response * test * test --- digital_land/commands.py | 102 ++++++-- digital_land/utils/add_data_utils.py | 128 ++++++++++ tests/acceptance/test_add_data.py | 81 ++++++- tests/data/specification/dataset.csv | 4 +- tests/integration/test_add_data_utils.py | 293 +++++++++++++++++++++++ tests/unit/test_add_data_utils.py | 18 +- 6 files changed, 598 insertions(+), 28 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 46fe18295..f88559c48 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -65,10 +65,13 @@ from digital_land.state import State from digital_land.utils.add_data_utils import ( clear_log, + download_dataset, get_column_field_summary, + get_transformed_entities, get_entity_summary, get_existing_endpoints_summary, get_issue_summary, + get_updated_entities_summary, is_date_valid, is_url_valid, get_user_response, @@ -932,38 +935,42 @@ def add_data( add_data_cache_dir = cache_dir / "add_data" - output_path = ( - add_data_cache_dir - / "transformed/" - / (endpoint_resource_info["resource"] + ".csv") - ) - - issue_dir = add_data_cache_dir / "issue/" - column_field_dir = add_data_cache_dir / "column_field/" - dataset_resource_dir = add_data_cache_dir / "dataset_resource/" - converted_resource_dir = add_data_cache_dir / "converted_resource/" - converted_dir = add_data_cache_dir / "converted/" - output_log_dir = add_data_cache_dir / "log/" - operational_issue_dir = add_data_cache_dir / "performance/ " / "operational_issue/" - - output_path.parent.mkdir(parents=True, exist_ok=True) - issue_dir.mkdir(parents=True, exist_ok=True) - column_field_dir.mkdir(parents=True, exist_ok=True) - dataset_resource_dir.mkdir(parents=True, exist_ok=True) - converted_resource_dir.mkdir(parents=True, exist_ok=True) - converted_dir.mkdir(parents=True, exist_ok=True) - output_log_dir.mkdir(parents=True, exist_ok=True) - operational_issue_dir.mkdir(parents=True, exist_ok=True) - collection.load_log_items() for dataset in endpoint_resource_info["pipelines"]: + pipeline = Pipeline(pipeline_dir, dataset) + specification = Specification(specification_dir) + + 
issue_dir = add_data_cache_dir / "issue/" / dataset + column_field_dir = add_data_cache_dir / "column_field/" / dataset + dataset_resource_dir = add_data_cache_dir / "dataset_resource/" / dataset + converted_resource_dir = add_data_cache_dir / "converted_resource/" + converted_dir = add_data_cache_dir / "converted/" + output_log_dir = add_data_cache_dir / "log/" + operational_issue_dir = ( + add_data_cache_dir / "performance/ " / "operational_issue/" + ) + output_path = ( + add_data_cache_dir + / "transformed/" + / dataset + / (endpoint_resource_info["resource"] + ".csv") + ) + + output_path.parent.mkdir(parents=True, exist_ok=True) + issue_dir.mkdir(parents=True, exist_ok=True) + column_field_dir.mkdir(parents=True, exist_ok=True) + dataset_resource_dir.mkdir(parents=True, exist_ok=True) + converted_resource_dir.mkdir(parents=True, exist_ok=True) + converted_dir.mkdir(parents=True, exist_ok=True) + output_log_dir.mkdir(parents=True, exist_ok=True) + operational_issue_dir.mkdir(parents=True, exist_ok=True) print("======================================================================") print("Run pipeline") print("======================================================================") try: pipeline_run( dataset, - Pipeline(pipeline_dir, dataset), + pipeline, Specification(specification_dir), endpoint_resource_info["resource_path"], output_path=output_path, @@ -1113,6 +1120,11 @@ def add_data( shutil.copy(cache_pipeline_dir / "lookup.csv", pipeline_dir / "lookup.csv") # Now check for existing endpoints for this provision/organisation + print( + "\n======================================================================" + ) + print("Retire old endpoints/sources") + print("======================================================================") existing_endpoints_summary, existing_sources = get_existing_endpoints_summary( endpoint_resource_info, collection, dataset ) @@ -1132,6 +1144,48 @@ def add_data( pd.DataFrame.from_records(sources_to_retire) ) + # Update dataset and view newly updated dataset + print( + "\n======================================================================" + ) + print("Update dataset") + print("======================================================================") + if get_user_response( + f"""\nDo you want to view an updated {dataset} dataset with the newly added data? 
+ \nNote this requires downloading the dataset if not already done so - + for some datasets this can take a while \n\n(yes/no): """ + ): + dataset_path = download_dataset(dataset, specification, cache_dir) + original_entities = get_transformed_entities(dataset_path, output_path) + print(f"Updating {dataset}.sqlite3 with new data...") + dataset_update( + input_paths=[output_path], + output_path=None, + organisation_path=organisation_path, + pipeline=pipeline, + dataset=dataset, + specification=specification, + issue_dir=os.path.split(issue_dir)[0], + column_field_dir=os.path.split(column_field_dir)[0], + dataset_resource_dir=os.path.split(dataset_resource_dir)[0], + dataset_path=dataset_path, + ) + updated_entities = get_transformed_entities(dataset_path, output_path) + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entities, updated_entities + ) + print(updated_entities_summary) + if diffs_df is not None: + diffs_path = ( + add_data_cache_dir + / dataset + / "diffs" + / f"{endpoint_resource_info['resource']}.csv" + ) + os.makedirs(os.path.dirname(diffs_path)) + diffs_df.to_csv(diffs_path) + print(f"\nDetailed breakdown found in file: {diffs_path}") + def add_endpoints_and_lookups( csv_file_path, diff --git a/digital_land/utils/add_data_utils.py b/digital_land/utils/add_data_utils.py index eb6b75c88..0d6dac1f8 100644 --- a/digital_land/utils/add_data_utils.py +++ b/digital_land/utils/add_data_utils.py @@ -1,11 +1,14 @@ import csv +import json import os import duckdb +import sqlite3 from datetime import datetime from urllib.parse import urlparse import pandas as pd +from digital_land.api import API from digital_land.collect import Collector from digital_land.pipeline.main import Pipeline from digital_land.specification import Specification @@ -295,3 +298,128 @@ def get_existing_endpoints_summary(endpoint_resource_info, collection, dataset): ) return existing_endpoints_summary, retirable_sources + + +def download_dataset(dataset, specification, cache_dir): + # Download existing dataset + api = API(specification=specification, cache_dir=cache_dir) + dataset_path = os.path.join(cache_dir, "dataset", f"{dataset}.sqlite3") + # Determine whether to download new copy of dataset or use cached version + download = True + if os.path.exists(dataset_path): + print(f"\nExisting dataset at {dataset_path} detected") + if get_user_response( + "Do you want to use the existing dataset (otherwise download a fresh version)? (yes/no): " + ): + download = False + if download: + print(f"Downloading {dataset}.sqlite3...") + api.download_dataset( + dataset=dataset, + overwrite=True, + path=dataset_path, + extension=api.Extension.SQLITE3, + ) + + return dataset_path + + +def get_transformed_entities(dataset_path, transformed_path): + """ + Returns a Dataframe of entities from a dataset. 
+ It returns entities that have facts in the transformed file at `transformed_path` + """ + entities = pd.read_csv(transformed_path)["entity"].unique().tolist() + entity_list_str = ", ".join(str(e) for e in entities) + sql = f"SELECT * FROM entity WHERE entity IN ({entity_list_str})" + + with sqlite3.connect(dataset_path) as conn: + entities_df = pd.read_sql_query(sql, conn) + + return entities_df + + +def normalise_json(val): + """ + Returns a sorted stringified json + """ + # This function accepts a stringified json + # It returns a sorted stringified json of the input + try: + return json.dumps(json.loads(val), sort_keys=True) + except Exception: + return val # if failure to pass just return original string + + +def get_updated_entities_summary(original_entity_df, updated_entity_df): + """ + This will return a summary of the differences between two dataframes of the same entities + """ + # replace None/nan with "" for consistent comparison + original_entity_df = original_entity_df.fillna("") + updated_entity_df = updated_entity_df.fillna("") + + original_entity_df = original_entity_df.set_index("entity").sort_index() + updated_entity_df = updated_entity_df.set_index("entity").sort_index() + + # filter out newly added entities, store them in a separate df + new_entities_df = updated_entity_df.loc[ + ~updated_entity_df.index.isin(original_entity_df.index) + ] + updated_entity_df = updated_entity_df.loc[ + updated_entity_df.index.isin(original_entity_df.index) + ] + + # the json column can get reordered in the update dataset process + # load json into dict and sort keys to ensure comparison is correct + if "json" in original_entity_df.columns: + original_entity_df["json"] = original_entity_df["json"].apply(normalise_json) + updated_entity_df["json"] = updated_entity_df["json"].apply(normalise_json) + new_entities_df["json"] = new_entities_df["json"].apply(normalise_json) + + # find differences + mask = ~( + (original_entity_df == updated_entity_df) + | (original_entity_df.isna() & updated_entity_df.isna()) + ) + diff_positions = mask.stack() + # dataframe of which values have changed. 
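+    # mask.stack() yields a boolean Series indexed by (entity, field); keeping only the True entries leaves one row per changed cell, whose index is then used to look up the before and after values from the stacked frames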
+ changed = diff_positions[diff_positions] + diffs = pd.DataFrame( + { + "entity": changed.index.get_level_values(0), + "field": changed.index.get_level_values(1), + "original_value": original_entity_df.stack()[changed.index], + "updated_value": updated_entity_df.stack()[changed.index], + "new_entity": False, + } + ).reset_index(drop=True) + + # add diffs for new entities + if not new_entities_df.empty: + new_diffs = new_entities_df.reset_index().melt( + id_vars=["entity"], var_name="field", value_name="updated_value" + ) + new_diffs["original_value"] = None + new_diffs["new_entity"] = True + # Reorder columns to match + new_diffs = new_diffs[ + ["entity", "field", "original_value", "updated_value", "new_entity"] + ] + + # Concatenate with existing diffs + diffs = pd.concat([diffs, new_diffs], ignore_index=True) + + updated_entities_summary = "" + if len(diffs) > 0: + diffs_df = pd.DataFrame(diffs) + grouped_diffs = diffs_df.groupby("entity")["field"].apply(list).reset_index() + updated_entities_summary += "\nChanged fields by entity:\n" + for _, row in grouped_diffs.iterrows(): + updated_entities_summary += ( + f"\nEntity: {row['entity']}, Fields changed: {', '.join(row['field'])}" + ) + return updated_entities_summary, diffs_df + else: + updated_entities_summary += "\nNo differences found in updated dataset" + return updated_entities_summary, None diff --git a/tests/acceptance/test_add_data.py b/tests/acceptance/test_add_data.py index 9dd9a0ee3..d17ca6d71 100644 --- a/tests/acceptance/test_add_data.py +++ b/tests/acceptance/test_add_data.py @@ -1,9 +1,11 @@ import csv from datetime import datetime import os +from pathlib import Path +import shutil import tempfile from unittest import mock -from unittest.mock import Mock +from unittest.mock import Mock, patch from click.testing import CliRunner import pandas as pd import pytest @@ -152,6 +154,19 @@ def mock_request_get_no_reference(mocker): ) +@pytest.fixture +def mock_download_dataset(): + original_dataset_path = Path("tests/data/dataset/central-activities-zone.sqlite3") + updated_dataset_path = tempfile.NamedTemporaryFile(suffix=".sqlite3").name + # copy so we can update a version to compare to original + shutil.copy(original_dataset_path, updated_dataset_path) + with patch( + "digital_land.commands.download_dataset", + return_value=Path(updated_dataset_path), + ) as mock: + yield mock + + def create_input_csv( data, fieldnames=[ @@ -181,6 +196,7 @@ def test_cli_add_data( cache_dir, organisation_csv, mock_request_get, + mock_download_dataset, monkeypatch, ): no_error_input_data = { @@ -243,6 +259,7 @@ def test_cli_add_data_incorrect_input_data( pipeline_dir, organisation_csv, mock_request_get, + mock_download_dataset, cache_dir, ): incorrect_input_data = { @@ -287,6 +304,7 @@ def test_cli_add_data_consecutive_runs( pipeline_dir, organisation_csv, mock_request_get, + mock_download_dataset, monkeypatch, cache_dir, ): @@ -365,6 +383,7 @@ def test_cli_add_data_pipeline_fail( cache_dir, organisation_csv, mock_request_get, + mock_download_dataset, monkeypatch, ): no_error_input_data = { @@ -417,6 +436,7 @@ def test_cli_add_data_remaining_unassigned_entities( cache_dir, organisation_csv, mock_request_get_no_reference, + mock_download_dataset, monkeypatch, ): no_error_input_data = { @@ -464,6 +484,7 @@ def test_cli_add_data_old_endpoints_retired( cache_dir, organisation_csv, mock_request_get, + mock_download_dataset, monkeypatch, ): no_error_input_data = { @@ -552,3 +573,61 @@ def test_cli_add_data_old_endpoints_retired( source_df = 
pd.read_csv(os.path.join(collection_dir, "source.csv")) assert source_df["end-date"].values[0] == datetime.utcnow().isoformat()[:10] + + +# Add acceptance test +# @patch("digital_land.commands.API.download_dataset", return_value=Path("tests/data/dataset/central-activities-zone.sqlite3")) +def test_cli_add_data_update_dataset( + collection_dir, + specification_dir, + pipeline_dir, + cache_dir, + organisation_csv, + mock_request_get, + mock_download_dataset, + monkeypatch, +): + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + csv_path = create_input_csv(no_error_input_data) + + # Mock in user input + monkeypatch.setattr("builtins.input", lambda _: "yes") + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--pipeline-dir", + str(pipeline_dir), + "--organisation-path", + str(organisation_csv), + "--cache-dir", + str(cache_dir), + ], + ) + if result.exit_code != 0: + # Print the command output if the test fails, gives more detail on what's gone wrong + print("Command failed with exit code:", result.exit_code) + print("Command output:") + print(result.output) + print("Command error output:") + print(result.exception) + + assert result.exit_code == 0 + print("result std out", result.stdout) + assert "Entity: 44000000, Fields changed:" in result.stdout diff --git a/tests/data/specification/dataset.csv b/tests/data/specification/dataset.csv index aeee29c91..39eab0c86 100644 --- a/tests/data/specification/dataset.csv +++ b/tests/data/specification/dataset.csv @@ -1,5 +1,5 @@ -dataset,name,text,typology,prefix -dataset-one,"First Dataset","Text of first dataset",, +dataset,name,text,typology,prefix,collection +dataset-one,"First Dataset","Text of first dataset",,,collection-one dataset-two,"Second Dataset","Text of first dataset",, dataset-three,"Third Dataset","Text of third dataset",, tree-preservation-zone-type,"Types of zone covered by the tree preservation order","Tree preservation zone type",category, diff --git a/tests/integration/test_add_data_utils.py b/tests/integration/test_add_data_utils.py index c76221e98..4f7f520f7 100644 --- a/tests/integration/test_add_data_utils.py +++ b/tests/integration/test_add_data_utils.py @@ -1,15 +1,23 @@ import csv from datetime import datetime import os +import shutil +import tempfile +from unittest.mock import Mock +import pandas as pd import pytest from digital_land.collection import Collection +from digital_land.specification import Specification from digital_land.utils.add_data_utils import ( clear_log, + download_dataset, get_column_field_summary, get_entity_summary, get_existing_endpoints_summary, get_issue_summary, + get_transformed_entities, + get_updated_entities_summary, ) @@ -1037,3 +1045,288 @@ def test_get_existing_endpoints_ended_source_with_no_endpoint(tmp_path): assert not existing_endpoints_summary assert len(existing_sources) == 0 + + +def test_download_dataset(tmp_path_factory, mocker): + dataset = "dataset-one" + specification_dir = "tests/data/specification" + specification = Specification(specification_dir) + # 
create temp cache dir + cache_dir = tmp_path_factory.mktemp("cache") + + # mock api download url + sqlite_file_path = "tests/data/dataset/central-activities-zone.sqlite3" + with open(sqlite_file_path, "rb") as f: + data = f.read() + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = data + mocker.patch("requests.get", return_value=mock_response) + + download_dataset(dataset, specification, cache_dir) + + path = os.path.join(cache_dir, "dataset", f"{dataset}.sqlite3") + assert os.path.exists(path) + + +def test_download_dataset_use_cache_dataset(tmp_path_factory, mocker): + dataset = "dataset-one" + specification_dir = "tests/data/specification" + specification = Specification(specification_dir) + # create temp cache dir + cache_dir = tmp_path_factory.mktemp("cache") + + path = os.path.join(cache_dir, "dataset", f"{dataset}.sqlite3") + # put db file in cache dir + sqlite_file_path = "tests/data/dataset/central-activities-zone.sqlite3" + os.makedirs(os.path.dirname(path)) + shutil.copy(sqlite_file_path, path) + + # mock user response + mocker.patch( + "digital_land.utils.add_data_utils.get_user_response", return_value=True + ) + mock_get = mocker.patch("requests.get") + + download_dataset(dataset, specification, cache_dir) + + # assert requests.get was NOT called + mock_get.assert_not_called() + + +def test_get_transformed_entities(): + output_path = tempfile.NamedTemporaryFile().name + dataset_path = "tests/data/dataset/central-activities-zone.sqlite3" + + transformed_headers = [ + "end-date", + "entity", + "entry-date", + "entry-number", + "fact", + "field", + "priority", + "reference-entity", + "resource", + "start-date", + "value", + ] + transformed_rows = [ + { + "end-date": "", + "entity": 2200001, + "entry-date": "", + "entry-number": 1, + "fact": "fact1", + "field": "field1", + "priority": "", + "reference-entity": "", + "resource": "resource", + "start-date": "", + "value": "value1", + }, + { + "end-date": "", + "entity": 2200002, + "entry-date": "", + "entry-number": 2, + "fact": "fact2", + "field": "field1", + "priority": "", + "reference-entity": "", + "resource": "resource", + "start-date": "", + "value": "value1", + }, + ] + with open(output_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=transformed_headers) + writer.writeheader() + writer.writerows(transformed_rows) + + entities = get_transformed_entities(dataset_path, output_path) + + assert len(entities) == 2 + assert entities.iloc[0]["entity"] == 2200001 + assert entities.iloc[0]["reference"] == "CAZ00000001" + assert entities.iloc[1]["entity"] == 2200002 + assert entities.iloc[1]["reference"] == "CAZ00000002" + + +def test_get_updated_entities_summary_new_entity(): + original_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + } + ] + ) + updated_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + }, + { + "end-date": "", + "entity": 2200002, + "dataset": "", + "json": "json", + "name": "name2", + "reference": "ref2", + }, + ] + ) + + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entity_df, updated_entity_df + ) + + assert len(diffs_df) == 5 + assert "end-date" in diffs_df["field"].values + assert "dataset" in diffs_df["field"].values + 
assert "name" in diffs_df["field"].values + assert "reference" in diffs_df["field"].values + assert "json" in diffs_df["field"].values + + assert "original_value" in diffs_df.columns + assert all(not value for value in diffs_df["original_value"].values) + + assert "updated_value" in diffs_df.columns + assert diffs_df[diffs_df["field"] == "name"]["updated_value"].values[0] == "name2" + + assert all(value for value in diffs_df["new_entity"].values) + + assert ( + "Entity: 2200002, Fields changed: end-date, dataset, json, name, reference" + in updated_entities_summary + ) + assert "Entity: 2200001" not in updated_entities_summary + + +def test_get_updated_entities_summary_updated_entity(): + original_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + } + ] + ) + updated_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "updated end date", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "updated name", + "reference": "ref1", + } + ] + ) + + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entity_df, updated_entity_df + ) + + assert len(diffs_df) == 2 + + assert diffs_df[diffs_df["field"] == "name"]["original_value"].values[0] == "name1" + assert ( + diffs_df[diffs_df["field"] == "name"]["updated_value"].values[0] + == "updated name" + ) + + assert diffs_df[diffs_df["field"] == "end-date"]["original_value"].values[0] == "" + assert ( + diffs_df[diffs_df["field"] == "end-date"]["updated_value"].values[0] + == "updated end date" + ) + + assert not all(value == "" for value in diffs_df["new_entity"].values) + + assert "Entity: 2200001, Fields changed: end-date, name" in updated_entities_summary + + +def test_get_updated_entities_summary_no_updates(): + original_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + } + ] + ) + updated_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "name1", + "reference": "ref1", + } + ] + ) + + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entity_df, updated_entity_df + ) + + assert "No differences found in updated dataset" in updated_entities_summary + assert not diffs_df + + +def test_get_updated_entities_summary_updated_entity_none_agnostic(): + original_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": "", + "entity": 2200001, + "dataset": "", + "json": "json", + "name": None, + "reference": "ref1", + } + ] + ) + updated_entity_df = pd.DataFrame.from_records( + [ + { + "end-date": None, + "entity": 2200001, + "dataset": "", + "json": "json", + "name": "", + "reference": "ref1", + } + ] + ) + + updated_entities_summary, diffs_df = get_updated_entities_summary( + original_entity_df, updated_entity_df + ) + + assert "No differences found in updated dataset" in updated_entities_summary + assert not diffs_df diff --git a/tests/unit/test_add_data_utils.py b/tests/unit/test_add_data_utils.py index 4e89129fd..124c9c6e5 100644 --- a/tests/unit/test_add_data_utils.py +++ b/tests/unit/test_add_data_utils.py @@ -1,7 +1,11 @@ import pytest from digital_land.commands import is_url_valid -from digital_land.utils.add_data_utils import get_user_response, is_date_valid +from digital_land.utils.add_data_utils import ( + get_user_response, + is_date_valid, + normalise_json, +) 
def test_is_url_valid(): @@ -88,3 +92,15 @@ def test_get_user_response_fail(monkeypatch): result = get_user_response("message") assert not result + + +def test_normalise_json(): + json_string = '{"secondproperty": "secondvalue", "firstproperty": "firstvalue"}' + + sorted_json_string = normalise_json(json_string) + + # ensure json is sorted + assert isinstance(sorted_json_string, str) + assert sorted_json_string.find("firstproperty") < sorted_json_string.find( + "secondproperty" + ) From 86f6766f0986489f56e5156311c4ccc07e960993 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Wed, 14 May 2025 15:13:06 +0100 Subject: [PATCH 09/17] generate provision quality dataset --- digital_land/utils/functions_core.py | 73 ++++++++ .../utils/generate_provision_quality.py | 175 ++++++++++++++++++ 2 files changed, 248 insertions(+) create mode 100644 digital_land/utils/functions_core.py create mode 100644 digital_land/utils/generate_provision_quality.py diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py new file mode 100644 index 000000000..887fa5c09 --- /dev/null +++ b/digital_land/utils/functions_core.py @@ -0,0 +1,73 @@ +import urllib +import os +import sqlite3 +import pandas as pd +import geopandas as gpd +import shapely.wkt + + +global FILES_URL + +FILES_URL = "https://datasette.planning.data.gov.uk/" + + +def download_dataset(dataset, output_dir_path, overwrite=False): + dataset_file_name = f"{dataset}.db" + + if not os.path.exists(output_dir_path): + os.makedirs(output_dir_path) + + output_file_path = os.path.join(output_dir_path, dataset_file_name) + + if overwrite is False and os.path.exists(output_file_path): + return + + final_url = os.path.join(FILES_URL, dataset_file_name) + print(f"downloading data from {final_url}") + print(f"to: {output_file_path}") + urllib.request.urlretrieve( + final_url, os.path.join(output_dir_path, dataset_file_name) + ) + print("download complete") + + +def get_pdp_dataset( + dataset, geometry_field="geometry", crs_out=4326, underscore_cols=True +): + + df = pd.read_csv( + f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype="str" + ) + df.columns = [x.replace("-", "_") for x in df.columns] + + df_valid_geom = df[df[geometry_field].notnull()].copy() + + # load geometry and create GDF + df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply( + shapely.wkt.loads + ) + gdf = gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field) + + # Transform to ESPG:27700 for more interpretable area units + gdf.set_crs(epsg=4326, inplace=True) + gdf.to_crs(epsg=crs_out, inplace=True) + + return gdf + + +def query_sqlite(db_path, query_string): + + with sqlite3.connect(db_path) as con: + + cursor = con.execute(query_string) + cols = [column[0] for column in cursor.description] + results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols) + + return results_df + + +def datasette_query(db, sql_string): + params = urllib.parse.urlencode({"sql": sql_string, "_size": "max"}) + url = f"https://datasette.planning.data.gov.uk/{db}.csv?{params}" + df = pd.read_csv(url) + return df diff --git a/digital_land/utils/generate_provision_quality.py b/digital_land/utils/generate_provision_quality.py new file mode 100644 index 000000000..7ea8f6f73 --- /dev/null +++ b/digital_land/utils/generate_provision_quality.py @@ -0,0 +1,175 @@ +import os +import pandas as pd +import numpy as np +import json +from datetime import datetime +from digital_land.utils import functions_core as fc + + +def generate_provision_quality(): + 
""" + Generates a provision quality dataset and saves it as a parquet file. + """ + td = datetime.today().strftime("%Y-%m-%d") + + # Create the temporary download directory + db_dir = os.path.join("/tmp", "db_downloads") + os.makedirs(db_dir, exist_ok=True) + + # Download the performance db + fc.download_dataset("performance", db_dir, overwrite=False) + path_perf_db = os.path.join(db_dir, "performance.db") + + # Issue quality criteria lookup + lookup_issue_qual = fc.datasette_query( + "digital-land", + """ + SELECT + description, + issue_type, + name, + severity, + responsibility, + quality_criteria_level || " - " || quality_criteria as quality_criteria, + quality_criteria_level as quality_level + FROM issue_type + WHERE quality_criteria_level != '' + AND quality_criteria != '' + """, + ) + + # Transform data + provision = fc.query_sqlite( + path_perf_db, + """ + SELECT organisation, dataset, active_endpoint_count + FROM provision_summary + """, + ) + + # Extract issue count by provision from endpoint_dataset_issue_type_summary + qual_issue = fc.query_sqlite( + path_perf_db, + """ + SELECT + organisation, dataset, + 'issue' as problem_source, + issue_type as problem_type, + sum(count_issues) as count + FROM endpoint_dataset_issue_type_summary + WHERE resource_end_date is not NULL + AND issue_type is not NULL + GROUP BY organisation, dataset, issue_type + """, + ) + + # Join on quality criteria and level from issue_type lookup (this restricts to only issues linked to a quality criteria) + qual_issue = qual_issue.merge( + lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]], + how="inner", + left_on="problem_type", + right_on="issue_type", + ) + qual_issue.drop("issue_type", axis=1, inplace=True) + + # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds + qual_expectation_bounds = fc.datasette_query( + "digital-land", + """ + SELECT organisation, dataset, details + FROM expectation + WHERE 1=1 + AND name = 'Check no entities are outside of the local planning authority boundary' + AND passed = 'False' + AND message not like '%error%' + """, + ) + + qual_expectation_bounds["problem_source"] = "expectation" + qual_expectation_bounds["problem_type"] = ( + "entity outside of the local planning authority boundary" + ) + qual_expectation_bounds["count"] = [ + json.loads(v)["actual"] for v in qual_expectation_bounds["details"] + ] + qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary" + qual_expectation_bounds["quality_level"] = 3 + qual_expectation_bounds.drop("details", axis=1, inplace=True) + + # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds + qual_expectation_count = fc.datasette_query( + "digital-land", + """ + SELECT organisation, dataset, details + FROM expectation + WHERE 1=1 + AND name = 'Check number of entities inside the local planning authority boundary matches the manual count' + AND passed = 'False' + AND message not like '%error%' + """, + ) + + qual_expectation_count["problem_source"] = "expectation" + qual_expectation_count["problem_type"] = "entity count doesn't match manual count" + qual_expectation_count["count"] = [ + json.loads(v)["actual"] for v in qual_expectation_count["details"] + ] + qual_expectation_count["quality_criteria"] = ( + "3 - conservation area entity count matches LPA" + ) + qual_expectation_count["quality_level"] = 3 + qual_expectation_count.drop("details", axis=1, inplace=True) + + # Combine all problem source tables, and aggregate to criteria level + qual_all_criteria = ( + 
pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count]) + .groupby( + ["organisation", "dataset", "quality_criteria", "quality_level"], + as_index=False, + ) + .agg(count_failures=("count", "sum")) + ) + + # Merge issues with the provision data + prov_qual_all = provision.merge( + qual_all_criteria, how="left", on=["organisation", "dataset"] + ) + + prov_qual_all["quality_level_for_sort"] = np.select( + [ + (prov_qual_all["active_endpoint_count"] == 0), + (prov_qual_all["quality_level"].notnull()), + (prov_qual_all["active_endpoint_count"] > 0) + & (prov_qual_all["quality_level"].isnull()), + ], + [0, prov_qual_all["quality_level"], 4], + ) + + level_map = { + 4: "4. data that is trustworthy", + 3: "3. data that is good for ODP", + 2: "2. authoritative data from the LPA", + 1: "1. some data", + 0: "0. no score", + } + + prov_quality = prov_qual_all.groupby( + ["organisation", "dataset"], as_index=False, dropna=False + ).agg(quality_level=("quality_level_for_sort", "min")) + + prov_quality["quality"] = prov_quality["quality_level"].map(level_map) + prov_quality["notes"] = "" + prov_quality["end-date"] = "" + prov_quality["start-date"] = td + prov_quality["entry-date"] = td + + # Output the results as a Parquet file + output_dir = os.path.join( + "/tmp", "performance", "provision-quality", f"entry-date={td}" + ) + os.makedirs(output_dir, exist_ok=True) + + output_file = os.path.join(output_dir, "provision-quality.parquet") + prov_quality.to_parquet(output_file, engine="pyarrow", index=False) + + print(f"Provision quality dataset saved to: {output_file}") From 0de99250afa42f7cfbceb06328d717ec3bacae21 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Thu, 15 May 2025 11:57:29 +0100 Subject: [PATCH 10/17] generate-provision-quality cli command --- digital_land/cli.py | 6 + digital_land/commands.py | 167 +++++++++++++++++ digital_land/utils/functions_core.py | 21 +-- .../utils/generate_provision_quality.py | 175 ------------------ setup.py | 1 + 5 files changed, 183 insertions(+), 187 deletions(-) delete mode 100644 digital_land/utils/generate_provision_quality.py diff --git a/digital_land/cli.py b/digital_land/cli.py index b65aef331..a99a950ea 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -31,6 +31,7 @@ organisation_check, save_state, add_data, + generate_provision_quality, ) from digital_land.command_arguments import ( @@ -825,3 +826,8 @@ def check_state_cmd( if diffs: print(f"State differs from {state_path} - {', '.join(diffs)}") sys.exit(1) + + +@cli.command("generate-provision-quality") +def generate_provision_quality_cmd(): + generate_provision_quality() diff --git a/digital_land/commands.py b/digital_land/commands.py index f88559c48..c8207c2b8 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -14,6 +14,7 @@ import geojson from requests import HTTPError import shapely +import numpy as np from digital_land.package.organisation import OrganisationPackage from digital_land.check import duplicate_reference_check @@ -76,6 +77,7 @@ is_url_valid, get_user_response, ) +from digital_land.utils import functions_core as fc from .register import hash_value from .utils.gdal_utils import get_gdal_version @@ -1723,3 +1725,168 @@ def check_and_assign_entities( ): return False return True + + +def generate_provision_quality(): + """Generates a provision quality dataset and saves it as a parquet file""" + td = datetime.today().strftime("%Y-%m-%d") + + # Create the temporary download directory + db_dir = Path("/tmp") / "db_downloads" + os.makedirs(db_dir, 
exist_ok=True) + + # Download the performance db + fc.download_dataset("performance", db_dir, overwrite=False) + path_perf_db = db_dir / "performance.db" + + # Issue quality criteria lookup + lookup_issue_qual = fc.datasette_query( + "digital-land", + """ + SELECT + description, + issue_type, + name, + severity, + responsibility, + quality_criteria_level || " - " || quality_criteria as quality_criteria, + quality_criteria_level as quality_level + FROM issue_type + WHERE quality_criteria_level != '' + AND quality_criteria != '' + """, + ) + + # Transform data + provision = fc.query_sqlite( + path_perf_db, + """ + SELECT organisation, dataset, active_endpoint_count + FROM provision_summary + """, + ) + + # Extract issue count by provision from endpoint_dataset_issue_type_summary + qual_issue = fc.query_sqlite( + path_perf_db, + """ + SELECT + organisation, dataset, + 'issue' as problem_source, + issue_type as problem_type, + sum(count_issues) as count + FROM endpoint_dataset_issue_type_summary + WHERE resource_end_date is not NULL + AND issue_type is not NULL + GROUP BY organisation, dataset, issue_type + """, + ) + + # Join on quality criteria and level from issue_type lookup (this restricts to only issues linked to a quality criteria) + qual_issue = qual_issue.merge( + lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]], + how="inner", + left_on="problem_type", + right_on="issue_type", + ) + qual_issue.drop("issue_type", axis=1, inplace=True) + + # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds + qual_expectation_bounds = fc.datasette_query( + "digital-land", + """ + SELECT organisation, dataset, details + FROM expectation + WHERE 1=1 + AND name = 'Check no entities are outside of the local planning authority boundary' + AND passed = 'False' + AND message not like '%error%' + """, + ) + + qual_expectation_bounds["problem_source"] = "expectation" + qual_expectation_bounds["problem_type"] = ( + "entity outside of the local planning authority boundary" + ) + qual_expectation_bounds["count"] = [ + json.loads(v)["actual"] for v in qual_expectation_bounds["details"] + ] + qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary" + qual_expectation_bounds["quality_level"] = 3 + qual_expectation_bounds.drop("details", axis=1, inplace=True) + + # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds + qual_expectation_count = fc.datasette_query( + "digital-land", + """ + SELECT organisation, dataset, details + FROM expectation + WHERE 1=1 + AND name = 'Check number of entities inside the local planning authority boundary matches the manual count' + AND passed = 'False' + AND message not like '%error%' + """, + ) + + qual_expectation_count["problem_source"] = "expectation" + qual_expectation_count["problem_type"] = "entity count doesn't match manual count" + qual_expectation_count["count"] = [ + json.loads(v)["actual"] for v in qual_expectation_count["details"] + ] + qual_expectation_count["quality_criteria"] = ( + "3 - conservation area entity count matches LPA" + ) + qual_expectation_count["quality_level"] = 3 + qual_expectation_count.drop("details", axis=1, inplace=True) + + # Combine all problem source tables, and aggregate to criteria level + qual_all_criteria = ( + pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count]) + .groupby( + ["organisation", "dataset", "quality_criteria", "quality_level"], + as_index=False, + ) + .agg(count_failures=("count", "sum")) + ) + + # Merge issues with the provision data + 
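+    # Left-join so every (organisation, dataset) provision row survives: rows with no recorded quality failures carry a null quality_level, which np.select below resolves to level 4 when the provision has an active endpoint and to 0 when it has none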
+    prov_qual_all = provision.merge(
+        qual_all_criteria, how="left", on=["organisation", "dataset"]
+    )
+
+    prov_qual_all["quality_level_for_sort"] = np.select(
+        [
+            (prov_qual_all["active_endpoint_count"] == 0),
+            (prov_qual_all["quality_level"].notnull()),
+            (prov_qual_all["active_endpoint_count"] > 0)
+            & (prov_qual_all["quality_level"].isnull()),
+        ],
+        [0, prov_qual_all["quality_level"], 4],
+    )
+
+    level_map = {
+        4: "4. data that is trustworthy",
+        3: "3. data that is good for ODP",
+        2: "2. authoritative data from the LPA",
+        1: "1. some data",
+        0: "0. no score",
+    }
+
+    prov_quality = prov_qual_all.groupby(
+        ["organisation", "dataset"], as_index=False, dropna=False
+    ).agg(quality_level=("quality_level_for_sort", "min"))
+
+    prov_quality["quality"] = prov_quality["quality_level"].map(level_map)
+    prov_quality["notes"] = ""
+    prov_quality["end-date"] = ""
+    prov_quality["start-date"] = td
+    prov_quality["entry-date"] = td
+
+    # Output the results as a Parquet file
+    output_dir = Path("/tmp") / "performance" / "provision-quality" / f"entry-date={td}"
+    os.makedirs(output_dir, exist_ok=True)
+
+    output_file = output_dir / "provision-quality.parquet"
+    prov_quality.to_parquet(output_file, engine="pyarrow", index=False)
+
+    print(f"Provision quality dataset saved to: {output_file}")
diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py
index 887fa5c09..b891ef405 100644
--- a/digital_land/utils/functions_core.py
+++ b/digital_land/utils/functions_core.py
@@ -1,10 +1,9 @@
 import urllib
-import os
 import sqlite3
 import pandas as pd
 import geopandas as gpd
 import shapely.wkt
-
+from pathlib import Path
 
 global FILES_URL
 
@@ -12,22 +11,20 @@
 
 
 def download_dataset(dataset, output_dir_path, overwrite=False):
-    dataset_file_name = f"{dataset}.db"
-
-    if not os.path.exists(output_dir_path):
-        os.makedirs(output_dir_path)
+    output_dir = Path(output_dir_path)
+    output_dir.mkdir(parents=True, exist_ok=True)
 
-    output_file_path = os.path.join(output_dir_path, dataset_file_name)
+    dataset_file_name = f"{dataset}.db"
+    output_file_path = output_dir / dataset_file_name
 
-    if overwrite is False and os.path.exists(output_file_path):
+    if not overwrite and output_file_path.exists():
         return
 
-    final_url = os.path.join(FILES_URL, dataset_file_name)
+    final_url = f"{FILES_URL}{dataset_file_name}"
     print(f"downloading data from {final_url}")
     print(f"to: {output_file_path}")
-    urllib.request.urlretrieve(
-        final_url, os.path.join(output_dir_path, dataset_file_name)
-    )
+
+    urllib.request.urlretrieve(final_url, output_file_path)
     print("download complete")
 
 
diff --git a/digital_land/utils/generate_provision_quality.py b/digital_land/utils/generate_provision_quality.py
deleted file mode 100644
index 7ea8f6f73..000000000
--- a/digital_land/utils/generate_provision_quality.py
+++ /dev/null
@@ -1,175 +0,0 @@
-import os
-import pandas as pd
-import numpy as np
-import json
-from datetime import datetime
-from digital_land.utils import functions_core as fc
-
-
-def generate_provision_quality():
-    """
-    Generates a provision quality dataset and saves it as a parquet file.
- """ - td = datetime.today().strftime("%Y-%m-%d") - - # Create the temporary download directory - db_dir = os.path.join("/tmp", "db_downloads") - os.makedirs(db_dir, exist_ok=True) - - # Download the performance db - fc.download_dataset("performance", db_dir, overwrite=False) - path_perf_db = os.path.join(db_dir, "performance.db") - - # Issue quality criteria lookup - lookup_issue_qual = fc.datasette_query( - "digital-land", - """ - SELECT - description, - issue_type, - name, - severity, - responsibility, - quality_criteria_level || " - " || quality_criteria as quality_criteria, - quality_criteria_level as quality_level - FROM issue_type - WHERE quality_criteria_level != '' - AND quality_criteria != '' - """, - ) - - # Transform data - provision = fc.query_sqlite( - path_perf_db, - """ - SELECT organisation, dataset, active_endpoint_count - FROM provision_summary - """, - ) - - # Extract issue count by provision from endpoint_dataset_issue_type_summary - qual_issue = fc.query_sqlite( - path_perf_db, - """ - SELECT - organisation, dataset, - 'issue' as problem_source, - issue_type as problem_type, - sum(count_issues) as count - FROM endpoint_dataset_issue_type_summary - WHERE resource_end_date is not NULL - AND issue_type is not NULL - GROUP BY organisation, dataset, issue_type - """, - ) - - # Join on quality criteria and level from issue_type lookup (this restricts to only issues linked to a quality criteria) - qual_issue = qual_issue.merge( - lookup_issue_qual[["issue_type", "quality_criteria", "quality_level"]], - how="inner", - left_on="problem_type", - right_on="issue_type", - ) - qual_issue.drop("issue_type", axis=1, inplace=True) - - # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds - qual_expectation_bounds = fc.datasette_query( - "digital-land", - """ - SELECT organisation, dataset, details - FROM expectation - WHERE 1=1 - AND name = 'Check no entities are outside of the local planning authority boundary' - AND passed = 'False' - AND message not like '%error%' - """, - ) - - qual_expectation_bounds["problem_source"] = "expectation" - qual_expectation_bounds["problem_type"] = ( - "entity outside of the local planning authority boundary" - ) - qual_expectation_bounds["count"] = [ - json.loads(v)["actual"] for v in qual_expectation_bounds["details"] - ] - qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary" - qual_expectation_bounds["quality_level"] = 3 - qual_expectation_bounds.drop("details", axis=1, inplace=True) - - # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds - qual_expectation_count = fc.datasette_query( - "digital-land", - """ - SELECT organisation, dataset, details - FROM expectation - WHERE 1=1 - AND name = 'Check number of entities inside the local planning authority boundary matches the manual count' - AND passed = 'False' - AND message not like '%error%' - """, - ) - - qual_expectation_count["problem_source"] = "expectation" - qual_expectation_count["problem_type"] = "entity count doesn't match manual count" - qual_expectation_count["count"] = [ - json.loads(v)["actual"] for v in qual_expectation_count["details"] - ] - qual_expectation_count["quality_criteria"] = ( - "3 - conservation area entity count matches LPA" - ) - qual_expectation_count["quality_level"] = 3 - qual_expectation_count.drop("details", axis=1, inplace=True) - - # Combine all problem source tables, and aggregate to criteria level - qual_all_criteria = ( - pd.concat([qual_issue, qual_expectation_bounds, qual_expectation_count]) - .groupby( - 
["organisation", "dataset", "quality_criteria", "quality_level"], - as_index=False, - ) - .agg(count_failures=("count", "sum")) - ) - - # Merge issues with the provision data - prov_qual_all = provision.merge( - qual_all_criteria, how="left", on=["organisation", "dataset"] - ) - - prov_qual_all["quality_level_for_sort"] = np.select( - [ - (prov_qual_all["active_endpoint_count"] == 0), - (prov_qual_all["quality_level"].notnull()), - (prov_qual_all["active_endpoint_count"] > 0) - & (prov_qual_all["quality_level"].isnull()), - ], - [0, prov_qual_all["quality_level"], 4], - ) - - level_map = { - 4: "4. data that is trustworthy", - 3: "3. data that is good for ODP", - 2: "2. authoritative data from the LPA", - 1: "1. some data", - 0: "0. no score", - } - - prov_quality = prov_qual_all.groupby( - ["organisation", "dataset"], as_index=False, dropna=False - ).agg(quality_level=("quality_level_for_sort", "min")) - - prov_quality["quality"] = prov_quality["quality_level"].map(level_map) - prov_quality["notes"] = "" - prov_quality["end-date"] = "" - prov_quality["start-date"] = td - prov_quality["entry-date"] = td - - # Output the results as a Parquet file - output_dir = os.path.join( - "/tmp", "performance", "provision-quality", f"entry-date={td}" - ) - os.makedirs(output_dir, exist_ok=True) - - output_file = os.path.join(output_dir, "provision-quality.parquet") - prov_quality.to_parquet(output_file, engine="pyarrow", index=False) - - print(f"Provision quality dataset saved to: {output_file}") diff --git a/setup.py b/setup.py index a051b7356..685cc8ac9 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,7 @@ def get_long_description(): "boto3", "moto", "psutil", + "geopandas", ], entry_points={"console_scripts": ["digital-land=digital_land.cli:cli"]}, setup_requires=["pytest-runner"], From 95775254ed5d35378395ec7e7d3aac15d37eb084 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Thu, 15 May 2025 13:42:58 +0100 Subject: [PATCH 11/17] update cli --- digital_land/cli.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/digital_land/cli.py b/digital_land/cli.py index a99a950ea..b65aef331 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -31,7 +31,6 @@ organisation_check, save_state, add_data, - generate_provision_quality, ) from digital_land.command_arguments import ( @@ -826,8 +825,3 @@ def check_state_cmd( if diffs: print(f"State differs from {state_path} - {', '.join(diffs)}") sys.exit(1) - - -@cli.command("generate-provision-quality") -def generate_provision_quality_cmd(): - generate_provision_quality() From 698b00fa92b4646d61e4361524e0b7750e18d120 Mon Sep 17 00:00:00 2001 From: kena vyas Date: Fri, 16 May 2025 12:32:42 +0100 Subject: [PATCH 12/17] utilise api to download performance.sqlite3 --- digital_land/api.py | 14 ++- digital_land/commands.py | 17 ++-- digital_land/utils/functions_core.py | 52 ----------- .../test_generate_provision_quality.py | 90 +++++++++++++++++++ tests/unit/test_functions_core_utils.py | 32 +++++++ 5 files changed, 145 insertions(+), 60 deletions(-) create mode 100644 tests/integration/test_generate_provision_quality.py create mode 100644 tests/unit/test_functions_core_utils.py diff --git a/digital_land/api.py b/digital_land/api.py index ef0262153..03a480e01 100644 --- a/digital_land/api.py +++ b/digital_land/api.py @@ -36,6 +36,8 @@ def download_dataset( overwrite: bool = False, path: str = None, extension: Extension = Extension.CSV, + builder: bool = False, + builder_name: str = None, ): """ Downloads a dataset in CSV or SQLite3 format. 
@@ -43,6 +45,8 @@
         - overwrite: overwrite the file if it already exists (otherwise will just return).
         - path: file to download to (otherwise <cache_dir>/dataset/<dataset>.<extension>).
         - extension: 'csv' or 'sqlite3', 'csv' by default.
+        - builder: downloads the dataset from the builder path
+        - builder_name: name to use for accessing the builder path
 
         - Returns: None. The file will be downloaded to the given path or cache,
           unless an exception occurs.
@@ -56,8 +60,14 @@
 
         # different extensions require different urls and reading modes
         if extension == self.Extension.SQLITE3:
-            collection = self.specification.dataset[dataset]["collection"]
-            url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3"
+            # performance.sqlite requires digital-land-builder path
+            if builder:
+                if not builder_name:
+                    raise ValueError("Builder name must be provided when builder=True")
+                url = f"{self.url}/{builder_name}-builder/dataset/{dataset}.sqlite3"
+            else:
+                collection = self.specification.dataset[dataset]["collection"]
+                url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3"
             mode = "wb"
 
             def get_content(response):
diff --git a/digital_land/commands.py b/digital_land/commands.py
index c8207c2b8..9b5dd4986 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -1731,13 +1731,18 @@ def generate_provision_quality():
     """Generates a provision quality dataset and saves it as a parquet file"""
     td = datetime.today().strftime("%Y-%m-%d")
 
-    # Create the temporary download directory
-    db_dir = Path("/tmp") / "db_downloads"
-    os.makedirs(db_dir, exist_ok=True)
+    specification = Specification("specification/")
+    api = API(specification)
+
+    # Download the performance db using api
+    api.download_dataset(
+        "performance",
+        extension=api.Extension.SQLITE3,
+        builder=True,
+        builder_name="digital-land",
+    )
 
-    # Download the performance db
-    fc.download_dataset("performance", db_dir, overwrite=False)
-    path_perf_db = db_dir / "performance.db"
+    path_perf_db = Path(api.cache_dir) / "dataset" / "performance.sqlite3"
 
     # Issue quality criteria lookup
     lookup_issue_qual = fc.datasette_query(
diff --git a/digital_land/utils/functions_core.py b/digital_land/utils/functions_core.py
index b891ef405..c3a62836f 100644
--- a/digital_land/utils/functions_core.py
+++ b/digital_land/utils/functions_core.py
@@ -1,65 +1,13 @@
 import urllib
 import sqlite3
 import pandas as pd
-import geopandas as gpd
-import shapely.wkt
-from pathlib import Path
-
-global FILES_URL
-
-FILES_URL = "https://datasette.planning.data.gov.uk/"
-
-
-def download_dataset(dataset, output_dir_path, overwrite=False):
-    output_dir = Path(output_dir_path)
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    dataset_file_name = f"{dataset}.db"
-    output_file_path = output_dir / dataset_file_name
-
-    if not overwrite and output_file_path.exists():
-        return
-
-    final_url = f"{FILES_URL}{dataset_file_name}"
-    print(f"downloading data from {final_url}")
-    print(f"to: {output_file_path}")
-
-    urllib.request.urlretrieve(final_url, output_file_path)
-    print("download complete")
-
-
-def get_pdp_dataset(
-    dataset, geometry_field="geometry", crs_out=4326, underscore_cols=True
-):
-
-    df = pd.read_csv(
-        f"https://files.planning.data.gov.uk/dataset/{dataset}.csv", dtype="str"
-    )
-    df.columns = [x.replace("-", "_") for x in df.columns]
-
-    df_valid_geom = df[df[geometry_field].notnull()].copy()
-
-    # load geometry and create GDF
-    df_valid_geom[geometry_field] = df_valid_geom[geometry_field].apply(
-        shapely.wkt.loads
-    )
-    gdf = gpd.GeoDataFrame(df_valid_geom, geometry=geometry_field)
-
-    # Transform to ESPG:27700 for more interpretable area units
-    gdf.set_crs(epsg=4326, inplace=True)
-    gdf.to_crs(epsg=crs_out, inplace=True)
-
-    return gdf
 
 
 def query_sqlite(db_path, query_string):
-
     with sqlite3.connect(db_path) as con:
-
         cursor = con.execute(query_string)
         cols = [column[0] for column in cursor.description]
         results_df = pd.DataFrame.from_records(data=cursor.fetchall(), columns=cols)
-
     return results_df
diff --git a/tests/integration/test_generate_provision_quality.py b/tests/integration/test_generate_provision_quality.py
new file mode 100644
index 000000000..33dbf202b
--- /dev/null
+++ b/tests/integration/test_generate_provision_quality.py
@@ -0,0 +1,90 @@
+import pandas as pd
+from unittest.mock import patch
+from pathlib import Path
+from datetime import datetime
+from digital_land.commands import generate_provision_quality
+
+
+@patch("digital_land.commands.fc.datasette_query")
+@patch("digital_land.commands.fc.query_sqlite")
+def test_generate_provision_quality(
+    mock_query_sqlite,
+    mock_datasette_query,
+):
+    # mock issue_type
+    mock_datasette_query.side_effect = [
+        pd.DataFrame(
+            [
+                {
+                    "description": "desc",
+                    "issue_type": "missing-value",
+                    "name": "Missing Value",
+                    "severity": "error",
+                    "responsibility": "external",
+                    "quality_criteria": "any other validity error",
+                    "quality_level": 3,
+                }
+            ]
+        ),
+        # mock LPA boundary check
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "details": '{"actual": 2}',
+                }
+            ]
+        ),
+        # mock count value
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "details": '{"actual": 1}',
+                }
+            ]
+        ),
+    ]
+
+    # mock sqlite queries
+    mock_query_sqlite.side_effect = [
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "active_endpoint_count": 5,
+                }
+            ]
+        ),
+        pd.DataFrame(
+            [
+                {
+                    "organisation": "org1",
+                    "dataset": "dataset1",
+                    "problem_source": "issue",
+                    "problem_type": "missing-value",
+                    "count": 1,
+                }
+            ]
+        ),
+    ]
+
+    generate_provision_quality()
+
+    td = datetime.today().strftime("%Y-%m-%d")
+    output_file = Path(
+        f"/tmp/performance/provision-quality/entry-date={td}/provision-quality.parquet"
+    )
+    assert output_file.exists(), "Parquet file not found"
+
+    df = pd.read_parquet(output_file)
+    assert "organisation" in df.columns
+    assert "dataset" in df.columns
+    assert "quality" in df.columns
+
+    assert not df.empty, "Dataframe loaded from Parquet is empty"
+    assert len(df) == 1
+    assert df.iloc[0]["organisation"] == "org1"
diff --git a/tests/unit/test_functions_core_utils.py b/tests/unit/test_functions_core_utils.py
new file mode 100644
index 000000000..1df56b006
--- /dev/null
+++ b/tests/unit/test_functions_core_utils.py
@@ -0,0 +1,32 @@
+import pandas as pd
+from unittest.mock import patch, Mock
+from digital_land.utils.functions_core import datasette_query, query_sqlite
+
+
+@patch("digital_land.utils.functions_core.sqlite3.connect")
+def test_query_sqlite(mock_connect):
+    mock_data = Mock()
+    mock_data.description = [("organisation",), ("dataset",)]
+    mock_data.fetchall.return_value = [("org1", "dataset1"), ("org2", "dataset2")]
+
+    mock_con = Mock()
+    mock_con.execute.return_value = mock_data
+    mock_connect.return_value.__enter__.return_value = mock_con
+
+    df = query_sqlite("db_path", "SELECT * FROM table")
+
+    assert isinstance(df, pd.DataFrame)
+    assert list(df.columns) == ["organisation", "dataset"]
+    assert len(df) == 2
+    assert df.iloc[0]["organisation"] == "org1"
+
+
+@patch("digital_land.utils.functions_core.pd.read_csv")
+def test_datasette_query(mock_read_csv):
+    df_mock = pd.DataFrame({"organisation": ["org1", "org2"]})
+    mock_read_csv.return_value = df_mock
+
+    df = datasette_query("db", "SELECT organisation FROM table")
+    assert isinstance(df, pd.DataFrame)
+    assert "organisation" in df.columns
+    assert df.equals(df_mock)

From d03a6556571d349f2014c279e8dd342f0bccb13f Mon Sep 17 00:00:00 2001
From: kena vyas
Date: Fri, 16 May 2025 14:34:29 +0100
Subject: [PATCH 13/17] make specification parameter optional

---
 digital_land/api.py | 4 +++-
 digital_land/commands.py | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/digital_land/api.py b/digital_land/api.py
index 03a480e01..e99ced5bc 100644
--- a/digital_land/api.py
+++ b/digital_land/api.py
@@ -14,7 +14,7 @@ class API:
     def __init__(
         self,
-        specification: Specification,
+        specification: Specification = None,
         url: str = DEFAULT_URL,
         cache_dir: str = "var/cache",
     ):
@@ -66,6 +66,8 @@ def download_dataset(
                 raise ValueError("Builder name must be provided when builder=True")
                 url = f"{self.url}/{builder_name}-builder/dataset/{dataset}.sqlite3"
             else:
+                if self.specification is None:
+                    raise ValueError("Specification must be provided")
                 collection = self.specification.dataset[dataset]["collection"]
                 url = f"{self.url}/{collection}-collection/dataset/{dataset}.sqlite3"
             mode = "wb"
diff --git a/digital_land/commands.py b/digital_land/commands.py
index 9b5dd4986..6d5bbd272 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -1731,8 +1731,7 @@ def generate_provision_quality():
     """Generates a provision quality dataset and saves it as a parquet file"""
     td = datetime.today().strftime("%Y-%m-%d")
 
-    specification = Specification("specification/")
-    api = API(specification)
+    api = API()
 
     # Download the performance db using api
     api.download_dataset(

From 7f236fa50f683384527c3df4d71b70d9f95a70fd Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Fri, 30 May 2025 11:22:38 +0100
Subject: [PATCH 14/17] Change datasette_query to duckdb.query

---
 digital_land/commands.py | 96 ++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 49 deletions(-)

diff --git a/digital_land/commands.py b/digital_land/commands.py
index 6d5bbd272..5afe18d62 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -15,6 +15,7 @@
 from requests import HTTPError
 import shapely
 import numpy as np
+import duckdb
 
 from digital_land.package.organisation import OrganisationPackage
 from digital_land.check import duplicate_reference_check
@@ -1744,22 +1745,26 @@ def generate_provision_quality():
     path_perf_db = Path(api.cache_dir) / "dataset" / "performance.sqlite3"
 
     # Issue quality criteria lookup
-    lookup_issue_qual = fc.datasette_query(
-        "digital-land",
-        """
+    specification_repo_url = (
+        "https://raw.githubusercontent.com/digital-land/specification/refs/heads/"
+    )
+    issue_type_url = f"{specification_repo_url}main/content/issue-type.csv"
+
+    lookup_issue_qual = duckdb.query(
+        f"""
         SELECT
         description,
-        issue_type,
+        "issue-type" AS issue_type,
         name,
         severity,
         responsibility,
-        quality_criteria_level || " - " || quality_criteria as quality_criteria,
+        quality_criteria_level || ' - ' || quality_criteria as quality_criteria,
         quality_criteria_level as quality_level
-        FROM issue_type
-        WHERE quality_criteria_level != ''
+        FROM read_csv('{issue_type_url}')
+        WHERE CAST(quality_criteria_level AS string) != ''
         AND quality_criteria != ''
-        """,
-    )
+        """
+    ).to_df()
 
     # Transform data
     provision = fc.query_sqlite(
@@ -1793,54 +1798,47 @@ def generate_provision_quality():
         left_on="problem_type",
         right_on="issue_type",
     )
-    qual_issue.drop("issue_type", axis=1, inplace=True)
+    qual_issue = qual_issue.drop(columns="issue_type")
 
     # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
-    qual_expectation_bounds = fc.datasette_query(
-        "digital-land",
-        """
-        SELECT organisation, dataset, details
-        FROM expectation
-        WHERE 1=1
-        AND name = 'Check no entities are outside of the local planning authority boundary'
-        AND passed = 'False'
-        AND message not like '%error%'
-        """,
-    )
+    s3_uri = f"s3://development-collection-data/log/expectation/dataset=*/*.parquet"
 
-    qual_expectation_bounds["problem_source"] = "expectation"
-    qual_expectation_bounds["problem_type"] = (
-        "entity outside of the local planning authority boundary"
+    qual_expectation_bounds = duckdb.query(
+        f"""
+        SELECT organisation, dataset, details
+        FROM read_parquet('{s3_uri}')
+        WHERE name = 'Check no entities are outside of the local planning authority boundary'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """
+    ).to_df()
+    qual_expectation_bounds = qual_expectation_bounds.assign(
+        problem_source="expectation",
+        problem_type="entity outside of the local planning authority boundary",
+        count=[json.loads(v)["actual"] for v in qual_expectation_bounds["details"]],
+        quality_criteria="3 - entities within LPA boundary",
+        quality_level=3,
     )
-    qual_expectation_bounds["count"] = [
-        json.loads(v)["actual"] for v in qual_expectation_bounds["details"]
-    ]
-    qual_expectation_bounds["quality_criteria"] = "3 - entities within LPA boundary"
-    qual_expectation_bounds["quality_level"] = 3
-    qual_expectation_bounds.drop("details", axis=1, inplace=True)
+    qual_expectation_bounds = qual_expectation_bounds.drop(columns="details")
 
     # IDENTIFY PROBLEMS - expectations - entity count doesn't match manual count
-    qual_expectation_count = fc.datasette_query(
-        "digital-land",
-        """
+    qual_expectation_count = duckdb.query(
+        f"""
         SELECT organisation, dataset, details
-        FROM expectation
-        WHERE 1=1
-        AND name = 'Check number of entities inside the local planning authority boundary matches the manual count'
-        AND passed = 'False'
-        AND message not like '%error%'
-        """,
-    )
-
-    qual_expectation_count["problem_source"] = "expectation"
-    qual_expectation_count["problem_type"] = "entity count doesn't match manual count"
-    qual_expectation_count["count"] = [
-        json.loads(v)["actual"] for v in qual_expectation_count["details"]
-    ]
-    qual_expectation_count["quality_criteria"] = (
-        "3 - conservation area entity count matches LPA"
+        FROM read_parquet('{s3_uri}')
+        WHERE name = 'Check number of entities inside the local planning authority boundary matches the manual count'
+        AND passed = 'False'
+        AND message not like '%error%'
+        """
+    ).to_df()
+
+    qual_expectation_count = qual_expectation_count.assign(
+        problem_source="expectation",
+        problem_type="entity count doesn't match manual count",
+        count=[json.loads(v)["actual"] for v in qual_expectation_count["details"]],
+        quality_criteria="3 - conservation area entity count matches LPA",
+        quality_level=3,
     )
-    qual_expectation_count["quality_level"] = 3
     qual_expectation_count.drop("details", axis=1, inplace=True)
 
     # Combine all problem source tables, and aggregate to criteria level

From 83210b73404581361a06bcf8ef76e5c1152c8fa2 Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Fri, 30 May 2025 11:27:39 +0100
Subject: [PATCH 15/17] Change datasette_query to duckdb.query

---
 digital_land/commands.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/digital_land/commands.py b/digital_land/commands.py
index 5afe18d62..2417e5f63 100644
--- a/digital_land/commands.py
+++ b/digital_land/commands.py
@@ -1801,7 +1801,7 @@ def generate_provision_quality():
     qual_issue = qual_issue.drop(columns="issue_type")
 
     # IDENTIFY PROBLEMS - expectations - entity beyond LPA bounds
-    s3_uri = f"s3://development-collection-data/log/expectation/dataset=*/*.parquet"
+    s3_uri = "s3://development-collection-data/log/expectation/dataset=*/*.parquet"
 
     qual_expectation_bounds = duckdb.query(
         f"""

From bf32e96f27a952a9bb613238fbb262a6dabe3d11 Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Fri, 30 May 2025 12:08:11 +0100
Subject: [PATCH 16/17] Added test for duckdb instead of datasette

---
 .../test_generate_provision_quality.py | 46 ++++++++++++-------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/tests/integration/test_generate_provision_quality.py b/tests/integration/test_generate_provision_quality.py
index 33dbf202b..897325286 100644
--- a/tests/integration/test_generate_provision_quality.py
+++ b/tests/integration/test_generate_provision_quality.py
@@ -1,19 +1,18 @@
 import pandas as pd
-from unittest.mock import patch
+from unittest.mock import patch, Mock
 from pathlib import Path
 from datetime import datetime
 from digital_land.commands import generate_provision_quality
 
 
-@patch("digital_land.commands.fc.datasette_query")
+@patch("digital_land.commands.duckdb.query")
 @patch("digital_land.commands.fc.query_sqlite")
 def test_generate_provision_quality(
     mock_query_sqlite,
-    mock_datasette_query,
+    mock_duckdb_query,
 ):
     # mock issue_type
-    mock_datasette_query.side_effect = [
-        pd.DataFrame(
+    df1 = pd.DataFrame(
             [
                 {
                     "description": "desc",
                     "issue_type": "missing-value",
                     "name": "Missing Value",
                     "severity": "error",
                     "responsibility": "external",
                     "quality_criteria": "any other validity error",
                     "quality_level": 3,
                 }
             ]
-        ),
-        # mock LPA boundary check
-        pd.DataFrame(
+    )
+    # mock LPA boundary check
+    df2 =pd.DataFrame(
             [
                 {
                     "organisation": "org1",
                     "dataset": "dataset1",
                     "details": '{"actual": 2}',
                 }
             ]
-        ),
-        # mock count value
-        pd.DataFrame(
+    )
+    # mock count value
+    df3 = pd.DataFrame(
             [
                 {
                     "organisation": "org1",
                     "dataset": "dataset1",
                     "details": '{"actual": 1}',
                 }
             ]
-        ),
-    ]
+    )
+
+    # Wrap each in a mock with .to_df()
+    rel1 = Mock()
+    rel1.to_df.return_value = df1
+
+    rel2 = Mock()
+    rel2.to_df.return_value = df2
+
+    rel3 = Mock()
+    rel3.to_df.return_value = df3
+
+    mock_duckdb_query.side_effect = [rel1, rel2, rel3]
 
     # mock sqlite queries
     mock_query_sqlite.side_effect = [
@@ -81,10 +91,12 @@
     assert output_file.exists(), "Parquet file not found"
 
     df = pd.read_parquet(output_file)
-    assert "organisation" in df.columns
-    assert "dataset" in df.columns
-    assert "quality" in df.columns
-
     assert not df.empty, "Dataframe loaded from Parquet is empty"
+    assert set(["organisation", "dataset", "quality"]).issubset(df.columns)
     assert len(df) == 1
     assert df.iloc[0]["organisation"] == "org1"
+    assert df.iloc[0]["dataset"] == "dataset1"
+    assert df["quality"].iloc[0] in [
+        "3. data that is good for ODP",
+        "4. data that is trustworthy",
+    ]

From ba74c084135cc4b90ab5eefe5f1f5bf65b466b75 Mon Sep 17 00:00:00 2001
From: alexglasertpx
Date: Fri, 30 May 2025 12:14:44 +0100
Subject: [PATCH 17/17] Added test for duckdb instead of datasette

---
 .../test_generate_provision_quality.py | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/tests/integration/test_generate_provision_quality.py b/tests/integration/test_generate_provision_quality.py
index 897325286..edb291f58 100644
--- a/tests/integration/test_generate_provision_quality.py
+++ b/tests/integration/test_generate_provision_quality.py
@@ -13,38 +13,38 @@ def test_generate_provision_quality(
 ):
     # mock issue_type
     df1 = pd.DataFrame(
-            [
-                {
-                    "description": "desc",
-                    "issue_type": "missing-value",
-                    "name": "Missing Value",
-                    "severity": "error",
-                    "responsibility": "external",
-                    "quality_criteria": "any other validity error",
-                    "quality_level": 3,
-                }
-            ]
-    )
+        [
+            {
+                "description": "desc",
+                "issue_type": "missing-value",
+                "name": "Missing Value",
+                "severity": "error",
+                "responsibility": "external",
+                "quality_criteria": "any other validity error",
+                "quality_level": 3,
+            }
+        ]
+    )
     # mock LPA boundary check
-    df2 =pd.DataFrame(
-            [
-                {
-                    "organisation": "org1",
-                    "dataset": "dataset1",
-                    "details": '{"actual": 2}',
-                }
-            ]
-    )
+    df2 = pd.DataFrame(
+        [
+            {
+                "organisation": "org1",
+                "dataset": "dataset1",
+                "details": '{"actual": 2}',
+            }
+        ]
+    )
     # mock count value
     df3 = pd.DataFrame(
-            [
-                {
-                    "organisation": "org1",
-                    "dataset": "dataset1",
-                    "details": '{"actual": 1}',
-                }
-            ]
-    )
+        [
+            {
+                "organisation": "org1",
+                "dataset": "dataset1",
+                "details": '{"actual": 1}',
+            }
+        ]
+    )
 
     # Wrap each in a mock with .to_df()
     rel1 = Mock()