diff --git a/transformation-scripts/dataset-integration-planning/dataset-assessment.ipynb b/transformation-scripts/dataset-integration-planning/dataset-assessment.ipynb new file mode 100644 index 00000000..4b1ad108 --- /dev/null +++ b/transformation-scripts/dataset-integration-planning/dataset-assessment.ipynb @@ -0,0 +1,615 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9510c939-e383-4d91-b6b2-03db06042d7f", + "metadata": {}, + "source": [ + "# High priority dataset assessment" + ] + }, + { + "cell_type": "markdown", + "id": "327f1469-5ea4-46ee-8ffb-290be27ba875", + "metadata": {}, + "source": [ + "![dataset-integration-paths.png](dataset-integration-paths.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7d8c747a-d389-41af-8145-a62834f55c22", + "metadata": {}, + "source": [ + "| | Integration path | Portal | Data services team work | ODD team work | UI work |\n", + "|---------------------|------------------|--------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "| NLDAS-3 | 2 | WaterInsight | Move NetCDFs to ODR Publish STAC collection + items | Demonstrate titiler-multidim tiles and statistics for this dataset. 
| Integrate titiler-multidim /tiles and /statistics for STAC items |\n", + "| MiCASA | 5 | GHG Center | Publish STAC collection | Make sure /timeseries/statistics works for this dataset | titiler-cmr timeseries integration is in-progress |\n", + "| BlueFlux | 2 | GHG Center | Move NetCDFs to VEDA bucket Publish STAC collection + items | Same as NLDAS-3 | Same as NLDAS-3 + pass sel + sel_method parameters |\n", + "| MUR SST (NetCDFs) | 5 | Coastal | Publish STAC collection | None | Same as MiCASA |\n", + "| MUR SST (virtual icechunk) | 4 | Coastal | Publish STAC collection(s) NOTE: Currently this is 2 icechunk stores. Publishing 2 collections is most straightforward option. | Enable titiler/titiler-multidim to read icechunk stores. NOTE: We may be blocked on this until PO.DAAC enables requester-pays. | Once titiler-multidim can read icechunk stores, tiles should work via zarr-timeseries. However the UI would need to be able to generate timeseries from titiler-multidim via a STAC collection. |\n", + "| MUR SST (native Zarr icechunk) | 4 | Coastal | Publish STAC collection(s) | Enable titiler/titiler-multidim to read icechunk stores and generate timeseries | Once titiler-multidim can read icechunk stores, tiles should work via zarr-timeseries. However the UI would need to be able to generate timeseries from titiler-multidim via a STAC collection. 
|" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b131df9-fb66-4dba-b798-bbc38fae2557", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install geojson_pydantic" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "59493d22-d383-4839-a3c9-1185dbfa64b3", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import httpx\n", + "import json\n", + "from geojson_pydantic import Feature, Polygon\n", + "\n", + "multidim_base_url = \"https://staging.openveda.cloud/api/titiler-multidim\"\n", + "cmr_base_url = \"https://staging.openveda.cloud/api/titiler-cmr\"\n", + "\n", + "timeseries_headers = {\n", + " \"accept\": \"application/geo+json\",\n", + " \"Content-Type\": \"application/json\"\n", + "}\n", + "\n", + "tile_headers = {\n", + " \"accept\": \"image/png\"\n", + "}\n", + "\n", + "minx, miny, maxx, maxy = [-124, 45, -116, 48]\n", + "geojson_data = Feature(\n", + " type=\"Feature\",\n", + " geometry=Polygon.from_bounds(minx, miny, maxx, maxy),\n", + " properties={},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b277c0a3-dc9a-4823-bde1-5cfe8def1339", + "metadata": {}, + "source": [ + "## Portal: WaterInsight" + ] + }, + { + "cell_type": "markdown", + "id": "5c2e4b11-ec3a-4c57-8655-fc4590f4be68", + "metadata": {}, + "source": [ + "### NLDAS-3 (titiler-multidim via ODR)\n", + "\n", + "### Where is the data? \n", + "\n", + "* Planned to be NetCDFs on AWS open data registry\n", + "* Right now it is in the protected bucket s3://nasa-waterinsight, Sid provided me (Aimee) with credentials for access\n", + "\n", + "### Can we visualize this data using titiler-multidim?\n", + "\n", + "Yes:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "450844c8-12c6-4b24-9724-05b8b3fed15b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Success! 
Received PNG tile image\n", + "Content-Type: image/png\n", + "Content-Length: 9047 bytes\n" + ] + } + ], + "source": [ + "tiles_params = {\n", + " \"scale\": \"1\",\n", + " \"format\": \"png\",\n", + " \"url\": \"s3://nasa-eodc-public/NLDAS3/forcing/monthly/2023/NLDAS_FOR0010_M.A202301.030.beta.nc\",\n", + " \"variable\": \"Tair\",\n", + " \"decode_times\": \"false\",\n", + " \"colormap_name\": \"balance\",\n", + " \"rescale\": \"230,303\" # Temperature range in Kelvin (230K = -43°C, 303K = 30°C)\n", + "}\n", + "\n", + "# Make the GET request for tile\n", + "tiles_response = httpx.get(\n", + " url=f\"{multidim_base_url}/tiles/WebMercatorQuad/0/0/0\",\n", + " params=tiles_params,\n", + " timeout=None\n", + ")\n", + "tiles_response.raise_for_status()\n", + "\n", + "if tiles_response.status_code == 200:\n", + " print(\"Success! Received PNG tile image\")\n", + " print(f\"Content-Type: {tiles_response.headers.get('content-type')}\")\n", + " print(f\"Content-Length: {len(tiles_response.content)} bytes\")\n", + " \n", + " # Optionally save the tile image\n", + " # with open(\"tile_0_0_0.png\", \"wb\") as f:\n", + " # f.write(tiles_response.content)\n", + " # print(\"Tile saved as tile_0_0_0.png\")\n", + "else:\n", + " print(f\"Error: {tiles_response.status_code}\")\n", + " print(tiles_response.text)" + ] + }, + { + "cell_type": "markdown", + "id": "d242ee99-1ddf-4312-bb87-2a0fa223cbfd", + "metadata": {}, + "source": [ + "### Can we visualize it in VEDA UI? \n", + "\n", + "👷 Not yet: VEDA UI does not have an integration with titiler-multidim for item-based visualization, only for a single zarr endpoint. 
\n", + "\n", + "In theory if we had an icechunk store, and titiler/titiler-multidim supported reading icechunk, we could visualize it.\n" + ] + }, + { + "cell_type": "markdown", + "id": "d4a63dc8-0aed-45d4-ba9e-f2f1104664f7", + "metadata": {}, + "source": [ + "### Can we produce time series using titiler-multidim?\n", + "\n", + "👷 No: titiler-multidim produces statistics for a given URL. Integration into the UI means we need:\n", + "\n", + "1. To index all the items into STAC (data services team)\n", + "2. The UI to query STAC for items and then make request to all individual items." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6224cd71-12a5-4733-aecd-d973c88c7bf1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Success!\n" + ] + } + ], + "source": [ + "# Query parameters\n", + "params = {\n", + " \"url\": \"s3://nasa-eodc-public/NLDAS3/forcing/monthly/2023/NLDAS_FOR0010_M.A202301.030.beta.nc\",\n", + " \"variable\": \"Tair\",\n", + " \"decode_times\": \"false\",\n", + " \"histogram_bins\": \"8\"\n", + "}\n", + "\n", + "# Make the POST request\n", + "response = httpx.post(\n", + " f\"{multidim_base_url}/statistics\",\n", + " params=params,\n", + " headers=timeseries_headers,\n", + " json=geojson_data.model_dump(exclude_none=True),\n", + " timeout=None\n", + ")\n", + "\n", + "response.raise_for_status()\n", + "# Check response\n", + "if response.status_code == 200:\n", + " result = response.json()\n", + " print(\"Success!\")\n", + " # print(json.dumps(result, indent=2))\n", + "else:\n", + " print(f\"Error: {response.status_code}\")\n", + " print(response.text)" + ] + }, + { + "cell_type": "markdown", + "id": "e2338578-4c81-423c-9689-f83cdd29072f", + "metadata": {}, + "source": [ + "### Can we produce time series in VEDA UI?\n", + "No, VEDA UI does not have an integration with titiler-multidim for time series generation.\n" + ] + }, + { + "cell_type": "markdown", + "id": 
"9ef3b2a1-3567-4624-8a1a-b209140b6e9a", + "metadata": {}, + "source": [ + "## GHG Center" + ] + }, + { + "cell_type": "markdown", + "id": "0db3296e-934a-429f-abe8-20c10d7640c1", + "metadata": {}, + "source": [ + "### MiCASA (titiler-cmr via GES DISC)" + ] + }, + { + "cell_type": "markdown", + "id": "e391346b-703b-41c2-808e-172fce6e5952", + "metadata": {}, + "source": [ + "### Can we visualize the dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1229c7d6-a65e-4a59-a26d-9154a0a6e82c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Success! Received PNG tile image\n", + "Content-Type: image/jpeg\n", + "Content-Length: 4124 bytes\n" + ] + } + ], + "source": [ + "# Query parameters\n", + "cmr_params = {\n", + " \"scale\": \"1\",\n", + " \"concept_id\": \"C3273639213-GES_DISC\",\n", + " \"datetime\": \"2018-02-12T09:00:00Z\",\n", + " \"variable\": \"NPP\",\n", + " \"backend\": \"xarray\",\n", + " \"colormap_name\": \"purd\",\n", + " \"rescale\": \"0,0.00000008\"\n", + "}\n", + "\n", + "# Make the GET request\n", + "response = httpx.get(\n", + " url=f\"{cmr_base_url}/tiles/WebMercatorQuad/0/0/0\",\n", + " params=cmr_params,\n", + " headers=tile_headers,\n", + " timeout=None\n", + ")\n", + "\n", + "response.raise_for_status()\n", + "\n", + "# Check response\n", + "if response.status_code == 200:\n", + " print(\"Success! Received PNG tile image\")\n", + " print(f\"Content-Type: {response.headers.get('content-type')}\")\n", + " print(f\"Content-Length: {len(response.content)} bytes\")\n", + "else:\n", + " print(f\"Error: {response.status_code}\")\n", + " print(response.text)" + ] + }, + { + "cell_type": "markdown", + "id": "e58e1e01-96f3-4a01-a2d8-8b15335a2f96", + "metadata": {}, + "source": [ + "### Can we visualize it in the UI?\n", + "\n", + "Almost certain this will work given existing GPM IMERG implementation." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7e2c5cc2-0256-4d2f-a2e2-eec84eef91ea", + "metadata": {}, + "source": [ + "### Can we produce time series?\n", + "\n", + "🐛 There is a bug which is being worked on: https://github.com/developmentseed/titiler-cmr/pull/68" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8e82c25f-e306-40fc-bad7-e77e1b7ab8b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response = httpx.post(\n", + " \"https://v4jec6i5c0.execute-api.us-west-2.amazonaws.com/timeseries/statistics\",\n", + " params={\n", + " \"concept_id\": \"C3273639213-GES_DISC\",\n", + " \"datetime\": \"2022-03-01T00:00:01Z/2022-03-10T23:59:59Z\",\n", + " \"step\": \"P1D\",\n", + " \"temporal_mode\": \"point\",\n", + " \"variable\": \"NPP\",\n", + " \"backend\": \"xarray\",\n", + " },\n", + " json=geojson_data.model_dump(exclude_none=True),\n", + " timeout=None,\n", + ")\n", + "\n", + "response.raise_for_status()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0648b760-10ef-4ee2-ba6a-fee8b2593fd1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Success!\n" + ] + } + ], + "source": [ + "# Check the `response` returned by the timeseries POST in the previous cell\n", + "if response.status_code == 200:\n", + " print(\"Success!\")\n", + "else:\n", + " print(f\"Error: {response.status_code}\")\n", + " print(response.text)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "86787056-3f7c-40a7-a669-af8fa717bb2c", + "metadata": {}, + "outputs": [], + "source": [ + "#response.json()" + ] + }, + { + "cell_type": "markdown", + "id": "9e1f8691-f306-4217-bd2c-69338b56eb20", + "metadata": {}, + "source": [ + "### Can we produce time series in the UI?" 
+ ] + }, + { + "cell_type": "markdown", + "id": "a0a95f1f-a3eb-439b-830b-d3b58a9ccd06", + "metadata": {}, + "source": [ + "Hanbyul is currently working on this, see the [veda-ui issue #1727](https://github.com/NASA-IMPACT/veda-ui/issues/1727) and [WIP PR #1747](https://github.com/NASA-IMPACT/veda-ui/pull/1747)." + ] + }, + { + "cell_type": "markdown", + "id": "784fd6b7-e2b0-495e-85eb-b919190bf414", + "metadata": {}, + "source": [ + "## BlueFlux (titiler-multidim via VEDA bucket)\n", + "\n", + "Since the data is maintained by ORNL DAAC, which we don't currently have access to, it was suggested to copy the data into the VEDA SMCE bucket and tile it from there using titiler-multidim.\n", + "\n", + "Looks like it is only 4 files. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93954293-254e-492c-8f62-842aad351f7b", + "metadata": {}, + "outputs": [], + "source": [ + "import earthaccess\n", + "\n", + "earthaccess.login()\n", + "\n", + "granule_results = earthaccess.search_data(\n", + " collection_concept_id=\"C3498325287-ORNL_CLOUD\"\n", + ")\n", + "print(f\"{len(granule_results)} granules found\")\n", + "\n", + "s3_link = granule_results[0].data_links(access=\"direct\")[0]\n", + "s3fs = earthaccess.get_s3_filesystem(daac='ORNLDAAC')\n", + "\n", + "# s3fs.download(s3_link, s3_link.split('/')[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ba0aaab-47a4-4a4c-8867-4c62c9d5a7e6", + "metadata": {}, + "outputs": [], + "source": [ + "#!aws s3 cp blueflux_fco2_micromol_500m_std_v1.nc s3://nasa-eodc-public/BlueFlux/blueflux_fco2_micromol_500m_std_v1.nc" + ] + }, + { + "cell_type": "markdown", + "id": "136f9c0c-e57b-4212-a8cd-e958fef152bb", + "metadata": {}, + "source": [ + "### Can we visualize it?\n", + "\n", + "👷 Probably, but we need to upgrade titiler-multidim to include the `sel` parameter since each file has many dates in it.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"480c4ccf-e19e-4af7-8dbc-c7930f8f84c5", + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "xds = xr.open_dataset('blueflux_fco2_micromol_500m_std_v1.nc')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37be47c5-43ff-443c-b511-60d8c676bcdf", + "metadata": {}, + "outputs": [], + "source": [ + "xds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e840cee-dd29-4005-a67d-10022e58d34e", + "metadata": {}, + "outputs": [], + "source": [ + "xds.fco2_std[100].min().values, xds.fco2_std[100].max().values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f70d004-eb1e-4637-963d-d28a5c20e3e2", + "metadata": {}, + "outputs": [], + "source": [ + "import morecantile\n", + "\n", + "tms = morecantile.tms.get(\"WebMercatorQuad\")\n", + "\n", + "x, y, z = tms.tile(-81, 26, 7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97b5bdc6-ac32-4e0e-9af0-58c2b82af4a2", + "metadata": {}, + "outputs": [], + "source": [ + "x, y, z" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff0b6d5e-68b7-49cf-aaac-d68bbc04e4f3", + "metadata": {}, + "outputs": [], + "source": [ + "tiles_params = {\n", + " \"scale\": \"1\",\n", + " \"format\": \"png\",\n", + " \"url\": \"s3://nasa-eodc-public/BlueFlux/blueflux_fco2_micromol_500m_std_v1.nc\",\n", + " \"variable\": \"fco2_std\",\n", + " \"sel\": \"time=2000-04-10\",\n", + " \"colormap_name\": \"pink\",\n", + " \"rescale\": \"0.04,1.55\"\n", + "}\n", + "\n", + "# Make the GET request for tile\n", + "tiles_response = requests.get(\n", + " url=f\"{multidim_base_url}/tiles/WebMercatorQuad/{z}/{x}/{y}\",\n", + " params=tiles_params\n", + ")\n", + "\n", + "if tiles_response.status_code == 200:\n", + " print(\"Success! 
Received PNG tile image\")\n", + " print(f\"Content-Type: {tiles_response.headers.get('content-type')}\")\n", + " print(f\"Content-Length: {len(tiles_response.content)} bytes\")\n", + " \n", + " #Optionally save the tile image\n", + " with open(\"tile.png\", \"wb\") as f:\n", + " f.write(tiles_response.content)\n", + " print(\"Tile saved as tile.png\")\n", + "else:\n", + " print(f\"Error: {tiles_response.status_code}\")\n", + " print(tiles_response.text)" + ] + }, + { + "cell_type": "markdown", + "id": "4c3bcbbe-6f89-4787-8264-d3e6765518a7", + "metadata": {}, + "source": [ + "### Can we produce timeseries?\n", + "\n", + "👷 Similarly, we need to include the `sel` parameter into the statistics endpoint of titiler-multidim." + ] + }, + { + "cell_type": "markdown", + "id": "76cc8dae-b135-4bc9-8f56-e238e68681d5", + "metadata": {}, + "source": [ + "## Coastal Portal" + ] + }, + { + "cell_type": "markdown", + "id": "13c13e1e-bd3f-4d66-b614-b70021b338a2", + "metadata": {}, + "source": [ + "### MUR SST\n", + "\n", + "We can integrate this dataset visually; it has been previously demonstrated.\n", + "\n", + "Time series should work once titiler-cmr timeseries integration is complete.\n", + "\n", + "👷 However, this will all be slow without the use of a virtual layer." + ] + }, + { + "cell_type": "markdown", + "id": "1c806131-172a-4b1b-a0ea-fa60839366fa", + "metadata": {}, + "source": [ + "## Global Mangrove Aboveground Biomass, Carbon Stocks and Canopy Height\n", + "\n", + "I think we decided this was a no-op since it is a GeoTIFF and must be converted to COG to work with any tiler." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transformation-scripts/dataset-integration-planning/dataset-integration-paths.png b/transformation-scripts/dataset-integration-planning/dataset-integration-paths.png new file mode 100644 index 00000000..b309f37f Binary files /dev/null and b/transformation-scripts/dataset-integration-planning/dataset-integration-paths.png differ diff --git a/transformation-scripts/dataset-integration-planning/dataset-integration-paths.svg b/transformation-scripts/dataset-integration-planning/dataset-integration-paths.svg new file mode 100644 index 00000000..2a13b2b9 --- /dev/null +++ b/transformation-scripts/dataset-integration-planning/dataset-integration-paths.svg @@ -0,0 +1,4 @@ + + +VEDA-owned or publicDAAC (GES DISC, LPDAAC)COGNetCDFicechunkCOGNetCDFdata storemetadataSTAC collectionSTAC collection+ itemsservicestitiler-cmrtitiler-pgstactitiler-multidimVEDA UItiles (concept-id)timeseries(concept-id)tiles (URL)statistics (URL)tiles (search id)statistics (URL)client1. VEDA COG + titiler-pgstac2. VEDA-accessbile NetCDF + titiler-multidim3. Virtual icechunk for VEDA-accessible data + titiler-multidim4. Virtual icechunk for CMR data + titiler-multidim (titiler-cmr in future)5. DAAC COG or NetCDF-4 + titiler-cmredl or requester pays?in-progressneeds workworksDataset integration paths \ No newline at end of file