cal-itp · shweta487 · Dec 31, 2025 · Jan 5, 2026
diff --git a/transit_provider_dashboard/01_agency_grain_census.ipynb b/transit_provider_dashboard/01_agency_grain_census.ipynb
diff --git a/transit_provider_dashboard/01_prepare_acs_data.ipynb b/transit_provider_dashboard/01_prepare_acs_data.ipynb
diff --git a/transit_provider_dashboard/02_ntd_data_integration.ipynb b/transit_provider_dashboard/02_ntd_data_integration.ipynb
diff --git a/transit_provider_dashboard/02_prepare_orgs_ridership_data.ipynb b/transit_provider_dashboard/02_prepare_orgs_ridership_data.ipynb
@@ -0,0 +1,280 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "38be2998-be45-482f-895d-8b7b77c233b4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: shared_utils in /opt/conda/lib/python3.11/site-packages (4.2)\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "pip install shared_utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "85934cb5-af8d-4a54-a7b7-b1800dd2b03e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.append('../ahsc_grant')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a4b318d7-230c-4a36-9406-87d3e541b6bc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.11/site-packages/dask/dataframe/__init__.py:31: FutureWarning: \n",
+      "Dask dataframe query planning is disabled because dask-expr is not installed.\n",
+      "\n",
+      "You can install it with `pip install dask[dataframe]` or `conda install dask`.\n",
+      "This will raise in a future version.\n",
+      "\n",
+      "  warnings.warn(msg, FutureWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd \n",
+    "import geopandas as gpd\n",
+    "import google.auth\n",
+    "import os\n",
+    "import gcsfs\n",
+    "import requests\n",
+    "from calitp_data_analysis.sql import get_engine\n",
+    "from shared_utils import schedule_rt_utils \n",
+    "from gtfs_key_ntd_crosswalk import filter_to_valid_dates\n",
+    "db_engine = get_engine()\n",
+    "credentials, project = google.auth.default()\n",
+    "fs = gcsfs.GCSFileSystem()\n",
+    "\n",
+    "pd.set_option('display.max_columns', None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "14f225e9-2636-4fae-b568-4e0d554cebad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses'\n",
+    "analysis_date = \"2025-10-16\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "0937c3fe-837b-4ace-90b2-c68a07ccdb53",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Query dim_organizations, then keep only current, publicly operating organizations\n",
+    "with db_engine.connect() as connection:\n",
+    "    query = \"\"\"\n",
+    "        SELECT\n",
+    "            key, name, source_record_id, organization_type, ntd_id, ntd_id_2022, ntd_agency_info_key, \n",
+    "            public_currently_operating, _is_current, _valid_from, _valid_to\n",
+    "        FROM \n",
+    "            cal-itp-data-infra.mart_transit_database.dim_organizations\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    # Localize the _valid_from and _valid_to timestamp columns\n",
+    "    dim_orgs = (\n",
+    "        pd.read_sql(query, connection)\n",
+    "        .pipe(schedule_rt_utils.localize_timestamp_col, [\"_valid_from\", \"_valid_to\"])\n",
+    "    )\n",
+    "    \n",
+    "    \n",
+    "    dim_orgs = dim_orgs[\n",
+    "        (dim_orgs['public_currently_operating'] == True) & \n",
+    "        (dim_orgs['_is_current'] == True)\n",
+    "    ].reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "2f48a88b-fe63-4e66-bce1-19ce467be773",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter the organization records with filter_to_valid_dates for analysis_date\n",
+    "valid_organization_full = filter_to_valid_dates(dim_orgs, [analysis_date])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ef62384e-285d-482c-b197-043523472fdf",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 218 entries, 0 to 217\n",
+      "Data columns (total 13 columns):\n",
+      " #   Column                      Non-Null Count  Dtype              \n",
+      "---  ------                      --------------  -----              \n",
+      " 0   key                         218 non-null    object             \n",
+      " 1   name                        218 non-null    object             \n",
+      " 2   source_record_id            218 non-null    object             \n",
+      " 3   organization_type           218 non-null    object             \n",
+      " 4   ntd_id                      179 non-null    object             \n",
+      " 5   ntd_id_2022                 181 non-null    object             \n",
+      " 6   ntd_agency_info_key         159 non-null    object             \n",
+      " 7   public_currently_operating  218 non-null    object             \n",
+      " 8   _is_current                 218 non-null    bool               \n",
+      " 9   _valid_from                 218 non-null    datetime64[ns, UTC]\n",
+      " 10  _valid_to                   218 non-null    datetime64[ns, UTC]\n",
+      " 11  _valid_from_local           218 non-null    datetime64[ns]     \n",
+      " 12  _valid_to_local             218 non-null    datetime64[ns]     \n",
+      "dtypes: bool(1), datetime64[ns, UTC](2), datetime64[ns](2), object(8)\n",
+      "memory usage: 20.8+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "valid_organization_full.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "56dc63b7-cd87-4d59-98fb-b5dc601012a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Query 2023 NTD annual service data (UPT and VOMS) for California agencies\n",
+    "with db_engine.connect() as connection:\n",
+    "    query = \"\"\"\n",
+    "        SELECT\n",
+    "            agency, ntd_id, reporter_type, report_year, primary_uza_name, unlinked_passenger_trips_upt, agency_voms\n",
+    "        FROM \n",
+    "            cal-itp-data-infra.mart_ntd.dim_annual_service_agencies\n",
+    "        WHERE \n",
+    "            state = 'CA' AND report_year = 2023\n",
+    "    \"\"\"\n",
+    "    ridership_data= pd.read_sql(query, connection)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "db017035-7b98-48a0-9a63-5fcf144e63d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sum unlinked passenger trips and agency VOMS for each agency/ntd_id pair\n",
+    "ridership_data_grouped = ridership_data.groupby(\n",
+    "        [\n",
+    "            \"agency\",\n",
+    "            \"ntd_id\",           \n",
+    "        ]\n",
+    "    ).agg({\n",
+    "        \"unlinked_passenger_trips_upt\":\"sum\",\n",
+    "        \"agency_voms\":\"sum\"\n",
+    "    }).sort_values(by=\"ntd_id\").reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "7ca7eb20-8891-4423-93e9-c6eba447d2b4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 209 entries, 0 to 208\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column                        Non-Null Count  Dtype  \n",
+      "---  ------                        --------------  -----  \n",
+      " 0   agency                        209 non-null    object \n",
+      " 1   ntd_id                        209 non-null    object \n",
+      " 2   unlinked_passenger_trips_upt  209 non-null    float64\n",
+      " 3   agency_voms                   209 non-null    float64\n",
+      "dtypes: float64(2), object(2)\n",
+      "memory usage: 6.7+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "ridership_data_grouped.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "05d58a95-5e4a-43e1-861d-5fbaa8abfa33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the grouped ridership data to GCS as a parquet file\n",
+    "with fs.open(f\"{GCS_FILE_PATH}/transit_provider_dashboard/ridership_data.parquet\", \"wb\") as f:\n",
+    "    ridership_data_grouped.to_parquet(f, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "38dd8fb5-5d8a-43e1-b21a-0f05ecfe70fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the filtered organization data to GCS as a parquet file\n",
+    "with fs.open(f\"{GCS_FILE_PATH}/transit_provider_dashboard/organization_data_2025_10_16.parquet\", \"wb\") as f:\n",
+    "    valid_organization_full.to_parquet(f, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00e74d09-a07e-4f42-af18-1a51a9139398",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}