Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,762 changes: 0 additions & 2,762 deletions transit_provider_dashboard/01_agency_grain_census.ipynb

This file was deleted.

897 changes: 897 additions & 0 deletions transit_provider_dashboard/01_prepare_acs_data.ipynb

Large diffs are not rendered by default.

1,336 changes: 0 additions & 1,336 deletions transit_provider_dashboard/02_ntd_data_integration.ipynb

This file was deleted.

280 changes: 280 additions & 0 deletions transit_provider_dashboard/02_prepare_orgs_ridership_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "38be2998-be45-482f-895d-8b7b77c233b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: shared_utils in /opt/conda/lib/python3.11/site-packages (4.2)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Use the explicit %pip magic so the install targets the running kernel's\n",
"# environment and does not depend on IPython automagic being enabled.\n",
"%pip install shared_utils"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "85934cb5-af8d-4a54-a7b7-b1800dd2b03e",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"\n",
"# Add the sibling ahsc_grant directory to the import path; presumably this is\n",
"# where gtfs_key_ntd_crosswalk (imported in the next cell) lives - TODO confirm.\n",
"sys.path.append('../ahsc_grant')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a4b318d7-230c-4a36-9406-87d3e541b6bc",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.11/site-packages/dask/dataframe/__init__.py:31: FutureWarning: \n",
"Dask dataframe query planning is disabled because dask-expr is not installed.\n",
"\n",
"You can install it with `pip install dask[dataframe]` or `conda install dask`.\n",
"This will raise in a future version.\n",
"\n",
" warnings.warn(msg, FutureWarning)\n"
]
}
],
"source": [
"import pandas as pd \n",
"import geopandas as gpd\n",
"import google.auth\n",
"import os\n",
"import gcsfs\n",
"import requests\n",
"from calitp_data_analysis.sql import get_engine\n",
"from shared_utils import schedule_rt_utils \n",
"from gtfs_key_ntd_crosswalk import filter_to_valid_dates\n",
"\n",
"# Engine for the SQL queries below (used via db_engine.connect() + pd.read_sql).\n",
"db_engine = get_engine()\n",
"# Resolve Google application-default credentials and the active project.\n",
"# NOTE(review): neither name is referenced again in the cells shown here.\n",
"credentials, project = google.auth.default()\n",
"# GCS filesystem handle used to write the parquet outputs at the end.\n",
"fs = gcsfs.GCSFileSystem()\n",
"\n",
"# Show every column when displaying wide DataFrames.\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "14f225e9-2636-4fae-b568-4e0d554cebad",
"metadata": {},
"outputs": [],
"source": [
"# Root of the analytics bucket; this notebook writes its outputs under\n",
"# the transit_provider_dashboard/ prefix below this root.\n",
"GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses'\n",
"# Date (YYYY-MM-DD) handed to filter_to_valid_dates to select organization rows.\n",
"analysis_date = \"2025-10-16\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0937c3fe-837b-4ace-90b2-c68a07ccdb53",
"metadata": {},
"outputs": [],
"source": [
"# Load dim_organizations from the warehouse, then keep only organizations that\n",
"# are publicly operating and flagged as the current record.\n",
"query = \"\"\"\n",
" SELECT\n",
" key, name, source_record_id, organization_type, ntd_id, ntd_id_2022, ntd_agency_info_key, \n",
" public_currently_operating, _is_current, _valid_from, _valid_to\n",
" FROM \n",
" cal-itp-data-infra.mart_transit_database.dim_organizations\n",
" \"\"\"\n",
"\n",
"# Only the read itself needs the open connection.\n",
"with db_engine.connect() as connection:\n",
"    dim_orgs_raw = pd.read_sql(query, connection)\n",
"\n",
"# Localize the two validity timestamps (helper from shared_utils; presumably it\n",
"# adds the *_local columns seen in the .info() output further down - TODO confirm).\n",
"dim_orgs_localized = schedule_rt_utils.localize_timestamp_col(\n",
"    dim_orgs_raw, [\"_valid_from\", \"_valid_to\"]\n",
")\n",
"\n",
"# Explicit comparison against True is kept on purpose: public_currently_operating\n",
"# is an object-dtype column, so it cannot be used directly as a boolean mask.\n",
"is_operating = dim_orgs_localized['public_currently_operating'].eq(True)\n",
"is_current = dim_orgs_localized['_is_current'].eq(True)\n",
"dim_orgs = dim_orgs_localized[is_operating & is_current].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2f48a88b-fe63-4e66-bce1-19ce467be773",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the organization rows that filter_to_valid_dates selects for\n",
"# analysis_date (helper imported from gtfs_key_ntd_crosswalk; its exact rule is\n",
"# not visible in this notebook).\n",
"valid_organization_full = filter_to_valid_dates(dim_orgs, [analysis_date])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ef62384e-285d-482c-b197-043523472fdf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 218 entries, 0 to 217\n",
"Data columns (total 13 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 key 218 non-null object \n",
" 1 name 218 non-null object \n",
" 2 source_record_id 218 non-null object \n",
" 3 organization_type 218 non-null object \n",
" 4 ntd_id 179 non-null object \n",
" 5 ntd_id_2022 181 non-null object \n",
" 6 ntd_agency_info_key 159 non-null object \n",
" 7 public_currently_operating 218 non-null object \n",
" 8 _is_current 218 non-null bool \n",
" 9 _valid_from 218 non-null datetime64[ns, UTC]\n",
" 10 _valid_to 218 non-null datetime64[ns, UTC]\n",
" 11 _valid_from_local 218 non-null datetime64[ns] \n",
" 12 _valid_to_local 218 non-null datetime64[ns] \n",
"dtypes: bool(1), datetime64[ns, UTC](2), datetime64[ns](2), object(8)\n",
"memory usage: 20.8+ KB\n"
]
}
],
"source": [
"# Inspect row count, dtypes and null counts of the date-filtered organizations.\n",
"valid_organization_full.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "56dc63b7-cd87-4d59-98fb-b5dc601012a6",
"metadata": {},
"outputs": [],
"source": [
"# NTD annual service data, restricted by the query to state = 'CA' and\n",
"# report_year = 2023.\n",
"query = \"\"\"\n",
" SELECT\n",
" agency, ntd_id, reporter_type, report_year, primary_uza_name, unlinked_passenger_trips_upt, agency_voms\n",
" FROM \n",
" cal-itp-data-infra.mart_ntd.dim_annual_service_agencies\n",
" WHERE \n",
" state = 'CA' AND report_year = 2023\n",
" \"\"\"\n",
"\n",
"# Only the read itself needs the open connection.\n",
"with db_engine.connect() as connection:\n",
"    ridership_data = pd.read_sql(query, connection)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "db017035-7b98-48a0-9a63-5fcf144e63d0",
"metadata": {},
"outputs": [],
"source": [
"# Sum unlinked_passenger_trips_upt and agency_voms for every (agency, ntd_id)\n",
"# pair, order the result by ntd_id, and turn the group keys back into columns.\n",
"ridership_data_grouped = (\n",
"    ridership_data\n",
"    .groupby([\"agency\", \"ntd_id\"])[[\"unlinked_passenger_trips_upt\", \"agency_voms\"]]\n",
"    .sum()\n",
"    .sort_values(by=\"ntd_id\")\n",
"    .reset_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7ca7eb20-8891-4423-93e9-c6eba447d2b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 209 entries, 0 to 208\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 agency 209 non-null object \n",
" 1 ntd_id 209 non-null object \n",
" 2 unlinked_passenger_trips_upt 209 non-null float64\n",
" 3 agency_voms 209 non-null float64\n",
"dtypes: float64(2), object(2)\n",
"memory usage: 6.7+ KB\n"
]
}
],
"source": [
"# Inspect row count, dtypes and null counts of the per-agency ridership totals.\n",
"ridership_data_grouped.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "05d58a95-5e4a-43e1-861d-5fbaa8abfa33",
"metadata": {},
"outputs": [],
"source": [
"# Write the per-agency ridership totals to the GCS bucket as parquet.\n",
"ridership_path = f\"{GCS_FILE_PATH}/transit_provider_dashboard/ridership_data.parquet\"\n",
"with fs.open(ridership_path, \"wb\") as out_file:\n",
"    ridership_data_grouped.to_parquet(out_file, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "38dd8fb5-5d8a-43e1-b21a-0f05ecfe70fd",
"metadata": {},
"outputs": [],
"source": [
"# Write the date-filtered organizations to the GCS bucket as parquet.\n",
"# The file-name suffix is derived from analysis_date so the stored file can never\n",
"# drift from the date actually used for filtering\n",
"# (\"2025-10-16\" -> organization_data_2025_10_16.parquet).\n",
"date_suffix = analysis_date.replace(\"-\", \"_\")\n",
"with fs.open(f\"{GCS_FILE_PATH}/transit_provider_dashboard/organization_data_{date_suffix}.parquet\", \"wb\") as f:\n",
"    valid_organization_full.to_parquet(f, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00e74d09-a07e-4f42-af18-1a51a9139398",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading
Loading