Skip to content

Commit 73bc497

Browse files
committed
Refactor transit provider dashboard code to add more organizations
1 parent 6b50d44 commit 73bc497

File tree

8 files changed

+4356
-4098
lines changed

8 files changed

+4356
-4098
lines changed

transit_provider_dashboard/01_agency_grain_census.ipynb

Lines changed: 0 additions & 2762 deletions
This file was deleted.

transit_provider_dashboard/01_prepare_acs_data.ipynb

Lines changed: 897 additions & 0 deletions
Large diffs are not rendered by default.

transit_provider_dashboard/02_ntd_data_integration.ipynb

Lines changed: 0 additions & 1336 deletions
This file was deleted.
Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "38be2998-be45-482f-895d-8b7b77c233b4",
7+
"metadata": {},
8+
"outputs": [
9+
{
10+
"name": "stdout",
11+
"output_type": "stream",
12+
"text": [
13+
"Requirement already satisfied: shared_utils in /opt/conda/lib/python3.11/site-packages (4.2)\n",
14+
"Note: you may need to restart the kernel to use updated packages.\n"
15+
]
16+
}
17+
],
18+
"source": [
19+
"%pip install shared_utils"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 2,
25+
"id": "85934cb5-af8d-4a54-a7b7-b1800dd2b03e",
26+
"metadata": {},
27+
"outputs": [],
28+
"source": [
29+
"import sys\n",
30+
"sys.path.append('../ahsc_grant')"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 3,
36+
"id": "a4b318d7-230c-4a36-9406-87d3e541b6bc",
37+
"metadata": {},
38+
"outputs": [
39+
{
40+
"name": "stderr",
41+
"output_type": "stream",
42+
"text": [
43+
"/opt/conda/lib/python3.11/site-packages/dask/dataframe/__init__.py:31: FutureWarning: \n",
44+
"Dask dataframe query planning is disabled because dask-expr is not installed.\n",
45+
"\n",
46+
"You can install it with `pip install dask[dataframe]` or `conda install dask`.\n",
47+
"This will raise in a future version.\n",
48+
"\n",
49+
" warnings.warn(msg, FutureWarning)\n"
50+
]
51+
}
52+
],
53+
"source": [
54+
"import pandas as pd \n",
55+
"import geopandas as gpd\n",
56+
"import google.auth\n",
57+
"import os\n",
58+
"import gcsfs\n",
59+
"import requests\n",
60+
"from calitp_data_analysis.sql import get_engine\n",
61+
"from shared_utils import schedule_rt_utils \n",
62+
"from gtfs_key_ntd_crosswalk import filter_to_valid_dates\n",
63+
"db_engine = get_engine()\n",
64+
"credentials, project = google.auth.default()\n",
65+
"fs = gcsfs.GCSFileSystem()\n",
66+
"\n",
67+
"pd.set_option('display.max_columns', None)"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": 4,
73+
"id": "14f225e9-2636-4fae-b568-4e0d554cebad",
74+
"metadata": {},
75+
"outputs": [],
76+
"source": [
77+
"GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses'\n",
78+
"analysis_date = \"2025-10-16\""
79+
]
80+
},
81+
{
82+
"cell_type": "code",
83+
"execution_count": 5,
84+
"id": "0937c3fe-837b-4ace-90b2-c68a07ccdb53",
85+
"metadata": {},
86+
"outputs": [],
87+
"source": [
88+
"# Querying dim organization\n",
89+
"with db_engine.connect() as connection:\n",
90+
" query = \"\"\"\n",
91+
" SELECT\n",
92+
" key, name, source_record_id, organization_type, ntd_id, ntd_id_2022, ntd_agency_info_key, \n",
93+
" public_currently_operating, _is_current, _valid_from, _valid_to\n",
94+
" FROM \n",
95+
" cal-itp-data-infra.mart_transit_database.dim_organizations\n",
96+
" \"\"\"\n",
97+
" \n",
98+
" #localize timestamps\n",
99+
" dim_orgs = (\n",
100+
" pd.read_sql(query, connection)\n",
101+
" .pipe(schedule_rt_utils.localize_timestamp_col, [\"_valid_from\", \"_valid_to\"])\n",
102+
" )\n",
103+
" \n",
104+
" \n",
105+
" dim_orgs = dim_orgs[\n",
106+
" (dim_orgs['public_currently_operating'] == True) & \n",
107+
" (dim_orgs['_is_current'] == True)\n",
108+
" ].reset_index(drop=True)"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 6,
114+
"id": "2f48a88b-fe63-4e66-bce1-19ce467be773",
115+
"metadata": {},
116+
"outputs": [],
117+
"source": [
118+
"# Filtering the provider gtfs data to valid dates \n",
119+
"valid_organization_full = filter_to_valid_dates(dim_orgs, [analysis_date])"
120+
]
121+
},
122+
{
123+
"cell_type": "code",
124+
"execution_count": 7,
125+
"id": "ef62384e-285d-482c-b197-043523472fdf",
126+
"metadata": {},
127+
"outputs": [
128+
{
129+
"name": "stdout",
130+
"output_type": "stream",
131+
"text": [
132+
"<class 'pandas.core.frame.DataFrame'>\n",
133+
"RangeIndex: 218 entries, 0 to 217\n",
134+
"Data columns (total 13 columns):\n",
135+
" # Column Non-Null Count Dtype \n",
136+
"--- ------ -------------- ----- \n",
137+
" 0 key 218 non-null object \n",
138+
" 1 name 218 non-null object \n",
139+
" 2 source_record_id 218 non-null object \n",
140+
" 3 organization_type 218 non-null object \n",
141+
" 4 ntd_id 179 non-null object \n",
142+
" 5 ntd_id_2022 181 non-null object \n",
143+
" 6 ntd_agency_info_key 159 non-null object \n",
144+
" 7 public_currently_operating 218 non-null object \n",
145+
" 8 _is_current 218 non-null bool \n",
146+
" 9 _valid_from 218 non-null datetime64[ns, UTC]\n",
147+
" 10 _valid_to 218 non-null datetime64[ns, UTC]\n",
148+
" 11 _valid_from_local 218 non-null datetime64[ns] \n",
149+
" 12 _valid_to_local 218 non-null datetime64[ns] \n",
150+
"dtypes: bool(1), datetime64[ns, UTC](2), datetime64[ns](2), object(8)\n",
151+
"memory usage: 20.8+ KB\n"
152+
]
153+
}
154+
],
155+
"source": [
156+
"valid_organization_full.info()"
157+
]
158+
},
159+
{
160+
"cell_type": "code",
161+
"execution_count": 9,
162+
"id": "56dc63b7-cd87-4d59-98fb-b5dc601012a6",
163+
"metadata": {},
164+
"outputs": [],
165+
"source": [
166+
"# Querying NTD Ridership data \n",
167+
"with db_engine.connect() as connection:\n",
168+
" query = \"\"\"\n",
169+
" SELECT\n",
170+
" agency, ntd_id, reporter_type, report_year, primary_uza_name, unlinked_passenger_trips_upt, agency_voms\n",
171+
" FROM \n",
172+
" cal-itp-data-infra.mart_ntd.dim_annual_service_agencies\n",
173+
" WHERE \n",
174+
" state = 'CA' AND report_year = 2023\n",
175+
" \"\"\"\n",
176+
" ridership_data= pd.read_sql(query, connection)"
177+
]
178+
},
179+
{
180+
"cell_type": "code",
181+
"execution_count": 10,
182+
"id": "db017035-7b98-48a0-9a63-5fcf144e63d0",
183+
"metadata": {},
184+
"outputs": [],
185+
"source": [
186+
"# Grouping ridership data for each agency/ntd_id\n",
187+
"ridership_data_grouped = ridership_data.groupby(\n",
188+
" [\n",
189+
" \"agency\",\n",
190+
" \"ntd_id\", \n",
191+
" ]\n",
192+
" ).agg({\n",
193+
" \"unlinked_passenger_trips_upt\":\"sum\",\n",
194+
" \"agency_voms\":\"sum\"\n",
195+
" }).sort_values(by=\"ntd_id\").reset_index()"
196+
]
197+
},
198+
{
199+
"cell_type": "code",
200+
"execution_count": 11,
201+
"id": "7ca7eb20-8891-4423-93e9-c6eba447d2b4",
202+
"metadata": {},
203+
"outputs": [
204+
{
205+
"name": "stdout",
206+
"output_type": "stream",
207+
"text": [
208+
"<class 'pandas.core.frame.DataFrame'>\n",
209+
"RangeIndex: 209 entries, 0 to 208\n",
210+
"Data columns (total 4 columns):\n",
211+
" # Column Non-Null Count Dtype \n",
212+
"--- ------ -------------- ----- \n",
213+
" 0 agency 209 non-null object \n",
214+
" 1 ntd_id 209 non-null object \n",
215+
" 2 unlinked_passenger_trips_upt 209 non-null float64\n",
216+
" 3 agency_voms 209 non-null float64\n",
217+
"dtypes: float64(2), object(2)\n",
218+
"memory usage: 6.7+ KB\n"
219+
]
220+
}
221+
],
222+
"source": [
223+
"ridership_data_grouped.info()"
224+
]
225+
},
226+
{
227+
"cell_type": "code",
228+
"execution_count": 12,
229+
"id": "05d58a95-5e4a-43e1-861d-5fbaa8abfa33",
230+
"metadata": {},
231+
"outputs": [],
232+
"source": [
233+
"# Store the grouped ridership data as a parquet file in GCS\n",
234+
"with fs.open(f\"{GCS_FILE_PATH}/transit_provider_dashboard/ridership_data.parquet\", \"wb\") as f:\n",
235+
" ridership_data_grouped.to_parquet(f, index=False)"
236+
]
237+
},
238+
{
239+
"cell_type": "code",
240+
"execution_count": 8,
241+
"id": "38dd8fb5-5d8a-43e1-b21a-0f05ecfe70fd",
242+
"metadata": {},
243+
"outputs": [],
244+
"source": [
245+
"# Store the organization data as a parquet file in GCS; the date suffix in the file name is derived from analysis_date\n",
246+
"with fs.open(f\"{GCS_FILE_PATH}/transit_provider_dashboard/organization_data_{analysis_date.replace('-', '_')}.parquet\", \"wb\") as f:\n",
247+
" valid_organization_full.to_parquet(f, index=False)"
248+
]
249+
},
250+
{
251+
"cell_type": "code",
252+
"execution_count": null,
253+
"id": "00e74d09-a07e-4f42-af18-1a51a9139398",
254+
"metadata": {},
255+
"outputs": [],
256+
"source": []
257+
}
258+
],
259+
"metadata": {
260+
"kernelspec": {
261+
"display_name": "Python 3 (ipykernel)",
262+
"language": "python",
263+
"name": "python3"
264+
},
265+
"language_info": {
266+
"codemirror_mode": {
267+
"name": "ipython",
268+
"version": 3
269+
},
270+
"file_extension": ".py",
271+
"mimetype": "text/x-python",
272+
"name": "python",
273+
"nbconvert_exporter": "python",
274+
"pygments_lexer": "ipython3",
275+
"version": "3.11.10"
276+
}
277+
},
278+
"nbformat": 4,
279+
"nbformat_minor": 5
280+
}

0 commit comments

Comments
 (0)