From 5affcc6f7a18fbf7eb838c4bb2b42a61f49707e5 Mon Sep 17 00:00:00 2001 From: Lachy Groom Date: Sat, 14 Feb 2026 18:29:11 -0800 Subject: [PATCH] Fix non-deterministic empty flight fields via aria-label fallback The HTML parser relies on specific CSS class names (e.g. tPgKwe, mv1WYe, Ak5kof, BbR8Ec) to extract flight details. However, Google obfuscates these class names differently depending on the browser/TLS fingerprint. Since primp's chrome_126 impersonation silently falls back to a random fingerprint, ~25% of requests receive HTML with different class names, causing airline name, departure/arrival times, duration, and stops to all come back empty while price still works. This adds a fallback that parses the aria-label attribute on each flight item when any CSS selector returns empty data. The aria-label always contains structured text regardless of fingerprint, e.g.: "From 2359 US dollars. Nonstop flight with Alaska. Leaves San Jose Mineta International Airport at 2:25 PM on Sunday, February 15 ..." Also handles U+202F (narrow no-break space) that Google uses between time digits and AM/PM in aria-labels. Relates to #7 (same class of bug for price CSS selector) and #63 (duplicate flights from multiple container elements). Co-Authored-By: Claude Opus 4.6 --- fast_flights/core.py | 98 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/fast_flights/core.py b/fast_flights/core.py index 977b241e..2c10fcfa 100644 --- a/fast_flights/core.py +++ b/fast_flights/core.py @@ -15,6 +15,80 @@ DataSource = Literal['html', 'js'] + +def _parse_aria_label(label: str) -> dict: + """Parse flight details from aria-label as fallback when CSS selectors fail. + + Google Flights always includes a structured aria-label on each flight item, + e.g.: 'From 2359 US dollars. Nonstop flight with Alaska. 
Leaves San Jose + Mineta International Airport at 2:25 PM on Sunday, February 15 and arrives + at Kona International Airport at 6:13 PM on Sunday, February 15. Total + duration 5 hr 48 min.' + + This is used as a fallback when the CSS class names in the HTML differ from + what the parser expects (Google obfuscates class names differently depending + on the browser fingerprint / TLS fingerprint used by the HTTP client). + """ + result = {} + + # Airline name: "flight with <airline>. Leaves" + m = re.search(r'flight with (.+?)\.\s*Leaves', label) + result['name'] = m.group(1) if m else "" + + # Departure time: "Leaves at