From 5affcc6f7a18fbf7eb838c4bb2b42a61f49707e5 Mon Sep 17 00:00:00 2001
From: Lachy Groom <lachygroom@gmail.com>
Date: Sat, 14 Feb 2026 18:29:11 -0800
Subject: [PATCH] Fix non-deterministic empty flight fields via aria-label
 fallback

The HTML parser relies on specific CSS class names (e.g. tPgKwe,
mv1WYe, Ak5kof, BbR8Ec) to extract flight details. However, Google
obfuscates these class names differently depending on the browser/TLS
fingerprint. Since primp's chrome_126 impersonation silently falls back
to a random fingerprint, ~25% of requests receive HTML with different
class names, causing airline name, departure/arrival times, duration,
and stops to all come back empty while price still works.

This adds a fallback that parses the aria-label attribute on each
flight item when any CSS selector returns empty data. The aria-label
always contains structured text regardless of fingerprint, e.g.:

  "From 2359 US dollars. Nonstop flight with Alaska. Leaves San Jose
   Mineta International Airport at 2:25 PM on Sunday, February 15 ..."

Also handles U+202F (narrow no-break space) that Google uses between
time digits and AM/PM in aria-labels.

Relates to #7 (same class of bug for price CSS selector) and #63
(duplicate flights from multiple container elements).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 fast_flights/core.py | 98 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
diff --git a/fast_flights/core.py b/fast_flights/core.py
index 977b241e..2c10fcfa 100644
--- a/fast_flights/core.py
+++ b/fast_flights/core.py
@@ -15,6 +15,80 @@
 
 DataSource = Literal['html', 'js']
 
+
+def _parse_aria_label(label: str) -> dict:
+    """Parse flight details from a flight item's aria-label text.
+
+    Used as a fallback when the CSS class names in the HTML differ from what
+    the parser expects. Example of the label text this function understands:
+    'From 2359 US dollars. Nonstop flight with Alaska. Leaves San Jose
+    Mineta International Airport at 2:25 PM on Sunday, February 15 and arrives
+    at Kona International Airport at 6:13 PM on Sunday, February 15. Total
+    duration 5 hr 48 min.'
+
+    Returns a dict with keys 'name', 'departure', 'arrival' and 'duration'
+    (strings, "" when the field is not found) and 'stops' (an int, or the
+    string "Unknown" when not found).
+    """
+    result = {}
+
+    # Airline name: "flight with <airline>. Leaves"
+    m = re.search(r'flight with (.+?)\.\s*Leaves', label)
+    result['name'] = m.group(1) if m else ""
+
+    # A clock time. \s* before AM/PM also matches U+202F (narrow no-break
+    # space); the AM/PM suffix is optional so 24-hour labels ("14:25") parse.
+    clock = r'(\d+:\d+(?:\s*(?:AM|PM))?)'
+    # Departure: "Leaves <airport> at <time> on <day> and arrives"
+    # Arrival: "arrives at <airport> at <time> on <day>. Total"
+    # The greedy .+ skips past airport names that may themselves contain "at".
+    patterns = {
+        'departure': r'Leaves .+ at ' + clock + r' on (.+?) and arrives',
+        'arrival': r'arrives .+ at ' + clock + r' on (.+?)\.\s*Total',
+    }
+    for key, pattern in patterns.items():
+        m = re.search(pattern, label)
+        if m:
+            # Collapse any whitespace run (incl. U+202F) to one plain space.
+            time_str = re.sub(r'\s+', ' ', m.group(1))
+            result[key] = f"{time_str} on {_shorten_date(m.group(2))}"
+        else:
+            result[key] = ""
+
+    # Duration: "Total duration <dur>."
+    m = re.search(r'Total duration (.+?)\.', label)
+    result['duration'] = m.group(1) if m else ""
+
+    # Stops: "Nonstop" or "1 stop" or "2 stops"
+    m = re.search(r'(Nonstop|\d+ stops?)\s+flight', label)
+    if m:
+        stops_text = m.group(1)
+        result['stops'] = 0 if stops_text == "Nonstop" else int(stops_text.split()[0])
+    else:
+        result['stops'] = "Unknown"
+
+    return result
+
+
+_DAY_ABBREVS = {
+    day: day[:3]
+    for day in ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
+                'Saturday', 'Sunday')
+}
+_MONTH_ABBREVS = {
+    month: month[:3]
+    for month in ('January', 'February', 'March', 'April', 'May', 'June',
+                  'July', 'August', 'September', 'October',
+                  'November', 'December')
+}
+
+
+def _shorten_date(date_str: str) -> str:
+    """Shorten e.g. 'Sunday, February 15' to 'Sun, Feb 15'."""
+    for long_name, short_name in (*_DAY_ABBREVS.items(), *_MONTH_ABBREVS.items()):
+        date_str = date_str.replace(long_name, short_name)
+    return date_str
+
 # Default cookies embedded into the app to help bypass common consent gating.
 # These are used only if the caller does not supply cookies (binary) and
 # does not provide cookies via request_kwargs.
@@ -275,6 +349,30 @@ def safe(n: Optional[LexborNode]):
             except ValueError:
                 stops_fmt = "Unknown"
 
+            # Fallback: if CSS selectors missed any key fields, parse from
+            # aria-label. Google serves different obfuscated class names
+            # depending on the browser/TLS fingerprint, but aria-label always
+            # contains structured flight data regardless.
+            if not name or not departure_time or not arrival_time or not duration or stops_fmt == "Unknown":
+                # Check the item element itself first, then descendants
+                aria = item.attributes.get("aria-label", "") or ""
+                if not aria or "flight" not in aria:
+                    aria_el = item.css_first("[aria-label*='flight']")
+                    if aria_el:
+                        aria = aria_el.attributes.get("aria-label", "") or ""
+                if aria and "flight" in aria:
+                    parsed = _parse_aria_label(aria)
+                    if not name:
+                        name = parsed.get('name', '')
+                    if not departure_time:
+                        departure_time = parsed.get('departure', '')
+                    if not arrival_time:
+                        arrival_time = parsed.get('arrival', '')
+                    if not duration:
+                        duration = parsed.get('duration', '')
+                    if stops_fmt == "Unknown":
+                        stops_fmt = parsed.get('stops', 'Unknown')
+
             flights.append(
                 {
                     "is_best": is_best_flight,