diff --git a/aw_transform/split_url_events.py b/aw_transform/split_url_events.py
index 78ec93f..8a4466f 100644
--- a/aw_transform/split_url_events.py
+++ b/aw_transform/split_url_events.py
@@ -14,11 +14,16 @@ def split_url_events(events: List[Event]) -> List[Event]:
             url = event.data["url"]
             parsed_url = urlparse(url)
             event.data["$protocol"] = parsed_url.scheme
-            event.data["$domain"] = (
-                parsed_url.netloc[4:]
-                if parsed_url.netloc[:4] == "www."
-                else parsed_url.netloc
-            )
+            netloc = parsed_url.netloc
+            if netloc:
+                domain = netloc[4:] if netloc[:4] == "www." else netloc
+            elif parsed_url.scheme:
+                # For URLs without a domain (e.g. file://, about:),
+                # use the scheme as domain so they don't all cluster as empty.
+                domain = parsed_url.scheme
+            else:
+                domain = ""
+            event.data["$domain"] = domain
             event.data["$path"] = parsed_url.path
             event.data["$params"] = parsed_url.params
             event.data["$options"] = parsed_url.query
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
index 1d06e39..3aa119c 100644
--- a/tests/test_transforms.py
+++ b/tests/test_transforms.py
@@ -345,12 +345,22 @@ def test_url_parse_event():
     result = split_url_events([e3])
     print(result)
     assert result[0].data["$protocol"] == "file"
-    assert result[0].data["$domain"] == ""
+    assert result[0].data["$domain"] == "file"
     assert result[0].data["$path"] == "/home/johan/myfile.txt"
     assert result[0].data["$params"] == ""
     assert result[0].data["$options"] == ""
     assert result[0].data["$identifier"] == ""
 
+    # Test about: URLs
+    e4 = Event(
+        data={"url": "about:blank"},
+        timestamp=now,
+        duration=timedelta(seconds=1),
+    )
+    result = split_url_events([e4])
+    assert result[0].data["$protocol"] == "about"
+    assert result[0].data["$domain"] == "about"
+
 
 
 def test_union():
     now = datetime.now(timezone.utc)