From f614994dcb3117faa79603153ab753f4dd37f436 Mon Sep 17 00:00:00 2001 From: Andrew Luhring Date: Thu, 4 Jun 2026 23:50:26 -0500 Subject: [PATCH 1/5] feature: support rfc-compliant bracketed ipv6 addresses in URLs. Updates URL parsing to support RFC 3986 and RFC 6874 bracketed IPv6 addresses within a valid URL scheme. - replaces the for loop in find_authority_end with a while let loop so that the ipv6 branch can advance the same iterator while it extracts the address (this apparently also allows rust to use a jump table) - when an open bracket is encountered, standard domain validation is suspended. the parser validates ipv6 characters -- including an optional zone identifier introduced by a percent sign -- until the closing bracket is found - maintains near zero performance regression on standard plaintext by ensuring normal characters bypass the ipv6 branch - adds dedicated ipv6 benchmark cases and a substantial number of ipv6 unit tests - centralizes some testing utilities into tests/common/mod.rs to prevent duplicating test logic/functionality. - clippy remains undefeated NOTE: this adds support for bracketed ipv6 addresses only in URLs that have a valid scheme -- schemeless bracketed IPs are intentionally excluded to prevent performance regressions on standard text. 
--- benches/bench.rs | 25 ++++- src/domains.rs | 67 ++++++++++-- tests/common/mod.rs | 20 +++- tests/ipv6.rs | 251 ++++++++++++++++++++++++++++++++++++++++++++ tests/url.rs | 23 +--- 5 files changed, 357 insertions(+), 29 deletions(-) create mode 100644 tests/ipv6.rs diff --git a/benches/bench.rs b/benches/bench.rs index ffed341..bccc213 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -60,11 +60,34 @@ fn some_links_without_scheme(c: &mut Criterion) { }); } +fn ipv6_links(c: &mut Criterion) { + let finder = LinkFinder::new(); + + // performance of a standard, valid ipv6 link + c.bench_function("ipv6_standard", |b| { + b.iter(|| { + let links = finder.links("http://[2001:db8::1]:8080/path"); + assert_eq!(links.count(), 1); + }) + }); + + c.bench_function("ipv6_unclosed_bracket_fail_fast", |b| { + b.iter(|| { + let links = finder.links( + "http://[2001:db8::1_malformed_text_continues_on_and_on\ + notice how it just keeps going on but doesnt end because there hasnt been a closing bracket", + ); + assert_eq!(links.count(), 0); + }) + }); +} + criterion_group!( benches, no_links, some_links, heaps_of_links, - some_links_without_scheme + some_links_without_scheme, + ipv6_links ); criterion_main!(benches); diff --git a/src/domains.rs b/src/domains.rs index 4585284..99c8c92 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -34,8 +34,6 @@ pub(crate) fn find_authority_end( port_allowed: bool, iri_parsing_enabled: bool, ) -> (Option, Option) { - let mut end = Some(0); - let mut maybe_last_dot = None; let mut last_dot = None; let mut number_dots = 0; @@ -44,8 +42,10 @@ pub(crate) fn find_authority_end( let mut all_numeric = true; let mut maybe_host = true; let mut host_ended = false; + let mut end = Some(0); + let mut chars = s.char_indices(); - for (i, c) in s.char_indices() { + while let Some((i, c)) = chars.next() { let can_be_last = match c { // ALPHA 'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => { @@ -161,12 +161,63 @@ pub(crate) fn find_authority_end( } break; 
} - _ => { - // Anything else, this might be the end of the authority (can be empty). - // Now let the rest of the code handle checking whether the end of the URL is - // valid. - break; + '[' => { + if maybe_host && !host_ended { + let mut closed = false; + let mut has_chars = false; + let mut bracket_end_idx = i; + let mut in_zone_id = false; + + // look for closing bracket and ipv6 characters in between. + for (inner_i, inner_c) in chars.by_ref() { + match inner_c { + ']' => { + if has_chars { + closed = true; + bracket_end_idx = inner_i + inner_c.len_utf8(); + } + break; + } + '%' => { + if in_zone_id { + break; + } + in_zone_id = true; + has_chars = true; + } + // Allow valid IPv6 characters + Zone Index alphanumeric + '0'..='9' | 'a'..='f' | 'A'..='F' | ':' | '.' => { + has_chars = true; + } + 'g'..='z' | 'G'..='Z' => { + if in_zone_id { + has_chars = true; + } else { + // non hex char found in main ip section + break; + } + } + _ => break, // Invalid character, abort + } + } + + if closed { + // 4. Update state to reflect a successful host block + all_numeric = false; + maybe_last_dot = None; + end = Some(bracket_end_idx); + + // The iterator is now sitting exactly on the character AFTER ']'. 
+ // We `continue` to let the outer loop process the next char (like ':' or '/') + continue; + } else { + break; // Unclosed or malformed, terminate authority scanning + } + } else { + break; + } } + _ => break, }; if can_be_last { diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 524880a..813fdc4 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,4 +1,4 @@ -use linkify::LinkFinder; +use linkify::{LinkFinder, LinkKind}; pub fn assert_linked_with(finder: &LinkFinder, input: &str, expected: &str) { let actual = show_links(input, finder); @@ -19,3 +19,21 @@ pub fn show_links(input: &str, finder: &LinkFinder) -> String { } result } + +/// Assert link without protocol +pub fn assert_urls_without_protocol(input: &str, expected: &str) { + let mut finder = LinkFinder::new(); + finder.url_must_have_scheme(false); + finder.kinds(&[LinkKind::Url]); + assert_linked_with(&finder, input, expected); +} + +/// Assert link with protocol +pub fn assert_linked(input: &str, expected: &str) { + let finder = LinkFinder::new(); + assert_linked_with(&finder, input, expected); +} + +pub fn assert_not_linked(s: &str) { + assert_linked(s, s); +} diff --git a/tests/ipv6.rs b/tests/ipv6.rs new file mode 100644 index 0000000..ed5fb0d --- /dev/null +++ b/tests/ipv6.rs @@ -0,0 +1,251 @@ +mod common; + +use crate::common::{assert_linked, assert_not_linked}; + +#[test] +fn ipv6_from_issue() { + // test the literal links from the pr. 
+ assert_linked( + "http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html", + "|http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html|", + ); + + assert_linked( + "http://[1080:0:0:0:8:800:200C:417A]/index.html", + "|http://[1080:0:0:0:8:800:200C:417A]/index.html|", + ); + + assert_linked( + "http://[3ffe:2a00:100:7031::1]", + "|http://[3ffe:2a00:100:7031::1]|", + ); + assert_linked( + "http://[1080::8:800:200C:417A]/foo", + "|http://[1080::8:800:200C:417A]/foo|", + ); + + assert_linked("http://[::192.9.5.5]/ipng", "|http://[::192.9.5.5]/ipng|"); + + assert_linked( + "http://[::FFFF:129.144.52.38]:80/index.html", + "|http://[::FFFF:129.144.52.38]:80/index.html|", + ); + + assert_linked( + "http://[2010:836B:4179::836B:4179]", + "|http://[2010:836B:4179::836B:4179]|", + ); +} + +#[test] +fn ipv6_full_uncompressed() { + assert_linked( + "http://[2001:0db8:0000:0000:0000:ff00:0042:8329]", + "|http://[2001:0db8:0000:0000:0000:ff00:0042:8329]|", + ); +} + +#[test] +fn ipv6_leading_zeros_omitted() { + assert_linked( + "https://[2001:db8:0:0:0:ff00:42:8329]", + "|https://[2001:db8:0:0:0:ff00:42:8329]|", + ); +} + +#[test] +fn ipv6_zero_comp_middle() { + assert_linked( + "https://[2001:db8::ff00:42:8329]", + "|https://[2001:db8::ff00:42:8329]|", + ); +} + +#[test] +fn ipv6_zero_comp_trailing() { + assert_linked("http://[2001:db8:1234::]", "|http://[2001:db8:1234::]|"); +} + +#[test] +fn ipv6_zero_comp_leading() { + assert_linked("http://[::1234:5678]", "|http://[::1234:5678]|"); +} + +#[test] +fn ipv6_zero_comp_loopback_comp() { + assert_linked("https://[::1]", "|https://[::1]|"); +} + +#[test] +fn ipv6_unspecified_address() { + assert_linked("http://[::]", "|http://[::]|"); +} + +// RFC 3986: + +#[test] +fn ipv6_bracket_no_port() { + assert_linked("http://[2001:db8::1]", "|http://[2001:db8::1]|"); +} + +#[test] +fn ipv6_bracket_with_port() { + assert_linked("http://[2001:db8::1]:8080", "|http://[2001:db8::1]:8080|"); +} + +#[test] +fn ipv6_bracket() { + 
assert_linked("http://[2001:db8::1]", "|http://[2001:db8::1]|"); +} + +#[test] +fn ipv6_bracket_slash() { + assert_linked("https://[2001:db8::1]/", "|https://[2001:db8::1]/|"); +} + +#[test] +fn ipv6_bracket_path() { + assert_linked( + "http://[2001:db8::1]/index.html", + "|http://[2001:db8::1]/index.html|", + ); +} + +#[test] +fn ipv6_mixed_case() { + assert_linked("http://[2001:DB8::A:b:C]", "|http://[2001:DB8::A:b:C]|"); +} + +#[test] +fn ipv6_max_hex() { + assert_linked( + "http://[ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff]", + "|http://[ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff]|", + ); +} +// rfc 4007 +#[test] +fn ipv6_zone_indices() { + assert_linked( + "http://[fe80::1ff:fe23:4567:890a%eth0]", + "|http://[fe80::1ff:fe23:4567:890a%eth0]|", + ); + assert_linked("http://[fe80::1%25]", "|http://[fe80::1%25]|"); +} +// RFC 6874: IPv6 Zone Identifiers in URIs + +#[test] +fn ipv6_zone_index_unencoded() { + // Strictly speaking, unencoded '%' is not RFC compliant for URIs, + // but it is extremely common in plain text and we want to capture it. + assert_linked("http://[fe80::1%eth0]", "|http://[fe80::1%eth0]|"); +} + +#[test] +fn ipv6_zone_index_encoded() { + // This is the strict RFC 6874 compliant format (%25 is the URL encoding for %) + assert_linked("http://[fe80::1%25eth0]", "|http://[fe80::1%25eth0]|"); +} + +#[test] +fn ipv6_zone_index_with_port() { + // Crucial: verify that the parser transitions correctly from the zone index + // to the port after the closing bracket. + assert_linked( + "https://[fe80::1%eth0]:8080", + "|https://[fe80::1%eth0]:8080|", + ); + assert_linked( + "https://[fe80::1%25eth0]:8080", + "|https://[fe80::1%25eth0]:8080|", + ); +} + +#[test] +fn ipv6_zone_index_with_path() { + // Verify that the parser transitions correctly from the zone index to the path. 
+ assert_linked( + "http://[fe80::1%eth0]/api/data", + "|http://[fe80::1%eth0]/api/data|", + ); + assert_linked( + "http://[fe80::1%25eth0]/api/data", + "|http://[fe80::1%25eth0]/api/data|", + ); +} + +#[test] +fn ipv6_zone_index_with_port_and_path() { + assert_linked( + "http://[fe80::1%eth0]:443/api/data?query=1", + "|http://[fe80::1%eth0]:443/api/data?query=1|", + ); +} + +#[test] +fn ipv6_zone_index_ipv4_mapped() { + // A rare but syntactically possible edge case + assert_linked( + "http://[::ffff:192.0.2.128%eth0]", + "|http://[::ffff:192.0.2.128%eth0]|", + ); +} + +#[test] +fn ipv6_ipv4_mapped() { + assert_linked( + "http://[::ffff:192.0.2.128]", + "|http://[::ffff:192.0.2.128]|", + ); + assert_linked( + "http://[0:0:0:0:0:ffff:192.0.2.128]", + "|http://[0:0:0:0:0:ffff:192.0.2.128]|", + ); +} + +#[test] +fn ipv6_empty_brackets() { + assert_not_linked("http://[]/"); +} + +#[test] +fn ipv6_unclosed_brackets() { + assert_not_linked("http://[2001:db8::1/index.html"); +} + +#[test] +fn ipv6_non_hex() { + assert_not_linked("http://[this-is-not-hex]/"); +} +#[test] +fn ipv6_non_hex_but_looks_ip6y() { + assert_not_linked("http://[this:is:not:hex]/"); +} + +#[test] +fn ipv6_nested_brackets() { + assert_not_linked("http://[[::1]]/"); +} +#[test] +fn ipv6_unclosed_bracket_at_eof() { + assert_not_linked("http://[2001:"); +} + +#[test] +fn ipv6_unclosed_bracket_with_path() { + assert_not_linked("http://[2001:/path"); +} + +#[test] +fn ipv6_with_complicated_surrounding_input() { + assert_linked("So it's 8:10pm on or maybe its 1:21. time is an illusion. : anyway, [check out this link](http://[::])", + "So it's 8:10pm on or maybe its 1:21. time is an illusion. : anyway, [check out this link](|http://[::]|)") +} + +#[test] +fn ipv6_with_too_many_colons() { + // linkify does not do deep validation whether an ipv4 address is VALID, it extracts structurally valid boundaries. 
+ // so i'll do the same- otherwise we'd have to start counting colons and doing more complicated validations which... + // would impact performance. + assert_linked("http://[:::]", "|http://[:::]|"); +} diff --git a/tests/url.rs b/tests/url.rs index 41b1650..77fe4d5 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -1,6 +1,9 @@ mod common; -use crate::common::assert_linked_with; +use crate::common::{ + assert_linked, assert_linked_with, assert_not_linked, assert_urls_without_protocol, +}; + use linkify::{LinkFinder, LinkKind}; #[test] @@ -558,24 +561,6 @@ fn fuzz() { assert_not_linked("ab:/ϸ"); } -fn assert_not_linked(s: &str) { - assert_linked(s, s); -} - -/// Assert link with protocol -fn assert_linked(input: &str, expected: &str) { - let finder = LinkFinder::new(); - assert_linked_with(&finder, input, expected); -} - fn assert_not_linked_without_protocol(s: &str) { assert_urls_without_protocol(s, s); } - -/// Assert link without protocol -fn assert_urls_without_protocol(input: &str, expected: &str) { - let mut finder = LinkFinder::new(); - finder.url_must_have_scheme(false); - finder.kinds(&[LinkKind::Url]); - assert_linked_with(&finder, input, expected); -} From af190c76f5ebe453f187c784138ca5fa2507c466 Mon Sep 17 00:00:00 2001 From: Andrew Luhring Date: Fri, 5 Jun 2026 22:24:58 -0500 Subject: [PATCH 2/5] moves / refactors most of my contribution in domains.rs to a function that describes what is happening. also move the ipv6 character matchings and zone_id matchings to their own helper functions for ledgibility --- src/domains.rs | 111 ++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 51 deletions(-) diff --git a/src/domains.rs b/src/domains.rs index 99c8c92..f884005 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -27,6 +27,58 @@ use std::char; +/// ipv6 characters are hex characters, :'s and .'s (you can have ipv4 addresses inside ipv6 addresses). 
+#[inline(always)] +fn is_ipv6_char(c: char) -> bool { + matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F' | ':' | '.') +} + +/// zone_id characters +#[inline(always)] +fn is_zone_id_char(c: char) -> bool { + matches!(c, 'g'..='z' | 'G'..='Z') +} + +fn find_ipv6_end<I>(chars: &mut I) -> Option<usize> +where + I: Iterator<Item = (usize, char)>, +{ + let mut has_chars = false; + let mut in_zone_id = false; + + for (inner_i, inner_c) in chars { + match inner_c { + ']' => { + return if has_chars { + Some(inner_i + inner_c.len_utf8()) + } else { + None + }; + } + '%' if !in_zone_id => { + in_zone_id = true; + has_chars = true; + } + '%' => return None, // Reject multiple '%' signs + + c if is_ipv6_char(c) => { + has_chars = true; + } + + c if is_zone_id_char(c) => { + if !in_zone_id { + return None; // Non-hex char found outside of zone id + } + has_chars = true; + } + + _ => return None, // Invalid character, abort + } + } + + None +} + pub(crate) fn find_authority_end( s: &str, mut userinfo_allowed: bool, @@ -162,57 +214,14 @@ pub(crate) fn find_authority_end( break; } '[' => { - if maybe_host && !host_ended { - let mut closed = false; - let mut has_chars = false; - let mut bracket_end_idx = i; - let mut in_zone_id = false; - - // look for closing bracket and ipv6 characters in between. - for (inner_i, inner_c) in chars.by_ref() { - match inner_c { - ']' => { - if has_chars { - closed = true; - bracket_end_idx = inner_i + inner_c.len_utf8(); - } - break; - } - '%' => { - if in_zone_id { - break; - } - in_zone_id = true; - has_chars = true; - } - // Allow valid IPv6 characters + Zone Index alphanumeric - '0'..='9' | 'a'..='f' | 'A'..='F' | ':' | '.' => { - has_chars = true; - } - 'g'..='z' | 'G'..='Z' => { - if in_zone_id { - has_chars = true; - } else { - // non hex char found in main ip section - break; - } - } - _ => break, // Invalid character, abort - } - } - - if closed { - // 4. 
Update state to reflect a successful host block - all_numeric = false; - maybe_last_dot = None; - end = Some(bracket_end_idx); - - // The iterator is now sitting exactly on the character AFTER ']'. - // We `continue` to let the outer loop process the next char (like ':' or '/') - continue; - } else { - break; // Unclosed or malformed, terminate authority scanning - } + if !maybe_host && host_ended { + break; + } + if let Some(bracket_end_index) = find_ipv6_end(&mut chars) { + all_numeric = false; + maybe_last_dot = None; + end = Some(bracket_end_index); + continue; } else { break; } From d70596d1c19e933ab46b0df211d25c154914f700 Mon Sep 17 00:00:00 2001 From: Andrew Luhring Date: Fri, 5 Jun 2026 23:53:35 -0500 Subject: [PATCH 3/5] fixes incorrect inverted logic. --- src/domains.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/domains.rs b/src/domains.rs index f884005..8c9f169 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -214,7 +214,7 @@ pub(crate) fn find_authority_end( break; } '[' => { - if !maybe_host && host_ended { + if !maybe_host || host_ended { break; } if let Some(bracket_end_index) = find_ipv6_end(&mut chars) { From 7c5bd3586a2708e8cdc98c3d697e520871b640a7 Mon Sep 17 00:00:00 2001 From: Andrew Luhring Date: Sat, 6 Jun 2026 00:10:43 -0500 Subject: [PATCH 4/5] adds a test case and comments that explain the intent around not counting colons --- tests/ipv6.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/ipv6.rs b/tests/ipv6.rs index ed5fb0d..fc2e311 100644 --- a/tests/ipv6.rs +++ b/tests/ipv6.rs @@ -248,4 +248,9 @@ fn ipv6_with_too_many_colons() { // so i'll do the same- otherwise we'd have to start counting colons and doing more complicated validations which... // would impact performance. assert_linked("http://[:::]", "|http://[:::]|"); + // this isn't a bug, its a feature. we are not validating the number of colons, this is heuristic link extractor, + // not a strict URI validator. 
You can use this library to extract things that look like links and then use + // rusts' standard url crate to validate the links and throw whatever errors you choose to. + // counting the number of colons and doing more lookarounds than we're already doing would very likely impact performance. + assert_linked("http://[::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::]", "|http://[::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::]|") } From 021247a82cac6d908d9589ebd0602cde73440816 Mon Sep 17 00:00:00 2001 From: Andrew Luhring Date: Sun, 7 Jun 2026 00:44:31 -0500 Subject: [PATCH 5/5] fixes issues mentioned by maintainer in the pr. - readds the comment i accidentally deleted - adds a comment for the continue statment as requested by maintainer - reruns cargo fmt check and applies changes. --- src/domains.rs | 7 +++++++ tests/ipv6.rs | 13 ++++++++----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/domains.rs b/src/domains.rs index 8c9f169..92c2eb3 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -221,11 +221,18 @@ pub(crate) fn find_authority_end( all_numeric = false; maybe_last_dot = None; end = Some(bracket_end_index); + // The IPv6 helper finds the exact end index and consumes the closing ] + // We use continue below to satisfy the match statement's type requirements, + // and bypass the default `can_be_last` calculation for the opening [ + // and immediately process whatever follows the address. continue; } else { break; } } + // Anything else, this might be the end of the authority (can be empty). + // Now let the rest of the code handle checking whether the end of the URL is + // valid. _ => break, }; diff --git a/tests/ipv6.rs b/tests/ipv6.rs index fc2e311..ab0f62e 100644 --- a/tests/ipv6.rs +++ b/tests/ipv6.rs @@ -248,9 +248,12 @@ fn ipv6_with_too_many_colons() { // so i'll do the same- otherwise we'd have to start counting colons and doing more complicated validations which... 
// would impact performance. assert_linked("http://[:::]", "|http://[:::]|"); - // this isn't a bug, its a feature. we are not validating the number of colons, this is heuristic link extractor, - // not a strict URI validator. You can use this library to extract things that look like links and then use - // rusts' standard url crate to validate the links and throw whatever errors you choose to. - // counting the number of colons and doing more lookarounds than we're already doing would very likely impact performance. - assert_linked("http://[::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::]", "|http://[::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::]|") + // this isn't a bug, its a feature. we are not validating the number of colons, this is heuristic link extractor, + // not a strict URI validator. You can use this library to extract things that look like links and then use + // rusts' standard url crate to validate the links and throw whatever errors you choose to. + // counting the number of colons and doing more lookarounds than we're already doing would very likely impact performance. + assert_linked( + "http://[::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::]", + "|http://[::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::]|", + ) }