diff --git a/benches/bench.rs b/benches/bench.rs index ffed341..bccc213 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -60,11 +60,34 @@ fn some_links_without_scheme(c: &mut Criterion) { }); } +fn ipv6_links(c: &mut Criterion) { + let finder = LinkFinder::new(); + + // performance of a standard, valid ipv6 link + c.bench_function("ipv6_standard", |b| { + b.iter(|| { + let links = finder.links("http://[2001:db8::1]:8080/path"); + assert_eq!(links.count(), 1); + }) + }); + + c.bench_function("ipv6_unclosed_bracket_fail_fast", |b| { + b.iter(|| { + let links = finder.links( + "http://[2001:db8::1_malformed_text_continues_on_and_on\ + notice how it just keeps going on but doesnt end because there hasnt been a closing bracket", + ); + assert_eq!(links.count(), 0); + }) + }); +} + criterion_group!( benches, no_links, some_links, heaps_of_links, - some_links_without_scheme + some_links_without_scheme, + ipv6_links ); criterion_main!(benches); diff --git a/src/domains.rs b/src/domains.rs index 4585284..92c2eb3 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -27,6 +27,58 @@ use std::char; +/// ipv6 characters are hex characters, :'s and .'s (you can have ipv4 addresses inside ipv6 addresses). 
+#[inline(always)] +fn is_ipv6_char(c: char) -> bool { + matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F' | ':' | '.') +} + +/// zone_id characters +#[inline(always)] +fn is_zone_id_char(c: char) -> bool { + matches!(c, 'g'..='z' | 'G'..='Z') +} + +fn find_ipv6_end<I>(chars: &mut I) -> Option<usize> +where + I: Iterator<Item = (usize, char)>, +{ + let mut has_chars = false; + let mut in_zone_id = false; + + for (inner_i, inner_c) in chars { + match inner_c { + ']' => { + return if has_chars { + Some(inner_i + inner_c.len_utf8()) + } else { + None + }; + } + '%' if !in_zone_id => { + in_zone_id = true; + has_chars = true; + } + '%' => return None, // Reject multiple '%' signs + + c if is_ipv6_char(c) => { + has_chars = true; + } + + c if is_zone_id_char(c) => { + if !in_zone_id { + return None; // Non-hex char found outside of zone id + } + has_chars = true; + } + + _ => return None, // Invalid character, abort + } + } + + None +} + pub(crate) fn find_authority_end( s: &str, mut userinfo_allowed: bool, @@ -34,8 +86,6 @@ pub(crate) fn find_authority_end( port_allowed: bool, iri_parsing_enabled: bool, ) -> (Option<usize>, Option<usize>) { - let mut end = Some(0); - let mut maybe_last_dot = None; let mut last_dot = None; let mut number_dots = 0; @@ -44,8 +94,10 @@ pub(crate) fn find_authority_end( let mut all_numeric = true; let mut maybe_host = true; let mut host_ended = false; + let mut end = Some(0); + let mut chars = s.char_indices(); - for (i, c) in s.char_indices() { + while let Some((i, c)) = chars.next() { let can_be_last = match c { // ALPHA 'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => { @@ -161,12 +213,27 @@ pub(crate) fn find_authority_end( } break; } - _ => { - // Anything else, this might be the end of the authority (can be empty). - // Now let the rest of the code handle checking whether the end of the URL is - // valid. 
- break; + '[' => { + if !maybe_host || host_ended { + break; + } + if let Some(bracket_end_index) = find_ipv6_end(&mut chars) { + all_numeric = false; + maybe_last_dot = None; + end = Some(bracket_end_index); + // The IPv6 helper finds the exact end index and consumes the closing ] + // We use continue below to satisfy the match statement's type requirements, + // and bypass the default `can_be_last` calculation for the opening [ + // and immediately process whatever follows the address. + continue; + } else { + break; + } } + // Anything else, this might be the end of the authority (can be empty). + // Now let the rest of the code handle checking whether the end of the URL is + // valid. + _ => break, }; if can_be_last { diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 524880a..813fdc4 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1,4 +1,4 @@ -use linkify::LinkFinder; +use linkify::{LinkFinder, LinkKind}; pub fn assert_linked_with(finder: &LinkFinder, input: &str, expected: &str) { let actual = show_links(input, finder); @@ -19,3 +19,21 @@ pub fn show_links(input: &str, finder: &LinkFinder) -> String { } result } + +/// Assert link without protocol +pub fn assert_urls_without_protocol(input: &str, expected: &str) { + let mut finder = LinkFinder::new(); + finder.url_must_have_scheme(false); + finder.kinds(&[LinkKind::Url]); + assert_linked_with(&finder, input, expected); +} + +/// Assert link with protocol +pub fn assert_linked(input: &str, expected: &str) { + let finder = LinkFinder::new(); + assert_linked_with(&finder, input, expected); +} + +pub fn assert_not_linked(s: &str) { + assert_linked(s, s); +} diff --git a/tests/ipv6.rs b/tests/ipv6.rs new file mode 100644 index 0000000..ab0f62e --- /dev/null +++ b/tests/ipv6.rs @@ -0,0 +1,259 @@ +mod common; + +use crate::common::{assert_linked, assert_not_linked}; + +#[test] +fn ipv6_from_issue() { + // test the literal links from the pr. 
+ assert_linked( + "http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html", + "|http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html|", + ); + + assert_linked( + "http://[1080:0:0:0:8:800:200C:417A]/index.html", + "|http://[1080:0:0:0:8:800:200C:417A]/index.html|", + ); + + assert_linked( + "http://[3ffe:2a00:100:7031::1]", + "|http://[3ffe:2a00:100:7031::1]|", + ); + assert_linked( + "http://[1080::8:800:200C:417A]/foo", + "|http://[1080::8:800:200C:417A]/foo|", + ); + + assert_linked("http://[::192.9.5.5]/ipng", "|http://[::192.9.5.5]/ipng|"); + + assert_linked( + "http://[::FFFF:129.144.52.38]:80/index.html", + "|http://[::FFFF:129.144.52.38]:80/index.html|", + ); + + assert_linked( + "http://[2010:836B:4179::836B:4179]", + "|http://[2010:836B:4179::836B:4179]|", + ); +} + +#[test] +fn ipv6_full_uncompressed() { + assert_linked( + "http://[2001:0db8:0000:0000:0000:ff00:0042:8329]", + "|http://[2001:0db8:0000:0000:0000:ff00:0042:8329]|", + ); +} + +#[test] +fn ipv6_leading_zeros_omitted() { + assert_linked( + "https://[2001:db8:0:0:0:ff00:42:8329]", + "|https://[2001:db8:0:0:0:ff00:42:8329]|", + ); +} + +#[test] +fn ipv6_zero_comp_middle() { + assert_linked( + "https://[2001:db8::ff00:42:8329]", + "|https://[2001:db8::ff00:42:8329]|", + ); +} + +#[test] +fn ipv6_zero_comp_trailing() { + assert_linked("http://[2001:db8:1234::]", "|http://[2001:db8:1234::]|"); +} + +#[test] +fn ipv6_zero_comp_leading() { + assert_linked("http://[::1234:5678]", "|http://[::1234:5678]|"); +} + +#[test] +fn ipv6_zero_comp_loopback_comp() { + assert_linked("https://[::1]", "|https://[::1]|"); +} + +#[test] +fn ipv6_unspecified_address() { + assert_linked("http://[::]", "|http://[::]|"); +} + +// RFC 3986: + +#[test] +fn ipv6_bracket_no_port() { + assert_linked("http://[2001:db8::1]", "|http://[2001:db8::1]|"); +} + +#[test] +fn ipv6_bracket_with_port() { + assert_linked("http://[2001:db8::1]:8080", "|http://[2001:db8::1]:8080|"); +} + +#[test] +fn ipv6_bracket() { + 
assert_linked("http://[2001:db8::1]", "|http://[2001:db8::1]|"); +} + +#[test] +fn ipv6_bracket_slash() { + assert_linked("https://[2001:db8::1]/", "|https://[2001:db8::1]/|"); +} + +#[test] +fn ipv6_bracket_path() { + assert_linked( + "http://[2001:db8::1]/index.html", + "|http://[2001:db8::1]/index.html|", + ); +} + +#[test] +fn ipv6_mixed_case() { + assert_linked("http://[2001:DB8::A:b:C]", "|http://[2001:DB8::A:b:C]|"); +} + +#[test] +fn ipv6_max_hex() { + assert_linked( + "http://[ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff]", + "|http://[ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff]|", + ); +} +// rfc 4007 +#[test] +fn ipv6_zone_indices() { + assert_linked( + "http://[fe80::1ff:fe23:4567:890a%eth0]", + "|http://[fe80::1ff:fe23:4567:890a%eth0]|", + ); + assert_linked("http://[fe80::1%25]", "|http://[fe80::1%25]|"); +} +// RFC 6874: IPv6 Zone Identifiers in URIs + +#[test] +fn ipv6_zone_index_unencoded() { + // Strictly speaking, unencoded '%' is not RFC compliant for URIs, + // but it is extremely common in plain text and we want to capture it. + assert_linked("http://[fe80::1%eth0]", "|http://[fe80::1%eth0]|"); +} + +#[test] +fn ipv6_zone_index_encoded() { + // This is the strict RFC 6874 compliant format (%25 is the URL encoding for %) + assert_linked("http://[fe80::1%25eth0]", "|http://[fe80::1%25eth0]|"); +} + +#[test] +fn ipv6_zone_index_with_port() { + // Crucial: verify that the parser transitions correctly from the zone index + // to the port after the closing bracket. + assert_linked( + "https://[fe80::1%eth0]:8080", + "|https://[fe80::1%eth0]:8080|", + ); + assert_linked( + "https://[fe80::1%25eth0]:8080", + "|https://[fe80::1%25eth0]:8080|", + ); +} + +#[test] +fn ipv6_zone_index_with_path() { + // Verify that the parser transitions correctly from the zone index to the path. 
+ assert_linked( + "http://[fe80::1%eth0]/api/data", + "|http://[fe80::1%eth0]/api/data|", + ); + assert_linked( + "http://[fe80::1%25eth0]/api/data", + "|http://[fe80::1%25eth0]/api/data|", + ); +} + +#[test] +fn ipv6_zone_index_with_port_and_path() { + assert_linked( + "http://[fe80::1%eth0]:443/api/data?query=1", + "|http://[fe80::1%eth0]:443/api/data?query=1|", + ); +} + +#[test] +fn ipv6_zone_index_ipv4_mapped() { + // A rare but syntactically possible edge case + assert_linked( + "http://[::ffff:192.0.2.128%eth0]", + "|http://[::ffff:192.0.2.128%eth0]|", + ); +} + +#[test] +fn ipv6_ipv4_mapped() { + assert_linked( + "http://[::ffff:192.0.2.128]", + "|http://[::ffff:192.0.2.128]|", + ); + assert_linked( + "http://[0:0:0:0:0:ffff:192.0.2.128]", + "|http://[0:0:0:0:0:ffff:192.0.2.128]|", + ); +} + +#[test] +fn ipv6_empty_brackets() { + assert_not_linked("http://[]/"); +} + +#[test] +fn ipv6_unclosed_brackets() { + assert_not_linked("http://[2001:db8::1/index.html"); +} + +#[test] +fn ipv6_non_hex() { + assert_not_linked("http://[this-is-not-hex]/"); +} +#[test] +fn ipv6_non_hex_but_looks_ip6y() { + assert_not_linked("http://[this:is:not:hex]/"); +} + +#[test] +fn ipv6_nested_brackets() { + assert_not_linked("http://[[::1]]/"); +} +#[test] +fn ipv6_unclosed_bracket_at_eof() { + assert_not_linked("http://[2001:"); +} + +#[test] +fn ipv6_unclosed_bracket_with_path() { + assert_not_linked("http://[2001:/path"); +} + +#[test] +fn ipv6_with_complicated_surrounding_input() { + assert_linked("So it's 8:10pm on or maybe its 1:21. time is an illusion. : anyway, [check out this link](http://[::])", + "So it's 8:10pm on or maybe its 1:21. time is an illusion. : anyway, [check out this link](|http://[::]|)") +} + +#[test] +fn ipv6_with_too_many_colons() { + // linkify does not do deep validation whether an ipv4 address is VALID, it extracts structurally valid boundaries. 
+ // so i'll do the same- otherwise we'd have to start counting colons and doing more complicated validations which... + // would impact performance. + assert_linked("http://[:::]", "|http://[:::]|"); + // this isn't a bug, its a feature. we are not validating the number of colons, this is heuristic link extractor, + // not a strict URI validator. You can use this library to extract things that look like links and then use + // rusts' standard url crate to validate the links and throw whatever errors you choose to. + // counting the number of colons and doing more lookarounds than we're already doing would very likely impact performance. + assert_linked( + "http://[::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::]", + "|http://[::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::]|", + ) +} diff --git a/tests/url.rs b/tests/url.rs index 41b1650..77fe4d5 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -1,6 +1,9 @@ mod common; -use crate::common::assert_linked_with; +use crate::common::{ + assert_linked, assert_linked_with, assert_not_linked, assert_urls_without_protocol, +}; + use linkify::{LinkFinder, LinkKind}; #[test] @@ -558,24 +561,6 @@ fn fuzz() { assert_not_linked("ab:/ϸ"); } -fn assert_not_linked(s: &str) { - assert_linked(s, s); -} - -/// Assert link with protocol -fn assert_linked(input: &str, expected: &str) { - let finder = LinkFinder::new(); - assert_linked_with(&finder, input, expected); -} - fn assert_not_linked_without_protocol(s: &str) { assert_urls_without_protocol(s, s); } - -/// Assert link without protocol -fn assert_urls_without_protocol(input: &str, expected: &str) { - let mut finder = LinkFinder::new(); - finder.url_must_have_scheme(false); - finder.kinds(&[LinkKind::Url]); - assert_linked_with(&finder, input, expected); -}