From d77b8594dceae702cb3c2edf9fcbe54e8b2835e2 Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Fri, 13 May 2016 17:12:45 +0100 Subject: [PATCH 1/7] Allow non-ASCII characters in email addresses. Keep the default behaviour of previous versions, add a UNICODE variable to switch it on. This is from RFC 5335 / 6532, "Internationalized Email Headers". Fixes #12. --- lib/Email/Address.pm | 14 +++++--- t/ascii.t | 83 +++++++++++++++++++++++++------------------- 2 files changed, 57 insertions(+), 40 deletions(-) diff --git a/lib/Email/Address.pm b/lib/Email/Address.pm index e92dd59..47f0f00 100644 --- a/lib/Email/Address.pm +++ b/lib/Email/Address.pm @@ -6,6 +6,7 @@ package Email::Address; our $COMMENT_NEST_LEVEL ||= 1; our $STRINGIFY ||= 'format'; our $COLLAPSE_SPACES = 1 unless defined $COLLAPSE_SPACES; # I miss //= +our $UNICODE ||= 0; =head1 SYNOPSIS @@ -172,9 +173,10 @@ collapse multiple spaces into a single space, which avoids this problem. To prevent this behavior, set C<$Email::Address::COLLAPSE_SPACES> to zero. This variable will go away when the bug is resolved properly. -In accordance with RFC 822 and its descendants, this module demands that email -addresses be ASCII only. Any non-ASCII content in the parsed addresses will -cause the parser to return no results. +By default, this module mandates that email addresses be ASCII only, and any +non-ASCII content will cause a blank result. This matches RFCs 822, 2822, and +5322. If you wish to allow UTF-8 characters in email, as per RFCs 5335 and +6532, set C<$Email:Address::UNICODE> to 1. =cut @@ -223,8 +225,10 @@ sub parse { ($user, $host) = ($1, $2); } - next if $user =~ /\P{ASCII}/; - next if $host =~ /\P{ASCII}/; + unless ($UNICODE) { + next if $user =~ /\P{ASCII}/; + next if $host =~ /\P{ASCII}/; + } my ($phrase) = /($display_name)/o; diff --git a/t/ascii.t b/t/ascii.t index 6219f62..a143641 100644 --- a/t/ascii.t +++ b/t/ascii.t @@ -10,50 +10,63 @@ my $ascii = q{admin@mozilla.org}; my $utf_8 = q{аdmin@mozilla.org}; my $text = decode('utf-8', $utf_8, Encode::LEAVE_SRC); -my $ok_mixed = qq{"$text" <$ascii>}; -my $bad_mixed = qq{"$text" <$text>}; +my $ascii_mixed = qq{"$text" <$ascii>}; +my $utf8_mixed = qq{"$text" <$text>}; -{ - my (@addr) = Email::Address->parse($ascii); - is(@addr, 1, "an ascii address is a-ok"); +for (0..1) { + local $Email::Address::UNICODE = $_; - # ok( $ascii =~ $Email::Address::addr_spec, "...it =~ addr_spec"); -} + { + my (@addr) = Email::Address->parse($ascii); + is(@addr, 1, "an ascii address is a-ok"); -{ - my (@addr) = Email::Address->parse($ok_mixed); - is(@addr, 1, "a quoted non-ascii phrase is a-ok with ascii email"); -} + # ok( $ascii =~ $Email::Address::addr_spec, "...it =~ addr_spec"); + } -{ - my (@addr) = Email::Address->parse($bad_mixed); - is(@addr, 0, "a quoted non-ascii phrase is not okay with non-ascii email"); -} + { + my (@addr) = Email::Address->parse($ascii_mixed); + is(@addr, 1, "a quoted non-ascii phrase is a-ok with ascii email"); + } -{ - my (@addr) = Email::Address->parse($utf_8); - is(@addr, 0, "utf-8 octet address: not ok"); + { + my (@addr) = Email::Address->parse($utf8_mixed); + is(@addr, $Email::Address::UNICODE, "a quoted non-ascii phrase with non-ascii email"); + } - # ok( $utf_8 !~ $Email::Address::addr_spec, "...it !~ addr_spec"); -} + { + my (@addr) = Email::Address->parse($utf_8); + is(@addr, $Email::Address::UNICODE, "utf-8 octet address"); -{ - my (@addr) = Email::Address->parse($text); - is(@addr, 0, "unicode (decoded) address: not ok"); + # ok( $utf_8 !~ $Email::Address::addr_spec, "...it !~ addr_spec"); + } - # ok( $text =~ $Email::Address::addr_spec, "...it !~ addr_spec"); -} + { + my (@addr) = Email::Address->parse($text); + is(@addr, $Email::Address::UNICODE, "unicode (decoded) address"); + + # ok( $text =~ $Email::Address::addr_spec, "...it !~ addr_spec"); + } + + { + my @addr = Email::Address->parse(qq{ + "Not ascii phras\x{e9}" , + b\x{e3}d\@user, + bad\@d\x{f6}main, + not.bad\@again + }); + is scalar @addr, $Email::Address::UNICODE ? 4 : 2, "correct number of good emails"; + is "$addr[0]", qq{"Not ascii phras\x{e9}" }, "expected email"; + if ($Email::Address::UNICODE) { + is "$addr[1]", qq{b\x{e3}d\@user}, "expected email"; + is "$addr[2]", qq{bad\@d\x{f6}main}, "expected email"; + is "$addr[3]", qq{not.bad\@again}, "expected email"; + } else { + is "$addr[1]", qq{not.bad\@again}, "expected email"; + } + } + + Email::Address->purge_cache; -{ - my @addr = Email::Address->parse(qq{ - "Not ascii phras\x{e9}" , - b\x{e3}d\@user, - bad\@d\x{f6}main, - not.bad\@again - }); - is scalar @addr, 2, "correct number of good emails"; - is "$addr[0]", qq{"Not ascii phras\x{e9}" }, "expected email"; - is "$addr[1]", qq{not.bad\@again}, "expected email"; } done_testing; From fa81a51ac932adf182629c44a613d722dc680f58 Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Fri, 13 May 2016 13:51:37 +0100 Subject: [PATCH 2/7] Bring regexes more in line with RFC5322. FWS is still brought in as \s+ for simplicity. The obs_phrase test in comment passes fine, so remove that code; space backtracking problems will be fixed in the next commit. This commit actually fixes the original example in #10 as $comment is much simplified. Update test to pass; test is actually incorrect in that the "comment" should not be removed (RT#80665). --- lib/Email/Address.pm | 33 ++++++++------------------------- t/tests.t | 2 +- 2 files changed, 9 insertions(+), 26 deletions(-) diff --git a/lib/Email/Address.pm b/lib/Email/Address.pm index 47f0f00..4b1f54c 100644 --- a/lib/Email/Address.pm +++ b/lib/Email/Address.pm @@ -29,15 +29,13 @@ to be correct, and very very fast. my $CTL = q{\x00-\x1F\x7F}; my $special = q{()<>\\[\\]:;@\\\\,."}; -my $text = qr/[^\x0A\x0D]/; +my $quoted_pair = qr/\\[[:graph:] \t]/; -my $quoted_pair = qr/\\$text/; - -my $ctext = qr/(?>[^()\\]+)/; +my $ctext = qr/[^$CTL()\\]/; my ($ccontent, $comment) = (q{})x2; for (1 .. $COMMENT_NEST_LEVEL) { $ccontent = qr/$ctext|$quoted_pair|$comment/; - $comment = qr/\s*\((?:\s*$ccontent)*\s*\)\s*/; + $comment = qr/\($ccontent*\)/; } my $cfws = qr/$comment|\s+/; @@ -46,33 +44,18 @@ my $atom = qr/$cfws*$atext+$cfws*/; my $dot_atom_text = qr/$atext+(?:\.$atext+)*/; my $dot_atom = qr/$cfws*$dot_atom_text$cfws*/; -my $qtext = qr/[^\\"]/; +my $qtext = qr/[^$CTL\\"]/; my $qcontent = qr/$qtext|$quoted_pair/; my $quoted_string = qr/$cfws*"$qcontent*"$cfws*/; my $word = qr/$atom|$quoted_string/; -# XXX: This ($phrase) used to just be: my $phrase = qr/$word+/; It was changed -# to resolve bug 22991, creating a significant slowdown. Given current speed -# problems. Once 16320 is resolved, this section should be dealt with. -# -- rjbs, 2006-11-11 -#my $obs_phrase = qr/$word(?:$word|\.|$cfws)*/; - -# XXX: ...and the above solution caused endless problems (never returned) when -# examining this address, now in a test: -# admin+=E6=96=B0=E5=8A=A0=E5=9D=A1_Weblog-- ATAT --test.socialtext.com -# So we disallow the hateful CFWS in this context for now. Of modern mail -# agents, only Apple Web Mail 2.0 is known to produce obs-phrase. -# -- rjbs, 2006-11-19 -my $simple_word = qr/$atom|\.|\s*"$qcontent+"\s*/; -my $obs_phrase = qr/$simple_word+/; - +my $obs_phrase = qr/$word(?:$word|\.|$cfws)*/; my $phrase = qr/$obs_phrase|(?:$word+)/; my $local_part = qr/$dot_atom|$quoted_string/; -my $dtext = qr/[^\[\]\\]/; -my $dcontent = qr/$dtext|$quoted_pair/; -my $domain_literal = qr/$cfws*\[(?:\s*$dcontent)*\s*\]$cfws*/; +my $dtext = qr/[^$CTL\[\]\\]/; +my $domain_literal = qr/$cfws*\[$dtext*\]$cfws*/; my $domain = qr/$dot_atom|$domain_literal/; my $display_name = $phrase; @@ -127,7 +110,7 @@ following comment. our $addr_spec = qr/$local_part\@$domain/; our $angle_addr = qr/$cfws*<$addr_spec>$cfws*/; our $name_addr = qr/(?>$display_name?)$angle_addr/; -our $mailbox = qr/(?:$name_addr|$addr_spec)$comment*/; +our $mailbox = qr/$name_addr|$addr_spec/; sub _PHRASE () { 0 } sub _ADDRESS () { 1 } diff --git a/t/tests.t b/t/tests.t index b590337..268a1de 100644 --- a/t/tests.t +++ b/t/tests.t @@ -1488,7 +1488,7 @@ my @list = ( '"Greg Norris (humble visionary genius)" , ', [ [ - 'Greg Norris', + 'Greg Norris ', 'nextrightmove-- ATAT --bang.example.net', '(humble visionary genius)' ], From b142fb678bb26be312a8a6eedbb7e5db7e8f971b Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Fri, 13 May 2016 15:28:51 +0100 Subject: [PATCH 3/7] Try and prevent backtracking regex explosions. COLLAPSE_SPACES is no longer necessary. Fixes the second example given in #10. --- lib/Email/Address.pm | 25 ++++++++----------------- t/speed.t | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 17 deletions(-) create mode 100644 t/speed.t diff --git a/lib/Email/Address.pm b/lib/Email/Address.pm index 4b1f54c..72d23b7 100644 --- a/lib/Email/Address.pm +++ b/lib/Email/Address.pm @@ -5,7 +5,6 @@ package Email::Address; our $COMMENT_NEST_LEVEL ||= 1; our $STRINGIFY ||= 'format'; -our $COLLAPSE_SPACES = 1 unless defined $COLLAPSE_SPACES; # I miss //= our $UNICODE ||= 0; =head1 SYNOPSIS @@ -37,25 +36,25 @@ for (1 .. $COMMENT_NEST_LEVEL) { $ccontent = qr/$ctext|$quoted_pair|$comment/; $comment = qr/\($ccontent*\)/; } -my $cfws = qr/$comment|\s+/; +my $cfws = qr/(?>$comment|\s+)/; my $atext = qq/[^$CTL$special\\s]/; -my $atom = qr/$cfws*$atext+$cfws*/; +my $atom = qr/$cfws*(?>$atext+)$cfws*/; my $dot_atom_text = qr/$atext+(?:\.$atext+)*/; -my $dot_atom = qr/$cfws*$dot_atom_text$cfws*/; +my $dot_atom = qr/$cfws*(?>$dot_atom_text)$cfws*/; my $qtext = qr/[^$CTL\\"]/; my $qcontent = qr/$qtext|$quoted_pair/; -my $quoted_string = qr/$cfws*"$qcontent*"$cfws*/; +my $quoted_string = qr/$cfws*"(?>$qcontent*)"$cfws*/; my $word = qr/$atom|$quoted_string/; -my $obs_phrase = qr/$word(?:$word|\.|$cfws)*/; -my $phrase = qr/$obs_phrase|(?:$word+)/; +my $obs_phrase = qr/$word(?>(?:$word|\.|$cfws)*)/; +my $phrase = qr/$obs_phrase|(?>$word+)/; my $local_part = qr/$dot_atom|$quoted_string/; my $dtext = qr/[^$CTL\[\]\\]/; -my $domain_literal = qr/$cfws*\[$dtext*\]$cfws*/; +my $domain_literal = qr/$cfws*\[(?>$dtext*)\]$cfws*/; my $domain = qr/$dot_atom|$domain_literal/; my $display_name = $phrase; @@ -109,7 +108,7 @@ following comment. our $addr_spec = qr/$local_part\@$domain/; our $angle_addr = qr/$cfws*<$addr_spec>$cfws*/; -our $name_addr = qr/(?>$display_name?)$angle_addr/; +our $name_addr = qr/$display_name?$angle_addr/; our $mailbox = qr/$name_addr|$addr_spec/; sub _PHRASE () { 0 } @@ -150,12 +149,6 @@ C<$Email::Address::COMMENT_NEST_LEVEL> package variable to allow more. The reason for this hardly-limiting limitation is simple: efficiency. -Long strings of whitespace can be problematic for this module to parse, a bug -which has not yet been adequately addressed. The default behavior is now to -collapse multiple spaces into a single space, which avoids this problem. To -prevent this behavior, set C<$Email::Address::COLLAPSE_SPACES> to zero. This -variable will go away when the bug is resolved properly. - By default, this module mandates that email addresses be ASCII only, and any non-ASCII content will cause a blank result. This matches RFCs 822, 2822, and 5322. If you wish to allow UTF-8 characters in email, as per RFCs 5335 and @@ -187,8 +180,6 @@ sub parse { my ($class, $line) = @_; return unless $line; - $line =~ s/[ \t]+/ /g if $COLLAPSE_SPACES; - if (my @cached = $class->__get_cached_parse($line)) { return @cached; } diff --git a/t/speed.t b/t/speed.t new file mode 100644 index 0000000..1092f49 --- /dev/null +++ b/t/speed.t @@ -0,0 +1,15 @@ +#!perl +use strict; + +use Email::Address; +use Test::More tests => 1; + +my $email = "\"Hello\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\" <\@m>"; +my ($ea) = Email::Address->parse($email); + +is( + $ea, + undef, + 'Bad address does not parse, but is not really slow' +); + From 9066167ba421d59786b16308f3b43e58cce0d488 Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Fri, 13 May 2016 16:23:58 +0100 Subject: [PATCH 4/7] Use regex recusion for comment nesting. Due to use of recursive regex and named backpatterns, this can not support perl 5.8. Fixes #11. --- lib/Email/Address.pm | 32 +++++++++----------------------- t/speed.t | 8 +++++++- t/tests.t | 10 ++++++++++ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/lib/Email/Address.pm b/lib/Email/Address.pm index 72d23b7..7255a4d 100644 --- a/lib/Email/Address.pm +++ b/lib/Email/Address.pm @@ -3,9 +3,8 @@ use warnings; package Email::Address; # ABSTRACT: RFC 2822 Address Parsing and Creation -our $COMMENT_NEST_LEVEL ||= 1; -our $STRINGIFY ||= 'format'; -our $UNICODE ||= 0; +our $STRINGIFY ||= 'format'; +our $UNICODE ||= 0; =head1 SYNOPSIS @@ -31,11 +30,7 @@ my $special = q{()<>\\[\\]:;@\\\\,."}; my $quoted_pair = qr/\\[[:graph:] \t]/; my $ctext = qr/[^$CTL()\\]/; -my ($ccontent, $comment) = (q{})x2; -for (1 .. $COMMENT_NEST_LEVEL) { - $ccontent = qr/$ctext|$quoted_pair|$comment/; - $comment = qr/\($ccontent*\)/; -} +my $comment = qr/(?\((?:$ctext|$quoted_pair|(?&comment))*\))/; my $cfws = qr/(?>$comment|\s+)/; my $atext = qq/[^$CTL$special\\s]/; @@ -140,15 +135,6 @@ This method returns a list of C objects it finds in the input string. B that it returns a list, and expects that it may find multiple addresses. The behavior in scalar context is undefined. -The specification for an email address allows for infinitely nestable comments. -That's nice in theory, but a little over done. By default this module allows -for one (C<1>) level of nested comments. If you think you need more, modify the -C<$Email::Address::COMMENT_NEST_LEVEL> package variable to allow more. - - $Email::Address::COMMENT_NEST_LEVEL = 10; # I'm deep - -The reason for this hardly-limiting limitation is simple: efficiency. - By default, this module mandates that email addresses be ASCII only, and any non-ASCII content will cause a blank result. This matches RFCs 822, 2822, and 5322. If you wish to allow UTF-8 characters in email, as per RFCs 5335 and @@ -184,19 +170,19 @@ sub parse { return @cached; } - my (@mailboxes) = ($line =~ /$mailbox/go); my @addrs; - foreach (@mailboxes) { + while ($line =~ /(?$mailbox)/go) { + local $_ = $+{mailbox}; my $original = $_; - my @comments = /($comment)/go; + my @comments = /$comment/go; s/$comment//go if @comments; my ($user, $host, $com); - ($user, $host) = ($1, $2) if s/<($local_part)\@($domain)>\s*\z//o; + ($user, $host) = ($+{user}, $+{host}) if s/<(?$local_part)\@(?$domain)>\s*\z//o; if (! defined($user) || ! defined($host)) { - s/($local_part)\@($domain)//o; - ($user, $host) = ($1, $2); + s/(?$local_part)\@(?$domain)//o; + ($user, $host) = ($+{user}, $+{host}); } unless ($UNICODE) { diff --git a/t/speed.t b/t/speed.t index 1092f49..c8b15f3 100644 --- a/t/speed.t +++ b/t/speed.t @@ -2,7 +2,7 @@ use strict; use Email::Address; -use Test::More tests => 1; +use Test::More tests => 2; my $email = "\"Hello\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\" <\@m>"; my ($ea) = Email::Address->parse($email); @@ -13,3 +13,9 @@ is( 'Bad address does not parse, but is not really slow' ); +my $email = '\(¯¯`·.¥«P®ÎÑç€ØfTh€ÐÅ®K»¥.·`¯¯\) , "(> \" \" <) ( =\'o\'= ) (\")___(\") sWeEtAnGeLtHePrInCeSsOfThEsKy" , "(i)cRiStIaN(i)" , "(S)MaNu_vuOLeAmMazZaReNimOe(*)MiAo(@)" '; +my $email2 = ", $email"; +$email = $email . ($email2 x 10); +my @emails = Email::Address->parse($email); + +is(@emails, 0, 'Bad addresses do not parse, but do not take for ever'); diff --git a/t/tests.t b/t/tests.t index 268a1de..6f3adf9 100644 --- a/t/tests.t +++ b/t/tests.t @@ -1629,6 +1629,16 @@ my @list = ( ], ], ], + [ + q{"Matthew" (Matthew (GSC))}, + [ + [ + 'Matthew', + 'matthew-- ATAT --example.org', + 'Matthew (GSC)', + ], + ], + ], ); my $tests = 1; From 7205fc9e972eeaf13ba975fdfdf28904ccb3c72b Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Sat, 14 May 2016 10:40:46 +0100 Subject: [PATCH 5/7] Don't extract "comments" from in quoted strings. Fixes https://rt.cpan.org/Public/Bug/Display.html?id=80665 --- lib/Email/Address.pm | 16 ++++++++++++++-- t/tests.t | 14 ++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/lib/Email/Address.pm b/lib/Email/Address.pm index 7255a4d..c9824bc 100644 --- a/lib/Email/Address.pm +++ b/lib/Email/Address.pm @@ -54,6 +54,10 @@ my $domain = qr/$dot_atom|$domain_literal/; my $display_name = $phrase; +# This is for extracting comments, but not from inside quoted strings or domain +# literals. +my $parts = qr/("(?>$qcontent*)")|(\[(?>$dtext*)\])|$comment|([^\["(]+)/; + =head2 Package Variables B Email isn't easy (if even possible) to parse with a regex, I$local_part)\@(?$domain)>\s*\z//o; diff --git a/t/tests.t b/t/tests.t index 6f3adf9..dda6414 100644 --- a/t/tests.t +++ b/t/tests.t @@ -1488,9 +1488,9 @@ my @list = ( '"Greg Norris (humble visionary genius)" , ', [ [ - 'Greg Norris ', + 'Greg Norris (humble visionary genius)', 'nextrightmove-- ATAT --bang.example.net', - '(humble visionary genius)' + undef, ], [ undef, @@ -1639,6 +1639,16 @@ my @list = ( ], ], ], + [ + q{"John (imperator) Doe" (Comment with "quotes"!)}, + [ + [ + 'John (imperator) Doe', + 'john.doe-- ATAT --example.com', + 'Comment with "quotes"!', + ], + ] + ], ); my $tests = 1; From c61bb289fdbbc645775c41b185c44bc8e5e9456d Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Sat, 14 May 2016 11:35:38 +0100 Subject: [PATCH 6/7] Fetch phrase/user/host from main regex. --- lib/Email/Address.pm | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/lib/Email/Address.pm b/lib/Email/Address.pm index c9824bc..dd85f16 100644 --- a/lib/Email/Address.pm +++ b/lib/Email/Address.pm @@ -105,9 +105,9 @@ following comment. =cut -our $addr_spec = qr/$local_part\@$domain/; +our $addr_spec = qr/(?$local_part)\@(?$domain)/; our $angle_addr = qr/$cfws*<$addr_spec>$cfws*/; -our $name_addr = qr/$display_name?$angle_addr/; +our $name_addr = qr/(?$display_name?)$angle_addr/; our $mailbox = qr/$name_addr|$addr_spec/; sub _PHRASE () { 0 } @@ -178,6 +178,14 @@ sub parse { while ($line =~ /(?$mailbox)/go) { local $_ = $+{mailbox}; my $original = $_; + my $phrase = $+{display_name}; + my $user = $+{local_part}; + my $host = $+{domain}; + + unless ($UNICODE) { + next if $user =~ /\P{ASCII}/; + next if $host =~ /\P{ASCII}/; + } my @comments; my $new = ''; @@ -190,20 +198,6 @@ sub parse { } $_ = $new; - my ($user, $host, $com); - ($user, $host) = ($+{user}, $+{host}) if s/<(?$local_part)\@(?$domain)>\s*\z//o; - if (! defined($user) || ! defined($host)) { - s/(?$local_part)\@(?$domain)//o; - ($user, $host) = ($+{user}, $+{host}); - } - - unless ($UNICODE) { - next if $user =~ /\P{ASCII}/; - next if $host =~ /\P{ASCII}/; - } - - my ($phrase) = /($display_name)/o; - for ( $phrase, $host, $user, @comments ) { next unless defined $_; s/^\s+//; From 5182577ff2f3282f33e460b145e396316d9d7354 Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Sat, 14 May 2016 12:03:29 +0100 Subject: [PATCH 7/7] Parse display name for multiple quoted strings. Tidy up handling of phrase (display name) to be consistent throughout; a phrase will be treated as is unless it starts and ends with double quote marks, in which case it will be treated as a quoted string, unquoted and unescaped. Fixes comment in #13. --- lib/Email/Address.pm | 33 ++++++++++++++++++++++++++------- t/tests.t | 20 ++++++++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/lib/Email/Address.pm b/lib/Email/Address.pm index dd85f16..da64411 100644 --- a/lib/Email/Address.pm +++ b/lib/Email/Address.pm @@ -55,7 +55,7 @@ my $domain = qr/$dot_atom|$domain_literal/; my $display_name = $phrase; # This is for extracting comments, but not from inside quoted strings or domain -# literals. +# literals; or quoted strings from in phrases. my $parts = qr/("(?>$qcontent*)")|(\[(?>$dtext*)\])|$comment|([^\["(]+)/; =head2 Package Variables @@ -205,7 +205,17 @@ sub parse { $_ = undef unless length $_; } - $phrase =~ s/\\(.)/$1/g if $phrase; + $new = ''; + while ($phrase && $phrase =~ /$parts/go) { + my ($q, $d, $c, $o) = ($1, $2, $3, $4); + $new .= $d, next if $d; # Shouldn't be any + $new .= $c, next if $c; # Shouldn't be any + $new .= $o, next if $o; + $q =~ s/\A"(.+)"\z/$1/; + $q =~ s/\\(.)/$1/g; + $new .= $q; + } + $phrase = $new if $new; my $new_comment = join q{ }, @comments; push @addrs, @@ -226,17 +236,28 @@ sub parse { Constructs and returns a new C object. Takes four positional arguments: phrase, email, and comment, and original string. +If phrase starts and ends with quotes, the phrase will be assumed to be a +quoted string. Otherwise it will be treated as is. + The original string should only really be set using C. =cut sub new { my ($class, $phrase, $email, $comment, $orig) = @_; - $phrase =~ s/\A"(.+)"\z/$1/ if $phrase; + $phrase = _dephrase($phrase) if $phrase; bless [ $phrase, $email, $comment, $orig ] => $class; } +sub _dephrase { + my $phrase = shift; + return $phrase unless $phrase =~ /\A"(.+)"\z/; + $phrase =~ s/\A"(.+)"\z/$1/; + $phrase =~ s/($quoted_pair)/substr $1, -1/goe; + return $phrase; +} + =item purge_cache Email::Address->purge_cache; @@ -408,7 +429,7 @@ sub _enquoted_phrase { # if it's encoded -- rjbs, 2007-02-28 return $phrase if $phrase =~ /\A=\?.+\?=\z/; - $phrase =~ s/\A"(.+)"\z/$1/; + $phrase = _dephrase($phrase); $phrase =~ s/([\\"])/\\$1/g; return qq{"$phrase"}; @@ -436,9 +457,7 @@ sub name { my ($self) = @_; my $name = q{}; if ( $name = $self->[_PHRASE] ) { - $name =~ s/^"//; - $name =~ s/"$//; - $name =~ s/($quoted_pair)/substr $1, -1/goe; + $name = _dephrase($name); } elsif ( $name = $self->[_COMMENT] ) { $name =~ s/^\(//; $name =~ s/\)$//; diff --git a/t/tests.t b/t/tests.t index dda6414..0038b70 100644 --- a/t/tests.t +++ b/t/tests.t @@ -1649,6 +1649,26 @@ my @list = ( ], ] ], + [ + q{Matthew "the" Example }, + [ + [ + 'Matthew the Example', + 'matthew-- ATAT --example.org', + undef, + ], + ], + ], + [ + q{"Matthew" the "Example" }, + [ + [ + 'Matthew the Example', + 'matthew-- ATAT --example.org', + undef, + ], + ], + ], ); my $tests = 1;