Perl-Email-Project · dracos · May 13, 2016 · May 13, 2016 · May 13, 2016 · May 13, 2016
diff --git a/lib/Email/Address.pm b/lib/Email/Address.pm
@@ -3,9 +3,8 @@ use warnings;
 package Email::Address;
 # ABSTRACT: RFC 2822 Address Parsing and Creation
 
-our $COMMENT_NEST_LEVEL ||= 1;
-our $STRINGIFY          ||= 'format';
-our $COLLAPSE_SPACES      = 1 unless defined $COLLAPSE_SPACES; # I miss //=
+our $STRINGIFY ||= 'format';
+our $UNICODE   ||= 0;
 
 =head1 SYNOPSIS
 
@@ -28,54 +27,37 @@ to be correct, and very very fast.
 my $CTL            = q{\x00-\x1F\x7F};
 my $special        = q{()<>\\[\\]:;@\\\\,."};
 
-my $text           = qr/[^\x0A\x0D]/;
+my $quoted_pair    = qr/\\[[:graph:] \t]/;
 
-my $quoted_pair    = qr/\\$text/;
-
-my $ctext          = qr/(?>[^()\\]+)/;
-my ($ccontent, $comment) = (q{})x2;
-for (1 .. $COMMENT_NEST_LEVEL) {
-  $ccontent = qr/$ctext|$quoted_pair|$comment/;
-  $comment  = qr/\s*\((?:\s*$ccontent)*\s*\)\s*/;
-}
-my $cfws           = qr/$comment|\s+/;
+my $ctext          = qr/[^$CTL()\\]/;
+my $comment        = qr/(?<comment>\((?:$ctext|$quoted_pair|(?&comment))*\))/;
+my $cfws           = qr/(?>$comment|\s+)/;
 
 my $atext          = qq/[^$CTL$special\\s]/;
-my $atom           = qr/$cfws*$atext+$cfws*/;
+my $atom           = qr/$cfws*(?>$atext+)$cfws*/;
 my $dot_atom_text  = qr/$atext+(?:\.$atext+)*/;
-my $dot_atom       = qr/$cfws*$dot_atom_text$cfws*/;
+my $dot_atom       = qr/$cfws*(?>$dot_atom_text)$cfws*/;
 
-my $qtext          = qr/[^\\"]/;
+my $qtext          = qr/[^$CTL\\"]/;
 my $qcontent       = qr/$qtext|$quoted_pair/;
-my $quoted_string  = qr/$cfws*"$qcontent*"$cfws*/;
+my $quoted_string  = qr/$cfws*"(?>$qcontent*)"$cfws*/;
 
 my $word           = qr/$atom|$quoted_string/;
 
-# XXX: This ($phrase) used to just be: my $phrase = qr/$word+/; It was changed
-# to resolve bug 22991, creating a significant slowdown.  Given current speed
-# problems.  Once 16320 is resolved, this section should be dealt with.
-# -- rjbs, 2006-11-11
-#my $obs_phrase     = qr/$word(?:$word|\.|$cfws)*/;
-
-# XXX: ...and the above solution caused endless problems (never returned) when
-# examining this address, now in a test:
-#   admin+=E6=96=B0=E5=8A=A0=E5=9D=A1_Weblog-- ATAT --test.socialtext.com
-# So we disallow the hateful CFWS in this context for now.  Of modern mail
-# agents, only Apple Web Mail 2.0 is known to produce obs-phrase.
-# -- rjbs, 2006-11-19
-my $simple_word    = qr/$atom|\.|\s*"$qcontent+"\s*/;
-my $obs_phrase     = qr/$simple_word+/;
-
-my $phrase         = qr/$obs_phrase|(?:$word+)/;
+my $obs_phrase     = qr/$word(?>(?:$word|\.|$cfws)*)/;
+my $phrase         = qr/$obs_phrase|(?>$word+)/;
 
 my $local_part     = qr/$dot_atom|$quoted_string/;
-my $dtext          = qr/[^\[\]\\]/;
-my $dcontent       = qr/$dtext|$quoted_pair/;
-my $domain_literal = qr/$cfws*\[(?:\s*$dcontent)*\s*\]$cfws*/;
+my $dtext          = qr/[^$CTL\[\]\\]/;
+my $domain_literal = qr/$cfws*\[(?>$dtext*)\]$cfws*/;
 my $domain         = qr/$dot_atom|$domain_literal/;
 
 my $display_name   = $phrase;
 
+# This is for extracting comments, but not from inside quoted strings or domain
+# literals; or quoted strings from in phrases.
+my $parts = qr/("(?>$qcontent*)")|(\[(?>$dtext*)\])|$comment|([^\["(]+)/;
+
 =head2 Package Variables
 
 B<ACHTUNG!>  Email isn't easy (if even possible) to parse with a regex, I<at
@@ -123,10 +105,10 @@ following comment.
 
 =cut
 
-our $addr_spec  = qr/$local_part\@$domain/;
+our $addr_spec  = qr/(?<local_part>$local_part)\@(?<domain>$domain)/;
 our $angle_addr = qr/$cfws*<$addr_spec>$cfws*/;
-our $name_addr  = qr/(?>$display_name?)$angle_addr/;
-our $mailbox    = qr/(?:$name_addr|$addr_spec)$comment*/;
+our $name_addr  = qr/(?<display_name>$display_name?)$angle_addr/;
+our $mailbox    = qr/$name_addr|$addr_spec/;
 
 sub _PHRASE   () { 0 }
 sub _ADDRESS  () { 1 }
@@ -157,24 +139,10 @@ This method returns a list of C<Email::Address> objects it finds in the input
 string.  B<Please note> that it returns a list, and expects that it may find
 multiple addresses.  The behavior in scalar context is undefined.
 
-The specification for an email address allows for infinitely nestable comments.
-That's nice in theory, but a little over done.  By default this module allows
-for one (C<1>) level of nested comments. If you think you need more, modify the
-C<$Email::Address::COMMENT_NEST_LEVEL> package variable to allow more.
-
-  $Email::Address::COMMENT_NEST_LEVEL = 10; # I'm deep
-
-The reason for this hardly-limiting limitation is simple: efficiency.
-
-Long strings of whitespace can be problematic for this module to parse, a bug
-which has not yet been adequately addressed.  The default behavior is now to
-collapse multiple spaces into a single space, which avoids this problem.  To
-prevent this behavior, set C<$Email::Address::COLLAPSE_SPACES> to zero.  This
-variable will go away when the bug is resolved properly.
-
-In accordance with RFC 822 and its descendants, this module demands that email
-addresses be ASCII only.  Any non-ASCII content in the parsed addresses will
-cause the parser to return no results.
+By default, this module mandates that email addresses be ASCII only, and any
+non-ASCII content will cause a blank result. This matches RFCs 822, 2822, and
+5322. If you wish to allow UTF-8 characters in email, as per RFCs 5335 and
+6532, set C<$Email:Address::UNICODE> to 1.
 
 =cut
 
@@ -202,31 +170,33 @@ sub parse {
     my ($class, $line) = @_;
     return unless $line;
 
-    $line =~ s/[ \t]+/ /g if $COLLAPSE_SPACES;
-
     if (my @cached = $class->__get_cached_parse($line)) {
         return @cached;
     }
 
-    my (@mailboxes) = ($line =~ /$mailbox/go);
     my @addrs;
-    foreach (@mailboxes) {
+    while ($line =~ /(?<mailbox>$mailbox)/go) {
+      local $_ = $+{mailbox};
       my $original = $_;
+      my $phrase = $+{display_name};
+      my $user = $+{local_part};
+      my $host = $+{domain};
 
-      my @comments = /($comment)/go;
-      s/$comment//go if @comments;
-
-      my ($user, $host, $com);
-      ($user, $host) = ($1, $2) if s/<($local_part)\@($domain)>\s*\z//o;
-      if (! defined($user) || ! defined($host)) {
-          s/($local_part)\@($domain)//o;
-          ($user, $host) = ($1, $2);
+      unless ($UNICODE) {
+          next if $user =~ /\P{ASCII}/;
+          next if $host =~ /\P{ASCII}/;
       }
 
-      next if $user =~ /\P{ASCII}/;
-      next if $host =~ /\P{ASCII}/;
-
-      my ($phrase)       = /($display_name)/o;
+      my @comments;
+      my $new = '';
+      while (/$parts/go) {
+          my ($q, $d, $c, $o) = ($1, $2, $3, $4);
+          $new .= $q, next if $q;
+          $new .= $d, next if $d;
+          $new .= $o, next if $o;
+          push @comments, $c;
+      }
+      $_ = $new;
 
       for ( $phrase, $host, $user, @comments ) {
         next unless defined $_;
@@ -235,7 +205,17 @@ sub parse {
         $_ = undef unless length $_;
       }
 
-      $phrase =~ s/\\(.)/$1/g if $phrase;
+      $new = '';
+      while ($phrase && $phrase =~ /$parts/go) {
+        my ($q, $d, $c, $o) = ($1, $2, $3, $4);
+        $new .= $d, next if $d; # Shouldn't be any
+        $new .= $c, next if $c; # Shouldn't be any
+        $new .= $o, next if $o;
+        $q =~ s/\A"(.+)"\z/$1/;
+        $q =~ s/\\(.)/$1/g;
+        $new .= $q;
+      }
+      $phrase = $new if $new;
 
       my $new_comment = join q{ }, @comments;
       push @addrs,
@@ -256,17 +236,28 @@ sub parse {
 Constructs and returns a new C<Email::Address> object. Takes four
 positional arguments: phrase, email, and comment, and original string.
 
+If phrase starts and ends with quotes, the phrase will be assumed to be a
+quoted string. Otherwise it will be treated as is.
+
 The original string should only really be set using C<parse>.
 
 =cut
 
 sub new {
   my ($class, $phrase, $email, $comment, $orig) = @_;
-  $phrase =~ s/\A"(.+)"\z/$1/ if $phrase;
+  $phrase = _dephrase($phrase) if $phrase;
 
   bless [ $phrase, $email, $comment, $orig ] => $class;
 }
 
+sub _dephrase {
+    my $phrase = shift;
+    return $phrase unless $phrase =~ /\A"(.+)"\z/;
+    $phrase =~ s/\A"(.+)"\z/$1/;
+    $phrase =~ s/($quoted_pair)/substr $1, -1/goe;
+    return $phrase;
+}
+
 =item purge_cache
 
   Email::Address->purge_cache;
@@ -438,7 +429,7 @@ sub _enquoted_phrase {
   # if it's encoded -- rjbs, 2007-02-28
   return $phrase if $phrase =~ /\A=\?.+\?=\z/;
 
-  $phrase =~ s/\A"(.+)"\z/$1/;
+  $phrase = _dephrase($phrase);
   $phrase =~ s/([\\"])/\\$1/g;
 
   return qq{"$phrase"};
@@ -466,9 +457,7 @@ sub name {
     my ($self) = @_;
     my $name = q{};
     if ( $name = $self->[_PHRASE] ) {
-        $name =~ s/^"//;
-        $name =~ s/"$//;
-        $name =~ s/($quoted_pair)/substr $1, -1/goe;
+        $name = _dephrase($name);
     } elsif ( $name = $self->[_COMMENT] ) {
         $name =~ s/^\(//;
         $name =~ s/\)$//;

diff --git a/t/ascii.t b/t/ascii.t
@@ -10,50 +10,63 @@ my $ascii = q{admin@mozilla.org};
 my $utf_8 = q{аdmin@mozilla.org};
 my $text  = decode('utf-8', $utf_8, Encode::LEAVE_SRC);
 
-my $ok_mixed  = qq{"$text" <$ascii>};
-my $bad_mixed = qq{"$text" <$text>};
+my $ascii_mixed = qq{"$text" <$ascii>};
+my $utf8_mixed = qq{"$text" <$text>};
 
-{
-  my (@addr) = Email::Address->parse($ascii);
-  is(@addr, 1, "an ascii address is a-ok");
+for (0..1) {
+  local $Email::Address::UNICODE = $_;
 
-  # ok( $ascii =~ $Email::Address::addr_spec, "...it =~ addr_spec");
-}
+  {
+    my (@addr) = Email::Address->parse($ascii);
+    is(@addr, 1, "an ascii address is a-ok");
 
-{
-  my (@addr) = Email::Address->parse($ok_mixed);
-  is(@addr, 1, "a quoted non-ascii phrase is a-ok with ascii email");
-}
+    # ok( $ascii =~ $Email::Address::addr_spec, "...it =~ addr_spec");
+  }
 
-{
-  my (@addr) = Email::Address->parse($bad_mixed);
-  is(@addr, 0, "a quoted non-ascii phrase is not okay with non-ascii email");
-}
+  {
+    my (@addr) = Email::Address->parse($ascii_mixed);
+    is(@addr, 1, "a quoted non-ascii phrase is a-ok with ascii email");
+  }
 
-{
-  my (@addr) = Email::Address->parse($utf_8);
-  is(@addr, 0, "utf-8 octet address: not ok");
+  {
+    my (@addr) = Email::Address->parse($utf8_mixed);
+    is(@addr, $Email::Address::UNICODE, "a quoted non-ascii phrase with non-ascii email");
+  }
 
-  # ok( $utf_8 !~ $Email::Address::addr_spec, "...it !~ addr_spec");
-}
+  {
+    my (@addr) = Email::Address->parse($utf_8);
+    is(@addr, $Email::Address::UNICODE, "utf-8 octet address");
 
-{
-  my (@addr) = Email::Address->parse($text);
-  is(@addr, 0, "unicode (decoded) address: not ok");
+    # ok( $utf_8 !~ $Email::Address::addr_spec, "...it !~ addr_spec");
+  }
 
-  # ok( $text =~ $Email::Address::addr_spec, "...it !~ addr_spec");
-}
+  {
+    my (@addr) = Email::Address->parse($text);
+    is(@addr, $Email::Address::UNICODE, "unicode (decoded) address");
+
+    # ok( $text =~ $Email::Address::addr_spec, "...it !~ addr_spec");
+  }
+
+  {
+    my @addr = Email::Address->parse(qq{
+      "Not ascii phras\x{e9}" <good\@email>,
+      b\x{e3}d\@user,
+      bad\@d\x{f6}main,
+      not.bad\@again
+    });
+    is scalar @addr, $Email::Address::UNICODE ? 4 : 2, "correct number of good emails";
+    is "$addr[0]", qq{"Not ascii phras\x{e9}" <good\@email>}, "expected email";
+    if ($Email::Address::UNICODE) {
+      is "$addr[1]", qq{b\x{e3}d\@user}, "expected email";
+      is "$addr[2]", qq{bad\@d\x{f6}main}, "expected email";
+      is "$addr[3]", qq{not.bad\@again}, "expected email";
+    } else {
+      is "$addr[1]", qq{not.bad\@again}, "expected email";
+    }
+  }
+
+  Email::Address->purge_cache;
 
-{
-  my @addr = Email::Address->parse(qq{
-    "Not ascii phras\x{e9}" <good\@email>,
-    b\x{e3}d\@user,
-    bad\@d\x{f6}main,
-    not.bad\@again
-  });
-  is scalar @addr, 2, "correct number of good emails";
-  is "$addr[0]", qq{"Not ascii phras\x{e9}" <good\@email>}, "expected email";
-  is "$addr[1]", qq{not.bad\@again}, "expected email";
 }
 
 done_testing;
diff --git a/t/speed.t b/t/speed.t
@@ -0,0 +1,21 @@
+#!perl
+use strict;
+
+use Email::Address;
+use Test::More tests => 2;
+
+my $email = "\"Hello\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\" <\@m>";
+my ($ea) = Email::Address->parse($email);
+
+is(
+  $ea,
+  undef,
+  'Bad address does not parse, but is not really slow'
+);
+
+my $email = '\(¯¯`·.¥«P®ÎÑç€ØfTh€ÐÅ®K»¥.·`¯¯\) <email () example com>, "(> \" \" <)                              ( =\'o\'= )                              (\")___(\")  sWeEtAnGeLtHePrInCeSsOfThEsKy" <email2 () example com>, "(i)cRiStIaN(i)" <email3 () example com>, "(S)MaNu_vuOLeAmMazZaReNimOe(*)MiAo(@)" <email4 () example com>';
+my $email2 = ", $email";
+$email = $email . ($email2 x 10);
+my @emails = Email::Address->parse($email);
+
+is(@emails, 0, 'Bad addresses do not parse, but do not take for ever');