1717
1818class EmailIdentity (object ):
1919 def __init__ (self , manager , name , email ):
20+ """
21+ Return a Person entity that encodes the name and e-mail of
22+ an entity found in an e-mail header.
23+
24+ We want to create a Person entity even if we only have
25+ a valid name, or a valid e-mail.
26+ """
2027 self .email = ascii_text (stringify (email ))
2128 self .name = stringify (name )
2229 if not self .name :
2330 self .name = None
2431 if not registry .email .validate (self .email ):
2532 self .email = None
33+ # If the value stored in name is a valid e-mail
34+ # store it in self.email and set self.name to None
2635 if self .name and registry .email .validate (self .name ):
2736 self .email = self .email or ascii_text (self .name )
2837 self .name = None
@@ -38,9 +47,11 @@ def __init__(self, manager, name, email):
3847 self .label = self .name
3948
4049 self .entity = None
41- key = registry .email .node_id_safe (self .email )
42- if self .name is not None and len (self .name ) > 10 :
43- key = key or registry .name .node_id_safe (self .name )
50+
51+ if not self .email :
52+ return
53+
54+ key = self .email .strip ().lower ()
4455 if key is not None :
4556 fragment = safe_fragment (self .label )
4657 self .entity = manager .make_entity ("Person" )
@@ -81,14 +92,24 @@ def ingest_attachment(self, entity, name, mime_type, body):
8192 self .manager .queue_entity (child )
8293
def get_header(self, msg, *headers):
    """Collect the distinct, stripped values of the given message headers.

    The parsed values returned by ``msg.get_all`` are not always split
    correctly when a header uses ";" as a separator, so the raw values
    stored on the message are additionally split on ";" and each part is
    included alongside the parsed value.

    :param msg: message object exposing ``get_all`` and (optionally) the
        raw ``_headers`` list of ``(name, value)`` pairs.
    :param headers: header names to look up.
    :return: list of non-empty strings, de-duplicated after stripping and
        kept in first-seen order.
    """
    # Index the raw values by lower-cased name and keep every occurrence;
    # a plain dict() over the pairs would keep only the last repeated
    # header and would miss names that differ in case.
    raw_headers = {}
    for raw_name, raw_value in getattr(msg, "_headers", None) or []:
        raw_headers.setdefault(str(raw_name).lower(), []).append(raw_value)

    values = []
    seen = set()

    def collect(candidate):
        # Strip before de-duplicating so "a" and " a" count as one value;
        # str() guards against header objects that are not plain strings.
        text = str(candidate).strip()
        if text and text not in seen:
            seen.add(text)
            values.append(text)

    for header in headers:
        try:
            for value in ensure_list(msg.get_all(header)):
                collect(value)
            for value in raw_headers.get(str(header).lower(), []):
                for part in str(value).split(";"):
                    collect(part)
        except (TypeError, IndexError, AttributeError, ValueError) as exc:
            log.warning("Failed to parse [%s]: %s", header, exc)
    return values
92113
93114 def get_dates (self , msg , * headers ):
94115 dates = []
@@ -118,6 +139,12 @@ def apply_identities(self, entity, identities, eprop=None, lprop=None):
118139 entity .add ("namesMentioned" , identity .name )
119140 entity .add ("emailMentioned" , identity .email )
120141
def apply_raw(self, msg, entity, lprop, *headers):
    """Add the raw values of the given headers to ``entity`` under ``lprop``.

    The values are obtained from ``self.get_header``; every double-quote
    character is removed from a value before it is added.
    """
    for header_text in self.get_header(msg, *headers):
        entity.add(lprop, header_text.replace('"', ""))
147+
121148 def parse_message_ids (self , values ):
122149 message_ids = []
123150 for value in ensure_list (values ):
@@ -196,15 +223,20 @@ def extract_msg_headers(self, entity, msg):
196223
197224 sender = self .get_header_identities (msg , "Sender" , "X-Sender" )
198225 self .apply_identities (entity , sender , "emitters" , "sender" )
226+ self .apply_raw (msg , entity , "sender" , "Sender" , "X-Sender" )
199227
200228 froms = self .get_header_identities (msg , "From" , "X-From" ) # codespell:ignore
201229 self .apply_identities (entity , froms , "emitters" , "from" ) # codespell:ignore
230+ self .apply_raw (msg , entity , "from" , "From" , "X-From" )
202231
203232 tos = self .get_header_identities (msg , "To" , "Resent-To" )
204233 self .apply_identities (entity , tos , "recipients" , "to" )
234+ self .apply_raw (msg , entity , "to" , "To" , "Resent-To" )
205235
206236 ccs = self .get_header_identities (msg , "CC" , "Cc" , "Resent-Cc" )
207237 self .apply_identities (entity , ccs , "recipients" , "cc" )
238+ self .apply_raw (msg , entity , "cc" , "CC" , "Cc" , "Resent-Cc" )
208239
209240 bccs = self .get_header_identities (msg , "Bcc" , "BCC" , "Resent-Bcc" )
210241 self .apply_identities (entity , bccs , "recipients" , "bcc" )
242+ self .apply_raw (msg , entity , "bcc" , "Bcc" , "BCC" , "Resent-Bcc" )
0 commit comments