Skip to content

Commit ccd28c3

Browse files
committed
Reveal raw data from email headers; Don't create Person entities without an email address
1 parent 511f516 commit ccd28c3

File tree

1 file changed

+38
-6
lines changed

1 file changed

+38
-6
lines changed

ingestors/support/email.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,21 @@
1717

1818
class EmailIdentity(object):
1919
def __init__(self, manager, name, email):
20+
"""
21+
Build a Person entity that encodes the name and e-mail of
22+
an entity found in an e-mail header.
23+
24+
A Person entity is only created when a valid e-mail address
25+
is available; a valid name on its own is not enough.
26+
"""
2027
self.email = ascii_text(stringify(email))
2128
self.name = stringify(name)
2229
if not self.name:
2330
self.name = None
2431
if not registry.email.validate(self.email):
2532
self.email = None
33+
# If the value stored in name is a valid e-mail
34+
# use it as self.email (unless one is already set) and set self.name to None
2635
if self.name and registry.email.validate(self.name):
2736
self.email = self.email or ascii_text(self.name)
2837
self.name = None
@@ -38,9 +47,11 @@ def __init__(self, manager, name, email):
3847
self.label = self.name
3948

4049
self.entity = None
41-
key = registry.email.node_id_safe(self.email)
42-
if self.name is not None and len(self.name) > 10:
43-
key = key or registry.name.node_id_safe(self.name)
50+
51+
if not self.email:
52+
return
53+
54+
key = self.email.strip().lower()
4455
if key is not None:
4556
fragment = safe_fragment(self.label)
4657
self.entity = manager.make_entity("Person")
@@ -81,14 +92,24 @@ def ingest_attachment(self, entity, name, mime_type, body):
8192
self.manager.queue_entity(child)
8293

8394
def get_header(self, msg, *headers):
    """
    Collect the values of the given headers from an e-mail message.

    As seen in real world, we can't rely on the correct parsing of
    header values by the python built-in email module. Therefore, in
    addition to the values returned by ``msg.get_all()``, the raw stored
    header values are split on ";" and every piece is returned as well.

    :param msg: message object providing ``get_all()``; raw values are
        read from its ``_headers`` sequence of ``(name, value)`` pairs
        when that attribute is present.
    :param headers: header names to look up (matched case-insensitively).
    :return: list of unique, whitespace-stripped, non-empty strings in
        first-seen order (parsed values first, then raw pieces).
    """
    # Index the raw values by lower-cased name so that the lookup is
    # case-insensitive (like ``get_all``) and repeated headers are all
    # kept instead of only the last one.
    raw_headers = {}
    for name, raw_value in getattr(msg, "_headers", None) or []:
        raw_headers.setdefault(str(name).lower(), []).append(raw_value)

    # A dict is used as an insertion-ordered set to keep the result
    # deterministic between runs.
    unique = {}
    for header in headers:
        candidates = []
        try:
            candidates.extend(msg.get_all(header) or [])
        except (TypeError, IndexError, AttributeError, ValueError) as exc:
            # A parser failure must not prevent the raw fallback below.
            log.warning("Failed to parse [%s]: %s", header, exc)
        # NOTE: this is a plain split; a ";" inside a quoted display
        # name is split as well.
        for raw_value in raw_headers.get(str(header).lower(), ()):
            candidates.extend(str(raw_value).split(";"))
        for candidate in candidates:
            # Strip before de-duplicating so " a" and "a" collapse, and
            # drop the empty pieces produced by a trailing ";".
            text = str(candidate).strip()
            if text:
                unique.setdefault(text, None)
    return list(unique)
92113

93114
def get_dates(self, msg, *headers):
94115
dates = []
@@ -118,6 +139,12 @@ def apply_identities(self, entity, identities, eprop=None, lprop=None):
118139
entity.add("namesMentioned", identity.name)
119140
entity.add("emailMentioned", identity.email)
120141

142+
def apply_raw(self, msg, entity, lprop, *headers):
    """
    Add the header values of a message to an entity property.

    Every value returned by ``self.get_header(msg, *headers)`` has its
    double quotes removed and is then added to ``entity`` under the
    property ``lprop``.

    :param msg: message object handed through to ``get_header``.
    :param entity: object exposing ``add(prop, value)``.
    :param lprop: name of the property that receives the values.
    :param headers: header names to read.
    """
    for raw_value in self.get_header(msg, *headers):
        # Removing quotes can leave stray whitespace (e.g. '"" <a@b>')
        # or nothing at all (e.g. '""'); tidy up and skip empty values.
        cleaned = raw_value.replace('"', "").strip()
        if cleaned:
            entity.add(lprop, cleaned)
147+
121148
def parse_message_ids(self, values):
122149
message_ids = []
123150
for value in ensure_list(values):
@@ -196,15 +223,20 @@ def extract_msg_headers(self, entity, msg):
196223

197224
sender = self.get_header_identities(msg, "Sender", "X-Sender")
198225
self.apply_identities(entity, sender, "emitters", "sender")
226+
self.apply_raw(msg, entity, "sender", "Sender", "X-Sender")
199227

200228
froms = self.get_header_identities(msg, "From", "X-From") # codespell:ignore
201229
self.apply_identities(entity, froms, "emitters", "from") # codespell:ignore
230+
self.apply_raw(msg, entity, "from", "From", "X-From")
202231

203232
tos = self.get_header_identities(msg, "To", "Resent-To")
204233
self.apply_identities(entity, tos, "recipients", "to")
234+
self.apply_raw(msg, entity, "to", "To", "Resent-To")
205235

206236
ccs = self.get_header_identities(msg, "CC", "Cc", "Resent-Cc")
207237
self.apply_identities(entity, ccs, "recipients", "cc")
238+
self.apply_raw(msg, entity, "cc", "CC", "Cc", "Resent-Cc")
208239

209240
bccs = self.get_header_identities(msg, "Bcc", "BCC", "Resent-Bcc")
210241
self.apply_identities(entity, bccs, "recipients", "bcc")
242+
self.apply_raw(msg, entity, "bcc", "Bcc", "BCC", "Resent-Bcc")

0 commit comments

Comments
 (0)