From a6a074f2e9a98078c79cd306b4391762e7c9620e Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 6 Apr 2025 09:27:17 -0700 Subject: [PATCH 1/8] In progress test of new RTF deencapsulation adjustments --- CHANGELOG.md | 3 +++ README.rst | 4 +-- extract_msg/__init__.py | 4 +-- extract_msg/constants/re.py | 8 ++++++ extract_msg/msg_classes/message_base.py | 8 +++++- extract_msg/utils.py | 35 +++++++++++++++++++++++++ 6 files changed, 57 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21463d41..0d8e34cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +**v0.54.1** +* Added code to attempt to significantly improve RTF deencapsulation times. This tries to strip away unneeded data before passing it to RTFDE. Some times have been detected to be improved by 75% or more. + **v0.54.0** * [[TeamMsgExtractor #456](https://github.com/TeamMsgExtractor/msg-extractor/issues/456)] Changed the prepared html output to use plainly encoded HTML instead of prettified, since current prettification options used mangles the output and causes the output to sometimes be very large. diff --git a/README.rst b/README.rst index c666d985..2fe2af9e 100644 --- a/README.rst +++ b/README.rst @@ -260,8 +260,8 @@ your access to the newest major version of extract-msg. .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.0-blue.svg - :target: https://pypi.org/project/extract-msg/0.54.0/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.1-blue.svg + :target: https://pypi.org/project/extract-msg/0.54.1/ .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg :target: https://www.python.org/downloads/release/python-3810/ diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 4f4e5a6c..c8f52fd1 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . __author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2025-03-23' -__version__ = '0.54.0' +__date__ = '2025-04-06' +__version__ = '0.54.1' __all__ = [ # Modules: diff --git a/extract_msg/constants/re.py b/extract_msg/constants/re.py index d4cb88d0..e69e1189 100644 --- a/extract_msg/constants/re.py +++ b/extract_msg/constants/re.py @@ -40,3 +40,11 @@ # invalid. INVALID_OLE_PATH: Final[_RE_STR_TYPE] = re.compile(r'[:/\\!]') +# Used as the initial step in stripping RTF files for deencapsulation. Finds +# ignored sections that do not contrain groups *and* finds HTML tag sections +# that are entirely empty. It also then finds sections of data that can be +# merged together without affecting the results +RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf1[^0{}][^{}]*?\\htmlrtf0 ?)|(\{\\\*\\htmltag[0-9]+\})|(\\htmlrtf0 ?\\htmlrtf1? ?)') + +# Finds sections of ignored data that contains groups that are entirely self contained. If any of the { or } characters have a \ in front of them, the processing stops and treats it as not self contained to be safe. + diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py index 5bb42c7f..74f94df8 100644 --- a/extract_msg/msg_classes/message_base.py +++ b/extract_msg/msg_classes/message_base.py @@ -48,7 +48,8 @@ from ..utils import ( addNumToDir, addNumToZipDir, createZipOpen, decodeRfc2047, findWk, htmlSanitize, inputToBytes, inputToString, isEncapsulatedRtf, - prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, validateHtml + prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, stripRtf, + validateHtml ) @@ -1012,6 +1013,11 @@ def deencapsulatedRtf(self) -> Optional[RTFDE.DeEncapsulator]: while body and body[-1] != 125: body = body[:-1] + # Some files take a long time due to how they are structured and + # how RTFDE works. The longer a file would normally take, the + # better this fix works: + body = stripRtf(body) + try: deencapsultor = RTFDE.DeEncapsulator(body) deencapsultor.deencapsulate() diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 887d7eee..d5a16ee9 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -37,6 +37,7 @@ 'rtfSanitizeHtml', 'rtfSanitizePlain', 'setupLogging', + 'stripRtf', 'tryGetMimetype', 'unsignedToSignedInt', 'unwrapMsg', @@ -61,6 +62,7 @@ import logging.config import os import pathlib +import re import shutil import struct import sys @@ -1012,6 +1014,39 @@ def setupLogging(defaultPath = None, defaultLevel = logging.WARN, logfile = None return True +def stripRtf(rtfBody: bytes) -> bytes: + """ + Cleans up RTF before sending it to RTFDE. + + Attempts to find common sections of RTF data that will + """ + # First do an initial strip to simplify our data stream. + rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) + # Now, let's find any self-contained ignorable groups. + + return rtfBody + + +def _stripRtfHelper(match: re.Match[bytes]) -> bytes: + res = match.string + + # If these don't match, don't even try. + if res.count(b'{') != res.count(b'}') or res.count(b'{') == 0: + return res + + # If any group markers are prefixed by a backslash, give up. + if res.find(b'\\{') != -1 or res.find(b'\\}') != -1: + return res + + # Last little bit of processing to validate everything. We know the {} + # match, but let's be *absolutely* sure. + # TODO + + return res + + + + def tryGetMimetype(att: AttachmentBase, mimetype: Union[str, None]) -> Union[str, None]: """ Uses an optional dependency to try and get the mimetype of an attachment. From 59e4365f1c8c28b8ec672a19a31a3d87c0dd49d5 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 6 Apr 2025 09:28:00 -0700 Subject: [PATCH 2/8] Add todo markers --- extract_msg/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract_msg/utils.py b/extract_msg/utils.py index d5a16ee9..b90a8dbc 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -1023,7 +1023,7 @@ def stripRtf(rtfBody: bytes) -> bytes: # First do an initial strip to simplify our data stream. rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) # Now, let's find any self-contained ignorable groups. - + # TODO return rtfBody From fe371759f2c979a8c98dbea26ce8e1f124831f64 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 6 Apr 2025 11:18:10 -0700 Subject: [PATCH 3/8] Further refine the RTF stripping --- extract_msg/constants/re.py | 6 +++++- extract_msg/utils.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/extract_msg/constants/re.py b/extract_msg/constants/re.py index e69e1189..e70ccd1c 100644 --- a/extract_msg/constants/re.py +++ b/extract_msg/constants/re.py @@ -44,7 +44,11 @@ # ignored sections that do not contrain groups *and* finds HTML tag sections # that are entirely empty. It also then finds sections of data that can be # merged together without affecting the results -RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf1[^0{}][^{}]*?\\htmlrtf0 ?)|(\{\\\*\\htmltag[0-9]+\})|(\\htmlrtf0 ?\\htmlrtf1? ?)') +RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf[^0{}][^{}]*?\\htmlrtf0 ?)|(\{\\\*\\htmltag[0-9]+\})|(\\htmlrtf0 ?\\htmlrtf1? ?)|(\\htmlrtf1? ?\{\}\\htmlrtf0 ?)|(\\htmlrtf1? ?\\\'a0\\htmlrtf0 ?)') + +# +#RTF_BODY_STRIP_PRE_CLOSE: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf1? ?}\\htmlrtf0 ?)(\\htmlrtf1? ?[^0{}][^{}]*?} ?\\htmlrtf0 ?)') +RTF_BODY_STRIP_PRE_OPEN: Final[_RE_BYTES_TYPE] = re.compile(rb'\\htmlrtf1? ?{[^{}]*?\\htmlrtf0 ?') # Finds sections of ignored data that contains groups that are entirely self contained. If any of the { or } characters have a \ in front of them, the processing stops and treats it as not self contained to be safe. diff --git a/extract_msg/utils.py b/extract_msg/utils.py index b90a8dbc..5e9c10b5 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -1020,7 +1020,16 @@ def stripRtf(rtfBody: bytes) -> bytes: Attempts to find common sections of RTF data that will """ - # First do an initial strip to simplify our data stream. + # First, do a pre-strip to try and simplify ignored sections as much as possible. + def sub(k1): + def jjj(k: re.Match): + print(k.expand('\\g<0>')) + return k1 + return jjj + rtfBody = constants.re.RTF_BODY_STRIP_PRE_OPEN.sub(rb'\\htmlrtf{\\htmlrtf0 ', rtfBody) + print('AAAAAAAAAAAAAAAAAAAAAAAAAA') + #rtfBody = constants.re.RTF_BODY_STRIP_PRE_CLOSE.sub(sub(b'\\htmlrtf}\\htmlrtf0 '), rtfBody) + # Second do an initial strip to simplify our data stream. rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) # Now, let's find any self-contained ignorable groups. # TODO From 007b15dab40d0aaba45f0ff3bc1ccc9e34adda30 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 6 Apr 2025 13:52:47 -0700 Subject: [PATCH 4/8] Further tuning --- extract_msg/constants/re.py | 9 +++------ extract_msg/utils.py | 13 ++++--------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/extract_msg/constants/re.py b/extract_msg/constants/re.py index e70ccd1c..40a1afdc 100644 --- a/extract_msg/constants/re.py +++ b/extract_msg/constants/re.py @@ -44,11 +44,8 @@ # ignored sections that do not contrain groups *and* finds HTML tag sections # that are entirely empty. It also then finds sections of data that can be # merged together without affecting the results -RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf[^0{}][^{}]*?\\htmlrtf0 ?)|(\{\\\*\\htmltag[0-9]+\})|(\\htmlrtf0 ?\\htmlrtf1? ?)|(\\htmlrtf1? ?\{\}\\htmlrtf0 ?)|(\\htmlrtf1? ?\\\'a0\\htmlrtf0 ?)') +RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf[^0{}][^{}]*?\\htmlrtf0 ?)|(\{\\\*\\htmltag[0-9]+\})|(\\htmlrtf0 ?\\htmlrtf1? ?)|(\\htmlrtf1? ?\{\}\\htmlrtf0 ?)|(\\htmlrtf1? ?\\\'[a-fA-F0-9]{2}\\htmlrtf0 ?)') -# -#RTF_BODY_STRIP_PRE_CLOSE: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf1? ?}\\htmlrtf0 ?)(\\htmlrtf1? ?[^0{}][^{}]*?} ?\\htmlrtf0 ?)') +# Preprocessing steps to simplify the RTF. +RTF_BODY_STRIP_PRE_CLOSE: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf1? ?}\\htmlrtf0 ?)|(\\htmlrtf1? ?[^0{}][^{}]*?} ?\\htmlrtf0 ?)') RTF_BODY_STRIP_PRE_OPEN: Final[_RE_BYTES_TYPE] = re.compile(rb'\\htmlrtf1? ?{[^{}]*?\\htmlrtf0 ?') - -# Finds sections of ignored data that contains groups that are entirely self contained. If any of the { or } characters have a \ in front of them, the processing stops and treats it as not self contained to be safe. - diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 5e9c10b5..4003a47d 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -1021,18 +1021,13 @@ def stripRtf(rtfBody: bytes) -> bytes: Attempts to find common sections of RTF data that will """ # First, do a pre-strip to try and simplify ignored sections as much as possible. - def sub(k1): - def jjj(k: re.Match): - print(k.expand('\\g<0>')) - return k1 - return jjj rtfBody = constants.re.RTF_BODY_STRIP_PRE_OPEN.sub(rb'\\htmlrtf{\\htmlrtf0 ', rtfBody) - print('AAAAAAAAAAAAAAAAAAAAAAAAAA') - #rtfBody = constants.re.RTF_BODY_STRIP_PRE_CLOSE.sub(sub(b'\\htmlrtf}\\htmlrtf0 '), rtfBody) + rtfBody = constants.re.RTF_BODY_STRIP_PRE_CLOSE.sub(rb'\\htmlrtf}\\htmlrtf0 ', rtfBody) # Second do an initial strip to simplify our data stream. rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) - # Now, let's find any self-contained ignorable groups. - # TODO + + # TODO: Further processing... + return rtfBody From 5a1230b59302b4dbe2cf6b14f8ff513fecc55bda Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Sun, 6 Apr 2025 14:51:17 -0700 Subject: [PATCH 5/8] Tuning --- extract_msg/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 4003a47d..8de239d7 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -1022,7 +1022,7 @@ def stripRtf(rtfBody: bytes) -> bytes: """ # First, do a pre-strip to try and simplify ignored sections as much as possible. rtfBody = constants.re.RTF_BODY_STRIP_PRE_OPEN.sub(rb'\\htmlrtf{\\htmlrtf0 ', rtfBody) - rtfBody = constants.re.RTF_BODY_STRIP_PRE_CLOSE.sub(rb'\\htmlrtf}\\htmlrtf0 ', rtfBody) + rtfBody = constants.re.RTF_BODY_STRIP_PRE_CLOSE.sub(_stripRtfCloseHelper, rtfBody) # Second do an initial strip to simplify our data stream. rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) @@ -1030,6 +1030,12 @@ def stripRtf(rtfBody: bytes) -> bytes: return rtfBody +def _stripRtfCloseHelper(match: re.Match[bytes]) -> bytes: + if (ret := match.expand(b'\\g<0>')).count(b'\\htmlrtf0') > 1: + return ret + + return b'\\htmlrtf}\\htmlrtf0 ' + def _stripRtfHelper(match: re.Match[bytes]) -> bytes: res = match.string From aa5f890f345208eafa391c9f88d122c809f27077 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Mon, 7 Apr 2025 07:13:17 -0700 Subject: [PATCH 6/8] Add a bit more tuning to rtf stripping --- extract_msg/utils.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 8de239d7..28f51939 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -1021,10 +1021,14 @@ def stripRtf(rtfBody: bytes) -> bytes: Attempts to find common sections of RTF data that will """ # First, do a pre-strip to try and simplify ignored sections as much as possible. - rtfBody = constants.re.RTF_BODY_STRIP_PRE_OPEN.sub(rb'\\htmlrtf{\\htmlrtf0 ', rtfBody) + rtfBody = constants.re.RTF_BODY_STRIP_PRE_OPEN.sub(_stripRtfOpenHelper, rtfBody) rtfBody = constants.re.RTF_BODY_STRIP_PRE_CLOSE.sub(_stripRtfCloseHelper, rtfBody) # Second do an initial strip to simplify our data stream. rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) + # Do it one more time to help with some things that might not have gotten + # caught the first time, perhaps because something now exists after + # stripping. + rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) # TODO: Further processing... @@ -1034,9 +1038,19 @@ def _stripRtfCloseHelper(match: re.Match[bytes]) -> bytes: if (ret := match.expand(b'\\g<0>')).count(b'\\htmlrtf0') > 1: return ret + if b'\\f' in ret: + return ret + return b'\\htmlrtf}\\htmlrtf0 ' +def _stripRtfOpenHelper(match: re.Match[bytes]) -> bytes: + if b'\\f' in (ret := match.expand(b'\\g<0>')): + return ret + + return b'\\htmlrtf{\\htmlrtf0 ' + + def _stripRtfHelper(match: re.Match[bytes]) -> bytes: res = match.string From 8717949a70cd4366ed8232600f568d090cf8519c Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Thu, 10 Apr 2025 13:49:57 -0700 Subject: [PATCH 7/8] Fix for issue discovered in #462 --- CHANGELOG.md | 3 ++- extract_msg/__init__.py | 2 +- extract_msg/msg_classes/msg.py | 22 +++++++++++++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d8e34cb..1f91c7e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ **v0.54.1** -* Added code to attempt to significantly improve RTF deencapsulation times. This tries to strip away unneeded data before passing it to RTFDE. Some times have been detected to be improved by 75% or more. +* [[TeamMsgExtractor #462](https://github.com/TeamMsgExtractor/msg-extractor/issues/462)] Fix potential issue where child MSG might have incompatible encoding to parent MSG when trying to grab a stream from the parent. +* Added code to attempt to significantly improve RTF deencapsulation times. This tries to strip away unneeded data before passing it to `RTFDE`. This shows improvements on all files that take more than one second. Currently, this actually fixes some files previously outputting wrong from `RTFDE` when deencapsulating the HTML body, specifically around non breaking spaces sometimes not transferring over. **v0.54.0** * [[TeamMsgExtractor #456](https://github.com/TeamMsgExtractor/msg-extractor/issues/456)] Changed the prepared html output to use plainly encoded HTML instead of prettified, since current prettification options used mangles the output and causes the output to sometimes be very large. diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index c8f52fd1..030e9316 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,7 +27,7 @@ # along with this program. If not, see . __author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2025-04-06' +__date__ = '2025-04-10' __version__ = '0.54.1' __all__ = [ diff --git a/extract_msg/msg_classes/msg.py b/extract_msg/msg_classes/msg.py index ea73756b..9f9af412 100644 --- a/extract_msg/msg_classes/msg.py +++ b/extract_msg/msg_classes/msg.py @@ -203,7 +203,23 @@ def __init__(self, path, **kwargs): self.__overrideEncoding = overrideEncoding if prefix and not filename: - filename = self.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False) + # We actually need to get this from the parent. + msg = None + parentNeedsClose = False + if self.__parentMsg: + msg = self.__parentMsg() + if msg is None: + # We *NEED* the parent here, so we're going to do something + # dumb and just generate it *manually*, grab what we need, # and them immediately close it. + # + # We don't need anything more advanced than MSGFile. + msg = MSGFile(path, prefix = prefixl[:-2], delayAttachments = True) + parentNeedsClose = True + # Now that we know we have the parent, grab the stream. + filename = msg.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False) + # Now if we opened the parent, close it. + if parentNeedsClose: + msg.close() if filename: self.filename = filename elif hasattr(path, '__len__'): @@ -492,7 +508,7 @@ def export(self, path, allowBadEmbed: bool = False) -> None: :param path: A path-like object (including strings and ``pathlib.Path`` objects) or an IO device with a write method which accepts bytes. - :param allowBadEmbed: If True, attempts to skip steps that will fail if + :param allowBadEmbed: If True, attempts to skip steps that will fail if the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook. """ from ..ole_writer import OleWriter @@ -507,7 +523,7 @@ def exportBytes(self, allowBadEmbed: bool = False) -> bytes: """ Saves a new copy of the MSG file, returning the bytes. - :param allowBadEmbed: If True, attempts to skip steps that will fail if + :param allowBadEmbed: If True, attempts to skip steps that will fail if the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook. """ out = io.BytesIO() From 5931d1d9056ecffb0acb1fbe9dc95120b55778b8 Mon Sep 17 00:00:00 2001 From: TheElementalOfDestruction Date: Thu, 10 Apr 2025 13:53:09 -0700 Subject: [PATCH 8/8] Fix tests and 3.8 --- extract_msg/constants/re.py | 3 +++ extract_msg/utils.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/extract_msg/constants/re.py b/extract_msg/constants/re.py index 40a1afdc..da010212 100644 --- a/extract_msg/constants/re.py +++ b/extract_msg/constants/re.py @@ -8,6 +8,9 @@ 'HTML_SAN_SPACE', 'INVALID_FILENAME_CHARS', 'INVALID_OLE_PATH', + 'RTF_BODY_STRIP_INIT', + 'RTF_BODY_STRIP_PRE_CLOSE', + 'RTF_BODY_STRIP_PRE_OPEN', 'RTF_ENC_BODY_START', ] diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 28f51939..ab34c99d 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -1034,7 +1034,7 @@ def stripRtf(rtfBody: bytes) -> bytes: return rtfBody -def _stripRtfCloseHelper(match: re.Match[bytes]) -> bytes: +def _stripRtfCloseHelper(match: re.Match) -> bytes: if (ret := match.expand(b'\\g<0>')).count(b'\\htmlrtf0') > 1: return ret @@ -1044,14 +1044,14 @@ def _stripRtfCloseHelper(match: re.Match[bytes]) -> bytes: return b'\\htmlrtf}\\htmlrtf0 ' -def _stripRtfOpenHelper(match: re.Match[bytes]) -> bytes: +def _stripRtfOpenHelper(match: re.Match) -> bytes: if b'\\f' in (ret := match.expand(b'\\g<0>')): return ret return b'\\htmlrtf{\\htmlrtf0 ' -def _stripRtfHelper(match: re.Match[bytes]) -> bytes: +def _stripRtfHelper(match: re.Match) -> bytes: res = match.string # If these don't match, don't even try.