diff --git a/CHANGELOG.md b/CHANGELOG.md index 21463d41..1f91c7e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +**v0.54.1** +* [[TeamMsgExtractor #462](https://github.com/TeamMsgExtractor/msg-extractor/issues/462)] Fix potential issue where a child MSG might have an encoding incompatible with the parent MSG when trying to grab a stream from the parent. +* Added code to attempt to significantly improve RTF deencapsulation times. This tries to strip away unneeded data before passing it to `RTFDE`. This shows improvements on all files that take more than one second. Currently, this actually fixes some files that previously produced incorrect output from `RTFDE` when deencapsulating the HTML body, specifically around non-breaking spaces sometimes not transferring over. + **v0.54.0** * [[TeamMsgExtractor #456](https://github.com/TeamMsgExtractor/msg-extractor/issues/456)] Changed the prepared html output to use plainly encoded HTML instead of prettified, since current prettification options used mangles the output and causes the output to sometimes be very large. diff --git a/README.rst b/README.rst index c666d985..2fe2af9e 100644 --- a/README.rst +++ b/README.rst @@ -260,8 +260,8 @@ your access to the newest major version of extract-msg. .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.0-blue.svg - :target: https://pypi.org/project/extract-msg/0.54.0/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.54.1-blue.svg + :target: https://pypi.org/project/extract-msg/0.54.1/ .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg :target: https://www.python.org/downloads/release/python-3810/ diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index 4f4e5a6c..030e9316 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . 
__author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2025-03-23' -__version__ = '0.54.0' +__date__ = '2025-04-10' +__version__ = '0.54.1' __all__ = [ # Modules: diff --git a/extract_msg/constants/re.py b/extract_msg/constants/re.py index d4cb88d0..da010212 100644 --- a/extract_msg/constants/re.py +++ b/extract_msg/constants/re.py @@ -8,6 +8,9 @@ 'HTML_SAN_SPACE', 'INVALID_FILENAME_CHARS', 'INVALID_OLE_PATH', + 'RTF_BODY_STRIP_INIT', + 'RTF_BODY_STRIP_PRE_CLOSE', + 'RTF_BODY_STRIP_PRE_OPEN', 'RTF_ENC_BODY_START', ] @@ -40,3 +43,12 @@ # invalid. INVALID_OLE_PATH: Final[_RE_STR_TYPE] = re.compile(r'[:/\\!]') +# Used as the initial step in stripping RTF files for deencapsulation. Finds +# ignored sections that do not contain groups *and* finds HTML tag sections +# that are entirely empty. It also then finds sections of data that can be +# merged together without affecting the results. +RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf[^0{}][^{}]*?\\htmlrtf0 ?)|(\{\\\*\\htmltag[0-9]+\})|(\\htmlrtf0 ?\\htmlrtf1? ?)|(\\htmlrtf1? ?\{\}\\htmlrtf0 ?)|(\\htmlrtf1? ?\\\'[a-fA-F0-9]{2}\\htmlrtf0 ?)') + +# Preprocessing steps to simplify the RTF. +RTF_BODY_STRIP_PRE_CLOSE: Final[_RE_BYTES_TYPE] = re.compile(rb'(\\htmlrtf1? ?}\\htmlrtf0 ?)|(\\htmlrtf1? ?[^0{}][^{}]*?} ?\\htmlrtf0 ?)') +RTF_BODY_STRIP_PRE_OPEN: Final[_RE_BYTES_TYPE] = re.compile(rb'\\htmlrtf1? 
?{[^{}]*?\\htmlrtf0 ?') diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py index 5bb42c7f..74f94df8 100644 --- a/extract_msg/msg_classes/message_base.py +++ b/extract_msg/msg_classes/message_base.py @@ -48,7 +48,8 @@ from ..utils import ( addNumToDir, addNumToZipDir, createZipOpen, decodeRfc2047, findWk, htmlSanitize, inputToBytes, inputToString, isEncapsulatedRtf, - prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, validateHtml + prepareFilename, rtfSanitizeHtml, rtfSanitizePlain, stripRtf, + validateHtml ) @@ -1012,6 +1013,11 @@ def deencapsulatedRtf(self) -> Optional[RTFDE.DeEncapsulator]: while body and body[-1] != 125: body = body[:-1] + # Some files take a long time due to how they are structured and + # how RTFDE works. The longer a file would normally take, the + # better this fix works: + body = stripRtf(body) + try: deencapsultor = RTFDE.DeEncapsulator(body) deencapsultor.deencapsulate() diff --git a/extract_msg/msg_classes/msg.py b/extract_msg/msg_classes/msg.py index ea73756b..9f9af412 100644 --- a/extract_msg/msg_classes/msg.py +++ b/extract_msg/msg_classes/msg.py @@ -203,7 +203,23 @@ def __init__(self, path, **kwargs): self.__overrideEncoding = overrideEncoding if prefix and not filename: - filename = self.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False) + # We actually need to get this from the parent. + msg = None + parentNeedsClose = False + if self.__parentMsg: + msg = self.__parentMsg() + if msg is None: + # We *NEED* the parent here, so we're going to do something + # dumb and just generate it *manually*, grab what we need, # and then immediately close it. + # + # We don't need anything more advanced than MSGFile. + msg = MSGFile(path, prefix = prefixl[:-2], delayAttachments = True) + parentNeedsClose = True + # Now that we know we have the parent, grab the stream. 
+ filename = msg.getStringStream(prefixl[:-1] + ['__substg1.0_3001'], prefix = False) + # Now if we opened the parent, close it. + if parentNeedsClose: + msg.close() if filename: self.filename = filename elif hasattr(path, '__len__'): @@ -492,7 +508,7 @@ def export(self, path, allowBadEmbed: bool = False) -> None: :param path: A path-like object (including strings and ``pathlib.Path`` objects) or an IO device with a write method which accepts bytes. - :param allowBadEmbed: If True, attempts to skip steps that will fail if + :param allowBadEmbed: If True, attempts to skip steps that will fail if the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook. """ from ..ole_writer import OleWriter @@ -507,7 +523,7 @@ def exportBytes(self, allowBadEmbed: bool = False) -> bytes: """ Saves a new copy of the MSG file, returning the bytes. - :param allowBadEmbed: If True, attempts to skip steps that will fail if + :param allowBadEmbed: If True, attempts to skip steps that will fail if the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook. """ out = io.BytesIO() diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 887d7eee..ab34c99d 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -37,6 +37,7 @@ 'rtfSanitizeHtml', 'rtfSanitizePlain', 'setupLogging', + 'stripRtf', 'tryGetMimetype', 'unsignedToSignedInt', 'unwrapMsg', @@ -61,6 +62,7 @@ import logging.config import os import pathlib +import re import shutil import struct import sys @@ -1012,6 +1014,63 @@ def setupLogging(defaultPath = None, defaultLevel = logging.WARN, logfile = None return True +def stripRtf(rtfBody: bytes) -> bytes: + """ + Cleans up RTF before sending it to RTFDE. + + Attempts to find common sections of RTF data that will not affect the deencapsulated output, then strips or simplifies them. + """ + # First, do a pre-strip to try and simplify ignored sections as much as possible. 
+ rtfBody = constants.re.RTF_BODY_STRIP_PRE_OPEN.sub(_stripRtfOpenHelper, rtfBody) + rtfBody = constants.re.RTF_BODY_STRIP_PRE_CLOSE.sub(_stripRtfCloseHelper, rtfBody) + # Second, do an initial strip to simplify our data stream. + rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) + # Do it one more time to help with some things that might not have gotten + # caught the first time, perhaps because something now exists after + # stripping. + rtfBody = constants.re.RTF_BODY_STRIP_INIT.sub(b'', rtfBody) + + # TODO: Further processing... + + return rtfBody + +def _stripRtfCloseHelper(match: re.Match) -> bytes: + if (ret := match.expand(b'\\g<0>')).count(b'\\htmlrtf0') > 1: + return ret + + if b'\\f' in ret: + return ret + + return b'\\htmlrtf}\\htmlrtf0 ' + + +def _stripRtfOpenHelper(match: re.Match) -> bytes: + if b'\\f' in (ret := match.expand(b'\\g<0>')): + return ret + + return b'\\htmlrtf{\\htmlrtf0 ' + + +def _stripRtfHelper(match: re.Match) -> bytes: + res = match.string # NOTE(review): match.string is the entire subject passed to the pattern, not the matched text; confirm match.group(0) was not intended. + + # If the brace counts don't match, or there are no braces, don't even try. + if res.count(b'{') != res.count(b'}') or res.count(b'{') == 0: + return res + + # If any group markers are prefixed by a backslash, give up. + if res.find(b'\\{') != -1 or res.find(b'\\}') != -1: + return res + + # Last little bit of processing to validate everything. We know the {} + # match, but let's be *absolutely* sure. + # TODO + + return res + + + + def tryGetMimetype(att: AttachmentBase, mimetype: Union[str, None]) -> Union[str, None]: """ Uses an optional dependency to try and get the mimetype of an attachment.