diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 435247c9..00235552 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index d78b4056..18fcbdd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,17 @@ +**v0.53.1** +* Expanded allowable range for `red-black-tree-mod`. +* Fix issue with `MessageBase.asEmailMessage()` that prevented embedded MSG files from being attached. +* Expand allowable versions of `BeautifulSoup4`. + +**v0.53.0** +* Added tests for many functions in `extract_msg.utils`. +* Fix an issue in `extract_msg.utils.msgPathToString()` that prevented backslashes from being replaced with forward slashes. +* Change the behavior of `extract_msg.utils.minutesToDurationStr()` to properly use plurals. +* Fixed issue in `extract_msg.utils.unwrapMsg()` that would prevent it from working on signed messages due to an API change. +* Added new exception `MimetypeFailureError`. +* Modified the logic of `MessageBase.asEmailMessage()` to use `AttachmentBase/SignedAttachment.name` instead of `getFilename()` which only exists on AttachmentBase. +* Modified the logic of `MessageBase.htmlBodyPrepared()` to properly put the mimetype in image tags to ensure rendering. Logic was also modified to use `encode` instead of `prettify` to reduce computation and output size. + **v0.52.0** * [[TeamMsgExtractor #444](https://github.com/TeamMsgExtractor/msg-extractor/issues/444)] Fix typo in string that prevented HTML body from generating from the plain text body properly. * Adjusted the behavior of `MSGFile.areStringsUnicode` to prioritize the property specified by the parent MSG files for MSG files that are embedded. 
Additionally, added a fallback to rely on whether or not there is a stream using the `001F` type to determine the property value if it is entirely missing. diff --git a/README.rst b/README.rst index f03156c3..2ff4b8a2 100644 --- a/README.rst +++ b/README.rst @@ -260,8 +260,8 @@ your access to the newest major version of extract-msg. .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg :target: LICENSE.txt -.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.52.0-blue.svg - :target: https://pypi.org/project/extract-msg/0.52.0/ +.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.1-blue.svg + :target: https://pypi.org/project/extract-msg/0.53.1/ .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg :target: https://www.python.org/downloads/release/python-3810/ diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py index df0b14ad..3367f63c 100644 --- a/extract_msg/__init__.py +++ b/extract_msg/__init__.py @@ -27,8 +27,8 @@ # along with this program. If not, see . __author__ = 'Destiny Peterson & Matthew Walker' -__date__ = '2024-10-22' -__version__ = '0.52.0' +__date__ = '2025-02-05' +__version__ = '0.53.1' __all__ = [ # Modules: diff --git a/extract_msg/exceptions.py b/extract_msg/exceptions.py index b186c0c2..157c481f 100644 --- a/extract_msg/exceptions.py +++ b/extract_msg/exceptions.py @@ -92,6 +92,11 @@ class InvalidPropertyIdError(ExMsgBaseException): The provided property ID was invalid. """ +class MimetypeFailureError(ExMsgBaseException): + """ + The mimetype was unable to be properly determined when it was mandatory. + """ + class NotWritableError(ExMsgBaseException): """ Modification was attempted on an instance that is not writable. 
diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py index 40ca0fa7..62a39271 100644 --- a/extract_msg/msg_classes/message_base.py +++ b/extract_msg/msg_classes/message_base.py @@ -39,7 +39,8 @@ ) from ..exceptions import ( ConversionError, DataNotFoundError, DeencapMalformedData, - DeencapNotEncapsulated, IncompatibleOptionsError, WKError + DeencapNotEncapsulated, IncompatibleOptionsError, MimetypeFailureError, + WKError ) from .msg import MSGFile from ..structures.report_tag import ReportTag @@ -178,13 +179,10 @@ def asEmailMessage(self) -> EmailMessage: if att.dataType: if hasattr(att.dataType, 'asEmailMessage'): # Replace the extension with '.eml'. - filename = att.getFilename() + filename = att.name or '' if filename.lower().endswith('.msg'): filename = filename[:-4] + '.eml' - msgMain.add_attachment( - att.data.asEmailMessage(), - filename = filename, - cid = att.contentId) + msgMain.attach(att.data.asEmailMessage()) else: if issubclass(att.dataType, bytes): data = att.data @@ -1198,12 +1196,35 @@ def htmlBodyPrepared(self) -> Optional[bytes]: for tag in tags: # Iterate through the attachments until we get the right one. cid = tag['src'][4:] - data = next((attachment.data for attachment in self.attachments if attachment.cid == cid), None) + att = next((attachment for attachment in self.attachments if hasattr(attachment, 'cid') and attachment.cid == cid), None) # If we found anything, inject it. - if data: - tag['src'] = (b'data:image;base64,' + base64.b64encode(data)).decode('utf-8') + if att and isinstance(att.data, bytes): + # Try to get the mimetype. If we can't, see if the item has an + # extension and guess the mimetype for a few known ones.
+ mime = att.mimetype + if not mime: + ext = (att.name or '').split('.')[-1].lower() + if ext == 'png': + mime = 'image/png' + elif ext == 'jpg' or ext == 'jpeg': + mime = 'image/jpeg' + elif ext == 'gif': + mime = 'image/gif' + elif ext == 'tiff' or ext == 'tif': + mime = 'image/tif' + elif ext == 'bmp': + mime = 'image/bmp' + elif ext == 'svg': + mime = 'image/svg+xml' + # Final check. + if mime: + tag['src'] = (b'data:' + mime.encode() + b';base64,' + base64.b64encode(att.data)).decode('utf-8') + else: + # We don't know what to actually put for this item, and we + # really should never end up here, so throw an error. + raise MimetypeFailureError('Could not get the mimetype to use for htmlBodyPrepared.') - return soup.prettify('utf-8') + return soup.encode('utf-8') @functools.cached_property def htmlInjectableHeader(self) -> str: diff --git a/extract_msg/utils.py b/extract_msg/utils.py index 1639191c..6741737f 100644 --- a/extract_msg/utils.py +++ b/extract_msg/utils.py @@ -696,8 +696,17 @@ def minutesToDurationStr(minutes: int) -> str: return '1 minute' elif minutes < 60: return f'{minutes} minutes' + elif minutes == 60: + return '1 hour' elif minutes % 60 == 0: return f'{minutes // 60} hours' + elif minutes < 120: + if minutes == 61: + return f'1 hour 1 minute' + else: + return f'1 hour {minutes - 60} minutes' + elif minutes % 60 == 1: + return f'{minutes // 60} hours 1 minute' else: return f'{minutes // 60} hours {minutes % 60} minutes' @@ -709,8 +718,7 @@ def msgPathToString(inp: Union[str, Iterable[str]]) -> str: """ if not isinstance(inp, str): inp = '/'.join(inp) - inp.replace('\\', '/') - return inp + return inp.replace('\\', '/') def parseType(_type: int, stream: Union[int, bytes], encoding: str, extras: Sequence[bytes]): @@ -1094,7 +1102,7 @@ def unwrapMsg(msg: MSGFile) -> Dict[str, List]: msgFiles.append(att.data) toProcess.append(att.data) if isinstance(currentItem, MessageSignedBase): - raw += currentItem._rawAttachments + raw += 
currentItem.rawAttachments return { 'attachments': attachments, diff --git a/extract_msg_tests/__init__.py b/extract_msg_tests/__init__.py index a31b4cc0..3f2ba1ee 100644 --- a/extract_msg_tests/__init__.py +++ b/extract_msg_tests/__init__.py @@ -4,6 +4,7 @@ 'OleWriterEditingTests', 'OleWriterExportTests', 'PropTests', + 'UtilTests', 'ValidationTests', ] @@ -11,4 +12,5 @@ from .cmd_line_tests import CommandLineTests from .ole_writer_tests import OleWriterEditingTests, OleWriterExportTests from .prop_tests import PropTests +from .util_tests import UtilTests from .validation_tests import ValidationTests diff --git a/extract_msg_tests/util_tests.py b/extract_msg_tests/util_tests.py new file mode 100644 index 00000000..4ec20755 --- /dev/null +++ b/extract_msg_tests/util_tests.py @@ -0,0 +1,61 @@ +__all__ = [ + 'UtilTests', +] + + +import unittest + +from extract_msg import utils + + +class UtilTests(unittest.TestCase): + def test_dictGetCasedKey(self): + caseDict = {'hello': 1, 'HeUtQjWkW': 2} + + self.assertEqual(utils.dictGetCasedKey(caseDict, 'Hello'), 'hello') + self.assertEqual(utils.dictGetCasedKey(caseDict, 'heutqjwkw'), 'HeUtQjWkW') + with self.assertRaises(KeyError): + utils.dictGetCasedKey(caseDict, 'jjjjj') + + def test_divide(self): + inputString = '12345678901234567890' + expectedOutputs = { + 1: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'], + 2: ['12', '34', '56', '78', '90', '12', '34', '56', '78', '90'], + 3: ['123', '456', '789', '012', '345', '678', '90'], + 4: ['1234', '5678', '9012', '3456', '7890'], + 5: ['12345', '67890', '12345', '67890'], + 6: ['123456', '789012', '345678', '90'], + 7: ['1234567', '8901234', '567890'], + 8: ['12345678', '90123456', '7890'], + 9: ['123456789', '012345678', '90'], + 10: ['1234567890', '1234567890'], + 11: ['12345678901', '234567890'], + } + + for divideBy, expectedResult in expectedOutputs.items(): + self.assertListEqual(utils.divide(inputString, divideBy), 
expectedResult) + + def test_makeWeakRef(self): + self.assertIsNone(utils.makeWeakRef(None)) + class TestClass: + pass + self.assertIsNotNone(utils.makeWeakRef(TestClass())) + + def test_minutesToDurationStr(self): + self.assertEqual(utils.minutesToDurationStr(0), '0 hours') + self.assertEqual(utils.minutesToDurationStr(1), '1 minute') + self.assertEqual(utils.minutesToDurationStr(2), '2 minutes') + self.assertEqual(utils.minutesToDurationStr(59), '59 minutes') + self.assertEqual(utils.minutesToDurationStr(60), '1 hour') + self.assertEqual(utils.minutesToDurationStr(61), '1 hour 1 minute') + self.assertEqual(utils.minutesToDurationStr(62), '1 hour 2 minutes') + self.assertEqual(utils.minutesToDurationStr(120), '2 hours') + self.assertEqual(utils.minutesToDurationStr(121), '2 hours 1 minute') + self.assertEqual(utils.minutesToDurationStr(122), '2 hours 2 minutes') + + def test_msgPathToStr(self): + self.assertEqual(utils.msgPathToString('hello/world/one'), 'hello/world/one') + self.assertEqual(utils.msgPathToString('hello/world\\one'), 'hello/world/one') + self.assertEqual(utils.msgPathToString(['hello', 'world', 'one']), 'hello/world/one') + self.assertEqual(utils.msgPathToString(['hello\\world', 'one']), 'hello/world/one') \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4a702882..4261c061 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,6 @@ olefile==0.47 tzlocal>=4.2,<6 compressed-rtf>=1.0.6,<2 ebcdic>=1.1.1,<2 -beautifulsoup4>=4.11.1,<4.13 +beautifulsoup4>=4.11.1,<4.14 RTFDE>=0.1.1,<0.2 -red-black-tree-mod==1.20 +red-black-tree-mod>=1.20, <=1.23