From 25b4d575dd20764be5000d42bd552b57016f3065 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 21 Apr 2026 13:25:16 -0700 Subject: [PATCH 1/7] Improve issue 21 fix --- adsrefpipe/refparsers/unicode.py | 6 ++++-- adsrefpipe/tests/unittests/test_ref_parsers.py | 17 +++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/adsrefpipe/refparsers/unicode.py b/adsrefpipe/refparsers/unicode.py index 39453d2..2058f83 100755 --- a/adsrefpipe/refparsers/unicode.py +++ b/adsrefpipe/refparsers/unicode.py @@ -244,8 +244,10 @@ def __sub_hexnumasc_entity(self, match: re.Match) -> str: elif entno < 255: return self.u2asc(chr(entno)) except IndexError: - logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s' % match.group(0))) - return "" + try: + return unicodedata.normalize('NFKD', chr(entno)) + except (OverflowError, ValueError): + raise UnicodeHandlerError('Unknown hexadecimal entity: %s' % match.group(0)) def __sub_hexnum_toent(self, match: re.Match) -> str: """ diff --git a/adsrefpipe/tests/unittests/test_ref_parsers.py b/adsrefpipe/tests/unittests/test_ref_parsers.py index d0a3434..cddbf0e 100755 --- a/adsrefpipe/tests/unittests/test_ref_parsers.py +++ b/adsrefpipe/tests/unittests/test_ref_parsers.py @@ -304,11 +304,20 @@ def test_sub_hexnumasc_entity(self): handler.unicode = MagicMock() handler.unicode.__getitem__.side_effect = IndexError - # large invalid hex value to trigger returning and empty string "" - match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "򙦙") + # supplementary-plane hex value should normalize instead of being dropped + match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "𝑣") if match: - result = handler._UnicodeHandler__sub_hexnumasc_entity(match) - self.assertEqual(result, "") + with patch("unicodedata.normalize", return_value="v") as mock_normalize: + result = handler._UnicodeHandler__sub_hexnumasc_entity(match) + self.assertEqual(result, "v") + mock_normalize.assert_called_once_with("NFKD", "𝑣") + + # oversized hex value should still raise UnicodeHandlerError + match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "�") + if match: + with self.assertRaises(UnicodeHandlerError) as context: + handler._UnicodeHandler__sub_hexnumasc_entity(match) + self.assertEqual(str(context.exception), "Unknown hexadecimal entity: �") def test_sub_hexnum_toent(self): """ test __sub_hexnum_toent method """ From b3f9800d0da39ed9427913473e9e247df48a80e5 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 21 Apr 2026 14:26:57 -0700 Subject: [PATCH 2/7] Upgrade alembic - OUPFT, JATS, EGU --- adsrefpipe/refparsers/handler.py | 4 ++++ adsrefpipe/tests/unittests/test_app.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/adsrefpipe/refparsers/handler.py b/adsrefpipe/refparsers/handler.py index 25075b5..28fa716 100644 --- a/adsrefpipe/refparsers/handler.py +++ b/adsrefpipe/refparsers/handler.py @@ -22,6 +22,7 @@ from adsrefpipe.refparsers.NatureXML import NATUREtoREFs from adsrefpipe.refparsers.NLM3xml import NLMtoREFs from adsrefpipe.refparsers.ONCPxml import ONCPtoREFs +from adsrefpipe.refparsers.OUPFTxml import OUPFTtoREFs from adsrefpipe.refparsers.OUPxml import OUPtoREFs from adsrefpipe.refparsers.PASAxml import PASAtoREFs from adsrefpipe.refparsers.RSCxml import RSCtoREFs @@ -77,6 +78,7 @@ 'CUP': CUPtoREFs, 'EDP': EDPtoREFs, 'EGU': EGUtoREFs, + 'EGUE2': EGUtoREFs, 'ELSEVIER': ELSEVIERtoREFs, 'ELSEVIERE2': ELSEVIERtoREFs, # with multiple extensions 'ICARUS': ICARUStoREFs, @@ -86,6 +88,7 @@ 'IOPFT': IOPFTtoREFs, 'IPAP': IPAPtoREFs, 'JATS': JATStoREFs, + 'JATSE2': JATStoREFs, 'JLVEnHTML': JLVEnHTMLtoREFs, 'JSTAGE': JSTAGEtoREFs, 'LivingReviews': LivingReviewsToREFs, @@ -97,6 +100,7 @@ 'ObsOCR': ObsOCRtoREFs, 'ONCP': ONCPtoREFs, 'OUP': OUPtoREFs, + 'OUPFT': OUPFTtoREFs, 'PairsTXT': PairsTXTtoREFs, 'PairsTXTE2': PairsTXTtoREFs, 'PairsTXTE3': PairsTXTtoREFs, diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py index 5cc678e..e3d72bf 100644 --- a/adsrefpipe/tests/unittests/test_app.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -26,6 +26,8 @@ from adsrefpipe.refparsers.WileyXML import WILEYtoREFs from adsrefpipe.refparsers.NLM3xml import NLMtoREFs from adsrefpipe.refparsers.AGUxml import AGUtoREFs, AGUreference +from adsrefpipe.refparsers.EGUxml import EGUtoREFs +from adsrefpipe.refparsers.OUPFTxml import OUPFTtoREFs from adsrefpipe.refparsers.arXivTXT import ARXIVtoREFs from adsrefpipe.refparsers.handler import verify from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records @@ -390,6 +392,7 @@ def test_parser_name(self): 'CrossRef': ['/PLoSO/0007/10.1371_journal.pone.0048146.xref.xml', CrossRefToREFs], 'ELSEVIER': ['/AtmEn/0230/iss.elsevier.xml', ELSEVIERtoREFs], 'JATS': ['/NatSR/0009/iss36.jats.xml', JATStoREFs], + 'JATSE2': ['/IEEE/0001/iss1.ieee.xml', JATStoREFs], 'IOP': ['/JPhCS/1085/iss4.iop.xml', IOPtoREFs], 'SPRINGER': ['/JHEP/2019/iss06.springer.xml', SPRINGERtoREFs], 'APS': ['/PhRvB/0081/2010PhRvB..81r4520P.ref.xml', APStoREFs], @@ -398,6 +401,8 @@ def test_parser_name(self): 'WILEY': ['/JGR/0101/issD14.wiley2.xml', WILEYtoREFs], 'NLM': ['/PNAS/0109/iss17.nlm3.xml', NLMtoREFs], 'AGU': ['/JGR/0101/issD14.agu.xml', AGUtoREFs], + 'EGUE2': ['/EGUSp/0001/iss1.copernicus.xml', EGUtoREFs], + 'OUPFT': ['/MNRAS/0001/iss1.oupft.xml', OUPFTtoREFs], 'arXiv': ['/arXiv/2011/00324.raw', ARXIVtoREFs], } From cf07fea3069ef382a7216dcb88422bfa8a3b4207 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 21 Apr 2026 16:46:02 -0700 Subject: [PATCH 3/7] Log unicode error --- adsrefpipe/refparsers/unicode.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/adsrefpipe/refparsers/unicode.py b/adsrefpipe/refparsers/unicode.py index 2058f83..4981240 100755 --- a/adsrefpipe/refparsers/unicode.py +++ b/adsrefpipe/refparsers/unicode.py @@ -228,7 +228,8 @@ def __sub_numasc_entity(self, match: re.Match) -> str: try: return unicodedata.normalize('NFKD', chr(entno)) except OverflowError: - raise UnicodeHandlerError('Unknown numeric entity: %s' % match.group(0)) + logger.error(UnicodeHandlerError('Unknown numeric entity: %s, replacing by WHITE SQUARE' % match.group(0))) + return self.unicode[9633].ascii def __sub_hexnumasc_entity(self, match: re.Match) -> str: """ @@ -247,7 +248,7 @@ def __sub_hexnumasc_entity(self, match: re.Match) -> str: try: return unicodedata.normalize('NFKD', chr(entno)) except (OverflowError, ValueError): - raise UnicodeHandlerError('Unknown hexadecimal entity: %s' % match.group(0)) + logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by WHITE SQUARE' % match.group(0))) def __sub_hexnum_toent(self, match: re.Match) -> str: """ @@ -264,7 +265,7 @@ def __sub_hexnum_toent(self, match: re.Match) -> str: if self.unicode[entno]: return '&%s;' % self.unicode[entno].entity else: - raise UnicodeHandlerError('Unknown hexadecimal entity: %s' % entno) + logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by WHITE SQUARE' % entno)) def __sub_asc_entity(self, match: re.Match) -> str: """ From 5bc087536ff551c27a375ce9c55367ff878d5ec6 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 21 Apr 2026 16:49:36 -0700 Subject: [PATCH 4/7] Log unicode error - replace with '' --- adsrefpipe/refparsers/unicode.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/adsrefpipe/refparsers/unicode.py b/adsrefpipe/refparsers/unicode.py index 4981240..89b7ece 100755 --- a/adsrefpipe/refparsers/unicode.py +++ b/adsrefpipe/refparsers/unicode.py @@ -228,8 +228,8 @@ def __sub_numasc_entity(self, match: re.Match) -> str: try: return unicodedata.normalize('NFKD', chr(entno)) except OverflowError: - logger.error(UnicodeHandlerError('Unknown numeric entity: %s, replacing by WHITE SQUARE' % match.group(0))) - return self.unicode[9633].ascii + logger.error(UnicodeHandlerError('Unknown numeric entity: %s, replacing by ""' % match.group(0))) + return "" def __sub_hexnumasc_entity(self, match: re.Match) -> str: """ @@ -248,7 +248,8 @@ def __sub_hexnumasc_entity(self, match: re.Match) -> str: try: return unicodedata.normalize('NFKD', chr(entno)) except (OverflowError, ValueError): - logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by WHITE SQUARE' % match.group(0))) + logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by ""' % match.group(0))) + return "" def __sub_hexnum_toent(self, match: re.Match) -> str: """ @@ -265,7 +266,8 @@ def __sub_hexnum_toent(self, match: re.Match) -> str: if self.unicode[entno]: return '&%s;' % self.unicode[entno].entity else: - logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by WHITE SQUARE' % entno)) + logger.error(UnicodeHandlerError('Unknown hexadecimal entity: %s, replacing by ""' % entno)) + return "" def __sub_asc_entity(self, match: re.Match) -> str: """ From b6ed8fdd3e3d7f3e6cbabf46ce72b9b6ce28a105 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 21 Apr 2026 17:34:09 -0700 Subject: [PATCH 5/7] update ref parsers unit tests --- adsrefpipe/tests/unittests/test_ref_parsers.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/adsrefpipe/tests/unittests/test_ref_parsers.py b/adsrefpipe/tests/unittests/test_ref_parsers.py index cddbf0e..3e0fb2f 100755 --- a/adsrefpipe/tests/unittests/test_ref_parsers.py +++ b/adsrefpipe/tests/unittests/test_ref_parsers.py @@ -273,13 +273,11 @@ def test_sub_numasc_entity_exception(self): with patch("unicodedata.normalize", return_value="normalized_value"): self.assertEqual(handler._UnicodeHandler__sub_numasc_entity(match), "normalized_value") - # test OverflowError handling (raises UnicodeHandlerError) + # test OverflowError handling (logs and replaces with empty string) match = re.match(r'&#(?P\d+);', "�") if match: with patch("unicodedata.normalize", side_effect=OverflowError): - with self.assertRaises(UnicodeHandlerError) as context: - handler._UnicodeHandler__sub_numasc_entity(match) - self.assertEqual(str(context.exception), "Unknown numeric entity: �") + self.assertEqual(handler._UnicodeHandler__sub_numasc_entity(match), "") def test_sub_hexnumasc_entity(self): """ test __sub_hexnumasc_entity method """ @@ -312,12 +310,10 @@ def test_sub_hexnumasc_entity(self): self.assertEqual(result, "v") mock_normalize.assert_called_once_with("NFKD", "𝑣") - # oversized hex value should still raise UnicodeHandlerError + # oversized hex value should log and replace with empty string match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "�") if match: - with self.assertRaises(UnicodeHandlerError) as context: - handler._UnicodeHandler__sub_hexnumasc_entity(match) - self.assertEqual(str(context.exception), "Unknown hexadecimal entity: �") + self.assertEqual(handler._UnicodeHandler__sub_hexnumasc_entity(match), "") def test_sub_hexnum_toent(self): """ test __sub_hexnum_toent method """ From 57cb453e03157dd335d3f85d11064ea94fefcd43 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Wed, 22 Apr 2026 09:46:33 -0700 Subject: [PATCH 6/7] Update unit test --- adsrefpipe/tests/unittests/test_ref_parsers.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/adsrefpipe/tests/unittests/test_ref_parsers.py b/adsrefpipe/tests/unittests/test_ref_parsers.py index 3e0fb2f..f98729f 100755 --- a/adsrefpipe/tests/unittests/test_ref_parsers.py +++ b/adsrefpipe/tests/unittests/test_ref_parsers.py @@ -335,15 +335,12 @@ def test_sub_hexnum_toent(self): if match: self.assertEqual(handler._UnicodeHandler__sub_hexnum_toent(match), "£") - # test UnicodeHandlerError for unknown entity by ensuring index is in range but has no entity + # test unknown entity handling by ensuring index is in range but has no entity handler.unicode = MagicMock() handler.unicode.__getitem__.return_value = None match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "򙦙") if match: - with self.assertRaises(UnicodeHandlerError) as context: - handler._UnicodeHandler__sub_hexnum_toent(match) - # ensure the exception message is correct - self.assertEqual(str(context.exception), "Unknown hexadecimal entity: 629145") + self.assertEqual(handler._UnicodeHandler__sub_hexnum_toent(match), "") def test_toentity(self): """ test __toentity method """ From 714d45f59647cb5f46c95c9ef3a1ea992b42eede Mon Sep 17 00:00:00 2001 From: thomasallen Date: Wed, 22 Apr 2026 11:17:04 -0700 Subject: [PATCH 7/7] Replace square placeholders --- adsrefpipe/refparsers/unicode.py | 8 ++++---- .../tests/unittests/test_ref_parsers_xml.py | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/adsrefpipe/refparsers/unicode.py b/adsrefpipe/refparsers/unicode.py index 89b7ece..c8169ac 100755 --- a/adsrefpipe/refparsers/unicode.py +++ b/adsrefpipe/refparsers/unicode.py @@ -281,8 +281,8 @@ def __sub_asc_entity(self, match: re.Match) -> str: ret = self[ent].ascii return ret else: - logger.error(UnicodeHandlerError('Unknown named entity: %s, replacing by WHITE SQUARE' % match.group(0))) - return self.unicode[9633].ascii + logger.error(UnicodeHandlerError('Unknown named entity: %s, replacing by ""' % match.group(0))) + return "" def __toascii(self, char: str) -> str: """ @@ -299,8 +299,8 @@ def __toascii(self, char: str) -> str: if self.unicode[ascii_value]: return self.unicode[ascii_value].ascii else: - logger.error(UnicodeHandlerError('Unknown character code: %d, replacing by WHITE SQUARE' % ascii_value)) - return self.unicode[9633].ascii + logger.error(UnicodeHandlerError('Unknown character code: %d, replacing by ""' % ascii_value)) + return "" def __toentity(self, char: str) -> str: """ diff --git a/adsrefpipe/tests/unittests/test_ref_parsers_xml.py b/adsrefpipe/tests/unittests/test_ref_parsers_xml.py index ab4c4b2..3346381 100644 --- a/adsrefpipe/tests/unittests/test_ref_parsers_xml.py +++ b/adsrefpipe/tests/unittests/test_ref_parsers_xml.py @@ -3,6 +3,7 @@ if project_home not in sys.path: sys.path.insert(0, project_home) +import copy import unittest from unittest.mock import patch, MagicMock, mock_open import xml.dom.minidom as dom @@ -1919,9 +1920,19 @@ class TestWileytoREFs(unittest.TestCase): def test_init(self): """ test init """ + def _normalize_unicode_placeholders(value): + if isinstance(value, str): + return value.replace('□□', '').replace('□', '') + if isinstance(value, list): + return [_normalize_unicode_placeholders(item) for item in value] + if isinstance(value, dict): + return {key: _normalize_unicode_placeholders(item) for key, item in value.items()} + return value + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.wiley2.xml') references = WILEYtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_wiley) + expected = _normalize_unicode_placeholders(copy.deepcopy(parsed_references.parsed_wiley)) + self.assertEqual(references, expected) def test_process_and_dispatch_exception(self): """ test exception in process_and_dispatch """ @@ -1944,4 +1955,4 @@ def test_process_and_dispatch_exception(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main()