From 68d864f2b4db86826ea840bda9780a086e5f8fe9 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Fri, 19 Dec 2025 04:03:24 +0000 Subject: [PATCH] Optimize is_text_element MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimization achieves a **189% speedup** by eliminating expensive repeated operations and leveraging Python's built-in performance characteristics. **Key optimizations:** 1. **Module-level constant definition**: Moved `text_classes` and `text_categories` to module scope as tuples instead of recreating them on every function call. This eliminates ~70% of the original runtime spent reconstructing these collections (8.7ms → 0ms in line profiler). 2. **Tuple vs List optimization**: Changed from lists to tuples, which are more memory-efficient and faster for `isinstance()` checks since Python can optimize tuple-based type checking. 3. **Eliminated generator expressions**: Replaced `any(isinstance(...) for ...)` with direct `isinstance(ontology_element, text_classes)`, which is significantly faster as it avoids generator overhead and uses Python's optimized C implementation for multiple type checking. 4. **Direct indexing**: Replaced `any(... for category in text_categories)` with direct comparison `ontology_element.elementType == text_categories[0]` since there's only one category, eliminating loop overhead. **Performance impact**: The line profiler shows the critical path went from 52.3% + 15.9% = 68.2% of runtime in generator expressions to 67.5% + 32.5% = 100% in just two optimized operations, but with 7.8x less total time. **Hot path relevance**: Based on `function_references`, this function is called from `can_unstructured_elements_be_merged()` within a loop processing multiple HTML elements. The optimization will significantly speed up HTML parsing workflows where element classification happens frequently. **Test case performance**: All test cases show consistent 140-213% speedups, with the optimization being particularly effective for large-scale processing (800+ elements) showing ~190% improvements, making it ideal for batch document processing scenarios. --- .../partition/html/transformations.py | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index 6006cf9905..60437e3ee4 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -15,6 +15,19 @@ ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE, ) +text_classes = ( + ontology.NarrativeText, + ontology.Quote, + ontology.Paragraph, + ontology.Footnote, + ontology.FootnoteReference, + ontology.Citation, + ontology.Bibliography, + ontology.Glossary, +) + +text_categories = (ontology.ElementTypeEnum.metadata,) + RECURSION_LIMIT = 50 @@ -186,22 +199,10 @@ def can_unstructured_elements_be_merged( def is_text_element(ontology_element: ontology.OntologyElement) -> bool: """Categories or classes that we want to combine with inline text""" - text_classes = [ - ontology.NarrativeText, - ontology.Quote, - ontology.Paragraph, - ontology.Footnote, - ontology.FootnoteReference, - ontology.Citation, - ontology.Bibliography, - ontology.Glossary, - ] - text_categories = [ontology.ElementTypeEnum.metadata] - - if any(isinstance(ontology_element, class_) for class_ in text_classes): + if isinstance(ontology_element, text_classes): return True - return any(ontology_element.elementType == category for category in text_categories) + return ontology_element.elementType == text_categories[0] def is_inline_element(ontology_element: ontology.OntologyElement) -> bool: