From 862da610e613cb3d7bd70273bcd711f840a2031a Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Fri, 19 Dec 2025 04:08:05 +0000
Subject: [PATCH] Optimize is_inline_element

The optimization achieves a **302% speedup** by moving constant list/tuple creation out of the function and using more efficient Python operations.

**Key optimizations applied:**

1. **Moved constants to module level**: The `inline_classes` and `inline_categories` collections are now defined once at module load time as tuples, eliminating the overhead of recreating these collections on every function call. The line profiler shows this eliminated ~1.1ms of overhead (31.9% + 39.7% = 71.6% of the original runtime).

2. **Replaced `any()` with direct `isinstance()`**: Instead of using a generator expression with `any()` to check class membership, the code now uses `isinstance(ontology_element, inline_classes)` directly with a tuple of classes. This is more efficient because `isinstance()` can natively handle tuple arguments.

3. **Replaced `any()` with `in` operator**: The second check now uses `ontology_element.elementType in inline_categories` instead of a generator expression with `any()`. The `in` operator on tuples is implemented at the C level and significantly faster than generator-based iteration. For a tuple, `x in y` is equivalent to `any(x is e or x == e for e in y)`, so the result is unchanged as long as `elementType` values compare equal to themselves (as standard `Enum` members do).

4. **Used tuples instead of lists**: Tuples are immutable and slightly more memory-efficient than lists, which suits fixed module-level constants, and `isinstance()` accepts a tuple (not a list) of classes. Membership testing is a linear scan for both types, so any speed difference there is marginal for such small collections.

**Performance impact in context:**
Based on the function reference, `is_inline_element()` is called within a loop in `can_unstructured_elements_be_merged()` when processing HTML elements. Since HTML parsing often involves checking many elements, this optimization provides substantial benefits in document processing pipelines where this function may be called hundreds or thousands of times.

**Test case insights:**
The optimization shows consistent 150-340% speedups across all test scenarios, with particularly strong performance on large-scale tests (295-338% faster), where the per-call savings add up over many calls. Both basic element type checking and class inheritance checking benefit significantly, making this optimization valuable for diverse HTML parsing workloads.
---
 unstructured/partition/html/transformations.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py
index 6006cf9905..dd92c758e7 100644
--- a/unstructured/partition/html/transformations.py
+++ b/unstructured/partition/html/transformations.py
@@ -15,6 +15,13 @@
     ONTOLOGY_CLASS_TO_UNSTRUCTURED_ELEMENT_TYPE,
 )
 
+inline_classes = (ontology.Hyperlink,)
+
+inline_categories = (
+    ontology.ElementTypeEnum.specialized_text,
+    ontology.ElementTypeEnum.annotation,
+)
+
 RECURSION_LIMIT = 50
 
 
@@ -207,16 +214,10 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
 
 def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
     """Categories or classes that we want to combine with text elements"""
-    inline_classes = [ontology.Hyperlink]
-    inline_categories = [
-        ontology.ElementTypeEnum.specialized_text,
-        ontology.ElementTypeEnum.annotation,
-    ]
-
-    if any(isinstance(ontology_element, class_) for class_ in inline_classes):
+    if isinstance(ontology_element, inline_classes):
         return True
 
-    return any(ontology_element.elementType == category for category in inline_categories)
+    return ontology_element.elementType in inline_categories
 
 
 def unstructured_elements_to_ontology(