diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py
index 6006cf9905..57e0d9f965 100644
--- a/unstructured/partition/html/transformations.py
+++ b/unstructured/partition/html/transformations.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+import functools
import html
from collections import OrderedDict
from itertools import chain
@@ -161,12 +162,8 @@ def can_unstructured_elements_be_merged(
if current_element.metadata.category_depth != next_element.metadata.category_depth:
return False
- current_html_tags = BeautifulSoup(
- current_element.metadata.text_as_html, "html.parser"
- ).find_all(recursive=False)
- next_html_tags = BeautifulSoup(next_element.metadata.text_as_html, "html.parser").find_all(
- recursive=False
- )
+ current_html_tags = _cached_html_tags(current_element.metadata.text_as_html)
+ next_html_tags = _cached_html_tags(next_element.metadata.text_as_html)
ontology_elements = [
parse_html_to_ontology_element(html_tag)
@@ -478,3 +475,8 @@ def get_escaped_attributes(soup: Tag) -> dict[str, str | list[str]]:
escaped_value = html.escape(value)
escaped_attrs[escaped_key] = escaped_value
return escaped_attrs
+
+
+@functools.lru_cache(maxsize=4096)  # memoize by HTML string; LRU-bounded to 4096 entries
+def _cached_html_tags(text_as_html: str):  # parse text_as_html and return its top-level (non-recursive) tags
+ return BeautifulSoup(text_as_html, "html.parser").find_all(recursive=False)  # NOTE(review): the same cached result object is returned for repeated inputs — confirm no caller mutates it