diff --git a/CHANGELOG.md b/CHANGELOG.md index 03f85dbae1..ceacd2a3ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.18.24-dev0 + +### Enhancement +- Optimize `OCRAgentTesseract.extract_word_from_hocr` by precompiling the `x_conf` regex and assembling the word with `str.join` instead of repeated string concatenation (codeflash) + ## 0.18.23 ### Enhancement diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9d7aede9fb..eb15f85ed4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.23" # pragma: no cover +__version__ = "0.18.24-dev0" # pragma: no cover diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index a92540a018..efac81cd23 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -26,6 +26,8 @@ from unstructured_inference.inference.elements import TextRegions from unstructured_inference.inference.layoutelement import LayoutElements +_RE_X_CONF = re.compile(r"x_conf (\d+\.\d+)") + # -- force tesseract to be single threaded, otherwise we see major performance problems -- if "OMP_THREAD_LIMIT" not in os.environ: os.environ["OMP_THREAD_LIMIT"] = "1" @@ -152,12 +154,12 @@ def extract_word_from_hocr( if len(character_spans) == 0: return "" - word_text = "" + chars = [] for character_span in character_spans: char = character_span.text char_title = character_span.get("title", "") - conf_match = re.search(r"x_conf (\d+\.\d+)", char_title) + conf_match = _RE_X_CONF.search(char_title) if not (char and conf_match): continue @@ -165,9 +167,9 @@ def extract_word_from_hocr( character_probability = float(conf_match.group(1)) / 100 if character_probability >= character_confidence_threshold: - word_text += char + chars.append(char) - return word_text + return "".join(chars) @requires_dependencies("unstructured_inference") def get_layout_elements_from_image(self, image: PILImage.Image) -> LayoutElements: