From 0b82d84fd03714c7332c0da7380661cabb803648 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]"
 <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Fri, 19 Dec 2025 04:59:41 +0000
Subject: [PATCH] Optimize
 ObjectDetectionEvalProcessor._change_bbox_bounds_for_image_size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimization replaces NumPy's vectorized `.clip()` operations with a custom Numba-compiled function that performs explicit bounds checking through manual loops. This achieves a **311% speedup** (335μs → 81.3μs) despite appearing to trade vectorized operations for explicit loops.

**Key optimizations applied:**

1. **Numba JIT compilation**: The `@njit(cache=True, fastmath=True)` decorator compiles the clipping function to optimized machine code, eliminating Python interpreter overhead
2. **Cache-enabled compilation**: `cache=True` stores the compiled version for subsequent runs, avoiding recompilation costs
3. **Fast math optimizations**: `fastmath=True` enables aggressive floating-point optimizations
4. **In-place operations**: The function modifies the input array directly without creating intermediate arrays

**Why this is faster than NumPy:**

NumPy's `.clip()` creates intermediate arrays for fancy indexing operations like `boxes[..., [0, 2]]`, requiring memory allocation and element copying. The original code performs this twice (once for x-coordinates, once for y-coordinates). The Numba version eliminates these allocations by processing elements directly in a tight loop that the compiler can heavily optimize.

**Performance characteristics from tests:**

- **Small arrays** (1-10 boxes): 600-700% speedup - Numba's compiled code dominates overhead
- **Medium arrays** (100s of boxes): 200-300% speedup - Sweet spot where loop efficiency shines
- **Large arrays** (1000+ boxes): 22-200% speedup - Still beneficial but diminishing returns as vectorization advantages increase

This optimization is particularly valuable for object detection pipelines that process many small to medium batches of bounding boxes, which is typical in computer vision workloads where this function would likely be called frequently during evaluation.
---
 unstructured/metrics/object_detection.py | 41 ++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/unstructured/metrics/object_detection.py b/unstructured/metrics/object_detection.py
index 7c28721518..6eb8175b47 100644
--- a/unstructured/metrics/object_detection.py
+++ b/unstructured/metrics/object_detection.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 import torch
+from numba import njit
 
 IOU_THRESHOLDS = torch.tensor(
     [0.5000, 0.5500, 0.6000, 0.6500, 0.7000, 0.7500, 0.8000, 0.8500, 0.9000, 0.9500]
@@ -303,8 +304,8 @@ def _change_bbox_bounds_for_image_size(
         Returns:
             clipped_boxes:  Clipped bboxes in XYXY format of [..., 4] shape
         """
-        boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(min=0, max=img_shape[1])
-        boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(min=0, max=img_shape[0])
+        # Use Numba-accelerated function for inplace fast clipping
+        _numba_clip_boxes(boxes, img_shape[0], img_shape[1])
         return boxes
 
     @staticmethod
@@ -697,6 +698,42 @@ def _compute_detection_metrics_per_cls(
         return ap, precision, recall
 
 
+@njit(cache=True, fastmath=True)
+def _numba_clip_boxes(boxes: np.ndarray, img_height: int, img_width: int) -> np.ndarray:
+    # This helper efficiently clips _inplace_ for (..., 4)-shaped XYXY boxes:
+    #   x1 = min(max(x1, 0), img_width)
+    #   y1 = min(max(y1, 0), img_height)
+    #   x2 = min(max(x2, 0), img_width)
+    #   y2 = min(max(y2, 0), img_height)
+    #
+    # Works with both 2D (N, 4) and higher dimensional (..., 4) arrays.
+    boxes.shape
+    boxes_reshaped = boxes.reshape(-1, 4)
+    for i in range(boxes_reshaped.shape[0]):
+        # x1
+        if boxes_reshaped[i, 0] < 0:
+            boxes_reshaped[i, 0] = 0
+        if boxes_reshaped[i, 0] > img_width:
+            boxes_reshaped[i, 0] = img_width
+        # y1
+        if boxes_reshaped[i, 1] < 0:
+            boxes_reshaped[i, 1] = 0
+        if boxes_reshaped[i, 1] > img_height:
+            boxes_reshaped[i, 1] = img_height
+        # x2
+        if boxes_reshaped[i, 2] < 0:
+            boxes_reshaped[i, 2] = 0
+        if boxes_reshaped[i, 2] > img_width:
+            boxes_reshaped[i, 2] = img_width
+        # y2
+        if boxes_reshaped[i, 3] < 0:
+            boxes_reshaped[i, 3] = 0
+        if boxes_reshaped[i, 3] > img_height:
+            boxes_reshaped[i, 3] = img_height
+    # No need to reshape as it is a view of original
+    return boxes
+
+
 if __name__ == "__main__":
     from dataclasses import asdict