@@ -22,6 +22,7 @@
 import dlclive.pose_estimation_pytorch.models as models
 import dlclive.pose_estimation_pytorch.dynamic_cropping as dynamic_cropping
 from dlclive.core.runner import BaseRunner
+from dlclive.pose_estimation_pytorch.data.image import AutoPadToDivisor
 
 
 @dataclass
@@ -142,7 +143,8 @@ def __init__(
         self.cfg = None
         self.detector = None
         self.model = None
-        self.transform = None
+        self.detector_transform = None
+        self.pose_transform = None
 
         # Parse Dynamic Cropping parameters
         if isinstance(dynamic, dict):
@@ -172,13 +174,7 @@ def close(self) -> None:
     @torch.inference_mode()
     def get_pose(self, frame: np.ndarray) -> np.ndarray:
         c, h, w = frame.shape
-        frame = (
-            self.transform(torch.from_numpy(frame).permute(2, 0, 1))
-            .unsqueeze(0)
-            .to(self.device)
-        )
-        if self.precision == "FP16":
-            frame = frame.half()
+        tensor = torch.from_numpy(frame).permute(2, 0, 1)  # CHW, still on CPU
 
         offsets_and_scales = None
         if self.detector is not None:
@@ -187,18 +183,32 @@ def get_pose(self, frame: np.ndarray) -> np.ndarray:
             detections = self.top_down_config.skip_frames.get_detections()
 
             if detections is None:
-                detections = self.detector(frame)[0]
+                # Apply detector transform before inference
+                detector_input = self.detector_transform(tensor).unsqueeze(0).to(self.device)
+                if self.precision == "FP16":
+                    detector_input = detector_input.half()
+                detections = self.detector(detector_input)[0]
 
-            frame_batch, offsets_and_scales = self._prepare_top_down(frame, detections)
+            frame_batch, offsets_and_scales = self._prepare_top_down(tensor, detections)
             if len(frame_batch) == 0:
                 offsets_and_scales = [(0, 0), 1]
             else:
-                frame = frame_batch.to(self.device)
+                tensor = frame_batch  # batched crops (N, C, H, W), still on CPU
 
         if self.dynamic is not None:
-            frame = self.dynamic.crop(frame)
+            tensor = self.dynamic.crop(tensor)
+
+        # Apply pose transform
+        model_input = self.pose_transform(tensor)
+        # Ensure 4D input: (N, C, H, W)
+        if model_input.dim() == 3:
+            model_input = model_input.unsqueeze(0)
+        # Send to device
+        model_input = model_input.to(self.device)
+        if self.precision == "FP16":
+            model_input = model_input.half()
 
-        outputs = self.model(frame)
+        outputs = self.model(model_input)
         batch_pose = self.model.get_predictions(outputs)["bodypart"]["poses"]
 
         if self.dynamic is not None:
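For reference, the split preprocessing above can be sketched in isolation: the frame stays a uint8 CHW tensor on the CPU until each consumer (detector or pose model) applies its own transform. A minimal sketch, assuming torchvision >= 0.16 for transforms.v2; the names below are illustrative, not the runner's actual attributes:

    import numpy as np
    import torch
    from torchvision.transforms import v2

    detector_transform = v2.Compose([v2.ToDtype(torch.float32, scale=True)])
    pose_transform = v2.Compose([
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    frame = np.zeros((480, 640, 3), dtype=np.uint8)           # HWC uint8 camera frame
    tensor = torch.from_numpy(frame).permute(2, 0, 1)         # CHW uint8, still on CPU
    detector_input = detector_transform(tensor).unsqueeze(0)  # (1, C, H, W) float in [0, 1]
    pose_input = pose_transform(tensor).unsqueeze(0)          # normalized only for the pose model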
@@ -264,15 +274,18 @@ def load_model(self) -> None:
             self.detector.to(self.device)
             self.detector.load_state_dict(raw_data["detector"])
             self.detector.eval()
-
             if self.precision == "FP16":
                 self.detector = self.detector.half()
 
             if self.top_down_config is None:
                 self.top_down_config = TopDownConfig()
-
             self.top_down_config.read_config(self.cfg)
 
+            detector_transforms = [v2.ToDtype(torch.float32, scale=True)]
+            if self.cfg["detector"]["data"]["inference"].get("normalize_images", False):
+                detector_transforms.append(v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
+            self.detector_transform = v2.Compose(detector_transforms)
+
         if isinstance(self.dynamic, dynamic_cropping.TopDownDynamicCropper):
             crop = self.cfg["data"]["inference"].get("top_down_crop", {})
             w, h = crop.get("width", 256), crop.get("height", 256)
@@ -287,12 +300,18 @@ def load_model(self) -> None:
287300 "Top-down models must either use a detector or a TopDownDynamicCropper."
288301 )
289302
290- self .transform = v2 .Compose (
291- [
292- v2 .ToDtype (torch .float32 , scale = True ),
293- v2 .Normalize (mean = [0.485 , 0.456 , 0.406 ], std = [0.229 , 0.224 , 0.225 ]),
294- ]
295- )
303+ pose_transforms = [v2 .ToDtype (torch .float32 , scale = True )]
304+ auto_padding_cfg = self .cfg ["data" ]["inference" ].get ("auto_padding" , None )
305+ if auto_padding_cfg :
306+ pose_transforms .append (
307+ AutoPadToDivisor (
308+ pad_height_divisor = auto_padding_cfg .get ("pad_height_divisor" , 1 ),
309+ pad_width_divisor = auto_padding_cfg .get ("pad_width_divisor" , 1 ),
310+ )
311+ )
312+ if self .cfg ["data" ]["inference" ].get ("normalize_images" , False ):
313+ pose_transforms .append (v2 .Normalize (mean = [0.485 , 0.456 , 0.406 ], std = [0.229 , 0.224 , 0.225 ]))
314+ self .pose_transform = v2 .Compose (pose_transforms )
296315
297316 def read_config (self ) -> dict :
298317 """Reads the configuration file"""
@@ -306,8 +325,17 @@ def _prepare_top_down(
         self, frame: torch.Tensor, detections: dict[str, torch.Tensor]
     ):
         """Prepares a frame for top-down pose estimation."""
+        # Accept an unbatched frame (C, H, W) or a batched frame (1, C, H, W)
+        if frame.dim() == 4:
+            if frame.size(0) != 1:
+                raise ValueError(f"Expected batch size 1, got {frame.size(0)}")
+            frame = frame[0]  # (C, H, W)
+        elif frame.dim() != 3:
+            raise ValueError(f"Expected frame of shape (C, H, W) or (1, C, H, W), got {frame.shape}")
+
         bboxes, scores = detections["boxes"], detections["scores"]
         bboxes = bboxes[scores >= self.top_down_config.bbox_cutoff]
+
         if len(bboxes) > 0 and self.top_down_config.max_detections is not None:
             bboxes = bboxes[: self.top_down_config.max_detections]
 
@@ -316,7 +344,7 @@ def _prepare_top_down(
         for bbox in bboxes:
             x1, y1, x2, y2 = bbox.tolist()
             cropped_frame, offset, scale = data.top_down_crop_torch(
-                frame[0],
+                frame,
                 (x1, y1, x2 - x1, y2 - y1),
                 output_size=self.top_down_config.crop_size,
                 margin=0,
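For reference, the guard added above gives _prepare_top_down a simple shape contract: a single frame, batched or not. A minimal sketch of that contract, using a hypothetical normalize_frame helper that is not part of the diff:

    import torch

    def normalize_frame(frame: torch.Tensor) -> torch.Tensor:
        # Hypothetical helper mirroring the guard in _prepare_top_down above.
        if frame.dim() == 4:
            if frame.size(0) != 1:
                raise ValueError(f"Expected batch size 1, got {frame.size(0)}")
            return frame[0]  # (1, C, H, W) -> (C, H, W)
        if frame.dim() != 3:
            raise ValueError(f"Expected (C, H, W) or (1, C, H, W), got {tuple(frame.shape)}")
        return frame

    assert normalize_frame(torch.zeros(3, 64, 64)).shape == (3, 64, 64)
    assert normalize_frame(torch.zeros(1, 3, 64, 64)).shape == (3, 64, 64)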