
Commit d208f1d

Merge pull request #16 from dronefreak/dji-tello-object-detection-segmentation-v2
Dji tello object detection segmentation v2
2 parents 5595a24 + e95b2e0 commit d208f1d


4 files changed: +317 -12 lines changed


QUICKSTART.md

Lines changed: 43 additions & 12 deletions

@@ -28,24 +28,31 @@ python -m tello_vision.app

## First Steps

### 1. Test Detection Without Drone

Good for verifying everything works:

```bash
python examples/test_detector.py --source 0 # Webcam
```
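
For reference, a script like this boils down to a webcam loop around a detector. A minimal sketch (the detector is assumed to be constructed and loaded elsewhere; only the `detect()` call and result fields match the API added in this commit):

```python
import cv2


def run_webcam_test(detector, source: int = 0) -> None:
    """Feed webcam frames to an already-loaded detector and print what it sees."""
    cap = cv2.VideoCapture(source)
    try:
        while True:
            ok, frame = cap.read()  # BGR frame, which is what detect() expects
            if not ok:
                break
            result = detector.detect(frame)
            labels = [f"{d.class_name} {d.confidence:.2f}" for d in result.detections]
            print(f"{1.0 / max(result.inference_time, 1e-6):5.1f} FPS | {labels}")
            cv2.imshow("detections", frame)
            if cv2.waitKey(1) & 0xFF == ord("q"):  # press q to quit
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
```
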
### 2. Benchmark Your Setup

See what FPS you can get:

```bash
python examples/benchmark.py
```
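
The Performance Reference tables further down come from this kind of measurement. A rough sketch of what a benchmark loop computes (the real `examples/benchmark.py` may differ in details):

```python
import time

import numpy as np


def benchmark(detector, frames, warmup: int = 10) -> dict:
    """Time detect() over a list of frames and summarize latency."""
    for frame in frames[:warmup]:
        detector.detect(frame)  # warm-up runs so model/GPU init does not skew stats

    latencies_ms = []
    for frame in frames:
        start = time.time()
        detector.detect(frame)
        latencies_ms.append((time.time() - start) * 1000.0)

    lat = np.array(latencies_ms)
    return {
        "fps": 1000.0 / lat.mean(),
        "avg_ms": float(lat.mean()),
        "std_ms": float(lat.std()),
        "min_ms": float(lat.min()),
        "max_ms": float(lat.max()),
    }
```
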
### 3. Full Drone Mode

With Tello connected:

```bash
python -m tello_vision.app
```

Controls:

- **Tab**: Takeoff
- **W/A/S/D**: Move
- **Space/Shift**: Up/Down

@@ -60,29 +67,33 @@ Controls:

Edit `config.yaml`:

**Want faster FPS?** Use a smaller model:

```yaml
detector:
  yolov8:
    model: "yolov8n-seg.pt"  # n=nano (fastest)
```

**Only track people?**

```yaml
detector:
  target_classes: ["person"]
```

**Adjust visualization:**

```yaml
visualization:
  mask_alpha: 0.4  # Mask transparency
  show_confidence: true
```

**Performance tuning:**

```yaml
processing:
  frame_skip: 1  # Process every 2nd frame (doubles FPS)
```
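
How `frame_skip` and `target_classes` typically take effect in the processing loop, as a minimal sketch (names and structure here are illustrative, not taken from `tello_vision/app.py`):

```python
def process_stream(detector, frames, frame_skip: int = 1, target_classes=("person",)):
    """Detect on every (frame_skip + 1)-th frame, reuse results on skipped frames."""
    last_detections = []
    for i, frame in enumerate(frames):
        if frame_skip and i % (frame_skip + 1) != 0:
            # Skipped frame: keep the previous detections. With frame_skip: 1
            # only every 2nd frame is processed, roughly doubling FPS.
            yield frame, last_detections
            continue
        result = detector.detect(frame)
        last_detections = [
            d for d in result.detections if d.class_name in target_classes
        ]
        yield frame, last_detections
```
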
## Project Structure

@@ -131,8 +142,9 @@ This demonstrates reactive control suitable for autonomous vehicles.

## For Self-Driving Car Work

This gives you:

- Real-time object detection pipeline
- Target tracking framework
- Reactive control examples
- Extensible architecture for adding SLAM, planning, etc.

@@ -145,19 +157,38 @@ Check `examples/object_follower.py` for autonomous navigation basics.

3. **Modify config.yaml** - Tune for your use case
4. **Extend** - Add your own detectors/controllers

## Performance Reference - NVIDIA RTX 500 Ada Generation Laptop GPU

| Model              | Size   | FPS   | Avg (ms) | Std (ms) | Min (ms) | Max (ms) | Notes               |
| ------------------ | ------ | ----- | -------- | -------- | -------- | -------- | ------------------- |
| YOLOv8n-seg        | Nano   | 207.8 | 4.8      | 0.4      | 4.4      | 8.2      | Fastest model       |
| YOLOv8s-seg        | Small  | 120.2 | 8.3      | 0.1      | 8.2      | 9.1      | Most stable latency |
| YOLOv8m-seg        | Medium | 53.2  | 18.8     | 0.5      | 16.4     | 19.6     | Balanced trade-off  |
| Detectron2 R50-FPN | Large  | 9.7   | 102.7    | 0.8      | 101.2    | 107.5    | Slow but accurate   |

---

## Performance Reference Across GPUs

| GPU         | Model   | FPS Range |
| ----------- | ------- | --------- |
| RTX 3060    | YOLOv8n | 25–30     |
| RTX 3060    | YOLOv8s | 18–22     |
| GTX 1050 Ti | YOLOv8n | 18–22     |
| CPU         | YOLOv8n | 2–3       |

---

**Summary:**

- **Fastest model:** YOLOv8n-seg (Nano) — 207.8 FPS
- **Most stable latency:** YOLOv8s-seg (Small) — ±0.1 ms
- **Performance leap:** RTX 500 Ada delivers **~7–8× speedup** over RTX 3060 for YOLOv8n.
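
As a sanity check on the first table, FPS is simply the reciprocal of the average latency: 1000 ms / 4.8 ms ≈ 208 FPS for YOLOv8n-seg, and 1000 / 102.7 ≈ 9.7 FPS for Detectron2 R50-FPN. The same arithmetic backs the summary's speedup claim: 207.8 FPS against the RTX 3060's 25–30 FPS for YOLOv8n is roughly a 7–8× gap.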

## Files to Know

- **config.yaml** - All settings
- **tello_vision/app.py** - Main application
- **tello_vision/detectors/base_detector.py** - Add custom models here
- **examples/object_follower.py** - Autonomous control reference

tello_vision/detectors/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@

```python
"""Detector module for various object detection/segmentation backends."""

from .base_detector import BaseDetector, Detection, DetectionResult

__all__ = ["BaseDetector", "Detection", "DetectionResult"]
```
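
QUICKSTART points to `tello_vision/detectors/base_detector.py` for adding custom models. Judging from the `Detectron2Detector` added below, a new backend is a `BaseDetector` subclass roughly along these lines; the exact abstract interface of `base_detector.py` is not part of this diff, so the attributes and required methods here are assumptions (`get_class_name()` / `get_model_info()` may also need overriding):

```python
import time

import numpy as np

from tello_vision.detectors import BaseDetector, Detection, DetectionResult


class DummyDetector(BaseDetector):
    """Hypothetical backend skeleton (not part of this commit)."""

    def load_model(self) -> None:
        # Load real weights from self.config here, then mark the detector ready.
        self.class_names = ["person"]
        self._initialized = True

    def detect(self, frame: np.ndarray) -> DetectionResult:
        if not self._initialized:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        start = time.time()
        h, w = frame.shape[:2]
        # Placeholder: report one detection covering the whole frame.
        detections = [
            Detection(
                class_id=0,
                class_name=self.class_names[0],
                confidence=1.0,
                bbox=(0, 0, w, h),
                mask=None,
            )
        ]
        return DetectionResult(
            detections=detections,
            inference_time=time.time() - start,
            frame_shape=frame.shape,
        )
```
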
Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@

```python
"""Detectron2 detector implementation.

Higher quality but slower than YOLO. Good for precision applications.
"""

import time

import numpy as np

from .base_detector import BaseDetector, Detection, DetectionResult


class Detectron2Detector(BaseDetector):
    """Detectron2 Mask R-CNN detector."""

    def __init__(self, config: dict):
        super().__init__(config)
        self.predictor = None
        self.metadata = None

    def load_model(self) -> None:
        """Load Detectron2 model."""
        try:
            from detectron2 import model_zoo
            from detectron2.config import get_cfg
            from detectron2.data import MetadataCatalog
            from detectron2.engine import DefaultPredictor
        except ImportError:
            raise ImportError(
                "detectron2 not installed. Install from: "
                "https://github.com/facebookresearch/detectron2"
            )

        cfg = get_cfg()

        # Load config
        config_file = self.config.get(
            "config_file", "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"
        )
        cfg.merge_from_file(model_zoo.get_config_file(config_file))

        # Set model weights
        weights = self.config.get("model_weights")
        if weights and weights.startswith("detectron2://"):
            cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(config_file)
        else:
            cfg.MODEL.WEIGHTS = weights or model_zoo.get_checkpoint_url(config_file)

        # Set confidence threshold
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = self.config.get("confidence", 0.5)

        # Set device
        device = self.config.get("device", "cuda")
        cfg.MODEL.DEVICE = device

        print(f"Loading Detectron2 model: {config_file} on {device}")

        # Create predictor
        self.predictor = DefaultPredictor(cfg)

        # Get metadata for class names
        dataset_name = config_file.split("/")[0]
        if dataset_name.startswith("COCO"):
            self.metadata = MetadataCatalog.get("coco_2017_val")
        else:
            self.metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])

        self.class_names = self.metadata.thing_classes

        self._initialized = True
        print(f"Detectron2 model loaded. Classes: {len(self.class_names)}")

    def detect(self, frame: np.ndarray) -> DetectionResult:
        """Run Detectron2 detection on frame.

        Args:
            frame: Input image (H, W, C) in BGR format

        Returns:
            DetectionResult with all detections
        """
        if not self._initialized:
            raise RuntimeError("Model not loaded. Call load_model() first.")

        start_time = time.time()

        # Run inference
        outputs = self.predictor(frame)

        inference_time = time.time() - start_time

        # Parse results
        detections = []
        instances = outputs["instances"].to("cpu")

        if len(instances) > 0:
            boxes = instances.pred_boxes.tensor.numpy()
            scores = instances.scores.numpy()
            classes = instances.pred_classes.numpy()

            # Get masks if available
            masks = None
            if instances.has("pred_masks"):
                masks = instances.pred_masks.numpy()

            for idx in range(len(instances)):
                bbox = boxes[idx].astype(int)

                # Get mask
                mask = None
                if masks is not None:
                    mask = masks[idx].astype(np.uint8)

                detection = Detection(
                    class_id=int(classes[idx]),
                    class_name=self.get_class_name(int(classes[idx])),
                    confidence=float(scores[idx]),
                    bbox=tuple(bbox),
                    mask=mask,
                )
                detections.append(detection)

        return DetectionResult(
            detections=detections,
            inference_time=inference_time,
            frame_shape=frame.shape,
        )

    def get_class_name(self, class_id: int) -> str:
        """Get class name from ID."""
        if 0 <= class_id < len(self.class_names):
            return self.class_names[class_id]
        return f"class_{class_id}"

    def get_model_info(self) -> dict:
        """Get model information."""
        return {
            "backend": "detectron2",
            "config": self.config.get("config_file", "unknown"),
            "device": self.device,
            "num_classes": len(self.class_names),
            "classes": self.class_names,
        }
```
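
Putting the new detector to work might look like the following; the config keys mirror the ones read in `load_model()` above, while the import path and the image file are hypothetical placeholders:

```python
import cv2

# The import path below is hypothetical; the new file's path is not shown in this view.
from tello_vision.detectors.detectron2_detector import Detectron2Detector

config = {
    "config_file": "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml",
    "model_weights": "detectron2://",  # resolves to the model-zoo checkpoint
    "confidence": 0.5,
    "device": "cuda",  # or "cpu"
}

detector = Detectron2Detector(config)
detector.load_model()

frame = cv2.imread("test.jpg")  # any BGR image; the path is a placeholder
result = detector.detect(frame)

print(f"inference: {result.inference_time * 1000:.1f} ms")
for det in result.detections:
    x1, y1, x2, y2 = det.bbox
    print(f"{det.class_name:>12} {det.confidence:.2f} at ({x1},{y1})-({x2},{y2})")
    if det.mask is not None:
        print(f"    mask pixels: {int(det.mask.sum())}")
```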

0 commit comments
