feat(sdk): allow users to serve Tesseracts using multiple worker processes (#135)

dionhaefner · web-flow · commit 0aa57c62387b · 2025-05-02T13:51:31.000+02:00
#### Relevant issue or PR n/a #### Description of changes Adds `num_workers` argument to `$ tesseract-runtime serve`, `Tesseract.from_image`, and `engine.serve`. This allows users to serve using multiple processes, handling requests in parallel (at the expense of higher resource usage). The default is unchanged (1). #### Testing done CI, new test #### License - [x] By submitting this pull request, I confirm that my contribution is made under the terms of the [Apache 2.0 license](https://pasteurlabs.github.io/tesseract/LICENSE). - [x] I sign the Developer Certificate of Origin below by adding my name and email address to the `Signed-off-by` line. <details> <summary><b>Developer Certificate of Origin</b></summary> ```text Developer Certificate of Origin Version 1.1 Copyright (C) 2004, 2006 The Linux Foundation and its contributors. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Developer's Certificate of Origin 1.1 By making a contribution to this project, I certify that: (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. 
(d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. ``` </details> Signed-off-by: Dion Häfner <dion.haefner@simulation.science>
diff --git a/tesseract_core/sdk/engine.py b/tesseract_core/sdk/engine.py
@@ -496,6 +496,7 @@ def serve(
     volumes: list[str] | None = None,
     gpus: list[str] | None = None,
     debug: bool = False,
+    num_workers: int = 1,
 ) -> str:
     """Serve one or more Tesseract images.
 
@@ -507,6 +508,7 @@ def serve(
         volumes: list of paths to mount in the Tesseract container.
         gpus: IDs of host Nvidia GPUs to make available to the Tesseracts.
         debug: whether to enable debug mode.
+        num_workers: number of workers to use for serving the Tesseracts.
 
     Returns:
         A string representing the Tesseract Project ID.
@@ -527,7 +529,9 @@ def serve(
             f"Number of ports ({len(ports)}) must match number of images ({len(image_ids)})"
         )
 
-    template = _create_docker_compose_template(image_ids, ports, volumes, gpus, debug)
+    template = _create_docker_compose_template(
+        image_ids, ports, volumes, gpus, num_workers, debug
+    )
     compose_fname = _create_compose_fname()
 
     with tempfile.NamedTemporaryFile(
@@ -548,6 +552,7 @@ def _create_docker_compose_template(
     ports: list[str] | None = None,
     volumes: list[str] | None = None,
     gpus: list[str] | None = None,
+    num_workers: int = 1,
     debug: bool = False,
 ) -> str:
     """Create Docker Compose template."""
@@ -576,7 +581,7 @@ def _create_docker_compose_template(
 
         services.append(service)
     template = ENV.get_template("docker-compose.yml")
-    return template.render(services=services)
+    return template.render(services=services, num_workers=num_workers)
 
 
 def _create_compose_service_id(image_id: str) -> str:
diff --git a/tesseract_core/sdk/templates/docker-compose.yml b/tesseract_core/sdk/templates/docker-compose.yml
@@ -3,7 +3,7 @@ services:
   {{ service.name }}:
     image: {{ service.image }}
     restart: unless-stopped
-    entrypoint: tesseract-runtime serve
+    command: ["serve", "--num-workers", "{{ num_workers }}"]
     ports:
       - {{ service.port }}
     {%- if service.volumes %}
diff --git a/tesseract_core/sdk/tesseract.py b/tesseract_core/sdk/tesseract.py
@@ -24,18 +24,12 @@
 
 @dataclass
 class SpawnConfig:
-    """Configuration for spawning a Tesseract.
-
-    Attributes:
-        image: The image to use.
-        volumes: List of volumes to mount.
-        gpus: List of GPUs to use.
-        debug: Whether to run in debug mode.
-    """
+    """Settings used to spawn a Tesseract (image, volumes, GPUs, workers, debug)."""
 
     image: str
     volumes: list[str] | None
     gpus: list[str] | None
+    num_workers: int  # number of worker processes used to serve requests
     debug: bool
 
 
@@ -92,6 +86,7 @@ def from_image(
         *,
         volumes: list[str] | None = None,
         gpus: list[str] | None = None,
+        num_workers: int = 1,
     ) -> Tesseract:
         """Create a Tesseract instance from a Docker image.
 
@@ -108,8 +103,11 @@ def from_image(
 
         Args:
             image: The Docker image to use.
-            volumes: List of volumes to mount.
-            gpus: List of GPUs to use.
+            volumes: List of volumes to mount, e.g. ["/path/on/host:/path/in/container"].
+            gpus: List of GPUs to use, e.g. ["0", "1"]. (default: no GPUs)
+            num_workers: Number of worker processes to use. This determines how
+                many requests can be handled in parallel. Higher values
+                will increase throughput, but also increase resource usage.
 
         Returns:
             A Tesseract instance.
@@ -119,6 +117,7 @@ def from_image(
             image=image,
             volumes=volumes,
             gpus=gpus,
+            num_workers=num_workers,
             debug=True,
         )
         obj._serve_context = None
@@ -222,6 +221,7 @@ def serve(self, port: str | None = None) -> None:
             port=port,
             volumes=self._spawn_config.volumes,
             gpus=self._spawn_config.gpus,
+            num_workers=self._spawn_config.num_workers,
             debug=self._spawn_config.debug,
         )
         self._serve_context = dict(
@@ -261,14 +261,20 @@ def _serve(
         volumes: list[str] | None = None,
         gpus: list[str] | None = None,
         debug: bool = False,
+        num_workers: int = 1,
     ) -> tuple[str, str, int]:
         if port is not None:
             ports = [port]
         else:
             ports = None
 
         project_id = engine.serve(
-            [image], ports=ports, volumes=volumes, gpus=gpus, debug=debug
+            [image],
+            ports=ports,
+            volumes=volumes,
+            gpus=gpus,
+            debug=debug,
+            num_workers=num_workers,
         )
 
         command = ["docker", "compose", "-p", project_id, "ps", "--format", "json"]
diff --git a/tests/runtime_tests/test_serve.py b/tests/runtime_tests/test_serve.py
@@ -4,9 +4,12 @@
 import base64
 import json
 import os
+import platform
 import subprocess
 import sys
 import time
+from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager
 from textwrap import dedent
 
 import numpy as np
@@ -23,6 +26,12 @@
 }
 
 
+def is_wsl():
+    """Check if the current environment is WSL."""
+    kernel = platform.uname().release
+    return "Microsoft" in kernel or "WSL" in kernel
+
+
 def array_from_json(json_data):
     encoding = json_data["data"]["encoding"]
     if encoding == "base64":
@@ -42,6 +51,47 @@ def model_to_json(model):
     return json.loads(model.model_dump_json())
 
 
+@contextmanager
+def serve_in_subprocess(api_file, port, num_workers=1, timeout=30.0):
+    try:
+        proc = subprocess.Popen(
+            [
+                sys.executable,
+                "-c",
+                "from tesseract_core.runtime.serve import serve; "
+                f"serve(host='localhost', port={port}, num_workers={num_workers})",
+            ],
+            env=dict(os.environ, TESSERACT_API_PATH=api_file),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+
+        # wait for server to start
+        while True:
+            try:
+                response = requests.get(f"http://localhost:{port}/health")
+            except requests.exceptions.ConnectionError:
+                pass
+            else:
+                if response.status_code == 200:
+                    break
+
+            time.sleep(0.1)
+            timeout -= 0.1
+
+            if timeout < 0:
+                raise TimeoutError("Server did not start in time")
+
+        yield f"http://localhost:{port}"
+
+    finally:
+        proc.terminate()
+        stdout, stderr = proc.communicate()
+        print(stdout.decode())
+        print(stderr.decode())
+        proc.wait(timeout=5)
+
+
 @pytest.fixture
 def http_client(dummy_tesseract_module):
     """A test HTTP client."""
@@ -157,6 +207,10 @@ def test_get_openapi_schema(http_client):
     assert response.json()["paths"]
 
 
+@pytest.mark.skipif(
+    is_wsl(),
+    reason="flaky on Windows",
+)
 def test_threading_sanity(tmpdir, free_port):
     """Test with a Tesseract that requires to be run in the main thread.
 
@@ -178,9 +232,6 @@ class OutputSchema(BaseModel):
     def apply(input: InputSchema) -> OutputSchema:
         assert threading.current_thread() == threading.main_thread()
         return OutputSchema()
-
-    def abstract_eval(abstract_inputs: dict) -> dict:
-        pass
     """
     )
 
@@ -191,47 +242,57 @@ def abstract_eval(abstract_inputs: dict) -> dict:
 
     # We can't run the server in the same process because it will use threading under the hood
     # so we need to spawn a new process instead
-    try:
-        proc = subprocess.Popen(
-            [
-                sys.executable,
-                "-c",
-                "from tesseract_core.runtime.serve import serve; "
-                f"serve(host='localhost', port={free_port}, num_workers=1)",
-            ],
-            env=dict(os.environ, TESSERACT_API_PATH=api_file),
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        )
+    with serve_in_subprocess(api_file, free_port) as url:
+        response = requests.post(f"{url}/apply", json={"inputs": {}})
+        assert response.status_code == 200, response.text
 
-        # wait for server to start
-        timeout = 30.0
-        while True:
-            try:
-                response = requests.get(f"http://localhost:{free_port}/health")
-            except requests.exceptions.ConnectionError:
-                pass
-            else:
-                if response.status_code == 200:
-                    break
 
-            time.sleep(0.1)
-            timeout -= 0.1
+@pytest.mark.skipif(
+    is_wsl(),
+    reason="flaky on Windows",
+)
+def test_multiple_workers(tmpdir, free_port):
+    """Test that serving with num_workers=2 spreads requests over multiple processes."""
+    TESSERACT_API = dedent(
+        """
+    import time
+    import multiprocessing
+    from pydantic import BaseModel
 
-            if timeout < 0:
-                raise TimeoutError("Server did not start in time")
+    class InputSchema(BaseModel):
+        pass
 
-        response = requests.post(
-            f"http://localhost:{free_port}/apply", json={"inputs": {}}
-        )
-        assert response.status_code == 200, response.text
+    class OutputSchema(BaseModel):
+        pid: int
 
-    finally:
-        proc.terminate()
-        stdout, stderr = proc.communicate()
-        print(stdout.decode())
-        print(stderr.decode())
-        proc.wait(timeout=5)
+    def apply(input: InputSchema) -> OutputSchema:
+        return OutputSchema(pid=multiprocessing.current_process().pid)
+    """
+    )
+
+    api_file = tmpdir / "tesseract_api.py"
+
+    with open(api_file, "w") as f:
+        f.write(TESSERACT_API)
+
+    with serve_in_subprocess(api_file, free_port, num_workers=2) as url:
+        # Every response carries the PID of the process that ran `apply`, so
+        # concurrent requests reveal whether more than one worker handled them
+        post_request = lambda _: requests.post(f"{url}/apply", json={"inputs": {}})
+
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            # Send 100 requests from 4 client threads
+            futures = executor.map(post_request, range(100))
+            responses = list(futures)
+
+        # Check that all responses are 200
+        for response in responses:
+            assert response.status_code == 200, response.text
+
+        # Check that not all pids are the same
+        # NOTE(review): the split across workers is not guaranteed; may be flaky
+        pids = set(response.json()["pid"] for response in responses)
+        assert len(pids) > 1, "All requests were handled by the same worker"
 
 
 def test_debug_mode(dummy_tesseract_module, monkeypatch):
diff --git a/tests/sdk_tests/test_tesseract.py b/tests/sdk_tests/test_tesseract.py
@@ -107,7 +107,12 @@ def test_serve_lifecycle(mock_serving, mock_clients):
         pass
 
     mock_serving["serve_mock"].assert_called_with(
-        ["sometesseract:0.2.3"], ports=None, volumes=None, gpus=None, debug=True
+        ["sometesseract:0.2.3"],
+        ports=None,
+        volumes=None,
+        gpus=None,
+        debug=True,
+        num_workers=1,
     )
 
     mock_serving["teardown_mock"].assert_called_with("proj-id-123")