Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 81 additions & 1 deletion aw_qt/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import logging
import subprocess
import platform
import urllib.error
import urllib.request
from pathlib import Path
from glob import glob
from time import sleep
from time import monotonic, sleep
from typing import Optional, List, Hashable, Set, Iterable

import aw_core
Expand Down Expand Up @@ -148,6 +150,10 @@ def __init__(self, name: str, path: Path, type: str) -> None:
# self.location = "system" if _is_system_module(name) else "bundled"
self._process: Optional[subprocess.Popen[str]] = None
self._last_process: Optional[subprocess.Popen[str]] = None
self._external_server: bool = False # True if we detected an already-running server
self._external_server_testing: bool = False
self._external_server_probe_cache: Optional[bool] = None
self._external_server_probe_cache_at: float = 0.0

def __hash__(self) -> int:
return hash((self.name, self.path))
Expand All @@ -158,9 +164,61 @@ def __eq__(self, other: Hashable) -> bool:
def __repr__(self) -> str:
return f"<Module {self.name} at {self.path}>"

def _get_server_port(self, testing: bool) -> Optional[int]:
if self.name not in ("aw-server", "aw-server-rust"):
return None

from .config import _read_aw_server_port, _read_server_rust_port

if self.name == "aw-server":
return _read_aw_server_port(testing) or (5666 if testing else 5600)
return _read_server_rust_port(testing) or (5666 if testing else 5600)

def _probe_external_server(self, testing: bool, timeout: float = 0.2) -> bool:
port = self._get_server_port(testing)
if port is None:
return False

try:
with urllib.request.urlopen(
f"http://localhost:{port}/api/0/info", timeout=timeout
):
Comment on lines +183 to +185
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Startup probe timeout too short — may miss a loaded server

_probe_external_server uses timeout=0.2 for both the one-shot startup check (called from start()) and the repeated liveness probe (called from is_alive() via _probe_external_server_cached).

The 0.2 s limit makes sense for liveness polls to avoid stalling the Qt main thread. However, the startup probe in start() fires only once per session and blocks just as long either way. If the external server is momentarily loaded or slow (e.g., JVM / Python startup warmup), the probe can time out and return False, causing aw-qt to attempt launching a second instance — triggering the exact "address already in use" error this PR is intended to fix.

Consider using a separate, larger timeout for the startup case. One clean approach is a dedicated parameter on _probe_external_server:

def _probe_external_server(self, testing: bool, timeout: float = 0.2) -> bool:
    port = self._get_server_port(testing)
    if port is None:
        return False
    try:
        with urllib.request.urlopen(
            f"http://localhost:{port}/api/0/info", timeout=timeout
        ):
            return True
    except (urllib.error.URLError, OSError):
        return False

Then in start():

if self._probe_external_server(testing, timeout=1.0):

This keeps the liveness path at 0.2 s while giving the startup probe a fighting chance on a loaded system.

return True
except (urllib.error.URLError, OSError):
return False

def _probe_external_server_cached(self, testing: bool, max_age: float = 1.0) -> bool:
now = monotonic()
if (
self._external_server_probe_cache is not None
and now - self._external_server_probe_cache_at < max_age
):
return self._external_server_probe_cache

alive = self._probe_external_server(testing)
self._external_server_probe_cache = alive
self._external_server_probe_cache_at = now
return alive

def start(self, testing: bool) -> None:
logger.info(f"Starting module {self.name}")

# For server modules, check if a server is already running before attempting
# to start one. This avoids port conflicts and the confusing "Restart" requirement
# when aw-server is managed externally (e.g. via systemd or Docker).
if self._probe_external_server(testing, timeout=1.0):
port = self._get_server_port(testing)
logger.info(
f"{self.name}: server already running on port {port}, "
"using existing instance instead of starting a new one"
)
self._external_server = True
self._external_server_testing = testing
self._external_server_probe_cache = True
self._external_server_probe_cache_at = monotonic()
self.started = True
return

exec_cmd = [str(self.path)]
if testing:
exec_cmd.append("--testing")
Expand Down Expand Up @@ -195,6 +253,17 @@ def stop(self) -> None:
f"Tried to stop module {self.name}, but it hasn't been started"
)
return
elif self._external_server:
# We didn't start this server, so don't stop it — it's managed externally
logger.info(
f"Module {self.name} is using an external server instance, not stopping it"
)
self._external_server = False
self._external_server_testing = False
self._external_server_probe_cache = None
self._external_server_probe_cache_at = 0.0
self.started = False
return
elif not self.is_alive():
logger.warning(f"Tried to stop module {self.name}, but it wasn't running")
else:
Expand Down Expand Up @@ -223,6 +292,17 @@ def toggle(self, testing: bool) -> None:
self.start(testing)

def is_alive(self) -> bool:
if self._external_server:
# We don't own this process, so re-probe the server instead.
# Cache short-lived probe results to avoid blocking repeated UI poll
# cycles on the main thread when multiple checks happen close together.
alive = self._probe_external_server_cached(self._external_server_testing)
if not alive:
self._external_server = False
self._external_server_testing = False
self._external_server_probe_cache = None
self._external_server_probe_cache_at = 0.0
return alive
Comment on lines 294 to +305
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Repeated blocking HTTP probe inside is_alive() can stall the Qt event loop

is_alive() is called on every polling cycle (e.g. get_unexpected_stops() fires from the tray's timer). For each external-server module this now makes a blocking urlopen call with a 0.2 s timeout. When the external server is down every single check takes 0.2 s — and the call happens on the Qt main thread (same thread that drives autostart() and the tray timer).

Unlike the one-off probe in start(), this one is repeated on every liveness check. Consider caching the result with a short TTL (e.g. 5 s) or moving the probe off the Qt thread, so a dead external server doesn't impose a guaranteed 0.2 s stall per polling interval:

import time

_PROBE_CACHE_TTL = 5.0  # seconds

def is_alive(self) -> bool:
    if self._external_server:
        now = time.monotonic()
        if now - getattr(self, "_last_probe_time", 0) >= _PROBE_CACHE_TTL:
            alive = self._probe_external_server(self._external_server_testing)
            self._last_probe_time = now
            self._last_probe_result = alive
            if not alive:
                self._external_server = False
                self._external_server_testing = False
            return alive
        return self._last_probe_result
    ...

if self._process is None:
return False

Expand Down
129 changes: 129 additions & 0 deletions tests/test_manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Unit tests for the Module manager."""

import os
import subprocess
from pathlib import Path
from unittest.mock import MagicMock, patch
Expand Down Expand Up @@ -69,6 +70,92 @@ def test_toggle_restarts_crashed_module(self, module):
assert not module.started # stop() was called to clean up


class TestModuleServerProbe:
def test_aw_server_uses_python_server_port(self):
mod = Module("aw-server", Path("/usr/bin/aw-server"), "system")

with (
patch("aw_qt.config._read_aw_server_port", return_value=5601),
patch("aw_qt.config._read_server_rust_port", return_value=6601),
):
assert mod._get_server_port(testing=False) == 5601

def test_aw_server_rust_uses_rust_server_port(self):
mod = Module("aw-server-rust", Path("/usr/bin/aw-server-rust"), "system")

with (
patch("aw_qt.config._read_aw_server_port", return_value=5601),
patch("aw_qt.config._read_server_rust_port", return_value=6601),
):
assert mod._get_server_port(testing=False) == 6601

def test_probe_external_server_closes_response(self):
mod = Module("aw-server", Path("/usr/bin/aw-server"), "system")
response = MagicMock()

with (
patch.object(mod, "_get_server_port", return_value=5600),
patch("urllib.request.urlopen", return_value=response) as urlopen,
):
assert mod._probe_external_server(testing=False) is True

urlopen.assert_called_once_with("http://localhost:5600/api/0/info", timeout=0.2)
response.__enter__.assert_called_once_with()
response.__exit__.assert_called_once()

def test_probe_external_server_allows_custom_timeout(self):
mod = Module("aw-server", Path("/usr/bin/aw-server"), "system")
response = MagicMock()

with (
patch.object(mod, "_get_server_port", return_value=5600),
patch("urllib.request.urlopen", return_value=response) as urlopen,
):
assert mod._probe_external_server(testing=False, timeout=1.0) is True

urlopen.assert_called_once_with("http://localhost:5600/api/0/info", timeout=1.0)

def test_probe_external_server_cached_reuses_recent_result(self):
mod = Module("aw-server", Path("/usr/bin/aw-server"), "system")

with (
patch("aw_qt.manager.monotonic", side_effect=[10.0, 10.4]),
patch.object(mod, "_probe_external_server", return_value=True) as probe,
):
assert mod._probe_external_server_cached(testing=True, max_age=1.0) is True
assert mod._probe_external_server_cached(testing=True, max_age=1.0) is True

assert probe.call_count == 1

def test_probe_external_server_cached_refreshes_after_ttl(self):
mod = Module("aw-server", Path("/usr/bin/aw-server"), "system")

with (
patch("aw_qt.manager.monotonic", side_effect=[10.0, 10.4, 11.5]),
patch.object(mod, "_probe_external_server", side_effect=[True, False]) as probe,
):
assert mod._probe_external_server_cached(testing=True, max_age=1.0) is True
assert mod._probe_external_server_cached(testing=True, max_age=1.0) is True
assert mod._probe_external_server_cached(testing=True, max_age=1.0) is False

assert probe.call_count == 2


class TestModuleStart:
def test_start_uses_longer_timeout_for_external_server_probe(self):
mod = Module("aw-server", Path("/usr/bin/aw-server"), "system")

with (
patch.object(mod, "_probe_external_server", return_value=True) as probe,
patch.object(mod, "_get_server_port", return_value=5600),
):
mod.start(testing=False)

probe.assert_called_once_with(False, timeout=1.0)
assert mod.started is True
assert mod._external_server is True


class TestModuleIsAlive:
"""Tests for Module.is_alive() behavior."""

Expand All @@ -87,6 +174,48 @@ def test_is_alive_exited(self, module):
module._process = mock_proc
assert not module.is_alive()

def test_is_alive_reprobes_external_server(self):
mod = Module("aw-server", Path("/usr/bin/aw-server"), "system")
mod._external_server = True
mod._external_server_testing = True
mod.started = True

with (
patch("aw_qt.manager.monotonic", side_effect=[10.0, 11.5]),
patch.object(mod, "_probe_external_server", side_effect=[True, False]) as probe,
):
assert mod.is_alive()
assert mod.is_alive() is False
assert mod.started is True
assert mod._external_server is False
assert mod._external_server_testing is False

assert probe.call_args_list[0].args == (True,)
assert probe.call_args_list[1].args == (True,)

def test_stop_external_server_resets_state_without_terminating_process(self):
mod = Module("aw-server", Path("/usr/bin/aw-server"), "system")
mod.started = True
mod._external_server = True
mod._external_server_testing = True
mock_proc = MagicMock(spec=subprocess.Popen)
mod._process = mock_proc
mod._last_process = None
mod._external_server_probe_cache = True
mod._external_server_probe_cache_at = 10.0

mod.stop()

mock_proc.terminate.assert_not_called()
mock_proc.wait.assert_not_called()
assert mod.started is False
assert mod._external_server is False
assert mod._external_server_testing is False
assert mod._external_server_probe_cache is None
assert mod._external_server_probe_cache_at == 0.0
assert mod._last_process is None
assert mod._process is mock_proc


class TestGetUnexpectedStops:
"""Tests for Manager.get_unexpected_stops()."""
Expand Down
Loading