diff --git a/CLAUDE.md b/CLAUDE.md index f74cb95..53d038f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -121,6 +121,7 @@ Both automation servers expose `GET /health` and `POST /jsonrpc` (JSON-RPC 2.0) | `ios_get_device_info` | Get display size, rotation, and iOS version | | `ios_input_text` | Type text into the currently focused element | | `ios_press_home` | Press home button | +| `ios_screenshot` | Capture the simulator display and save as a PNG file (optional `outputPath`; defaults to `./screenshots/` in the project's CWD) | | `ios_stop_automation_server` | Stop the running XCUITest server | ### Typical Automation Workflow diff --git a/app/src/main/kotlin/com/example/visiontest/ios/IOSAutomationClient.kt b/app/src/main/kotlin/com/example/visiontest/ios/IOSAutomationClient.kt index 776ed3c..535d79e 100644 --- a/app/src/main/kotlin/com/example/visiontest/ios/IOSAutomationClient.kt +++ b/app/src/main/kotlin/com/example/visiontest/ios/IOSAutomationClient.kt @@ -181,6 +181,14 @@ class IOSAutomationClient( return sendRequest("device.pressHome") } + /** + * Captures a screenshot of the current simulator display. + * Returns the raw JSON-RPC response containing a base64-encoded PNG in `result.pngBase64`. + */ + suspend fun screenshot(): String { + return sendRequest("ui.screenshot") + } + /** * Finds an element by selector. Returns element info if found. 
* @param text Exact text match diff --git a/app/src/main/kotlin/com/example/visiontest/tools/IOSAutomationToolRegistrar.kt b/app/src/main/kotlin/com/example/visiontest/tools/IOSAutomationToolRegistrar.kt index 55fb5e8..b124a58 100644 --- a/app/src/main/kotlin/com/example/visiontest/tools/IOSAutomationToolRegistrar.kt +++ b/app/src/main/kotlin/com/example/visiontest/tools/IOSAutomationToolRegistrar.kt @@ -4,11 +4,19 @@ import com.example.visiontest.common.DeviceConfig import com.example.visiontest.config.IOSAutomationConfig import com.example.visiontest.discovery.ToolDiscovery import com.example.visiontest.ios.IOSAutomationClient +import com.google.gson.JsonParser import io.modelcontextprotocol.kotlin.sdk.Tool import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.delay import kotlinx.coroutines.withContext import org.slf4j.Logger +import java.io.File +import java.io.IOException +import java.nio.file.Files +import java.nio.file.StandardCopyOption +import java.time.LocalDateTime +import java.time.format.DateTimeFormatter +import java.util.Base64 class IOSAutomationToolRegistrar( private val ios: DeviceConfig, @@ -32,6 +40,7 @@ class IOSAutomationToolRegistrar( registerGetDeviceInfo(scope) registerPressHome(scope) registerInputText(scope) + registerScreenshot(scope) registerStopAutomationServer(scope) } @@ -467,6 +476,179 @@ class IOSAutomationToolRegistrar( } } + private fun registerScreenshot(scope: ToolScope) { + scope.tool( + name = "ios_screenshot", + description = """ + Captures a screenshot of the current iOS simulator display and saves it as a PNG file on the host. + The iOS automation server must be running first (use ios_start_automation_server). + + OPTIONAL PARAMETERS: + - outputPath: Absolute or relative path where the PNG will be written. + Relative paths resolve against the MCP server's working directory (typically the + user's current project). If the file already exists it will be overwritten. 
+ Missing parent directories are created automatically. + If omitted, saves to ./screenshots/ios_screenshot_.png relative to + the server's working directory (i.e. the current project, not the visiontest install dir). + + Returns the absolute path of the saved PNG. + """.trimIndent(), + timeoutMs = 30000 + ) { request -> + captureScreenshot(request.optionalString("outputPath")) + } + } + + internal suspend fun captureScreenshot(outputPath: String?): String { + if (!iosAutomationClient.isServerRunning()) { + return "iOS automation server is not running. Use 'ios_start_automation_server' first." + } + + val response = iosAutomationClient.screenshot() + val root = try { + JsonParser.parseString(response).asJsonObject + } catch (e: Exception) { + return "Screenshot failed: unable to parse response from iOS automation server (${e.message})." + } + + // JSON-RPC 2.0 envelope: either `result` OR `error` is present at the top level. + // Check `error` first so we can surface the server's message and map `methodNotFound` + // to the outdated-bundle guidance (older bundles won't know about `ui.screenshot`). + val errorElement = root.get("error") + if (errorElement != null && !errorElement.isJsonNull) { + if (errorElement.isJsonObject) { + val errorObj = errorElement.asJsonObject + val codeElement = errorObj.get("code") + val code = if (codeElement?.isJsonPrimitive == true && codeElement.asJsonPrimitive.isNumber) { + codeElement.asInt + } else null + val messageElement = errorObj.get("message") + val message = if (messageElement?.isJsonPrimitive == true) { + messageElement.asString + } else "unknown error" + if (code == JSON_RPC_METHOD_NOT_FOUND) { + return "Screenshot failed: the iOS automation server does not recognize 'ui.screenshot' " + + "(JSON-RPC methodNotFound). This indicates an outdated iOS automation server bundle " + + "— rebuild from source or update the installed bundle." 
+ } + return if (code != null) { + "Screenshot failed: iOS automation server returned error ($code): $message" + } else { + "Screenshot failed: iOS automation server returned an error: $message" + } + } + return "Screenshot failed: iOS automation server returned a malformed error envelope." + } + + val resultElement = root.get("result") + if (resultElement == null || resultElement.isJsonNull) { + return "Screenshot failed: response missing 'result' object." + } + if (!resultElement.isJsonObject) { + return "Screenshot failed: response 'result' is not a JSON object." + } + val result = resultElement.asJsonObject + + val successElement = result.get("success") + if (successElement == null || successElement.isJsonNull || !successElement.isJsonPrimitive) { + return "Screenshot failed: response 'result' has a missing or non-primitive 'success' field." + } + val successPrimitive = successElement.asJsonPrimitive + if (!successPrimitive.isBoolean) { + return "Screenshot failed: response 'result.success' is not a boolean (got: $successElement)." + } + if (!successPrimitive.asBoolean) { + val errorElement = result.get("error") + val error = if (errorElement != null && !errorElement.isJsonNull && errorElement.isJsonPrimitive && errorElement.asJsonPrimitive.isString) { + errorElement.asString + } else { + "unknown error" + } + return "Screenshot failed on the iOS automation server: $error" + } + + val pngBase64Element = result.get("pngBase64") + if (pngBase64Element == null || pngBase64Element.isJsonNull) { + return "Screenshot failed: response missing 'pngBase64'. This may indicate an outdated iOS automation server bundle — rebuild from source or update the installed bundle." + } + if (!pngBase64Element.isJsonPrimitive || !pngBase64Element.asJsonPrimitive.isString) { + return "Screenshot failed: response 'result.pngBase64' is not a string (got: $pngBase64Element)." 
+ } + val pngBase64 = pngBase64Element.asString + if (pngBase64.isEmpty()) { + return "Screenshot failed: response missing 'pngBase64'. This may indicate an outdated iOS automation server bundle — rebuild from source or update the installed bundle." + } + + val targetFile = resolveScreenshotPath(outputPath) + return writeScreenshot(targetFile, pngBase64) + } + + internal fun resolveScreenshotPath(outputPath: String?): File { + if (outputPath != null && outputPath.isNotBlank()) { + return File(outputPath).absoluteFile + } + val timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss")) + // Default to the MCP server's working directory so screenshots land in the + // user's current project (not the visiontest install dir). Coding agents like + // Claude Code launch the server with CWD set to the project they're working on. + return File("screenshots/ios_screenshot_$timestamp.png").absoluteFile + } + + /** + * Decodes the base64 PNG and writes it atomically to [target]. + * Runs on Dispatchers.IO so we don't block the tool handler's coroutine context. + * Writes to a sibling temp file first, then moves into place so a failure or cancellation + * mid-write cannot leave a partial PNG at [target]. + * + * Returns a user-facing result string (success or a specific error message). + */ + internal suspend fun writeScreenshot(target: File, pngBase64: String): String = withContext(Dispatchers.IO) { + val bytes = try { + Base64.getDecoder().decode(pngBase64) + } catch (e: IllegalArgumentException) { + return@withContext "Screenshot failed: iOS automation server returned invalid base64 PNG data (${e.message})." + } + + val targetPath = target.toPath() + val parentDir = target.parentFile + ?: return@withContext "Screenshot failed: cannot determine parent directory for ${target.absolutePath}." 
+ + try { + Files.createDirectories(parentDir.toPath()) + } catch (e: IOException) { + return@withContext "Screenshot failed: unable to create parent directory ${parentDir.absolutePath} (${e.message})." + } + + val tempFile = try { + Files.createTempFile(parentDir.toPath(), ".ios_screenshot_", ".png.tmp") + } catch (e: IOException) { + return@withContext "Screenshot failed: unable to create temp file in ${parentDir.absolutePath} (${e.message})." + } + + try { + Files.write(tempFile, bytes) + // ATOMIC_MOVE isn't guaranteed across filesystems, but tempFile is a sibling of + // target so they're on the same FS. Fall back to plain replace on rare failures. + try { + Files.move(tempFile, targetPath, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE) + } catch (_: java.nio.file.AtomicMoveNotSupportedException) { + Files.move(tempFile, targetPath, StandardCopyOption.REPLACE_EXISTING) + } + "Screenshot saved to ${target.absolutePath}" + } catch (e: IOException) { + Files.deleteIfExists(tempFile) + "Screenshot failed: unable to write PNG to ${target.absolutePath} (${e.message})." + } catch (e: Exception) { + Files.deleteIfExists(tempFile) + throw e + } + } + + companion object { + /** Standard JSON-RPC 2.0 error code for an unknown method. 
*/ + private const val JSON_RPC_METHOD_NOT_FOUND = -32601 + } + private fun registerStopAutomationServer(scope: ToolScope) { scope.tool( name = "ios_stop_automation_server", diff --git a/app/src/test/kotlin/com/example/visiontest/tools/IOSScreenshotToolTest.kt b/app/src/test/kotlin/com/example/visiontest/tools/IOSScreenshotToolTest.kt new file mode 100644 index 0000000..9f8765d --- /dev/null +++ b/app/src/test/kotlin/com/example/visiontest/tools/IOSScreenshotToolTest.kt @@ -0,0 +1,265 @@ +package com.example.visiontest.tools + +import com.example.visiontest.common.DeviceConfig +import com.example.visiontest.discovery.ToolDiscovery +import com.example.visiontest.ios.IOSAutomationClient +import kotlinx.coroutines.runBlocking +import okhttp3.mockwebserver.MockResponse +import okhttp3.mockwebserver.MockWebServer +import org.slf4j.LoggerFactory +import java.io.File +import java.nio.file.Files +import java.util.Base64 +import kotlin.test.* + +class IOSScreenshotToolTest { + + // Smallest valid PNG (1x1 transparent pixel) used as fixture. + // Bytes: 89 50 4E 47 0D 0A 1A 0A ... (PNG magic header) + private val fixturePngBase64 = + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" + private val fixturePngBytes = Base64.getDecoder().decode(fixturePngBase64) + + private lateinit var mockServer: MockWebServer + private lateinit var registrar: IOSAutomationToolRegistrar + private lateinit var tempDir: File + + private val logger = LoggerFactory.getLogger(IOSScreenshotToolTest::class.java) + private val fakeDeviceConfig = object : DeviceConfig { + override suspend fun listDevices() = emptyList() + override suspend fun getFirstAvailableDevice(): com.example.visiontest.common.MobileDevice = + throw UnsupportedOperationException() + override suspend fun listApps(deviceId: String?) = emptyList() + override suspend fun getAppInfo(packageName: String, deviceId: String?) 
= "" + override suspend fun launchApp(packageName: String, activityName: String?, deviceId: String?) = false + override suspend fun executeShell(command: String, deviceId: String?) = "" + } + + @BeforeTest + fun setUp() { + mockServer = MockWebServer() + mockServer.start() + val client = IOSAutomationClient(host = mockServer.hostName, port = mockServer.port) + registrar = IOSAutomationToolRegistrar(fakeDeviceConfig, client, ToolDiscovery(logger), logger) + tempDir = Files.createTempDirectory("ios-screenshot-test").toFile() + } + + @AfterTest + fun tearDown() { + mockServer.shutdown() + tempDir.deleteRecursively() + } + + // --- resolveScreenshotPath --- + + @Test + fun `default path places timestamped filename under screenshots in CWD`() { + val resolved = registrar.resolveScreenshotPath(null) + assertEquals("screenshots", resolved.parentFile.name) + assertTrue( + resolved.name.matches(Regex("""ios_screenshot_\d{8}_\d{6}\.png""")), + "Expected default filename to match ios_screenshot_.png, got ${resolved.name}" + ) + assertTrue(resolved.isAbsolute, "Default path should be absolute") + // Default must be relative to the JVM's working directory (the user's project when + // launched by a coding agent), NOT the visiontest install dir. 
+ val expectedRoot = File(System.getProperty("user.dir")).absoluteFile + assertEquals(expectedRoot, resolved.parentFile.parentFile) + } + + @Test + fun `explicit outputPath is used verbatim`() { + val explicit = File(tempDir, "custom/shot.png") + val resolved = registrar.resolveScreenshotPath(explicit.absolutePath) + assertEquals(explicit.absolutePath, resolved.absolutePath) + } + + @Test + fun `blank outputPath falls back to default`() { + val resolved = registrar.resolveScreenshotPath(" ") + assertTrue(resolved.name.startsWith("ios_screenshot_")) + } + + // --- captureScreenshot end-to-end via MockWebServer --- + + @Test + fun `successful capture writes decoded PNG bytes to target path`() = runBlocking { + enqueueHealthOk() + enqueueScreenshotResult(successBody(fixturePngBase64)) + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("Screenshot saved to ${target.absolutePath}"), "Got: $message") + assertTrue(target.exists(), "PNG file should be written") + assertContentEquals(fixturePngBytes, target.readBytes()) + } + + @Test + fun `missing parent directories are created automatically`() = runBlocking { + enqueueHealthOk() + enqueueScreenshotResult(successBody(fixturePngBase64)) + + val nested = File(tempDir, "a/b/c/out.png") + assertFalse(nested.parentFile.exists(), "precondition: parent should not exist") + + registrar.captureScreenshot(nested.absolutePath) + + assertTrue(nested.exists()) + assertContentEquals(fixturePngBytes, nested.readBytes()) + } + + @Test + fun `server not running short-circuits with no file write`() = runBlocking { + // Health check fails with 500; no second request should be made. 
+ mockServer.enqueue(MockResponse().setResponseCode(500)) + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("iOS automation server is not running")) + assertFalse(target.exists(), "No file should be written when server is not running") + assertEquals(1, mockServer.requestCount, "Only the health check should have been attempted") + } + + @Test + fun `success false in response surfaces as error and writes no file`() = runBlocking { + enqueueHealthOk() + enqueueScreenshotResult("""{"success":false,"error":"capture failed"}""") + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("Screenshot failed on the iOS automation server: capture failed"), "Got: $message") + assertFalse(target.exists()) + } + + @Test + fun `missing pngBase64 surfaces outdated-bundle hint and writes no file`() = runBlocking { + enqueueHealthOk() + enqueueScreenshotResult("""{"success":true}""") + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("response missing 'pngBase64'"), "Got: $message") + assertTrue(message.contains("outdated iOS automation server bundle"), "Got: $message") + assertFalse(target.exists()) + } + + @Test + fun `missing result object surfaces error and writes no file`() = runBlocking { + enqueueHealthOk() + // Response with no "result" wrapper at all. 
+ mockServer.enqueue(MockResponse().setBody("""{"jsonrpc":"2.0","id":1}""")) + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("response missing 'result' object"), "Got: $message") + assertFalse(target.exists()) + } + + @Test + fun `malformed JSON response surfaces parse error and writes no file`() = runBlocking { + enqueueHealthOk() + mockServer.enqueue(MockResponse().setBody("not json")) + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue( + message.contains("unable to parse response from iOS automation server"), + "Got: $message" + ) + assertFalse(target.exists()) + } + + @Test + fun `JSON-RPC methodNotFound maps to outdated-bundle guidance`() = runBlocking { + enqueueHealthOk() + // Standard JSON-RPC 2.0 error envelope for methodNotFound (code -32601). + // This is what an older pre-built bundle would return for ui.screenshot. 
+ mockServer.enqueue( + MockResponse().setBody( + """{"jsonrpc":"2.0","error":{"code":-32601,"message":"Method not found: ui.screenshot"},"id":1}""" + ) + ) + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("methodNotFound"), "Got: $message") + assertTrue(message.contains("outdated iOS automation server bundle"), "Got: $message") + assertFalse(target.exists()) + } + + @Test + fun `other JSON-RPC errors surface code and message`() = runBlocking { + enqueueHealthOk() + mockServer.enqueue( + MockResponse().setBody( + """{"jsonrpc":"2.0","error":{"code":-32000,"message":"XCUITest crashed"},"id":1}""" + ) + ) + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("-32000"), "Got: $message") + assertTrue(message.contains("XCUITest crashed"), "Got: $message") + assertFalse(target.exists()) + } + + @Test + fun `result present but not a JSON object is rejected gracefully`() = runBlocking { + enqueueHealthOk() + mockServer.enqueue(MockResponse().setBody("""{"jsonrpc":"2.0","result":"oops","id":1}""")) + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("'result' is not a JSON object"), "Got: $message") + assertFalse(target.exists()) + } + + @Test + fun `invalid base64 in pngBase64 surfaces a decode error and writes no file`() = runBlocking { + enqueueHealthOk() + enqueueScreenshotResult(successBody("!!!not-base64!!!")) + + val target = File(tempDir, "out.png") + val message = registrar.captureScreenshot(target.absolutePath) + + assertTrue(message.contains("invalid base64 PNG data"), "Got: $message") + assertFalse(target.exists()) + } + + @Test + fun `atomic write leaves no temp file on success`() = runBlocking { + enqueueHealthOk() + enqueueScreenshotResult(successBody(fixturePngBase64)) + + val target = File(tempDir, 
"out.png") + registrar.captureScreenshot(target.absolutePath) + + // Confirm the final file exists and no .png.tmp sidecar was left behind + assertTrue(target.exists()) + val leftovers = tempDir.listFiles { f -> f.name.endsWith(".png.tmp") }.orEmpty() + assertEquals(0, leftovers.size, "Temp files left behind: ${leftovers.map { it.name }}") + } + + // --- helpers --- + + private fun enqueueHealthOk() { + mockServer.enqueue(MockResponse().setResponseCode(200).setBody("OK")) + } + + private fun enqueueScreenshotResult(innerResultJson: String) { + val body = """{"jsonrpc":"2.0","result":$innerResultJson,"id":1}""" + mockServer.enqueue(MockResponse().setBody(body)) + } + + private fun successBody(pngBase64: String): String = + """{"success":true,"pngBase64":"$pngBase64"}""" +} diff --git a/ios-automation-server/IOSAutomationServerTests/AutomationModelsTests.swift b/ios-automation-server/IOSAutomationServerTests/AutomationModelsTests.swift index e1edbb7..12a60b1 100644 --- a/ios-automation-server/IOSAutomationServerTests/AutomationModelsTests.swift +++ b/ios-automation-server/IOSAutomationServerTests/AutomationModelsTests.swift @@ -20,6 +20,24 @@ final class AutomationModelsTests: XCTestCase { XCTAssertEqual(dict["error"] as? String, "timeout") } + // MARK: - ScreenshotResult + + func testScreenshotResultSuccess() { + let result = ScreenshotResult(success: true, pngBase64: "iVBORw0KGgo=", error: nil) + let dict = result.toDictionary() + XCTAssertEqual(dict["success"] as? Bool, true) + XCTAssertEqual(dict["pngBase64"] as? String, "iVBORw0KGgo=") + XCTAssertNil(dict["error"]) + } + + func testScreenshotResultFailure() { + let result = ScreenshotResult(success: false, pngBase64: nil, error: "capture failed") + let dict = result.toDictionary() + XCTAssertEqual(dict["success"] as? Bool, false) + XCTAssertNil(dict["pngBase64"]) + XCTAssertEqual(dict["error"] as? 
String, "capture failed") + } + // MARK: - DeviceInfoResult func testDeviceInfoResultAllFields() { diff --git a/ios-automation-server/IOSAutomationServerUITests/Bridge/XCUITestBridge.swift b/ios-automation-server/IOSAutomationServerUITests/Bridge/XCUITestBridge.swift index 79cae90..4647152 100644 --- a/ios-automation-server/IOSAutomationServerUITests/Bridge/XCUITestBridge.swift +++ b/ios-automation-server/IOSAutomationServerUITests/Bridge/XCUITestBridge.swift @@ -110,6 +110,18 @@ class XCUITestBridge { ) } + // MARK: - Screenshot + + /// Captures the current simulator display as a PNG and returns it base64-encoded. + func screenshot() -> ScreenshotResult { + let pngData = XCUIScreen.main.screenshot().pngRepresentation + if pngData.isEmpty { + return ScreenshotResult(success: false, pngBase64: nil, error: "Screenshot returned empty PNG data") + } + let base64 = pngData.base64EncodedString() + return ScreenshotResult(success: true, pngBase64: base64, error: nil) + } + // MARK: - Tap /// Taps at absolute screen coordinates using the springboard coordinate space. diff --git a/ios-automation-server/IOSAutomationServerUITests/Models/AutomationModels.swift b/ios-automation-server/IOSAutomationServerUITests/Models/AutomationModels.swift index 4464427..67f5f99 100644 --- a/ios-automation-server/IOSAutomationServerUITests/Models/AutomationModels.swift +++ b/ios-automation-server/IOSAutomationServerUITests/Models/AutomationModels.swift @@ -38,6 +38,19 @@ struct DeviceInfoResult { } } +struct ScreenshotResult { + let success: Bool + let pngBase64: String? + let error: String? + + func toDictionary() -> [String: Any] { + var dict: [String: Any] = ["success": success] + if let pngBase64 = pngBase64 { dict["pngBase64"] = pngBase64 } + if let error = error { dict["error"] = error } + return dict + } +} + struct OperationResult { let success: Bool let error: String? 
diff --git a/ios-automation-server/IOSAutomationServerUITests/Server/JsonRpcServer.swift b/ios-automation-server/IOSAutomationServerUITests/Server/JsonRpcServer.swift index 53ef541..386c3c7 100644 --- a/ios-automation-server/IOSAutomationServerUITests/Server/JsonRpcServer.swift +++ b/ios-automation-server/IOSAutomationServerUITests/Server/JsonRpcServer.swift @@ -96,6 +96,10 @@ class JsonRpcServer { let bundleId = params?["bundleId"] as? String return bridge.getInteractiveElements(includeDisabled: includeDisabled, bundleId: bundleId).toDictionary() + // Screenshot + case "ui.screenshot": + return bridge.screenshot().toDictionary() + // Device methods case "device.getInfo": return bridge.getDeviceInfo().toDictionary() diff --git a/openspec/changes/archive/2026-04-16-add-ios-screenshot/.openspec.yaml b/openspec/changes/archive/2026-04-16-add-ios-screenshot/.openspec.yaml new file mode 100644 index 0000000..3a54a17 --- /dev/null +++ b/openspec/changes/archive/2026-04-16-add-ios-screenshot/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-04-16 diff --git a/openspec/changes/archive/2026-04-16-add-ios-screenshot/design.md b/openspec/changes/archive/2026-04-16-add-ios-screenshot/design.md new file mode 100644 index 0000000..d1e571b --- /dev/null +++ b/openspec/changes/archive/2026-04-16-add-ios-screenshot/design.md @@ -0,0 +1,92 @@ +## Context + +VisionTest's iOS automation server runs as an XCUITest UI test process (see `AutomationServerUITest.swift`) exposing a JSON-RPC 2.0 endpoint over HTTP. The MCP server (`app/`) talks to it via `IOSAutomationClient`, which wraps `HttpURLConnection` calls to `/jsonrpc`. All JSON-RPC methods today exchange text-based JSON payloads: UI hierarchies as XML strings, element data as nested dictionaries. + +Screenshots are binary PNG data — tens of KB to a few MB per image. They must cross two boundaries: +1. Simulator (XCUITest process) → Mac host (MCP server) over HTTP/JSON-RPC. +2. 
MCP server → caller's working directory as a PNG file. + +XCUITest provides `XCUIScreen.main.screenshot()` which returns an `XCUIScreenshot` with a `.pngRepresentation: Data` property. This is the canonical simulator capture API; `simctl io screenshot` is an alternative but requires invoking the CLI and knowing the simulator UDID. + +## Goals / Non-Goals + +**Goals:** +- Capture the current iOS simulator display as a PNG using the existing XCUITest bridge. +- Return the file path of the saved PNG to the MCP caller, so the agent can reference it in follow-up operations. +- Allow the caller to specify an output path; default to a sensible location when omitted. +- Keep the transport mechanism consistent with existing JSON-RPC methods (no new endpoints). +- Fail clearly when the server is not running, when the output path is unwritable, or when the screenshot call itself throws. + +**Non-Goals:** +- Android screenshot support (follow-up change). +- Returning the image bytes inline to the MCP caller or as an MCP image content block — initial scope is file-on-disk only. +- Capturing a specific app/window subregion — only the full simulator display. +- Image format options (JPEG, compression level, resizing) — PNG at native resolution only. +- Streaming or chunked transport for very large screenshots. + +## Decisions + +### Decision 1: Encode PNG as base64 over the existing JSON-RPC transport + +**Choice:** The new `ui.screenshot` JSON-RPC method returns `{ "success": true, "pngBase64": "<base64-encoded PNG>" }`. The Kotlin client decodes and writes to disk. + +**Alternatives considered:** +- **Separate binary HTTP endpoint (`GET /screenshot`)**: Avoids base64 overhead (~33%) but doubles the surface area of the automation server, requires new routing/content-type handling in Swifter, and diverges from the single-method JSON-RPC dispatch pattern established by every other operation. 
Not worth it at typical screenshot sizes (a 1290×2796 iPhone 15 Pro Max screenshot is ~1–2 MB raw, ~1.3–2.7 MB base64). +- **`simctl io <udid> screenshot <path>` from the Kotlin side**: Bypasses the automation server entirely and writes directly on the host. Simpler transport, but requires knowing the booted simulator's UDID, doesn't go through the "server must be running" check, and splits the iOS automation API across two mechanisms. Rejected to keep a single coherent surface. + +**Rationale:** Base64-over-JSON-RPC preserves architectural consistency, works with the existing Swifter JSON response helper, and the size overhead is acceptable for screenshots at this scale. + +### Decision 2: File is written by the Kotlin MCP tool, not by the Swift side + +**Choice:** The Swift bridge returns only the encoded bytes. The Kotlin tool handler decodes base64 and writes the file. + +**Alternatives considered:** +- **Write the file inside the XCUITest process**: The simulator and host share the filesystem in many cases but not reliably — `XCUIScreenshot` runs in the simulator's sandbox, so paths like `~/.local/share/visiontest/screenshots/foo.png` would resolve relative to the simulator's home, not the Mac's. Cross-process path translation is error-prone. + +**Rationale:** Writing on the host keeps path semantics unambiguous (caller provides a host path, host writes it). + +### Decision 3: Output path parameter — optional, default is CWD-relative + +**Choice:** `outputPath` is optional. When omitted, default to `./screenshots/ios_screenshot_<timestamp>.png` resolved against the MCP server's current working directory. When provided, the path is used as-is (both absolute and CWD-relative paths are supported). + +**Alternatives considered:** +- **Default under `VISIONTEST_DIR`** (initial design): placed screenshots in the visiontest install directory (`~/.local/share/visiontest/screenshots/`). 
Rejected because a coding agent taking a screenshot of the iOS simulator is almost always working on a *user project*, not on visiontest itself. Screenshots belong with the project that prompted them. +- **Require outputPath explicitly (no default)**: safer but forces every caller to think about path layout. Too much ceremony for the common case. +- **A fixed absolute path like `/tmp/`**: loses the "screenshots live with the project" property and is ephemeral across reboots on some platforms. + +**Rationale:** Coding agents (Claude Code, Codex, etc.) launch MCP servers with CWD set to the project they're working on. A CWD-relative default lands the PNG in the user's project directory, where the agent and user can easily reference it. The timestamped filename prevents overwrites when the agent takes multiple screenshots without specifying a path. + +### Decision 4: Path safety — create parent directories, trust caller-supplied paths + +**Choice:** If the caller supplies an explicit `outputPath`, trust it (the MCP server already has filesystem access under its user). Create any missing parent directories. + +**Rationale:** The MCP server runs as the user; restricting caller-supplied paths would be surprising and add little defense-in-depth. The CWD-relative default lands inside the project the agent is working on. + +### Decision 5: New `ScreenshotResult` model mirroring existing result types + +**Choice:** Add `ScreenshotResult { success: Bool, pngBase64: String?, error: String? }` to `AutomationModels.swift` with a `toDictionary()` method following the same pattern as `UiHierarchyResult`. + +**Rationale:** Consistency with the existing models; `toDictionary()` pattern is already covered by unit tests in `AutomationModelsTests.swift`. + +### Decision 6: Timeout and MCP tool registration + +**Choice:** Register in `IOSAutomationToolRegistrar` alongside other iOS tools. 
Use `timeoutMs = 30000` (same as `ios_get_ui_hierarchy`) because `XCUIScreen.main.screenshot()` typically returns in <1 second but base64 encoding + HTTP transport of a multi-MB payload warrants headroom. + +**Rationale:** Matches the pattern for other potentially large-payload tools. + +## Risks / Trade-offs + +- **Risk:** Base64 payloads for very large screenshots (iPad 13" in 2x mode approaches 8–10 MB raw → ~13 MB base64) may stress `HttpURLConnection` default buffer behavior or the MCP stdio transport. + - **Mitigation:** Initial scope is iPhone-sized simulators (<3 MB base64). Document the iPad limitation as a known issue; address with a chunked or binary endpoint only if it surfaces in practice. + +- **Risk:** The pre-built iOS automation bundle (`ios-automation-server.tar.gz`) downloaded by `install.sh` will not contain the new `ui.screenshot` method until a new release is cut. Users running an older pre-built bundle will get a `methodNotFound` error. + - **Mitigation:** The JSON-RPC server already returns `methodNotFound` for unknown methods (see `JsonRpcServer.executeMethod` default case). The Kotlin tool should surface this cleanly with a message telling the user to upgrade or rebuild from source. A new tagged release will refresh the bundle. + +- **Risk:** Writing to an arbitrary caller-supplied path could clobber existing files. + - **Mitigation:** Document the overwrite behavior in the tool description; do not add a "confirm overwrite" flow (would break the stateless MCP tool contract). Defaults use timestamped filenames to avoid accidental overwrites. + +- **Risk:** The simulator must be unlocked/booted for `XCUIScreen.main.screenshot()` to return valid data. If the simulator is in a weird state (e.g., booting, locked), the call may return a black image rather than an error. + - **Mitigation:** Accept this as a property of XCUITest; the agent can verify via `ios_get_device_info` or by inspecting the returned PNG. No extra validation in this change. 
+ +- **Trade-off:** Not returning the image as an MCP image content block means the agent cannot "see" the screenshot directly in the tool response — it must issue a separate read/view to process the PNG. + - **Justification:** Keeps the Kotlin → MCP surface simple (string results only, like every other tool). An MCP image content response can be added later as a non-breaking enhancement if needed. diff --git a/openspec/changes/archive/2026-04-16-add-ios-screenshot/proposal.md b/openspec/changes/archive/2026-04-16-add-ios-screenshot/proposal.md new file mode 100644 index 0000000..b473af8 --- /dev/null +++ b/openspec/changes/archive/2026-04-16-add-ios-screenshot/proposal.md @@ -0,0 +1,27 @@ +## Why + +AI agents using VisionTest can inspect the iOS simulator's UI hierarchy and interactive elements, but have no way to capture a pixel-accurate image of the current screen. A screenshot capability is essential for visual verification (layout regressions, rendering bugs, image assets) and for situations where the accessibility tree does not fully describe what the user sees (custom-drawn views, web content, media). This change adds a first-class `ios_screenshot` MCP tool that captures the simulator display and saves it as a PNG file on the host machine. + +## What Changes + +- Add a new MCP tool `ios_screenshot` that captures the iOS simulator's current screen and saves it as a PNG to a caller-specified (or defaulted) path on the host filesystem. +- Extend the iOS automation server (XCUITest) with a new JSON-RPC method `ui.screenshot` that returns the PNG bytes as a base64-encoded string using `XCUIScreen.main.screenshot().pngRepresentation`. +- Extend `IOSAutomationClient` with a `screenshot()` suspend function that calls the new JSON-RPC method and returns the base64 payload. +- The tool decodes the base64 payload and writes the PNG bytes to the resolved output path, returning the absolute path in the tool result. 
+- Scope is iOS only — Android support is deferred to a follow-up change. + +## Capabilities + +### New Capabilities +- `ios-screenshot`: Captures a screenshot of the booted iOS simulator display via the XCUITest automation server and saves it as a PNG file on the host. + +### Modified Capabilities + + +## Impact + +- **iOS automation server (`ios-automation-server/`)** — New `screenshot()` method on `XCUITestBridge`, new `ScreenshotResult` model in `AutomationModels.swift`, new `ui.screenshot` case in `JsonRpcServer.executeMethod`. Pre-built bundle consumers (installed via `install.sh`) will need to be rebuilt/re-released for the new method to be available. +- **MCP server (`app/`)** — New `screenshot()` method on `IOSAutomationClient`, new tool registration in `IOSAutomationToolRegistrar`. No changes to shared infrastructure (`ToolFactory`, `ToolScope`, `ErrorHandler`). +- **Tests** — Unit tests for base64 decoding and file write in the Kotlin tool path; Swift unit tests for the new result model's `toDictionary()`. +- **Docs** — `CLAUDE.md` tool table needs a new row; `LEARNING.md` optionally documents the base64-over-JSON-RPC transport decision. +- **External surface** — New JSON-RPC method on the iOS automation server; no breaking changes to existing tools or endpoints. diff --git a/openspec/changes/archive/2026-04-16-add-ios-screenshot/specs/ios-screenshot/spec.md b/openspec/changes/archive/2026-04-16-add-ios-screenshot/specs/ios-screenshot/spec.md new file mode 100644 index 0000000..8e5e87e --- /dev/null +++ b/openspec/changes/archive/2026-04-16-add-ios-screenshot/specs/ios-screenshot/spec.md @@ -0,0 +1,49 @@ +## ADDED Requirements + +### Requirement: MCP tool captures iOS simulator screenshot as PNG +The MCP server SHALL expose a tool named `ios_screenshot` that captures the current booted iOS simulator display and saves the image as a PNG file on the host filesystem. 
+ +#### Scenario: Screenshot saved to caller-supplied path +- **WHEN** an agent invokes `ios_screenshot` with parameter `outputPath` set to an absolute path ending in `.png` +- **THEN** the tool writes the PNG bytes of the current simulator display to that exact path and returns a success message containing the absolute path + +#### Scenario: Screenshot saved to default path when outputPath is omitted +- **WHEN** an agent invokes `ios_screenshot` with no `outputPath` parameter +- **THEN** the tool writes the PNG to `./screenshots/ios_screenshot_<timestamp>.png` resolved against the MCP server's current working directory (the user's project when launched by a coding agent, NOT the visiontest install directory) and returns the absolute path of the saved file + +#### Scenario: Parent directories are created automatically +- **WHEN** `ios_screenshot` is invoked with an `outputPath` whose parent directory does not yet exist +- **THEN** the tool creates all missing parent directories before writing the PNG + +#### Scenario: Server not running +- **WHEN** `ios_screenshot` is invoked and the iOS automation server is not reachable on its configured port +- **THEN** the tool returns an error message instructing the caller to run `ios_start_automation_server` and does NOT write any file + +### Requirement: iOS automation server exposes `ui.screenshot` JSON-RPC method +The iOS automation server SHALL accept a JSON-RPC request with method `ui.screenshot` that captures the current screen via XCUITest and returns the image bytes as a base64-encoded PNG string.
+ +#### Scenario: Successful screenshot capture +- **WHEN** a JSON-RPC client POSTs `{"jsonrpc":"2.0","method":"ui.screenshot","id":1}` to `/jsonrpc` while the simulator is displaying content +- **THEN** the response `result` object contains `success: true` and a non-empty `pngBase64` string whose decoded bytes start with the PNG magic header (`89 50 4E 47 0D 0A 1A 0A`) + +#### Scenario: Screenshot capture failure is reported +- **WHEN** the XCUITest screenshot capture fails (for example, it yields empty PNG data) +- **THEN** the response `result` object contains `success: false`, omits `pngBase64`, and includes an `error` field describing the failure + +#### Scenario: Unknown method rejected +- **WHEN** a client calls the JSON-RPC endpoint with a misspelled method name resembling `ui.screenshot` +- **THEN** the server returns a JSON-RPC error with code `methodNotFound` + +### Requirement: IOSAutomationClient provides a screenshot API +The Kotlin `IOSAutomationClient` SHALL expose a suspend function `screenshot()` that calls the `ui.screenshot` JSON-RPC method and returns the server's raw JSON response string, consistent with other client methods. + +#### Scenario: screenshot() delegates to sendRequest +- **WHEN** `IOSAutomationClient.screenshot()` is called +- **THEN** it invokes `sendRequest("ui.screenshot", null)` and returns the resulting response string unchanged + +### Requirement: Tool description documents outputPath semantics +The `ios_screenshot` tool description SHALL document the optional `outputPath` parameter, the default output location, the overwrite behavior when the path already exists, and the requirement that `ios_start_automation_server` be called first.
+ +#### Scenario: Tool description includes required workflow +- **WHEN** the MCP client lists the `ios_screenshot` tool +- **THEN** the returned description mentions `outputPath` (optional), the default path `./screenshots/` relative to the server's working directory (user's current project), that existing files at the target path will be overwritten, and the prerequisite of a running iOS automation server diff --git a/openspec/changes/archive/2026-04-16-add-ios-screenshot/tasks.md b/openspec/changes/archive/2026-04-16-add-ios-screenshot/tasks.md new file mode 100644 index 0000000..c2bc233 --- /dev/null +++ b/openspec/changes/archive/2026-04-16-add-ios-screenshot/tasks.md @@ -0,0 +1,32 @@ +## 1. iOS Automation Server (Swift) + +- [x] 1.1 Add `ScreenshotResult` struct to `ios-automation-server/IOSAutomationServerUITests/Models/AutomationModels.swift` with fields `success: Bool`, `pngBase64: String?`, `error: String?` and a `toDictionary()` method following the pattern of `UiHierarchyResult` +- [x] 1.2 Add a `screenshot()` method to `XCUITestBridge` that calls `XCUIScreen.main.screenshot().pngRepresentation`, base64-encodes the bytes, and returns a populated `ScreenshotResult` (checks for empty data since the XCUITest calls don't throw) +- [x] 1.3 Add a `case "ui.screenshot":` branch to `JsonRpcServer.executeMethod` that calls `bridge.screenshot().toDictionary()` +- [x] 1.4 Add unit tests for `ScreenshotResult.toDictionary()` in `ios-automation-server/IOSAutomationServerTests/AutomationModelsTests.swift` covering success (pngBase64 present, no error) and failure (error present, no pngBase64) paths + +## 2. Kotlin MCP Client + +- [x] 2.1 Add a `suspend fun screenshot(): String` method to `app/src/main/kotlin/com/example/visiontest/ios/IOSAutomationClient.kt` that calls `sendRequest("ui.screenshot", null)` and returns the raw response + +## 3. 
MCP Tool Registration + +- [x] 3.1 In `app/src/main/kotlin/com/example/visiontest/tools/IOSAutomationToolRegistrar.kt`, add a `registerScreenshot(scope)` call to `registerTools()` and implement a private `registerScreenshot` method that: + - Registers tool name `ios_screenshot` with `timeoutMs = 30000` + - Documents the optional `outputPath` parameter, default path `./screenshots/` relative to the MCP server's working directory, overwrite behavior, and the `ios_start_automation_server` prerequisite in the description + - Checks `iosAutomationClient.isServerRunning()` and returns the standard "server not running" message if false + - Calls `iosAutomationClient.screenshot()`, parses the JSON response to extract `result.pngBase64`, and returns an informative error if `success` is false or `pngBase64` is missing +- [x] 3.2 Resolve the output path: if the caller provided `outputPath`, use it as-is; otherwise build `./screenshots/ios_screenshot_<timestamp>.png` resolved against the MCP server's current working directory (the user's project), using `java.time.LocalDateTime` with a matching formatter — NOT under `VISIONTEST_DIR` which would incorrectly point at the visiontest install dir +- [x] 3.3 Create parent directories (`Files.createDirectories`), base64-decode the `pngBase64` string, and write the bytes atomically to the resolved path; return a success message containing the absolute path +- [x] 3.4 Add unit test `app/src/test/.../IOSScreenshotToolTest.kt` covering: (a) default path contains timestamped filename under screenshots/, (b) base64 is decoded correctly and written as bytes (compare against a known small PNG fixture), (c) parent directories are created when missing, (d) server-not-running short-circuits with no file write, (e) `success: false` in the response surfaces as an error and writes no file + +## 4.
Documentation + +- [x] 4.1 Add an `ios_screenshot` row to the "UI Automation (iOS)" tool table in `CLAUDE.md` with a one-line description matching the tool registration +- [x] 4.2 Verify the new tool appears in the typical automation workflow section if needed (probably not, since screenshots are orthogonal to the tap/swipe flow) — confirmed: screenshots are orthogonal to the tap/swipe workflow, so no update needed there + +## 5. Verification + +- [x] 5.1 Run `./gradlew :app:test` and ensure all Kotlin tests (including the new `IOSScreenshotToolTest`) pass +- [x] 5.2 Run `xcodebuild test -project ios-automation-server/IOSAutomationServer.xcodeproj -scheme IOSAutomationServer -destination 'platform=iOS Simulator,name=iPhone 17' -only-testing:IOSAutomationServerTests` and ensure the new model test passes (used iPhone 17 — iPhone 16 not installed on this machine; all 69 tests pass including the 2 new `ScreenshotResult` tests) +- [x] 5.3 Manual end-to-end check: start the iOS automation server on a booted simulator, invoke `ios_screenshot` via the MCP server (or a direct JSON-RPC curl), and confirm the PNG file opens and renders the simulator's current display at native resolution — verified by user diff --git a/openspec/specs/ios-screenshot/spec.md b/openspec/specs/ios-screenshot/spec.md new file mode 100644 index 0000000..8e5e87e --- /dev/null +++ b/openspec/specs/ios-screenshot/spec.md @@ -0,0 +1,49 @@ +## ADDED Requirements + +### Requirement: MCP tool captures iOS simulator screenshot as PNG +The MCP server SHALL expose a tool named `ios_screenshot` that captures the current booted iOS simulator display and saves the image as a PNG file on the host filesystem. 
+ +#### Scenario: Screenshot saved to caller-supplied path +- **WHEN** an agent invokes `ios_screenshot` with parameter `outputPath` set to an absolute path ending in `.png` +- **THEN** the tool writes the PNG bytes of the current simulator display to that exact path and returns a success message containing the absolute path + +#### Scenario: Screenshot saved to default path when outputPath is omitted +- **WHEN** an agent invokes `ios_screenshot` with no `outputPath` parameter +- **THEN** the tool writes the PNG to `./screenshots/ios_screenshot_<timestamp>.png` resolved against the MCP server's current working directory (the user's project when launched by a coding agent, NOT the visiontest install directory) and returns the absolute path of the saved file + +#### Scenario: Parent directories are created automatically +- **WHEN** `ios_screenshot` is invoked with an `outputPath` whose parent directory does not yet exist +- **THEN** the tool creates all missing parent directories before writing the PNG + +#### Scenario: Server not running +- **WHEN** `ios_screenshot` is invoked and the iOS automation server is not reachable on its configured port +- **THEN** the tool returns an error message instructing the caller to run `ios_start_automation_server` and does NOT write any file + +### Requirement: iOS automation server exposes `ui.screenshot` JSON-RPC method +The iOS automation server SHALL accept a JSON-RPC request with method `ui.screenshot` that captures the current screen via XCUITest and returns the image bytes as a base64-encoded PNG string.
+ +#### Scenario: Successful screenshot capture +- **WHEN** a JSON-RPC client POSTs `{"jsonrpc":"2.0","method":"ui.screenshot","id":1}` to `/jsonrpc` while the simulator is displaying content +- **THEN** the response `result` object contains `success: true` and a non-empty `pngBase64` string whose decoded bytes start with the PNG magic header (`89 50 4E 47 0D 0A 1A 0A`) + +#### Scenario: Screenshot capture failure is reported +- **WHEN** the XCUITest screenshot capture fails (for example, it yields empty PNG data) +- **THEN** the response `result` object contains `success: false`, omits `pngBase64`, and includes an `error` field describing the failure + +#### Scenario: Unknown method rejected +- **WHEN** a client calls the JSON-RPC endpoint with a misspelled method name resembling `ui.screenshot` +- **THEN** the server returns a JSON-RPC error with code `methodNotFound` + +### Requirement: IOSAutomationClient provides a screenshot API +The Kotlin `IOSAutomationClient` SHALL expose a suspend function `screenshot()` that calls the `ui.screenshot` JSON-RPC method and returns the server's raw JSON response string, consistent with other client methods. + +#### Scenario: screenshot() delegates to sendRequest +- **WHEN** `IOSAutomationClient.screenshot()` is called +- **THEN** it invokes `sendRequest("ui.screenshot", null)` and returns the resulting response string unchanged + +### Requirement: Tool description documents outputPath semantics +The `ios_screenshot` tool description SHALL document the optional `outputPath` parameter, the default output location, the overwrite behavior when the path already exists, and the requirement that `ios_start_automation_server` be called first.
+ +#### Scenario: Tool description includes required workflow +- **WHEN** the MCP client lists the `ios_screenshot` tool +- **THEN** the returned description mentions `outputPath` (optional), the default path `./screenshots/` relative to the server's working directory (user's current project), that existing files at the target path will be overwritten, and the prerequisite of a running iOS automation server