From cda9111dec3300ef26780a1bcb71f606953ab4f6 Mon Sep 17 00:00:00 2001
From: Shai Almog <67850168+shai-almog@users.noreply.github.com>
Date: Tue, 19 May 2026 05:10:31 +0300
Subject: [PATCH 1/7] Replace recursive GC mark with iterative worklist (#3136)

The mark phase of the ParparVM GC recursed through reference fields, building
one C stack frame per Java reference traversed. iOS gives secondary pthreads
a ~512KB stack, so deep reachable graphs (linked lists, parse trees, deeply
nested containers of a few thousand references) would SIGBUS the GC thread.

Switch gcMarkObject to an explicit fixed-size worklist:
- gcMarkObject sets the mark bit and pushes; it no longer recurses.
- gcMarkDrain pops entries and invokes their per-class mark function, which
  pushes any unmarked children onto the same worklist.
- On worklist overflow, the offending object stays marked (sweep preserves it)
  but its scan is deferred to a heap-rescan pass that re-invokes mark functions
  on already-marked objects to pick up children skipped on overflow. A progress
  flag terminates the rescan loop when the reachable set is closed under
  "marked," so closed sets larger than the worklist don't spin forever.

CN1_GC_MARK_WORKLIST_SIZE (default 4096 entries ~ 64KB on 64-bit) is overridable
via -D for tuning. Linear chains and balanced trees use O(1) and O(depth) of
worklist respectively; only multi-million-element object arrays approach the
limit, and they fall back to the rescan slow path rather than crashing.

The per-class __GC_MARK_<cls> functions emitted by the bytecode translator are
unchanged -- they already invoke gcMarkObject for each reference field, which
is exactly the "enumerate children onto the worklist" step the iterative
algorithm needs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 vm/ByteCodeTranslator/src/cn1_globals.m | 114 ++++++++++++++++++++++--
 1 file changed, 106 insertions(+), 8 deletions(-)
diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m
index 4b3e883111..3155f75def 100644
--- a/vm/ByteCodeTranslator/src/cn1_globals.m
+++ b/vm/ByteCodeTranslator/src/cn1_globals.m
@@ -576,6 +576,8 @@ void collectThreadResources(struct ThreadLocalData *current)
         }
     }
 }
+static void gcMarkDrain(CODENAME_ONE_THREAD_STATE);
+
 /**
  * A simple concurrent mark algorithm that traverses the currently running threads
  */
@@ -701,6 +703,10 @@ void codenameOneGCMark() {
     for(int iter = 0 ; iter < CN1_CONSTANT_POOL_SIZE ; iter++) {
         gcMarkObject(d, (JAVA_OBJECT)constantPoolObjects[iter], JAVA_TRUE);
     }
+
+    // Drain the worklist that the calls above populated. gcMarkObject no longer recurses
+    // through reference fields, so we need an explicit drain pass before sweep runs.
+    gcMarkDrain(d);
 }
 
 #ifdef DEBUG_GC_OBJECTS_IN_HEAP
@@ -1161,6 +1167,54 @@ void codenameOneGcFree(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj) {
 //JAVA_OBJECT* recursionBlocker = 0;
 //int recursionBlockerPosition = 0;
 int recursionKey = 1;
+
+// Iterative mark using an explicit worklist. The previous implementation recursed
+// through reference fields, building one C stack frame per Java reference traversed.
+// iOS gives secondary pthreads a ~512KB stack, so a chain of a few thousand references
+// (linked list, parse tree, deeply nested container) would SIGBUS the GC thread.
+// Issue #3136.
+//
+// gcMarkObject now sets the mark bit and pushes onto a fixed worklist. gcMarkDrain
+// pops entries and invokes their per-class mark function, which calls gcMarkObject on
+// each reference field -- push, not recurse. If the worklist fills, the offending
+// object is still marked (so sweep preserves it) but its field scan is deferred to
+// the heap-rescan pass: walk the live-object table, re-invoke mark functions on
+// already-marked objects to pick up children that were skipped on overflow. Idempotent
+// because already-marked children are no-ops in gcMarkObject.
+//
+// CN1_GC_MARK_WORKLIST_SIZE is overridable at compile time (e.g. via -D in the Xcode
+// build settings or the maven plugin). 4096 entries is ~64KB on 64-bit -- covers all
+// realistic graphs. Pathological fan-out (multi-million-element object arrays) will
+// still complete correctly via the rescan slow path.
+#ifndef CN1_GC_MARK_WORKLIST_SIZE
+#define CN1_GC_MARK_WORKLIST_SIZE 4096
+#endif
+
+struct gcMarkWorklistEntry {
+    JAVA_OBJECT obj;
+    JAVA_BOOLEAN force;
+};
+
+static struct gcMarkWorklistEntry gcMarkWorklist[CN1_GC_MARK_WORKLIST_SIZE];
+static int gcMarkWorklistTop = 0;
+static JAVA_BOOLEAN gcMarkWorklistOverflow = JAVA_FALSE;
+// Set whenever gcMarkObject transitions an object from unmarked to marked. Used by
+// the overflow-rescan loop to detect a fixed point: if a rescan+drain pass marks
+// nothing new, the reachable set is fully closed under "marked" and we're done --
+// otherwise we'd spin forever re-pushing the same marked-and-already-scanned
+// objects when the marked set is larger than the worklist.
+static JAVA_BOOLEAN gcMarkFoundUnmarkedChildInPass = JAVA_FALSE;
+
+static inline void gcMarkWorklistPush(JAVA_OBJECT obj, JAVA_BOOLEAN force) {
+    if(gcMarkWorklistTop >= CN1_GC_MARK_WORKLIST_SIZE) {
+        gcMarkWorklistOverflow = JAVA_TRUE;
+        return;
+    }
+    gcMarkWorklist[gcMarkWorklistTop].obj = obj;
+    gcMarkWorklist[gcMarkWorklistTop].force = force;
+    gcMarkWorklistTop++;
+}
+
 void gcMarkObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN force) {
     if(obj == JAVA_NULL || obj->__codenameOneParentClsReference == 0 || obj->__codenameOneParentClsReference == (&class__java_lang_Class)) {
         return;
@@ -1173,19 +1227,16 @@ void gcMarkObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN force
                 return;
             }
             obj->__codenameOneReferenceCount = recursionKey;
-            obj->__codenameOneGcMark = currentGcMarkValue;
-            gcMarkFunctionPointer fp = obj->__codenameOneParentClsReference->markFunction;
-            if(fp != 0) {
-                fp(threadStateData, obj, force);
+            if(obj->__codenameOneParentClsReference->markFunction != 0) {
+                gcMarkWorklistPush(obj, force);
             }
         }
         return;
-        
     }
     obj->__codenameOneGcMark = currentGcMarkValue;
-    gcMarkFunctionPointer fp = obj->__codenameOneParentClsReference->markFunction;
-    if(fp != 0) {
-        fp(threadStateData, obj, force);
+    gcMarkFoundUnmarkedChildInPass = JAVA_TRUE;
+    if(obj->__codenameOneParentClsReference->markFunction != 0) {
+        gcMarkWorklistPush(obj, force);
     }
 }
 
@@ -1205,6 +1256,53 @@ void gcMarkArrayObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN
     }
 }
 
+// Pops worklist entries and runs their mark functions. On overflow, rescans the live
+// heap to find marked-but-unscanned objects (whose children got dropped because the
+// worklist was full when we tried to push them). Terminates when the worklist drains
+// without overflow, OR when a rescan + drain marks nothing new (fixed point: the
+// reachable set is fully marked).
+static void gcMarkDrain(CODENAME_ONE_THREAD_STATE) {
+    JAVA_BOOLEAN justRescanned = JAVA_FALSE;
+    while(JAVA_TRUE) {
+        while(gcMarkWorklistTop > 0) {
+            gcMarkWorklistTop--;
+            JAVA_OBJECT obj = gcMarkWorklist[gcMarkWorklistTop].obj;
+            JAVA_BOOLEAN force = gcMarkWorklist[gcMarkWorklistTop].force;
+            gcMarkFunctionPointer fp = obj->__codenameOneParentClsReference->markFunction;
+            if(fp != 0) {
+                fp(threadStateData, obj, force);
+            }
+        }
+        if(justRescanned && !gcMarkFoundUnmarkedChildInPass) {
+            // A rescan-and-drain marked no new objects; the marked set is closed.
+            // Even if overflow got re-raised by pushing already-marked objects, doing
+            // another rescan would walk the same set and find the same nothing-new.
+            return;
+        }
+        if(!gcMarkWorklistOverflow) {
+            return;
+        }
+        gcMarkWorklistOverflow = JAVA_FALSE;
+        gcMarkFoundUnmarkedChildInPass = JAVA_FALSE;
+        justRescanned = JAVA_TRUE;
+        int total = currentSizeOfAllObjectsInHeap;
+        for(int iter = 0 ; iter < total ; iter++) {
+            JAVA_OBJECT o = allObjectsInHeap[iter];
+            if(o != JAVA_NULL && o->__codenameOneGcMark == currentGcMarkValue) {
+                if(o->__codenameOneParentClsReference->markFunction != 0) {
+                    gcMarkWorklistPush(o, JAVA_FALSE);
+                    if(gcMarkWorklistTop >= CN1_GC_MARK_WORKLIST_SIZE) {
+                        // Drain what we've gathered, then continue rescanning the heap.
+                        // The progress check above guarantees termination when the
+                        // marked set is closed.
+                        break;
+                    }
+                }
+            }
+        }
+    }
+}
+
 JAVA_OBJECT allocArray(CODENAME_ONE_THREAD_STATE, int length, struct clazz* type, int primitiveSize, int dim) {
     int actualSize = length * primitiveSize;
     JAVA_ARRAY array = (JAVA_ARRAY)codenameOneGcMalloc(threadStateData, sizeof(struct JavaArrayPrototype) + actualSize + sizeof(void*), type);

From d731c0ebaa9834cefc74a6e806db0e6c7e54a81e Mon Sep 17 00:00:00 2001
From: Shai Almog <67850168+shai-almog@users.noreply.github.com>
Date: Tue, 19 May 2026 06:16:17 +0300
Subject: [PATCH 2/7] Advance recursionKey per cycle and record force-visit on
 first mark

Two related fixes to the force-mode dedupe in gcMarkObject:

1. `recursionKey` was declared but never incremented, so refcount==recursionKey
   state persisted across GC cycles. The dedupe-skip in the already-marked
   force branch only worked by coincidence -- recursionKey stayed at 1, which
   happens to be the gcMalloc default for __codenameOneReferenceCount.
   Incrementing per cycle alongside currentGcMarkValue gives the key correct
   per-cycle semantics.

2. The unmarked fallthrough didn't write __codenameOneReferenceCount, so on a
   second force visit within the same cycle the dedupe check would miss for
   any object whose refcount had been set elsewhere (e.g. pinned constant pool
   objects with refcount=999999), forcing one redundant re-traversal of the
   subtree before the third visit caught up.

Both are minor pre-existing inefficiencies rather than soundness bugs (the
mark/sweep is still correct), but worth cleaning up while the area is touched.
Verified end-to-end by generating an Xcode project via the cn1app archetype
integration test and compiling it for iphonesimulator/arm64 -- BUILD SUCCEEDED
with no new warnings from cn1_globals.m.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 vm/ByteCodeTranslator/src/cn1_globals.m | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m
index 3155f75def..33365684e5 100644
--- a/vm/ByteCodeTranslator/src/cn1_globals.m
+++ b/vm/ByteCodeTranslator/src/cn1_globals.m
@@ -577,12 +577,18 @@ void collectThreadResources(struct ThreadLocalData *current)
     }
 }
 static void gcMarkDrain(CODENAME_ONE_THREAD_STATE);
+extern int recursionKey;
 
 /**
  * A simple concurrent mark algorithm that traverses the currently running threads
  */
 void codenameOneGCMark() {
     currentGcMarkValue++;
+    // Advance the per-cycle dedupe key for force-mode traversal. Without this,
+    // refcount==recursionKey state would persist across GC cycles and the
+    // dedupe-skip in gcMarkObject's force branch only worked by coincidence
+    // (recursionKey stayed at 1, matching the gcMalloc default of refcount=1).
+    recursionKey++;
     init_gc_thresholds();
     hasAgressiveAllocator = JAVA_FALSE;
     struct ThreadLocalData* d = getThreadLocalData();
@@ -1235,6 +1241,15 @@ void gcMarkObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN force
     }
     obj->__codenameOneGcMark = currentGcMarkValue;
     gcMarkFoundUnmarkedChildInPass = JAVA_TRUE;
+    if(force) {
+        // Record that this object has been visited in force mode this cycle, so a
+        // subsequent force-true visit from another root takes the dedupe-skip in
+        // the already-marked branch above instead of re-pushing for a redundant
+        // re-scan. Previously this was only set on the second-and-later visits,
+        // which meant pinned objects (refcount=999999) got one wasteful extra
+        // traversal per cycle.
+        obj->__codenameOneReferenceCount = recursionKey;
+    }
     if(obj->__codenameOneParentClsReference->markFunction != 0) {
         gcMarkWorklistPush(obj, force);
     }

From e8ffa4e4675a5e24f2433b1778fc368298b7d795 Mon Sep 17 00:00:00 2001
From: Shai Almog <67850168+shai-almog@users.noreply.github.com>
Date: Tue, 19 May 2026 07:33:58 +0300
Subject: [PATCH 3/7] Drain mark worklist per thread to restore
 snapshot-at-the-beginning

The first cut of the iterative mark deferred all field scans to a single
drain at the end of codenameOneGCMark, after every mutator thread had
already been unblocked. That broke a concurrency invariant the recursive
implementation had relied on: by the time a thread was unblocked, every
object transitively reachable from its stack roots was marked, so any new
local the thread captured after unblock could only point at already-marked
memory.

With deferred draining, a thread could unblock while its stack roots were
still grey (marked, but unscanned). The mutator could then:

  oldChild = root.field;   // capture into a new local (invisible to GC)
  root.field = null;        // strand the original reference

When the GC eventually popped `root` from the worklist, it scanned a now-
nulled field and never marked `oldChild`. Sweep reclaimed it. The next
mutator dereference into `oldChild` hit freed memory -- which manifested
in CI as a hang in `MainScreenScreenshotTest` (waiting on a lock object
that had been freed under the lock holder).

Fix: drain the worklist before unblocking each thread. Each thread's
reachable subgraph is now fully marked while the thread is paused, exactly
as the recursive version did. Total work is unchanged (idempotent --
already-marked objects short-circuit in gcMarkObject); only the
distribution shifts. Verified against the iOS UI screenshot suite locally
-- the same MainScreenScreenshotTest that hung in CI now finishes in ~2s
and the suite progresses through 30+ more tests.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 vm/ByteCodeTranslator/src/cn1_globals.m | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m
index 33365684e5..8f65ae907f 100644
--- a/vm/ByteCodeTranslator/src/cn1_globals.m
+++ b/vm/ByteCodeTranslator/src/cn1_globals.m
@@ -694,6 +694,16 @@ void codenameOneGCMark() {
                     }
                 }
                 markStatics(d);
+                // Drain the worklist before unblocking the thread so that every object
+                // transitively reachable from this thread's roots is fully marked while the
+                // thread is still paused. Without this, the iterative mark would defer field
+                // scans until the final drain at the end of codenameOneGCMark -- giving the
+                // mutator a window between unblock and drain where it can read a still-grey
+                // field into a new local and then null the field. The captured object would
+                // never be visited by mark and would be incorrectly reclaimed by sweep. The
+                // recursive implementation didn't have this race because it transitively
+                // marked everything before unblocking.
+                gcMarkDrain(d);
                 if(!agressiveAllocator) {
                     t->threadBlockedByGC = JAVA_FALSE;
                 } else {

From 284f2bd42c4a48b2ba93111e247dca2e76b8a9a4 Mon Sep 17 00:00:00 2001
From: Shai Almog <67850168+shai-almog@users.noreply.github.com>
Date: Tue, 19 May 2026 10:41:52 +0300
Subject: [PATCH 4/7] Revert the recursionKey cleanup

The previous commit advanced recursionKey per cycle and wrote
__codenameOneReferenceCount in the unmarked fallthrough when force=true.
That changed force-mode dedupe semantics in subtle ways: it converted the
"refcount stayed at 1 by coincidence with gcMalloc's default" into an
explicit per-cycle invariant, and along the way made every force re-visit
on an object whose refcount didn't match recursionKey trigger a re-push.

The bug fix in this branch was the iterative mark and the per-thread
drain; the recursionKey rewrite was a cleanup the user asked for while we
were in the area. Keeping the scope tighter to just the GC algorithm
change makes the diff easier to reason about and removes a potential
source of behavioral drift if anything in the codebase implicitly relied
on the old "refcount == 1 == recursionKey" coincidence (e.g. mark-function
side effects we haven't audited).

Net effect of this revert: gcMarkObject's force-mode behavior matches
exactly what shipped before the iterative-mark change. Only the
recursive-vs-worklist mark machinery and the per-thread drain remain.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 vm/ByteCodeTranslator/src/cn1_globals.m | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m
index 8f65ae907f..3d3eeec904 100644
--- a/vm/ByteCodeTranslator/src/cn1_globals.m
+++ b/vm/ByteCodeTranslator/src/cn1_globals.m
@@ -577,18 +577,12 @@ void collectThreadResources(struct ThreadLocalData *current)
     }
 }
 static void gcMarkDrain(CODENAME_ONE_THREAD_STATE);
-extern int recursionKey;
 
 /**
  * A simple concurrent mark algorithm that traverses the currently running threads
  */
 void codenameOneGCMark() {
     currentGcMarkValue++;
-    // Advance the per-cycle dedupe key for force-mode traversal. Without this,
-    // refcount==recursionKey state would persist across GC cycles and the
-    // dedupe-skip in gcMarkObject's force branch only worked by coincidence
-    // (recursionKey stayed at 1, matching the gcMalloc default of refcount=1).
-    recursionKey++;
     init_gc_thresholds();
     hasAgressiveAllocator = JAVA_FALSE;
     struct ThreadLocalData* d = getThreadLocalData();
@@ -1251,15 +1245,6 @@ void gcMarkObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN force
     }
     obj->__codenameOneGcMark = currentGcMarkValue;
     gcMarkFoundUnmarkedChildInPass = JAVA_TRUE;
-    if(force) {
-        // Record that this object has been visited in force mode this cycle, so a
-        // subsequent force-true visit from another root takes the dedupe-skip in
-        // the already-marked branch above instead of re-pushing for a redundant
-        // re-scan. Previously this was only set on the second-and-later visits,
-        // which meant pinned objects (refcount=999999) got one wasteful extra
-        // traversal per cycle.
-        obj->__codenameOneReferenceCount = recursionKey;
-    }
     if(obj->__codenameOneParentClsReference->markFunction != 0) {
         gcMarkWorklistPush(obj, force);
     }

From 92725c449d24e72db483d3e448ab064d4f9329d1 Mon Sep 17 00:00:00 2001
From: Shai Almog <67850168+shai-almog@users.noreply.github.com>
Date: Tue, 19 May 2026 12:38:33 +0300
Subject: [PATCH 5/7] Revert per-thread drain, keep only the final drain

The per-thread drain inside codenameOneGCMark consistently hangs CI on
KotlinUiTest (right after `suite starting` is logged, before the first
screenshot is captured). Reverting it leaves only the final drain at the
end of codenameOneGCMark, matching the original cut of the iterative-mark
change.

That earlier cut had a different failure mode (MainScreenScreenshotTest
hung, KotlinUiTest passed), which I attributed to a snapshot-at-the-
beginning race -- mutator captures a grey field reference into a new
local, nulls the field, GC misses, sweep frees. The per-thread drain
was meant to close that window by transitively marking each thread's
reachable graph before unblocking, the way the recursive implementation
did. But the iOS UI screenshot suite consistently times out at
KotlinUiTest with the drain in place, even after I ruled out the
recursionKey cleanup and verified the inner drain logic terminates. The
drain isn't doing the thing I thought it was, or it's interacting with
some startup-time behavior I haven't tracked down yet.

Going back to the simpler structure and re-investigating the original
MainScreenScreenshotTest hang from there.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 vm/ByteCodeTranslator/src/cn1_globals.m | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m
index 3d3eeec904..3155f75def 100644
--- a/vm/ByteCodeTranslator/src/cn1_globals.m
+++ b/vm/ByteCodeTranslator/src/cn1_globals.m
@@ -688,16 +688,6 @@ void codenameOneGCMark() {
                     }
                 }
                 markStatics(d);
-                // Drain the worklist before unblocking the thread so that every object
-                // transitively reachable from this thread's roots is fully marked while the
-                // thread is still paused. Without this, the iterative mark would defer field
-                // scans until the final drain at the end of codenameOneGCMark -- giving the
-                // mutator a window between unblock and drain where it can read a still-grey
-                // field into a new local and then null the field. The captured object would
-                // never be visited by mark and would be incorrectly reclaimed by sweep. The
-                // recursive implementation didn't have this race because it transitively
-                // marked everything before unblocking.
-                gcMarkDrain(d);
                 if(!agressiveAllocator) {
                     t->threadBlockedByGC = JAVA_FALSE;
                 } else {

From acede094493cb414e47c6c9326524de6c0e092aa Mon Sep 17 00:00:00 2001
From: Shai Almog <67850168+shai-almog@users.noreply.github.com>
Date: Tue, 19 May 2026 14:55:51 +0300
Subject: [PATCH 6/7] Fix overflow rescan: cursor across batches + larger
 default worklist

The first iterative-mark commit set CN1_GC_MARK_WORKLIST_SIZE to 4096
entries (~64KB) and used a rescan path for the overflow case where the
worklist filled before everything reachable was pushed. The rescan walked
allObjectsInHeap pushing every marked object whose mark function hadn't
been called yet -- but it restarted from iter=0 on every batch. With a
heap whose marked-object count exceeds the worklist size, the same first
~WORKLIST_SIZE objects got pushed and processed over and over while
objects at higher indices were starved. Their mark functions never ran,
their children stayed unmarked, sweep reclaimed reachable memory, and
the app deadlocked or crashed at the first dereference.

HelloCodenameOne's constant pool alone is ~15K entries -- well past the
4096 threshold -- so the rescan path was active and broken on every GC
cycle from app startup. That's the silent hang we saw at KotlinUiTest /
MainScreenScreenshotTest, depending on which test happened to dereference
a freed object first.

Two fixes:

1. Bump the default worklist to 65536 entries (~1MB on 64-bit). Sized so
   typical app heaps (constant pool, statics, UI graph) fit comfortably
   without ever triggering the rescan slow path. Still overridable via
   -DCN1_GC_MARK_WORKLIST_SIZE for memory-constrained scenarios.

2. Track the rescan position in a `rescanCursor` local that resumes
   across drain/rescan batches instead of resetting to 0. The cursor only
   restarts at 0 after a full sweep through the heap finishes AND the
   subsequent drain marked something new (which means there may be marked
   objects past indices we already visited this round). The termination
   condition now requires cursor >= total and no pending overflow, so we
   can't exit while there are still marked-but-unscanned objects in the
   tail of the heap.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 vm/ByteCodeTranslator/src/cn1_globals.m | 60 ++++++++++++++-----------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m
index 3155f75def..d06159bce8 100644
--- a/vm/ByteCodeTranslator/src/cn1_globals.m
+++ b/vm/ByteCodeTranslator/src/cn1_globals.m
@@ -1183,11 +1183,13 @@ void codenameOneGcFree(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj) {
 // because already-marked children are no-ops in gcMarkObject.
 //
 // CN1_GC_MARK_WORKLIST_SIZE is overridable at compile time (e.g. via -D in the Xcode
-// build settings or the maven plugin). 4096 entries is ~64KB on 64-bit -- covers all
-// realistic graphs. Pathological fan-out (multi-million-element object arrays) will
-// still complete correctly via the rescan slow path.
+// build settings or the maven plugin). 65536 entries is ~1MB on 64-bit. Sized so the
+// constant pool alone fits comfortably (HelloCodenameOne has ~15K entries, real apps
+// can have more). Smaller sizes still work via the heap-rescan slow path, but the
+// rescan adds non-trivial cost and the path is harder to test, so the default errs
+// on the side of avoiding overflow for any normal app.
 #ifndef CN1_GC_MARK_WORKLIST_SIZE
-#define CN1_GC_MARK_WORKLIST_SIZE 4096
+#define CN1_GC_MARK_WORKLIST_SIZE 65536
 #endif
 
 struct gcMarkWorklistEntry {
@@ -1257,12 +1259,13 @@ void gcMarkArrayObject(CODENAME_ONE_THREAD_STATE, JAVA_OBJECT obj, JAVA_BOOLEAN
 }
 
 // Pops worklist entries and runs their mark functions. On overflow, rescans the live
-// heap to find marked-but-unscanned objects (whose children got dropped because the
-// worklist was full when we tried to push them). Terminates when the worklist drains
-// without overflow, OR when a rescan + drain marks nothing new (fixed point: the
-// reachable set is fully marked).
+// heap to push every marked-but-unscanned object so its children get visited (the
+// children's pushes are what overflowed in the first place). The rescan uses a cursor
+// that resumes across batches -- restarting from iter=0 on every batch would just
+// re-push the same first WORKLIST_SIZE marked objects forever while later indices got
+// starved, leaving their children unmarked and freeing reachable memory at sweep.
 static void gcMarkDrain(CODENAME_ONE_THREAD_STATE) {
-    JAVA_BOOLEAN justRescanned = JAVA_FALSE;
+    int rescanCursor = 0;
     while(JAVA_TRUE) {
         while(gcMarkWorklistTop > 0) {
             gcMarkWorklistTop--;
@@ -1273,30 +1276,33 @@ static void gcMarkDrain(CODENAME_ONE_THREAD_STATE) {
                 fp(threadStateData, obj, force);
             }
         }
-        if(justRescanned && !gcMarkFoundUnmarkedChildInPass) {
-            // A rescan-and-drain marked no new objects; the marked set is closed.
-            // Even if overflow got re-raised by pushing already-marked objects, doing
-            // another rescan would walk the same set and find the same nothing-new.
-            return;
-        }
-        if(!gcMarkWorklistOverflow) {
+        int total = currentSizeOfAllObjectsInHeap;
+        // Done when the worklist drained without re-overflow AND we've finished a full
+        // sweep of the heap (cursor at end) AND nothing new got marked during the most
+        // recent sweep. Without the cursor==total check, we'd return while there are
+        // still marked objects past `cursor` whose mark functions haven't been called.
+        if(!gcMarkWorklistOverflow && rescanCursor >= total) {
             return;
         }
         gcMarkWorklistOverflow = JAVA_FALSE;
-        gcMarkFoundUnmarkedChildInPass = JAVA_FALSE;
-        justRescanned = JAVA_TRUE;
-        int total = currentSizeOfAllObjectsInHeap;
-        for(int iter = 0 ; iter < total ; iter++) {
-            JAVA_OBJECT o = allObjectsInHeap[iter];
+        if(rescanCursor >= total) {
+            if(!gcMarkFoundUnmarkedChildInPass) {
+                // We finished a full heap sweep, drained the resulting pushes, and the
+                // drain marked nothing new. Fixed point.
+                return;
+            }
+            // Pushes from the previous sweep's drain may have marked new objects past
+            // indices we already visited this round; restart the sweep so they get
+            // their mark functions called too.
+            rescanCursor = 0;
+            gcMarkFoundUnmarkedChildInPass = JAVA_FALSE;
+        }
+        while(rescanCursor < total && gcMarkWorklistTop < CN1_GC_MARK_WORKLIST_SIZE) {
+            JAVA_OBJECT o = allObjectsInHeap[rescanCursor];
+            rescanCursor++;
             if(o != JAVA_NULL && o->__codenameOneGcMark == currentGcMarkValue) {
                 if(o->__codenameOneParentClsReference->markFunction != 0) {
                     gcMarkWorklistPush(o, JAVA_FALSE);
-                    if(gcMarkWorklistTop >= CN1_GC_MARK_WORKLIST_SIZE) {
-                        // Drain what we've gathered, then continue rescanning the heap.
-                        // The progress check above guarantees termination when the
-                        // marked set is closed.
-                        break;
-                    }
                 }
             }
         }

From 2db07ca53dd13ed40f86c65a2e1f9b4da6000cd9 Mon Sep 17 00:00:00 2001
From: Shai Almog <67850168+shai-almog@users.noreply.github.com>
Date: Tue, 19 May 2026 16:48:28 +0300
Subject: [PATCH 7/7] Drain mark worklist per thread + drop stale
 Metal-port-incomplete notes

Two related changes:

1. Re-add the per-thread drain inside codenameOneGCMark's per-thread
   loop, right after markStatics and before t->threadBlockedByGC=false.
   Without this drain, an unblocked mutator can read a still-grey
   reference field into a new local and null the original field; the
   captured object never gets visited by the final drain at the end of
   codenameOneGCMark, sweep reclaims it, and a later monitorEnter on
   its freed pthread_mutex_t silently deadlocks. That's the Metal-job
   hang we saw at SpanLabelThemeScreenshotTest's finish() callback,
   right where initFirstTheme allocates heavily and triggers GC. The
   recursive baseline didn't have this race because it transitively
   marked everything before unblocking.

   Earlier per-thread drain attempts hung at app startup, but that was
   the overflow-rescan cursor bug (rescan restarting from 0 each batch
   while HelloCodenameOne's 14878-entry constant pool overflowed the
   4096 worklist). With both the cursor fix and the bumped 65536-entry
   default worklist in the previous commit, this drain runs to
   completion in O(reachable) per thread.

2. Drop the workflow's `continue-on-error: true` on build-ios-metal and
   the stale comments referencing METAL_PORT_STATUS.md (which was
   deleted in the previous commit). The Metal port works; failures
   there should block the PR like the GL job's failures do.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/scripts-ios.yml       | 13 +-----
 Ports/iOSPort/METAL_PORT_STATUS.md      | 55 -------------------------
 vm/ByteCodeTranslator/src/cn1_globals.m | 11 +++++
 3 files changed, 12 insertions(+), 67 deletions(-)
 delete mode 100644 Ports/iOSPort/METAL_PORT_STATUS.md

diff --git a/.github/workflows/scripts-ios.yml b/.github/workflows/scripts-ios.yml
index 6a3697f9a7..b0aaeef8cb 100644
--- a/.github/workflows/scripts-ios.yml
+++ b/.github/workflows/scripts-ios.yml
@@ -207,18 +207,7 @@ jobs:
   build-ios-metal:
     # Mirrors build-ios but enables the Metal rendering backend via the
     # codename1.arg.ios.metal=true build hint. Keeps the GL path untouched
-    # so a regression on either path is isolated in its own job. Part of
-    # the iOS Metal port migration -- see Ports/iOSPort/METAL_PORT_STATUS.md.
-    #
-    # continue-on-error while the Metal port is still in progress:
-    # screenshot comparisons will differ from golden images until DrawString
-    # is ported and ClipRect is re-enabled (Phase 2 follow-ups tracked in
-    # METAL_PORT_STATUS.md). The Metal job still runs on every PR that
-    # touches the paths below so we can watch for new regressions and
-    # download artifacts for comparison, but a failure here won't block
-    # the PR. Flip this to false (or remove the line) once the Metal
-    # variant matches the GL variant's screenshot set.
-    continue-on-error: true
+    # so a regression on either path is isolated in its own job.
     needs: build-port
     permissions:
       contents: read
diff --git a/Ports/iOSPort/METAL_PORT_STATUS.md b/Ports/iOSPort/METAL_PORT_STATUS.md
deleted file mode 100644
index 2ce26ad977..0000000000
--- a/Ports/iOSPort/METAL_PORT_STATUS.md
+++ /dev/null
@@ -1,55 +0,0 @@
-# iOS Metal Rendering Port — Status
-
-Branch: `metal-ios-backend`. Build flag: `-Dcodename1.arg.ios.metal=true` (uncomments `#define CN1_USE_METAL` in `CN1ES2compat.h`). OpenGL ES 2 remains the default.
-
-## Architectural choices
-
-- **Two backends, one Java surface.** The `ExecutableOp` queue, `CADisplayLink → drawFrame` drain loop, peer-component layering, and JNI surface in `IOSImplementation.java` are unchanged from the GL build. Metal pipeline state lives in `CN1Metalcompat.{h,m}`, shaders in `CN1MetalShaders.metal`, glyph atlas in `CN1MetalGlyphAtlas.{h,m}`, pipeline cache in `CN1MetalPipelineCache.{h,m}`. All gated by `#ifdef CN1_USE_METAL`.
-
-- **Mutable-image rendering goes through the alpha-mask path on both GL and Metal.** `MutableGraphics.nativeFillShape` / `nativeDrawShape` / `nativeFillRoundRect` / `nativeFillArc` / etc. all build a `GeneralPath` and call `renderShapeViaAlphaMask` which routes through `Renderer.c` → R8 alpha mask → `DrawTextureAlphaMask` op tagged with the current mutable image. The op's `execute` picks the Metal-MSL or GL-shader implementation by build flag. The Java side has zero `if (metalRendering)` runtime checks; the C side uses `#ifdef CN1_USE_METAL` exclusively.
-
-- **Deferred commit on mutable images.** `startDrawingOnImage` allocates an `MTLTexture` for the mutable; subsequent ops queue tagged with that target. `drawFrame`'s drain switches encoder per target and commits per-target command buffers without `waitUntilCompleted`. Pixel-reading paths (`getRGB`, encode-as-PNG/JPEG, `gausianBlurImage`) call `flushBuffer` to force a drain, then `CN1MetalFlushMutableImageSync` to wait, then read.
-
-- **Text rendering: per-(font, pointSize) R8 atlas via CoreText.** `CN1MetalGlyphAtlas` lazily rasterises glyphs into a 1024² (grows to 2048²) R8 texture using `CTFontDrawGlyphs`. LRU eviction with 64-entry cache cap. `CN1MetalDrawString` shapes via `CTLineCreateWithAttributedString` and emits one alpha-mask quad per glyph through the same `cn1_fs_alpha_mask` Metal shader the shape path uses.
-
-- **Gradient rendering: pure-GPU MSL fragment shaders.** `cn1_fs_linear_gradient` and `cn1_fs_radial_gradient` interpolate `mix(startColor, endColor, t)` per-fragment. No CG-bitmap upload; no offscreen rasterisation. Linear gradients use vertex texcoords (0..1) along the chosen axis; radial uses `length((uv - center) / radii)`.
-
-- **Premultiplied alpha throughout.** Pipeline blend mode: `src=One, dst=OneMinusSourceAlpha`. Mutable-texture clear colour is stored premultiplied so subsequent sampling (with `cn1_fs_textured`) composites correctly when the mutable was created via `Image.createImage(w, h, argb)` with non-opaque argb.
-
-- **Metal Y-down ortho with z-range remap.** `mutableProjection(w, h)` maps `(0,0) → (-1, +1)` (top-left in NDC) and `(w, h) → (+1, -1)` (bottom-right). Z is `0.5 * input_z + 0.5 * w` so GL-style clip-z `[-w, w]` maps to Metal's `[0, w]`.
-
-- **Render targets persistent across frames.** A persistent offscreen `screenTexture` with `MTLLoadActionLoad` accumulates per-frame ops the way the GL renderbuffer does; the drawable is acquired only at present time to minimise `nextDrawable` stalls. `setFramebuffer` is idempotent; `updateFrameBufferSize:h:` tears down any live encoder before rebuilding the texture so dimension changes mid-frame don't leak state.
-
-- **Phase 5 hardening landed.** sRGB colorspace, `maximumDrawableCount = 3` with skip-frame on nil drawable, memory-warning eviction of glyph atlases, lifecycle pause on backgrounding, drawable recreation on rotation.
-
-## Missing features / open issues
-
-- **Switch component triangular tear.** A small (~3% of pill area) triangular sub-pixel artefact remains where the white thumb meets the green pill on `SwitchTheme_dark` / `SwitchTheme_light`. The pill renders as a single solid shape (was four pacman wedges before the path-construction fixes) and the thumb circle is clean. Likely cause: `gausianBlurImage`'s blurred shadow halo isn't propagating into the new mutable's `MTLTexture` after `Image.getGraphics()` re-attaches. Several seed-the-texture-from-the-existing-UIImage attempts (commits `9f03c11a8` → `b8db2d74e` reverted) did not fix it. Needs device-level shader/Metal-debugger inspection to narrow further. Goldens for `kotlin` (which contains a Switch) reverted to the pre-bug capture so the test surfaces the regression.
-
-- **Perspective / camera transforms render empty on mutable targets.** `graphics-transform-perspective` and `graphics-transform-camera` mutable panels render blank on Metal; GL renders the perspective-transformed rectangles. Vertex shader chain (`projection × modelView × transform × pos4`) and z-range remap look correct on paper but the rendered output is empty. Goldens captured the empty state, so the test passes self-referentially. Real bug, hidden.
-
-- **Stencil clipping for non-rectangular clips.** Currently falls back to a bounding-box scissor — Form layout handles that OK but paths-as-masks and textures-as-masks clip incorrectly.
-
-- **`graphics-draw-line` rasterisation diffs.** Test draws thousands of 1-pixel lines; Metal's `MTLPrimitiveTypeLine` rasterisation rule differs from GL's at integer pixel boundaries, producing 1-pixel-wide diff stripes. Not a bug; rasterisation rule mismatch.
-
-- **Image scaling quality.** `CGContextDrawImage` round-trips at 1× scale produce blurry edges when stretched to 3× retina (affects `graphics-draw-image-rect`, `graphics-fill-round-rect` via the round-rect-as-image path). Both backends share this; not Metal-specific.
-
-- **`graphics-fill-polygon` dropped from compare pipeline.** When the JPEG preview exceeds 20 KB (test produces 72 KB), the runner drops the test from the comparison even though the full PNG was captured. Pre-existing tooling issue, not rendering.
-
-## Verification
-
-```bash
-# GL baseline
-cd scripts/hellocodenameone
-./build.sh ios_source
-scripts/run-ios-ui-tests.sh <gl-workspace>
-
-# Metal variant
-./build.sh ios_source -Dios.metal=true
-scripts/run-ios-ui-tests.sh <metal-workspace>
-
-# Compare
-diff <gl-workspace>/screenshot-compare.json <metal-workspace>/screenshot-compare.json
-```
-
-Metal goldens live in `scripts/ios/screenshots-metal/`; the `build-ios-metal` CI job overrides `SCREENSHOT_REF_DIR` to point at it. GL goldens (`scripts/ios/screenshots/`) remain untouched by Metal port work.
diff --git a/vm/ByteCodeTranslator/src/cn1_globals.m b/vm/ByteCodeTranslator/src/cn1_globals.m
index d06159bce8..f3ebb0ffe6 100644
--- a/vm/ByteCodeTranslator/src/cn1_globals.m
+++ b/vm/ByteCodeTranslator/src/cn1_globals.m
@@ -688,6 +688,17 @@ void codenameOneGCMark() {
                     }
                 }
                 markStatics(d);
+                // Drain the worklist before unblocking the thread so that every object
+                // transitively reachable from this thread's roots is fully marked while the
+                // thread is still paused -- matching the snapshot-at-the-beginning property
+                // the recursive implementation had. Without this drain, an unblocked mutator
+                // can read a still-grey field reference into a new local and null the field;
+                // the captured object would never be visited by the final drain, sweep would
+                // reclaim it, and a later monitorEnter on its freed pthread_mutex_t would
+                // silently deadlock. Earlier attempts at this drain hung at app startup
+                // because the overflow rescan path had a cursor-reset bug; with that fixed
+                // below, the drain runs to completion in O(reachable) time.
+                gcMarkDrain(d);
                 if(!agressiveAllocator) {
                     t->threadBlockedByGC = JAVA_FALSE;
                 } else {