From 846e7fddc6f6a01361b9956114a7277b28729423 Mon Sep 17 00:00:00 2001
From: Andrew Wormald <andreww@luno.com>
Date: Tue, 10 Mar 2026 09:27:42 +0000
Subject: [PATCH 1/3] Fix error counter keying on dynamic error messages

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 internal/errorcounter/errorcounter.go | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/internal/errorcounter/errorcounter.go b/internal/errorcounter/errorcounter.go
index 45b1490..23caf52 100644
--- a/internal/errorcounter/errorcounter.go
+++ b/internal/errorcounter/errorcounter.go
@@ -20,27 +20,30 @@ func (c *Counter) Add(err error, labels ...string) int {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	errMsg := err.Error()
-	errMsg += strings.Join(labels, "-")
-	c.store[errMsg] += 1
-	return c.store[errMsg]
+	key := makeKey(labels)
+	c.store[key] += 1
+	return c.store[key]
 }
 
 func (c *Counter) Count(err error, labels ...string) int {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	errMsg := err.Error()
-	errMsg += strings.Join(labels, "-")
-	return c.store[errMsg]
+	key := makeKey(labels)
+	return c.store[key]
 }
 
 func (c *Counter) Clear(err error, labels ...string) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	errMsg := err.Error()
-	errMsg += strings.Join(labels, "-")
-	c.store[errMsg] = 0
-	return
+	key := makeKey(labels)
+	delete(c.store, key)
+}
+
+// makeKey builds a stable key from labels only. The error message is excluded
+// because it often contains dynamic data (timestamps, IDs) which would create
+// unique keys and prevent the PauseAfterErrCount threshold from ever being reached.
+func makeKey(labels []string) string {
+	return strings.Join(labels, "-")
 }

From 08af657f2215b25be93b3f98b77d2dae5d252621 Mon Sep 17 00:00:00 2001
From: Andrew Wormald <andreww@luno.com>
Date: Tue, 10 Mar 2026 10:04:45 +0000
Subject: [PATCH 2/3] Add tests for label-based error counter keying

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 internal/errorcounter/errorcounter_test.go | 42 ++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/internal/errorcounter/errorcounter_test.go b/internal/errorcounter/errorcounter_test.go
index 798f294..f758c88 100644
--- a/internal/errorcounter/errorcounter_test.go
+++ b/internal/errorcounter/errorcounter_test.go
@@ -59,3 +59,45 @@ func TestErrorCounter(t *testing.T) {
 		})
 	}
 }
+
+func TestErrorCounter_DifferentErrorsSameLabels(t *testing.T) {
+	c := errorcounter.New()
+	labels := []string{"processName", "run-123"}
+
+	// Different error messages with the same labels should share a counter.
+	c.Add(errors.New("connection refused at 10:00:01"), labels...)
+	c.Add(errors.New("connection refused at 10:00:02"), labels...)
+	count := c.Add(errors.New("timeout after 30s"), labels...)
+
+	require.Equal(t, 3, count)
+
+	// Count should work regardless of which error is passed.
+	require.Equal(t, 3, c.Count(errors.New("completely different error"), labels...))
+}
+
+func TestErrorCounter_ClearRemovesKey(t *testing.T) {
+	c := errorcounter.New()
+	labels := []string{"process", "run-1"}
+
+	c.Add(errors.New("err"), labels...)
+	c.Add(errors.New("err"), labels...)
+	require.Equal(t, 2, c.Count(errors.New("err"), labels...))
+
+	c.Clear(errors.New("err"), labels...)
+
+	// After clear, count should be 0 and next Add should return 1.
+	require.Equal(t, 0, c.Count(errors.New("err"), labels...))
+	require.Equal(t, 1, c.Add(errors.New("err"), labels...))
+}
+
+func TestErrorCounter_DifferentLabelsSeparateCounters(t *testing.T) {
+	c := errorcounter.New()
+	err := errors.New("same error")
+
+	c.Add(err, "process-a", "run-1")
+	c.Add(err, "process-a", "run-1")
+	c.Add(err, "process-b", "run-2")
+
+	require.Equal(t, 2, c.Count(err, "process-a", "run-1"))
+	require.Equal(t, 1, c.Count(err, "process-b", "run-2"))
+}

From 33333bcf543de492a10a25aa298c1cd9bf5f39ea Mon Sep 17 00:00:00 2001
From: Andrew Wormald <andreww@luno.com>
Date: Tue, 10 Mar 2026 16:45:54 +0000
Subject: [PATCH 3/3] Require at least one label in ErrorCounter API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous approach silently dropped the error message from the key
but kept labels optional. This meant callers without labels would share
a single global counter — a worse behaviour change.

Now the interface requires at least one label (label string, extras ...string),
making the contract explicit. All existing callers already pass two labels
(processName, runID) so no call sites change.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 errors.go                                  |  10 +-
 internal/errorcounter/errorcounter.go      |  19 ++--
 internal/errorcounter/errorcounter_test.go | 103 +++++++++------------
 3 files changed, 60 insertions(+), 72 deletions(-)

diff --git a/errors.go b/errors.go
index 66bdcca..95e95c0 100644
--- a/errors.go
+++ b/errors.go
@@ -10,9 +10,11 @@ var (
 	ErrInvalidTransition    = errors.New("invalid transition")
 )
 
-// ErrorCounter defines an interface for counting occurrences of errors with optional labels.
+// ErrorCounter defines an interface for counting occurrences of errors keyed by stable labels.
+// At least one label is required — labels should identify the process and run (e.g. processName, runID).
+// The error value is not used for keying because error messages often contain dynamic data.
 type ErrorCounter interface {
-	Add(err error, labels ...string) int
-	Count(err error, labels ...string) int
-	Clear(err error, labels ...string)
+	Add(err error, label string, extras ...string) int
+	Count(err error, label string, extras ...string) int
+	Clear(err error, label string, extras ...string)
 }
diff --git a/internal/errorcounter/errorcounter.go b/internal/errorcounter/errorcounter.go
index 23caf52..e293913 100644
--- a/internal/errorcounter/errorcounter.go
+++ b/internal/errorcounter/errorcounter.go
@@ -16,34 +16,37 @@ type Counter struct {
 	store map[string]int
 }
 
-func (c *Counter) Add(err error, labels ...string) int {
+func (c *Counter) Add(err error, label string, extras ...string) int {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	key := makeKey(labels)
+	key := makeKey(label, extras)
 	c.store[key] += 1
 	return c.store[key]
 }
 
-func (c *Counter) Count(err error, labels ...string) int {
+func (c *Counter) Count(err error, label string, extras ...string) int {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	key := makeKey(labels)
+	key := makeKey(label, extras)
 	return c.store[key]
 }
 
-func (c *Counter) Clear(err error, labels ...string) {
+func (c *Counter) Clear(err error, label string, extras ...string) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 
-	key := makeKey(labels)
+	key := makeKey(label, extras)
 	delete(c.store, key)
 }
 
 // makeKey builds a stable key from labels only. The error message is excluded
 // because it often contains dynamic data (timestamps, IDs) which would create
 // unique keys and prevent the PauseAfterErrCount threshold from ever being reached.
-func makeKey(labels []string) string {
-	return strings.Join(labels, "-")
+func makeKey(label string, extras []string) string {
+	if len(extras) == 0 {
+		return label
+	}
+	return label + "-" + strings.Join(extras, "-")
 }
diff --git a/internal/errorcounter/errorcounter_test.go b/internal/errorcounter/errorcounter_test.go
index f758c88..4f78d3f 100644
--- a/internal/errorcounter/errorcounter_test.go
+++ b/internal/errorcounter/errorcounter_test.go
@@ -10,84 +10,67 @@ import (
 )
 
 func TestErrorCounter(t *testing.T) {
-	testCases := []struct {
-		name           string
-		inputErr       error
-		labels         []string
-		iterationCount int
-		expectedCount  int
-	}{
-		{
-			name:           "Add 3 and get 3",
-			inputErr:       errors.New("test error"),
-			labels:         []string{"label 1", "label 2"},
-			iterationCount: 3,
-			expectedCount:  3,
-		},
-		{
-			name:           "Add 1 and get 1 - no labels",
-			inputErr:       errors.New("test error"),
-			labels:         []string{},
-			iterationCount: 3,
-			expectedCount:  3,
-		},
-		{
-			name:           "Add 0 and get 0",
-			inputErr:       errors.New("test error"),
-			labels:         []string{"label 1"},
-			iterationCount: 0,
-			expectedCount:  0,
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			c := errorcounter.New()
-
-			var currentCount int
-			for i := 0; i < tc.iterationCount; i++ {
-				currentCount = c.Add(tc.inputErr, tc.labels...)
-			}
-			require.Equal(t, tc.expectedCount, currentCount)
-
-			count := c.Count(tc.inputErr, tc.labels...)
-			require.Equal(t, tc.expectedCount, count)
-
-			c.Clear(tc.inputErr, tc.labels...)
-			count = c.Count(tc.inputErr, tc.labels...)
-			require.Equal(t, 0, count)
-		})
-	}
+	t.Run("Add 3 and get 3", func(t *testing.T) {
+		c := errorcounter.New()
+		err := errors.New("test error")
+
+		c.Add(err, "label 1", "label 2")
+		c.Add(err, "label 1", "label 2")
+		count := c.Add(err, "label 1", "label 2")
+		require.Equal(t, 3, count)
+
+		require.Equal(t, 3, c.Count(err, "label 1", "label 2"))
+
+		c.Clear(err, "label 1", "label 2")
+		require.Equal(t, 0, c.Count(err, "label 1", "label 2"))
+	})
+
+	t.Run("Single label", func(t *testing.T) {
+		c := errorcounter.New()
+		err := errors.New("test error")
+
+		c.Add(err, "only-label")
+		count := c.Add(err, "only-label")
+		require.Equal(t, 2, count)
+
+		require.Equal(t, 2, c.Count(err, "only-label"))
+
+		c.Clear(err, "only-label")
+		require.Equal(t, 0, c.Count(err, "only-label"))
+	})
+
+	t.Run("Add 0 and get 0", func(t *testing.T) {
+		c := errorcounter.New()
+		require.Equal(t, 0, c.Count(errors.New("test error"), "label 1"))
+	})
 }
 
 func TestErrorCounter_DifferentErrorsSameLabels(t *testing.T) {
 	c := errorcounter.New()
-	labels := []string{"processName", "run-123"}
 
 	// Different error messages with the same labels should share a counter.
-	c.Add(errors.New("connection refused at 10:00:01"), labels...)
-	c.Add(errors.New("connection refused at 10:00:02"), labels...)
-	count := c.Add(errors.New("timeout after 30s"), labels...)
+	c.Add(errors.New("connection refused at 10:00:01"), "processName", "run-123")
+	c.Add(errors.New("connection refused at 10:00:02"), "processName", "run-123")
+	count := c.Add(errors.New("timeout after 30s"), "processName", "run-123")
 
 	require.Equal(t, 3, count)
 
 	// Count should work regardless of which error is passed.
-	require.Equal(t, 3, c.Count(errors.New("completely different error"), labels...))
+	require.Equal(t, 3, c.Count(errors.New("completely different error"), "processName", "run-123"))
 }
 
 func TestErrorCounter_ClearRemovesKey(t *testing.T) {
 	c := errorcounter.New()
-	labels := []string{"process", "run-1"}
 
-	c.Add(errors.New("err"), labels...)
-	c.Add(errors.New("err"), labels...)
-	require.Equal(t, 2, c.Count(errors.New("err"), labels...))
+	c.Add(errors.New("err"), "process", "run-1")
+	c.Add(errors.New("err"), "process", "run-1")
+	require.Equal(t, 2, c.Count(errors.New("err"), "process", "run-1"))
 
-	c.Clear(errors.New("err"), labels...)
+	c.Clear(errors.New("err"), "process", "run-1")
 
 	// After clear, count should be 0 and next Add should return 1.
-	require.Equal(t, 0, c.Count(errors.New("err"), labels...))
-	require.Equal(t, 1, c.Add(errors.New("err"), labels...))
+	require.Equal(t, 0, c.Count(errors.New("err"), "process", "run-1"))
+	require.Equal(t, 1, c.Add(errors.New("err"), "process", "run-1"))
 }
 
 func TestErrorCounter_DifferentLabelsSeparateCounters(t *testing.T) {