{
  "skill_name": "self-evolving-agent",
  "judge_policy": {
    "required_criterion_min_score": 1,
    "pass_ratio_threshold": 0.7
  },
  "scenarios": [
    {
      "id": "task-light-restraint",
      "title": "Task-light restraint on familiar work",
      "prompt": "Use $self-evolving-agent for this situation: I only need to update one familiar README sentence. Classify the task, keep it in `task_light`, retrieve only the minimum context, and give me one verification check.",
      "criteria": [
        {
          "name": "mode selection discipline",
          "description": "Keeps the task in task_light mode or an equivalent smallest-safe framing.",
          "required": true
        },
        {
          "name": "selective retrieval",
          "description": "Retrieves only a minimal relevant context set rather than expanding into task_full.",
          "required": true
        },
        {
          "name": "verification check",
          "description": "Provides a small but explicit verification step before delivery.",
          "required": true
        },
        {
          "name": "avoids task-full bloat",
          "description": "Does not produce unnecessary training, agenda, or promotion output for a trivial task.",
          "required": false
        }
      ]
    },
    {
      "id": "pre-task-risk-diagnosis",
      "title": "Pre-task capability risk diagnosis",
      "prompt": "Use $self-evolving-agent for this situation: I need to modify a production deployment workflow I have never touched before. Before doing any edits, classify the task, retrieve targeted context, give me a pre-task capability risk diagnosis, and propose an execution strategy with a verification plan.",
      "criteria": [
        {
          "name": "mode classification",
          "description": "Classifies the task into a mode heavier than task_light before execution.",
          "required": true
        },
        {
          "name": "retrieval plan",
          "description": "Identifies which prior records should be retrieved and why.",
          "required": true
        },
        {
          "name": "capability risk diagnosis",
          "description": "Names likely weak capabilities or failure modes relevant to the task.",
          "required": true
        },
        {
          "name": "verification-first strategy",
          "description": "Proposes an execution strategy and explicit verification plan before edits.",
          "required": true
        }
      ]
    },
    {
      "id": "post-task-diagnosis-and-training",
      "title": "Post-task diagnosis and training-unit generation",
      "prompt": "Use $self-evolving-agent after this incident: I edited a release command by guessing a flag name, the user corrected me, and I now realize I skipped validating the tool contract. Produce the post-task diagnosis, decide whether the task should stay in `task_light` or escalate to `task_full`, update the capability evidence, and create a training unit if warranted.",
      "criteria": [
        {
          "name": "capability-level diagnosis",
          "description": "Frames the issue as a capability weakness, not only as a surface incident.",
          "required": true
        },
        {
          "name": "escalation judgment",
          "description": "Explains whether the situation should escalate beyond task_light mode and why.",
          "required": true
        },
        {
          "name": "evidence update",
          "description": "Updates capability evidence or next-focus logic using the incident.",
          "required": true
        },
        {
          "name": "training restraint",
          "description": "Creates a training unit only when the pattern is structural or recurring.",
          "required": true
        }
      ]
    },
    {
      "id": "missed-retrieval-recovery",
      "title": "Missed retrieval recovery",
      "prompt": "Use $self-evolving-agent for this follow-up: I repeated a mistake because I failed to retrieve a relevant prior lesson in time. Reclassify the situation, show which records should have been retrieved, and explain how the retrieval strategy should change.",
      "criteria": [
        {
          "name": "retrieval-failure diagnosis",
          "description": "Identifies memory retrieval as the operative weakness.",
          "required": true
        },
        {
          "name": "missed-record surfacing",
          "description": "Names the kind of records that should have been pulled in before action.",
          "required": true
        },
        {
          "name": "strategy update",
          "description": "Proposes a trigger-signature or retrieval change that would prevent the repeat.",
          "required": true
        },
        {
          "name": "no vague recap",
          "description": "Goes beyond generic reflection and gives a concrete retrieval correction.",
          "required": false
        }
      ]
    },
    {
      "id": "evaluation-and-promotion",
      "title": "Evaluation state and promotion decision",
      "prompt": "Use $self-evolving-agent for this follow-up: a verification-first strategy previously trained on deployment commands later helped me correctly update a different release automation task without user correction. Decide the evaluation state and whether the strategy should be promoted.",
      "criteria": [
        {
          "name": "evaluation ladder discipline",
          "description": "Places the learning at the correct state on the recorded-to-promoted ladder.",
          "required": true
        },
        {
          "name": "transfer evidence",
          "description": "Uses the new scenario as transfer evidence rather than treating the original recording as enough.",
          "required": true
        },
        {
          "name": "promotion gate",
          "description": "Makes a scoped promotion decision with a trigger signature and a minimal durable rule.",
          "required": true
        },
        {
          "name": "overgeneralization control",
          "description": "Acknowledges limits or risks instead of promoting a broad vague rule.",
          "required": false
        }
      ]
    },
    {
      "id": "agenda-review",
      "title": "Learning agenda review",
      "prompt": "Use $self-evolving-agent before the next unfamiliar project: I have completed five meaningful task cycles, verification is improving, but I still miss relevant past learnings and sometimes skip my intended workflow under pressure. Run a learning agenda review and decide the top 1-3 capabilities to train next.",
      "criteria": [
        {
          "name": "agenda trigger recognition",
          "description": "Recognizes that the scenario should trigger a learning agenda review.",
          "required": true
        },
        {
          "name": "small active focus set",
          "description": "Chooses only 1-3 active focus capabilities instead of an unbounded list.",
          "required": true
        },
        {
          "name": "prioritization rationale",
          "description": "Explains why these capabilities matter now relative to alternatives.",
          "required": true
        },
        {
          "name": "exit criteria",
          "description": "Defines what evidence would retire or advance each active focus.",
          "required": true
        }
      ]
    }
  ]
}