teebot-tools/mem-test at main · teebotbyteejay/teebot-tools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env bash
# mem-test — ground truth test suite for memory retrieval quality
# Tests whether the memory system can answer factual questions about its own history.
# Each test case has a question and a set of required facts that must appear in the answer.
#
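# Usage: mem-test [--verbose] [--tool TOOL] [--raw-check]
# Options:
#   --verbose    Show the first 200 characters of retrieval output for each question
#   --tool TOOL  Test a specific retrieval tool (graphmem|mem-research|pre-task) [default: mem-research]
#                (also accepted as --tool=TOOL)
#   --raw-check  Query mem-research with --raw, bypassing LLM synthesis (ignores --tool)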
set -euo pipefail

# Option defaults; parse_args overrides them from the command line.
VERBOSE=false
TOOL="mem-research"
RAW_CHECK=false

#######################################
# Parse command-line options into the option globals.
# Globals:   VERBOSE, TOOL, RAW_CHECK (written)
# Arguments: the script's command-line arguments
#   --verbose     set VERBOSE=true
#   --raw-check   set RAW_CHECK=true
#   --tool TOOL   set TOOL (two-argument form)
#   --tool=TOOL   set TOOL (single-argument form)
# Unrecognised arguments are ignored. When --tool is given more than once,
# the last occurrence wins.
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --verbose)
        VERBOSE=true
        ;;
      --raw-check)
        RAW_CHECK=true
        ;;
      --tool=*)
        TOOL="${1#--tool=}"
        ;;
      --tool)
        # Consume the following argument as the value, but only if there is
        # one and it is not itself an option. A bare trailing "--tool" leaves
        # TOOL unchanged.
        if (( $# > 1 )) && [[ "$2" != -* ]]; then
          TOOL="$2"
          shift
        fi
        ;;
      *)
        # Unknown arguments are tolerated silently.
        ;;
    esac
    shift
  done
}

parse_args "$@"

# Workspace root: a non-empty WORKSPACE from the environment is kept,
# otherwise it falls back to the default location under $HOME.
: "${WORKSPACE:=$HOME/.openclaw/workspace}"

# ANSI colour/attribute sequences. They are stored with a literal backslash
# (single quotes) and only become escape characters when printed.
C_RESET='\033[0m'
C_BOLD='\033[1m'
C_DIM='\033[2m'
C_GREEN='\033[32m'
C_RED='\033[31m'
C_CYAN='\033[36m'
C_YELLOW='\033[33m'

# ============================================================
# TEST CASES: question | required_fact [| required_fact ...] | source
# Each entry is one pipe-separated string:
#   - the first field is the question sent to the retrieval tool;
#   - the last field is the source (typically a date or file name); it is
#     kept for reference only and is not checked against the output;
#   - every field in between is a required fact.
# Required facts are substrings that MUST appear in a correct answer.
# They are matched case-insensitively against the retrieval output.
# Because '|' is the field separator, questions and facts cannot contain it.
# ============================================================
TEST_CASES=(
  # Factual: What database was used before SQLite?
  # NOTE: "kuzu" and "Kuzu" are the same check under case-insensitive matching.
  "What database did teebot use before switching to SQLite for graph memory?|kuzu|Kuzu|2026-02-26"

  # Factual: How many entities when Kuzu crashed?
  "How many entities were in the graph when Kuzu started segfaulting?|270|2026-02-26"

  # Factual: What is mem-eval v2 and what does it do?
  "What is mem-eval v2 and what does it do?|score|0 to 100|staleness|orphan|2026-02-27"

  # Factual: Who is described as the sharpest critic?
  "Who does teebot consider the sharpest critic among key people?|Faheem|MEMORY.md"

  # Factual: What technique does hymem implement?
  "What does hymem do? What technique does it implement?|complexity-routed|retrieval|2026-02-26"

  # Factual: How many reflexion rules were active before the prune?
  "How many active reflexion rules existed before the platitude prune on March 2?|37|2026-03-02"

  # Factual: What percentage were platitudes?
  "What percentage of reflexion rules were classified as generic platitudes?|70|2026-03-02"

  # Factual: What was the context reduction ratio from memory distillation?
  "What was the context reduction percentage from the memory distillation enhancement to pre-task?|88|2026-03-02"

  # Factual: What paper inspired JIT retrieval?
  "What research paper inspired the JIT retrieval approach in mem-research?|GAM|2026-03-01 or 2026-03-02"

  # Factual: What is memchain?
  "What is memchain and what does it provide?|cryptographic|verification|2026-02-26"

  # === March 3 additions: harder questions requiring cross-session synthesis ===

  # Cross-reference: What paper finding did the retrieval trigger implement?
  "What paper's concept inspired the retrieval trigger in pre-task, and what threshold was chosen?|MemGen|0.35|2026-03-04"

  # Numerical: What was the Exgentic finding about model vs architecture variance?
  "According to the Exgentic paper, what percentage of agent performance variance is explained by model choice vs architecture?|28.2|0.6|2026-03-03"

  # Synthesis: Connect two findings (requires contrastive framing)
  "How does the Exgentic finding about scaffolding relate to teebot's own retrieval quality data? Distinguish between task types.|12|0.6|28.2|2026-03-03"

  # Architecture: What are the three memory tiers after current-state consolidation?
  "What are the three memory tiers in teebot's system after the MEM1-inspired consolidation change?|CURRENT_STATE|MEMORY.md|daily|2026-03-03"

  # Temporal: What blog posts were published on March 3, 2026?
  "What blog posts did teebot publish on March 3, 2026?|scaffolding|forget|2026-03-03"

  # === March 10 additions: harder questions requiring recent context + deeper synthesis ===

  # Recent: What did the SICA self-edit system produce?
  "What is SICA and what pipeline does the self-edit tool implement?|propose|apply|evaluate|decide|2026-03-08"

  # Cross-reference: What did SiriuS inspire in teebot's system?
  # NOTE: a one-character fact such as "5" matches any occurrence of that digit.
  "What technique from SiriuS was implemented in teebot's eval system, and how many repaired trajectories exist?|repaired|trajector|5|2026-03-08"

  # Quantitative: What was the capability score trend from March 3 to March 10?
  "What were the capability-score composite values on March 5, March 9, and March 10?|87|93|2026-03-10"

  # Synthesis: Connect rule count changes over time
  "How did the number of active reflexion rules change from the initial count through the platitude prune to the current state?|37|70|5|2026-03-10"

  # Architecture: What safety mechanisms does the self-edit system have?
  "What safety guardrails does the SICA self-edit tool include?|backup|cooldown|1 cycle|2026-03-08"

  # People: What is Shujaat's connection to teebot's work?
  "What is Shujaat researching and what is teebot's involvement?|DCEGM|economic|Mississauga|2026-03-07"

  # Meta: What key finding changed how teebot does retrieval?
  "What finding about keyword matching led teebot to switch retrieval methods, and what was the measured impact?|keyword|corrupt|false positive|embedding|2026-03-04"

  # Cross-system: How does the nightly consolidation work end-to-end?
  "Describe the nightly consolidation pipeline: what tools run, what files are updated, and what gets archived?|CURRENT_STATE|MEMORY|MEMORY-ARCHIVE|Sonnet|8KB|2026-03-07"

  # === March 14 additions: harder questions requiring multi-hop, temporal, and precise recall ===

  # Multi-hop: Connect person → finding → system change (3 hops)
  "What specific finding from Hazel_OC influenced teebot's approach to behavioral change, and what percentage was cited?|70|reversion|structural|2026-03-13"

  # Temporal precision: Distinguish between similar events at different times
  "How many times has teebot leaked information to wrong recipients, and who were the affected parties?|Shujaat|credential|routing|2026-03-14"

  # Quantitative chain: Track a metric across 3+ measurements
  "What was the graph entity count at key milestones: during Kuzu segfault, after initial graphmem-sqlite build, and most recently?|270|260|953|2026-03-14"

  # Causal chain: Why did X lead to Y lead to Z?
  "What chain of events led from the Kuzu segfault to the current SQLite-based graph memory system?|segfault|Claude Code|SQLite|pivot|2026-02-27"

  # Disambiguation: Two concepts with overlapping keywords
  "What is the difference between capability-score and BRS? Which measures skill and which measures intelligence?|skill|intelligence|76|83.2|2026-03-12"

  # Recency test: Does retrieval prefer recent data when appropriate?
  "How many active reflexion rules exist currently, and how many existed before the March 12 audit?|2|15|archived|2026-03-12"

  # Cross-domain: Connect research paper to implementation detail
  "What specific threshold value was derived from the MemGen paper, and what does it control in the pre-task pipeline?|0.35|embedding|novel|retrieval trigger|2026-03-04"
)

# Run banner: suite title, then either the raw-check mode notice or the
# selected tool, then the number of test cases and a blank separator line.
mode_line="  Tool: $TOOL"
if $RAW_CHECK; then
  mode_line="  Mode: ${C_YELLOW}raw-check${C_RESET} (deterministic, bypasses LLM synthesis)"
fi

echo -e "${C_CYAN}${C_BOLD}Memory Ground Truth Test Suite${C_RESET}"
echo -e "$mode_line"
echo -e "  Test cases: ${#TEST_CASES[@]}"
echo ""

# Running tallies for the summary, plus a per-question result log.
passed=0
failed=0
errors=0
results=()

#######################################
# Test whether a required fact occurs in retrieval output.
# The fact is matched as a literal, case-insensitive substring (grep -F), so
# characters such as '.' in "28.2" are not treated as regex wildcards, and
# "--" keeps a fact that starts with '-' from being parsed as a grep option.
# Arguments: $1 - retrieval output to search
#            $2 - required fact
# Returns:   0 if the fact is present, non-zero otherwise
#######################################
has_fact() {
  # NOTE: Using here-string (<<<) instead of echo|pipe to avoid SIGPIPE race
  # condition with grep -q. With pipefail, if grep -q exits early on large
  # output (~29KB), echo gets SIGPIPE (exit 141) and the pipeline returns
  # non-zero even though grep found the match. This caused 20-40% false
  # negatives depending on timing. Here-strings avoid pipes entirely.
  grep -qiF -- "$2" <<< "$1"
}

for test_case in "${TEST_CASES[@]}"; do
  # Split the entry on '|': field 0 is the question, the last field is the
  # source (reference only, never checked), everything in between is a fact.
  IFS='|' read -ra all_parts <<< "$test_case"
  question="${all_parts[0]}"
  last_index=$(( ${#all_parts[@]} - 1 ))

  printf '  %bTesting:%b %s... ' "$C_DIM" "$C_RESET" "${question:0:70}"

  # Choose the retrieval command. Raw-check mode always uses
  # `mem-research --raw` (bypasses LLM synthesis) regardless of TOOL.
  if $RAW_CHECK; then
    retrieval_cmd=(mem-research "$question" --raw)
  else
    case "$TOOL" in
      graphmem)
        retrieval_cmd=(graphmem-sqlite search "$question" --limit 10 --no-log)
        ;;
      mem-research)
        retrieval_cmd=(mem-research "$question")
        ;;
      pre-task)
        retrieval_cmd=(pre-task "$question")
        ;;
      *)
        printf '%bUnknown tool: %s%b\n' "$C_RED" "$TOOL" "$C_RESET"
        exit 1
        ;;
    esac
  fi

  # Judge failure by the tool's exit status, not by a sentinel string: a tool
  # that prints partial output and then fails must count as an error rather
  # than being graded on whatever it managed to print. stderr is discarded on
  # purpose so tool diagnostics do not break the one-line-per-test display.
  if ! output=$("${retrieval_cmd[@]}" 2>/dev/null); then
    printf '%bERROR%b\n' "$C_RED" "$C_RESET"
    errors=$((errors + 1))
    results+=("ERROR|$question")
    continue
  fi

  # Check each required fact (fields 1 .. last_index-1) and collect the
  # missing ones.
  all_found=true
  missing=()
  for ((i = 1; i < last_index; i++)); do
    if ! has_fact "$output" "${all_parts[i]}"; then
      all_found=false
      missing+=("${all_parts[i]}")
    fi
  done

  if $all_found; then
    printf '%bPASS%b\n' "$C_GREEN" "$C_RESET"
    passed=$((passed + 1))
    results+=("PASS|$question")
  else
    printf '%bFAIL%b (missing: %s)\n' "$C_RED" "$C_RESET" "${missing[*]}"
    failed=$((failed + 1))
    results+=("FAIL|$question|missing: ${missing[*]}")
  fi

  # Verbose mode shows the first 200 characters of the output, printed with
  # %s so backslashes in tool output are not interpreted as escapes.
  if $VERBOSE; then
    printf '%b%s%b\n\n' "$C_DIM" "${output:0:200}" "$C_RESET"
  fi
done

#######################################
# Integer percentage of passed tests, rounded down.
# Arguments: $1 - number of passed tests
#            $2 - total number of tests
# Outputs:   the percentage on stdout; 0 when the total is 0
#######################################
score_pct() {
  local passed_count=$1 total_count=$2
  if (( total_count > 0 )); then
    printf '%d\n' $(( passed_count * 100 / total_count ))
  else
    printf '0\n'
  fi
}

#######################################
# Write the run summary as JSON to a file.
# The JSON is built completely before the target is opened, so a jq failure
# cannot truncate a results file left by an earlier run.
# Globals:   TOOL, passed, failed, errors, pct, TEST_CASES (read)
# Arguments: $1 - path of the results file
# Returns:   0 on success, 1 if the JSON could not be built or written
#######################################
save_results() {
  local target=$1 json
  json=$(jq -n \
    --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
    --arg tool "$TOOL" \
    --argjson passed "$passed" \
    --argjson failed "$failed" \
    --argjson errors "$errors" \
    --argjson total "${#TEST_CASES[@]}" \
    --argjson pct "$pct" \
    '{timestamp: $ts, tool: $tool, passed: $passed, failed: $failed, errors: $errors, total: $total, score_pct: $pct}' \
    2>/dev/null) || return 1
  mkdir -p -- "$(dirname -- "$target")" 2>/dev/null || return 1
  printf '%s\n' "$json" 2>/dev/null > "$target" || return 1
}

# Summary
echo ""
echo -e "${C_BOLD}Results:${C_RESET} $passed passed, $failed failed, $errors errors out of ${#TEST_CASES[@]}"
pct=$(score_pct "$passed" "${#TEST_CASES[@]}")
echo -e "${C_BOLD}Score:${C_RESET} ${pct}%"

# Save results for tracking. This is best-effort: a failure here is reported
# on stderr but never changes the outcome of the test run.
results_file="$WORKSPACE/memory/mem-test-results.json"
if ! save_results "$results_file"; then
  printf 'mem-test: warning: could not save results to %s\n' "$results_file" >&2
fi

# Verdict thresholds: >= 80% healthy, >= 50% gaps, otherwise failing.
if [[ $pct -ge 80 ]]; then
  echo -e "${C_GREEN}Memory system is healthy.${C_RESET}"
elif [[ $pct -ge 50 ]]; then
  echo -e "${C_YELLOW}Memory system has gaps.${C_RESET}"
else
  echo -e "${C_RED}Memory system is failing.${C_RESET}"
fi