-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpost-task
More file actions
executable file
· 191 lines (157 loc) · 8.26 KB
/
post-task
File metadata and controls
executable file
· 191 lines (157 loc) · 8.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env bash
# post-task — evaluate a completed task, close the reflexion feedback loop
# 1. Runs reflexion-eval evaluate (extracts new rules)
# 2. Checks which pre-task rules were followed/ignored — reinforce/contradict
# 3. Ingests into graphmem
#
# Usage: post-task "task description" "outcome summary"
# Options:
#   --score-only   Just output the score (for scripting)
#   --no-graph     Skip graphmem ingestion
#   --no-feedback  Skip rule feedback loop
set -euo pipefail

TASK="${1:-}"
OUTCOME="${2:-}"
SCORE_ONLY=false
NO_GRAPH=false
NO_FEEDBACK=false

# Flags may appear anywhere on the command line; anything unrecognized is
# ignored (the two positionals were already captured above).
for arg in "$@"; do
  case "$arg" in
    --score-only) SCORE_ONLY=true ;;
    --no-graph) NO_GRAPH=true ;;
    --no-feedback) NO_FEEDBACK=true ;;
  esac
done

# Both positionals are required; fail fast with a single consolidated check.
if [[ -z "$TASK" || -z "$OUTCOME" ]]; then
  echo "Usage: post-task '<task>' '<outcome>'" >&2
  exit 1
fi

WORKSPACE="${WORKSPACE:-$HOME/.openclaw/workspace}"
PRETASK_FILE="$WORKSPACE/memory/last-pretask.json"

# ANSI styling constants used by every section below.
readonly C_RESET='\033[0m' C_BOLD='\033[1m' C_GREEN='\033[32m' C_CYAN='\033[36m' C_YELLOW='\033[33m' C_DIM='\033[2m'
# --- Run reflexion evaluation (extract new rules) ---
# eval_id links the later rule-feedback calls to this evaluation record;
# it stays empty when the evaluator is not installed.
eval_id=""
if command -v reflexion-eval &>/dev/null; then
  # Capture stdout+stderr together. Guard the assignment so a failing
  # evaluator does not kill the whole script under `set -e` — the feedback
  # and graphmem sections below are still worth running.
  if ! eval_output=$(reflexion-eval evaluate "$TASK" "$OUTCOME" 2>&1); then
    echo "warning: reflexion-eval evaluate failed" >&2
  fi
  if $SCORE_ONLY; then
    # Emit just the numeric score for scripting use. Bash regex matching is
    # used instead of GNU-only `grep -P` so this works on BSD/macOS too.
    if [[ "$eval_output" =~ score=([0-9]+) ]]; then
      echo "${BASH_REMATCH[1]}"
    else
      echo "unknown"
    fi
    exit 0
  fi
  echo -e "${C_CYAN}${C_BOLD}Post-task evaluation${C_RESET}"
  echo "$eval_output"
  # Extract the eval ID from the output if present; otherwise fall back to
  # a timestamp-derived identifier.
  if [[ "$eval_output" =~ eval_id=([a-f0-9-]+) ]]; then
    eval_id="${BASH_REMATCH[1]}"
  else
    eval_id="post-$(date +%s)"
  fi
fi
# --- Rule Feedback Loop ---
# Compare the rules surfaced by pre-task against what actually happened:
# ask an LLM for a per-rule verdict, apply reinforce/contradict via
# reflexion-eval, and persist the verdicts next to the evaluation record.
if ! $NO_FEEDBACK && [[ -f "$PRETASK_FILE" ]] && command -v reflexion-eval &>/dev/null; then
  # NOTE(review): pretask_task is extracted but not referenced below —
  # kept for parity with the pre-task file schema; confirm before removing.
  pretask_task=$(jq -r '.task // ""' "$PRETASK_FILE" 2>/dev/null)
  rule_count=$(jq '.matched_rules | length' "$PRETASK_FILE" 2>/dev/null || echo 0)
  if [[ "$rule_count" -gt 0 ]]; then
    echo -e "\n${C_CYAN}${C_BOLD}Rule Feedback Loop${C_RESET} ($rule_count rules to evaluate)"
    # Build a human-readable summary of the matched rules for the prompt.
    rules_summary=$(jq -r '.matched_rules[] | "- [\(.id)] WHEN \(.trigger) → \(.action)"' "$PRETASK_FILE" 2>/dev/null)
    # Ask the LLM to evaluate which rules were followed.
    if [[ -n "${OPENAI_API_KEY:-}" ]]; then
      system_prompt="You evaluate whether behavioral rules were followed during a task. For each rule, respond with a JSON array of objects: {\"id\": \"rule-id\", \"verdict\": \"reinforced\" or \"contradicted\" or \"not_applicable\", \"reason\": \"brief explanation\"}. Only output the JSON array, nothing else."
      user_prompt="Task: $TASK
Outcome: $OUTCOME
Rules that were available during this task:
$rules_summary
For each rule, was it followed (reinforced), violated (contradicted), or not relevant (not_applicable)?"
      # Guard the assignment: under `set -e` an unreachable API (offline,
      # timeout) would otherwise abort the entire script mid-run.
      response=$(curl -s --max-time 30 "https://api.openai.com/v1/chat/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $OPENAI_API_KEY" \
        -d "$(jq -n --arg sys "$system_prompt" --arg usr "$user_prompt" '{
          model: "gpt-4o-mini",
          messages: [{role: "system", content: $sys}, {role: "user", content: $usr}],
          temperature: 0.1
        }')" 2>/dev/null) || response=""
      # Strip optional markdown code fences around the JSON verdict array,
      # and normalize an empty/invalid reply to "[]" so the loop is a no-op.
      verdicts=$(echo "$response" | jq -r '.choices[0].message.content' 2>/dev/null | sed 's/^```json//;s/^```//;s/```$//' || echo "[]")
      [[ -n "$verdicts" && "$verdicts" != "null" ]] || verdicts="[]"
      # Process verdicts one JSON object per line.
      reinforced=0
      contradicted=0
      skipped=0
      while IFS= read -r verdict_json; do
        [[ -z "$verdict_json" || "$verdict_json" == "null" ]] && continue
        rid=$(echo "$verdict_json" | jq -r '.id' 2>/dev/null)
        v=$(echo "$verdict_json" | jq -r '.verdict' 2>/dev/null)
        reason=$(echo "$verdict_json" | jq -r '.reason' 2>/dev/null)
        [[ -z "$rid" || "$rid" == "null" ]] && continue
        case "$v" in
          reinforced)
            # Counters track the LLM verdicts, not whether the
            # reinforce/contradict call itself succeeded.
            reflexion-eval reinforce "$rid" "$eval_id" 2>/dev/null && \
              echo -e "  ${C_GREEN}✓${C_RESET} Reinforced: $reason" || true
            reinforced=$((reinforced + 1))
            ;;
          contradicted)
            reflexion-eval contradict "$rid" "$eval_id" 2>/dev/null && \
              echo -e "  ${C_YELLOW}✗${C_RESET} Contradicted: $reason" || true
            contradicted=$((contradicted + 1))
            ;;
          *)
            skipped=$((skipped + 1))
            ;;
        esac
      done < <(echo "$verdicts" | jq -c '.[]' 2>/dev/null)
      echo -e "\n  ${C_DIM}Feedback: $reinforced reinforced, $contradicted contradicted, $skipped skipped${C_RESET}"
      # Persist feedback data into the newest eval file for later analysis.
      eval_file=$(ls -t "$WORKSPACE/memory/reflexion-evals/"*.json 2>/dev/null | head -1)
      if [[ -n "$eval_file" ]]; then
        matched_rules=$(jq -c '.matched_rules' "$PRETASK_FILE" 2>/dev/null || echo "[]")
        # Write to a temp file first so a failed jq never truncates the
        # existing eval file.
        tmp=$(mktemp)
        jq --argjson matched "$matched_rules" \
          --argjson verdicts "$(echo "$verdicts" | jq -c '.' 2>/dev/null || echo '[]')" \
          --argjson reinforced "$reinforced" \
          --argjson contradicted "$contradicted" \
          --argjson skipped "$skipped" \
          '. + {feedback: {matched_rules: $matched, verdicts: $verdicts, reinforced: $reinforced, contradicted: $contradicted, skipped: $skipped}}' \
          "$eval_file" > "$tmp" 2>/dev/null && mv "$tmp" "$eval_file" && \
          echo -e "  ${C_GREEN}✓${C_RESET} Feedback persisted to eval file" || rm -f "$tmp"
      fi
    else
      echo -e "  ${C_YELLOW}⚠${C_RESET} OPENAI_API_KEY not set — skipping rule feedback"
    fi
  fi
fi
# --- Retrieval Quality Feedback ---
# If pre-task injected distilled context, ask the LLM to rate how useful
# that context actually was for the task, and persist the rating.
if ! $NO_FEEDBACK && [[ -f "$PRETASK_FILE" ]] && [[ -n "${OPENAI_API_KEY:-}" ]]; then
  distilled_used=$(jq -r '.retrieval.distilled_used // false' "$PRETASK_FILE" 2>/dev/null)
  distilled_context=$(jq -r '.retrieval.distilled // ""' "$PRETASK_FILE" 2>/dev/null)
  if [[ "$distilled_used" == "true" && -n "$distilled_context" ]]; then
    echo -e "\n${C_CYAN}${C_BOLD}Retrieval Quality Feedback${C_RESET}"
    system_prompt="Rate how useful the retrieved context was for completing this task. Output JSON: {\"retrieval_score\": 1-5, \"reason\": \"brief explanation\", \"what_was_missing\": \"what context would have helped but wasn't retrieved\"}. 1=useless noise, 3=somewhat helpful, 5=exactly what was needed. Only output the JSON."
    user_prompt="Task: $TASK
Outcome: $OUTCOME
Context that was retrieved and injected:
$distilled_context"
    # Guard the assignment: under `set -e` a failing curl (offline,
    # timeout) would otherwise abort the entire script here.
    ret_response=$(curl -s --max-time 15 "https://api.openai.com/v1/chat/completions" \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer $OPENAI_API_KEY" \
      -d "$(jq -n --arg sys "$system_prompt" --arg usr "$user_prompt" '{
        model: "gpt-4o-mini",
        messages: [{role: "system", content: $sys}, {role: "user", content: $usr}],
        temperature: 0.1,
        max_tokens: 200
      }')" 2>/dev/null) || ret_response=""
    # Strip optional markdown fences; the `|| echo ""` keeps a malformed
    # API reply from killing the script via `set -o pipefail`.
    ret_eval=$(echo "$ret_response" | jq -r '.choices[0].message.content // ""' 2>/dev/null | sed 's/^```json//;s/^```//;s/```$//' || echo "")
    ret_score=$(echo "$ret_eval" | jq -r '.retrieval_score // "?"' 2>/dev/null || echo "?")
    ret_reason=$(echo "$ret_eval" | jq -r '.reason // ""' 2>/dev/null || echo "")
    ret_missing=$(echo "$ret_eval" | jq -r '.what_was_missing // ""' 2>/dev/null || echo "")
    echo -e "  Retrieval score: ${C_BOLD}$ret_score/5${C_RESET} — $ret_reason"
    [[ -n "$ret_missing" && "$ret_missing" != "null" ]] && echo -e "  ${C_DIM}Missing: $ret_missing${C_RESET}"
    # Persist retrieval feedback into the newest eval file (temp-file
    # write so a failed jq never truncates the existing file).
    eval_file=$(ls -t "$WORKSPACE/memory/reflexion-evals/"*.json 2>/dev/null | head -1)
    if [[ -n "$eval_file" ]]; then
      tmp=$(mktemp)
      jq --argjson ret_feedback "$(echo "$ret_eval" | jq -c '.' 2>/dev/null || echo '{}')" \
        '. + {retrieval_feedback: $ret_feedback}' \
        "$eval_file" > "$tmp" 2>/dev/null && mv "$tmp" "$eval_file" && \
        echo -e "  ${C_GREEN}✓${C_RESET} Retrieval feedback persisted" || rm -f "$tmp"
    fi
  fi
fi
# --- Ingest into graphmem ---
# Store the task/outcome pair in long-term graph memory, skipping
# trivially short outcomes (<= 20 chars) that carry no useful signal.
if ! $NO_GRAPH && command -v graphmem-sqlite &>/dev/null && [[ ${#OUTCOME} -gt 20 ]]; then
  # Best-effort: ingestion failure is not fatal for the script.
  graphmem-sqlite add "Task: $TASK. Outcome: $OUTCOME" 2>/dev/null && \
    echo -e "\n  ${C_GREEN}✓${C_RESET} Added to graph memory" || true
fi