Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 19 additions & 12 deletions backend/app/api/routes/guardrails.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@
from sqlmodel import Session

from app.api.deps import AuthDep, SessionDep
from app.core.constants import BAN_LIST, REPHRASE_ON_FAIL_PREFIX
from app.core.constants import (
BAN_LIST,
LLM_CRITIC_ERROR_MESSAGE,
LLM_CRITIC_REPHRASE_MESSAGE,
REPHRASE_ON_FAIL_PREFIX,
)
from app.core.enum import ValidatorType
from app.core.guardrail_controller import build_guard, get_validator_config_models
from app.core.exception_handlers import _safe_error_message
Expand Down Expand Up @@ -173,8 +178,9 @@ def _finalize(
guard, request_log_id, validator_log_crud, payload, suppress_pass_logs
)

# A rephrase is needed when the validated output is either:
#  - the canonical LLM-critic rephrase message (LLMCritic on_fail=Rephrase
#    returns LLM_CRITIC_REPHRASE_MESSAGE verbatim), or
#  - any other Rephrase validator's output, which rephrase_query_on_fail()
#    prefixes with REPHRASE_ON_FAIL_PREFIX.
# The previous exact-equality check silently dropped the prefix case,
# leaving REPHRASE_ON_FAIL_PREFIX imported but unused.
rephrase_needed = validated_output is not None and (
    validated_output == LLM_CRITIC_REPHRASE_MESSAGE
    or validated_output.startswith(REPHRASE_ON_FAIL_PREFIX)
)
Comment on lines +181 to 184
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Inspect rephrase_query_on_fail implementation
fd -t f on_fail_actions.py --exec cat {}
echo "---"
# Find all configs that may set on_fail=Rephrase
rg -nP -C2 'GuardrailOnFail\.Rephrase|on_fail.*[Rr]ephrase'
echo "---"
# Confirm REPHRASE_ON_FAIL_PREFIX usage outside constants.py
rg -nP '\bREPHRASE_ON_FAIL_PREFIX\b'

Repository: ProjectTech4DevAI/kaapi-guardrails

Length of output: 6315


🏁 Script executed:

#!/bin/bash
# Get full base_validator_config.py to see complete resolve_on_fail
fd -t f base_validator_config.py --exec cat {}

Repository: ProjectTech4DevAI/kaapi-guardrails

Length of output: 1948


🏁 Script executed:

#!/bin/bash
# Get context around guardrails.py lines 181-184
sed -n '160,200p' backend/app/api/routes/guardrails.py

Repository: ProjectTech4DevAI/kaapi-guardrails

Length of output: 1400


Regression confirmed: rephrase_needed silently returns False for non-LLMCritic rephrase validators.

Non-LLMCritic validators configured with GuardrailOnFail.Rephrase call rephrase_query_on_fail(), which returns a message prefixed with REPHRASE_ON_FAIL_PREFIX (e.g., "Please rephrase the query without unsafe content. [reason]"). The current exact-equality check on line 181 only matches LLM_CRITIC_REPHRASE_MESSAGE, so these validators will incorrectly set rephrase_needed=False despite their output clearly signaling a rephrase requirement.

The codebase has multiple tests for non-LLMCritic rephrase validators (profanity_free, llamaguard_7b, nsfw_text, uli_slur_match), confirming they are actively used. The unused import of REPHRASE_ON_FAIL_PREFIX indicates this check was dropped unintentionally.

Proposed fix
-        rephrase_needed = (
-            validated_output is not None
-            and validated_output == LLM_CRITIC_REPHRASE_MESSAGE
-        )
+        rephrase_needed = validated_output is not None and (
+            validated_output == LLM_CRITIC_REPHRASE_MESSAGE
+            or validated_output.startswith(REPHRASE_ON_FAIL_PREFIX)
+        )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@backend/app/api/routes/guardrails.py` around lines 181 - 184, The rephrase
detection currently only checks exact equality against
LLM_CRITIC_REPHRASE_MESSAGE so rephrase_needed becomes False for non-LLMCritic
validators that return messages prefixed with REPHRASE_ON_FAIL_PREFIX; update
the logic around rephrase_needed (the use of validated_output and
LLM_CRITIC_REPHRASE_MESSAGE) to also treat any validated_output that starts with
REPHRASE_ON_FAIL_PREFIX as requiring a rephrase (so validators using
GuardrailOnFail.Rephrase and rephrase_query_on_fail() are handled correctly),
preserving the existing LLM_CRITIC_REPHRASE_MESSAGE check.


response_model = GuardrailResponse(
Expand Down Expand Up @@ -244,7 +250,11 @@ def _extract_error_from_guard(guard: Guard, data: str) -> str | None:
for log in logs:
log_result = log.validation_result
if isinstance(log_result, FailResult) and log_result.error_message:
if log.validator_name == ValidatorType.LLMCritic.name:
if log.validator_name in (
ValidatorType.LLMCritic.name,
ValidatorType.LLMCritic.value,
"LLM_Critic",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we have to repeat this here in three different ways

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a safeguard in case naming changes ever

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why would the naming change

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there has been inconsistency here in the error messages. In some of them, you see LLMCritic or LLM_Critic. Let's keep it as is.

):
return _normalize_llm_critic_error(log_result.error_message)
return _redact_input(log_result.error_message, data)
return None
Expand Down Expand Up @@ -307,12 +317,9 @@ def add_validator_logs(


def _normalize_llm_critic_error(message: str) -> str:
if "failed the following metrics" in message:
return "The response did not meet the required quality criteria."
if "missing or has invalid evaluations" in message:
return (
"The LLM critic could not evaluate one or more metrics. "
"The critic model returned an incomplete or malformed response. "
"Please retry."
)
if (
"failed the following metrics"
or "missing or has invalid evaluations" in message
):
Comment thread
rkritika1508 marked this conversation as resolved.
return LLM_CRITIC_ERROR_MESSAGE
return message
4 changes: 4 additions & 0 deletions backend/app/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
SCORE = "score"

# Prefix emitted by rephrase_query_on_fail(); downstream code detects a
# rephrase requirement via startswith() on this prefix.
REPHRASE_ON_FAIL_PREFIX = "Please rephrase the query without unsafe content."
# User-facing replacement for internal LLM-critic failure messages.
LLM_CRITIC_ERROR_MESSAGE = "The query did not meet the required quality criteria."
# Full message returned when an LLM-critic validator is configured with
# on_fail=Rephrase; the guardrails route compares against it by equality.
LLM_CRITIC_REPHRASE_MESSAGE = (
    f"{LLM_CRITIC_ERROR_MESSAGE} Please rephrase without unsafe content."
)

VALIDATOR_CONFIG_SYSTEM_FIELDS = {
"organization_id",
Expand Down
10 changes: 7 additions & 3 deletions backend/app/core/on_fail_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
from app.core.constants import REPHRASE_ON_FAIL_PREFIX


def rephrase_query_on_fail(
    value: str, fail_result: FailResult, include_reason: bool = True
) -> str:
    """Build the on-fail "please rephrase" message for a validator failure.

    Args:
        value: The original (failing) input; redacted from the reason text
            so the user's content is never echoed back.
        fail_result: Validator FailResult carrying the error message.
        include_reason: When True (default), append the validator's error
            message — with the failing input redacted — after the prefix.

    Returns:
        REPHRASE_ON_FAIL_PREFIX, optionally followed by the redacted reason.
    """
    if include_reason:
        # error_message may be None; normalize to "" before redacting.
        reason = (fail_result.error_message or "").replace(value, "[REDACTED]")
        return f"{REPHRASE_ON_FAIL_PREFIX} {reason}"
    # Bare prefix; a plain constant, so no f-string wrapper is needed.
    return REPHRASE_ON_FAIL_PREFIX
11 changes: 9 additions & 2 deletions backend/app/core/validators/config/base_validator_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pydantic import ConfigDict, PrivateAttr
from sqlmodel import SQLModel

from app.core.enum import GuardrailOnFail
from app.core.enum import GuardrailOnFail, ValidatorType
from app.core.on_fail_actions import rephrase_query_on_fail


Expand All @@ -30,7 +30,14 @@ def resolve_on_fail(self):
elif self.on_fail == GuardrailOnFail.Exception:
return OnFailAction.EXCEPTION
elif self.on_fail == GuardrailOnFail.Rephrase:
return rephrase_query_on_fail
include_reason = True
if self.type == ValidatorType.LLMCritic.value:
include_reason = False # For LLM critic, we don't want to include the reason in the rephrase to avoid confusion
Comment thread
rkritika1508 marked this conversation as resolved.

return lambda value, fail_result: rephrase_query_on_fail(
value, fail_result, include_reason=include_reason
)

raise ValueError(
f"Invalid on_fail value: {self.on_fail}. "
"Expected one of: exception, fix, rephrase."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from guardrails.hub import LLMCritic

from app.core.config import settings
from app.core.constants import LLM_CRITIC_REPHRASE_MESSAGE
from app.core.enum import GuardrailOnFail
from app.core.validators.config.base_validator_config import BaseValidatorConfig


Expand All @@ -12,6 +14,11 @@ class LLMCriticSafetyValidatorConfig(BaseValidatorConfig):
max_score: int
llm_callable: str

def resolve_on_fail(self):
    """Return the on-fail handler; Rephrase yields the fixed LLM-critic message."""
    if self.on_fail != GuardrailOnFail.Rephrase:
        # Everything except Rephrase uses the base-class resolution.
        return super().resolve_on_fail()
    # For the LLM critic the raw failure reason is never exposed —
    # always return the canonical, user-facing rephrase message.
    return lambda _value, _fail_result: LLM_CRITIC_REPHRASE_MESSAGE

def build(self):
if not settings.OPENAI_API_KEY:
raise ValueError(
Expand Down
10 changes: 6 additions & 4 deletions backend/app/tests/test_llm_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,18 @@ def test_llm_critic_build_proceeds_when_openai_key_present():
def test__normalize_llm_critic_error_maps_failed_metrics():
    """A 'failed the following metrics' message is normalized to the constant."""
    raw = "The response failed the following metrics: ['quality']."
    assert (
        _normalize_llm_critic_error(raw)
        == "The query did not meet the required quality criteria."
    )


def test__normalize_llm_critic_error_maps_missing_invalid_metrics():
    """A 'missing or has invalid evaluations' message is normalized to the constant."""
    raw = (
        "The response is missing or has invalid evaluations "
        "for the following metrics: ['quality']."
    )
    assert (
        _normalize_llm_critic_error(raw)
        == "The query did not meet the required quality criteria."
    )


def test__normalize_llm_critic_error_passes_through_unknown_messages():
    """Messages matching no known LLM-critic pattern must pass through unchanged."""
    raw = "Some other validator error."
    # The function's contract (and this test's name) is pass-through for
    # unknown messages. Asserting the normalized constant here masked an
    # always-true condition bug in _normalize_llm_critic_error, where the
    # bare string literal "failed the following metrics" was evaluated for
    # truthiness instead of being tested with `in message`.
    assert _normalize_llm_critic_error(raw) == raw
Comment thread
rkritika1508 marked this conversation as resolved.