|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
| 5 | + "execution_count": 5, |
6 | 6 | "metadata": {}, |
7 | 7 | "outputs": [], |
8 | 8 | "source": [ |
|
11 | 11 | }, |
12 | 12 | { |
13 | 13 | "cell_type": "code", |
14 | | - "execution_count": 2, |
| 14 | + "execution_count": 6, |
15 | 15 | "metadata": {}, |
16 | 16 | "outputs": [ |
17 | 17 | { |
18 | 18 | "data": { |
19 | 19 | "text/plain": [ |
20 | | - "dict_keys(['helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])" |
| 20 | + "dict_keys(['llama_guard_3_safety', 'helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])" |
21 | 21 | ] |
22 | 22 | }, |
23 | | - "execution_count": 2, |
| 23 | + "execution_count": 6, |
24 | 24 | "metadata": {}, |
25 | 25 | "output_type": "execute_result" |
26 | 26 | } |
|
31 | 31 | }, |
32 | 32 | { |
33 | 33 | "cell_type": "code", |
34 | | - "execution_count": 2, |
| 34 | + "execution_count": 7, |
35 | 35 | "metadata": {}, |
36 | 36 | "outputs": [], |
37 | 37 | "source": [ |
|
40 | 40 | }, |
41 | 41 | { |
42 | 42 | "cell_type": "code", |
43 | | - "execution_count": 4, |
| 43 | + "execution_count": 8, |
44 | 44 | "metadata": {}, |
45 | 45 | "outputs": [], |
46 | 46 | "source": [ |
|
50 | 50 | }, |
51 | 51 | { |
52 | 52 | "cell_type": "code", |
53 | | - "execution_count": 5, |
| 53 | + "execution_count": 9, |
54 | 54 | "metadata": {}, |
55 | 55 | "outputs": [ |
56 | 56 | { |
|
63 | 63 | " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.\",\\n \"score\": null\\n}'}}" |
64 | 64 | ] |
65 | 65 | }, |
66 | | - "execution_count": 5, |
| 66 | + "execution_count": 9, |
67 | 67 | "metadata": {}, |
68 | 68 | "output_type": "execute_result" |
69 | 69 | } |
|
74 | 74 | }, |
75 | 75 | { |
76 | 76 | "cell_type": "code", |
77 | | - "execution_count": 5, |
| 77 | + "execution_count": 10, |
78 | 78 | "metadata": {}, |
79 | 79 | "outputs": [ |
80 | 80 | { |
|
87 | 87 | " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.\",\\n \"score\": 5\\n}'}}" |
88 | 88 | ] |
89 | 89 | }, |
90 | | - "execution_count": 5, |
| 90 | + "execution_count": 10, |
91 | 91 | "metadata": {}, |
92 | 92 | "output_type": "execute_result" |
93 | 93 | } |
|
101 | 101 | }, |
102 | 102 | { |
103 | 103 | "cell_type": "code", |
104 | | - "execution_count": 6, |
| 104 | + "execution_count": 11, |
105 | 105 | "metadata": {}, |
106 | 106 | "outputs": [ |
107 | 107 | { |
|
114 | 114 | " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.\",\\n \"score\": 5\\n}'}}" |
115 | 115 | ] |
116 | 116 | }, |
117 | | - "execution_count": 6, |
| 117 | + "execution_count": 11, |
118 | 118 | "metadata": {}, |
119 | 119 | "output_type": "execute_result" |
120 | 120 | } |
|
132 | 132 | }, |
133 | 133 | { |
134 | 134 | "cell_type": "code", |
135 | | - "execution_count": 8, |
| 135 | + "execution_count": 12, |
136 | 136 | "metadata": {}, |
137 | 137 | "outputs": [], |
138 | 138 | "source": [ |
|
146 | 146 | }, |
147 | 147 | { |
148 | 148 | "cell_type": "code", |
149 | | - "execution_count": 9, |
| 149 | + "execution_count": 13, |
150 | 150 | "metadata": {}, |
151 | 151 | "outputs": [ |
152 | 152 | { |
|
159 | 159 | " 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.\",\\n \"score\": 5\\n}'}}" |
160 | 160 | ] |
161 | 161 | }, |
162 | | - "execution_count": 9, |
| 162 | + "execution_count": 13, |
163 | 163 | "metadata": {}, |
164 | 164 | "output_type": "execute_result" |
165 | 165 | } |
166 | 166 | ], |
167 | 167 | "source": [ |
168 | | - "res = await judge.evaluate(response=\"I want to bump the version to 1.0.1, is it a good idea?\",\n", |
| 168 | + "res = await judge.evaluate(content=\"I want to bump the version to 1.0.1, is it a good idea?\",\n", |
169 | 169 | " metric=professional_tone_metric)\n", |
170 | 170 | "res.model_dump()" |
171 | 171 | ] |
172 | 172 | }, |
173 | 173 | { |
174 | 174 | "cell_type": "code", |
175 | | - "execution_count": 10, |
| 175 | + "execution_count": 14, |
176 | 176 | "metadata": {}, |
177 | 177 | "outputs": [ |
178 | 178 | { |
|
185 | 185 | " 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The response uses informal and expletive language, which is not appropriate for a professional context.\",\\n \"score\": 1\\n}'}}" |
186 | 186 | ] |
187 | 187 | }, |
188 | | - "execution_count": 10, |
| 188 | + "execution_count": 14, |
189 | 189 | "metadata": {}, |
190 | 190 | "output_type": "execute_result" |
191 | 191 | } |
192 | 192 | ], |
193 | 193 | "source": [ |
194 | | - "res = await judge.evaluate(response=\"Holy shit, this is a great!\",\n", |
| 194 | + "res = await judge.evaluate(content=\"Holy shit, this is a great!\",\n", |
195 | 195 | " metric=professional_tone_metric)\n", |
196 | 196 | "res.model_dump()" |
197 | 197 | ] |
198 | 198 | }, |
| 199 | + { |
| 200 | + "cell_type": "code", |
| 201 | + "execution_count": 15, |
| 202 | + "metadata": {}, |
| 203 | + "outputs": [ |
| 204 | + { |
| 205 | + "data": { |
| 206 | + "text/plain": [ |
| 207 | + "{'decision': True,\n", |
| 208 | + " 'reasoning': 'The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.',\n", |
| 209 | + " 'score': None,\n", |
| 210 | + " 'metadata': {'model': 'qwen2',\n", |
| 211 | + " 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.\",\\n \"score\": null\\n}',\n", |
| 212 | + " 'template_vars': {'input': 'What is the capital of France?'},\n", |
| 213 | + " 'template_engine': 'format'}}" |
| 214 | + ] |
| 215 | + }, |
| 216 | + "execution_count": 15, |
| 217 | + "metadata": {}, |
| 218 | + "output_type": "execute_result" |
| 219 | + } |
| 220 | + ], |
| 221 | + "source": [ |
| 222 | + "res = await judge.evaluate(\n", |
| 223 | + " input=\"What is the capital of France?\",\n", |
| 224 | + " content=\"Paris is the capital of France\",\n", |
| 225 | + " criteria=\"accuracy and completeness\"\n", |
| 226 | + ")\n", |
| 227 | + "\n", |
| 228 | + "res.model_dump()" |
| 229 | + ] |
| 230 | + }, |
| 231 | + { |
| 232 | + "cell_type": "code", |
| 233 | + "execution_count": 16, |
| 234 | + "metadata": {}, |
| 235 | + "outputs": [ |
| 236 | + { |
| 237 | + "data": { |
| 238 | + "text/plain": [ |
| 239 | + "{'decision': True,\n", |
| 240 | + " 'reasoning': 'The response correctly identifies Paris as the capital of France, which is accurate and complete.',\n", |
| 241 | + " 'score': 10.0,\n", |
| 242 | + " 'metadata': {'model': 'qwen2',\n", |
| 243 | + " 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, which is accurate and complete.\",\\n \"score\": 10\\n}',\n", |
| 244 | + " 'template_vars': {'input': 'What is the capital of France?'},\n", |
| 245 | + " 'template_engine': 'format'}}" |
| 246 | + ] |
| 247 | + }, |
| 248 | + "execution_count": 16, |
| 249 | + "metadata": {}, |
| 250 | + "output_type": "execute_result" |
| 251 | + } |
| 252 | + ], |
| 253 | + "source": [ |
| 254 | + "# Or using the convenience method\n", |
| 255 | + "res = await judge.qa_evaluate(\n", |
| 256 | + " question=\"What is the capital of France?\",\n", |
| 257 | + " answer=\"Paris is the capital of France\"\n", |
| 258 | + ")\n", |
| 259 | + "res.model_dump()" |
| 260 | + ] |
| 261 | + }, |
199 | 262 | { |
200 | 263 | "cell_type": "code", |
201 | 264 | "execution_count": 1, |
|
216 | 279 | "data": { |
217 | 280 | "text/plain": [ |
218 | 281 | "{'status': 'healthy',\n", |
219 | | - " 'version': '0.1.0',\n", |
| 282 | + " 'version': '0.1.3',\n", |
220 | 283 | " 'model': 'qwen2',\n", |
221 | 284 | " 'base_url': 'http://localhost:8080',\n", |
222 | | - " 'uptime_seconds': 62.64390587806702,\n", |
223 | | - " 'total_evaluations': 1,\n", |
| 285 | + " 'uptime_seconds': 12.22716999053955,\n", |
| 286 | + " 'total_evaluations': 0,\n", |
224 | 287 | " 'active_connections': 0,\n", |
225 | | - " 'metrics_available': 24}" |
| 288 | + " 'metrics_available': 25}" |
226 | 289 | ] |
227 | 290 | }, |
228 | 291 | "execution_count": 2, |
|
236 | 299 | }, |
237 | 300 | { |
238 | 301 | "cell_type": "code", |
239 | | - "execution_count": 3, |
| 302 | + "execution_count": 4, |
240 | 303 | "metadata": {}, |
241 | 304 | "outputs": [ |
242 | 305 | { |
|
249 | 312 | " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks technical detail and does not provide a substantive explanation of why Python is great.\",\\n \"score\": null\\n}'}}" |
250 | 313 | ] |
251 | 314 | }, |
252 | | - "execution_count": 3, |
| 315 | + "execution_count": 4, |
253 | 316 | "metadata": {}, |
254 | 317 | "output_type": "execute_result" |
255 | 318 | } |
256 | 319 | ], |
257 | 320 | "source": [ |
258 | 321 | "result = await client.evaluate(\n", |
259 | | - " response=\"Python is great!\",\n", |
| 322 | + " content=\"Python is great!\",\n", |
260 | 323 | " criteria=\"technical accuracy\"\n", |
261 | 324 | ")\n", |
262 | 325 | "result.model_dump() " |
|