|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 1, |
| 5 | + "execution_count": 5, |
6 | 6 | "metadata": {}, |
7 | 7 | "outputs": [], |
8 | 8 | "source": [ |
|
11 | 11 | }, |
12 | 12 | { |
13 | 13 | "cell_type": "code", |
14 | | - "execution_count": 2, |
| 14 | + "execution_count": 6, |
15 | 15 | "metadata": {}, |
16 | 16 | "outputs": [ |
17 | 17 | { |
18 | 18 | "data": { |
19 | 19 | "text/plain": [ |
20 | | - "dict_keys(['helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])" |
| 20 | + "dict_keys(['llama_guard_3_safety', 'helpfulness', 'accuracy', 'clarity', 'conciseness', 'relevance', 'safety', 'toxicity', 'code_quality', 'code_security', 'creativity', 'professionalism', 'educational_value', 'preference', 'appropriate', 'factual', 'medical_accuracy', 'legal_appropriateness', 'educational_content_template', 'code_review_template', 'customer_service_template', 'writing_quality_template', 'product_review_template', 'medical_info_template', 'api_docs_template'])" |
21 | 21 | ] |
22 | 22 | }, |
23 | | - "execution_count": 2, |
| 23 | + "execution_count": 6, |
24 | 24 | "metadata": {}, |
25 | 25 | "output_type": "execute_result" |
26 | 26 | } |
|
31 | 31 | }, |
32 | 32 | { |
33 | 33 | "cell_type": "code", |
34 | | - "execution_count": 2, |
| 34 | + "execution_count": 7, |
35 | 35 | "metadata": {}, |
36 | 36 | "outputs": [], |
37 | 37 | "source": [ |
|
40 | 40 | }, |
41 | 41 | { |
42 | 42 | "cell_type": "code", |
43 | | - "execution_count": 4, |
| 43 | + "execution_count": 8, |
44 | 44 | "metadata": {}, |
45 | 45 | "outputs": [], |
46 | 46 | "source": [ |
|
50 | 50 | }, |
51 | 51 | { |
52 | 52 | "cell_type": "code", |
53 | | - "execution_count": 5, |
| 53 | + "execution_count": 9, |
54 | 54 | "metadata": {}, |
55 | 55 | "outputs": [ |
56 | 56 | { |
|
63 | 63 | " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.\",\\n \"score\": null\\n}'}}" |
64 | 64 | ] |
65 | 65 | }, |
66 | | - "execution_count": 5, |
| 66 | + "execution_count": 9, |
67 | 67 | "metadata": {}, |
68 | 68 | "output_type": "execute_result" |
69 | 69 | } |
|
74 | 74 | }, |
75 | 75 | { |
76 | 76 | "cell_type": "code", |
77 | | - "execution_count": 5, |
| 77 | + "execution_count": 10, |
78 | 78 | "metadata": {}, |
79 | 79 | "outputs": [ |
80 | 80 | { |
|
87 | 87 | " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.\",\\n \"score\": 5\\n}'}}" |
88 | 88 | ] |
89 | 89 | }, |
90 | | - "execution_count": 5, |
| 90 | + "execution_count": 10, |
91 | 91 | "metadata": {}, |
92 | 92 | "output_type": "execute_result" |
93 | 93 | } |
|
101 | 101 | }, |
102 | 102 | { |
103 | 103 | "cell_type": "code", |
104 | | - "execution_count": 6, |
| 104 | + "execution_count": 11, |
105 | 105 | "metadata": {}, |
106 | 106 | "outputs": [ |
107 | 107 | { |
|
114 | 114 | " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.\",\\n \"score\": 5\\n}'}}" |
115 | 115 | ] |
116 | 116 | }, |
117 | | - "execution_count": 6, |
| 117 | + "execution_count": 11, |
118 | 118 | "metadata": {}, |
119 | 119 | "output_type": "execute_result" |
120 | 120 | } |
|
132 | 132 | }, |
133 | 133 | { |
134 | 134 | "cell_type": "code", |
135 | | - "execution_count": 8, |
| 135 | + "execution_count": 12, |
136 | 136 | "metadata": {}, |
137 | 137 | "outputs": [], |
138 | 138 | "source": [ |
|
146 | 146 | }, |
147 | 147 | { |
148 | 148 | "cell_type": "code", |
149 | | - "execution_count": 9, |
| 149 | + "execution_count": 13, |
150 | 150 | "metadata": {}, |
151 | 151 | "outputs": [ |
152 | 152 | { |
|
159 | 159 | " 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.\",\\n \"score\": 5\\n}'}}" |
160 | 160 | ] |
161 | 161 | }, |
162 | | - "execution_count": 9, |
| 162 | + "execution_count": 13, |
163 | 163 | "metadata": {}, |
164 | 164 | "output_type": "execute_result" |
165 | 165 | } |
166 | 166 | ], |
167 | 167 | "source": [ |
168 | | - "res = await judge.evaluate(response=\"I want to bump the version to 1.0.1, is it a good idea?\",\n", |
| 168 | + "res = await judge.evaluate(content=\"I want to bump the version to 1.0.1, is it a good idea?\",\n", |
169 | 169 | " metric=professional_tone_metric)\n", |
170 | 170 | "res.model_dump()" |
171 | 171 | ] |
172 | 172 | }, |
173 | 173 | { |
174 | 174 | "cell_type": "code", |
175 | | - "execution_count": 10, |
| 175 | + "execution_count": 14, |
176 | 176 | "metadata": {}, |
177 | 177 | "outputs": [ |
178 | 178 | { |
|
185 | 185 | " 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The response uses informal and expletive language, which is not appropriate for a professional context.\",\\n \"score\": 1\\n}'}}" |
186 | 186 | ] |
187 | 187 | }, |
188 | | - "execution_count": 10, |
| 188 | + "execution_count": 14, |
189 | 189 | "metadata": {}, |
190 | 190 | "output_type": "execute_result" |
191 | 191 | } |
192 | 192 | ], |
193 | 193 | "source": [ |
194 | | - "res = await judge.evaluate(response=\"Holy shit, this is a great!\",\n", |
| 194 | + "res = await judge.evaluate(content=\"Holy shit, this is a great!\",\n", |
195 | 195 | " metric=professional_tone_metric)\n", |
196 | 196 | "res.model_dump()" |
197 | 197 | ] |
198 | 198 | }, |
| 199 | + { |
| 200 | + "cell_type": "code", |
| 201 | + "execution_count": 15, |
| 202 | + "metadata": {}, |
| 203 | + "outputs": [ |
| 204 | + { |
| 205 | + "data": { |
| 206 | + "text/plain": [ |
| 207 | + "{'decision': True,\n", |
| 208 | + " 'reasoning': 'The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.',\n", |
| 209 | + " 'score': None,\n", |
| 210 | + " 'metadata': {'model': 'qwen2',\n", |
| 211 | + " 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.\",\\n \"score\": null\\n}',\n", |
| 212 | + " 'template_vars': {'input': 'What is the capital of France?'},\n", |
| 213 | + " 'template_engine': 'format'}}" |
| 214 | + ] |
| 215 | + }, |
| 216 | + "execution_count": 15, |
| 217 | + "metadata": {}, |
| 218 | + "output_type": "execute_result" |
| 219 | + } |
| 220 | + ], |
| 221 | + "source": [ |
| 222 | + "res = await judge.evaluate(\n", |
| 223 | + " input=\"What is the capital of France?\",\n", |
| 224 | + " content=\"Paris is the capital of France\",\n", |
| 225 | + " criteria=\"accuracy and completeness\"\n", |
| 226 | + ")\n", |
| 227 | + "\n", |
| 228 | + "res.model_dump()" |
| 229 | + ] |
| 230 | + }, |
| 231 | + { |
| 232 | + "cell_type": "code", |
| 233 | + "execution_count": 16, |
| 234 | + "metadata": {}, |
| 235 | + "outputs": [ |
| 236 | + { |
| 237 | + "data": { |
| 238 | + "text/plain": [ |
| 239 | + "{'decision': True,\n", |
| 240 | + " 'reasoning': 'The response correctly identifies Paris as the capital of France, which is accurate and complete.',\n", |
| 241 | + " 'score': 10.0,\n", |
| 242 | + " 'metadata': {'model': 'qwen2',\n", |
| 243 | + " 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, which is accurate and complete.\",\\n \"score\": 10\\n}',\n", |
| 244 | + " 'template_vars': {'input': 'What is the capital of France?'},\n", |
| 245 | + " 'template_engine': 'format'}}" |
| 246 | + ] |
| 247 | + }, |
| 248 | + "execution_count": 16, |
| 249 | + "metadata": {}, |
| 250 | + "output_type": "execute_result" |
| 251 | + } |
| 252 | + ], |
| 253 | + "source": [ |
| 254 | + "# Or using the convenience method\n", |
| 255 | + "res = await judge.qa_evaluate(\n", |
| 256 | + " question=\"What is the capital of France?\",\n", |
| 257 | + " answer=\"Paris is the capital of France\"\n", |
| 258 | + ")\n", |
| 259 | + "res.model_dump()" |
| 260 | + ] |
| 261 | + }, |
199 | 262 | { |
200 | 263 | "cell_type": "code", |
201 | 264 | "execution_count": 1, |
|
216 | 279 | "data": { |
217 | 280 | "text/plain": [ |
218 | 281 | "{'status': 'healthy',\n", |
219 | | - " 'version': '0.1.0',\n", |
| 282 | + " 'version': '0.1.3',\n", |
220 | 283 | " 'model': 'qwen2',\n", |
221 | 284 | " 'base_url': 'http://localhost:8080',\n", |
222 | | - " 'uptime_seconds': 62.64390587806702,\n", |
223 | | - " 'total_evaluations': 1,\n", |
| 285 | + " 'uptime_seconds': 12.22716999053955,\n", |
| 286 | + " 'total_evaluations': 0,\n", |
224 | 287 | " 'active_connections': 0,\n", |
225 | | - " 'metrics_available': 24}" |
| 288 | + " 'metrics_available': 25}" |
226 | 289 | ] |
227 | 290 | }, |
228 | 291 | "execution_count": 2, |
|
236 | 299 | }, |
237 | 300 | { |
238 | 301 | "cell_type": "code", |
239 | | - "execution_count": 3, |
| 302 | + "execution_count": 4, |
240 | 303 | "metadata": {}, |
241 | 304 | "outputs": [ |
242 | 305 | { |
|
249 | 312 | " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks technical detail and does not provide a substantive explanation of why Python is great.\",\\n \"score\": null\\n}'}}" |
250 | 313 | ] |
251 | 314 | }, |
252 | | - "execution_count": 3, |
| 315 | + "execution_count": 4, |
253 | 316 | "metadata": {}, |
254 | 317 | "output_type": "execute_result" |
255 | 318 | } |
256 | 319 | ], |
257 | 320 | "source": [ |
258 | 321 | "result = await client.evaluate(\n", |
259 | | - " response=\"Python is great!\",\n", |
| 322 | + " content=\"Python is great!\",\n", |
260 | 323 | " criteria=\"technical accuracy\"\n", |
261 | 324 | ")\n", |
262 | 325 | "result.model_dump() " |
|