
Commit 1ea6e32

Judgment Release Bot committed
[Bump Minor Version] Release: Merge staging to main
2 parents f2261c1 + 20495c5

File tree

12 files changed (+314, -134 lines)


src/e2etests/test_tracer.py

Lines changed: 3 additions & 3 deletions
@@ -286,7 +286,7 @@ def retrieve_llm_cost_helper(trace_id):
         span_attrs = span.get("span_attributes", {})
         if isinstance(span_attrs, str):
             span_attrs = orjson.loads(span_attrs)
-        llm_cost = span_attrs.get("gen_ai.usage.total_cost_usd", 0)
+        llm_cost = span_attrs.get("judgment.usage.total_cost_usd", 0)
         total_llm_cost += llm_cost

     if total_llm_cost == 0:
@@ -325,8 +325,8 @@ def retrieve_streaming_trace_helper(trace_id):
         assert False, "No completion content found in streaming span"

     # Should have usage information
-    input_tokens = span_attributes.get("gen_ai.usage.input_tokens")
-    output_tokens = span_attributes.get("gen_ai.usage.output_tokens")
+    input_tokens = span_attributes.get("judgment.usage.non_cached_input_tokens")
+    output_tokens = span_attributes.get("judgment.usage.output_tokens")

     if input_tokens is None or output_tokens is None:
         assert False, "Missing usage tokens in streaming span"

src/judgeval/tracer/keys.py

Lines changed: 10 additions & 9 deletions
@@ -26,18 +26,19 @@ class AttributeKeys(str, Enum):

     PENDING_TRACE_EVAL = "judgment.pending_trace_eval"

+    JUDGMENT_LLM_PROVIDER = "judgment.llm.provider"
+    JUDGMENT_LLM_MODEL_NAME = "judgment.llm.model"
+    JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS = "judgment.usage.non_cached_input_tokens"
+    JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS = (
+        "judgment.usage.cache_creation_input_tokens"
+    )
+    JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS = "judgment.usage.cache_read_input_tokens"
+    JUDGMENT_USAGE_OUTPUT_TOKENS = "judgment.usage.output_tokens"
+    JUDGMENT_USAGE_TOTAL_COST_USD = "judgment.usage.total_cost_usd"
+
     GEN_AI_PROMPT = "gen_ai.prompt"
     GEN_AI_COMPLETION = "gen_ai.completion"
-    GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
-    GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"
     GEN_AI_SYSTEM = "gen_ai.system"
-    GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
-    GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
-    GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS = (
-        "gen_ai.usage.cache_creation_input_tokens"
-    )
-    GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read_input_tokens"
-
     GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"
     GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
     GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
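
The renamed enum folds request and response model into a single judgment.llm.model key and splits input tokens into non-cached, cache-read, and cache-creation buckets. A minimal sketch of recombining those buckets, assuming the total input count is simply their sum (how they recombine is an assumption here, not something this commit states):

from judgeval.tracer.keys import AttributeKeys

def total_input_tokens(attrs: dict) -> int:
    # AttributeKeys is a str Enum, so its members hash like their string
    # values and work as keys into a plain attribute dict.
    return sum(
        int(attrs.get(key, 0))
        for key in (
            AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
            AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
            AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
        )
    )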

src/judgeval/tracer/llm/llm_anthropic/messages.py

Lines changed: 34 additions & 22 deletions
@@ -95,7 +95,7 @@ def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )

    def post_hook(ctx: Dict[str, Any], result: Message) -> None:
@@ -112,17 +112,19 @@ def post_hook(ctx: Dict[str, Any], result: Message) -> None:
                _extract_anthropic_tokens(result.usage)
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+                span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
            )
            set_span_attribute(
                span,
-                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                cache_creation,
            )
            set_span_attribute(
@@ -133,7 +135,7 @@ def post_hook(ctx: Dict[str, Any], result: Message) -> None:

        set_span_attribute(
            span,
-            AttributeKeys.GEN_AI_RESPONSE_MODEL,
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
            result.model,
        )

@@ -169,7 +171,7 @@ def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

@@ -197,17 +199,21 @@ def yield_hook(inner_ctx: Dict[str, Any], chunk: RawMessageStreamEvent) -> None:
                _extract_anthropic_tokens(usage_data)
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+                span,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+                cache_read,
            )
            set_span_attribute(
                span,
-                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                cache_creation,
            )
            set_span_attribute(
@@ -279,7 +285,7 @@ def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )

    def post_hook(ctx: Dict[str, Any], result: Message) -> None:
@@ -296,17 +302,19 @@ def post_hook(ctx: Dict[str, Any], result: Message) -> None:
                _extract_anthropic_tokens(result.usage)
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+                span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
            )
            set_span_attribute(
                span,
-                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                cache_creation,
            )
            set_span_attribute(
@@ -317,7 +325,7 @@ def post_hook(ctx: Dict[str, Any], result: Message) -> None:

        set_span_attribute(
            span,
-            AttributeKeys.GEN_AI_RESPONSE_MODEL,
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
            result.model,
        )

@@ -354,7 +362,7 @@ def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

@@ -382,17 +390,21 @@ def yield_hook(inner_ctx: Dict[str, Any], chunk: RawMessageStreamEvent) -> None:
                _extract_anthropic_tokens(usage_data)
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+                span,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
+                cache_read,
            )
            set_span_attribute(
                span,
-                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                cache_creation,
            )
            set_span_attribute(
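
All four hook sites in this file unpack a four-tuple from _extract_anthropic_tokens, whose body is not part of this diff. A plausible sketch of that helper, assuming it reads the anthropic SDK's Usage fields (the real implementation may differ):

def _extract_anthropic_tokens(usage):
    # Assumed mapping: anthropic's Usage exposes input_tokens (non-cached),
    # output_tokens, cache_read_input_tokens, and cache_creation_input_tokens.
    prompt_tokens = getattr(usage, "input_tokens", 0) or 0
    completion_tokens = getattr(usage, "output_tokens", 0) or 0
    cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0
    cache_creation = getattr(usage, "cache_creation_input_tokens", 0) or 0
    return prompt_tokens, completion_tokens, cache_read, cache_creation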

src/judgeval/tracer/llm/llm_anthropic/messages_stream.py

Lines changed: 12 additions & 12 deletions
@@ -44,7 +44,7 @@ def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:

        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

@@ -125,22 +125,22 @@ def post_hook_exit_impl() -> None:
                ) = _extract_anthropic_tokens(final_message.usage)
                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
+                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                    prompt_tokens,
                )
                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
+                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                    completion_tokens,
                )
                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
+                    AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                    cache_read,
                )
                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                    AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                    cache_creation,
                )
                set_span_attribute(
@@ -151,7 +151,7 @@ def post_hook_exit_impl() -> None:

                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_RESPONSE_MODEL,
+                    AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                    final_message.model,
                )
            except Exception:
@@ -190,7 +190,7 @@ def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:

        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )
        ctx["accumulated_content"] = ""

@@ -271,22 +271,22 @@ async def post_hook_aexit_impl() -> None:
                ) = _extract_anthropic_tokens(final_message.usage)
                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS,
+                    AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
                    prompt_tokens,
                )
                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS,
+                    AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS,
                    completion_tokens,
                )
                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS,
+                    AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS,
                    cache_read,
                )
                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                    AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                    cache_creation,
                )
                set_span_attribute(
@@ -297,7 +297,7 @@ async def post_hook_aexit_impl() -> None:

                set_span_attribute(
                    span,
-                    AttributeKeys.GEN_AI_RESPONSE_MODEL,
+                    AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
                    final_message.model,
                )
            except Exception:
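
The streaming hooks call set_span_attribute unconditionally, even before usage data exists on the final message. A minimal sketch of the guard that pattern implies, assuming the helper skips unset values (its actual definition lives elsewhere in the tracer):

from typing import Any, Optional
from opentelemetry.trace import Span

def set_span_attribute(span: Optional[Span], key: str, value: Any) -> None:
    # Assumed guard: drop None spans and None values so hooks can call
    # this unconditionally before token counts are known.
    if span is not None and value is not None:
        span.set_attribute(key, value)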

src/judgeval/tracer/llm/llm_google/generate_content.py

Lines changed: 8 additions & 6 deletions
@@ -63,7 +63,7 @@ def pre_hook(ctx: Dict[str, Any], *args: Any, **kwargs: Any) -> None:
        )
        ctx["model_name"] = kwargs.get("model", "")
        set_span_attribute(
-            ctx["span"], AttributeKeys.GEN_AI_REQUEST_MODEL, ctx["model_name"]
+            ctx["span"], AttributeKeys.JUDGMENT_LLM_MODEL_NAME, ctx["model_name"]
        )

    def post_hook(ctx: Dict[str, Any], result: GenerateContentResponse) -> None:
@@ -79,17 +79,19 @@ def post_hook(ctx: Dict[str, Any], result: GenerateContentResponse) -> None:
                _extract_google_tokens(usage_data)
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS, prompt_tokens
+                span,
+                AttributeKeys.JUDGMENT_USAGE_NON_CACHED_INPUT_TOKENS,
+                prompt_tokens,
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS, completion_tokens
+                span, AttributeKeys.JUDGMENT_USAGE_OUTPUT_TOKENS, completion_tokens
            )
            set_span_attribute(
-                span, AttributeKeys.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
+                span, AttributeKeys.JUDGMENT_USAGE_CACHE_READ_INPUT_TOKENS, cache_read
            )
            set_span_attribute(
                span,
-                AttributeKeys.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS,
+                AttributeKeys.JUDGMENT_USAGE_CACHE_CREATION_INPUT_TOKENS,
                cache_creation,
            )
            set_span_attribute(
@@ -100,7 +102,7 @@ def post_hook(ctx: Dict[str, Any], result: GenerateContentResponse) -> None:

        set_span_attribute(
            span,
-            AttributeKeys.GEN_AI_RESPONSE_MODEL,
+            AttributeKeys.JUDGMENT_LLM_MODEL_NAME,
            result.model_version if result.model_version else ctx["model_name"],
        )
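
As with the Anthropic helper, _extract_google_tokens is not shown in this diff. A sketch of a mapping consistent with the renamed buckets, assuming google-genai's usage_metadata field names and assuming prompt_token_count includes cached tokens (both assumptions, not statements from this commit):

def _extract_google_tokens(usage_metadata):
    # Assumed field names from google-genai's usage metadata; Gemini reports
    # no cache-creation count, so that bucket is zero here.
    prompt = usage_metadata.prompt_token_count or 0
    cached = usage_metadata.cached_content_token_count or 0
    completion = usage_metadata.candidates_token_count or 0
    non_cached_input = max(prompt - cached, 0)
    return non_cached_input, completion, cached, 0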
