From 2307be140b03ff1096178f81b2d67d9b7b52fae2 Mon Sep 17 00:00:00 2001 From: Ayush8923 <80516839+Ayush8923@users.noreply.github.com> Date: Wed, 25 Mar 2026 11:05:36 +0530 Subject: [PATCH 1/8] feat(*): Added new OpenAI models --- backend/app/models/llm/constants.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/app/models/llm/constants.py b/backend/app/models/llm/constants.py index 8cb8f71b3..a604d2a50 100644 --- a/backend/app/models/llm/constants.py +++ b/backend/app/models/llm/constants.py @@ -22,6 +22,13 @@ "o1", "o1-preview", "o1-mini", + "gpt-5.4-pro", + "gpt-5.4-mini", + "gpt-5.4-nano", + "gpt-5", + "gpt-4-turbo", + "gpt-4", + "gpt-3.5-turbo", ], } From 6b3abab80554bc801d9430c49fdbb9b82bde6471 Mon Sep 17 00:00:00 2001 From: Ayush8923 <80516839+Ayush8923@users.noreply.github.com> Date: Wed, 25 Mar 2026 11:11:35 +0530 Subject: [PATCH 2/8] fix(*): formatting fixes --- backend/app/celery/celery_app.py | 1 - backend/app/celery/tasks/job_execution.py | 28 ++++++--- backend/app/celery/utils.py | 76 ++++++++++++++++------- 3 files changed, 76 insertions(+), 29 deletions(-) diff --git a/backend/app/celery/celery_app.py b/backend/app/celery/celery_app.py index 0dd72df2c..45364a72f 100644 --- a/backend/app/celery/celery_app.py +++ b/backend/app/celery/celery_app.py @@ -97,4 +97,3 @@ def warm_llm_modules(**_) -> None: broker_connection_retry_on_startup=True, broker_pool_limit=settings.CELERY_BROKER_POOL_LIMIT, ) - diff --git a/backend/app/celery/tasks/job_execution.py b/backend/app/celery/tasks/job_execution.py index aaa763830..a1663179d 100644 --- a/backend/app/celery/tasks/job_execution.py +++ b/backend/app/celery/tasks/job_execution.py @@ -70,7 +70,9 @@ def run_doctransform_job(self, project_id: int, job_id: str, trace_id: str, **kw @celery_app.task(bind=True, queue="low_priority", priority=1) -def run_create_collection_job(self, project_id: int, job_id: str, trace_id: str, **kwargs): +def run_create_collection_job( + self, project_id: int, job_id: str, trace_id: str, **kwargs +): from app.services.collections.create_collection import execute_job _set_trace(trace_id) @@ -84,7 +86,9 @@ def run_create_collection_job(self, project_id: int, job_id: str, trace_id: str, @celery_app.task(bind=True, queue="low_priority", priority=1) -def run_delete_collection_job(self, project_id: int, job_id: str, trace_id: str, **kwargs): +def run_delete_collection_job( + self, project_id: int, job_id: str, trace_id: str, **kwargs +): from app.services.collections.delete_collection import execute_job _set_trace(trace_id) @@ -98,7 +102,9 @@ def run_delete_collection_job(self, project_id: int, job_id: str, trace_id: str, @celery_app.task(bind=True, queue="low_priority", priority=1) -def run_stt_batch_submission(self, project_id: int, job_id: str, trace_id: str, **kwargs): +def run_stt_batch_submission( + self, project_id: int, job_id: str, trace_id: str, **kwargs +): from app.services.stt_evaluations.batch_job import execute_batch_submission _set_trace(trace_id) @@ -112,7 +118,9 @@ def run_stt_batch_submission(self, project_id: int, job_id: str, trace_id: str, @celery_app.task(bind=True, queue="low_priority", priority=1) -def run_stt_metric_computation(self, project_id: int, job_id: str, trace_id: str, **kwargs): +def run_stt_metric_computation( + self, project_id: int, job_id: str, trace_id: str, **kwargs +): from app.services.stt_evaluations.metric_job import execute_metric_computation _set_trace(trace_id) @@ -126,7 +134,9 @@ def run_stt_metric_computation(self, project_id: int, job_id: str, 
trace_id: str @celery_app.task(bind=True, queue="low_priority", priority=1) -def run_tts_batch_submission(self, project_id: int, job_id: str, trace_id: str, **kwargs): +def run_tts_batch_submission( + self, project_id: int, job_id: str, trace_id: str, **kwargs +): from app.services.tts_evaluations.batch_job import execute_batch_submission _set_trace(trace_id) @@ -140,8 +150,12 @@ def run_tts_batch_submission(self, project_id: int, job_id: str, trace_id: str, @celery_app.task(bind=True, queue="low_priority", priority=1) -def run_tts_result_processing(self, project_id: int, job_id: str, trace_id: str, **kwargs): - from app.services.tts_evaluations.batch_result_processing import execute_tts_result_processing +def run_tts_result_processing( + self, project_id: int, job_id: str, trace_id: str, **kwargs +): + from app.services.tts_evaluations.batch_result_processing import ( + execute_tts_result_processing, + ) _set_trace(trace_id) return execute_tts_result_processing( diff --git a/backend/app/celery/utils.py b/backend/app/celery/utils.py index 3fd871724..e4b2a2e3f 100644 --- a/backend/app/celery/utils.py +++ b/backend/app/celery/utils.py @@ -12,12 +12,12 @@ logger = logging.getLogger(__name__) -def start_llm_job( - project_id: int, job_id: str, trace_id: str = "N/A", **kwargs -) -> str: +def start_llm_job(project_id: int, job_id: str, trace_id: str = "N/A", **kwargs) -> str: from app.celery.tasks.job_execution import run_llm_job - task = run_llm_job.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) + task = run_llm_job.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) logger.info(f"[start_llm_job] Started job {job_id} with Celery task {task.id}") return task.id @@ -27,8 +27,12 @@ def start_llm_chain_job( ) -> str: from app.celery.tasks.job_execution import run_llm_chain_job - task = run_llm_chain_job.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) - logger.info(f"[start_llm_chain_job] Started job {job_id} with Celery task {task.id}") + task = run_llm_chain_job.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) + logger.info( + f"[start_llm_chain_job] Started job {job_id} with Celery task {task.id}" + ) return task.id @@ -37,7 +41,9 @@ def start_response_job( ) -> str: from app.celery.tasks.job_execution import run_response_job - task = run_response_job.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) + task = run_response_job.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) logger.info(f"[start_response_job] Started job {job_id} with Celery task {task.id}") return task.id @@ -47,8 +53,12 @@ def start_doctransform_job( ) -> str: from app.celery.tasks.job_execution import run_doctransform_job - task = run_doctransform_job.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) - logger.info(f"[start_doctransform_job] Started job {job_id} with Celery task {task.id}") + task = run_doctransform_job.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) + logger.info( + f"[start_doctransform_job] Started job {job_id} with Celery task {task.id}" + ) return task.id @@ -57,8 +67,12 @@ def start_create_collection_job( ) -> str: from app.celery.tasks.job_execution import run_create_collection_job - task = run_create_collection_job.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) - logger.info(f"[start_create_collection_job] Started job {job_id} with Celery task {task.id}") + task = 
run_create_collection_job.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) + logger.info( + f"[start_create_collection_job] Started job {job_id} with Celery task {task.id}" + ) return task.id @@ -67,8 +81,12 @@ def start_delete_collection_job( ) -> str: from app.celery.tasks.job_execution import run_delete_collection_job - task = run_delete_collection_job.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) - logger.info(f"[start_delete_collection_job] Started job {job_id} with Celery task {task.id}") + task = run_delete_collection_job.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) + logger.info( + f"[start_delete_collection_job] Started job {job_id} with Celery task {task.id}" + ) return task.id @@ -77,8 +95,12 @@ def start_stt_batch_submission( ) -> str: from app.celery.tasks.job_execution import run_stt_batch_submission - task = run_stt_batch_submission.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) - logger.info(f"[start_stt_batch_submission] Started job {job_id} with Celery task {task.id}") + task = run_stt_batch_submission.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) + logger.info( + f"[start_stt_batch_submission] Started job {job_id} with Celery task {task.id}" + ) return task.id @@ -87,8 +109,12 @@ def start_stt_metric_computation( ) -> str: from app.celery.tasks.job_execution import run_stt_metric_computation - task = run_stt_metric_computation.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) - logger.info(f"[start_stt_metric_computation] Started job {job_id} with Celery task {task.id}") + task = run_stt_metric_computation.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) + logger.info( + f"[start_stt_metric_computation] Started job {job_id} with Celery task {task.id}" + ) return task.id @@ -97,8 +123,12 @@ def start_tts_batch_submission( ) -> str: from app.celery.tasks.job_execution import run_tts_batch_submission - task = run_tts_batch_submission.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) - logger.info(f"[start_tts_batch_submission] Started job {job_id} with Celery task {task.id}") + task = run_tts_batch_submission.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) + logger.info( + f"[start_tts_batch_submission] Started job {job_id} with Celery task {task.id}" + ) return task.id @@ -107,8 +137,12 @@ def start_tts_result_processing( ) -> str: from app.celery.tasks.job_execution import run_tts_result_processing - task = run_tts_result_processing.delay(project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs) - logger.info(f"[start_tts_result_processing] Started job {job_id} with Celery task {task.id}") + task = run_tts_result_processing.delay( + project_id=project_id, job_id=job_id, trace_id=trace_id, **kwargs + ) + logger.info( + f"[start_tts_result_processing] Started job {job_id} with Celery task {task.id}" + ) return task.id From 17374e4b407ee7823b3f982fc078e2214bb60c0a Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 25 Mar 2026 13:33:44 +0530 Subject: [PATCH 3/8] removing default --- backend/app/models/llm/request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index 1317c9ef3..ef48902e7 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -31,7 +31,7 @@ class TextLLMParams(SQLModel): 
description="Reasoning configuration or instructions", ) temperature: float | None = Field( - default=0.1, + default=None, ge=0.0, le=2.0, ) From c15fbde9ffc4fc8054035ce5381e2d2ea33e16fb Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 25 Mar 2026 15:38:37 +0530 Subject: [PATCH 4/8] remove templ --- backend/app/models/llm/request.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py index ef48902e7..8cc5f5c3e 100644 --- a/backend/app/models/llm/request.py +++ b/backend/app/models/llm/request.py @@ -31,7 +31,7 @@ class TextLLMParams(SQLModel): description="Reasoning configuration or instructions", ) temperature: float | None = Field( - default=None, + default=0.1, ge=0.0, le=2.0, ) @@ -251,6 +251,7 @@ def validate_params(self): provider = self.provider provider_was_auto_assigned = True + user_provided_temperature = "temperature" in self.params validated = model_class.model_validate(self.params) if provider is not None: @@ -288,6 +289,8 @@ def validate_params(self): ) self.params = validated.model_dump(exclude_none=True) + if not user_provided_temperature: + self.params.pop("temperature", None) return self From c462653832d942d6127336c0d31f59dba3d0d4c7 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Wed, 25 Mar 2026 22:24:42 +0530 Subject: [PATCH 5/8] updated batch jsonl temperature defaults --- backend/app/crud/evaluations/batch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py index 1ef6f9052..1385c5b9c 100644 --- a/backend/app/crud/evaluations/batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -106,12 +106,12 @@ def build_evaluation_jsonl( body: dict[str, Any] = { "model": config.model, "instructions": config.instructions, - "temperature": config.temperature - if config.temperature is not None - else 0.01, "input": question, # Add input from dataset } + if config.temperature is not None: + body["temperature"] = config.temperature + # Add reasoning only if provided if config.reasoning: body["reasoning"] = {"effort": config.reasoning} From aa8b3815d15d994e1048eb32019550a006cd9784 Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 26 Mar 2026 14:06:47 +0530 Subject: [PATCH 6/8] remove from batch job --- backend/app/crud/evaluations/batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py index 1385c5b9c..13fb9a50b 100644 --- a/backend/app/crud/evaluations/batch.py +++ b/backend/app/crud/evaluations/batch.py @@ -109,7 +109,7 @@ def build_evaluation_jsonl( "input": question, # Add input from dataset } - if config.temperature is not None: + if "temperature" in config.model_fields_set: body["temperature"] = config.temperature # Add reasoning only if provided @@ -189,7 +189,7 @@ def start_evaluation_batch( "description": f"Evaluation: {eval_run.run_name}", "completion_window": "24h", # Store complete config for reference - "evaluation_config": config.model_dump(exclude_none=True), + "evaluation_config": config.model_dump(exclude_unset=True), } # Step 5: Start batch job using generic infrastructure From 2637779b761d7062dc64168d498f1b94634d33fc Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 27 Mar 2026 17:28:24 +0530 Subject: [PATCH 7/8] added testcases --- .../app/tests/api/routes/test_evaluation.py | 59 ++++++++ backend/app/tests/models/__init__.py | 0 
backend/app/tests/models/llm/__init__.py | 0 backend/app/tests/models/llm/test_request.py | 142 ++++++++++++++++++ 4 files changed, 201 insertions(+) create mode 100644 backend/app/tests/models/__init__.py create mode 100644 backend/app/tests/models/llm/__init__.py create mode 100644 backend/app/tests/models/llm/test_request.py diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py index 4b751a59a..17d647aea 100644 --- a/backend/app/tests/api/routes/test_evaluation.py +++ b/backend/app/tests/api/routes/test_evaluation.py @@ -731,6 +731,65 @@ def test_build_batch_jsonl_multiple_items(self) -> None: assert request_dict["body"]["input"] == f"Question {i}" assert request_dict["body"]["model"] == "gpt-4o" + def test_build_batch_jsonl_temperature_included_when_explicitly_set(self) -> None: + """When temperature is explicitly set, it should appear in the JSONL body.""" + dataset_items = [ + { + "id": "item1", + "input": {"question": "Test question"}, + "expected_output": {"answer": "Test answer"}, + "metadata": {}, + } + ] + + config = TextLLMParams(model="gpt-4o", temperature=0.5) + + jsonl_data = build_evaluation_jsonl(dataset_items, config) + + assert len(jsonl_data) == 1 + assert "temperature" in jsonl_data[0]["body"] + assert jsonl_data[0]["body"]["temperature"] == 0.5 + + def test_build_batch_jsonl_temperature_excluded_when_not_set(self) -> None: + """When temperature is not explicitly set, it should NOT appear in the JSONL body.""" + dataset_items = [ + { + "id": "item1", + "input": {"question": "Test question"}, + "expected_output": {"answer": "Test answer"}, + "metadata": {}, + } + ] + + # Only model provided — temperature not in model_fields_set + config = TextLLMParams(model="gpt-4o") + + jsonl_data = build_evaluation_jsonl(dataset_items, config) + + assert len(jsonl_data) == 1 + assert "temperature" not in jsonl_data[0]["body"] + + def test_build_batch_jsonl_temperature_zero_included_when_explicitly_set( + self, + ) -> None: + """When temperature is explicitly set to 0.0, it should still appear in the body.""" + dataset_items = [ + { + "id": "item1", + "input": {"question": "Test question"}, + "expected_output": {"answer": "Test answer"}, + "metadata": {}, + } + ] + + config = TextLLMParams(model="gpt-4o", temperature=0.0) + + jsonl_data = build_evaluation_jsonl(dataset_items, config) + + assert len(jsonl_data) == 1 + assert "temperature" in jsonl_data[0]["body"] + assert jsonl_data[0]["body"]["temperature"] == 0.0 + class TestGetEvaluationRunStatus: """Test GET /evaluations/{evaluation_id} endpoint.""" diff --git a/backend/app/tests/models/__init__.py b/backend/app/tests/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/app/tests/models/llm/__init__.py b/backend/app/tests/models/llm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/app/tests/models/llm/test_request.py b/backend/app/tests/models/llm/test_request.py new file mode 100644 index 000000000..d18428705 --- /dev/null +++ b/backend/app/tests/models/llm/test_request.py @@ -0,0 +1,142 @@ +import pytest +from pydantic import ValidationError + +from app.models.llm.request import KaapiCompletionConfig + + +class TestKaapiCompletionConfigTemperature: + """Test temperature handling in KaapiCompletionConfig.validate_params.""" + + def test_temperature_preserved_when_user_provides_it(self) -> None: + """When user explicitly provides temperature, it should be in params.""" + config = KaapiCompletionConfig( + 
provider="openai", + type="text", + params={ + "model": "gpt-4o", + "temperature": 0.7, + }, + ) + + assert "temperature" in config.params + assert config.params["temperature"] == 0.7 + + def test_temperature_excluded_when_user_does_not_provide_it(self) -> None: + """When user does not provide temperature, it should NOT be in params + even though TextLLMParams has a default of 0.1.""" + config = KaapiCompletionConfig( + provider="openai", + type="text", + params={ + "model": "gpt-4o", + }, + ) + + assert "temperature" not in config.params + + def test_temperature_zero_preserved_when_explicitly_set(self) -> None: + """When user explicitly sets temperature to 0.0, it should be preserved.""" + config = KaapiCompletionConfig( + provider="openai", + type="text", + params={ + "model": "gpt-4o", + "temperature": 0.0, + }, + ) + + assert "temperature" in config.params + assert config.params["temperature"] == 0.0 + + def test_temperature_with_instructions(self) -> None: + """Temperature should be preserved alongside other params when provided.""" + config = KaapiCompletionConfig( + provider="openai", + type="text", + params={ + "model": "gpt-4o", + "instructions": "Be helpful", + "temperature": 1.5, + }, + ) + + assert config.params["temperature"] == 1.5 + assert config.params["instructions"] == "Be helpful" + + def test_no_temperature_with_other_params(self) -> None: + """When temperature is not provided, other params should still be present.""" + config = KaapiCompletionConfig( + provider="openai", + type="text", + params={ + "model": "gpt-4o", + "instructions": "Be helpful", + "reasoning": "high", + }, + ) + + assert "temperature" not in config.params + assert config.params["instructions"] == "Be helpful" + assert config.params["reasoning"] == "high" + + +class TestNewSupportedModels: + """Test that newly added models are accepted for openai/text provider.""" + + @pytest.mark.parametrize( + "model", + [ + "gpt-5.4-pro", + "gpt-5.4-mini", + "gpt-5.4-nano", + "gpt-5", + "gpt-4-turbo", + "gpt-4", + "gpt-3.5-turbo", + ], + ) + def test_new_model_accepted(self, model: str) -> None: + """New models should be accepted for openai text provider.""" + config = KaapiCompletionConfig( + provider="openai", + type="text", + params={"model": model}, + ) + + assert config.params["model"] == model + + @pytest.mark.parametrize( + "model", + [ + "gpt-4o", + "gpt-4o-mini", + "gpt-4.1", + "gpt-4.1-mini", + "gpt-4.1-nano", + "gpt-5.4", + "gpt-5.1", + "gpt-5-mini", + "gpt-5-nano", + "o1", + "o1-preview", + "o1-mini", + ], + ) + def test_existing_models_still_accepted(self, model: str) -> None: + """Previously supported models should still be accepted.""" + config = KaapiCompletionConfig( + provider="openai", + type="text", + params={"model": model}, + ) + + assert config.params["model"] == model + + def test_unsupported_model_rejected(self) -> None: + """An unsupported model should raise a validation error.""" + with pytest.raises(ValidationError, match="not supported"): + KaapiCompletionConfig( + provider="openai", + type="text", + params={"model": "unsupported-model-xyz"}, + ) From 3ab3fa395bcbc4f8ea3cfe5adea1cfe814191f8f Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Sat, 28 Mar 2026 10:19:50 +0530 Subject: [PATCH 8/8] added testcases --- backend/app/tests/models/llm/test_request.py | 31 -------------------- 1 file changed, 31 deletions(-) diff --git a/backend/app/tests/models/llm/test_request.py b/backend/app/tests/models/llm/test_request.py index d18428705..3d40f607a 100644 --- 
a/backend/app/tests/models/llm/test_request.py +++ b/backend/app/tests/models/llm/test_request.py @@ -48,37 +48,6 @@ def test_temperature_zero_preserved_when_explicitly_set(self) -> None: assert "temperature" in config.params assert config.params["temperature"] == 0.0 - def test_temperature_with_instructions(self) -> None: - """Temperature should be preserved alongside other params when provided.""" - config = KaapiCompletionConfig( - provider="openai", - type="text", - params={ - "model": "gpt-4o", - "instructions": "Be helpful", - "temperature": 1.5, - }, - ) - - assert config.params["temperature"] == 1.5 - assert config.params["instructions"] == "Be helpful" - - def test_no_temperature_with_other_params(self) -> None: - """When temperature is not provided, other params should still be present.""" - config = KaapiCompletionConfig( - provider="openai", - type="text", - params={ - "model": "gpt-4o", - "instructions": "Be helpful", - "reasoning": "high", - }, - ) - - assert "temperature" not in config.params - assert config.params["instructions"] == "Be helpful" - assert config.params["reasoning"] == "high" - class TestNewSupportedModels: """Test that newly added models are accepted for openai/text provider."""
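
Note on the pattern behind patches 4 and 6: both hinge on telling "the caller set temperature" apart from "temperature fell back to a default". Patch 4 checks the raw params dict before validation, while patch 6 leans on Pydantic v2's model_fields_set and model_dump(exclude_unset=True). The sketch below illustrates the Pydantic side only; Params is a hypothetical stand-in, not the repo's SQLModel-based TextLLMParams.

from pydantic import BaseModel


class Params(BaseModel):  # hypothetical stand-in for TextLLMParams
    model: str
    temperature: float | None = 0.1  # class default, as restored in patch 4


explicit = Params(model="gpt-4o", temperature=0.1)
implicit = Params(model="gpt-4o")

# model_fields_set records only fields passed at construction time,
# even when the passed value happens to equal the default.
assert "temperature" in explicit.model_fields_set
assert "temperature" not in implicit.model_fields_set

# exclude_unset drops defaulted fields at serialization, which is why the
# batch JSONL body carries temperature only when it was deliberately chosen.
assert explicit.model_dump(exclude_unset=True) == {
    "model": "gpt-4o",
    "temperature": 0.1,
}
assert implicit.model_dump(exclude_unset=True) == {"model": "gpt-4o"}

This split is what lets interactive requests keep the 0.1 default while the batch path omits the key entirely and defers to the provider-side default.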