LayerLens · m-peko · Aug 26, 2025 · Aug 25, 2025 · Aug 25, 2025 · Aug 25, 2025
diff --git a/.github/workflows/check-format.yaml b/.github/workflows/check-format.yaml
@@ -0,0 +1,37 @@
+name: Check Format
+
+on:  # runs for pushes to main and for pull requests targeting main
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  format:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Install Rye
+        uses: eifinger/setup-rye@v4
+        with:
+          version: latest
+
+      - name: Sync Rye environment
+        run: rye sync
+
+      - name: Run format script and check output
+        run: |
+          # Record the exit status instead of aborting, so the output is always printed
+          OUTPUT=$(./scripts/format 2>&1) && STATUS=0 || STATUS=$?
+          echo "$OUTPUT"
+          if [ "$STATUS" -ne 0 ]; then exit "$STATUS"; fi  # the format script itself failed
+          # Also fail when the script reports that it reformatted any file
+          if echo "$OUTPUT" | grep -q "reformatted"; then
+            echo "Some files were reformatted. Please run './scripts/format' locally and commit changes."
+            exit 1
+          fi
diff --git a/.github/workflows/check-lint.yaml b/.github/workflows/check-lint.yaml
@@ -0,0 +1,28 @@
+name: Check Lint
+
+on:  # runs for pushes to main and for pull requests targeting main
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  lint:  # job id names what this job does (was "format", copied from the format workflow)
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Install Rye
+        uses: eifinger/setup-rye@v4
+        with:
+          version: latest
+
+      - name: Sync Rye environment
+        run: rye sync
+
+      - name: Check lint
+        run: ./scripts/lint
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 node_modules/
+venv/
 
 .prism.log
 _dev

diff --git a/examples/async_run_evaluations.py b/examples/async_run_evaluations.py
@@ -9,7 +9,7 @@ async def create_and_run_evaluation(client, model, benchmark, eval_number):
     """Create and run a single evaluation, tracking progress."""
     try:
         print(f"Starting evaluation #{eval_number}...")
-        
+
         # Create evaluation
         evaluation = await client.evaluations.create(model=model, benchmark=benchmark)
         print(f"✓ Created evaluation #{eval_number}: {evaluation.id}, status={evaluation.status}")
@@ -18,7 +18,7 @@ async def create_and_run_evaluation(client, model, benchmark, eval_number):
         evaluation = await client.evaluations.wait_for_completion(
             evaluation,
             interval_seconds=10,
-            timeout_seconds=600  # 10 minutes
+            timeout_seconds=600,  # 10 minutes
         )
         print(f"✓ Evaluation #{eval_number} ({evaluation.id}) finished with status={evaluation.status}")
 
@@ -30,7 +30,7 @@ async def create_and_run_evaluation(client, model, benchmark, eval_number):
         else:
             print(f"✗ Evaluation #{eval_number} did not succeed")
             return eval_number, evaluation.id, 0, False
-            
+
     except Exception as e:
         print(f"✗ Error in evaluation #{eval_number}: {e}")
         return eval_number, None, 0, False
@@ -51,29 +51,26 @@ async def main():
     # Use first model and benchmark for all evaluations
     target_model = models[0]
     target_benchmark = benchmarks[0]
-    
+
     print(f"Using model: {target_model}")
     print(f"Using benchmark: {target_benchmark}")
     print("=" * 80)
 
     # Create 3 evaluation tasks
     num_evaluations = 3
     print(f"Starting {num_evaluations} evaluations in parallel...")
-
-    tasks = [
-        create_and_run_evaluation(client, target_model, target_benchmark, i + 1)
-        for i in range(num_evaluations)
-    ]
+
+    tasks = [create_and_run_evaluation(client, target_model, target_benchmark, i + 1) for i in range(num_evaluations)]
 
     # Execute all evaluations concurrently
     results = await asyncio.gather(*tasks, return_exceptions=True)
-    
+
     # Summary
     print("=" * 80)
     print("SUMMARY:")
     successful = 0
     total_results = 0
-    
+
     for result in results:
         if isinstance(result, Exception):
             print(f"Exception occurred: {result}")
@@ -85,7 +82,7 @@ async def main():
                 print(f"Evaluation #{eval_num} ({eval_id}): SUCCESS - {result_count} results")
             else:
                 print(f"Evaluation #{eval_num} ({eval_id}): FAILED")
-    
+
     print(f"\nOverall: {successful}/{num_evaluations} evaluations succeeded")
     print(f"Total results collected: {total_results}")
 

diff --git a/examples/fetch_results_async.py b/examples/fetch_results_async.py
@@ -11,13 +11,13 @@ async def fetch_evaluation_results(client, evaluation_id):
         print(f"Fetching evaluation {evaluation_id}...")
         evaluation = await client.evaluations.get_by_id(evaluation_id)
         print(f"Found evaluation {evaluation.id}, status={evaluation.status}")
-        
+
         # Get all results for this evaluation
         results = await client.results.get_all(evaluation=evaluation)
         print(f"Loaded {len(results)} results for evaluation {evaluation_id}")
         print(f"Results for {evaluation_id}: {results}")
         print("-" * 80)
-        
+
         return evaluation_id, results
     except Exception as e:
         print(f"Error fetching evaluation {evaluation_id}: {e}")
@@ -30,23 +30,17 @@ async def main():
 
     # List of evaluation IDs to fetch exmple
 
-    evaluation_ids = [
-        "68a65a3de7ad047fb5d8e7d4",
-        "688a254c673f6b2835cc7278"
-    ]
+    evaluation_ids = ["68a65a3de7ad047fb5d8e7d4", "688a254c673f6b2835cc7278"]
 
     print(f"Starting async fetch for {len(evaluation_ids)} evaluations...")
     print("=" * 80)
 
     # Create tasks for concurrent execution
-    tasks = [
-        fetch_evaluation_results(client, eval_id) 
-        for eval_id in evaluation_ids
-    ]
+    tasks = [fetch_evaluation_results(client, eval_id) for eval_id in evaluation_ids]
 
     # Execute all tasks concurrently and print results as they complete
     results = await asyncio.gather(*tasks, return_exceptions=True)
-    
+
     print("=" * 80)
     print("Summary:")
     successful = sum(1 for _, result in results if result is not None and not isinstance(result, Exception))

diff --git a/examples/get_benchmarks.py b/examples/get_benchmarks.py
@@ -21,5 +21,6 @@ async def main():
     print(f"Found {len(benchmarks)} benchmarks with type {benchmark_type}")
     print(benchmarks)
 
+
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/examples/get_evaluation.py b/examples/get_evaluation.py
@@ -15,5 +15,6 @@ async def main():
     print(f"Found evaluation {evaluation.id}")
     print(evaluation)
 
+
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/examples/get_models.py b/examples/get_models.py
@@ -33,5 +33,6 @@ async def main():
     print(f"Found {len(models)} models with type {model_type}")
     print(models)
 
+
 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/examples/paginated_results.py b/examples/paginated_results.py
@@ -16,7 +16,7 @@ async def main():
     # --- Benchmarks
     benchmarks = await client.benchmarks.get()
     print(f"Found {len(benchmarks)} benchmarks")
-    
+
     # --- Create evaluation
     evaluation = await client.evaluations.create(
         model=models[0],
@@ -37,66 +37,62 @@ async def main():
     # --- Results with pagination
     if evaluation.is_success:
         print("Fetching all results with pagination...")
-        
+
         all_results = []
         page = 1
         page_size = 50
-        
+
         while True:
             print(f"Fetching page {page} (page size: {page_size})...")
-            
+
             # Get results for current page
-            results_data = await client.results.get_by_id(
-                evaluation_id=evaluation.id,
-                page=page,
-                page_size=page_size
-            )
-
+            results_data = await client.results.get_by_id(evaluation_id=evaluation.id, page=page, page_size=page_size)
+
             if not results_data or not results_data.results:
                 print("No more results to fetch")
                 break
-            
+
             # Add current page results to our collection
             all_results.extend(results_data.results)
-            
+
             # Show progress
             if page == 1:
                 total_count = results_data.pagination.total_count
                 total_pages = results_data.pagination.total_pages
                 print(f"Total results: {total_count:,}")
                 print(f"Total pages: {total_pages}")
-            
+
             print(f"Page {page}: Retrieved {len(results_data.results)} results")
             print(f"Running total: {len(all_results):,} results")
-            
+
             # Check if we've reached the last page
             if page >= results_data.pagination.total_pages:
                 print("Reached last page")
                 break
-            
+
             page += 1
-        
+
         # Summary of all results
         print(f"\n=== PAGINATION COMPLETE ===")
         print(f"Total results collected: {len(all_results):,}")
-        
+
         if all_results:
             # Calculate some basic statistics
             correct_answers = sum(1 for r in all_results if r.score > 0.5)
             accuracy = correct_answers / len(all_results)
             avg_score = sum(r.score for r in all_results) / len(all_results)
-            
+
             print(f"Overall accuracy: {accuracy:.1%} ({correct_answers:,}/{len(all_results):,})")
             print(f"Average score: {avg_score:.3f}")
-            
+
             # Show a few example results
             print(f"\nFirst 3 results:")
             for i, result in enumerate(all_results[:3], 1):
                 print(f"  {i}. Score: {result.score:.3f}, Subset: {result.subset}")
                 print(f"     Prompt: {result.prompt[:100]}...")
                 print(f"     Response: {result.result[:100]}...")
                 print()
-        
+
     else:
         print("Evaluation did not succeed, no results to show.")
 

diff --git a/src/atlas/_client.py b/src/atlas/_client.py
@@ -54,7 +54,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LAYERLENS_ATLAS_BASE_URL")
         if base_url is None:
-            base_url = "https://8bg48mbhyi.execute-api.us-east-1.amazonaws.com/prod/api/v1"
+            base_url = "https://api.layerlens.ai/api/v1"
 
         super().__init__(
             base_url=base_url,
@@ -196,7 +196,7 @@ def __init__(
         if base_url is None:
             base_url = os.environ.get("LAYERLENS_ATLAS_BASE_URL")
         if base_url is None:
-            base_url = "https://8bg48mbhyi.execute-api.us-east-1.amazonaws.com/prod/api/v1"
+            base_url = "https://api.layerlens.ai/api/v1"
 
         super().__init__(base_url=base_url, timeout=timeout)
 

diff --git a/src/atlas/models/__init__.py b/src/atlas/models/__init__.py
@@ -6,29 +6,31 @@
     BenchmarksResponse,
     EvaluationsResponse,
     OrganizationResponse,
+    CreateEvaluationsResponse,
 )
 from .model import Model, CustomModel, PublicModel
 from .benchmark import Benchmark, CustomBenchmark, PublicBenchmark
 from .evaluation import Result, Evaluation, EvaluationStatus
 from .organization import Project, Organization
 
 __all__ = [
-    "BenchmarksResponse",
-    "EvaluationsResponse",
-    "ModelsResponse",
-    "OrganizationResponse",
-    "ResultsResponse",
     "Benchmark",
+    "BenchmarksResponse",
+    "CreateEvaluationsResponse",
     "CustomBenchmark",
-    "PublicBenchmark",
+    "CustomModel",
     "Evaluation",
     "EvaluationStatus",
-    "Pagination",
-    "Result",
-    "ResultMetrics",
+    "EvaluationsResponse",
     "Model",
-    "CustomModel",
-    "PublicModel",
+    "ModelsResponse",
     "Organization",
+    "OrganizationResponse",
+    "Pagination",
     "Project",
+    "PublicBenchmark",
+    "PublicModel",
+    "Result",
+    "ResultMetrics",
+    "ResultsResponse",
 ]
diff --git a/src/atlas/models/api.py b/src/atlas/models/api.py
@@ -19,10 +19,15 @@ class Data(BaseModel):
     data: Data
 
 
-class EvaluationsResponse(BaseModel):
+class CreateEvaluationsResponse(BaseModel):
     data: List[Evaluation]
 
 
+class EvaluationsResponse(BaseModel):
+    evaluations: List[Evaluation]
+    pagination: Pagination
+
+
 class ModelsResponse(BaseModel):
     class Data(BaseModel):
         models: List[Model]

diff --git a/src/atlas/resources/benchmarks/benchmarks.py b/src/atlas/resources/benchmarks/benchmarks.py
@@ -16,7 +16,7 @@ def get(
         timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
         type: Literal["custom", "public"] | None = None,
         name: Optional[str] = None,
-    ) -> List[Benchmark] | None:
+    ) -> Optional[List[Benchmark]]:
         base_url = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/benchmarks"
 
         def fetch(bench_type: str) -> BenchmarksResponse | None:
@@ -61,7 +61,7 @@ async def get(
         timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
         type: Literal["custom", "public"] | None = None,
         name: Optional[str] = None,
-    ) -> List[Benchmark] | None:
+    ) -> Optional[List[Benchmark]]:
         base_url = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/benchmarks"
 
         async def fetch(bench_type: str) -> Optional[BenchmarksResponse]: