Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions .github/workflows/check-format.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Fails CI when ./scripts/format errors out or reports that it reformatted files.
name: Check Format

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  format:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Install Rye
        uses: eifinger/setup-rye@v4
        with:
          version: latest

      - name: Sync Rye environment
        run: rye sync

      - name: Run format script and check output
        run: |
          # Capture the exit status instead of letting errexit abort on the
          # assignment, so the script's output is always printed before failing.
          status=0
          OUTPUT=$(./scripts/format 2>&1) || status=$?
          echo "$OUTPUT"

          # Propagate a failure of the format script itself
          if [ "$status" -ne 0 ]; then
            echo "./scripts/format exited with status $status."
            exit "$status"
          fi

          # Fail if the output reports any reformatted file
          if echo "$OUTPUT" | grep -q "reformatted"; then
            echo "Some files were reformatted. Please run './scripts/format' locally and commit changes."
            exit 1
          fi
28 changes: 28 additions & 0 deletions .github/workflows/check-lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Runs ./scripts/lint on pushes and pull requests targeting main.
name: Check Lint

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  # Job id names the status check; it was "format", copied from the format workflow.
  lint:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Install Rye
        uses: eifinger/setup-rye@v4
        with:
          version: latest

      - name: Sync Rye environment
        run: rye sync

      # The step fails when the lint script exits non-zero.
      - name: Check lint
        run: ./scripts/lint
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
node_modules/
venv/

.prism.log
_dev
Expand Down
21 changes: 9 additions & 12 deletions examples/async_run_evaluations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ async def create_and_run_evaluation(client, model, benchmark, eval_number):
"""Create and run a single evaluation, tracking progress."""
try:
print(f"Starting evaluation #{eval_number}...")

# Create evaluation
evaluation = await client.evaluations.create(model=model, benchmark=benchmark)
print(f"✓ Created evaluation #{eval_number}: {evaluation.id}, status={evaluation.status}")
Expand All @@ -18,7 +18,7 @@ async def create_and_run_evaluation(client, model, benchmark, eval_number):
evaluation = await client.evaluations.wait_for_completion(
evaluation,
interval_seconds=10,
timeout_seconds=600 # 10 minutes
timeout_seconds=600, # 10 minutes
)
print(f"✓ Evaluation #{eval_number} ({evaluation.id}) finished with status={evaluation.status}")

Expand All @@ -30,7 +30,7 @@ async def create_and_run_evaluation(client, model, benchmark, eval_number):
else:
print(f"✗ Evaluation #{eval_number} did not succeed")
return eval_number, evaluation.id, 0, False

except Exception as e:
print(f"✗ Error in evaluation #{eval_number}: {e}")
return eval_number, None, 0, False
Expand All @@ -51,29 +51,26 @@ async def main():
# Use first model and benchmark for all evaluations
target_model = models[0]
target_benchmark = benchmarks[0]

print(f"Using model: {target_model}")
print(f"Using benchmark: {target_benchmark}")
print("=" * 80)

# Create 3 evaluation tasks
num_evaluations = 3
print(f"Starting {num_evaluations} evaluations in parallel...")

tasks = [
create_and_run_evaluation(client, target_model, target_benchmark, i + 1)
for i in range(num_evaluations)
]

tasks = [create_and_run_evaluation(client, target_model, target_benchmark, i + 1) for i in range(num_evaluations)]

# Execute all evaluations concurrently
results = await asyncio.gather(*tasks, return_exceptions=True)

# Summary
print("=" * 80)
print("SUMMARY:")
successful = 0
total_results = 0

for result in results:
if isinstance(result, Exception):
print(f"Exception occurred: {result}")
Expand All @@ -85,7 +82,7 @@ async def main():
print(f"Evaluation #{eval_num} ({eval_id}): SUCCESS - {result_count} results")
else:
print(f"Evaluation #{eval_num} ({eval_id}): FAILED")

print(f"\nOverall: {successful}/{num_evaluations} evaluations succeeded")
print(f"Total results collected: {total_results}")

Expand Down
16 changes: 5 additions & 11 deletions examples/fetch_results_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ async def fetch_evaluation_results(client, evaluation_id):
print(f"Fetching evaluation {evaluation_id}...")
evaluation = await client.evaluations.get_by_id(evaluation_id)
print(f"Found evaluation {evaluation.id}, status={evaluation.status}")

# Get all results for this evaluation
results = await client.results.get_all(evaluation=evaluation)
print(f"Loaded {len(results)} results for evaluation {evaluation_id}")
print(f"Results for {evaluation_id}: {results}")
print("-" * 80)

return evaluation_id, results
except Exception as e:
print(f"Error fetching evaluation {evaluation_id}: {e}")
Expand All @@ -30,23 +30,17 @@ async def main():

# Example list of evaluation IDs to fetch

evaluation_ids = [
"68a65a3de7ad047fb5d8e7d4",
"688a254c673f6b2835cc7278"
]
evaluation_ids = ["68a65a3de7ad047fb5d8e7d4", "688a254c673f6b2835cc7278"]

print(f"Starting async fetch for {len(evaluation_ids)} evaluations...")
print("=" * 80)

# Create tasks for concurrent execution
tasks = [
fetch_evaluation_results(client, eval_id)
for eval_id in evaluation_ids
]
tasks = [fetch_evaluation_results(client, eval_id) for eval_id in evaluation_ids]

# Execute all tasks concurrently and print results as they complete
results = await asyncio.gather(*tasks, return_exceptions=True)

print("=" * 80)
print("Summary:")
successful = sum(1 for _, result in results if result is not None and not isinstance(result, Exception))
Expand Down
1 change: 1 addition & 0 deletions examples/get_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ async def main():
print(f"Found {len(benchmarks)} benchmarks with type {benchmark_type}")
print(benchmarks)


if __name__ == "__main__":
asyncio.run(main())
1 change: 1 addition & 0 deletions examples/get_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ async def main():
print(f"Found evaluation {evaluation.id}")
print(evaluation)


if __name__ == "__main__":
asyncio.run(main())
1 change: 1 addition & 0 deletions examples/get_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,6 @@ async def main():
print(f"Found {len(models)} models with type {model_type}")
print(models)


if __name__ == "__main__":
asyncio.run(main())
36 changes: 16 additions & 20 deletions examples/paginated_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ async def main():
# --- Benchmarks
benchmarks = await client.benchmarks.get()
print(f"Found {len(benchmarks)} benchmarks")

# --- Create evaluation
evaluation = await client.evaluations.create(
model=models[0],
Expand All @@ -37,66 +37,62 @@ async def main():
# --- Results with pagination
if evaluation.is_success:
print("Fetching all results with pagination...")

all_results = []
page = 1
page_size = 50

while True:
print(f"Fetching page {page} (page size: {page_size})...")

# Get results for current page
results_data = await client.results.get_by_id(
evaluation_id=evaluation.id,
page=page,
page_size=page_size
)

results_data = await client.results.get_by_id(evaluation_id=evaluation.id, page=page, page_size=page_size)

if not results_data or not results_data.results:
print("No more results to fetch")
break

# Add current page results to our collection
all_results.extend(results_data.results)

# Show progress
if page == 1:
total_count = results_data.pagination.total_count
total_pages = results_data.pagination.total_pages
print(f"Total results: {total_count:,}")
print(f"Total pages: {total_pages}")

print(f"Page {page}: Retrieved {len(results_data.results)} results")
print(f"Running total: {len(all_results):,} results")

# Check if we've reached the last page
if page >= results_data.pagination.total_pages:
print("Reached last page")
break

page += 1

# Summary of all results
print(f"\n=== PAGINATION COMPLETE ===")
print(f"Total results collected: {len(all_results):,}")

if all_results:
# Calculate some basic statistics
correct_answers = sum(1 for r in all_results if r.score > 0.5)
accuracy = correct_answers / len(all_results)
avg_score = sum(r.score for r in all_results) / len(all_results)

print(f"Overall accuracy: {accuracy:.1%} ({correct_answers:,}/{len(all_results):,})")
print(f"Average score: {avg_score:.3f}")

# Show a few example results
print(f"\nFirst 3 results:")
for i, result in enumerate(all_results[:3], 1):
print(f" {i}. Score: {result.score:.3f}, Subset: {result.subset}")
print(f" Prompt: {result.prompt[:100]}...")
print(f" Response: {result.result[:100]}...")
print()

else:
print("Evaluation did not succeed, no results to show.")

Expand Down
4 changes: 2 additions & 2 deletions src/atlas/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __init__(
if base_url is None:
base_url = os.environ.get("LAYERLENS_ATLAS_BASE_URL")
if base_url is None:
base_url = "https://8bg48mbhyi.execute-api.us-east-1.amazonaws.com/prod/api/v1"
base_url = "https://api.layerlens.ai/api/v1"

super().__init__(
base_url=base_url,
Expand Down Expand Up @@ -196,7 +196,7 @@ def __init__(
if base_url is None:
base_url = os.environ.get("LAYERLENS_ATLAS_BASE_URL")
if base_url is None:
base_url = "https://8bg48mbhyi.execute-api.us-east-1.amazonaws.com/prod/api/v1"
base_url = "https://api.layerlens.ai/api/v1"

super().__init__(base_url=base_url, timeout=timeout)

Expand Down
24 changes: 13 additions & 11 deletions src/atlas/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,31 @@
BenchmarksResponse,
EvaluationsResponse,
OrganizationResponse,
CreateEvaluationsResponse,
)
from .model import Model, CustomModel, PublicModel
from .benchmark import Benchmark, CustomBenchmark, PublicBenchmark
from .evaluation import Result, Evaluation, EvaluationStatus
from .organization import Project, Organization

__all__ = [
"BenchmarksResponse",
"EvaluationsResponse",
"ModelsResponse",
"OrganizationResponse",
"ResultsResponse",
"Benchmark",
"BenchmarksResponse",
"CreateEvaluationsResponse",
"CustomBenchmark",
"PublicBenchmark",
"CustomModel",
"Evaluation",
"EvaluationStatus",
"Pagination",
"Result",
"ResultMetrics",
"EvaluationsResponse",
"Model",
"CustomModel",
"PublicModel",
"ModelsResponse",
"Organization",
"OrganizationResponse",
"Pagination",
"Project",
"PublicBenchmark",
"PublicModel",
"Result",
"ResultMetrics",
"ResultsResponse",
]
7 changes: 6 additions & 1 deletion src/atlas/models/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@ class Data(BaseModel):
data: Data


class EvaluationsResponse(BaseModel):
class CreateEvaluationsResponse(BaseModel):
data: List[Evaluation]


class EvaluationsResponse(BaseModel):
evaluations: List[Evaluation]
pagination: Pagination


class ModelsResponse(BaseModel):
class Data(BaseModel):
models: List[Model]
Expand Down
4 changes: 2 additions & 2 deletions src/atlas/resources/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def get(
timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
type: Literal["custom", "public"] | None = None,
name: Optional[str] = None,
) -> List[Benchmark] | None:
) -> Optional[List[Benchmark]]:
base_url = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/benchmarks"

def fetch(bench_type: str) -> BenchmarksResponse | None:
Expand Down Expand Up @@ -61,7 +61,7 @@ async def get(
timeout: float | httpx.Timeout | None = DEFAULT_TIMEOUT,
type: Literal["custom", "public"] | None = None,
name: Optional[str] = None,
) -> List[Benchmark] | None:
) -> Optional[List[Benchmark]]:
base_url = f"/organizations/{self._client.organization_id}/projects/{self._client.project_id}/benchmarks"

async def fetch(bench_type: str) -> Optional[BenchmarksResponse]:
Expand Down
Loading
Loading