From d7322e192d07d1f10b373cef1d14ee9cce176ac3 Mon Sep 17 00:00:00 2001 From: DavidKoleczek <45405824+DavidKoleczek@users.noreply.github.com> Date: Sun, 8 Feb 2026 12:40:27 -0500 Subject: [PATCH 1/3] tooling updates, ty type checking --- .claude/commands/dependency-update.md | 3 +- .claude/settings.json | 29 +- .github/workflows/ci.yaml | 36 ++ .github/workflows/python-tests.yaml | 41 --- .gitignore | 1 + .pre-commit-config.yaml | 26 ++ .python-version | 1 + .vscode/settings.json | 8 +- AGENTS.md | 133 +++----- Makefile | 17 - README.md | 72 ++-- data/benchmarks/sample.yaml | 27 ++ docs/BENCHMARKING.md | 13 +- docs/ROADMAP.md | 9 - pyproject.toml | 34 +- scripts/run_benchmarks.py | 4 +- .../third_party_benchmarks/setup_arc_agi_2.py | 318 ------------------ .../eval_recipes}/__init__.py | 0 .../eval_recipes}/benchmarking/__init__.py | 0 .../benchmarking/docker_manager.py | 2 +- .../benchmarking/evaluation/__init__.py | 0 .../evaluation/agent_interacter.py | 0 .../evaluation/analysis_runner.py | 0 .../benchmarking/evaluation/semantic_test.py | 0 .../evaluation/semantic_test_comparison.py | 0 .../benchmarking/evaluation/test_utils.py | 0 .../benchmarking/job_framework/__init__.py | 0 .../benchmarking/job_framework/base.py | 4 +- .../benchmarking/job_framework/runner.py | 0 .../benchmarking/job_framework/state.py | 0 .../benchmarking/jobs/__init__.py | 0 .../benchmarking/jobs/comparison/__init__.py | 0 .../comparison/comparison_aggregation_job.py | 0 .../comparison_final_analysis_job.py | 0 .../comparison_results_aggregation_job.py | 0 .../jobs/comparison/comparison_trial_job.py | 0 .../jobs/comparison/extract_project_job.py | 0 .../comparison/semantic_comparison_job.py | 0 .../benchmarking/jobs/execute_agent_job.py | 2 +- .../benchmarking/jobs/score/__init__.py | 0 .../jobs/score/agent_comparison_job.py | 0 .../jobs/score/execute_evaluations_job.py | 0 .../jobs/score/final_analysis_job.py | 0 .../jobs/score/results_aggregation_job.py | 0 
.../jobs/score/task_analysis_job.py | 0 .../jobs/score/trial_execution_job.py | 0 .../eval_recipes}/benchmarking/loaders.py | 0 .../benchmarking/pipelines/__init__.py | 0 .../pipelines/comparison_pipeline.py | 0 .../benchmarking/pipelines/score_pipeline.py | 0 .../benchmarking/reporting/__init__.py | 0 .../create_comparison_html_report.py | 0 .../reporting/create_html_report.py | 0 .../eval_recipes}/benchmarking/schemas.py | 0 .../eval_recipes}/evaluate.py | 0 .../eval_recipes}/evaluations/__init__.py | 0 .../evaluations/check_criteria/__init__.py | 0 .../check_criteria_evaluator.py | 0 .../evaluations/check_criteria/prompts.py | 0 .../claim_verification/__init__.py | 0 .../claim_verification/claim_extraction.py | 0 .../claim_verification_evaluator.py | 0 .../evaluations/claim_verification/prompts.py | 0 .../evaluations/claim_verification/schemas.py | 0 .../evaluations/claim_verification/utils.py | 0 .../evaluations/guidance/__init__.py | 0 .../guidance/guidance_evaluator.py | 0 .../evaluations/guidance/prompts.py | 0 .../preference_adherence/__init__.py | 0 .../preference_adherence_evaluator.py | 0 .../preference_adherence/prompts.py | 0 .../evaluations/semantic_test/__init__.py | 0 .../semantic_test/semantic_test_evaluator.py | 0 .../evaluations/tool_usage/__init__.py | 0 .../evaluations/tool_usage/prompts.py | 0 .../tool_usage/tool_usage_evaluator.py | 0 {eval_recipes => src/eval_recipes}/schemas.py | 0 .../eval_recipes}/utils/__init__.py | 0 .../eval_recipes}/utils/llm.py | 0 .../utils/responses_conversion.py | 13 +- tests/benchmarking/test_job_framework.py | 4 +- uv.lock | 270 +++++++-------- 82 files changed, 384 insertions(+), 683 deletions(-) create mode 100644 .github/workflows/ci.yaml delete mode 100644 .github/workflows/python-tests.yaml create mode 100644 .pre-commit-config.yaml create mode 100644 .python-version delete mode 100644 Makefile create mode 100644 data/benchmarks/sample.yaml delete mode 100644 docs/ROADMAP.md delete mode 100644 
scripts/third_party_benchmarks/setup_arc_agi_2.py rename {eval_recipes => src/eval_recipes}/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/docker_manager.py (99%) rename {eval_recipes => src/eval_recipes}/benchmarking/evaluation/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/evaluation/agent_interacter.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/evaluation/analysis_runner.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/evaluation/semantic_test.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/evaluation/semantic_test_comparison.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/evaluation/test_utils.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/job_framework/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/job_framework/base.py (99%) rename {eval_recipes => src/eval_recipes}/benchmarking/job_framework/runner.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/job_framework/state.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/comparison/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/comparison/comparison_aggregation_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/comparison/comparison_final_analysis_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/comparison/comparison_results_aggregation_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/comparison/comparison_trial_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/comparison/extract_project_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/comparison/semantic_comparison_job.py (100%) rename {eval_recipes => 
src/eval_recipes}/benchmarking/jobs/execute_agent_job.py (99%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/score/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/score/agent_comparison_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/score/execute_evaluations_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/score/final_analysis_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/score/results_aggregation_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/score/task_analysis_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/jobs/score/trial_execution_job.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/loaders.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/pipelines/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/pipelines/comparison_pipeline.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/pipelines/score_pipeline.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/reporting/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/reporting/create_comparison_html_report.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/reporting/create_html_report.py (100%) rename {eval_recipes => src/eval_recipes}/benchmarking/schemas.py (100%) rename {eval_recipes => src/eval_recipes}/evaluate.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/check_criteria/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/check_criteria/check_criteria_evaluator.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/check_criteria/prompts.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/claim_verification/__init__.py (100%) rename {eval_recipes => 
src/eval_recipes}/evaluations/claim_verification/claim_extraction.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/claim_verification/claim_verification_evaluator.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/claim_verification/prompts.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/claim_verification/schemas.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/claim_verification/utils.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/guidance/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/guidance/guidance_evaluator.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/guidance/prompts.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/preference_adherence/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/preference_adherence/preference_adherence_evaluator.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/preference_adherence/prompts.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/semantic_test/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/semantic_test/semantic_test_evaluator.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/tool_usage/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/tool_usage/prompts.py (100%) rename {eval_recipes => src/eval_recipes}/evaluations/tool_usage/tool_usage_evaluator.py (100%) rename {eval_recipes => src/eval_recipes}/schemas.py (100%) rename {eval_recipes => src/eval_recipes}/utils/__init__.py (100%) rename {eval_recipes => src/eval_recipes}/utils/llm.py (100%) rename {eval_recipes => src/eval_recipes}/utils/responses_conversion.py (97%) diff --git a/.claude/commands/dependency-update.md b/.claude/commands/dependency-update.md index 7bbde5f..8213bfd 100644 --- a/.claude/commands/dependency-update.md +++ b/.claude/commands/dependency-update.md @@ -4,4 +4,5 @@ The process to go through is: 2. 
For each dependency, go to its pypi release history site. For example, for the openai package that is: https://pypi.org/project/openai/#history Get the latest release version. 3. Now bump the dependency in pyproject.toml. For example, if the current version in the pyproject.toml is `>=1.05,<2.0`, but on Pypi the latest version is 1.11, change the dependency to `>=1.11,<2.0` 4. If you notice a major version upgrade (ex v2 to v3), let the user know of each of those cases, but do not make the change yourself. -5. Run `uv sync --all-extras --all-groups` to update the lock file. +5. Make sure all the checks still pass by running `uv run ruff format && uv run ruff check --fix && uv run ty check` from the root. +6. Run `uv sync --all-extras --all-groups` to update the lock file. diff --git a/.claude/settings.json b/.claude/settings.json index cee968d..8dab700 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -2,12 +2,24 @@ "permissions": { "allow": [ "mcp__ide__getDiagnostics", - "Bash(make:*)", - "Bash(tree:*)", - "Bash(mkdir:*)", - "Bash(cp:*)", "WebSearch", - "WebFetch" + "WebFetch", + "Bash(cp:*)", + "Bash(find:*)", + "Bash(mkdir:*)", + "Bash(ls:*)", + "Bash(xargs:*)", + "Bash(tree:*)", + "Bash(tee:*)", + "Bash(grep:*)", + "Bash(git clone:*)", + "Bash(uv build:*)", + "Bash(uv sync:*)", + "Bash(uv run ruff format:*)", + "Bash(uv run ruff check:*)", + "Bash(uv run ty:*)", + "Bash(gh release list:*)", + "Bash(gh release view:*)" ], "deny": [], "ask": [] @@ -15,14 +27,15 @@ "hooks": { "Stop": [ { - "matcher": "Edit|MultiEdit|Write", + "matcher": "Edit|Write", "hooks": [ { "type": "command", - "command": "make check" + "command": "uv run ruff check --fix && uv run ruff format && uv run ty check", + "timeout": 10 } ] } ] } -} \ No newline at end of file +} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..b06f39f --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,36 @@ +name: CI + +on: + push: + 
branches: [main] + pull_request: + branches: [main] + +jobs: + quality: + name: Code Quality + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Set up Python + run: uv python install + + - name: Install dependencies + run: uv sync --frozen --all-groups + + - name: Check formatting + run: uv run ruff format --check + + - name: Lint + run: uv run ruff check + + - name: Type check + run: uv run ty check \ No newline at end of file diff --git a/.github/workflows/python-tests.yaml b/.github/workflows/python-tests.yaml deleted file mode 100644 index 0902f7d..0000000 --- a/.github/workflows/python-tests.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Microsoft. All rights reserved - -name: Python Integration Tests - -on: - workflow_dispatch: - pull_request: - branches: - - main - push: - branches: - - main - -permissions: - contents: read - -jobs: - test: - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install uv - uses: astral-sh/setup-uv@v6 - with: - enable-cache: true - - - name: Set up Python - run: uv python install 3.12 - - - name: Install dependencies using make - run: make install - - - name: Run tests - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: | - source .venv/bin/activate - pytest . 
\ No newline at end of file diff --git a/.gitignore b/.gitignore index 361b74d..882d447 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ media/ .comparison_results/ data/tasks/arc-agi-2-*/ data/tasks/frontier-science-*/ +ai_working/ # Benchmarking generated reports benchmark_report.html diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..5bf1dd2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: + # uv lock file management + - repo: https://github.com/astral-sh/uv-pre-commit + rev: 0.10.0 + hooks: + - id: uv-lock + + # Ruff linting and formatting + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.0 + hooks: + # Run the linter with --fix (must be before formatter) + - id: ruff-check + args: [--fix] + # Run the formatter + - id: ruff-format + + # Type checking with ty + - repo: local + hooks: + - id: ty-check + name: ty type check + entry: uv run ty check --output-format concise --no-progress + language: system + types: [python] + pass_filenames: false \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..902b2c9 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index d307cf0..d5cab54 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,7 +7,13 @@ }, "editor.defaultFormatter": "charliermarsh.ruff" }, + "notebook.formatOnSave.enabled": true, + "notebook.codeActionsOnSave": { + "notebook.source.fixAll": "explicit", + "notebook.source.organizeImports": "explicit" + }, + "python.languageServer": "None", "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "markdown.extension.orderedList.marker": "one" -} \ No newline at end of file +} diff --git a/AGENTS.md b/AGENTS.md index d90cdff..9cbe48e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,101 +1,52 @@ -# Project README -@README.md - - -# Project 
Dependencies -The dependencies are defined in the pyproject.toml file: -@pyproject.toml - -You must only use these dependencies in the code you write. If you need to add a new dependency, confirm with the user first before adding it. - -## Documentation -- Some documentation for common dependencies and tooling, such as `uv` is available in the `ai_context/` folder - - -# Core Design Principles -### Ruthless Simplicity -- KISS principle taken to heart: Keep everything as simple as possible. -- Minimize abstractions: Every layer of abstraction must justify its existence -- Start minimal, grow as needed: Begin with the simplest implementation that meets current needs -- Avoid future-proofing: Don't build for hypothetical future requirements -- Question everything: Regularly challenge complexity in the codebase - -### Library Usage Philosophy -- Use libraries as intended: Minimal wrappers around external libraries -- Direct integration: Avoid unnecessary adapter layers -- Selective dependency: Add dependencies only when they provide substantial value -- Understand what you import: No black-box dependencies - -### Testing Strategy -- Emphasis on integration and end-to-end tests -- Manual testability as a design goal -- Focus on critical path testing initially -- Add unit tests for complex logic and edge cases - -## Areas to Embrace Complexity -Some areas justify additional complexity: -1. Security: Never compromise on security fundamentals -2. Data integrity: Ensure data consistency and reliability -3. Core user experience: Make the primary user flows smooth and reliable -4. Error visibility: Make problems obvious and diagnosable - -## Areas to Aggressively Simplify -Push for extreme simplicity in these areas: -1. Internal abstractions: Minimize layers between components -2. Generic "future-proof" code: Resist solving non-existent problems -3. Edge case handling: Handle the common cases well first -4. Framework usage: Use only what you need from frameworks -5. 
State management: Keep state simple and explicit - -## Remember -- It's easier to add complexity later than to remove it -- Code you don't write has no bugs -- Favor clarity over cleverness -- The best code is often the simplest - - -# Development Guidelines -## Other Guidelines -- Do not use emojis unless asked. -- Do not include excessive print and logging statements. +# General Instructions +- This is a production-grade Python package using `uv` as the package and project manager. You must *always* follow best open-source Python practices. +- Shortcuts are not appropriate. When in doubt, you must work with the user for guidance. +- Any documentation you write, including in the README.md, should be clear, concise, and accurate like the official documentation of other production-grade Python packages. +- Make sure any comments in code are necessary. A necessary comment captures intent that cannot be encoded in names, types, or structure. Comments should be reserved for the "why", only used to record rationale, trade-offs, links to specs/papers, or non-obvious domain insights. They should add signal that code cannot. +- The current code in the package should be treated as an example of high quality code. Make sure to follow its style and tackle issues in similar ways where appropriate. +- Do not run tests automatically unless asked since they take a while. +- Don't generate characters that a user could not type on a standard keyboard like fancy arrows. +- Anything is possible. Do not blame external factors after something doesn't work on the first try. Instead, investigate and test assumptions by debugging from first principles. +- When writing documentation + - Keep it very concise + - No emojis or em dashes. + - Documentation should be written exactly like it is for production-grade, polished, open-source Python packages. - You should only use the dependencies in the provided dependency files. If you need to add a new one, ask first. 
-- Do not automatically run scripts, tests, or move/rename/delete files. Ask the user to do these tasks. -- Read in the entirety of files to get the full context -- Include `# Copyright (c) Microsoft. All rights reserved` at the top of each Python file (don't worry about __init__.py files) -- When writing documentation, you write as if you were a professional and experienced developer making their code available publicly on GitHub. -- Never add back in code or comments that the user has removed or changed. -- llms.txt is auto generated by `.github/workflows/generate-llms-txt.yaml`. Do not edit it directly. +- Never add back in code or comments that someone else has removed or changed since you last viewed it. -## Python Development Rules -- This project uses Python >=3.11, uv as the package and project manager, and Ruff as a linter and code formatter. +# Python Development Instructions +- `ty` by Astral is used for type checking. Always add appropriate type hints such that the code would pass ty's type check. - Follow the Google Python Style Guide. -- Instead of importing `Optional` from typing, using the `| `syntax. -- Always add appropriate type hints such that the code would pass pyright's type check. -- For type hints, use `list`, not `List`. For example, if the variable is `[{"name": "Jane", "age": 32}, {"name": "Amy", "age": 28}]` the type hint should be `list[dict[str. str | int]]` -- Always prefer pathlib for dealing with files. Use `Path.open` instead of `open`. +- After each code change, checks are automatically run. Fix any issues that arise. +- **IMPORTANT**: The checks will remove any unused imports after you make an edit to a file. So if you need to use a new import, be sure to use it FIRST (or do your edits at the same time) or else it will be automatically removed. DO NOT use local imports to get around this. +- At this stage of the project, NEVER add imports to __init__.py files. Leave them empty unless absolutely necessary. 
+- Always prefer pathlib for dealing with files. Use `Path.open` instead of `open`. - When using pathlib, **always** Use `.parents[i]` syntax to go up directories instead of using `.parent` multiple times. -- When writing multi-line strings, use `"""` instead of using string concatenation. Use `\` to break up long lines in appropriate places. -- When creating template strings, prefer to use python-liquid. This is the basic sample code: - ```python - from liquid import render - - print(render("Hello, {{ you }}!", you="World")) - # Hello, World! - ``` - When writing tests, use pytest and pytest-asyncio. - The pyproject.toml is already configured so you do not need to add the `@pytest.mark.asyncio` decorator. -- Prefer to use loguru instead of logging -- Follow Ruff best practices such as: - - Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling -- Do not use relative imports. -- Use dotenv to load environment variables for local development. Assume we have a `.env` file -- Since this is structured as Python package, you should not put `#!/usr/bin/env python` at the top of scripts as that is redundant. +- Prefer using loguru for logging instead of the built-in logging module. Do not add logging unless requested. +- NEVER use `# type: ignore`. It is better to leave the issue and have the user work with you to fix it. +- Don't put types in quotes unless it is absolutely necessary to avoid circular imports and forward references. +- When writing multi-line strings, use `"""` instead of using string concatenation. Use `\` to break up long lines in appropriate places. + +- When constructing long strings like prompts for LLMs, use `python-liquid`'s `render` function: +```python +from liquid import render + +print(render("Hello, {{ you }}!", you="World")) +# Hello, World! +``` +- Include `# Copyright (c) Microsoft. 
All rights reserved` at the top of each Python file (don't worry about __init__.py files) - For Windows compatibility, use encoding='utf-8' when handling files, unless required otherwise. - Leave __init__.py files empty unless there is a specific reason to add code there during this early stage of the project. +- When running scripts with `uv run` make sure you are in the top level of this Python project. All you need to do is `uv run