Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions benchmark_pipeline/benchmark_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ model_display_names:
"anthropic/claude-3.5-sonnet-20240620": "Sonnet 3.5"
"anthropic/claude-3.7-sonnet": "Sonnet 3.7"
"anthropic/claude-3.7-sonnetthinking": "Sonnet 3.7 Thinking"
"anthropic/claude-4.5-sonnet": "Sonnet 4.5"
"anthropic/claude-opus-4.1": "Claude Opus 4.1"
"anthropic/claude-sonnet-4": "Sonnet 4"
"anthropic/claude-sonnet-4thinking": "Sonnet 4 Thinking"
Expand Down
1,209 changes: 1,209 additions & 0 deletions docs/cases.html

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Case: aider/__init__.py - Sonnet 4.5</title>
    <link rel="stylesheet" href="../../styles.css">
  </head>
  <body>
    <!-- Page banner: case path, model name, and navigation links. -->
    <header>
      <h1>Case: aider/__init__.py</h1>
      <h2>Model: Sonnet 4.5</h2>
      <p>
        <a href="../../models/anthropic_claude-4.5-sonnet.html">All Sonnet 4.5 Cases</a> |
        <a href="../../cases.html">All Cases</a> |
        <a href="../../index.html">Home</a>
      </p>
    </header>
    <main>
      <section class="case-details">
        <!-- Run summary: status, token counts, finish reason, and cost. -->
        <div class="case-info">
          <h2>Benchmark Case Information</h2>
          <p><strong>Model:</strong> Sonnet 4.5</p>
          <p><strong>Status:</strong> <span class="success">Success</span></p>
          <p><strong>Prompt Tokens:</strong> 59517</p>
          <p><strong>Native Prompt Tokens:</strong> 67626</p>
          <p><strong>Native Completion Tokens:</strong> 156</p>
          <p><strong>Native Tokens Reasoning:</strong> 0</p>
          <p><strong>Native Finish Reason:</strong> stop</p>
          <p><strong>Cost:</strong> $0.205218</p>
        </div>

        <!-- Links to the prompt, expected-output, and actual-output pages. -->
        <div class="content-links">
          <h2>View Content</h2>
          <ul>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider___init__.py/prompt.html">View Prompt</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider___init__.py/expected.html">View Expected Output</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider___init__.py/actual.html">View Actual Output</a>
            </li>
          </ul>
        </div>

        <!-- Expected-vs-actual comparison; this page shows the "no differences" message. -->
        <div class="diff-section">
          <h2>Diff (Expected vs Actual)</h2>
          <div id="diff-output">
            <div class="success-message">
              <p>✓ No differences found (successful run)</p>
              <p>Expected output matches the model output exactly.</p>
            </div>
          </div>
        </div>
      </section>
    </main>
    <footer>
      <p>LoCoDiff-bench - <a href="https://github.com/AbanteAI/LoCoDiff-bench">GitHub Repository</a></p>
    </footer>
  </body>
</html>

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Case: aider/analytics.py - Sonnet 4.5</title>
    <link rel="stylesheet" href="../../styles.css">
  </head>
  <body>
    <!-- Page banner: case path, model name, and navigation links. -->
    <header>
      <h1>Case: aider/analytics.py</h1>
      <h2>Model: Sonnet 4.5</h2>
      <p>
        <a href="../../models/anthropic_claude-4.5-sonnet.html">All Sonnet 4.5 Cases</a> |
        <a href="../../cases.html">All Cases</a> |
        <a href="../../index.html">Home</a>
      </p>
    </header>
    <main>
      <section class="case-details">
        <!-- Run summary: status, token counts, finish reason, and cost. -->
        <div class="case-info">
          <h2>Benchmark Case Information</h2>
          <p><strong>Model:</strong> Sonnet 4.5</p>
          <p><strong>Status:</strong> <span class="success">Success</span></p>
          <p><strong>Prompt Tokens:</strong> 24542</p>
          <p><strong>Native Prompt Tokens:</strong> 30657</p>
          <p><strong>Native Completion Tokens:</strong> 2094</p>
          <p><strong>Native Tokens Reasoning:</strong> 0</p>
          <p><strong>Native Finish Reason:</strong> stop</p>
          <p><strong>Cost:</strong> $0.123381</p>
        </div>

        <!-- Links to the prompt, expected-output, and actual-output pages. -->
        <div class="content-links">
          <h2>View Content</h2>
          <ul>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_analytics.py/prompt.html">View Prompt</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_analytics.py/expected.html">View Expected Output</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_analytics.py/actual.html">View Actual Output</a>
            </li>
          </ul>
        </div>

        <!-- Expected-vs-actual comparison; this page shows the "no differences" message. -->
        <div class="diff-section">
          <h2>Diff (Expected vs Actual)</h2>
          <div id="diff-output">
            <div class="success-message">
              <p>✓ No differences found (successful run)</p>
              <p>Expected output matches the model output exactly.</p>
            </div>
          </div>
        </div>
      </section>
    </main>
    <footer>
      <p>LoCoDiff-bench - <a href="https://github.com/AbanteAI/LoCoDiff-bench">GitHub Repository</a></p>
    </footer>
  </body>
</html>

51 changes: 51 additions & 0 deletions docs/cases/anthropic_claude-4.5-sonnet/aider_aider_args.py.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Case: aider/args.py - Sonnet 4.5</title>
    <link rel="stylesheet" href="../../styles.css">
  </head>
  <body>
    <!-- Page banner: case path, model name, and navigation links. -->
    <header>
      <h1>Case: aider/args.py</h1>
      <h2>Model: Sonnet 4.5</h2>
      <p>
        <a href="../../models/anthropic_claude-4.5-sonnet.html">All Sonnet 4.5 Cases</a> |
        <a href="../../cases.html">All Cases</a> |
        <a href="../../index.html">Home</a>
      </p>
    </header>
    <main>
      <section class="case-details">
        <!-- Run summary: status, token counts, finish reason, and cost. -->
        <div class="case-info">
          <h2>Benchmark Case Information</h2>
          <p><strong>Model:</strong> Sonnet 4.5</p>
          <p><strong>Status:</strong> <span class="success">Success</span></p>
          <p><strong>Prompt Tokens:</strong> 61543</p>
          <p><strong>Native Prompt Tokens:</strong> 76671</p>
          <p><strong>Native Completion Tokens:</strong> 7406</p>
          <p><strong>Native Tokens Reasoning:</strong> 0</p>
          <p><strong>Native Finish Reason:</strong> stop</p>
          <p><strong>Cost:</strong> $0.341103</p>
        </div>

        <!-- Links to the prompt, expected-output, and actual-output pages. -->
        <div class="content-links">
          <h2>View Content</h2>
          <ul>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_args.py/prompt.html">View Prompt</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_args.py/expected.html">View Expected Output</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_args.py/actual.html">View Actual Output</a>
            </li>
          </ul>
        </div>

        <!-- Expected-vs-actual comparison; this page shows the "no differences" message. -->
        <div class="diff-section">
          <h2>Diff (Expected vs Actual)</h2>
          <div id="diff-output">
            <div class="success-message">
              <p>✓ No differences found (successful run)</p>
              <p>Expected output matches the model output exactly.</p>
            </div>
          </div>
        </div>
      </section>
    </main>
    <footer>
      <p>LoCoDiff-bench - <a href="https://github.com/AbanteAI/LoCoDiff-bench">GitHub Repository</a></p>
    </footer>
  </body>
</html>

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Case: aider/coders/editblock_coder.py - Sonnet 4.5</title>
    <link rel="stylesheet" href="../../styles.css">
  </head>
  <body>
    <!-- Page banner: case path, model name, and navigation links. -->
    <header>
      <h1>Case: aider/coders/editblock_coder.py</h1>
      <h2>Model: Sonnet 4.5</h2>
      <p>
        <a href="../../models/anthropic_claude-4.5-sonnet.html">All Sonnet 4.5 Cases</a> |
        <a href="../../cases.html">All Cases</a> |
        <a href="../../index.html">Home</a>
      </p>
    </header>
    <main>
      <section class="case-details">
        <!-- Run summary: status, token counts, finish reason, and cost. -->
        <div class="case-info">
          <h2>Benchmark Case Information</h2>
          <p><strong>Model:</strong> Sonnet 4.5</p>
          <p><strong>Status:</strong> <span class="failure">Failure</span></p>
          <p><strong>Prompt Tokens:</strong> 56338</p>
          <p><strong>Native Prompt Tokens:</strong> 72436</p>
          <p><strong>Native Completion Tokens:</strong> 5781</p>
          <p><strong>Native Tokens Reasoning:</strong> 0</p>
          <p><strong>Native Finish Reason:</strong> stop</p>
          <p><strong>Cost:</strong> $0.304023</p>
        </div>

        <!-- Links to the prompt, expected-output, and actual-output pages. -->
        <div class="content-links">
          <h2>View Content</h2>
          <ul>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_editblock_coder.py/prompt.html">View Prompt</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_editblock_coder.py/expected.html">View Expected Output</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_editblock_coder.py/actual.html">View Actual Output</a>
            </li>
          </ul>
        </div>

        <!--
          Expected-vs-actual comparison shown as diff lines, one <div> per line.
          The <pre> is kept on a single source line so no extra whitespace enters
          the preformatted text. Markup-significant characters in the diff text
          (such as the less-than sign) are written as character references.
        -->
        <div class="diff-section">
          <h2>Diff (Expected vs Actual)</h2>
          <div id="diff-output">
            <pre class="diff"><div></div><div>index 42fc8b445..25393018f 100644</div><div class="diff-header">--- a/aider_aider_coders_editblock_coder.py_expectedoutput.txt (expected):tmp/tmpueysfxkk_expected.txt </div><div class="diff-header">+++ b/aider_aider_coders_editblock_coder.py_extracted.txt (actual):tmp/tmpoi5t9hnk_actual.txt </div><div class="diff-info">@@ -448,7 +448,7 @@ def find_original_update_blocks(content, fence=DEFAULT_FENCE, valid_fnames=None)</div><div> while i &lt; len(lines):</div><div> line = lines[i]</div><div> </div><div class="diff-removed">- # Check for shell code blocks</div><div class="diff-added">+ # Check for various shell code blocks</div><div> shell_starts = [</div><div> "```bash",</div><div> "```sh",</div><div></div></pre>
          </div>
        </div>
      </section>
    </main>
    <footer>
      <p>LoCoDiff-bench - <a href="https://github.com/AbanteAI/LoCoDiff-bench">GitHub Repository</a></p>
    </footer>
  </body>
</html>

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Case: aider/coders/editblock_prompts.py - Sonnet 4.5</title>
    <link rel="stylesheet" href="../../styles.css">
  </head>
  <body>
    <!-- Page banner: case path, model name, and navigation links. -->
    <header>
      <h1>Case: aider/coders/editblock_prompts.py</h1>
      <h2>Model: Sonnet 4.5</h2>
      <p>
        <a href="../../models/anthropic_claude-4.5-sonnet.html">All Sonnet 4.5 Cases</a> |
        <a href="../../cases.html">All Cases</a> |
        <a href="../../index.html">Home</a>
      </p>
    </header>
    <main>
      <section class="case-details">
        <!-- Run summary: status, token counts, finish reason, and cost. -->
        <div class="case-info">
          <h2>Benchmark Case Information</h2>
          <p><strong>Model:</strong> Sonnet 4.5</p>
          <p><strong>Status:</strong> <span class="failure">Failure</span></p>
          <p><strong>Prompt Tokens:</strong> 35371</p>
          <p><strong>Native Prompt Tokens:</strong> 42180</p>
          <p><strong>Native Completion Tokens:</strong> 2006</p>
          <p><strong>Native Tokens Reasoning:</strong> 0</p>
          <p><strong>Native Finish Reason:</strong> stop</p>
          <p><strong>Cost:</strong> $0.15663</p>
        </div>

        <!-- Links to the prompt, expected-output, and actual-output pages. -->
        <div class="content-links">
          <h2>View Content</h2>
          <ul>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_editblock_prompts.py/prompt.html">View Prompt</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_editblock_prompts.py/expected.html">View Expected Output</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_editblock_prompts.py/actual.html">View Actual Output</a>
            </li>
          </ul>
        </div>

        <!--
          Expected-vs-actual comparison shown as diff lines, one <div> per line.
          The <pre> is kept on a single source line so no extra whitespace enters
          the preformatted text.
        -->
        <div class="diff-section">
          <h2>Diff (Expected vs Actual)</h2>
          <div id="diff-output">
            <pre class="diff"><div></div><div>index b000ba510..3c13e60d4 100644</div><div class="diff-header">--- a/aider_aider_coders_editblock_prompts.py_expectedoutput.txt (expected):tmp/tmpw8w28zoo_expected.txt </div><div class="diff-header">+++ b/aider_aider_coders_editblock_prompts.py_extracted.txt (actual):tmp/tmp7oq1nply_actual.txt </div><div class="diff-info">@@ -195,6 +195,7 @@ The user will say when they've applied your edits. If they haven't explicitly co</div><div> """</div><div> </div><div> shell_cmd_reminder = """</div><div class="diff-added">+</div><div> Examples of when to suggest shell commands:</div><div> </div><div> - If you changed a self-contained html file, suggest an OS-appropriate command to open a browser to view it to see the updated content.</div><div></div></pre>
          </div>
        </div>
      </section>
    </main>
    <footer>
      <p>LoCoDiff-bench - <a href="https://github.com/AbanteAI/LoCoDiff-bench">GitHub Repository</a></p>
    </footer>
  </body>
</html>

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Case: aider/coders/patch_coder.py - Sonnet 4.5</title>
    <link rel="stylesheet" href="../../styles.css">
  </head>
  <body>
    <!-- Page banner: case path, model name, and navigation links. -->
    <header>
      <h1>Case: aider/coders/patch_coder.py</h1>
      <h2>Model: Sonnet 4.5</h2>
      <p>
        <a href="../../models/anthropic_claude-4.5-sonnet.html">All Sonnet 4.5 Cases</a> |
        <a href="../../cases.html">All Cases</a> |
        <a href="../../index.html">Home</a>
      </p>
    </header>
    <main>
      <section class="case-details">
        <!-- Run summary: status, token counts, finish reason, and cost. -->
        <div class="case-info">
          <h2>Benchmark Case Information</h2>
          <p><strong>Model:</strong> Sonnet 4.5</p>
          <p><strong>Status:</strong> <span class="success">Success</span></p>
          <p><strong>Prompt Tokens:</strong> 22441</p>
          <p><strong>Native Prompt Tokens:</strong> 28880</p>
          <p><strong>Native Completion Tokens:</strong> 7576</p>
          <p><strong>Native Tokens Reasoning:</strong> 0</p>
          <p><strong>Native Finish Reason:</strong> stop</p>
          <p><strong>Cost:</strong> $0.20028</p>
        </div>

        <!-- Links to the prompt, expected-output, and actual-output pages. -->
        <div class="content-links">
          <h2>View Content</h2>
          <ul>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_patch_coder.py/prompt.html">View Prompt</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_patch_coder.py/expected.html">View Expected Output</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_patch_coder.py/actual.html">View Actual Output</a>
            </li>
          </ul>
        </div>

        <!-- Expected-vs-actual comparison; this page shows the "no differences" message. -->
        <div class="diff-section">
          <h2>Diff (Expected vs Actual)</h2>
          <div id="diff-output">
            <div class="success-message">
              <p>✓ No differences found (successful run)</p>
              <p>Expected output matches the model output exactly.</p>
            </div>
          </div>
        </div>
      </section>
    </main>
    <footer>
      <p>LoCoDiff-bench - <a href="https://github.com/AbanteAI/LoCoDiff-bench">GitHub Repository</a></p>
    </footer>
  </body>
</html>

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Case: aider/coders/wholefile_coder.py - Sonnet 4.5</title>
    <link rel="stylesheet" href="../../styles.css">
  </head>
  <body>
    <!-- Page banner: case path, model name, and navigation links. -->
    <header>
      <h1>Case: aider/coders/wholefile_coder.py</h1>
      <h2>Model: Sonnet 4.5</h2>
      <p>
        <a href="../../models/anthropic_claude-4.5-sonnet.html">All Sonnet 4.5 Cases</a> |
        <a href="../../cases.html">All Cases</a> |
        <a href="../../index.html">Home</a>
      </p>
    </header>
    <main>
      <section class="case-details">
        <!-- Run summary: status, token counts, finish reason, and cost. -->
        <div class="case-info">
          <h2>Benchmark Case Information</h2>
          <p><strong>Model:</strong> Sonnet 4.5</p>
          <p><strong>Status:</strong> <span class="success">Success</span></p>
          <p><strong>Prompt Tokens:</strong> 20299</p>
          <p><strong>Native Prompt Tokens:</strong> 26267</p>
          <p><strong>Native Completion Tokens:</strong> 1308</p>
          <p><strong>Native Tokens Reasoning:</strong> 0</p>
          <p><strong>Native Finish Reason:</strong> stop</p>
          <p><strong>Cost:</strong> $0.098421</p>
        </div>

        <!-- Links to the prompt, expected-output, and actual-output pages. -->
        <div class="content-links">
          <h2>View Content</h2>
          <ul>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_wholefile_coder.py/prompt.html">View Prompt</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_wholefile_coder.py/expected.html">View Expected Output</a>
            </li>
            <li>
              <a class="content-link" href="../../content/anthropic_claude-4.5-sonnet/aider_aider_coders_wholefile_coder.py/actual.html">View Actual Output</a>
            </li>
          </ul>
        </div>

        <!-- Expected-vs-actual comparison; this page shows the "no differences" message. -->
        <div class="diff-section">
          <h2>Diff (Expected vs Actual)</h2>
          <div id="diff-output">
            <div class="success-message">
              <p>✓ No differences found (successful run)</p>
              <p>Expected output matches the model output exactly.</p>
            </div>
          </div>
        </div>
      </section>
    </main>
    <footer>
      <p>LoCoDiff-bench - <a href="https://github.com/AbanteAI/LoCoDiff-bench">GitHub Repository</a></p>
    </footer>
  </body>
</html>

Loading