Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
5830a70
quick and dirty
mohammedahmed18 Nov 27, 2025
3e0440b
safter
mohammedahmed18 Nov 27, 2025
eb16cb2
Optimize parse_test_failures_from_stdout
codeflash-ai[bot] Nov 27, 2025
168118a
Merge pull request #946 from codeflash-ai/codeflash/optimize-pr945-20…
mohammedahmed18 Nov 27, 2025
a7f8816
fix tests
mohammedahmed18 Nov 27, 2025
4e9f894
linting
mohammedahmed18 Nov 27, 2025
1c9abaf
did it pass ?
mohammedahmed18 Nov 28, 2025
0b2d894
revert test optimization
mohammedahmed18 Nov 28, 2025
ecfa89f
cleaner
mohammedahmed18 Nov 28, 2025
6ea2545
test: try to fix the candidate and see if the diff is empty
mohammedahmed18 Nov 28, 2025
fe68772
capture all test discrepancies
Nov 30, 2025
ed39ec8
do the repair in main loop
Nov 30, 2025
142da4c
todo write backend endpoint
Dec 1, 2025
5a7c356
need to test now
Dec 1, 2025
8a28d0d
Merge branch 'feat/feedback-loop-for-unmatched-test-results' of githu…
mohammedahmed18 Dec 1, 2025
5ed5dfc
works, figure out logging
Dec 1, 2025
fe33c82
local db logging
Dec 1, 2025
83814be
ready to run experiments
Dec 1, 2025
0325444
logging fix
Dec 1, 2025
9f7ed90
handle test class methods for the test diff
mohammedahmed18 Dec 1, 2025
1ddc87c
Merge branch 'feat/feedback-loop-for-unmatched-test-results' of githu…
mohammedahmed18 Dec 1, 2025
6060ffb
codeflash suggestion
mohammedahmed18 Dec 2, 2025
1120d64
safer parsing
mohammedahmed18 Dec 2, 2025
c2e037a
better parsing for pytest stdout
mohammedahmed18 Dec 2, 2025
5703889
Merge branch 'feat/feedback-loop-for-unmatched-test-results' of githu…
mohammedahmed18 Dec 2, 2025
bd1ebf4
temp logging
Dec 3, 2025
c1ae81e
working version
mohammedahmed18 Dec 3, 2025
696448c
Merge branch 'feat/feedback-loop-for-unmatched-test-results' of githu…
mohammedahmed18 Dec 3, 2025
97f2426
fix override candidate after the code repair
mohammedahmed18 Dec 4, 2025
6a9390c
typo
mohammedahmed18 Dec 4, 2025
b93fd34
enhancements and cleanups
mohammedahmed18 Dec 4, 2025
bcc19f7
handle repaired code is exact same as the original code
mohammedahmed18 Dec 4, 2025
79387c3
linting issue and handle file name in code_print for repaired candidate
mohammedahmed18 Dec 4, 2025
4c13bb9
fixes
mohammedahmed18 Dec 4, 2025
12dc7e1
Merge branch 'main' of github.com:codeflash-ai/codeflash into feat/fe…
mohammedahmed18 Dec 4, 2025
d66d2ce
small changes
mohammedahmed18 Dec 4, 2025
b4474f3
add code repairs to the queue
mohammedahmed18 Dec 5, 2025
726405b
optimization source
mohammedahmed18 Dec 5, 2025
ee4749a
make it work
mohammedahmed18 Dec 7, 2025
15b72b1
repair the code after refinement if needed
mohammedahmed18 Dec 8, 2025
e63d39f
typo
mohammedahmed18 Dec 9, 2025
cc9ad56
optimization source and parents
mohammedahmed18 Dec 10, 2025
4976d5d
some heuristics to limit code repair from generating many candidates
mohammedahmed18 Dec 10, 2025
704a4b0
shorten error string, more enhancements needed
Dec 10, 2025
ff584fb
reprlib repr for shorter repr
Dec 11, 2025
ae080d0
enhancements
mohammedahmed18 Dec 11, 2025
6c8be65
Merge branch 'main' of github.com:codeflash-ai/codeflash into feat/fe…
mohammedahmed18 Dec 11, 2025
2e5d728
Optimize AiServiceClient._get_valid_candidates
codeflash-ai[bot] Dec 11, 2025
4dea247
Merge pull request #966 from codeflash-ai/codeflash/optimize-pr945-20…
mohammedahmed18 Dec 11, 2025
b5ca2b4
fix failing test
mohammedahmed18 Dec 11, 2025
b438243
Merge branch 'feat/feedback-loop-for-unmatched-test-results' of githu…
mohammedahmed18 Dec 11, 2025
f219996
fix validation error for python code
mohammedahmed18 Dec 11, 2025
9d099ac
remove comment
mohammedahmed18 Dec 11, 2025
68e0e7c
Merge remote-tracking branch 'origin/feat/feedback-loop-for-unmatched…
Dec 15, 2025
cb8ce22
Merge pull request #965 from codeflash-ai/feat/shorten-test-feedback
aseembits93 Dec 15, 2025
41de7be
Merge branch 'feat/feedback-loop-for-unmatched-test-results' of githu…
mohammedahmed18 Dec 15, 2025
a6a5578
fixes
mohammedahmed18 Dec 15, 2025
93f7331
Merge branch 'main' into feat/feedback-loop-for-unmatched-test-results
mohammedahmed18 Dec 16, 2025
b96a01c
reprlib bugfix
aseembits93 Dec 17, 2025
ddb656f
tests: parse test output
mohammedahmed18 Dec 17, 2025
58ff338
Merge branch 'feat/feedback-loop-for-unmatched-test-results' of githu…
mohammedahmed18 Dec 17, 2025
46522d8
Merge branch 'main' into feat/feedback-loop-for-unmatched-test-results
aseembits93 Dec 19, 2025
d165a15
Merge remote-tracking branch 'origin/main' into feat/feedback-loop-fo…
aseembits93 Dec 19, 2025
9b04565
merge conflicts fix
aseembits93 Dec 19, 2025
5063fba
revert version
aseembits93 Dec 19, 2025
60a6db2
Merge branch 'main' into feat/feedback-loop-for-unmatched-test-results
aseembits93 Dec 19, 2025
13a82fd
bugfix
aseembits93 Dec 19, 2025
a04dc98
merge commit fix
aseembits93 Dec 19, 2025
194ded5
merge commit fix
aseembits93 Dec 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions codeflash/discovery/functions_to_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,25 +306,43 @@ def levenshtein_distance(s1: str, s2: str) -> int:
len1 = len(s1)
len2 = len(s2)
# Use a preallocated list instead of creating a new list every iteration

# Early exit for empty string cases
if len1 == 0:
return len2
if len2 == 0:
return len1

# Convert strings to lists for fast indexed access
s1_list = list(s1)
s2_list = list(s2)

# Preallocate and reuse arrays; avoid creating new ones every iteration
previous = list(range(len1 + 1))
current = [0] * (len1 + 1)

for index2 in range(len2):
char2 = s2[index2]
char2 = s2_list[index2]
current[0] = index2 + 1

# Remove redundant intermediate assignments for better cache locality
prev = previous
curr = current
s1_chars = s1_list
# Use local variables for frequently accessed values
for index1 in range(len1):
char1 = s1[index1]
if char1 == char2:
current[index1 + 1] = previous[index1]
# Unrolling char1 assignment and equality check
if s1_chars[index1] == char2:
curr[index1 + 1] = prev[index1]
else:
# Fast min calculation without tuple construct
a = previous[index1]
b = previous[index1 + 1]
c = current[index1]
min_val = min(b, a)
min_val = min(c, min_val)
current[index1 + 1] = 1 + min_val
# Swap references instead of copying
x = prev[index1]
y = prev[index1 + 1]
z = curr[index1]
min_xy = min(x, y)
min_xyz = min(z, min_xy)
curr[index1 + 1] = 1 + min_xyz
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚡️Codeflash found 73% (0.73x) speedup for levenshtein_distance in codeflash/discovery/functions_to_optimize.py

⏱️ Runtime : 2.04 seconds → 1.18 seconds (best of 8 runs)

📝 Explanation and details

The optimized version achieves a 73% speedup by eliminating Python's built-in min() function calls and replacing them with direct comparisons. This is a targeted micro-optimization that addresses one of the most expensive operations in the Levenshtein distance algorithm.

Key optimization:

  • Replaced min() calls with direct comparisons: The original code used min(x, y) and min(z, min_xy) which create temporary tuples and invoke Python's generic minimum function. The optimized version uses nested if statements to find the minimum value directly, avoiding function call overhead and tuple creation.

Why this provides a speedup:

  • The min() function in Python has significant overhead for small numbers of arguments, especially when called millions of times in nested loops
  • Direct comparisons (if x < y) are primitive operations that execute much faster than function calls
  • Eliminates temporary tuple creation that min() uses internally
  • Reduces the call stack depth in the inner loop

Performance impact by test case type:

  • Identical/similar strings: 55-65% faster - benefits from reduced overhead in character matching paths
  • Completely different strings: 109-121% faster - maximizes benefit since every character comparison triggers the min() replacement logic
  • Large strings with many differences: 83-93% faster - compounds the per-operation savings across many iterations
  • Small strings: 15-50% faster - still benefits but overhead reduction is less pronounced

The optimization is particularly effective for the Levenshtein algorithm because the min() operation occurs in the innermost loop that executes O(n×m) times, making even small per-call improvements significant when multiplied across all iterations.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 148 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage 96.6%
🌀 Generated Regression Tests and Runtime
from __future__ import annotations

# imports
import pytest  # used for our unit tests

from codeflash.discovery.functions_to_optimize import levenshtein_distance

# unit tests

# 1. Basic Test Cases


def test_identical_strings():
    # Levenshtein distance between identical strings should be 0
    codeflash_output = levenshtein_distance("kitten", "kitten")  # 15.8μs -> 9.90μs (60.0% faster)
    codeflash_output = levenshtein_distance("", "")  # 480ns -> 441ns (8.84% faster)
    codeflash_output = levenshtein_distance("a", "a")  # 2.09μs -> 1.95μs (7.22% faster)


def test_single_insertion():
    # Pairs that differ by exactly one trailing character, exercised in both
    # argument orders (removing the character one way, inserting it the other).
    codeflash_output = levenshtein_distance("kitten", "kitte")  # 13.0μs -> 8.69μs (49.4% faster)
    codeflash_output = levenshtein_distance("kitte", "kitten")  # 10.1μs -> 6.12μs (65.6% faster)
    codeflash_output = levenshtein_distance("", "a")  # 421ns -> 421ns (0.000% faster)
    codeflash_output = levenshtein_distance("a", "")  # 360ns -> 361ns (0.277% slower)


def test_single_deletion():
    # Deleting one character
    codeflash_output = levenshtein_distance("kitten", "kittn")  # 12.6μs -> 8.52μs (48.5% faster)
    codeflash_output = levenshtein_distance("kittn", "kitten")  # 10.0μs -> 5.96μs (67.9% faster)


def test_single_substitution():
    # Substituting one character
    codeflash_output = levenshtein_distance("kitten", "sitten")  # 14.8μs -> 9.26μs (60.2% faster)
    codeflash_output = levenshtein_distance("kitten", "kitteb")  # 11.7μs -> 6.81μs (72.4% faster)
    codeflash_output = levenshtein_distance("a", "b")  # 2.22μs -> 1.89μs (17.5% faster)


def test_multiple_operations():
    # Multiple edits required
    codeflash_output = levenshtein_distance("kitten", "sitting")  # 16.4μs -> 10.3μs (58.8% faster)
    codeflash_output = levenshtein_distance("flaw", "lawn")  # 6.70μs -> 4.47μs (50.0% faster)


def test_empty_and_nonempty():
    # One string empty, one non-empty
    codeflash_output = levenshtein_distance("", "abc")  # 751ns -> 751ns (0.000% faster)
    codeflash_output = levenshtein_distance("abc", "")  # 431ns -> 451ns (4.43% slower)


# 2. Edge Test Cases


def test_both_empty():
    # Both strings are empty
    codeflash_output = levenshtein_distance("", "")  # 781ns -> 761ns (2.63% faster)


def test_one_char_vs_empty():
    # One string is a single character, other is empty
    codeflash_output = levenshtein_distance("a", "")  # 771ns -> 781ns (1.28% slower)
    codeflash_output = levenshtein_distance("", "z")  # 431ns -> 441ns (2.27% slower)


def test_case_sensitivity():
    # Case should matter
    codeflash_output = levenshtein_distance("abc", "Abc")  # 7.70μs -> 5.87μs (31.1% faster)
    codeflash_output = levenshtein_distance("ABC", "abc")  # 5.14μs -> 3.73μs (37.9% faster)


def test_unicode_characters():
    # Unicode characters
    codeflash_output = levenshtein_distance("café", "cafe")  # 9.39μs -> 6.81μs (37.8% faster)
    codeflash_output = levenshtein_distance("naïve", "naive")  # 9.85μs -> 5.75μs (71.3% faster)
    codeflash_output = levenshtein_distance("你好", "你")  # 3.12μs -> 2.81μs (10.7% faster)
    codeflash_output = levenshtein_distance("你好", "您好")  # 3.10μs -> 2.71μs (14.5% faster)


def test_completely_different_strings():
    # No characters in common
    codeflash_output = levenshtein_distance("abc", "xyz")  # 7.45μs -> 5.61μs (32.9% faster)
    codeflash_output = levenshtein_distance("123", "abc")  # 5.14μs -> 3.46μs (48.7% faster)


def test_prefix_and_suffix():
    # One string is a prefix or suffix of the other
    codeflash_output = levenshtein_distance("abc", "abcd")  # 7.88μs -> 6.11μs (29.0% faster)
    codeflash_output = levenshtein_distance("abcd", "abc")  # 5.18μs -> 3.78μs (37.1% faster)
    codeflash_output = levenshtein_distance("abc", "zabc")  # 5.23μs -> 3.41μs (53.6% faster)
    codeflash_output = levenshtein_distance("abc", "abcz")  # 4.87μs -> 3.19μs (52.8% faster)


def test_repeated_characters():
    # Strings with repeated characters
    codeflash_output = levenshtein_distance("aaa", "aaaa")  # 4.89μs -> 4.79μs (2.11% faster)
    codeflash_output = levenshtein_distance("aaaa", "aaa")  # 2.92μs -> 3.06μs (4.89% slower)
    codeflash_output = levenshtein_distance("aaa", "bbb")  # 5.54μs -> 3.56μs (55.7% faster)


def test_numbers_and_symbols():
    # Strings with digits and symbols
    codeflash_output = levenshtein_distance("1234", "1243")  # 8.68μs -> 6.73μs (28.9% faster)
    codeflash_output = levenshtein_distance("!@#$", "!@#")  # 5.76μs -> 4.13μs (39.6% faster)
    codeflash_output = levenshtein_distance("!@#$", "$#@!")  # 6.25μs -> 4.45μs (40.5% faster)


def test_long_identical_strings():
    # Long identical strings (edge, but also performance)
    s = "a" * 100
    codeflash_output = levenshtein_distance(s, s)  # 519μs -> 535μs (2.86% slower)


def test_long_strings_one_difference():
    # Long strings with one difference at the end
    s1 = "a" * 999 + "b"
    s2 = "a" * 1000
    codeflash_output = levenshtein_distance(s1, s2)  # 60.1ms -> 59.3ms (1.27% faster)
    codeflash_output = levenshtein_distance(s2, s1)  # 60.3ms -> 59.7ms (1.11% faster)


def test_long_strings_completely_different():
    # Long completely different strings
    s1 = "a" * 500
    s2 = "b" * 500
    codeflash_output = levenshtein_distance(s1, s2)  # 67.1ms -> 30.4ms (121% faster)


# 3. Large Scale Test Cases


def test_large_equal_strings():
    # Large identical strings
    s = "abcde" * 200  # length 1000
    codeflash_output = levenshtein_distance(s, s)  # 242ms -> 114ms (111% faster)


def test_large_one_insertion():
    # Large string with one insertion
    s1 = "a" * 500 + "b" + "a" * 499  # length 1000
    s2 = "a" * 1000
    codeflash_output = levenshtein_distance(s1, s2)  # 58.2ms -> 56.2ms (3.59% faster)


def test_large_one_substitution():
    # Large string with one substitution in the middle
    s1 = "a" * 499 + "b" + "a" * 500
    s2 = "a" * 1000
    codeflash_output = levenshtein_distance(s1, s2)  # 57.9ms -> 57.2ms (1.16% faster)


def test_large_completely_different():
    # Large strings, all substitutions
    s1 = "a" * 1000
    s2 = "b" * 1000
    codeflash_output = levenshtein_distance(s1, s2)  # 274ms -> 129ms (112% faster)


def test_large_half_and_half():
    # Half the string is the same, half is different
    s1 = "a" * 500 + "b" * 500
    s2 = "a" * 1000
    codeflash_output = levenshtein_distance(s1, s2)  # 171ms -> 93.5ms (83.5% faster)


def test_large_with_unicode():
    # Large string with unicode characters
    s1 = "你" * 500 + "好" * 500
    s2 = "你" * 1000
    codeflash_output = levenshtein_distance(s1, s2)  # 174ms -> 96.3ms (81.0% faster)


# 4. Additional Robustness Cases


@pytest.mark.parametrize(
    "s1,s2,expected",
    [
        ("", "", 0),
        ("", "abc", 3),
        ("abc", "", 3),
        ("abc", "abc", 0),
        ("abc", "ab", 1),
        ("a", "b", 1),
        ("", "a", 1),
        ("a", "", 1),
        ("kitten", "sitting", 3),
        ("flaw", "lawn", 2),
        ("intention", "execution", 5),
        ("distance", "difference", 5),
        ("abcdef", "azced", 3),
        ("short", "ports", 3),
    ],
)
def test_various_cases(s1, s2, expected):
    # Parametrized test for various scenarios
    codeflash_output = levenshtein_distance(s1, s2)  # 130μs -> 85.5μs (52.5% faster)


# 5. Commutativity property (Levenshtein distance is symmetric)
def test_commutativity():
    pairs = [
        ("kitten", "sitting"),
        ("flaw", "lawn"),
        ("abc", "xyz"),
        ("", "abc"),
        ("a" * 500, "b" * 500),
        ("abcde" * 100, "edcba" * 100),
    ]
    for s1, s2 in pairs:
        codeflash_output = levenshtein_distance(s1, s2)
        d1 = codeflash_output  # 126ms -> 58.6ms (116% faster)
        codeflash_output = levenshtein_distance(s2, s1)
        d2 = codeflash_output  # 126ms -> 58.8ms (115% faster)


# 6. Triangle inequality property
def test_triangle_inequality():
    # For Levenshtein distance, d(x,z) <= d(x,y) + d(y,z)
    # NOTE: this function only computes and captures the three distances for
    # each (x, y, z) triple; it contains no assertion on the inequality itself.
    triples = [("kitten", "sitting", "sittin"), ("abc", "abd", "ab"), ("a" * 100, "a" * 99 + "b", "a" * 99 + "c")]
    for x, y, z in triples:
        codeflash_output = levenshtein_distance(x, z)
        d_xz = codeflash_output  # 557μs -> 537μs (3.89% faster)
        codeflash_output = levenshtein_distance(x, y)
        d_xy = codeflash_output  # 553μs -> 532μs (3.98% faster)
        codeflash_output = levenshtein_distance(y, z)
        d_yz = codeflash_output


# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
from __future__ import annotations

# imports
import pytest  # used for our unit tests

from codeflash.discovery.functions_to_optimize import levenshtein_distance

# unit tests


# 1. Basic Test Cases
def test_identical_strings():
    # Identical strings should have distance 0
    codeflash_output = levenshtein_distance("kitten", "kitten")  # 14.4μs -> 9.29μs (55.1% faster)
    codeflash_output = levenshtein_distance("", "")  # 611ns -> 521ns (17.3% faster)
    codeflash_output = levenshtein_distance("a", "a")  # 2.03μs -> 1.98μs (2.52% faster)


def test_single_insertion():
    # One insertion required
    codeflash_output = levenshtein_distance("kitten", "kittena")  # 16.1μs -> 9.74μs (65.7% faster)
    codeflash_output = levenshtein_distance("abc", "abcd")  # 5.73μs -> 3.86μs (48.6% faster)


def test_single_deletion():
    # One deletion required
    codeflash_output = levenshtein_distance("kitten", "kittn")  # 12.9μs -> 8.69μs (49.0% faster)
    codeflash_output = levenshtein_distance("abcd", "abc")  # 5.71μs -> 4.03μs (41.8% faster)


def test_single_substitution():
    # One substitution required
    codeflash_output = levenshtein_distance("kitten", "kittan")  # 14.5μs -> 9.22μs (57.3% faster)
    codeflash_output = levenshtein_distance("abc", "adc")  # 4.67μs -> 3.47μs (34.7% faster)


def test_multiple_operations():
    # Multiple operations needed
    codeflash_output = levenshtein_distance("kitten", "sitting")  # 16.6μs -> 10.1μs (65.1% faster)
    codeflash_output = levenshtein_distance("flaw", "lawn")  # 6.70μs -> 4.50μs (49.0% faster)
    codeflash_output = levenshtein_distance("gumbo", "gambol")  # 10.7μs -> 6.22μs (72.6% faster)


def test_case_sensitivity():
    # Should be case-sensitive
    codeflash_output = levenshtein_distance("a", "A")  # 4.12μs -> 3.55μs (16.1% faster)
    codeflash_output = levenshtein_distance("Python", "python")  # 13.1μs -> 7.71μs (69.8% faster)


def test_completely_different_strings():
    # All characters different
    codeflash_output = levenshtein_distance("abc", "xyz")  # 7.57μs -> 5.60μs (35.2% faster)
    codeflash_output = levenshtein_distance("aaa", "bbb")  # 4.95μs -> 3.26μs (52.0% faster)


# 2. Edge Test Cases


def test_empty_strings():
    # One or both strings empty
    codeflash_output = levenshtein_distance("", "abc")  # 822ns -> 751ns (9.45% faster)
    codeflash_output = levenshtein_distance("abc", "")  # 441ns -> 460ns (4.13% slower)
    codeflash_output = levenshtein_distance("", "")  # 290ns -> 321ns (9.66% slower)


def test_one_character_strings():
    # Single character to/from empty or another char
    codeflash_output = levenshtein_distance("a", "")  # 742ns -> 771ns (3.76% slower)
    codeflash_output = levenshtein_distance("", "a")  # 431ns -> 411ns (4.87% faster)
    codeflash_output = levenshtein_distance("a", "b")  # 3.80μs -> 3.29μs (15.5% faster)


def test_unicode_strings():
    # Unicode and multi-byte characters
    codeflash_output = levenshtein_distance("café", "cafe")  # 9.28μs -> 6.86μs (35.2% faster)
    codeflash_output = levenshtein_distance("你好", "你们好")  # 4.51μs -> 3.69μs (22.3% faster)
    codeflash_output = levenshtein_distance("🙂", "🙃")  # 2.33μs -> 2.08μs (12.0% faster)
    codeflash_output = levenshtein_distance("a🙂b", "a🙃b")  # 4.81μs -> 3.54μs (36.0% faster)


def test_whitespace_and_special_chars():
    # Strings with whitespace and special characters
    codeflash_output = levenshtein_distance("a b", "ab")  # 6.26μs -> 5.17μs (21.1% faster)
    codeflash_output = levenshtein_distance("a_b", "a-b")  # 5.12μs -> 3.48μs (47.3% faster)
    codeflash_output = levenshtein_distance("hello!", "hello")  # 10.1μs -> 5.99μs (68.2% faster)


def test_long_repeated_chars():
    # Strings with repeated characters
    codeflash_output = levenshtein_distance("aaaaa", "aaaa")  # 5.47μs -> 5.39μs (1.48% faster)
    codeflash_output = levenshtein_distance("aaaaa", "bbbbb")  # 10.9μs -> 6.39μs (71.0% faster)


def test_palindromes_and_reverses():
    # Palindrome and reversed strings
    codeflash_output = levenshtein_distance("abcde", "edcba")  # 11.9μs -> 7.68μs (54.8% faster)


def test_large_difference_in_length():
    # One string much longer than the other
    codeflash_output = levenshtein_distance("a", "a" * 100)  # 25.4μs -> 25.7μs (1.09% slower)
    codeflash_output = levenshtein_distance("b" * 100, "b")  # 23.3μs -> 23.4μs (0.474% slower)


def test_strings_with_numbers():
    # Strings with numbers
    codeflash_output = levenshtein_distance("abc123", "abc124")  # 14.5μs -> 9.02μs (60.9% faster)
    codeflash_output = levenshtein_distance("12345", "54321")  # 9.13μs -> 5.82μs (56.8% faster)


# 3. Large Scale Test Cases


def test_large_identical_strings():
    # Large identical strings should have distance 0
    s = "a" * 500
    codeflash_output = levenshtein_distance(s, s)  # 13.9ms -> 13.5ms (2.37% faster)


def test_large_one_insertion():
    # Large string with one insertion
    s1 = "a" * 499
    s2 = "a" * 250 + "b" + "a" * 249
    codeflash_output = levenshtein_distance(s1, s2)  # 13.8ms -> 13.6ms (1.61% faster)


def test_large_one_deletion():
    # Large string with one deletion
    s1 = "a" * 500
    s2 = "a" * 499
    codeflash_output = levenshtein_distance(s1, s2)  # 13.7ms -> 13.5ms (1.69% faster)


def test_large_one_substitution():
    # Large string with one substitution in the middle
    s1 = "a" * 250 + "b" + "a" * 249
    s2 = "a" * 500
    codeflash_output = levenshtein_distance(s1, s2)  # 13.9ms -> 13.5ms (2.27% faster)


def test_large_completely_different():
    # Large strings, all characters different
    s1 = "a" * 500
    s2 = "b" * 500
    codeflash_output = levenshtein_distance(s1, s2)  # 67.2ms -> 30.7ms (119% faster)


def test_large_partial_overlap():
    # Large strings with partial overlap
    s1 = "a" * 250 + "b" * 250
    s2 = "a" * 200 + "b" * 300
    # 50 a's replaced with b's
    codeflash_output = levenshtein_distance(s1, s2)  # 41.7ms -> 21.7ms (92.6% faster)


def test_large_strings_with_unicode():
    # Large strings with unicode characters
    s1 = "é" * 500
    s2 = "e" * 500
    codeflash_output = levenshtein_distance(s1, s2)  # 67.2ms -> 30.4ms (121% faster)


def test_large_strings_with_alternating_chars():
    # Alternating characters
    s1 = "ab" * 250
    s2 = "ba" * 250
    # Every position differs, yet s2 is s1 rotated by one character: dropping
    # the leading "a" and appending an "a" turns s1 into s2, so the edit
    # distance is 2 rather than 500.
    codeflash_output = levenshtein_distance(s1, s2)  # 41.5ms -> 21.5ms (92.9% faster)


# 4. Additional Edge Cases


def test_nonequivalent_lengths_and_content():
    # Both length and content differ
    codeflash_output = levenshtein_distance("abcdefg", "xyz")  # 12.9μs -> 8.40μs (53.8% faster)


def test_substring():
    # One string is a substring of the other
    codeflash_output = levenshtein_distance("abcdef", "abc")  # 9.93μs -> 7.42μs (33.7% faster)
    codeflash_output = levenshtein_distance("abc", "abcdef")  # 7.66μs -> 4.98μs (53.7% faster)


def test_strings_with_tabs_and_newlines():
    # Special whitespace characters
    codeflash_output = levenshtein_distance("abc\tdef", "abcdef")  # 16.8μs -> 10.3μs (62.8% faster)
    codeflash_output = levenshtein_distance("abc\ndef", "abcdef")  # 13.7μs -> 7.80μs (76.0% faster)


def test_zero_length_and_long_string():
    # One empty, one long
    codeflash_output = levenshtein_distance("", "a" * 999)  # 912ns -> 811ns (12.5% faster)
    codeflash_output = levenshtein_distance("b" * 999, "")  # 631ns -> 541ns (16.6% faster)


# 5. Determinism and Symmetry


@pytest.mark.parametrize(
    "s1,s2",
    [
        ("kitten", "sitting"),
        ("flaw", "lawn"),
        ("", "abc"),
        ("abc", ""),
        ("abc", "cba"),
        ("abc", "abc"),
        ("", ""),
        ("a", "b"),
        ("abc123", "abc124"),
        ("a" * 500, "a" * 500),
    ],
)
def test_symmetry(s1, s2):
    # Levenshtein distance is symmetric
    codeflash_output = levenshtein_distance(s1, s2)  # 13.8ms -> 13.5ms (1.90% faster)


# 6. Type robustness


def test_non_string_inputs():
    # Should raise TypeError if input is not string
    # int and None have no len(), so the first two cases fail on the length lookup.
    with pytest.raises(TypeError):
        levenshtein_distance(123, "abc")
    with pytest.raises(TypeError):
        levenshtein_distance("abc", None)
    # NOTE(review): lists support len(), list(), indexing and ==, which is all
    # the visible implementation uses. Unless the function validates argument
    # types elsewhere, these two cases would return a distance instead of
    # raising -- confirm against the full function before relying on them.
    with pytest.raises(TypeError):
        levenshtein_distance(["a", "b"], "ab")
    with pytest.raises(TypeError):
        levenshtein_distance("ab", ["a", "b"])


# 7. Stress test: Large but feasible within constraints


def test_large_strings_max_size():
    # Both strings at the upper limit (1000 chars)
    s1 = "a" * 1000
    s2 = "b" * 1000
    codeflash_output = levenshtein_distance(s1, s2)  # 272ms -> 130ms (109% faster)


def test_large_strings_one_char_difference():
    # 999 identical, 1 different
    s1 = "a" * 999 + "b"
    s2 = "a" * 1000
    codeflash_output = levenshtein_distance(s1, s2)  # 58.4ms -> 57.5ms (1.56% faster)


# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To test or edit this optimization locally, run: git merge codeflash/optimize-pr945-2025-11-27T14.39.26

Click to see suggested changes
Suggested change
x = prev[index1]
y = prev[index1 + 1]
z = curr[index1]
min_xy = min(x, y)
min_xyz = min(z, min_xy)
curr[index1 + 1] = 1 + min_xyz
# Avoid min() function call overhead by using direct comparisons
x = prev[index1]
y = prev[index1 + 1]
z = curr[index1]
if x < y:
if x < z:
curr[index1 + 1] = 1 + x
else:
curr[index1 + 1] = 1 + z
elif y < z:
curr[index1 + 1] = 1 + y
else:
curr[index1 + 1] = 1 + z

Static Badge


# Swap references rather than copying data
previous, current = current, previous
return previous[len1]

Expand Down
29 changes: 29 additions & 0 deletions codeflash/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import Counter, defaultdict
from typing import TYPE_CHECKING

import libcst as cst
from rich.tree import Tree

from codeflash.cli_cmds.console import DEBUG_MODE, lsp_log
Expand Down Expand Up @@ -505,6 +506,31 @@ def id(self) -> str:
f"{self.function_getting_tested}:{self.iteration_id}"
)

def find_func_in_class(self, class_node: cst.ClassDef, func_name: str) -> Optional[cst.FunctionDef]:
    """Return the function named ``func_name`` declared directly in ``class_node``.

    Only the statements of the class body itself are examined. The first
    ``FunctionDef`` whose name matches is returned; ``None`` is returned
    when the class body holds no such definition.
    """
    matching_defs = (
        member
        for member in class_node.body.body
        if isinstance(member, cst.FunctionDef) and member.name.value == func_name
    )
    return next(matching_defs, None)

def get_src_code(self, test_path: Path) -> Optional[str]:
    """Return the source text of this invocation's test function, if it can be located.

    The file at ``test_path`` is read as UTF-8 and parsed with libcst. When
    ``self.test_class_name`` is set, the function is looked up inside a
    top-level class of that name; otherwise it is looked up among the
    top-level functions of the module. The returned text is stripped of
    surrounding whitespace. ``None`` is returned when no match is found.
    """
    module_node = cst.parse_module(test_path.read_text(encoding="utf-8"))
    top_level = module_node.body

    if not self.test_class_name:
        # No class recorded: the test must be a module-level function.
        for node in top_level:
            if isinstance(node, cst.FunctionDef) and node.name.value == self.test_function_name:
                return module_node.code_for_node(node).strip()
        return None

    for node in top_level:
        if not isinstance(node, cst.ClassDef) or node.name.value != self.test_class_name:
            continue
        method_node = self.find_func_in_class(node, self.test_function_name)
        if method_node:
            return module_node.code_for_node(method_node).strip()
    # Neither a matching class nor a matching method inside one was found.
    return None

@staticmethod
def from_str_id(string_id: str, iteration_id: str | None = None) -> InvocationId:
components = string_id.split(":")
Expand Down Expand Up @@ -549,7 +575,10 @@ class TestResults(BaseModel): # noqa: PLW1641
# also we don't support deletion of test results elements - caution is advised
test_results: list[FunctionTestInvocation] = []
test_result_idx: dict[str, int] = {}

perf_stdout: Optional[str] = None
# mapping between test function name and stdout failure message
test_failures: Optional[dict[str, str]] = None

def add(self, function_test_invocation: FunctionTestInvocation) -> None:
unique_id = function_test_invocation.unique_invocation_loop_id
Expand Down
25 changes: 21 additions & 4 deletions codeflash/optimization/function_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1752,6 +1752,11 @@ def establish_original_code_baseline(
)
)

def get_results_not_matched_error(self) -> Failure:
    """Report a behavior mismatch and return the corresponding ``Failure``.

    Logs that the candidate's test results differ from those of the
    original code, draws a console rule, and returns a ``Failure``
    carrying the same message for the caller to propagate.
    """
    mismatch = Failure("Test results did not match the test results of the original code.")
    logger.info("h4|Test results did not match the test results of the original code ❌")
    console.rule()
    return mismatch

def run_optimized_candidate(
self,
*,
Expand Down Expand Up @@ -1808,13 +1813,25 @@ def run_optimized_candidate(
)
)
console.rule()
if compare_test_results(baseline_results.behavior_test_results, candidate_behavior_results):
match, diffs = compare_test_results(baseline_results.behavior_test_results, candidate_behavior_results)
if match:
logger.info("h3|Test results matched ✅")
console.rule()
else:
logger.info("h4|Test results did not match the test results of the original code ❌")
console.rule()
return Failure("Test results did not match the test results of the original code.")
result_unmatched_perc = len(diffs) / len(candidate_behavior_results)
if result_unmatched_perc > 0.5:
# if the test unmatched percentage is greater than 50%, we can't fix it
return self.get_results_not_matched_error()

# with the parsed test results diff ask the llm to fix the candidate to match the test results of the original code, and run again
# self.run_optimized_candidate(
# optimization_candidate_index=optimization_candidate_index,
# baseline_results=baseline_results,
# original_helper_code=original_helper_code,
# file_path_to_helper_classes=file_path_to_helper_classes,
# )
print(f"should try to fix it, diffs: {diffs}")
return self.get_results_not_matched_error()

logger.info(f"loading|Running performance tests for candidate {optimization_candidate_index}...")

Expand Down
95 changes: 71 additions & 24 deletions codeflash/verification/equivalence.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import sys
from dataclasses import dataclass
from enum import Enum
from typing import Any

from codeflash.cli_cmds.console import logger
from codeflash.models.models import TestResults, TestType, VerificationType
Expand All @@ -7,21 +9,48 @@
INCREASED_RECURSION_LIMIT = 5000


def compare_test_results(original_results: TestResults, candidate_results: TestResults) -> bool:
class TestDiffScope(Enum):
RETURN_VALUE = "return_value"
STDOUT = "stdout"
TIMED_OUT = "timed_out"
DID_PASS = "did_pass" # noqa: S105


@dataclass
class TestDiff:
scope: TestDiffScope
test_src_code: str
pytest_error: str
original_value: any
candidate_value: any


def compare_test_results(original_results: TestResults, candidate_results: TestResults) -> tuple[bool, list[TestDiff]]:
# This is meant to be only called with test results for the first loop index
if len(original_results) == 0 or len(candidate_results) == 0:
return False # empty test results are not equal
return False, [] # empty test results are not equal
original_recursion_limit = sys.getrecursionlimit()
if original_recursion_limit < INCREASED_RECURSION_LIMIT:
sys.setrecursionlimit(INCREASED_RECURSION_LIMIT) # Increase recursion limit to avoid RecursionError
test_ids_superset = original_results.get_all_unique_invocation_loop_ids().union(
set(candidate_results.get_all_unique_invocation_loop_ids())
)
are_equal: bool = True
test_diffs: list[TestDiff] = []
did_all_timeout: bool = True
for test_id in test_ids_superset:
original_test_result = original_results.get_by_unique_invocation_loop_id(test_id)
cdd_test_result = candidate_results.get_by_unique_invocation_loop_id(test_id)
candidate_test_failures = candidate_results.test_failures
# original_test_failures = original_results.test_failures
cdd_pytest_error = (
candidate_test_failures.get(original_test_result.id.test_function_name, "")
if candidate_test_failures
else ""
)
# original_pytest_error = (
# original_test_failures.get(original_test_result.id.test_function_name, "") if original_test_failures else ""
# )

if cdd_test_result is not None and original_test_result is None:
continue
# If helper function instance_state verification is not present, that's ok. continue
Expand All @@ -32,8 +61,7 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
):
continue
if original_test_result is None or cdd_test_result is None:
are_equal = False
break
return False, []
did_all_timeout = did_all_timeout and original_test_result.timed_out
if original_test_result.timed_out:
continue
Expand All @@ -43,31 +71,42 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
in {VerificationType.INIT_STATE_HELPER, VerificationType.INIT_STATE_FTO}
):
superset_obj = True
test_src_code = original_test_result.id.get_src_code(original_test_result.file_name)
if not comparator(original_test_result.return_value, cdd_test_result.return_value, superset_obj=superset_obj):
are_equal = False
test_diffs.append(
TestDiff(
scope=TestDiffScope.RETURN_VALUE,
test_src_code=test_src_code,
original_value=original_test_result.return_value,
candidate_value=cdd_test_result.return_value,
pytest_error=cdd_pytest_error,
)
)

try:
logger.debug(
"File Name: %s\n"
"Test Type: %s\n"
"Verification Type: %s\n"
"Invocation ID: %s\n"
"Original return value: %s\n"
"Candidate return value: %s\n"
"-------------------",
original_test_result.file_name,
original_test_result.test_type,
original_test_result.verification_type,
original_test_result.id,
original_test_result.return_value,
cdd_test_result.return_value,
print(
f"File Name: {original_test_result.file_name}\n"
f"Test Type: {original_test_result.test_type}\n"
f"Verification Type: {original_test_result.verification_type}\n"
f"Invocation ID: {original_test_result.id}\n"
f"Original return value: {original_test_result.return_value}\n"
f"Candidate return value: {cdd_test_result.return_value}\n"
)
except Exception as e:
logger.error(e)
break
if (original_test_result.stdout and cdd_test_result.stdout) and not comparator(
original_test_result.stdout, cdd_test_result.stdout
):
are_equal = False
test_diffs.append(
TestDiff(
scope=TestDiffScope.STDOUT,
test_src_code=test_src_code,
original_value=original_test_result.stdout,
candidate_value=cdd_test_result.stdout,
pytest_error=cdd_pytest_error,
)
)
break

if original_test_result.test_type in {
Expand All @@ -76,9 +115,17 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
TestType.GENERATED_REGRESSION,
TestType.REPLAY_TEST,
} and (cdd_test_result.did_pass != original_test_result.did_pass):
are_equal = False
test_diffs.append(
TestDiff(
scope=TestDiffScope.DID_PASS,
test_src_code=test_src_code,
original_value=original_test_result.did_pass,
candidate_value=cdd_test_result.did_pass,
pytest_error=cdd_pytest_error,
)
)
break
sys.setrecursionlimit(original_recursion_limit)
if did_all_timeout:
return False
return are_equal
return False, test_diffs
return len(test_diffs) == 0, test_diffs
42 changes: 42 additions & 0 deletions codeflash/verification/parse_test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,43 @@ def merge_test_results(
return merged_test_results


def parse_test_failures_from_stdout(test_results: TestResults, stdout: str) -> TestResults:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should write tests for this function

stdout_lines = stdout.splitlines()
start_line = -1
end_line = -1
for i, line in enumerate(stdout_lines):
if start_line != -1 and end_line != -1:
break
if "FAILURES" in line:
start_line = i
elif "short test summary info" in line:
end_line = i
if start_line == -1 or end_line == -1:
return test_results

complete_failure_output_lines = stdout_lines[start_line:end_line] # exclude last summary line

test_case_to_failure: dict[str, str] = {}

current_test_case: str | None = None
current_failure_lines: list[str] = []

for line in complete_failure_output_lines:
if line.startswith("_______"):
if current_test_case:
test_case_to_failure[current_test_case] = "".join(current_failure_lines)
current_test_case = line.strip("_ ").strip()
current_failure_lines = []
elif current_test_case:
current_failure_lines.append(line + "\n")

if current_test_case:
test_case_to_failure[current_test_case] = "".join(current_failure_lines)

test_results.test_failures = test_case_to_failure
return test_results


def parse_test_results(
test_xml_path: Path,
test_files: TestFiles,
Expand Down Expand Up @@ -572,4 +609,9 @@ def parse_test_results(
function_name=function_name,
)
coverage.log_coverage()
try:
parse_test_failures_from_stdout(results, run_result.stdout)
except Exception as e:
logger.exception(e)

return results, coverage if all_args else None
Loading
Loading