Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 92 additions & 15 deletions PythonScripts/audit_translations/auditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,19 +191,67 @@ def issue_base(rule: RuleInfo, file_name: str, language: str) -> dict:
def first_structure_mismatch(
english_tokens: List[str],
translated_tokens: List[str],
) -> Tuple[Optional[str], Optional[str]]:
) -> Tuple[Optional[str], Optional[str], int]:
"""
Find the first structural mismatch between two token lists.

Returns (en_token, tr_token, mismatch_position).
Position is the index in the token list where they first differ.
"""
min_len = min(len(english_tokens), len(translated_tokens))
for idx in range(min_len):
if english_tokens[idx] != translated_tokens[idx]:
return english_tokens[idx], translated_tokens[idx]
return english_tokens[idx], translated_tokens[idx], idx
if len(english_tokens) > min_len:
return english_tokens[min_len], None
return english_tokens[min_len], None, min_len
if len(translated_tokens) > min_len:
return None, translated_tokens[min_len]
return None, None
return None, translated_tokens[min_len], min_len
return None, None, -1


def resolve_issue_line_at_position(
rule: RuleInfo,
kind: str,
token: Optional[str] = None,
position: int = 0
) -> Optional[int]:
"""
Resolve the line number for a specific occurrence of an element within a rule.

Args:
rule: The rule to search in
kind: The kind of element ('match', 'condition', 'variables', 'structure')
token: For structure kind, the specific token to find
position: The occurrence index (0 for first, 1 for second, etc.)

Returns:
The line number if found, None if the element doesn't exist at that position.
"""
if kind == "match":
lines = rule.line_map.get("match", [])
elif kind == "condition":
lines = rule.line_map.get("condition", [])
elif kind == "variables":
lines = rule.line_map.get("variables", [])
elif kind == "structure" and token:
token_key = f"structure:{token.rstrip(':')}"
lines = rule.line_map.get(token_key, [])
else:
lines = []

if position < len(lines):
return lines[position]
return None

def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -> int:

def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -> Optional[int]:
"""
Resolve the line number for an issue within a rule.

Returns the line number if found, None if the element doesn't exist in the rule.
For 'structure' kind with a missing token, returns None instead of falling back
to rule.line_number to avoid misleading line numbers when elements are missing.
"""
if kind == "match":
lines = rule.line_map.get("match", [])
elif kind == "condition":
Expand All @@ -213,6 +261,9 @@ def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -
elif kind == "structure" and token:
token_key = f"structure:{token.rstrip(':')}"
lines = rule.line_map.get(token_key, [])
# For structure differences, if the token doesn't exist, return None
# rather than falling back to rule.line_number which is misleading
return lines[0] if lines else None
else:
lines = []
return lines[0] if lines else rule.line_number
Expand Down Expand Up @@ -274,9 +325,21 @@ def collect_issues(
if diff.diff_type == "structure":
en_tokens = extract_structure_elements(diff.english_rule.data)
tr_tokens = extract_structure_elements(diff.translated_rule.data)
en_token, tr_token = first_structure_mismatch(en_tokens, tr_tokens)
en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens)

# Skip reporting when tokens are misaligned (both exist but differ)
# This avoids misleading line numbers when entire blocks are missing/added
# We only report when one is None (clear case of missing element)
if en_token is not None and tr_token is not None and en_token != tr_token:
continue

issue_line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
issue_line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)

# Skip reporting structure differences where we can't find both tokens
# This avoids misleading line numbers when blocks are missing
if issue_line_en is None or issue_line_tr is None:
continue
else:
issue_line_en = resolve_issue_line(diff.english_rule, diff.diff_type)
issue_line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type)
Expand Down Expand Up @@ -340,23 +403,37 @@ def print_warnings(result: ComparisonResult, file_name: str, verbose: bool = Fal
issues += 1

if result.rule_differences:
total_diffs = len(result.rule_differences)
console.print(
f"\n [magenta]≠[/] [bold]Rule Differences[/] "
f"[[magenta]{total_diffs}[/]] [dim](structural differences between en and translation)[/]"
)
# Count only diffs that will actually be displayed
displayable_diffs = []
for diff in result.rule_differences:
if diff.diff_type == "structure":
en_tokens = extract_structure_elements(diff.english_rule.data)
tr_tokens = extract_structure_elements(diff.translated_rule.data)
en_token, tr_token = first_structure_mismatch(en_tokens, tr_tokens)
en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens)

# Skip reporting when tokens are misaligned (both exist but differ)
# This avoids misleading line numbers when entire blocks are missing/added
if en_token is not None and tr_token is not None and en_token != tr_token:
continue

line_en = resolve_issue_line(diff.english_rule, "structure", en_token)
line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token)
# Skip structure diffs where we can't find both tokens
if line_en is None or line_tr is None:
continue
else:
line_en = resolve_issue_line(diff.english_rule, diff.diff_type)
line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type)
print_diff_item(diff, line_en=line_en, line_tr=line_tr, verbose=verbose)
issues += 1
displayable_diffs.append((diff, line_en, line_tr))

if displayable_diffs:
console.print(
f"\n [magenta]≠[/] [bold]Rule Differences[/] "
f"[[magenta]{len(displayable_diffs)}[/]] [dim](structural differences between en and translation)[/]"
)
for diff, line_en, line_tr in displayable_diffs:
print_diff_item(diff, line_en=line_en, line_tr=line_tr, verbose=verbose)
issues += 1

if result.extra_rules:
console.print(f"\n [blue]ℹ[/] [bold]Extra Rules[/] [[blue]{len(result.extra_rules)}[/]] [dim](may be intentional)[/]")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
- name: misaligned-structure
tag: root
match: "."
replace:
- test:
if: $Setting = 'Value'
then:
- test:
if: parent::m:minus
then: [T: "negativ"]
else: [T: "positiv"]
- test:
if: "*[2][.='2']"
then: [T: "quadrat"]
else: [T: "andere"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- name: missing-else-block
tag: root
match: "."
replace:
- test:
if: "@intent='foo'"
then: [T: "hat foo"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
- name: misaligned-structure
tag: root
match: "."
replace:
- test:
if: "$Verbosity!='Terse'"
then: [t: "the"]
- test:
if: "$Setting = 'Value'"
then:
- test:
if: "parent::m:minus"
then: [t: "negative"]
else: [t: "positive"]
- test:
if: "*[2][.='2']"
then: [t: "square"]
else: [t: "other"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
- name: missing-else-block
tag: root
match: "."
replace:
- test:
if: "@intent='foo'"
then: [t: "has foo"]
else: [t: "no foo"]
36 changes: 36 additions & 0 deletions PythonScripts/audit_translations/tests/golden/jsonl/de.json
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,42 @@
"untranslated_texts": [],
"_explanation": "rule_difference structure_empty.yaml struct-empty|mi"
},
{
"language": "de",
"file": "structure_misaligned.yaml",
"rule_name": "misaligned-structure",
"rule_tag": "root",
"rule_key": "misaligned-structure|root",
"issue_line_en": 6,
"issue_line_tr": 6,
"rule_line_en": 1,
"rule_line_tr": 1,
"issue_type": "rule_difference",
"diff_type": "condition",
"description": "Conditions differ",
"english_snippet": "$Setting = 'Value', $Verbosity!='Terse', *[2][.='2'], parent::m:minus",
"translated_snippet": "$Setting = 'Value', *[2][.='2'], parent::m:minus",
"untranslated_texts": [],
"_explanation": "structure_misaligned.yaml: English has extra test block causing misalignment. Fix filters out misleading structure differences but reports condition difference."
},
{
"language": "de",
"file": "structure_missing_else.yaml",
"rule_name": "missing-else-block",
"rule_tag": "root",
"rule_key": "missing-else-block|root",
"issue_line_en": 8,
"issue_line_tr": 1,
"rule_line_en": 1,
"rule_line_tr": 1,
"issue_type": "rule_difference",
"diff_type": "structure",
"description": "Rule structure differs (test/if/then/else blocks)",
"english_snippet": "replace: test: if: then: else:",
"translated_snippet": "replace: test: if: then:",
"untranslated_texts": [],
"_explanation": "structure_missing_else.yaml: Verifies fix still reports legitimate missing else blocks. Unlike misaligned case, this has clear missing element."
},
{
"language": "de",
"file": "unicode.yaml",
Expand Down
133 changes: 133 additions & 0 deletions PythonScripts/audit_translations/tests/test_auditor.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,136 @@ def test_print_warnings_includes_snippets_when_verbose() -> None:
output = capture.get()

assert output == golden_path.read_text(encoding="utf-8")


def test_misaligned_structure_differences_are_skipped() -> None:
"""
Test that structure differences with misaligned tokens are skipped.

When English has a "test" block that Norwegian doesn't have (and vice versa),
the structural tokens become misaligned. The fix skips reporting these
to avoid showing confusing line numbers.
"""
base_dir = Path(__file__).parent
fixtures_dir = base_dir / "fixtures"

result = compare_files(
str(fixtures_dir / "en" / "structure_misaligned.yaml"),
str(fixtures_dir / "de" / "structure_misaligned.yaml"),
)

# The result should detect that structures differ
assert len(result.rule_differences) > 0
assert any(diff.diff_type == "structure" for diff in result.rule_differences)

# But when collecting issues, misaligned structure diffs should be filtered out
issues = collect_issues(result, "structure_misaligned.yaml", "de")
structure_issues = [i for i in issues if i["diff_type"] == "structure"]

# CRITICAL: Before the fix, this would have structure issues with misleading line numbers
# After the fix, misaligned structures are skipped, so we should have 0 structure issues
assert len(structure_issues) == 0, (
"Expected misaligned structure differences to be filtered out, "
f"but found {len(structure_issues)} structure issues"
)

# Other differences (like conditions) should still be reported
condition_issues = [i for i in issues if i["diff_type"] == "condition"]
assert len(condition_issues) > 0, "Expected condition differences to still be reported"


def test_missing_else_block_is_still_reported() -> None:
"""
Test that legitimate missing structural elements are still reported.

When one file has an 'else' block and the other doesn't, this is a clear
structural difference that should be reported with accurate line numbers.
"""
base_dir = Path(__file__).parent
fixtures_dir = base_dir / "fixtures"

result = compare_files(
str(fixtures_dir / "en" / "structure_missing_else.yaml"),
str(fixtures_dir / "de" / "structure_missing_else.yaml"),
)

# Should detect structure difference
assert len(result.rule_differences) > 0
structure_diffs = [diff for diff in result.rule_differences if diff.diff_type == "structure"]
assert len(structure_diffs) == 1

# This case has one token None (missing else), so it should still be reported
issues = collect_issues(result, "structure_missing_else.yaml", "de")
structure_issues = [i for i in issues if i["diff_type"] == "structure"]

# CRITICAL: This legitimate difference should still be reported
# One file has else:, the other doesn't - a clear missing element
assert len(structure_issues) == 1, (
"Expected missing else block to be reported, "
f"but found {len(structure_issues)} structure issues"
)

# Verify the issue has proper line numbers
issue = structure_issues[0]
assert issue["issue_line_en"] is not None
# When else: doesn't exist in translation, we fall back to the rule line number
assert issue["issue_line_tr"] == 1 # start of the rule


def test_print_warnings_skips_misaligned_structures() -> None:
"""
Test that print_warnings doesn't display misaligned structure differences.
"""
base_dir = Path(__file__).parent
fixtures_dir = base_dir / "fixtures"

result = compare_files(
str(fixtures_dir / "en" / "structure_misaligned.yaml"),
str(fixtures_dir / "de" / "structure_misaligned.yaml"),
)

# Raw result should have structure differences detected
structure_diffs = [diff for diff in result.rule_differences if diff.diff_type == "structure"]
assert len(structure_diffs) > 0

with console.capture() as capture:
issues_count = print_warnings(result, "structure_misaligned.yaml", verbose=False)
output = capture.get()

# CRITICAL: The output should not contain "Rule structure differs"
# because misaligned structures are filtered during display
assert "Rule structure differs" not in output, (
"Expected misaligned structure differences to be filtered from display"
)

# The issues count should not include filtered structure differences
# It should only count the condition differences
condition_diffs = [diff for diff in result.rule_differences if diff.diff_type == "condition"]
assert issues_count == len(condition_diffs), (
f"Expected issues_count ({issues_count}) to equal condition_diffs ({len(condition_diffs)})"
)


def test_print_warnings_still_shows_missing_else() -> None:
"""
Test that print_warnings still displays legitimate missing elements.
"""
base_dir = Path(__file__).parent
fixtures_dir = base_dir / "fixtures"

result = compare_files(
str(fixtures_dir / "en" / "structure_missing_else.yaml"),
str(fixtures_dir / "de" / "structure_missing_else.yaml"),
)

with console.capture() as capture:
issues_count = print_warnings(result, "structure_missing_else.yaml", verbose=False)
output = capture.get()

# CRITICAL: This legitimate difference should appear in output
assert "Rule structure differs" in output, (
"Expected missing else block to be shown in output"
)

# Should report exactly 1 issue (the structure difference)
assert issues_count == 1, f"Expected 1 issue but got {issues_count}"
Loading