diff --git a/PythonScripts/audit_translations/auditor.py b/PythonScripts/audit_translations/auditor.py index dd0c68ae..0fac4d2a 100644 --- a/PythonScripts/audit_translations/auditor.py +++ b/PythonScripts/audit_translations/auditor.py @@ -191,19 +191,67 @@ def issue_base(rule: RuleInfo, file_name: str, language: str) -> dict: def first_structure_mismatch( english_tokens: List[str], translated_tokens: List[str], -) -> Tuple[Optional[str], Optional[str]]: +) -> Tuple[Optional[str], Optional[str], int]: + """ + Find the first structural mismatch between two token lists. + + Returns (en_token, tr_token, mismatch_position). + Position is the index in the token list where they first differ. + """ min_len = min(len(english_tokens), len(translated_tokens)) for idx in range(min_len): if english_tokens[idx] != translated_tokens[idx]: - return english_tokens[idx], translated_tokens[idx] + return english_tokens[idx], translated_tokens[idx], idx if len(english_tokens) > min_len: - return english_tokens[min_len], None + return english_tokens[min_len], None, min_len if len(translated_tokens) > min_len: - return None, translated_tokens[min_len] - return None, None + return None, translated_tokens[min_len], min_len + return None, None, -1 + + +def resolve_issue_line_at_position( + rule: RuleInfo, + kind: str, + token: Optional[str] = None, + position: int = 0 +) -> Optional[int]: + """ + Resolve the line number for a specific occurrence of an element within a rule. + + Args: + rule: The rule to search in + kind: The kind of element ('match', 'condition', 'variables', 'structure') + token: For structure kind, the specific token to find + position: The occurrence index (0 for first, 1 for second, etc.) + + Returns: + The line number if found, None if the element doesn't exist at that position. + """ + if kind == "match": + lines = rule.line_map.get("match", []) + elif kind == "condition": + lines = rule.line_map.get("condition", []) + elif kind == "variables": + lines = rule.line_map.get("variables", []) + elif kind == "structure" and token: + token_key = f"structure:{token.rstrip(':')}" + lines = rule.line_map.get(token_key, []) + else: + lines = [] + if position < len(lines): + return lines[position] + return None -def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -> int: + +def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) -> Optional[int]: + """ + Resolve the line number for an issue within a rule. + + Returns the line number if found, None if the element doesn't exist in the rule. + For 'structure' kind with a missing token, returns None instead of falling back + to rule.line_number to avoid misleading line numbers when elements are missing. + """ if kind == "match": lines = rule.line_map.get("match", []) elif kind == "condition": @@ -213,6 +261,9 @@ def resolve_issue_line(rule: RuleInfo, kind: str, token: Optional[str] = None) - elif kind == "structure" and token: token_key = f"structure:{token.rstrip(':')}" lines = rule.line_map.get(token_key, []) + # For structure differences, if the token doesn't exist, return None + # rather than falling back to rule.line_number which is misleading + return lines[0] if lines else None else: lines = [] return lines[0] if lines else rule.line_number @@ -274,9 +325,21 @@ def collect_issues( if diff.diff_type == "structure": en_tokens = extract_structure_elements(diff.english_rule.data) tr_tokens = extract_structure_elements(diff.translated_rule.data) - en_token, tr_token = first_structure_mismatch(en_tokens, tr_tokens) + en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens) + + # Skip reporting when tokens are misaligned (both exist but differ) + # This avoids misleading line numbers when entire blocks are missing/added + # We only report when one is None (clear case of missing element) + if en_token is not None and tr_token is not None and en_token != tr_token: + continue + issue_line_en = resolve_issue_line(diff.english_rule, "structure", en_token) issue_line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token) + + # Skip reporting structure differences where we can't find both tokens + # This avoids misleading line numbers when blocks are missing + if issue_line_en is None or issue_line_tr is None: + continue else: issue_line_en = resolve_issue_line(diff.english_rule, diff.diff_type) issue_line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type) @@ -340,23 +403,37 @@ def print_warnings(result: ComparisonResult, file_name: str, verbose: bool = Fal issues += 1 if result.rule_differences: - total_diffs = len(result.rule_differences) - console.print( - f"\n [magenta]≠[/] [bold]Rule Differences[/] " - f"[[magenta]{total_diffs}[/]] [dim](structural differences between en and translation)[/]" - ) + # Count only diffs that will actually be displayed + displayable_diffs = [] for diff in result.rule_differences: if diff.diff_type == "structure": en_tokens = extract_structure_elements(diff.english_rule.data) tr_tokens = extract_structure_elements(diff.translated_rule.data) - en_token, tr_token = first_structure_mismatch(en_tokens, tr_tokens) + en_token, tr_token, mismatch_pos = first_structure_mismatch(en_tokens, tr_tokens) + + # Skip reporting when tokens are misaligned (both exist but differ) + # This avoids misleading line numbers when entire blocks are missing/added + if en_token is not None and tr_token is not None and en_token != tr_token: + continue + line_en = resolve_issue_line(diff.english_rule, "structure", en_token) line_tr = resolve_issue_line(diff.translated_rule, "structure", tr_token) + # Skip structure diffs where we can't find both tokens + if line_en is None or line_tr is None: + continue else: line_en = resolve_issue_line(diff.english_rule, diff.diff_type) line_tr = resolve_issue_line(diff.translated_rule, diff.diff_type) - print_diff_item(diff, line_en=line_en, line_tr=line_tr, verbose=verbose) - issues += 1 + displayable_diffs.append((diff, line_en, line_tr)) + + if displayable_diffs: + console.print( + f"\n [magenta]≠[/] [bold]Rule Differences[/] " + f"[[magenta]{len(displayable_diffs)}[/]] [dim](structural differences between en and translation)[/]" + ) + for diff, line_en, line_tr in displayable_diffs: + print_diff_item(diff, line_en=line_en, line_tr=line_tr, verbose=verbose) + issues += 1 if result.extra_rules: console.print(f"\n [blue]ℹ[/] [bold]Extra Rules[/] [[blue]{len(result.extra_rules)}[/]] [dim](may be intentional)[/]") diff --git a/PythonScripts/audit_translations/tests/fixtures/de/structure_misaligned.yaml b/PythonScripts/audit_translations/tests/fixtures/de/structure_misaligned.yaml new file mode 100644 index 00000000..36e3097d --- /dev/null +++ b/PythonScripts/audit_translations/tests/fixtures/de/structure_misaligned.yaml @@ -0,0 +1,15 @@ +- name: misaligned-structure + tag: root + match: "." + replace: + - test: + if: $Setting = 'Value' + then: + - test: + if: parent::m:minus + then: [T: "negativ"] + else: [T: "positiv"] + - test: + if: "*[2][.='2']" + then: [T: "quadrat"] + else: [T: "andere"] diff --git a/PythonScripts/audit_translations/tests/fixtures/de/structure_missing_else.yaml b/PythonScripts/audit_translations/tests/fixtures/de/structure_missing_else.yaml new file mode 100644 index 00000000..445f0670 --- /dev/null +++ b/PythonScripts/audit_translations/tests/fixtures/de/structure_missing_else.yaml @@ -0,0 +1,7 @@ +- name: missing-else-block + tag: root + match: "." + replace: + - test: + if: "@intent='foo'" + then: [T: "hat foo"] diff --git a/PythonScripts/audit_translations/tests/fixtures/en/structure_misaligned.yaml b/PythonScripts/audit_translations/tests/fixtures/en/structure_misaligned.yaml new file mode 100644 index 00000000..e4999d95 --- /dev/null +++ b/PythonScripts/audit_translations/tests/fixtures/en/structure_misaligned.yaml @@ -0,0 +1,18 @@ +- name: misaligned-structure + tag: root + match: "." + replace: + - test: + if: "$Verbosity!='Terse'" + then: [t: "the"] + - test: + if: "$Setting = 'Value'" + then: + - test: + if: "parent::m:minus" + then: [t: "negative"] + else: [t: "positive"] + - test: + if: "*[2][.='2']" + then: [t: "square"] + else: [t: "other"] diff --git a/PythonScripts/audit_translations/tests/fixtures/en/structure_missing_else.yaml b/PythonScripts/audit_translations/tests/fixtures/en/structure_missing_else.yaml new file mode 100644 index 00000000..a1016856 --- /dev/null +++ b/PythonScripts/audit_translations/tests/fixtures/en/structure_missing_else.yaml @@ -0,0 +1,8 @@ +- name: missing-else-block + tag: root + match: "." + replace: + - test: + if: "@intent='foo'" + then: [t: "has foo"] + else: [t: "no foo"] diff --git a/PythonScripts/audit_translations/tests/golden/jsonl/de.json b/PythonScripts/audit_translations/tests/golden/jsonl/de.json index f5c4600f..66cb0214 100644 --- a/PythonScripts/audit_translations/tests/golden/jsonl/de.json +++ b/PythonScripts/audit_translations/tests/golden/jsonl/de.json @@ -145,6 +145,42 @@ "untranslated_texts": [], "_explanation": "rule_difference structure_empty.yaml struct-empty|mi" }, + { + "language": "de", + "file": "structure_misaligned.yaml", + "rule_name": "misaligned-structure", + "rule_tag": "root", + "rule_key": "misaligned-structure|root", + "issue_line_en": 6, + "issue_line_tr": 6, + "rule_line_en": 1, + "rule_line_tr": 1, + "issue_type": "rule_difference", + "diff_type": "condition", + "description": "Conditions differ", + "english_snippet": "$Setting = 'Value', $Verbosity!='Terse', *[2][.='2'], parent::m:minus", + "translated_snippet": "$Setting = 'Value', *[2][.='2'], parent::m:minus", + "untranslated_texts": [], + "_explanation": "structure_misaligned.yaml: English has extra test block causing misalignment. Fix filters out misleading structure differences but reports condition difference." + }, + { + "language": "de", + "file": "structure_missing_else.yaml", + "rule_name": "missing-else-block", + "rule_tag": "root", + "rule_key": "missing-else-block|root", + "issue_line_en": 8, + "issue_line_tr": 1, + "rule_line_en": 1, + "rule_line_tr": 1, + "issue_type": "rule_difference", + "diff_type": "structure", + "description": "Rule structure differs (test/if/then/else blocks)", + "english_snippet": "replace: test: if: then: else:", + "translated_snippet": "replace: test: if: then:", + "untranslated_texts": [], + "_explanation": "structure_missing_else.yaml: Verifies fix still reports legitimate missing else blocks. Unlike misaligned case, this has clear missing element." + }, { "language": "de", "file": "unicode.yaml", diff --git a/PythonScripts/audit_translations/tests/test_auditor.py b/PythonScripts/audit_translations/tests/test_auditor.py index 6b452014..6fcc5e07 100644 --- a/PythonScripts/audit_translations/tests/test_auditor.py +++ b/PythonScripts/audit_translations/tests/test_auditor.py @@ -219,3 +219,136 @@ def test_print_warnings_includes_snippets_when_verbose() -> None: output = capture.get() assert output == golden_path.read_text(encoding="utf-8") + + +def test_misaligned_structure_differences_are_skipped() -> None: + """ + Test that structure differences with misaligned tokens are skipped. + + When English has a "test" block that Norwegian doesn't have (and vice versa), + the structural tokens become misaligned. The fix skips reporting these + to avoid showing confusing line numbers. + """ + base_dir = Path(__file__).parent + fixtures_dir = base_dir / "fixtures" + + result = compare_files( + str(fixtures_dir / "en" / "structure_misaligned.yaml"), + str(fixtures_dir / "de" / "structure_misaligned.yaml"), + ) + + # The result should detect that structures differ + assert len(result.rule_differences) > 0 + assert any(diff.diff_type == "structure" for diff in result.rule_differences) + + # But when collecting issues, misaligned structure diffs should be filtered out + issues = collect_issues(result, "structure_misaligned.yaml", "de") + structure_issues = [i for i in issues if i["diff_type"] == "structure"] + + # CRITICAL: Before the fix, this would have structure issues with misleading line numbers + # After the fix, misaligned structures are skipped, so we should have 0 structure issues + assert len(structure_issues) == 0, ( + "Expected misaligned structure differences to be filtered out, " + f"but found {len(structure_issues)} structure issues" + ) + + # Other differences (like conditions) should still be reported + condition_issues = [i for i in issues if i["diff_type"] == "condition"] + assert len(condition_issues) > 0, "Expected condition differences to still be reported" + + +def test_missing_else_block_is_still_reported() -> None: + """ + Test that legitimate missing structural elements are still reported. + + When one file has an 'else' block and the other doesn't, this is a clear + structural difference that should be reported with accurate line numbers. + """ + base_dir = Path(__file__).parent + fixtures_dir = base_dir / "fixtures" + + result = compare_files( + str(fixtures_dir / "en" / "structure_missing_else.yaml"), + str(fixtures_dir / "de" / "structure_missing_else.yaml"), + ) + + # Should detect structure difference + assert len(result.rule_differences) > 0 + structure_diffs = [diff for diff in result.rule_differences if diff.diff_type == "structure"] + assert len(structure_diffs) == 1 + + # This case has one token None (missing else), so it should still be reported + issues = collect_issues(result, "structure_missing_else.yaml", "de") + structure_issues = [i for i in issues if i["diff_type"] == "structure"] + + # CRITICAL: This legitimate difference should still be reported + # One file has else:, the other doesn't - a clear missing element + assert len(structure_issues) == 1, ( + "Expected missing else block to be reported, " + f"but found {len(structure_issues)} structure issues" + ) + + # Verify the issue has proper line numbers + issue = structure_issues[0] + assert issue["issue_line_en"] is not None + # When else: doesn't exist in translation, we fall back to the rule line number + assert issue["issue_line_tr"] == 1 # start of the rule + + +def test_print_warnings_skips_misaligned_structures() -> None: + """ + Test that print_warnings doesn't display misaligned structure differences. + """ + base_dir = Path(__file__).parent + fixtures_dir = base_dir / "fixtures" + + result = compare_files( + str(fixtures_dir / "en" / "structure_misaligned.yaml"), + str(fixtures_dir / "de" / "structure_misaligned.yaml"), + ) + + # Raw result should have structure differences detected + structure_diffs = [diff for diff in result.rule_differences if diff.diff_type == "structure"] + assert len(structure_diffs) > 0 + + with console.capture() as capture: + issues_count = print_warnings(result, "structure_misaligned.yaml", verbose=False) + output = capture.get() + + # CRITICAL: The output should not contain "Rule structure differs" + # because misaligned structures are filtered during display + assert "Rule structure differs" not in output, ( + "Expected misaligned structure differences to be filtered from display" + ) + + # The issues count should not include filtered structure differences + # It should only count the condition differences + condition_diffs = [diff for diff in result.rule_differences if diff.diff_type == "condition"] + assert issues_count == len(condition_diffs), ( + f"Expected issues_count ({issues_count}) to equal condition_diffs ({len(condition_diffs)})" + ) + + +def test_print_warnings_still_shows_missing_else() -> None: + """ + Test that print_warnings still displays legitimate missing elements. + """ + base_dir = Path(__file__).parent + fixtures_dir = base_dir / "fixtures" + + result = compare_files( + str(fixtures_dir / "en" / "structure_missing_else.yaml"), + str(fixtures_dir / "de" / "structure_missing_else.yaml"), + ) + + with console.capture() as capture: + issues_count = print_warnings(result, "structure_missing_else.yaml", verbose=False) + output = capture.get() + + # CRITICAL: This legitimate difference should appear in output + assert "Rule structure differs" in output, ( + "Expected missing else block to be shown in output" + ) + + # Should report exactly 1 issue (the structure difference) + assert issues_count == 1, f"Expected 1 issue but got {issues_count}"