From 557ce0c6b4a3a2f49a1af59580b9b824c2b59570 Mon Sep 17 00:00:00 2001 From: Mrityunjay Raj Date: Mon, 5 Jan 2026 13:36:17 +0530 Subject: [PATCH] Fix D2D mapping for Java class names differing from source filename When a Java source file contains class declarations with names different from the filename (e.g., DelombokTask.java containing class Tasks), the compiled .class file uses the class name (Tasks.class), not the filename. This fix: - Extracts class/interface/enum names from Java source files during scanning - Stores class names in extra_data["java_classes"] - Indexes both filename-based and class-name-based paths for matching - Enables correct D2D mapping for these edge cases Closes #1993 Signed-off-by: Mrityunjay Raj --- scanpipe/pipes/jvm.py | 69 ++++++++++++++++++++++++- scanpipe/tests/pipes/test_d2d.py | 28 ++++++++++ scanpipe/tests/pipes/test_jvm.py | 89 ++++++++++++++++++++++++++++++++ 3 files changed, 185 insertions(+), 1 deletion(-) diff --git a/scanpipe/pipes/jvm.py b/scanpipe/pipes/jvm.py index b071d9dcd4..675eace17a 100644 --- a/scanpipe/pipes/jvm.py +++ b/scanpipe/pipes/jvm.py @@ -41,6 +41,8 @@ class JvmLanguage: binary_extensions: tuple = (".class",) # Like java_package, kotlin_package, scala_package, used as an attribute in resource source_package_attribute_name: str = None + # Like java_classes, stores the class names defined in the source file + source_classes_attribute_name: str = None # A regex pattern to extract a package from a source file package_regex: Pattern = None # Type of relation for a binary file to its source file @@ -115,14 +117,38 @@ def get_indexable_qualified_paths_from_values(cls, resource_values): And the output tuples look like this example:: (123, "org/apache/commons/LoggerImpl.java") + + If the source file contains class names that differ from the filename + (e.g., a file named "Foo.java" containing "class Bar"), additional + entries are yielded for each class name. """ for resource_id, resource_name, resource_extra_data in resource_values: + jvm_package = resource_extra_data.get(cls.source_package_attribute_name) + # Yield the original filename-based path fully_qualified = get_fully_qualified_path( - jvm_package=resource_extra_data.get(cls.source_package_attribute_name), + jvm_package=jvm_package, filename=resource_name, ) yield resource_id, fully_qualified + # Also yield paths for any class names that differ from the filename + if cls.source_classes_attribute_name: + class_names = resource_extra_data.get( + cls.source_classes_attribute_name, [] + ) + # Get the base name without extension to compare + base_name = Path(resource_name).stem + extension = Path(resource_name).suffix + for class_name in class_names: + # Only yield if class name differs from filename + if class_name != base_name: + class_filename = f"{class_name}{extension}" + class_path = get_fully_qualified_path( + jvm_package=jvm_package, + filename=class_filename, + ) + yield resource_id, class_path + @classmethod def get_normalized_path(cls, path, extension): """ @@ -180,14 +206,55 @@ def find_expression(lines, regex): return value +def find_all_expressions(lines, regex, max_lines=500): + """Return all values found using ``regex`` in the first ``max_lines`` lines.""" + results = [] + for ln, line in enumerate(lines): + if ln > max_lines: + break + for value in regex.findall(line): + if value and value not in results: + results.append(value) + return results + + class JavaLanguage(JvmLanguage): name = "java" source_extensions = (".java",) binary_extensions = (".class",) source_package_attribute_name = "java_package" + source_classes_attribute_name = "java_classes" package_regex = re.compile(r"^\s*package\s+([\w\.]+)\s*;") + # Regex to match class/interface/enum declarations in Java + # Matches patterns like: "class Foo", "public class Foo", "interface Bar", etc. + class_name_regex = re.compile( + r"(?:^|[;\s{}])\s*" # Start of line or after ; { } or whitespace + r"(?:public\s+|private\s+|protected\s+|abstract\s+|final\s+|static\s+)*" + r"(?:class|interface|enum)\s+" + r"(\w+)" # Capture the class/interface/enum name + ) binary_map_type = "java_to_class" + @classmethod + def find_source_package(cls, lines): + """Find the package and class names from Java source lines.""" + result = {} + lines_list = list(lines) + + # Find package + package = find_expression(lines=iter(lines_list), regex=cls.package_regex) + if package: + result[cls.source_package_attribute_name] = package + + # Find all class/interface/enum names + class_names = find_all_expressions( + lines=iter(lines_list), regex=cls.class_name_regex + ) + if class_names: + result[cls.source_classes_attribute_name] = class_names + + return result if result else None + class ScalaLanguage(JvmLanguage): name = "scala" diff --git a/scanpipe/tests/pipes/test_d2d.py b/scanpipe/tests/pipes/test_d2d.py index 4d8433498e..81281e4d53 100644 --- a/scanpipe/tests/pipes/test_d2d.py +++ b/scanpipe/tests/pipes/test_d2d.py @@ -498,6 +498,34 @@ def test_scanpipe_pipes_d2d_map_java_to_class_no_java(self): expected = "No ('.java',) resources to map." self.assertIn(expected, buffer.getvalue()) + def test_scanpipe_pipes_d2d_map_java_to_class_different_class_name(self): + """Test D2D mapping when class name differs from source filename (#1993).""" + # Source file named DelombokTask.java but contains class Tasks + from1 = make_resource_file( + self.project1, + path="from/lombok/delombok/ant/DelombokTask.java", + extra_data={ + "java_package": "lombok.delombok.ant", + "java_classes": ["Tasks", "Delombok"], + }, + ) + # The .class file is named after the class, not the source file + to1 = make_resource_file( + self.project1, + path="to/lombok/delombok/ant/Tasks.class", + ) + + buffer = io.StringIO() + d2d.map_jvm_to_class( + self.project1, logger=buffer.write, jvm_lang=jvm.JavaLanguage + ) + + # Should find the mapping via class name + relation = self.project1.codebaserelations.get() + self.assertEqual(from1, relation.from_resource) + self.assertEqual(to1, relation.to_resource) + self.assertEqual("java_to_class", relation.map_type) + def test_scanpipe_pipes_d2d_java_ignore_pattern(self): make_resource_file(self.project1, path="to/module-info.class") make_resource_file(self.project1, path="to/META-INF/MANIFEST.MF") diff --git a/scanpipe/tests/pipes/test_jvm.py b/scanpipe/tests/pipes/test_jvm.py index a832fe3a1d..1c6df2b8f7 100644 --- a/scanpipe/tests/pipes/test_jvm.py +++ b/scanpipe/tests/pipes/test_jvm.py @@ -50,6 +50,48 @@ java_package_too_far_down = ("\n" * 501) + "package org.apache.logging.log4j.core;" +# Java code where class name differs from filename (like lombok's DelombokTask.java) +java_code_different_class_name = """ +package lombok.delombok.ant; + +import org.apache.tools.ant.Task; + +/** + * Ant tasks for delombok. + * This file is named DelombokTask.java but contains class Tasks. + */ +class Tasks { + public static class Delombok extends Task { + public void execute() {} + } + + public static class Format extends Task { + public void execute() {} + } +} +""" + +# Java code with multiple classes including interface and enum +java_code_multiple_types = """ +package com.example; + +public class MainClass { + // Main implementation +} + +interface SomeInterface { + void doSomething(); +} + +enum Status { + ACTIVE, INACTIVE +} + +abstract class AbstractBase { + public abstract void process(); +} +""" + class ScanPipeJvmTest(TestCase): data = Path(__file__).parent.parent / "data" @@ -114,6 +156,53 @@ def test_scanpipe_pipes_jvm_get_fully_qualified_java_path(self): fqjp = jvm.get_fully_qualified_path("org.common", "Bar.java") self.assertEqual("org/common/Bar.java", fqjp) + def test_scanpipe_pipes_jvm_find_java_package_with_different_class_name(self): + """Test that find_source_package extracts class names differing from filename.""" + result = jvm.JavaLanguage.find_source_package( + java_code_different_class_name.splitlines() + ) + self.assertEqual("lombok.delombok.ant", result["java_package"]) + # The class name "Tasks" differs from what would be the filename + self.assertIn("Tasks", result["java_classes"]) + # Also check for inner classes + self.assertIn("Delombok", result["java_classes"]) + self.assertIn("Format", result["java_classes"]) + + def test_scanpipe_pipes_jvm_find_java_package_with_multiple_types(self): + """Test that find_source_package extracts all class/interface/enum names.""" + result = jvm.JavaLanguage.find_source_package( + java_code_multiple_types.splitlines() + ) + self.assertEqual("com.example", result["java_package"]) + classes = result["java_classes"] + self.assertIn("MainClass", classes) + self.assertIn("SomeInterface", classes) + self.assertIn("Status", classes) + self.assertIn("AbstractBase", classes) + + def test_scanpipe_pipes_jvm_get_indexable_qualified_paths_with_class_names(self): + """Test get_indexable_qualified_paths_from_values yields class name paths.""" + resource_values = [ + ( + 1, + "DelombokTask.java", + { + "java_package": "lombok.delombok.ant", + "java_classes": ["Tasks", "Delombok"], + }, + ), + ] + paths = list( + jvm.JavaLanguage.get_indexable_qualified_paths_from_values(resource_values) + ) + # Should yield: filename path, and paths for each class name that differs + self.assertEqual(3, len(paths)) + # First is the original filename-based path + self.assertEqual((1, "lombok/delombok/ant/DelombokTask.java"), paths[0]) + # Then paths for class names that differ from filename + self.assertEqual((1, "lombok/delombok/ant/Tasks.java"), paths[1]) + self.assertEqual((1, "lombok/delombok/ant/Delombok.java"), paths[2]) + class ScanPipeJvmScalaTest(TestCase): data = Path(__file__).parent.parent / "data"