From a4988f55853b66fbb244df3b4e6febf651321aa4 Mon Sep 17 00:00:00 2001 From: Prem Date: Tue, 13 Jan 2026 22:10:06 +0530 Subject: [PATCH] Created importers for detailed scan results Signed-off-by: Prem --- scanpipe/pipelines/load_inventory.py | 35 +++- scanpipe/pipes/input.py | 198 ++++++++++++++++++ .../data/integrations/matchcode_import.json | 40 ++++ .../data/integrations/purldb_import.json | 31 +++ .../integrations/vulnerablecode_import.json | 27 +++ scanpipe/tests/pipes/test_input.py | 108 ++++++++++ 6 files changed, 438 insertions(+), 1 deletion(-) create mode 100644 scanpipe/tests/data/integrations/matchcode_import.json create mode 100644 scanpipe/tests/data/integrations/purldb_import.json create mode 100644 scanpipe/tests/data/integrations/vulnerablecode_import.json diff --git a/scanpipe/pipelines/load_inventory.py b/scanpipe/pipelines/load_inventory.py index bcc10d61ef..4cec036ad2 100644 --- a/scanpipe/pipelines/load_inventory.py +++ b/scanpipe/pipelines/load_inventory.py @@ -33,6 +33,11 @@ class LoadInventory(Pipeline): Supported format are ScanCode-toolkit JSON scan results, ScanCode.io JSON output, and ScanCode.io XLSX output. + Additionally supports importing scan results from integrated tools: + - VulnerableCode: Vulnerability data export + - PurlDB: Package enrichment data export + - MatchCode.io: Matching results export + An inventory is composed of packages, dependencies, resources, and relations. 
""" @@ -78,4 +83,32 @@ def build_inventory_from_scans(self): ) else: - raise Exception(f"Input not supported: {str(input_path)} ") + integrated_tool = input.get_integrated_tool_name(scan_data) + + if integrated_tool == "vulnerablecode": + updated_count = input.load_vulnerabilities_from_vulnerablecode( + self.project, scan_data + ) + self.log( + f"Loaded vulnerability data for {updated_count} packages " + f"from {input_path.name}" + ) + + elif integrated_tool == "purldb": + result = input.load_enrichment_from_purldb(self.project, scan_data) + self.log( + f"PurlDB import: {result['created']} packages created, " + f"{result['updated']} packages updated from {input_path.name}" + ) + + elif integrated_tool == "matchcodeio": + created_count = input.load_matches_from_matchcode( + self.project, scan_data + ) + self.log( + f"MatchCode.io import: {created_count} packages created " + f"from {input_path.name}" + ) + + else: + raise Exception(f"Input not supported: {str(input_path)} ") diff --git a/scanpipe/pipes/input.py b/scanpipe/pipes/input.py index 58ec2e5c96..4b5fb02e0d 100644 --- a/scanpipe/pipes/input.py +++ b/scanpipe/pipes/input.py @@ -237,3 +237,201 @@ def load_inventory_from_xlsx(project, input_location, extra_data_prefix=None): if extra_data_prefix: extra_data = {extra_data_prefix: extra_data} project.update_extra_data(extra_data) + + +def get_integrated_tool_name(scan_data): + """ + Detect and return the integrated tool name from the ``scan_data`` structure. + + Supported tools: + - vulnerablecode: VulnerableCode vulnerability data export + - purldb: PurlDB package enrichment data export + - matchcodeio: MatchCode.io matching results export + + Returns None if the tool cannot be identified. 
+ """ + if "vulnerabilities" in scan_data or ( + isinstance(scan_data, list) + and scan_data + and "affected_by_vulnerabilities" in scan_data[0] + ): + return "vulnerablecode" + + if "files" in scan_data and "packages" in scan_data: + files = scan_data.get("files", []) + if files and any("for_packages" in f for f in files if isinstance(f, dict)): + for file_data in files: + if isinstance(file_data, dict): + extra_data = file_data.get("extra_data", {}) + if any( + key in extra_data + for key in ["matched_to", "path_score", "matched_fingerprints"] + ): + return "matchcodeio" + + if "packages" in scan_data or ( + isinstance(scan_data, list) + and scan_data + and isinstance(scan_data[0], dict) + and "purl" in scan_data[0] + and any( + key in scan_data[0] + for key in ["repository_homepage_url", "api_data_url", "package_content"] + ) + ): + return "purldb" + + return None + + +def load_vulnerabilities_from_vulnerablecode(project, scan_data): + """ + Load vulnerability data from VulnerableCode export and update project packages. + + The ``scan_data`` should contain vulnerability information that can be matched + to existing packages in the project by their PURL. 
def load_vulnerabilities_from_vulnerablecode(project, scan_data):
    """
    Load vulnerability data from a VulnerableCode export and update the
    matching packages of the ``project``.

    Entries are matched to the existing project packages by their PURL.
    Packages are only updated, never created: entries whose purl is unknown to
    the project are ignored, as are entries without a "purl" or with an empty
    "affected_by_vulnerabilities" value.

    Expected format:
    - List of package dicts with 'purl' and 'affected_by_vulnerabilities' keys
    - Or dict holding that list under a 'packages' or 'results' key

    NOTE(review): a dict that only has a 'vulnerabilities' key is not consumed
    here and results in no update; confirm whether that shape needs support.

    Return the number of updated packages.
    """
    if isinstance(scan_data, list):
        records = scan_data
    elif isinstance(scan_data, dict):
        records_key = "packages" if "packages" in scan_data else "results"
        records = scan_data.get(records_key) or []
    else:
        records = []

    # Index the export by purl. When a purl is listed more than once the last
    # entry wins, and the package is still only counted once.
    vulnerabilities_by_purl = {}
    for record in records:
        if not isinstance(record, dict):
            continue
        purl = record.get("purl")
        affected_by = record.get("affected_by_vulnerabilities")
        if purl and affected_by:
            vulnerabilities_by_purl[purl] = affected_by

    if not vulnerabilities_by_purl:
        return 0

    # Walk the packages rather than indexing them by purl: several packages of
    # a project may share the same purl and all of them need the update.
    updated_packages = []
    for package in project.discoveredpackages.all():
        affected_by = vulnerabilities_by_purl.get(package.package_url)
        if affected_by:
            package.affected_by_vulnerabilities = affected_by
            updated_packages.append(package)

    if updated_packages:
        DiscoveredPackage.objects.bulk_update(
            objs=updated_packages,
            fields=["affected_by_vulnerabilities"],
            batch_size=1000,
        )

    return len(updated_packages)
def load_enrichment_from_purldb(project, scan_data):
    """
    Load package enrichment data from a PurlDB export and update or create the
    packages of the ``project``.

    For each entry with a "purl":
    - when a project package has that purl, it is updated from the entry and
      the names of the changed fields are stored in its extra data under the
      "enriched_from_purldb" key
    - otherwise a new package is created from the entry

    Expected format:
    - List of package dicts with package data fields
    - Or dict holding that list under a 'packages' or 'results' key

    Return a dict with the "created" and "updated" package counts. A package
    is only counted as updated when at least one field changed, and only
    counted as created when a package was actually returned.
    """
    if isinstance(scan_data, list):
        package_data_list = scan_data
    elif isinstance(scan_data, dict):
        records_key = "packages" if "packages" in scan_data else "results"
        package_data_list = scan_data.get(records_key) or []
    else:
        package_data_list = []

    created_count = 0
    updated_count = 0
    if not package_data_list:
        return {"created": created_count, "updated": updated_count}

    # Index the project packages once instead of issuing one query per entry.
    # The lookup goes through the ``package_url`` attribute, the same way the
    # VulnerableCode loader of this module matches packages.
    # The first package found for a given purl is the one that gets enriched.
    packages_by_purl = {}
    for package in project.discoveredpackages.all():
        if package.package_url:
            packages_by_purl.setdefault(package.package_url, package)

    for package_data in package_data_list:
        if not isinstance(package_data, dict):
            continue

        purl = package_data.get("purl")
        if not purl:
            continue

        existing_package = packages_by_purl.get(purl)
        if existing_package:
            updated_fields = existing_package.update_from_data(package_data)
            if updated_fields:
                existing_package.update_extra_data(
                    {"enriched_from_purldb": updated_fields}
                )
                updated_count += 1
            continue

        package = pipes.update_or_create_package(project, package_data)
        if package:
            created_count += 1
            # A purl repeated later in the export enriches this new package.
            packages_by_purl.setdefault(purl, package)

    return {"created": created_count, "updated": updated_count}
def load_matches_from_matchcode(project, scan_data):
    """
    Load matching results from a MatchCode.io export: create or update the
    matched packages, associate them with the project resources, and copy the
    match details to those resources.

    Expected format:
    - Dict with 'files' and 'packages' keys
    - 'files' contains resource data with a 'path', the 'for_packages'
      associations as a list of package uids, and optional 'extra_data'
    - 'packages' contains the matched package data, each with a 'package_uid'

    Packages without a 'package_uid' are skipped. Only the resources that
    already exist in the ``project`` are associated, none is created here.

    Return the number of packages created in the project.
    """
    from collections import defaultdict

    files_data = scan_data.get("files") or []
    packages_data = scan_data.get("packages") or []

    # Single pass over the files to build both lookups, so the package loop
    # below does not have to rescan the whole file list for each package.
    resource_paths_by_package_uid = defaultdict(list)
    extra_data_by_path = defaultdict(dict)
    for file_data in files_data:
        if not isinstance(file_data, dict):
            continue
        file_path = file_data.get("path")
        if not file_path:
            continue
        for package_uid in file_data.get("for_packages") or []:
            resource_paths_by_package_uid[package_uid].append(file_path)
        extra_data = file_data.get("extra_data")
        if isinstance(extra_data, dict):
            extra_data_by_path[file_path].update(extra_data)

    created_packages = 0
    enriched_paths = set()
    package_count = project.discoveredpackages.count()

    for package_data in packages_data:
        if not isinstance(package_data, dict):
            continue
        package_uid = package_data.get("package_uid")
        if not package_uid:
            continue

        # Treated as returning the package itself, as the PurlDB loader of
        # this module does, not a ``(package, created)`` pair.
        package = pipes.update_or_create_package(project, package_data)

        # A creation is detected from the package count of the project since
        # the return value does not tell creations and updates apart.
        new_package_count = project.discoveredpackages.count()
        if new_package_count > package_count:
            created_packages += 1
        package_count = new_package_count

        resource_paths = resource_paths_by_package_uid.get(package_uid)
        if not resource_paths:
            continue

        resources = project.codebaseresources.filter(path__in=resource_paths)
        if package and resources.exists():
            package.add_resources(resources)

        for resource in resources:
            resource_path = resource.path
            extra_data = extra_data_by_path.get(resource_path)
            # A file matched to several packages only needs one update.
            if extra_data and resource_path not in enriched_paths:
                resource.update_extra_data(extra_data)
                enriched_paths.add(resource_path)

    return created_packages
"pkg:npm/lodash@4.17.21?uuid=test-uuid-1234" + ], + "extra_data": { + "matched_to": "lodash", + "path_score": 95 + } + } + ], + "packages": [ + { + "purl": "pkg:npm/lodash@4.17.21", + "package_uid": "pkg:npm/lodash@4.17.21?uuid=test-uuid-1234", + "type": "npm", + "name": "lodash", + "version": "4.17.21", + "description": "Lodash modular utilities", + "declared_license_expression": "mit" + } + ] +} \ No newline at end of file diff --git a/scanpipe/tests/data/integrations/purldb_import.json b/scanpipe/tests/data/integrations/purldb_import.json new file mode 100644 index 0000000000..6eb3a58712 --- /dev/null +++ b/scanpipe/tests/data/integrations/purldb_import.json @@ -0,0 +1,31 @@ +{ + "packages": [ + { + "purl": "pkg:npm/lodash@4.17.21", + "type": "npm", + "namespace": "", + "name": "lodash", + "version": "4.17.21", + "description": "Lodash modular utilities", + "homepage_url": "https://lodash.com/", + "download_url": "https://registry.npmjs.com/lodash/-/lodash-4.17.21.tgz", + "repository_homepage_url": "https://www.npmjs.com/package/lodash", + "declared_license_expression": "mit", + "declared_license_expression_spdx": "MIT", + "copyright": "Copyright OpenJS Foundation and other contributors", + "primary_language": "JavaScript" + }, + { + "purl": "pkg:pypi/requests@2.28.0", + "type": "pypi", + "namespace": "", + "name": "requests", + "version": "2.28.0", + "description": "Python HTTP for Humans", + "homepage_url": "https://requests.readthedocs.io", + "repository_homepage_url": "https://pypi.org/project/requests/", + "declared_license_expression": "apache-2.0", + "declared_license_expression_spdx": "Apache-2.0" + } + ] +} \ No newline at end of file diff --git a/scanpipe/tests/data/integrations/vulnerablecode_import.json b/scanpipe/tests/data/integrations/vulnerablecode_import.json new file mode 100644 index 0000000000..34dbded53e --- /dev/null +++ b/scanpipe/tests/data/integrations/vulnerablecode_import.json @@ -0,0 +1,27 @@ +[ + { + "purl": "pkg:pypi/django@5.0", + 
"affected_by_vulnerabilities": [ + { + "vulnerability_id": "VCID-3gge-bre2-aaac", + "summary": "CVE-2024-24680 vulnerability", + "aliases": [ + "CVE-2024-24680", + "GHSA-xxj9-f6rv-m3x4" + ] + } + ] + }, + { + "purl": "pkg:pypi/requests@2.28.0", + "affected_by_vulnerabilities": [ + { + "vulnerability_id": "VCID-test-vuln-aaaa", + "summary": "Test vulnerability", + "aliases": [ + "CVE-2023-12345" + ] + } + ] + } +] \ No newline at end of file diff --git a/scanpipe/tests/pipes/test_input.py b/scanpipe/tests/pipes/test_input.py index 0717fd7c8b..139f291244 100644 --- a/scanpipe/tests/pipes/test_input.py +++ b/scanpipe/tests/pipes/test_input.py @@ -233,3 +233,111 @@ def test_scanpipe_pipes_input_clean_xlsx_data_to_model_data_dependency(self): "datasource_id": "pypi_wheel_metadata", } self.assertEqual(expected, results) + + def test_scanpipe_pipes_input_get_integrated_tool_name(self): + """Test detection of integrated tool name from scan data structure.""" + vuln_data_list = [ + {"purl": "pkg:pypi/django@5.0", "affected_by_vulnerabilities": []} + ] + self.assertEqual("vulnerablecode", input.get_integrated_tool_name(vuln_data_list)) + + vuln_data_dict = {"vulnerabilities": [{"id": "test"}]} + self.assertEqual("vulnerablecode", input.get_integrated_tool_name(vuln_data_dict)) + + purldb_data = { + "packages": [ + { + "purl": "pkg:npm/lodash@4.17.21", + "repository_homepage_url": "https://npmjs.com", + } + ] + } + self.assertEqual("purldb", input.get_integrated_tool_name(purldb_data)) + + matchcode_data = { + "files": [ + { + "path": "test.js", + "for_packages": ["pkg:npm/test@1.0"], + "extra_data": {"matched_to": "test", "path_score": 100}, + } + ], + "packages": [{"purl": "pkg:npm/test@1.0"}], + } + self.assertEqual("matchcodeio", input.get_integrated_tool_name(matchcode_data)) + + unknown_data = {"random_key": "value"} + self.assertIsNone(input.get_integrated_tool_name(unknown_data)) + + def test_scanpipe_pipes_input_load_vulnerabilities_from_vulnerablecode(self): + 
"""Test loading vulnerability data from VulnerableCode export.""" + from scanpipe.models import DiscoveredPackage + + project = Project.objects.create(name="vuln_test") + + DiscoveredPackage.objects.create( + project=project, + type="pypi", + name="django", + version="5.0", + ) + + input_location = self.data / "integrations" / "vulnerablecode_import.json" + scan_data = json.loads(input_location.read_text()) + updated_count = input.load_vulnerabilities_from_vulnerablecode(project, scan_data) + + self.assertEqual(1, updated_count) + + package = project.discoveredpackages.get(name="django") + self.assertIsNotNone(package.affected_by_vulnerabilities) + self.assertEqual(1, len(package.affected_by_vulnerabilities)) + self.assertEqual( + "VCID-3gge-bre2-aaac", + package.affected_by_vulnerabilities[0]["vulnerability_id"], + ) + + def test_scanpipe_pipes_input_load_enrichment_from_purldb(self): + """Test loading package enrichment data from PurlDB export.""" + project = Project.objects.create(name="purldb_test") + + input_location = self.data / "integrations" / "purldb_import.json" + scan_data = json.loads(input_location.read_text()) + result = input.load_enrichment_from_purldb(project, scan_data) + + self.assertEqual(2, result["created"]) + self.assertEqual(0, result["updated"]) + self.assertEqual(2, project.discoveredpackages.count()) + + lodash = project.discoveredpackages.get(name="lodash") + self.assertEqual("pkg:npm/lodash@4.17.21", lodash.package_url) + self.assertEqual("mit", lodash.declared_license_expression) + + requests_pkg = project.discoveredpackages.get(name="requests") + self.assertEqual("pkg:pypi/requests@2.28.0", requests_pkg.package_url) + self.assertEqual("apache-2.0", requests_pkg.declared_license_expression) + + def test_scanpipe_pipes_input_load_matches_from_matchcode(self): + """Test loading matching results from MatchCode.io export.""" + project = Project.objects.create(name="matchcode_test") + + CodebaseResource.objects.create( + project=project, + 
path="src/utils.js", + type=CodebaseResource.Type.FILE, + ) + CodebaseResource.objects.create( + project=project, + path="src/helper.js", + type=CodebaseResource.Type.FILE, + ) + + input_location = self.data / "integrations" / "matchcode_import.json" + scan_data = json.loads(input_location.read_text()) + created_count = input.load_matches_from_matchcode(project, scan_data) + + self.assertEqual(1, created_count) + self.assertEqual(1, project.discoveredpackages.count()) + + package = project.discoveredpackages.first() + self.assertEqual("lodash", package.name) + self.assertEqual("4.17.21", package.version)