Skip to content

Commit 6871324

Browse files
authored
Merge pull request #2163 from ziadhany/collect-package-patch
Add support for Reference Fix Commits improver
2 parents 818b92b + bab0d75 commit 6871324

3 files changed

Lines changed: 229 additions & 0 deletions

File tree

vulnerabilities/improvers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
)
3232
from vulnerabilities.pipelines.v2_improvers import flag_ghost_packages as flag_ghost_packages_v2
3333
from vulnerabilities.pipelines.v2_improvers import group_advisories_for_packages
34+
from vulnerabilities.pipelines.v2_improvers import reference_collect_commits
3435
from vulnerabilities.pipelines.v2_improvers import relate_severities
3536
from vulnerabilities.pipelines.v2_improvers import unfurl_version_range as unfurl_version_range_v2
3637
from vulnerabilities.utils import create_registry
@@ -73,5 +74,6 @@
7374
relate_severities.RelateSeveritiesPipeline,
7475
group_advisories_for_packages.GroupAdvisoriesForPackages,
7576
compute_advisory_todo_v2.ComputeToDo,
77+
reference_collect_commits.CollectReferencesFixCommitsPipeline,
7678
]
7779
)
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
from collections import defaultdict
10+
11+
from aboutcode.pipeline import LoopProgress
12+
from django.db.models import Prefetch
13+
from packageurl.contrib.purl2url import purl2url
14+
from packageurl.contrib.url2purl import url2purl
15+
16+
from aboutcode.federated import get_core_purl
17+
from vulnerabilities.models import AdvisoryReference
18+
from vulnerabilities.models import AdvisoryV2
19+
from vulnerabilities.models import ImpactedPackage
20+
from vulnerabilities.models import PackageCommitPatch
21+
from vulnerabilities.models import Patch
22+
from vulnerabilities.pipelines import VulnerableCodePipeline
23+
from vulnerabilities.utils import is_commit
24+
25+
26+
class CollectReferencesFixCommitsPipeline(VulnerableCodePipeline):
    """
    Improver pipeline to scout References/Patch and create PackageCommitPatch entries.

    Every advisory reference URL and patch URL that resolves to a commit is
    stored as a PackageCommitPatch and linked, through the ``fixed_in_impacts``
    relation, to an ImpactedPackage of the same advisory and base purl.
    """

    pipeline_id = "collect_ref_fix_commits_v2"

    # Single source of truth for batching: iterator chunk size, number of
    # collected rows per bulk write, and bulk_create batch size.
    BATCH_SIZE = 10000

    @classmethod
    def steps(cls):
        """Return the tuple of steps executed by this pipeline."""
        return (cls.collect_and_store_fix_commits,)

    def get_vcs_data(self, url):
        """
        Return a ``(base_purl, vcs_url, commit_hash)`` tuple extracted from
        ``url``, or None when no commit can be derived from it.

        ``base_purl`` is the value returned by ``get_core_purl`` (callers
        convert it with ``str()``), ``vcs_url`` is what ``purl2url`` returns for
        that base purl, and ``commit_hash`` is the purl version.

        For example, the URL
        'https://github.com/aboutcode-org/vulnerablecode/commit/98e516011d6e096e25247b82fc5f196bbeecff10'
        is expected to yield the base purl "pkg:github/aboutcode-org/vulnerablecode",
        the VCS URL 'https://github.com/aboutcode-org/vulnerablecode' and the
        commit '98e516011d6e096e25247b82fc5f196bbeecff10', while
        'https://github.com/aboutcode-org/vulnerablecode/pull/1974' yields None.

        Any exception raised while parsing is logged and None is returned, so
        a single bad URL never aborts the pipeline.
        """
        if not url:
            # NOTE(review): patch_url/url may presumably be empty; nothing to parse.
            return None

        try:
            purl = url2purl(url)
            if not purl:
                return None

            version = purl.version
            if not version or not is_commit(version):
                return None

            base_purl = get_core_purl(purl)
            vcs_url = purl2url(base_purl.to_string())
            # ``version`` is already known to be a commit at this point.
            if base_purl and vcs_url:
                return base_purl, vcs_url, version
        except Exception as e:
            # Deliberately broad: this is a best-effort scan over arbitrary URLs.
            self.log(f"Invalid URL: url:{url} error:{e}")
        return None

    def collect_and_store_fix_commits(self):
        """
        Scan the reference and patch URLs of all advisories and store the fix
        commits found, flushing to the database in batches.
        """
        advisories = AdvisoryV2.objects.only("id").prefetch_related(
            Prefetch("references", queryset=AdvisoryReference.objects.only("id", "url")),
            Prefetch("patches", queryset=Patch.objects.only("id", "patch_url")),
        )

        progress = LoopProgress(total_iterations=advisories.count(), logger=self.log)

        batch_size = self.BATCH_SIZE
        # Rows of (base_purl as str, vcs_url, commit_hash, advisory id).
        commit_batch = []
        updated_pkg_patch_commit_count = 0
        for adv in progress.iter(advisories.iterator(chunk_size=batch_size)):
            # A set, so a URL listed both as reference and patch is parsed once.
            urls = {r.url for r in adv.references.all()} | {p.patch_url for p in adv.patches.all()}

            for url in urls:
                vcs_data = self.get_vcs_data(url)
                if not vcs_data:
                    continue
                base_purl, vcs_url, commit_hash = vcs_data
                commit_batch.append((str(base_purl), vcs_url, commit_hash, adv.id))

            if len(commit_batch) >= batch_size:
                updated_pkg_patch_commit_count += self.bulk_commit_batch_update(commit_batch)
                commit_batch.clear()

        # Flush whatever is left after the last full batch.
        if commit_batch:
            updated_pkg_patch_commit_count += self.bulk_commit_batch_update(commit_batch)
            commit_batch.clear()

        self.log(f"Successfully processed pkg patch commit {updated_pkg_patch_commit_count:,d}")

    def _impacts_by_key(self, adv_ids):
        """Return ImpactedPackage rows of ``adv_ids`` keyed by (base_purl, advisory_id)."""
        # NOTE(review): if several rows share one (base_purl, advisory_id) key,
        # only the last one iterated is kept -- confirm this is intended.
        impacts = ImpactedPackage.objects.filter(advisory_id__in=adv_ids).only(
            "id", "base_purl", "advisory_id"
        )
        return {(impact.base_purl, impact.advisory_id): impact for impact in impacts}

    def bulk_commit_batch_update(self, vcs_data_table):
        """
        Persist a batch of ``(base_purl, vcs_url, commit_hash, adv_id)`` rows
        and return the number of rows received.

        Missing ImpactedPackage and PackageCommitPatch rows are created, then
        each commit is related to its impacted package. All inserts use
        ``ignore_conflicts=True`` so re-running the pipeline is harmless.
        """
        if not vcs_data_table:
            return 0

        impact_data = set()  # (base_purl, adv_id)
        commit_data = set()  # (vcs_url, commit_hash)
        for base_purl, vcs_url, commit_hash, adv_id in vcs_data_table:
            impact_data.add((base_purl, adv_id))
            commit_data.add((vcs_url, commit_hash))

        adv_ids = {adv_id for _, adv_id in impact_data}
        commit_hashes = {commit_hash for _, commit_hash in commit_data}

        fetched_impacts = self._impacts_by_key(adv_ids)
        if new_impacts := impact_data.difference(fetched_impacts):
            ImpactedPackage.objects.bulk_create(
                [
                    ImpactedPackage(base_purl=base_purl, advisory_id=adv_id)
                    for base_purl, adv_id in new_impacts
                ],
                ignore_conflicts=True,
            )
            # Primary keys are not set on objects created with
            # ignore_conflicts, so reload; skipped when nothing was inserted.
            fetched_impacts = self._impacts_by_key(adv_ids)

        PackageCommitPatch.objects.bulk_create(
            [
                PackageCommitPatch(vcs_url=vcs_url, commit_hash=commit_hash)
                for vcs_url, commit_hash in commit_data
            ],
            ignore_conflicts=True,
        )

        # Filtering on the hash alone may return the same hash for another
        # repository; keying on (vcs_url, commit_hash) keeps only exact matches.
        fetched_pkg_commits = {
            (pkg_commit_patch.vcs_url, pkg_commit_patch.commit_hash): pkg_commit_patch
            for pkg_commit_patch in PackageCommitPatch.objects.filter(
                commit_hash__in=commit_hashes
            ).only("id", "vcs_url", "commit_hash")
        }

        through_model = PackageCommitPatch.fixed_in_impacts.through

        # Collect id pairs in a set so repeated input rows yield one relation.
        relation_ids = set()
        for base_purl, vcs_url, commit_hash, adv_id in vcs_data_table:
            impacted_pkg_obj = fetched_impacts.get((base_purl, adv_id))
            pkg_commit_obj = fetched_pkg_commits.get((vcs_url, commit_hash))

            if impacted_pkg_obj and pkg_commit_obj:
                relation_ids.add((pkg_commit_obj.id, impacted_pkg_obj.id))

        through_model.objects.bulk_create(
            [
                through_model(
                    packagecommitpatch_id=pkg_commit_id,
                    impactedpackage_id=impacted_pkg_id,
                )
                for pkg_commit_id, impacted_pkg_id in relation_ids
            ],
            ignore_conflicts=True,
            batch_size=self.BATCH_SIZE,
        )
        return len(vcs_data_table)
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
9+
from datetime import datetime
10+
11+
import pytest
12+
13+
from vulnerabilities.models import AdvisoryReference
14+
from vulnerabilities.models import AdvisoryV2
15+
from vulnerabilities.models import ImpactedPackage
16+
from vulnerabilities.models import PackageCommitPatch
17+
from vulnerabilities.models import PackageV2
18+
from vulnerabilities.pipelines.v2_improvers.reference_collect_commits import (
19+
CollectReferencesFixCommitsPipeline,
20+
)
21+
22+
23+
@pytest.mark.django_db
def test_collect_fix_commits_pipeline_creates_entry():
    """A commit URL reference produces one commit patch linked to one impacted package."""
    adv = AdvisoryV2.objects.create(
        advisory_id="CVE-2025-1000",
        datasource_id="test-ds",
        avid="test-ds/CVE-2025-1000",
        url="https://example.com/advisory/CVE-2025-1000",
        unique_content_id="11111",
        date_collected=datetime.now(),
    )
    commit_reference = AdvisoryReference.objects.create(
        url="https://github.com/test/testpkg/commit/6bd301819f8f69331a55ae2336c8b111fc933f3d"
    )
    adv.references.add(commit_reference)

    # Run the single pipeline step directly.
    CollectReferencesFixCommitsPipeline().collect_and_store_fix_commits()

    commit_patches = PackageCommitPatch.objects.all()
    impacts = adv.impacted_packages.all()

    assert commit_patches.count() == 1
    assert impacts.count() == 1

    created_patch = commit_patches.first()
    assert created_patch.commit_hash == "6bd301819f8f69331a55ae2336c8b111fc933f3d"
    assert created_patch.vcs_url == "https://github.com/test/testpkg"

    # The commit patch must be attached to the impacted package.
    linked_impact = impacts.first()
    assert linked_impact.fixed_by_package_commit_patches.count() == 1
52+
53+
54+
@pytest.mark.django_db
def test_collect_fix_commits_pipeline_skips_non_commit_urls():
    """A reference that is not a commit URL must not create any commit patch."""
    adv = AdvisoryV2.objects.create(
        advisory_id="CVE-2025-2000",
        datasource_id="test-ds",
        avid="test-ds/CVE-2025-2000",
        url="https://example.com/advisory/CVE-2025-2000",
        unique_content_id="11111",
        date_collected=datetime.now(),
    )
    # An issue URL carries no commit hash, so the pipeline should ignore it.
    issue_reference = AdvisoryReference.objects.create(
        url="https://github.com/test/testpkg/issues/12"
    )
    adv.references.add(issue_reference)

    CollectReferencesFixCommitsPipeline().collect_and_store_fix_commits()

    assert PackageCommitPatch.objects.count() == 0

0 commit comments

Comments
 (0)