aboutcode-org · Kaushik-Kumar-CEG · Jun 5, 2026
diff --git a/etc/scripts/dataset_pipeline/annotate_composites.py b/etc/scripts/dataset_pipeline/annotate_composites.py
@@ -0,0 +1,164 @@
+# Annotates composite (AND/OR) license rules with {{ }} required phrase markers.
+# Uses scancode's license index to find the license names in each rule text.
+import re
+from pathlib import Path
+import click
+from license_expression import Licensing
+from licensedcode.models import Rule, load_licenses
+from licensedcode.models import rules_data_dir as default_rules_data_dir
+from licensedcode.required_phrases import add_required_phrase_to_rule
+
+# matches an existing required phrase marker: "{{", any run of characters
+# other than "}" (possibly empty, newlines included), then "}}"
+MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
+# matches a version at the very end of a name, case-insensitively: whitespace,
+# an optional "v", a digit followed by digits/dots, then an optional "only",
+# "or later" / "or-later" or "+" qualifier
+VERSION_SUFFIX_RE = re.compile(r'\s+v?\d[\d.]*(?:\s*(?:only|or[ -]later|\+))?$', re.IGNORECASE)
+
+# extra short forms used in rule text that the license index doesn't have,
+# keyed by license key; each value lists the alternative spellings to search for
+EXTRA_NAMES = {
+    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2'],
+    'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
+    'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
+    'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
+    'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
+    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+'],
+    'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
+    'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
+    'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
+    'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
+    'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
+    'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
+    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause'],
+    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause'],
+    'mit': ['MIT License', 'MIT license', 'MIT'],
+    'isc': ['ISC License', 'ISC license', 'ISC'],
+    'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
+    'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
+    'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
+    'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
+    'unlicense': ['Unlicense'],
+}
+
+
+def strip_version_suffix(name):
+    """
+    Return ``name`` with its trailing version suffix removed, or None.
+
+    None is returned when the remaining text is shorter than 10 characters
+    or when it is identical to ``name``.
+    """
+    base = VERSION_SUFFIX_RE.sub('', name).strip()
+    is_usable = len(base) >= 10 and base != name
+    return base if is_usable else None
+
+
+def get_candidate_names(lic):
+    """
+    Return the names to search for in a rule text for ``lic``, longest first.
+
+    The candidates are the license name, that name without its version
+    suffix, the short name, the SPDX key, the license key and any
+    EXTRA_NAMES entries for the key. Names of equal length keep that order.
+    """
+    candidates = []
+
+    def add_unique(value):
+        # keep only the first occurrence of each name
+        if value not in candidates:
+            candidates.append(value)
+
+    full_name = lic.name
+    if full_name:
+        candidates.append(full_name)
+        without_version = strip_version_suffix(full_name)
+        if without_version:
+            candidates.append(without_version)
+
+    # short name and SPDX key are optional: skip them when empty
+    for optional_name in (lic.short_name, lic.spdx_license_key):
+        if optional_name:
+            add_unique(optional_name)
+
+    add_unique(lic.key)
+    for extra in EXTRA_NAMES.get(lic.key, []):
+        add_unique(extra)
+
+    return sorted(candidates, key=len, reverse=True)
+
+
+def find_in_text(text, candidates):
+    """
+    Return the first candidate found in ``text`` as a whole phrase, or None.
+
+    ``candidates`` are tried in the given order and matched case-insensitively.
+    Empty candidates and candidates shorter than 3 characters are ignored.
+    The returned string is the matched span of ``text`` in its original case.
+
+    A candidate only matches when it is not glued to a word character on
+    either side, so that a short name such as "ISC" or "MIT" is not found
+    inside an unrelated word like "DISCLAIMER" or "permitted".
+    """
+    for name in candidates:
+        if not name or len(name) < 3:
+            continue
+        # use lookarounds rather than \b so that names starting or ending with
+        # a non-word character (such as "GPL-2.0+") can still match
+        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
+        # search the original text directly: lowercasing a copy can change its
+        # length for some characters and misalign the returned span
+        found = re.search(pattern, text, re.IGNORECASE)
+        if found:
+            return found.group(0)
+    return None
+
+
+# Command line entry point. For each composite rule file (a *.RULE file with
+# "_or_" or "_and_" in its name) that has no {{ }} marker yet, look up the name
+# of every license of the rule expression in the rule text and, only when all
+# of them are found, add each found name as a required phrase.
+#
+# rules_dir: directory holding the *.RULE files; when not provided, a default
+#   location is used (see below).
+# expression_filter: when provided, only rules whose license expression
+#   contains this substring are processed.
+# limit: when provided and not zero, stop once this many rules were processed.
+# dry_run: passed through to add_required_phrase_to_rule; also prints the
+#   phrases found for each annotated rule.
+@click.command()
+@click.option('--rules-dir', type=click.Path(exists=True), default=None)
+@click.option('--expression-filter', default=None,
+              help='only process rules containing this in their expression')
+@click.option('--limit', type=int, default=None)
+@click.option('--dry-run', is_flag=True)
+def main(rules_dir, expression_filter, limit, dry_run):
+    """annotate rules with required phrase markers"""
+    if not rules_dir:
+        # this script lives in etc/scripts/dataset_pipeline/ so parents[3] is the
+        # root of the checkout; fall back to the licensedcode rules directory
+        # when that checkout has no rules
+        repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
+        rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir
+
+    rules_path = Path(rules_dir)
+    licenses_db = load_licenses()
+    licensing = Licensing(list(licenses_db.values()))
+
+    processed = 0
+    annotated = 0
+
+    for rf in sorted(rules_path.glob('*.RULE')):
+        if limit and processed >= limit:
+            break
+        # only composite rules are of interest
+        stem = rf.stem
+        if '_or_' not in stem and '_and_' not in stem:
+            continue
+        try:
+            rule = Rule.from_file(rule_file=str(rf))
+        except Exception as e:
+            # best effort: skip a rule that cannot be loaded, but report it
+            # rather than dropping it silently
+            click.echo(f'  skipping {rf.name}: cannot load rule: {e!r}', err=True)
+            continue
+        if not rule.license_expression:
+            continue
+        if getattr(rule, 'is_required_phrase', False):
+            continue
+        text = rule.text or ''
+        # leave alone any rule that already has a {{ }} marker
+        if MARKER_RE.search(text):
+            continue
+        if expression_filter and expression_filter not in rule.license_expression:
+            continue
+
+        processed += 1
+        try:
+            keys = licensing.license_keys(rule.license_expression, unique=True)
+        except Exception as e:
+            # best effort: skip a rule whose expression cannot be parsed, but
+            # report it rather than dropping it silently
+            click.echo(f'  skipping {rf.name}: cannot parse expression: {e!r}', err=True)
+            continue
+        if not keys:
+            continue
+
+        # find each license name in the text
+        phrases = []
+        found_all = True
+        for key in keys:
+            lic = licenses_db.get(key)
+            if not lic:
+                found_all = False
+                break
+            match = find_in_text(text, get_candidate_names(lic))
+            if not match:
+                found_all = False
+                break
+            phrases.append(match)
+
+        # annotate only when every license of the expression was found
+        if not found_all:
+            continue
+
+        added = False
+        for phrase in phrases:
+            if add_required_phrase_to_rule(rule=rule, required_phrase=phrase,
+                                          source='composite_annotation', dry_run=dry_run):
+                added = True
+        if added:
+            annotated += 1
+            if dry_run:
+                click.echo(f'  {rule.identifier}: {phrases}')
+
+    click.echo(f'\ndone - {annotated}/{processed} annotated')
+
+
+# run the click command when this file is executed as a script
+if __name__ == '__main__':
+    main()
+
+# example commands (paths are relative to the repository root):
+# python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
+# python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
+# python etc/scripts/dataset_pipeline/annotate_composites.py