Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions etc/scripts/dataset_pipeline/annotate_composites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Annotate composite (AND/OR) license rules with {{ }} required-phrase markers.
# License names are looked up in the rule text using the ScanCode license
# records (name, short name, SPDX key and license key).
import re
from pathlib import Path
import click
from license_expression import Licensing
from licensedcode.models import Rule, load_licenses
from licensedcode.models import rules_data_dir as default_rules_data_dir
from licensedcode.required_phrases import add_required_phrase_to_rule

# Matches an existing ``{{ ... }}`` required-phrase marker; group 1 is the
# text between the braces.
MARKER_RE = re.compile(
    r'\{\{([^}]*)\}\}',
    re.DOTALL,
)

# Matches a trailing version suffix at the end of a license name, such as
# " 2.0", " v3", " 2.1 only", " 3.0 or later" or " 2.0+" (case-insensitive).
VERSION_SUFFIX_RE = re.compile(
    r'\s+v?\d[\d.]*'                        # whitespace, optional "v", digits and dots
    r'(?:\s*(?:only|or[ -]later|\+))?$',    # optional qualifier, anchored at end of string
    re.IGNORECASE,
)

# Extra short forms seen in rule text that the license records do not carry.
# Maps a ScanCode license key to the additional spellings to search for.
EXTRA_NAMES = {
    # GPL family
    'gpl-2.0': ['GPL-2.0', 'GPLv2', 'GPL 2.0', 'GPL version 2'],
    'gpl-2.0-plus': ['GPL-2.0+', 'GPLv2+', 'GPL 2.0 or later'],
    'gpl-3.0': ['GPL-3.0', 'GPLv3', 'GPL 3.0', 'GPL version 3'],
    'gpl-3.0-plus': ['GPL-3.0+', 'GPLv3+', 'GPL 3.0 or later'],
    # LGPL family
    'lgpl-2.1': ['LGPL-2.1', 'LGPLv2.1', 'LGPL 2.1'],
    'lgpl-2.1-plus': ['LGPL-2.1+', 'LGPLv2.1+'],
    'lgpl-3.0': ['LGPL-3.0', 'LGPLv3', 'LGPL 3.0'],
    'lgpl-3.0-plus': ['LGPL-3.0+', 'LGPLv3+'],
    # AGPL family
    'agpl-3.0': ['AGPL-3.0', 'AGPLv3', 'AGPL 3.0'],
    'agpl-3.0-plus': ['AGPL-3.0+', 'AGPLv3+'],
    # other licenses
    'mpl-2.0': ['MPL-2.0', 'MPL 2.0'],
    'apache-2.0': ['Apache-2.0', 'Apache 2.0'],
    'bsd-new': ['BSD-3-Clause', 'BSD 3-Clause'],
    'bsd-simplified': ['BSD-2-Clause', 'BSD 2-Clause'],
    'mit': ['MIT License', 'MIT license', 'MIT'],
    'isc': ['ISC License', 'ISC license', 'ISC'],
    'artistic-2.0': ['Artistic-2.0', 'Artistic 2.0'],
    'epl-1.0': ['EPL-1.0', 'EPL 1.0'],
    'epl-2.0': ['EPL-2.0', 'EPL 2.0'],
    'cc-by-4.0': ['CC-BY-4.0', 'CC BY 4.0'],
    'unlicense': ['Unlicense'],
}


def strip_version_suffix(name):
    """
    Return ``name`` with its trailing version suffix removed, or None.

    The suffix is whatever VERSION_SUFFIX_RE matches at the end of ``name``;
    surrounding whitespace is stripped from the result. None is returned when
    the result equals ``name`` (nothing was removed) or when it is shorter
    than 10 characters.
    """
    base_name = VERSION_SUFFIX_RE.sub('', name).strip()
    nothing_removed = base_name == name
    # NOTE(review): the 10-character floor presumably rejects base names that
    # are too generic to search for -- confirm the intended threshold.
    too_short = len(base_name) < 10
    return None if (too_short or nothing_removed) else base_name


def get_candidate_names(lic):
    """
    Return the names to search for in rule text for ``lic``, longest first.

    Candidates are collected in this order, without duplicates: the license
    name, the name with its version suffix stripped, the short name, the SPDX
    license key, the license key and any EXTRA_NAMES spellings for that key.
    The list is then sorted by decreasing length; names of equal length keep
    their collection order.
    """
    collected = []

    def remember(value):
        # append only values that have not been collected yet
        if value not in collected:
            collected.append(value)

    full_name = lic.name
    if full_name:
        collected.append(full_name)
        unversioned = strip_version_suffix(full_name)
        if unversioned:
            collected.append(unversioned)

    for optional_name in (lic.short_name, lic.spdx_license_key):
        if optional_name:
            remember(optional_name)

    remember(lic.key)
    for alias in EXTRA_NAMES.get(lic.key, []):
        remember(alias)

    # sorted() is stable, including with reverse=True
    return sorted(collected, key=len, reverse=True)


def find_in_text(text, candidates):
    """
    Return the first candidate name found in ``text`` as a whole phrase.

    ``candidates`` are tried in the given order and the search is
    case-insensitive. The returned string is the matched span exactly as it
    is written in ``text`` (original case). Return None when ``text`` is
    empty or no candidate matches.

    A candidate only matches when it is not directly preceded or followed by
    a word character. This keeps short names from matching inside unrelated
    words ("MIT" in "limitation", "ISC" in "disclaimer") and "GPL-2.1" from
    matching inside "LGPL-2.1". Empty candidates and candidates shorter than
    3 characters are skipped.
    """
    if not text:
        return None
    for name in candidates:
        if not name or len(name) < 3:
            continue
        # Lookarounds rather than \b so that names starting or ending with a
        # non-word character (such as "GPL-2.0+") are still bounded correctly.
        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
        # Search the original text directly: offsets found in text.lower()
        # are not guaranteed to line up with text for every Unicode character.
        found = re.search(pattern, text, re.IGNORECASE)
        if found:
            return found.group(0)
    return None


def _find_license_phrases(text, keys, licenses_db):
    """
    Return one matched phrase per license key in ``keys``, in order.

    Return None as soon as a key is missing from ``licenses_db`` or none of
    its candidate names is found in ``text``.
    """
    phrases = []
    for key in keys:
        lic = licenses_db.get(key)
        if not lic:
            return None
        match = find_in_text(text, get_candidate_names(lic))
        if not match:
            return None
        phrases.append(match)
    return phrases


# CLI entry point.
#
# rules_dir: directory holding the *.RULE files. When omitted, the
#   src/licensedcode/data/rules directory of the enclosing checkout is used
#   if it exists, otherwise licensedcode's default rules data directory.
# expression_filter: when given, only rules whose license expression contains
#   this substring are processed.
# limit: when given and non-zero, stop once this many rules were processed.
# dry_run: forwarded to add_required_phrase_to_rule; the phrases of each
#   annotated rule are also printed.
#
# The function docstring is kept as a one-liner on purpose: click uses it as
# the command help text.
@click.command()
@click.option('--rules-dir', type=click.Path(exists=True), default=None)
@click.option('--expression-filter', default=None,
              help='only process rules containing this in their expression')
@click.option('--limit', type=int, default=None)
@click.option('--dry-run', is_flag=True)
def main(rules_dir, expression_filter, limit, dry_run):
    """annotate rules with required phrase markers"""
    if not rules_dir:
        # this script lives in etc/scripts/dataset_pipeline/, so parents[3]
        # is the repository root
        repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
        rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir

    rules_path = Path(rules_dir)
    licenses_db = load_licenses()
    licensing = Licensing(list(licenses_db.values()))

    processed = 0
    annotated = 0

    for rf in sorted(rules_path.glob('*.RULE')):
        if limit and processed >= limit:
            break

        # only composite rules: the file name joins license keys with
        # "_or_" / "_and_"
        stem = rf.stem
        if '_or_' not in stem and '_and_' not in stem:
            continue

        try:
            rule = Rule.from_file(rule_file=str(rf))
        except Exception as exc:
            # best effort: report the rule that cannot be loaded and move on
            click.echo(f'skipping {rf.name}: cannot load rule: {exc!r}', err=True)
            continue

        if not rule.license_expression:
            continue
        if getattr(rule, 'is_required_phrase', False):
            continue
        text = rule.text or ''
        # leave rules that already carry {{ }} markers untouched
        if MARKER_RE.search(text):
            continue
        if expression_filter and expression_filter not in rule.license_expression:
            continue

        processed += 1
        try:
            keys = licensing.license_keys(rule.license_expression, unique=True)
        except Exception as exc:
            click.echo(
                f'skipping {rf.name}: cannot parse expression '
                f'{rule.license_expression!r}: {exc!r}',
                err=True,
            )
            continue
        if not keys:
            continue

        # annotate only when every license of the expression is found in the text
        phrases = _find_license_phrases(text, keys, licenses_db)
        if phrases is None:
            continue

        added = False
        # call the annotator for every phrase; do not stop at the first success
        for phrase in phrases:
            if add_required_phrase_to_rule(rule=rule, required_phrase=phrase,
                                           source='composite_annotation', dry_run=dry_run):
                added = True
        if added:
            annotated += 1
            if dry_run:
                click.echo(f'  {rule.identifier}: {phrases}')

    click.echo(f'\ndone - {annotated}/{processed} annotated')


# Run the command line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()

# Example commands:
# python etc/scripts/dataset_pipeline/annotate_composites.py --dry-run
# python etc/scripts/dataset_pipeline/annotate_composites.py --expression-filter apache --limit 20
# python etc/scripts/dataset_pipeline/annotate_composites.py