From 5477224d01a969125eb65fde303755a15fcf441b Mon Sep 17 00:00:00 2001 From: Kaushik Date: Wed, 3 Jun 2026 15:52:52 +0000 Subject: [PATCH 1/3] Add script to extract required phrase annotations from rule files Parses .RULE files for {{ }} markers and outputs JSONL with character positions and normalized phrases for NER training. Signed-off-by: Kaushik --- etc/scripts/dataset_pipeline/build_dataset.py | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 etc/scripts/dataset_pipeline/build_dataset.py diff --git a/etc/scripts/dataset_pipeline/build_dataset.py b/etc/scripts/dataset_pipeline/build_dataset.py new file mode 100644 index 0000000000..c1e119f930 --- /dev/null +++ b/etc/scripts/dataset_pipeline/build_dataset.py @@ -0,0 +1,139 @@ +# extracts required phrases from .RULE files +# outputs a JSONL dataset for NER model training +import json +import re +import unicodedata +from pathlib import Path +import click + +from licensedcode.models import Rule +from licensedcode.models import rules_data_dir as default_rules_data_dir + +MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL) + + +def extract_markers(text): + """Pull out all {{ }} markers and compute char positions + relative to plain text (with markers removed)""" + markers = [] + offset = 0 + for m in MARKER_RE.finditer(text): + phrase = m.group(1) + start = m.start() - offset + end = start + len(phrase) + markers.append({'phrase': phrase, 'start': start, 'end': end}) + offset += 4 # account for removed {{ and }} + return markers + + +def strip_markers(text): + """Remove {{ }} but keep text inside""" + return MARKER_RE.sub(lambda m: m.group(1), text) + + +def normalize_phrase(phrase): + """Clean raw marker phrase for training""" + result = phrase + # replace html entities + result = result.replace('"', '"').replace('&', '&') + result = result.replace('<', '<').replace('>', '>') + # strip xml tags like , + result = re.sub(r'<[^>]+>', '', result) + # remove markdown backticks + result = result.replace('`', '') + # collapse whitespace and trim + result = re.sub(r'\s+', ' ', result).strip() + # strip trailing/leading punct thats not meaningful + result = result.strip('.,;:>') + return result + + +@click.command() +@click.option('--rules-dir', type=click.Path(exists=True), default=None, + help='Path to rules directory (defaults to repo rules dir)') +@click.option('--output', default='dataset-output/required_phrases.jsonl', + help='Output JSONL path') +def main(rules_dir, output): + """Extract required phrases from rule files for NER training""" + if not rules_dir: + repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules' + rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir + + rules_path = Path(rules_dir) + out_path = Path(output) + out_path.parent.mkdir(parents=True, exist_ok=True) + + total_rules = 0 + annotated = 0 + total_phrases = 0 + results = [] + + click.echo(f'scanning rules from: {rules_path}') + for rf in sorted(rules_path.glob('*.RULE')): + try: + rule = Rule.from_file(rule_file=str(rf)) + except Exception: + continue + total_rules += 1 + + # skip is_required_phrase files,theyre just the phrase itself + if getattr(rule, 'is_required_phrase', False): + continue + text = rule.text or '' + if not MARKER_RE.search(text): + continue + + # normalize line endings and unicode + text = text.replace('\r\n', '\n').replace('\r', '\n') + text = unicodedata.normalize('NFKC', text) + + markers = extract_markers(text) + plain_text = strip_markers(text) + # validate positions and normalize each phrase + valid_markers = [] + for m in markers: + if m['start'] < 0 or m['end'] > len(plain_text): + continue + actual = plain_text[m['start']:m['end']] + if actual != m['phrase']: + continue + normalized = normalize_phrase(m['phrase']) + if not normalized: + continue + valid_markers.append({ + 'phrase': m['phrase'], + 'phrase_normalized': normalized, + 'start': m['start'], + 'end': m['end'], + }) + + if not valid_markers: + continue + annotated += 1 + total_phrases += len(valid_markers) + results.append({ + 'identifier': rule.identifier, + 'license_expression': rule.license_expression or '', + 'required_phrases': valid_markers, + }) + + # write jsonl + with open(out_path, 'w', encoding='utf-8') as f: + for entry in results: + f.write(json.dumps(entry, ensure_ascii=False) + '\n') + + click.echo('\ndone') + click.echo(f' rules scanned: {total_rules}') + click.echo(f' annotated: {annotated}') + click.echo(f' phrases extracted: {total_phrases}') + click.echo(f' output: {out_path}') + + +# stuff to do(follow up commits): +# - add plain text field to output (whole rule text) +# - BIOES labels +# - train/val/test split by license expression +# - additional fields (rule_type,etc) + +if __name__ == '__main__': + main() From 68e94b89c9fce7c00fa878cf293d3889149fe785 Mon Sep 17 00:00:00 2001 From: Kaushik Date: Sun, 7 Jun 2026 13:12:51 +0000 Subject: [PATCH 2/3] refactor and add metadata fields to dataset output drop local {{ }} regex and use scancode's get_required_phrase_verbatim. Add identifier, rule_type, text fields. Drop per phrase start/end since they arent needed for training References #5077 --- etc/scripts/dataset_pipeline/build_dataset.py | 77 ++++++++----------- 1 file changed, 33 insertions(+), 44 deletions(-) diff --git a/etc/scripts/dataset_pipeline/build_dataset.py b/etc/scripts/dataset_pipeline/build_dataset.py index c1e119f930..b3f549da73 100644 --- a/etc/scripts/dataset_pipeline/build_dataset.py +++ b/etc/scripts/dataset_pipeline/build_dataset.py @@ -8,27 +8,7 @@ from licensedcode.models import Rule from licensedcode.models import rules_data_dir as default_rules_data_dir - -MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL) - - -def extract_markers(text): - """Pull out all {{ }} markers and compute char positions - relative to plain text (with markers removed)""" - markers = [] - offset = 0 - for m in MARKER_RE.finditer(text): - phrase = m.group(1) - start = m.start() - offset - end = start + len(phrase) - markers.append({'phrase': phrase, 'start': start, 'end': end}) - offset += 4 # account for removed {{ and }} - return markers - - -def strip_markers(text): - """Remove {{ }} but keep text inside""" - return MARKER_RE.sub(lambda m: m.group(1), text) +from licensedcode.required_phrases import get_required_phrase_verbatim def normalize_phrase(phrase): @@ -48,6 +28,16 @@ def normalize_phrase(phrase): return result +def get_rule_type(rule): + """is_* flag set on the rule""" + for flag in ('is_license_text', 'is_license_notice', 'is_license_reference', + 'is_license_tag', 'is_license_intro', 'is_license_clue', + 'is_false_positive'): + if getattr(rule, flag, False): + return flag + return 'unknown' + + @click.command() @click.option('--rules-dir', type=click.Path(exists=True), default=None, help='Path to rules directory (defaults to repo rules dir)') @@ -76,45 +66,45 @@ def main(rules_dir, output): continue total_rules += 1 - # skip is_required_phrase files,theyre just the phrase itself + # is_required_phrase rules don't need {{ }}.the flag covers them if getattr(rule, 'is_required_phrase', False): continue + text = rule.text or '' - if not MARKER_RE.search(text): + if not text: continue # normalize line endings and unicode text = text.replace('\r\n', '\n').replace('\r', '\n') text = unicodedata.normalize('NFKC', text) - markers = extract_markers(text) - plain_text = strip_markers(text) - # validate positions and normalize each phrase - valid_markers = [] - for m in markers: - if m['start'] < 0 or m['end'] > len(plain_text): - continue - actual = plain_text[m['start']:m['end']] - if actual != m['phrase']: - continue - normalized = normalize_phrase(m['phrase']) + phrases = list(get_required_phrase_verbatim(text)) + if not phrases: + continue + + # strip out the {{ }} markers + text = text.replace('{{', '').replace('}}', '') + + valid_phrases = [] + for p in phrases: + normalized = normalize_phrase(p) if not normalized: continue - valid_markers.append({ - 'phrase': m['phrase'], + valid_phrases.append({ + 'phrase': p, 'phrase_normalized': normalized, - 'start': m['start'], - 'end': m['end'], }) - if not valid_markers: + if not valid_phrases: continue annotated += 1 - total_phrases += len(valid_markers) + total_phrases += len(valid_phrases) results.append({ 'identifier': rule.identifier, 'license_expression': rule.license_expression or '', - 'required_phrases': valid_markers, + 'rule_type': get_rule_type(rule), + 'text': text, + 'required_phrases': valid_phrases, }) # write jsonl @@ -130,10 +120,9 @@ def main(rules_dir, output): # stuff to do(follow up commits): -# - add plain text field to output (whole rule text) -# - BIOES labels +# - tokens + BIOES labels # - train/val/test split by license expression -# - additional fields (rule_type,etc) + if __name__ == '__main__': main() From 235fd61c202a27df23ddca007c20f3888d1146a8 Mon Sep 17 00:00:00 2001 From: Kaushik Date: Sun, 7 Jun 2026 19:48:17 +0000 Subject: [PATCH 3/3] Add tokens and BIOES labels per record using scancode's required_phrase_splitter References #5077 --- etc/scripts/dataset_pipeline/build_dataset.py | 59 +++++++++++++------ 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/etc/scripts/dataset_pipeline/build_dataset.py b/etc/scripts/dataset_pipeline/build_dataset.py index b3f549da73..d2bdf9c616 100644 --- a/etc/scripts/dataset_pipeline/build_dataset.py +++ b/etc/scripts/dataset_pipeline/build_dataset.py @@ -9,6 +9,7 @@ from licensedcode.models import Rule from licensedcode.models import rules_data_dir as default_rules_data_dir from licensedcode.required_phrases import get_required_phrase_verbatim +from licensedcode.tokenize import required_phrase_splitter def normalize_phrase(phrase): @@ -17,14 +18,14 @@ def normalize_phrase(phrase): # replace html entities result = result.replace('"', '"').replace('&', '&') result = result.replace('<', '<').replace('>', '>') - # strip xml tags like , - result = re.sub(r'<[^>]+>', '', result) + # strip xml tags like , but keep urls in angle brackets + result = re.sub(r'<(?![a-zA-Z]+://)[^>]+>', '', result) # remove markdown backticks result = result.replace('`', '') # collapse whitespace and trim result = re.sub(r'\s+', ' ', result).strip() # strip trailing/leading punct thats not meaningful - result = result.strip('.,;:>') + result = result.strip('.,;:<>') return result @@ -38,6 +39,33 @@ def get_rule_type(rule): return 'unknown' +def tag_tokens(text): + """Tag each word token with a BIOES label""" + tokens = [] + labels = [] + in_phrase = False + count = 0 # word tokens seen since the last {{ + + for tok in required_phrase_splitter(text): + if tok == '{{': + in_phrase = True + count = 0 + continue + if tok == '}}': + if in_phrase and count > 0: + labels[-1] = 'S-REQ' if count == 1 else 'E-REQ' + in_phrase = False + count = 0 + continue + tokens.append(tok) + if in_phrase: + labels.append('B-REQ' if count == 0 else 'I-REQ') + count += 1 + else: + labels.append('O') + return tokens, labels + + @click.command() @click.option('--rules-dir', type=click.Path(exists=True), default=None, help='Path to rules directory (defaults to repo rules dir)') @@ -82,21 +110,17 @@ def main(rules_dir, output): if not phrases: continue + # word tokens + BIOES labels (computed before stripping markers) + tokens, bioes_labels = tag_tokens(text) + # strip out the {{ }} markers text = text.replace('{{', '').replace('}}', '') - valid_phrases = [] - for p in phrases: - normalized = normalize_phrase(p) - if not normalized: - continue - valid_phrases.append({ - 'phrase': p, - 'phrase_normalized': normalized, - }) - - if not valid_phrases: - continue + valid_phrases = [ + {'phrase': p, 'phrase_normalized': normalize_phrase(p)} + for p in phrases + ] + annotated += 1 total_phrases += len(valid_phrases) results.append({ @@ -104,6 +128,8 @@ def main(rules_dir, output): 'license_expression': rule.license_expression or '', 'rule_type': get_rule_type(rule), 'text': text, + 'tokens': tokens, + 'bioes_labels': bioes_labels, 'required_phrases': valid_phrases, }) @@ -120,8 +146,7 @@ def main(rules_dir, output): # stuff to do(follow up commits): -# - tokens + BIOES labels -# - train/val/test split by license expression +# train/val/test split by license expression if __name__ == '__main__':