From 5477224d01a969125eb65fde303755a15fcf441b Mon Sep 17 00:00:00 2001
From: Kaushik <kaushikrjpm10@gmail.com>
Date: Wed, 3 Jun 2026 15:52:52 +0000
Subject: [PATCH 1/3] Add script to extract required phrase annotations from
 rule files

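Parse .RULE files for {{ }} required-phrase markers and write a JSONL
dataset containing, for each marker, the raw phrase, a normalized form,
and its character positions relative to the marker-stripped text, for
NER training.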

Signed-off-by: Kaushik <kaushikrjpm10@gmail.com>
---
 etc/scripts/dataset_pipeline/build_dataset.py | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 etc/scripts/dataset_pipeline/build_dataset.py
diff --git a/etc/scripts/dataset_pipeline/build_dataset.py b/etc/scripts/dataset_pipeline/build_dataset.py
new file mode 100644
index 0000000000..c1e119f930
--- /dev/null
+++ b/etc/scripts/dataset_pipeline/build_dataset.py
@@ -0,0 +1,139 @@
+# extracts required phrases from .RULE files
+# outputs a JSONL dataset for NER model training
+import json
+import re
+import unicodedata
+from pathlib import Path
+import click
+
+from licensedcode.models import Rule
+from licensedcode.models import rules_data_dir as default_rules_data_dir
+
+MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
+
+
+def extract_markers(text):
+    """Pull out all {{ }} markers and compute char positions
+    relative to plain text (with markers removed)"""
+    markers = []
+    offset = 0
+    for m in MARKER_RE.finditer(text):
+        phrase = m.group(1)
+        start = m.start() - offset
+        end = start + len(phrase)
+        markers.append({'phrase': phrase, 'start': start, 'end': end})
+        offset += 4  # account for removed {{ and }}
+    return markers
+
+
+def strip_markers(text):
+    """Remove {{ }} but keep text inside"""
+    return MARKER_RE.sub(lambda m: m.group(1), text)
+
+
+def normalize_phrase(phrase):
+    """Clean raw marker phrase for training"""
+    result = phrase
+    # replace html entities
+    result = result.replace('&quot;', '"').replace('&amp;', '&')
+    result = result.replace('&lt;', '<').replace('&gt;', '>')
+    # strip xml tags like <name>,</license>
+    result = re.sub(r'<[^>]+>', '', result)
+    # remove markdown backticks
+    result = result.replace('`', '')
+    # collapse whitespace and trim
+    result = re.sub(r'\s+', ' ', result).strip()
+    # strip trailing/leading punct thats not meaningful
+    result = result.strip('.,;:>')
+    return result
+
+
+@click.command()
+@click.option('--rules-dir', type=click.Path(exists=True), default=None,
+              help='Path to rules directory (defaults to repo rules dir)')
+@click.option('--output', default='dataset-output/required_phrases.jsonl',
+              help='Output JSONL path')
+def main(rules_dir, output):
+    """Extract required phrases from rule files for NER training"""
+    if not rules_dir:
+        repo_rules = Path(__file__).resolve().parents[3] / 'src' / 'licensedcode' / 'data' / 'rules'
+        rules_dir = str(repo_rules) if repo_rules.is_dir() else default_rules_data_dir
+
+    rules_path = Path(rules_dir)
+    out_path = Path(output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    total_rules = 0
+    annotated = 0
+    total_phrases = 0
+    results = []
+
+    click.echo(f'scanning rules from: {rules_path}')
+    for rf in sorted(rules_path.glob('*.RULE')):
+        try:
+            rule = Rule.from_file(rule_file=str(rf))
+        except Exception:
+            continue
+        total_rules += 1
+
+        # skip is_required_phrase files,theyre just the phrase itself
+        if getattr(rule, 'is_required_phrase', False):
+            continue
+        text = rule.text or ''
+        if not MARKER_RE.search(text):
+            continue
+
+        # normalize line endings and unicode
+        text = text.replace('\r\n', '\n').replace('\r', '\n')
+        text = unicodedata.normalize('NFKC', text)
+
+        markers = extract_markers(text)
+        plain_text = strip_markers(text)
+        # validate positions and normalize each phrase
+        valid_markers = []
+        for m in markers:
+            if m['start'] < 0 or m['end'] > len(plain_text):
+                continue
+            actual = plain_text[m['start']:m['end']]
+            if actual != m['phrase']:
+                continue
+            normalized = normalize_phrase(m['phrase'])
+            if not normalized:
+                continue
+            valid_markers.append({
+                'phrase': m['phrase'],
+                'phrase_normalized': normalized,
+                'start': m['start'],
+                'end': m['end'],
+            })
+
+        if not valid_markers:
+            continue
+        annotated += 1
+        total_phrases += len(valid_markers)
+        results.append({
+            'identifier': rule.identifier,
+            'license_expression': rule.license_expression or '',
+            'required_phrases': valid_markers,
+        })
+
+    # write jsonl
+    with open(out_path, 'w', encoding='utf-8') as f:
+        for entry in results:
+            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
+
+    click.echo('\ndone')
+    click.echo(f'  rules scanned: {total_rules}')
+    click.echo(f'  annotated: {annotated}')
+    click.echo(f'  phrases extracted: {total_phrases}')
+    click.echo(f'  output: {out_path}')
+
+
+# stuff to do(follow up commits):
+# - add plain text field to output (whole rule text)
+# - BIOES labels
+# - train/val/test split by license expression
+# - additional fields (rule_type,etc)
+
+if __name__ == '__main__':
+    main()

From 68e94b89c9fce7c00fa878cf293d3889149fe785 Mon Sep 17 00:00:00 2001
From: Kaushik <kaushikrjpm10@gmail.com>
Date: Sun, 7 Jun 2026 13:12:51 +0000
Subject: [PATCH 2/3] refactor and add metadata fields to dataset output

Drop the local {{ }} regex and use scancode's get_required_phrase_verbatim
instead. Add rule_type and text fields to each record (identifier was
already emitted). Drop the per-phrase start/end offsets since they aren't
needed for training.

References #5077
---
 etc/scripts/dataset_pipeline/build_dataset.py | 77 ++++++++-----------
 1 file changed, 33 insertions(+), 44 deletions(-)

diff --git a/etc/scripts/dataset_pipeline/build_dataset.py b/etc/scripts/dataset_pipeline/build_dataset.py
index c1e119f930..b3f549da73 100644
--- a/etc/scripts/dataset_pipeline/build_dataset.py
+++ b/etc/scripts/dataset_pipeline/build_dataset.py
@@ -8,27 +8,7 @@
 
 from licensedcode.models import Rule
 from licensedcode.models import rules_data_dir as default_rules_data_dir
-
-MARKER_RE = re.compile(r'\{\{([^}]*)\}\}', re.DOTALL)
-
-
-def extract_markers(text):
-    """Pull out all {{ }} markers and compute char positions
-    relative to plain text (with markers removed)"""
-    markers = []
-    offset = 0
-    for m in MARKER_RE.finditer(text):
-        phrase = m.group(1)
-        start = m.start() - offset
-        end = start + len(phrase)
-        markers.append({'phrase': phrase, 'start': start, 'end': end})
-        offset += 4  # account for removed {{ and }}
-    return markers
-
-
-def strip_markers(text):
-    """Remove {{ }} but keep text inside"""
-    return MARKER_RE.sub(lambda m: m.group(1), text)
+from licensedcode.required_phrases import get_required_phrase_verbatim
 
 
 def normalize_phrase(phrase):
@@ -48,6 +28,16 @@ def normalize_phrase(phrase):
     return result
 
 
+def get_rule_type(rule):
+    """is_* flag set on the rule"""
+    for flag in ('is_license_text', 'is_license_notice', 'is_license_reference',
+                 'is_license_tag', 'is_license_intro', 'is_license_clue',
+                 'is_false_positive'):
+        if getattr(rule, flag, False):
+            return flag
+    return 'unknown'
+
+
 @click.command()
 @click.option('--rules-dir', type=click.Path(exists=True), default=None,
               help='Path to rules directory (defaults to repo rules dir)')
@@ -76,45 +66,45 @@ def main(rules_dir, output):
             continue
         total_rules += 1
 
-        # skip is_required_phrase files,theyre just the phrase itself
+        # is_required_phrase rules don't need {{ }}; the flag covers them
         if getattr(rule, 'is_required_phrase', False):
             continue
+
         text = rule.text or ''
-        if not MARKER_RE.search(text):
+        if not text:
             continue
 
         # normalize line endings and unicode
         text = text.replace('\r\n', '\n').replace('\r', '\n')
         text = unicodedata.normalize('NFKC', text)
 
-        markers = extract_markers(text)
-        plain_text = strip_markers(text)
-        # validate positions and normalize each phrase
-        valid_markers = []
-        for m in markers:
-            if m['start'] < 0 or m['end'] > len(plain_text):
-                continue
-            actual = plain_text[m['start']:m['end']]
-            if actual != m['phrase']:
-                continue
-            normalized = normalize_phrase(m['phrase'])
+        phrases = list(get_required_phrase_verbatim(text))
+        if not phrases:
+            continue
+
+        # strip out the {{ }} markers
+        text = text.replace('{{', '').replace('}}', '')
+
+        valid_phrases = []
+        for p in phrases:
+            normalized = normalize_phrase(p)
             if not normalized:
                 continue
-            valid_markers.append({
-                'phrase': m['phrase'],
+            valid_phrases.append({
+                'phrase': p,
                 'phrase_normalized': normalized,
-                'start': m['start'],
-                'end': m['end'],
             })
 
-        if not valid_markers:
+        if not valid_phrases:
             continue
         annotated += 1
-        total_phrases += len(valid_markers)
+        total_phrases += len(valid_phrases)
         results.append({
             'identifier': rule.identifier,
             'license_expression': rule.license_expression or '',
-            'required_phrases': valid_markers,
+            'rule_type': get_rule_type(rule),
+            'text': text,
+            'required_phrases': valid_phrases,
         })
 
     # write jsonl
@@ -130,10 +120,9 @@ def main(rules_dir, output):
 
 
 # stuff to do(follow up commits):
-# - add plain text field to output (whole rule text)
-# - BIOES labels
+# - tokens + BIOES labels
 # - train/val/test split by license expression
-# - additional fields (rule_type,etc)
+
 
 if __name__ == '__main__':
     main()

From 235fd61c202a27df23ddca007c20f3888d1146a8 Mon Sep 17 00:00:00 2001
From: Kaushik <kaushikrjpm10@gmail.com>
Date: Sun, 7 Jun 2026 19:48:17 +0000
Subject: [PATCH 3/3] Add tokens and BIOES labels per record using scancode's
 required_phrase_splitter

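Compute tokens and labels from the marked-up text before the {{ }}
markers are stripped. Also keep URLs in angle brackets when normalizing
phrases, strip a leading/trailing '<', and stop dropping phrases whose
normalized form is empty.

References #5077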
---
 etc/scripts/dataset_pipeline/build_dataset.py | 59 +++++++++++++------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/etc/scripts/dataset_pipeline/build_dataset.py b/etc/scripts/dataset_pipeline/build_dataset.py
index b3f549da73..d2bdf9c616 100644
--- a/etc/scripts/dataset_pipeline/build_dataset.py
+++ b/etc/scripts/dataset_pipeline/build_dataset.py
@@ -9,6 +9,7 @@
 from licensedcode.models import Rule
 from licensedcode.models import rules_data_dir as default_rules_data_dir
 from licensedcode.required_phrases import get_required_phrase_verbatim
+from licensedcode.tokenize import required_phrase_splitter
 
 
 def normalize_phrase(phrase):
@@ -17,14 +18,14 @@ def normalize_phrase(phrase):
     # replace html entities
     result = result.replace('&quot;', '"').replace('&amp;', '&')
     result = result.replace('&lt;', '<').replace('&gt;', '>')
-    # strip xml tags like <name>,</license>
-    result = re.sub(r'<[^>]+>', '', result)
+    # strip xml tags like <name>,</license> but keep urls in angle brackets
+    result = re.sub(r'<(?![a-zA-Z]+://)[^>]+>', '', result)
     # remove markdown backticks
     result = result.replace('`', '')
     # collapse whitespace and trim
     result = re.sub(r'\s+', ' ', result).strip()
     # strip trailing/leading punct thats not meaningful
-    result = result.strip('.,;:>')
+    result = result.strip('.,;:<>')
     return result
 
 
@@ -38,6 +39,33 @@ def get_rule_type(rule):
     return 'unknown'
 
 
+def tag_tokens(text):
+    """Tag each word token with a BIOES label"""
+    tokens = []
+    labels = []
+    in_phrase = False
+    count = 0  # word tokens seen since the last {{
+
+    for tok in required_phrase_splitter(text):
+        if tok == '{{':
+            in_phrase = True
+            count = 0
+            continue
+        if tok == '}}':
+            if in_phrase and count > 0:
+                labels[-1] = 'S-REQ' if count == 1 else 'E-REQ'
+            in_phrase = False
+            count = 0
+            continue
+        tokens.append(tok)
+        if in_phrase:
+            labels.append('B-REQ' if count == 0 else 'I-REQ')
+            count += 1
+        else:
+            labels.append('O')
+    return tokens, labels
+
+
 @click.command()
 @click.option('--rules-dir', type=click.Path(exists=True), default=None,
               help='Path to rules directory (defaults to repo rules dir)')
@@ -82,21 +110,17 @@ def main(rules_dir, output):
         if not phrases:
             continue
 
+        # word tokens + BIOES labels (computed before stripping markers)
+        tokens, bioes_labels = tag_tokens(text)
+
         # strip out the {{ }} markers
         text = text.replace('{{', '').replace('}}', '')
 
-        valid_phrases = []
-        for p in phrases:
-            normalized = normalize_phrase(p)
-            if not normalized:
-                continue
-            valid_phrases.append({
-                'phrase': p,
-                'phrase_normalized': normalized,
-            })
-
-        if not valid_phrases:
-            continue
+        valid_phrases = [
+            {'phrase': p, 'phrase_normalized': normalize_phrase(p)}
+            for p in phrases
+        ]
+
         annotated += 1
         total_phrases += len(valid_phrases)
         results.append({
@@ -104,6 +128,8 @@ def main(rules_dir, output):
             'license_expression': rule.license_expression or '',
             'rule_type': get_rule_type(rule),
             'text': text,
+            'tokens': tokens,
+            'bioes_labels': bioes_labels,
             'required_phrases': valid_phrases,
         })
 
@@ -120,8 +146,7 @@ def main(rules_dir, output):
 
 
 # stuff to do(follow up commits):
-# - tokens + BIOES labels
-# - train/val/test split by license expression
+# train/val/test split by license expression
 
 
 if __name__ == '__main__':