forklore/parse-maintainer.py at develop · fossunited/forklore · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
"""Simple parser for maintainer issue markdown to JSON"""

import sys
import json
import re
from datetime import datetime
from pathlib import Path

import os

if os.getenv("CI") == "true":
    sys.exit(0)

try:
    import jsonschema

    HAS_JSONSCHEMA = True
except ImportError:
    HAS_JSONSCHEMA = False


# Exact questions that must appear in the form (in order)
REQUIRED_QUESTIONS = [
    "How to support",
    "A small brief about your project",
    "One FOSS maintainer lesson for your younger self",
    "Why do you do it? Why do you bother maintaining a FOSS project?",
    "If your repo had a theme song, what would it be?",
    "Which file in your project would you most like to set on fire?",
    "What's your open-source villain origin story?",
    "If you had to use one emoji to convey what it is like to be a FOSS maintainer, what would it be?",
]

# Mapping of user input labels to schema-valid labels
LABEL_NORMALIZATION = {
    # Lowercase variants
    "github": "GitHub",
    "gitlab": "GitLab",
    "codeberg": "Codeberg",
    "bitbucket": "BitBucket",
    "linkedin": "LinkedIn",
    "mastodon": "Mastodon",
    "bluesky": "Bluesky",
    "substack": "Substack",
    "discourse": "Discourse",
    "twitter": "Twitter",
    "email": "Email",
    "rss": "RSS",
    "web": "Web",
    "x": "X",
    "reddit": "Reddit",
    # Mixed case variants
    "Github": "GitHub",
    "Gitlab": "GitLab",
    "Linkedin": "LinkedIn",
    "Bluesky": "Bluesky",
    "Bitbucket": "BitBucket",
    "Reddit": "Reddit",
    # Common aliases
    "website": "Website",
    "web": "Web",
    "Blog": "Web",
    "blog": "Web",
    "Mail": "Email",
    "mail": "Email",
    "X/Twitter": "X",
    "Twitter/X": "X",
}

# Valid labels as per schema
VALID_LABELS = {
    "GitHub", "GitLab", "Gitlab", "Codeberg", "BitBucket", "LinkedIn",
    "X", "Twitter", "Mastodon", "Bluesky", "Substack",
    "Discourse", "Email", "RSS", "Web", "Reddit"
}


def normalize_label(label: str) -> str:
    """Normalize a social media label to match schema requirements."""
    label = label.strip()

    # Check if already valid
    if label in VALID_LABELS:
        return label

    # Try normalization map
    if label in LABEL_NORMALIZATION:
        return LABEL_NORMALIZATION[label]

    # Return as-is (will fail validation, but user will see error)
    return label


def load_schema():
    """Load the JSON schema for validation."""
    schema_path = Path(__file__).parent / "maintainer.schema.json"
    if not schema_path.exists():
        return None
    with open(schema_path, encoding="utf-8") as f:
        return json.load(f)


def validate_data(data: dict, schema: dict) -> list[str]:
    """Validate data against schema. Returns list of friendly error messages."""
    if not HAS_JSONSCHEMA:
        return ["jsonschema library not installed. Install with: pip install jsonschema"]

    errors = []
    validator = jsonschema.Draft7Validator(schema, format_checker=jsonschema.FormatChecker())

    for error in validator.iter_errors(data):
        path_parts = list(error.absolute_path)

        # Add context for questions
        if len(path_parts) >= 2 and path_parts[0] == "form":
            question_index = path_parts[1]
            try:
                question = data["form"][question_index]["question"]
                errors.append(f"  Question '{question}': {error.message}")
                continue
            except (IndexError, KeyError):
                pass

        # Add context for projects
        if len(path_parts) >= 2 and path_parts[0] == "projects":
            project_index = path_parts[1]
            try:
                project_name = data["projects"][project_index].get("name", f"Project #{project_index + 1}")
                field = path_parts[2] if len(path_parts) >= 3 else ""
                if field:
                    errors.append(f"  Project '{project_name}', field '{field}': {error.message}")
                else:
                    errors.append(f"  Project '{project_name}': {error.message}")
                continue
            except (IndexError, KeyError):
                pass

        # Default format
        path = ".".join(str(p) for p in path_parts) if path_parts else "(root)"
        errors.append(f"  {path}: {error.message}")

    return errors


def normalize_question(question: str) -> str:
    """Normalize question text by removing trailing colons and extra whitespace."""
    question = question.strip()
    if question.endswith(":"):
        question = question[:-1]
    return question


def parse_issue(md):
    """Parse markdown issue into JSON structure."""
    # Remove HTML comments
    md = re.sub(r"<!--.*?-->", "", md, flags=re.DOTALL)

    data = {
        "username": "",
        "full_name": "",
        "photo": "",
        "designation": "",
        "socials": [],
        "projects": [],
        "form": [],
        "created_on": datetime.now().astimezone().isoformat(),
    }

    # Parse basic fields (username, full_name, photo, designation)
    for field in ["username", "full_name", "photo", "designation"]:
        pattern = rf"\*\*{field}:\*\*\s*(.+?)(?=\n\*\*|\n---|\Z)"
        match = re.search(pattern, md, re.IGNORECASE | re.DOTALL)
        if match:
            data[field] = match.group(1).strip()

    # Parse socials
    socials_match = re.search(r"\*\*socials:\*\*\s*\n((?:^- .+\n?)+)", md, re.MULTILINE)
    if socials_match:
        for line in socials_match.group(1).strip().split("\n"):
            if ":" in line:
                line = line.lstrip("- ").strip()
                label, link = line.split(":", 1)
                # Normalize the label to match schema requirements
                normalized_label = normalize_label(label.strip())
                if normalized_label not in VALID_LABELS:
                    print(
                        f"Warning: Unknown or invalid social label '{label.strip()}' (normalized: '{normalized_label}')",
                        file=sys.stderr,
                    )
                data["socials"].append(
                    {"label": normalized_label, "link": link.strip()}
                )

    # Parse projects
    project_blocks = re.findall(
        r"\*\*project:\*\*\s*\n((?:^- .+(?:\n(?:    .+)?)*\n?)+)", md, re.MULTILINE
    )

    for block in project_blocks:
        project = {
            "name": "",
            "project_link": "",
            "website_link": "",
            "logo": "",
            "short_description": "",
            "description": "",
        }

        for field in project.keys():
            # Match both single line and multi-line (with 4-space indent)
            pattern = rf"^- {field}:\s*(.+?)(?=\n- |\Z)"
            match = re.search(pattern, block, re.MULTILINE | re.DOTALL)
            if match:
                value = match.group(1).strip()
                # Clean up multi-line descriptions (remove leading spaces)
                value = re.sub(r"\n\s{4}", "\n", value)
                project[field] = value.strip()

        if project["name"]:
            data["projects"].append(project)

    # Parse questions section
    questions_section = re.search(r"## Questions(.+)", md, re.DOTALL)
    if questions_section:
        question_blocks = re.findall(
            r"\*\*(.+?):\*\*\s*\n(.+?)(?=\n\*\*|\Z)",
            questions_section.group(1),
            re.DOTALL,
        )

        parsed_questions = {}
        for question, response in question_blocks:
            normalized_q = normalize_question(question)
            parsed_questions[normalized_q] = response.rstrip("\n").replace("\n", "<br>")

        # Ensure questions are in the correct order and match exactly
        for required_q in REQUIRED_QUESTIONS:
            if required_q in parsed_questions:
                data["form"].append(
                    {"question": required_q, "response": parsed_questions[required_q]}
                )
            else:
                # Add with empty response if missing
                data["form"].append({"question": required_q, "response": ""})
                print(
                    f"Warning: Missing required question: '{required_q}'",
                    file=sys.stderr,
                )

    return data


def validate_json_file(filepath: str) -> tuple[bool, list[str]]:
    """Validate a JSON file against the schema."""
    schema = load_schema()
    if schema is None:
        return False, ["Schema file 'maintainer.schema.json' not found"]

    if not HAS_JSONSCHEMA:
        return False, [
            "jsonschema library not installed. Install with: pip install jsonschema"
        ]

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return False, [f"Invalid JSON: {e}"]
    except FileNotFoundError:
        return False, [f"File not found: {filepath}"]

    errors = validate_data(data, schema)
    return len(errors) == 0, errors


if __name__ == "__main__":
    if len(sys.argv) < 2 or sys.argv[1] == "--help":
        print("Usage: python parse_maintainer.py <input_file.md>")
        print(
            "       python parse_maintainer.py --validate <file1.json> [file2.json ...]"
        )
        print("")
        print("Options:")
        print(
            "  --validate    Validate JSON files against schema (for pre-commit hooks)"
        )
        sys.exit(1)

    # Check if we're in validation-only mode
    if sys.argv[1] == "--validate":
        if len(sys.argv) < 3:
            print("Error: --validate requires at least one JSON file", file=sys.stderr)
            sys.exit(1)

        json_files = sys.argv[2:]
        all_valid = True

        for json_file in json_files:
            print(f"Validating {json_file}...", file=sys.stderr)
            is_valid, errors = validate_json_file(json_file)

            if is_valid:
                print(f"✓ {json_file} is valid", file=sys.stderr)
            else:
                print(f"✗ {json_file} validation failed:", file=sys.stderr)
                for error in errors:
                    print(error, file=sys.stderr)
                all_valid = False

        sys.exit(0 if all_valid else 1)

    # Parse mode
    input_file = sys.argv[1]

    with open(input_file, "r", encoding="utf-8") as f:
        result = parse_issue(f.read())

    # Validate before saving
    schema = load_schema()
    if schema is None:
        print("Warning: Schema file not found, skipping validation", file=sys.stderr)
    elif not HAS_JSONSCHEMA:
        print("Warning: jsonschema not installed, skipping validation", file=sys.stderr)
        print("Install with: pip install jsonschema", file=sys.stderr)
    else:
        errors = validate_data(result, schema)
        if errors:
            print(f"\n❌ Validation failed - JSON not saved:", file=sys.stderr)
            for error in errors:
                print(error, file=sys.stderr)
            print(
                "\nPlease fix the errors in your markdown and try again.",
                file=sys.stderr,
            )
            sys.exit(1)

    # Output JSON to stdout
    json_output = json.dumps(result, indent=2, ensure_ascii=False)
    print(json_output)

    # Write to file
    username = result.get("username", "output") or "output"
    output_file = f"content/maintainers/{username}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(json_output)
        f.write("\n")

    print(f"\n✓ Validation passed - Saved to {output_file}", file=sys.stderr)

    # sync images to local public dir
    import subprocess
    import os

    json_filename = os.path.basename(output_file)
    try:
        subprocess.run(["python3", "sync-image.py", json_filename], check=True)
    except subprocess.CalledProcessError as e:
        print(f"[WARN] Image sync failed: {e}", file=sys.stderr)