# Source: AHGMF-Code-Implementation/AHMGF_implementation_code.py at main · halodma/AHGMF-Code-Implementation · GitHub
import re
# This code provides a reference implementation of the AHMGF framework.
# Its purpose is to support transparency and reproducibility.
#
# The implementation is intentionally simplified and optimized for clarity rather
# than throughput or large-scale screening. Production-oriented, high-throughput
# implementations used by the author for batch and pipeline integration are
# maintained separately.

# Author: Dylan M. Armstrong, author email: halodma07@gmail.com
# Licensed under CC BY 4.0

print("\nTo the following requests enter Yes, No, or leave the field blank and continue.\n")

# Structure entrance: interpret an AHMGF string that is already written.
# The interpretation reports what the center atom is and how units branch off each other, e.g.
# "[{'Fe(Fe(Fe(Fe2O3)2)3)6': [{'(Fe2O3)2': 36}, {'(Fe)6': 6}, {'(Fe)3': 18}, {'Fe': 1}]}]"
# Answers are stripped before lower-casing so that "Yes " or " y" are still recognised.
user_input_confirmation_1 = input("Enable Structure Entrance: ").strip().lower()

# Structure determination: build an AHMGF string from individually entered molecules.
user_input_confirmation_2 = input("Enable Structure Determination: ").strip().lower()
print()


# Collect one record per molecule: {name: {"branch", <position key>, "sub", "mult"}}.
# The position key is "center", "edge" or "innerlayer_<n>".
molecule_data = []
if user_input_confirmation_2 == 'yes' or user_input_confirmation_2 == 'y':
    print('For the following enter your molecules as 1 line, but separate them with (+ or ,)\n'
          'and your molecules are permitted to be entered in any order.\n')
    # For example Fe + Fe + Fe + Fe2O3 or Fe, Fe, Fe, Fe2O3
    enter_molecules = input("Enter you molecules here: ")
    # Empty tokens (trailing or doubled separators) are dropped so that no nameless molecule is created.
    molecule_list_input = [m.strip() for m in re.split(r'[+,]', enter_molecules) if m.strip()]
    if not molecule_list_input:
        raise ValueError("No molecules were entered.")

    for item_ml_input in molecule_list_input:
        print('\n- You will be asked to enter the branching order, enter it in the format branch_#_position_sub_# (position can be "edge"'
              ' or "innerlayer_#")\n  If the molecule is the center, just enter "Center".\n'
              '- sub_# is only for if you have multiple branches on the same layer\n')
        # For example "Please enter the branching order here for Fe2O3: branch_1_edge_sub_0" (edge of branch_1)
        #             "Please enter the branching order here for Fe: Center" (center)
        #             "Please enter the branching order here for Fe: branch_1_innerlayer_1_sub_0" (Parent branch)
        #             "Please enter the branching order here for Fe: branch_1_innerlayer_2_sub_0" (branch attached to the parent branch_1)
        # This input tells the program that Fe2O3 is at the edge, Fe is the central unit with a group of Fe attached to it,
        # and connected to this first inner layer of Fe is a second group of Fe, which then connects to Fe2O3.

        # The "#" acts as a placeholder for numbers, indicating later which units are connected to which.
        # This allows the program to construct the AHMGF string.
        # Different numbers represent different branches, so branch_1 and branch_2 indicate that there are two separate branches in total.
        position_entrance_input = input(f"Please enter the branching order here for {item_ml_input}: ").lower().strip()

        # The center needs no multiplicand and no position parsing.
        if position_entrance_input == "center":
            molecule_data.append({item_ml_input: {
                "branch": 0,
                "center": 0,
                "sub": 0,
                "mult": 1}})
            continue

        number_of_molecules_input = input(f"Please enter the multiplicand for {item_ml_input}: ").strip()
        try:
            mult_as_float = float(number_of_molecules_input)
        except ValueError:
            raise ValueError("Multiplicand must be numeric.") from None
        if mult_as_float % 1 != 0:
            mult_value = mult_as_float
        else:
            try:
                # Exact conversion for plain integer text.
                mult_value = int(number_of_molecules_input)
            except ValueError:
                # Integral values written as "2.0" or "1e3" are not accepted by int(str).
                mult_value = int(mult_as_float)

        parts = position_entrance_input.split("_")

        # Must start with branch_#
        if len(parts) < 3 or parts[0] != "branch":
            raise ValueError('Invalid format. Use "branch_#_edge" or "branch_#_innerlayer_#".')

        # Branch number
        try:
            branch_value = int(parts[1])
        except ValueError:
            raise ValueError("Branch number must be numeric (branch_#).") from None

        # Optional sub_# suffix; a missing suffix or a bare "sub" keyword both mean 0.
        sub_value = 0
        if "sub" in parts:
            sub_index = parts.index("sub")
            if sub_index + 1 < len(parts):
                try:
                    sub_value = int(parts[sub_index + 1])
                except ValueError:
                    raise ValueError("Sub value must be numeric (sub_#).") from None

        # EDGE CASE
        if parts[2] == "edge":
            position_key = "edge"
            position_value = 0

        # INNERLAYER CASE
        elif parts[2] == "innerlayer":
            if len(parts) < 4:
                raise ValueError("Innerlayer requires a numeric value (innerlayer_#).")
            try:
                position_number = int(parts[3])
            except ValueError:
                raise ValueError("Innerlayer position must be numeric.") from None
            position_key = f"innerlayer_{position_number}"
            position_value = position_number

        else:
            raise ValueError('Position must be "edge" or "innerlayer_#".')

        molecule_data.append({item_ml_input: {
            "branch": branch_value,
            position_key: position_value,
            "sub": sub_value,
            "mult": mult_value}})

# Constructing an AHMGF notation string
# Module-level accumulators shared by the construction and interpretation stages that follow.
structure_enter_input_list = []  # AHMGF strings to interpret (built from molecule_data or typed in by the user)
true_multiplicand_export_list = []  # final per-structure results; printed at the end of the script
non_poly_sep = []  # un-parenthesised leading unit found in each parsed structure (only appended to in this file)
poly_sep = []  # every "(unit)multiplier" group encountered, across all parsed structures
comb_tracker_molecule = []  # one {structure: [leading unit, [groups]]} record per structure that contains groups

# Sample molecule_data entries for testing: to use one, remove the leading "#" from its line and run the code.
#molecule_data = [{'a': {'branch': 0, 'center': 0, 'sub': 0, 'mult': 1}}, {'b': {'branch': 1, 'innerlayer_1': 1, 'sub': 0, 'mult': 2}}, {'c': {'branch': 2, 'innerlayer_1': 1, 'sub': 0, 'mult': 3}}, {'d': {'branch': 1, 'innerlayer_2': 2, 'sub': 0, 'mult': 4}}, {'e': {'branch': 2, 'innerlayer_2': 2, 'sub': 0, 'mult': 5}}, {'f': {'branch': 1, 'innerlayer_2': 2, 'sub': 1, 'mult': 6}}]
#molecule_data = [{'a': {'branch': 0, 'center': 0, 'sub': 0, 'mult': 1}}, {'b': {'branch': 1, 'innerlayer_1': 1, 'sub': 0, 'mult': 2}}, {'c': {'branch': 2, 'innerlayer_1': 1, 'sub': 0, 'mult': 3}}, {'d': {'branch': 1, 'innerlayer_2': 2, 'sub': 0, 'mult': 4}}, {'f': {'branch': 1, 'innerlayer_2': 2, 'sub': 1, 'mult': 5}}]
#molecule_data = [{"a": {"center": True, "mult": 1}}, {"b": {"branch": 1, "innerlayer_1": True, "sub": 0, "mult": 2}}, {"c": {"branch": 1, "innerlayer_2": True, "sub": 0, "mult": 3}},]
# ----------------------------------------------------------------------------------------------------------------------

def _render_ahmgf_node(node):
    """Return the AHMGF text for one non-center unit and everything nested in it.

    A unit is written as ``(name<children>)mult`` with children emitted in
    ascending ``sub`` order. A unit without children always carries its
    multiplicand; a unit with children omits a multiplicand of exactly 1.
    Writing the opening parenthesis here, for every unit, keeps the result
    balanced at any nesting depth.
    """
    children = sorted(node["children"], key=lambda child: child["sub"])
    inner = "".join(_render_ahmgf_node(child) for child in children)
    show_mult = not children or node["mult"] != 1
    suffix = str(node["mult"]) if show_mult else ""
    return f"({node['name']}{inner}){suffix}"


# Turn the flat molecule_data records into a tree rooted at the center unit and
# render that tree as a single AHMGF string.
if user_input_confirmation_2 == 'yes' or user_input_confirmation_2 == 'y':
    flat_nodes = []
    center_node = None
    for entry in molecule_data:
        mol, info = next(iter(entry.items()))

        node = {
            "name": mol,
            "branch": info.get("branch", 0),
            "layer": None,
            "is_edge": False,
            "sub": info.get("sub", 0),
            "mult": info.get("mult", 1),
            "children": []}

        if "center" in info:
            # A second center would otherwise be dropped from the output without notice.
            if center_node is not None:
                raise ValueError("Only one molecule may be entered as the center.")
            node["layer"] = 0
            center_node = node
        else:
            for key in info:
                if key.startswith("innerlayer"):
                    node["layer"] = int(key.split("_")[1])
                elif key == "edge":
                    node["is_edge"] = True  # layer is assigned once the branch depth is known
        flat_nodes.append(node)

    if center_node is None:
        raise ValueError('No center molecule was entered; one molecule must be given the position "Center".')

    # collect per branch (excluding center)
    branch_nodes = {}
    for node in flat_nodes:
        if node is not center_node:
            branch_nodes.setdefault(node["branch"], []).append(node)

    # assign edge nodes to the layer just below the deepest inner layer of their branch
    for branch_id, nodes in branch_nodes.items():
        max_layer = max([n["layer"] for n in nodes if n["layer"] is not None] + [0])
        for n in nodes:
            if n["is_edge"]:
                n["layer"] = max_layer + 1
            elif n["layer"] is None:
                raise ValueError(
                    f"'{n['name']}' on branch {branch_id} has neither an \"edge\" nor an \"innerlayer_#\" position.")

    # build tree branch by branch
    for branch_id, nodes in branch_nodes.items():
        nodes.sort(key=lambda x: (x["layer"], x["sub"]))
        # Most recent non-edge unit seen on each layer; layer 0 is always the center.
        parent_lookup = {0: center_node}
        for node in nodes:
            parent = parent_lookup.get(node["layer"] - 1)
            if parent is None:
                # Without a parent the unit cannot appear in the rendered string at all.
                raise ValueError(
                    f"Cannot place '{node['name']}' on branch {branch_id}: "
                    f"nothing was entered on layer {node['layer'] - 1} for it to attach to.")
            # Each node is visited exactly once, so it is appended unconditionally;
            # a value-based membership test would merge distinct but identical entries.
            parent["children"].append(node)
            if not node["is_edge"]:
                parent_lookup[node["layer"]] = node

    # combine all branches: center + each branch in ascending branch number,
    # each branch starting from its layer-1 units in ascending sub order
    final_structure = center_node["name"]
    for branch_id in sorted(branch_nodes):
        top_nodes = sorted(
            (n for n in branch_nodes[branch_id] if n["layer"] == 1),
            key=lambda x: x["sub"])
        final_structure += "".join(_render_ahmgf_node(n) for n in top_nodes)

    # With no branches this is just the center name.
    structure_enter_input_list.append(final_structure)


# Ask for ready-made AHMGF strings when nothing was constructed from molecule_data,
# unless the user explicitly answered No to "Enable Structure Entrance".
# A blank answer still leads here, as before.
if not molecule_data and user_input_confirmation_1 not in ('no', 'n'):
    structure_enter_input = input("Please enter your AHMGF structure for interpretation: ").split("+")
    for item_sei in structure_enter_input:
        # Surrounding whitespace would otherwise end up in the result keys, and an
        # empty entry would be reported as a structure named ''.
        item_sei = item_sei.strip()
        if item_sei:
            structure_enter_input_list.append(item_sei)
#
if structure_enter_input_list:
    # Split every AHMGF string into its leading un-parenthesised unit and the list of
    # "(unit)multiplier" groups it contains. The multiplicand stage consumes the
    # records stored in comb_tracker_molecule.
    for item in structure_enter_input_list:
        stack = []  # pending unit names and "(" markers
        false_list = []  # groups of this structure, in the order their ")" is met
        current_molecule = ''
        i = 0
        while i < len(item):
            char = item[i]
            if char == '(':
                if current_molecule:
                    stack.append(current_molecule)
                    current_molecule = ''
                stack.append('(')
            elif char == ')':
                if current_molecule:
                    stack.append(current_molecule)
                    current_molecule = ''
                # Gather the unit text back to the matching "(".
                temp_molecule = ''
                while stack and stack[-1] != '(':
                    temp_molecule = stack.pop() + temp_molecule
                if not stack:
                    # No "(" left to match; popping here used to fail with a bare IndexError.
                    raise ValueError(f"Unbalanced ')' at position {i} in structure '{item}'.")
                stack.pop()
                # Read the multiplier that directly follows ")"; a missing one means 1.
                multiplier = ''
                i += 1
                while i < len(item) and (item[i].isdigit() or item[i] == '.'):
                    multiplier += item[i]
                    i += 1
                if not multiplier:
                    multiplier = '1'
                if temp_molecule:
                    modified_molecule = f'({temp_molecule}){multiplier}'
                    false_list.append(modified_molecule)
                    poly_sep.append(modified_molecule)
                continue  # i already points past the multiplier
            elif char.isalnum():
                current_molecule += char
            i += 1
        if current_molecule:
            stack.append(current_molecule)
        if '(' in stack:
            # An unclosed group used to be accepted and gave a wrong result without notice.
            raise ValueError(f"Unclosed '(' in structure '{item}'.")

        # The bottom-most remaining name is the leading (central) unit.
        current_molecule = stack[0] if stack else ''
        if current_molecule:
            non_poly_sep.append(current_molecule)
        if false_list:
            comb_tracker_molecule.append({item: [current_molecule, false_list.copy()]})

# For every structure recorded in comb_tracker_molecule, work out the "true" count of each
# parenthesised group: the product of the multipliers collected while scanning from the group
# to the end of the structure text. Results go to true_multiplicand_export_list as
# {structure: [{group: count}, ..., {leading unit: 1}]}.
if comb_tracker_molecule:
    true_multiplicand = []
    for item in comb_tracker_molecule:
        item_poly = []
        inner_for_loop_true_multiplicand = []
        # Each record holds a single {structure: [leading unit, groups]} pair.
        for key, value in item.items():
            if isinstance(value[1], list):
                for sub_item in value[1]:
                    item_poly.append(sub_item)
            # Greedy match from the first "(" to the last ")" plus trailing digits,
            # i.e. the structure text without its leading unit.
            # NOTE(review): \d* does not include a decimal part after the last ")" — confirm intended.
            match = re.search(r'\(.*\)\d*', key)
            if match:
                item_no_outer_non_poly = match.group(0)
                # Make the final multiplier explicit when none was written.
                if not item_no_outer_non_poly[-1].isdigit():
                    item_no_outer_non_poly += "1"
            # Text inside the parentheses of every recorded group.
            inner_key_search_list = []
            for poly in item_poly:
                match = re.search(r'\((.*?)\)', poly)
                if match:
                    inner_key_search_list.append(match.group(1))
            closing_item_numbers = []
            # Start offsets already evaluated, so a repeated unit name moves on to its next occurrence.
            processed_positions = set()
            for compound in inner_key_search_list:
                start_pos = 0
                while start_pos < len(item_no_outer_non_poly):
                    start_pos = item_no_outer_non_poly.find(compound, start_pos)
                    if start_pos == -1:
                        break
                    if start_pos in processed_positions:
                        start_pos += len(compound)
                        continue
                    # Scan from the occurrence to the end of the text, collecting the multiplier
                    # after every ")" that is met at depth 0 (depth is counted from the occurrence).
                    parentheses_depth = 0
                    skip_next_closing = False
                    i_current = start_pos
                    closing_numbers = []
                    while i_current < len(item_no_outer_non_poly):
                        char = item_no_outer_non_poly[i_current]
                        if char == "(":
                            parentheses_depth += 1
                        elif char == ")":
                            if parentheses_depth > 0:
                                parentheses_depth -= 1
                                # This ")" closes a group that was opened after the occurrence;
                                # its multiplier is not collected.
                                if parentheses_depth == 0 and skip_next_closing:
                                    skip_next_closing = False
                                    i_current += 1
                                    continue
                        if item_no_outer_non_poly[i_current] == ')' and parentheses_depth == 0 and not skip_next_closing:
                            # No digit directly after ")" means a multiplier of 1.
                            if i_current + 1 < len(item_no_outer_non_poly) and not item_no_outer_non_poly[i_current + 1].isdigit():
                                number = '1'
                            else:
                                match = re.search(r'\)(\d+(?:\.\d+)?)', item_no_outer_non_poly[i_current:])
                                if match:
                                    number = match.group(1)
                                else:
                                    number = '1'
                            closing_numbers.append(number)
                        # A "(" that brings the depth to 1 starts a group after the occurrence;
                        # flag it so the ")" returning to depth 0 is skipped above.
                        if char == "(" and parentheses_depth == 1:
                            skip_next_closing = True
                        i_current += 1
                    if closing_numbers:
                        multiplicand_end = closing_numbers[0]
                    else:
                        # NOTE(review): multiplicand_end keeps its previous value here, or is
                        # unbound if this happens on the very first occurrence — confirm unreachable.
                        pass
                    # Rebinding compound means the next find() in this while loop searches for the
                    # decorated "(unit)multiplier" text rather than the bare unit.
                    compound = f'({compound}){multiplicand_end}'
                    if not '((' in compound:
                        closing_item_numbers.append({compound: closing_numbers})
                    else:
                        pass
                    processed_positions.add(start_pos)
                    start_pos += len(compound)
            list_dict_val = []
            # Note: this loop reuses the name `item` from the enclosing loop over comb_tracker_molecule.
            for item in closing_item_numbers:
                # Each entry holds one {group: multipliers} pair; multiply the multipliers together.
                for key2, values in item.items():
                    true_mult = 1
                    for val in values:
                        val_int = float(val)
                        true_mult *= val_int
                # The product is rounded to an integer before it is reported.
                true_mult = round(true_mult)
                if true_mult < 1:
                    print(f"Multiplicand for '{key2}' is less than 1: {true_mult}")
                    raise ValueError("Multiplicand is less than 1")
                true_multiplicand.append({key2: true_mult})
                inner_for_loop_true_multiplicand.append({key2: true_mult})
                list_dict_val.append({key2: true_mult})
            # The leading unit (possibly an empty string) is always reported with a count of 1.
            central_molecule_tmel = value[0]
            list_dict_val.append({central_molecule_tmel: 1})
            true_multiplicand_export_list.append({key: list_dict_val})
#
# Structures that produced no group breakdown (for example a central atom on its own)
# are reported with a count of 1. Each structure is checked individually, so a plain
# structure entered next to a parenthesised one is no longer left out of the report.
if structure_enter_input_list:
    reported_structures = {
        structure for result in true_multiplicand_export_list for structure in result}
    for item_seil in structure_enter_input_list:
        if item_seil not in reported_structures:
            true_multiplicand_export_list.append({item_seil: 1})

# Final report: gather the labelled rows that apply, then print them under the heading.
report_rows = []
if molecule_data:
    # Construction inputs and the notation built from them are shown together.
    report_rows.append(("molecule_data: ", molecule_data))
    report_rows.append(("Constructed AHMGF Notation: ", structure_enter_input_list))
if true_multiplicand_export_list:
    report_rows.append(("True Molecule Counts: ", true_multiplicand_export_list))

print('------------------------------------------------------------------------')
print("AHMGF Output: ")
for report_label, report_value in report_rows:
    print(report_label, report_value)