Skip to content

Commit 1e306b2

Browse files
committed
Dump coclustering JSON report keys in the expected order
The `write_khiops_json_file` method can now dump the JSON report with its keys in exactly the order expected by MODL_Coclustering. As a result, the coclustering reports it writes are verbatim-identical to the reference reports.
1 parent b296bdb commit 1e306b2

File tree

3 files changed

+246
-10
lines changed

3 files changed

+246
-10
lines changed

khiops/core/coclustering_results.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,127 @@ class CoclusteringResults(KhiopsJSONObject):
7777
Coclustering modeling report.
7878
"""
7979

80+
# Set coclustering report order key specification
81+
# pylint: disable=line-too-long
82+
json_key_sort_spec = {
83+
"tool": None,
84+
"version": None,
85+
"shortDescription": None,
86+
"coclusteringReport": {
87+
"summary": {
88+
"instances": None,
89+
"cells": None,
90+
"nullCost": None,
91+
"cost": None,
92+
"level": None,
93+
"initialDimensions": None,
94+
"frequencyVariable": None,
95+
"dictionary": None,
96+
"database": None,
97+
"samplePercentage": None,
98+
"samplingMode": None,
99+
"selectionVariable": None,
100+
"selectionValue": None,
101+
},
102+
"dimensionSummaries": [
103+
{
104+
"name": None,
105+
"isVarPart": None,
106+
"type": None,
107+
"parts": None,
108+
"initialParts": None,
109+
"values": None,
110+
"interest": None,
111+
"description": None,
112+
"min": None,
113+
"max": None,
114+
},
115+
],
116+
"dimensionPartitions": [
117+
{
118+
"name": None,
119+
"type": None,
120+
"innerVariables": {
121+
"dimensionSummaries": [
122+
{
123+
"name": None,
124+
"type": None,
125+
"parts": None,
126+
"initialParts": None,
127+
"values": None,
128+
"interest": None,
129+
"description": None,
130+
"min": None,
131+
"max": None,
132+
}
133+
],
134+
"dimensionPartitions": [
135+
{
136+
"name": None,
137+
"type": None,
138+
"intervals": [
139+
{
140+
"cluster": None,
141+
"bounds": None,
142+
}
143+
],
144+
"valueGroups": [
145+
{
146+
"cluster": None,
147+
"values": None,
148+
"valueFrequencies": None,
149+
}
150+
],
151+
"defaultGroupIndex": None,
152+
}
153+
],
154+
},
155+
"intervals": [
156+
{
157+
"cluster": None,
158+
"bounds": None,
159+
}
160+
],
161+
"valueGroups": [
162+
{
163+
"cluster": None,
164+
"values": None,
165+
"valueFrequencies": None,
166+
"valueTypicalities": None,
167+
}
168+
],
169+
"defaultGroupIndex": None,
170+
},
171+
],
172+
"dimensionHierarchies": [
173+
{
174+
"name": None,
175+
"type": None,
176+
"clusters": [
177+
{
178+
"cluster": None,
179+
"parentCluster": None,
180+
"frequency": None,
181+
"interest": None,
182+
"hierarchicalLevel": None,
183+
"rank": None,
184+
"hierarchicalRank": None,
185+
"isLeaf": None,
186+
"shortDescription": None,
187+
"description": None,
188+
}
189+
],
190+
}
191+
],
192+
"cellPartIndexes": None,
193+
"cellFrequencies": None,
194+
},
195+
"khiops_encoding": None,
196+
"ansi_chars": None,
197+
"colliding_utf8_chars": None,
198+
}
199+
# pylint: enable=line-too-long
200+
80201
def __init__(self, json_data=None):
81202
"""See class docstring"""
82203
# Initialize super class

khiops/core/internals/io.py

Lines changed: 79 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,13 +145,31 @@ class KhiopsJSONObject:
145145
sub_tool : str, optional
146146
Identifies the tool that originated the JSON file. Used by tools of the Khiops
147147
family such as PataText or Enneade.
148+
json_key_sort_spec : dict, optional
149+
Dictionary that specifies the order of the keys in the Khiops JSON report.
150+
Its values are `None` for plain values, dictionaries for nested objects, or single-element lists holding the dictionary that specifies each object of an array.
151+
152+
.. note::
153+
This is a class attribute that can be set in subclasses, to specify
154+
a key order when serializing the report in a JSON file, via the
155+
``write_khiops_json_file`` method.
156+
148157
json_data : dict
149158
Python dictionary extracted from the Khiops JSON report file.
150159
**Deprecated** will be removed in Khiops 12.
151160
"""
152161

162+
# Set default JSON key sort specification attribute
163+
# Can be set in classes that specialize this class
164+
json_key_sort_spec = None
165+
153166
def __init__(self, json_data=None):
154167
"""See class docstring"""
168+
# Check the type of the json_key_sort_spec class attribute
169+
assert self.json_key_sort_spec is None or isinstance(
170+
self.json_key_sort_spec, dict
171+
), type_error_message("key_sort_spec", self.json_key_sort_spec, dict)
172+
155173
# Check the type of json_data
156174
if json_data is not None and not isinstance(json_data, dict):
157175
raise TypeError(type_error_message("json_data", json_data, dict))
@@ -261,17 +279,76 @@ def to_dict(self):
261279
report["subTool"] = self.sub_tool
262280
return report
263281

264-
def write_khiops_json_file(self, json_file_path):
282+
def _json_key_sort_by_spec(self, jdict, key_sort_spec=None):
    """Return a copy of ``jdict`` whose keys follow the order of a sort spec

    Keys listed in the specification come first, in specification order. Keys
    of ``jdict`` that the specification does not mention are kept, in their
    original order, after the specified ones, so that no report field is lost
    when the report is written.

    Parameters
    ----------
    jdict : dict
        JSON-like dictionary to reorder. It is not modified.
    key_sort_spec : dict, optional
        Specification for ``jdict``. Each of its values is either ``None`` (the
        JSON value is kept as-is), a dict (specification of a nested JSON
        object) or a list whose first element is the dict specification shared
        by all the objects of a JSON array. By default the
        ``json_key_sort_spec`` class attribute is used.

    Returns
    -------
    dict
        A new dictionary with the reordered keys.

    Raises
    ------
    `ValueError`
        If a specification value is not ``None``, a dict or a list.
    """
    # json_key_sort_spec must be set before using this method
    assert self.json_key_sort_spec is not None

    # At the root of the recursion use the class-level specification
    # Note: the recursive calls below always pass a dict specification, so this
    # fallback can never be triggered by a nested ``None`` specification
    if key_sort_spec is None:
        key_sort_spec = self.json_key_sort_spec

    # Add the specified keys first, in the order of the specification
    sorted_jdict = {}
    for spec_key, spec_value in key_sort_spec.items():
        if not (spec_value is None or isinstance(spec_value, (dict, list))):
            raise ValueError(
                type_error_message(
                    "specification value",
                    spec_value,
                    "'None' or dict or list",
                )
            )
        if spec_key not in jdict:
            continue
        json_value = jdict[spec_key]

        # A list specification describes the objects of a JSON array with its
        # first element; an empty list carries no ordering information
        element_spec = None
        if isinstance(spec_value, list) and spec_value:
            element_spec = spec_value[0]

        if isinstance(json_value, dict) and isinstance(spec_value, dict):
            # Nested object with its own specification: recurse
            sorted_jdict[spec_key] = self._json_key_sort_by_spec(
                json_value, key_sort_spec=spec_value
            )
        elif isinstance(json_value, list) and isinstance(element_spec, dict):
            # Array with an element specification: reorder the objects it
            # contains and keep any other element untouched
            sorted_jdict[spec_key] = [
                (
                    self._json_key_sort_by_spec(json_el, key_sort_spec=element_spec)
                    if isinstance(json_el, dict)
                    else json_el
                )
                for json_el in json_value
            ]
        elif isinstance(json_value, list):
            # Array without a usable element specification: keep every element
            sorted_jdict[spec_key] = list(json_value)
        else:
            # Plain value, or object without a matching specification: keep it
            sorted_jdict[spec_key] = json_value

    # Keep the keys unknown to the specification instead of dropping them
    for json_key, json_value in jdict.items():
        if json_key not in sorted_jdict:
            sorted_jdict[json_key] = json_value

    return sorted_jdict
328+
329+
def write_khiops_json_file(self, json_file_path, _ensure_ascii=False):
265330
"""Write the JSON data of the object to a Khiops JSON file
266331
332+
The JSON keys are sorted according to the
333+
``KhiopsJSONObject.json_key_sort_spec`` class attribute, if set.
334+
Otherwise, the JSON keys are not sorted.
335+
267336
Parameters
268337
----------
269338
json_file_path : str
270339
Path to the Khiops JSON file.
340+
_ensure_ascii : bool, default False
341+
If True, then non-ASCII characters in the report are escaped. Otherwise,
342+
they are dumped as-is.
271343
"""
272344
# Serialize JSON data to string
273345
# Do not escape non-ASCII Unicode characters
274-
json_string = json.dumps(self.to_dict(), ensure_ascii=False)
346+
json_dict = self.to_dict()
347+
if self.json_key_sort_spec is not None:
348+
json_dict = self._json_key_sort_by_spec(json_dict)
349+
json_string = json.dumps(json_dict, indent=4, ensure_ascii=_ensure_ascii)
350+
else:
351+
json_string = json.dumps(json_dict, indent=4, ensure_ascii=_ensure_ascii)
275352
with io.BytesIO() as json_stream:
276353
writer = self.create_output_file_writer(json_stream)
277354
writer.write(json_string)

tests/test_core.py

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import json
1111
import os
1212
import shutil
13+
import tempfile
1314
import textwrap
1415
import unittest
1516
import warnings
@@ -33,7 +34,45 @@
3334
class KhiopsCoreIOTests(unittest.TestCase):
3435
"""Tests the reading/writing of files for the core module classes/functions"""
3536

36-
def _assert_report_is_dumped_to_correct_json(self, report, ref_json_report):
37+
def _assert_coclustering_report_is_written_to_sorted_json_file(
    self, cc_report, ref_json_report
):
    """Check that a written coclustering report matches its reference report

    The report is written with ``write_khiops_json_file`` to a temporary
    directory, read back, and compared (values and key order) with the
    reference report. On mismatch the test fails with a unified diff.

    Parameters
    ----------
    cc_report
        Object providing ``write_khiops_json_file`` (the coclustering results
        under test).
    ref_json_report : str
        Path to the reference JSON report.
    """
    # Write the coclustering report to a JSON file, sorted according to
    # the spec defined in the CoclusteringResults class
    # Set _ensure_ascii, as non-ASCII characters are escaped in the reference
    # reports
    # The temporary directory is removed on exit of the context, even when
    # writing or reading the report raises
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_report = os.path.join(tmp_dir, "TestCoclustering.khcj")
        cc_report.write_khiops_json_file(output_report, _ensure_ascii=True)
        with open(output_report, encoding="utf-8") as output_json_file:
            output_json = json.load(output_json_file)

    # Load the reference report
    with open(ref_json_report, encoding="utf-8") as ref_json_file:
        ref_json = json.load(ref_json_file)

    # Dump reports with consistent indentation; the key order is the one of
    # the files because it is preserved by the loaded dictionaries
    ref_json_string = json.dumps(ref_json, indent=4)
    output_json_string = json.dumps(output_json, indent=4)

    # Succeed if the dumped reports are equal
    if output_json_string == ref_json_string:
        return

    # On failure print the differences
    output_json_lines = output_json_string.splitlines(keepends=True)
    ref_json_lines = ref_json_string.splitlines(keepends=True)
    out_ref_diff = "".join(unified_diff(ref_json_lines, output_json_lines))
    self.fail(
        "CoclusteringResults JSON dump differs from reference "
        f"'{ref_json_report}':\n{out_ref_diff}"
    )
72+
73+
def _assert_analysis_report_is_dumped_to_correct_json(
74+
self, report, ref_json_report
75+
):
3776
# Dump the report as JSON (4-space indented and keys sorted in
3877
# lexicographic order)
3978
output_json = report.to_dict()
@@ -54,11 +93,8 @@ def _assert_report_is_dumped_to_correct_json(self, report, ref_json_report):
5493
ref_json_lines = ref_json_string.splitlines(keepends=True)
5594
out_ref_diff = "".join(unified_diff(ref_json_lines, output_json_lines))
5695
if out_ref_diff:
57-
report_type = (
58-
"Analysis" if ref_json_report.endswith(".khj") else "Coclustering"
59-
)
6096
self.fail(
61-
f"{report_type}Results JSON dump differs from reference "
97+
f"AnalysisResults JSON dump differs from reference "
6298
f"'{ref_json_report}':\n{out_ref_diff}"
6399
)
64100

@@ -111,12 +147,12 @@ def test_analysis_results(self):
111147
elif report in reports_warn:
112148
with self.assertWarns(UserWarning):
113149
results = kh.read_analysis_results_file(ref_json_report)
114-
self._assert_report_is_dumped_to_correct_json(
150+
self._assert_analysis_report_is_dumped_to_correct_json(
115151
results, ref_json_report
116152
)
117153
else:
118154
results = kh.read_analysis_results_file(ref_json_report)
119-
self._assert_report_is_dumped_to_correct_json(
155+
self._assert_analysis_report_is_dumped_to_correct_json(
120156
results, ref_json_report
121157
)
122158

@@ -152,7 +188,9 @@ def test_coclustering_results(self):
152188
results = kh.read_coclustering_results_file(ref_json_report)
153189
else:
154190
results = kh.read_coclustering_results_file(ref_json_report)
155-
self._assert_report_is_dumped_to_correct_json(results, ref_json_report)
191+
self._assert_coclustering_report_is_written_to_sorted_json_file(
192+
results, ref_json_report
193+
)
156194

157195
def test_binary_dictionary_domain(self):
158196
"""Test binary dictionary write"""

0 commit comments

Comments
 (0)