Skip to content

Commit 6142e50

Browse files
committed
Handle secondary-table data path deprecation in the Core API
More specifically, preprocess the data paths in the `additional_data_tables` and `output_additional_data_tables` arguments: (1) split each path on "`"; (2) check that each data path fragment obtained from the split is non-empty; (3) check that the first fragment is identical to the name of the current dictionary (as specified in `dictionary_name` or `train_dictionary_name`). If all these conditions are met, then convert the legacy data path to the new format: (a) drop the current dictionary name fragment from the beginning of the path; (b) join the remaining data path fragments on "/". Note: This does not handle external tables, but for the impending beta, this seems like an acceptable tradeoff. closes #370
1 parent 977bcf9 commit 6142e50

File tree

8 files changed

+545
-0
lines changed

8 files changed

+545
-0
lines changed

khiops/core/api.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,71 @@ def _preprocess_arguments(args):
198198
return command_line_options, system_settings, task_is_called_with_domain
199199

200200

201+
def _deprecate_legacy_data_path(data_path_task_arg_name, task_args):
    """Detect and replace legacy data paths with the current syntax

    A data path is considered legacy when, once split on "`", all of its
    fragments are non-empty and its first fragment equals the current
    dictionary name (``task_args["dictionary_name"]`` if present, otherwise
    ``task_args["train_dictionary_name"]``). Such a path is replaced with the
    remaining fragments joined on "/", and a deprecation warning is emitted.

    Parameters
    ----------
    data_path_task_arg_name : str
        Key of ``task_args`` whose value is a dictionary indexed by data paths
        (`str` or `bytes`). Nothing is done if the key is absent or if its
        value is ``None``.
    task_args : dict
        The task arguments. It must contain either ``dictionary_name`` or
        ``train_dictionary_name`` when there are data paths to process.

    .. note:: The function mutates task_args.
    """
    data_paths = task_args.get(data_path_task_arg_name)
    if data_paths is None:
        return

    assert "dictionary_name" in task_args or "train_dictionary_name" in task_args
    if "dictionary_name" in task_args:
        current_dictionary_name = task_args["dictionary_name"]
    else:
        current_dictionary_name = task_args["train_dictionary_name"]

    # Iterate over a snapshot of the keys: the loop body pops and inserts keys,
    # which is an error on a live dictionary view (RuntimeError on CPython
    # >= 3.8) and would make the loop revisit the freshly converted paths.
    for kdic_path in list(data_paths):
        # Pick separators (and the dictionary name) of the same type as the
        # path. The name is stored in a per-path variable so that a `bytes`
        # path does not alter the comparison for the following `str` paths.
        if isinstance(kdic_path, str):
            deprecated_data_path_separator = "`"
            data_path_separator = "/"
            expected_dictionary_name = current_dictionary_name
        else:
            assert isinstance(kdic_path, bytes)
            deprecated_data_path_separator = b"`"
            data_path_separator = b"/"
            if isinstance(current_dictionary_name, str):
                expected_dictionary_name = bytes(
                    current_dictionary_name, encoding="ascii"
                )
            else:
                expected_dictionary_name = current_dictionary_name

        # The path is legacy only if its split on "`" yields non-empty
        # fragments and the first fragment is the current dictionary name
        kdic_path_parts = kdic_path.split(deprecated_data_path_separator)
        if not all(len(path_part) > 0 for path_part in kdic_path_parts):
            continue
        if kdic_path_parts[0] != expected_dictionary_name:
            continue

        # Escape any "/" char in the path parts except for the current
        # dictionary, which is skipped from the new path
        new_kdic_path_parts = [
            kdic_path_part.replace(
                data_path_separator,
                deprecated_data_path_separator + data_path_separator,
            )
            for kdic_path_part in kdic_path_parts[1:]
        ]

        # Replace the legacy data path with the current data path
        new_kdic_path = data_path_separator.join(new_kdic_path_parts)
        data_paths[new_kdic_path] = data_paths.pop(kdic_path)

        # Build the printable versions of the paths only now, so that paths
        # which are not converted are never decoded. Undecodable bytes are
        # replaced instead of making the deprecation shim fail.
        if isinstance(kdic_path, bytes):
            kdic_path_for_warning = kdic_path.decode("ascii", errors="replace")
            new_kdic_path_for_warning = new_kdic_path.decode(
                "ascii", errors="replace"
            )
        else:
            kdic_path_for_warning = kdic_path
            new_kdic_path_for_warning = new_kdic_path

        warnings.warn(
            deprecation_message(
                "'`'-based dictionary data path: "
                f"'{kdic_path_for_warning}'",
                "11.0.1",
                replacement=(
                    "'/'-based dictionary data path "
                    f"convention: '{new_kdic_path_for_warning}'"
                ),
                quote=False,
            )
        )
264+
265+
201266
def _preprocess_task_arguments(task_args):
202267
"""Preprocessing of task arguments common to various tasks
203268
@@ -320,6 +385,14 @@ def _preprocess_task_arguments(task_args):
320385
)
321386
del task_args["max_variable_importances"]
322387

388+
# Detect and replace deprecated data-path syntax on additional_data_tables
389+
# Mutate task_args in the process
390+
for data_path_task_arg_name in (
391+
"additional_data_tables",
392+
"output_additional_data_tables",
393+
):
394+
_deprecate_legacy_data_path(data_path_task_arg_name, task_args)
395+
323396
# Flatten kwargs
324397
if "kwargs" in task_args:
325398
task_args.update(task_args["kwargs"])
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// Scenario for task check_database
2+
// Dictionary file and class settings
3+
ClassManagement.OpenFile
4+
ClassFileName Customer.kdic //
5+
OK
6+
7+
// Train database settings
8+
TrainDatabase.ClassName Customer
9+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key
10+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName Customer.csv //
11+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services
12+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName ServicesBidon.csv //
13+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services/Usages
14+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName UsagesBidon.csv //
15+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Address
16+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName AddressBidon.csv //
17+
TrainDatabase.DatabaseSpec.Data.HeaderLineUsed true
18+
TrainDatabase.DatabaseSpec.Data.FieldSeparator
19+
TrainDatabase.DatabaseSpec.Data.DatabaseFormatDetector.DetectFileFormat
20+
TrainDatabase.DatabaseSpec.Sampling.SampleNumberPercentage 100.0
21+
TrainDatabase.DatabaseSpec.Sampling.SamplingMode Include sample
22+
TrainDatabase.DatabaseSpec.Selection.SelectionAttribute
23+
TrainDatabase.DatabaseSpec.Selection.SelectionValue
24+
25+
// Log messages limit
26+
AnalysisSpec.SystemParameters.MaxErrorMessageNumberInLog 20
27+
28+
// Execute check database
29+
LearningTools.CheckData
30+
31+
// End of scenario for task check_database
32+
33+
// Exit Khiops
34+
ClassManagement.Quit
35+
OK
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Scenario for task deploy_model
2+
// Dictionary file settings
3+
ClassManagement.OpenFile
4+
ClassFileName Customer.kdic //
5+
OK
6+
7+
// Deploy settings
8+
LearningTools.TransferDatabase
9+
ClassName Customer
10+
11+
// Input database settings
12+
SourceDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key
13+
SourceDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName Customer.csv //
14+
SourceDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services
15+
SourceDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName ServicesBidon.csv //
16+
SourceDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services/Usages
17+
SourceDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName UsagesBidon.csv //
18+
SourceDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Address
19+
SourceDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName AddressBidon.csv //
20+
SourceDatabase.DatabaseSpec.Data.HeaderLineUsed true
21+
SourceDatabase.DatabaseSpec.Data.FieldSeparator
22+
SourceDatabase.DatabaseSpec.Data.DatabaseFormatDetector.DetectFileFormat
23+
SourceDatabase.DatabaseSpec.Sampling.SampleNumberPercentage 100.0
24+
SourceDatabase.DatabaseSpec.Sampling.SamplingMode Include sample
25+
SourceDatabase.DatabaseSpec.Selection.SelectionAttribute
26+
SourceDatabase.DatabaseSpec.Selection.SelectionValue
27+
28+
// Output database settings
29+
TargetDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key
30+
TargetDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName ./CustomerDeployed.csv //
31+
TargetDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services
32+
TargetDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName ./TransferServicesBidon.csv //
33+
TargetDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services/Usages
34+
TargetDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName ./TransferUsagesBidon.csv //
35+
TargetDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Address
36+
TargetDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName ./TransferAddressBidon.csv //
37+
TargetDatabase.DatabaseSpec.Data.HeaderLineUsed true
38+
TargetDatabase.DatabaseSpec.Data.FieldSeparator
39+
40+
// Transfer
41+
TransferDatabase
42+
Exit
43+
44+
// End of scenario for task deploy_model
45+
46+
// Exit Khiops
47+
ClassManagement.Quit
48+
OK
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// Scenario for task evaluate_predictor
2+
// Dictionary file settings
3+
ClassManagement.OpenFile
4+
ClassFileName ModelingCustomer.kdic //
5+
OK
6+
7+
// Evaluate predictor settings
8+
LearningTools.EvaluatePredictors
9+
MainTargetModality
10+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key
11+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName Customer.csv //
12+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services
13+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName ServicesBidon.csv //
14+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services/Usages
15+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName UsagesBidon.csv //
16+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Address
17+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName AddressBidon.csv //
18+
EvaluationDatabase.DatabaseSpec.Data.HeaderLineUsed true
19+
EvaluationDatabase.DatabaseSpec.Data.FieldSeparator
20+
EvaluationDatabase.DatabaseSpec.Data.DatabaseFormatDetector.DetectFileFormat
21+
EvaluationDatabase.DatabaseSpec.Sampling.SampleNumberPercentage 100.0
22+
EvaluationDatabase.DatabaseSpec.Sampling.SamplingMode Include sample
23+
EvaluatedPredictors.List.Key Customer
24+
EvaluationDatabase.DatabaseSpec.Selection.SelectionAttribute
25+
EvaluationDatabase.DatabaseSpec.Selection.SelectionValue
26+
ExportAsXls false
27+
EvaluationFileName CustomerResults\CustomerAnalysisResults.khj //
28+
29+
// Evaluate predictor
30+
EvaluatePredictors
31+
Exit
32+
33+
// End of scenario for task evaluate_predictor
34+
35+
// Exit Khiops
36+
ClassManagement.Quit
37+
OK
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// Scenario for task train_coclustering
2+
// Dictionary file and class settings
3+
ClassManagement.OpenFile
4+
ClassFileName Customer.kdic //
5+
OK
6+
7+
// Train database settings
8+
Database.ClassName Customer
9+
Database.DatabaseSpec.Data.DatabaseFiles.List.Key
10+
Database.DatabaseSpec.Data.DatabaseFiles.DataTableName Customer.csv //
11+
Database.DatabaseSpec.Data.DatabaseFiles.List.Key Services
12+
Database.DatabaseSpec.Data.DatabaseFiles.DataTableName ServicesBidon.csv //
13+
Database.DatabaseSpec.Data.DatabaseFiles.List.Key Services/Usages
14+
Database.DatabaseSpec.Data.DatabaseFiles.DataTableName UsagesBidon.csv //
15+
Database.DatabaseSpec.Data.DatabaseFiles.List.Key Address
16+
Database.DatabaseSpec.Data.DatabaseFiles.DataTableName AddressBidon.csv //
17+
Database.DatabaseSpec.Data.HeaderLineUsed true
18+
Database.DatabaseSpec.Data.FieldSeparator
19+
Database.DatabaseSpec.Data.DatabaseFormatDetector.DetectFileFormat
20+
Database.DatabaseSpec.Sampling.SampleNumberPercentage 100.0
21+
Database.DatabaseSpec.Sampling.SamplingMode Include sample
22+
Database.DatabaseSpec.Selection.SelectionAttribute
23+
Database.DatabaseSpec.Selection.SelectionValue
24+
25+
// Coclustering variables settings
26+
AnalysisSpec.CoclusteringParameters.Attributes.InsertItemAfter
27+
AnalysisSpec.CoclusteringParameters.Attributes.Name id_customer
28+
AnalysisSpec.CoclusteringParameters.Attributes.InsertItemAfter
29+
AnalysisSpec.CoclusteringParameters.Attributes.Name Name
30+
AnalysisSpec.CoclusteringParameters.FrequencyAttributeName
31+
32+
// Minimum optimization time
33+
AnalysisSpec.SystemParameters.OptimizationTime 0
34+
35+
// Output settings
36+
AnalysisResults.CoclusteringFileName CustomerResults/CustomerCoclusteringResults._khcj //
37+
38+
// Train
39+
BuildCoclustering
40+
41+
// End of scenario for task train_coclustering
42+
43+
// Exit Khiops
44+
ClassManagement.Quit
45+
OK
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// Scenario for task train_predictor
2+
// Dictionary file and class settings
3+
ClassManagement.OpenFile
4+
ClassFileName Customer.kdic //
5+
OK
6+
7+
// Train/test database settings
8+
TrainDatabase.ClassName Customer
9+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key
10+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName Customer.csv //
11+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services
12+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName ServicesBidon.csv //
13+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Services/Usages
14+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName UsagesBidon.csv //
15+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.List.Key Address
16+
TrainDatabase.DatabaseSpec.Data.DatabaseFiles.DataTableName AddressBidon.csv //
17+
TrainDatabase.DatabaseSpec.Data.HeaderLineUsed true
18+
TrainDatabase.DatabaseSpec.Data.FieldSeparator
19+
TrainDatabase.DatabaseSpec.Data.DatabaseFormatDetector.DetectFileFormat
20+
TrainDatabase.DatabaseSpec.Sampling.SampleNumberPercentage 70.0
21+
TrainDatabase.DatabaseSpec.Sampling.SamplingMode Include sample
22+
TrainDatabase.DatabaseSpec.Selection.SelectionAttribute
23+
TrainDatabase.DatabaseSpec.Selection.SelectionValue
24+
TrainDatabase.TestDatabaseSpecificationMode Complementary
25+
26+
// Target variable
27+
AnalysisSpec.TargetAttributeName
28+
AnalysisSpec.MainTargetModality
29+
30+
// Do data preparation only
31+
AnalysisSpec.PredictorsSpec.AdvancedSpec.DataPreparationOnly false
32+
33+
// Selective Naive Bayes settings
34+
AnalysisSpec.PredictorsSpec.AdvancedSpec.SelectiveNaiveBayesParameters.TrainParameters.MaxEvaluatedAttributeNumber 0
35+
AnalysisSpec.PredictorsSpec.AdvancedSpec.SelectiveNaiveBayesParameters.SelectionParameters.MaxSelectedAttributeNumber 0
36+
37+
// Feature engineering
38+
AnalysisSpec.PredictorsSpec.ConstructionSpec.MaxTextFeatureNumber 10000
39+
AnalysisSpec.PredictorsSpec.ConstructionSpec.MaxTreeNumber 10
40+
AnalysisSpec.PredictorsSpec.ConstructionSpec.MaxAttributePairNumber 0
41+
AnalysisSpec.PredictorsSpec.AdvancedSpec.InspectAttributePairsParameters
42+
AllAttributePairs true
43+
Exit
44+
AnalysisSpec.PredictorsSpec.ConstructionSpec.KeepSelectedAttributesOnly true
45+
AnalysisSpec.PredictorsSpec.ConstructionSpec.MaxConstructedAttributeNumber 1000
46+
AnalysisSpec.PredictorsSpec.AdvancedSpec.InspectConstructionDomain
47+
Exit
48+
49+
// Text feature parameters
50+
AnalysisSpec.PredictorsSpec.AdvancedSpec.InspectTextFeaturesParameters
51+
TextFeatures words
52+
Exit
53+
54+
55+
// Data preparation (discretization & grouping) settings
56+
AnalysisSpec.PreprocessingSpec.TargetGrouped false
57+
AnalysisSpec.PreprocessingSpec.InspectAdvancedParameters
58+
DiscretizerUnsupervisedMethodName MODL
59+
GrouperUnsupervisedMethodName MODL
60+
Exit
61+
62+
// Max parts
63+
AnalysisSpec.PreprocessingSpec.MaxPartNumber 0
64+
65+
// Output settings
66+
AnalysisResults.ReportFileName CustomerResults/CustomerAnalysisResults._khj //
67+
68+
// Build model
69+
ComputeStats
70+
71+
// End of scenario for task train_predictor
72+
73+
// Exit Khiops
74+
ClassManagement.Quit
75+
OK

0 commit comments

Comments
 (0)