Skip to content

Commit 0c8137f

Browse files
authored
Merge pull request #379 from KhiopsML/330-add-text-and-textlist-type-support-to-the-core-apis
330 add text and textlist type support to the core apis
2 parents 401c465 + 1d3d80d commit 0c8137f

File tree

5 files changed

+251
-4
lines changed

5 files changed

+251
-4
lines changed

.github/workflows/tests.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
---
22
name: Tests
33
env:
4-
DEFAULT_SAMPLES_REVISION: 10.2.4
4+
DEFAULT_SAMPLES_REVISION: 11.0.0
55
DEFAULT_KHIOPS_DESKTOP_REVISION: 10.6.0-b.0
66
on:
77
workflow_dispatch:
88
inputs:
99
samples-revision:
10-
default: 10.2.4
10+
default: 11.0.0
1111
description: Git Tag/Branch/Commit for the khiops-samples Repo
1212
image-tag:
1313
default: 10.6.0-b.0.0

doc/samples/samples.rst

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,35 @@ Samples
212212
213213
# If you have Khiops Visualization installed you may open the report as follows
214214
# kh.visualize_report(report_file_path)
215+
.. autofunction:: train_predictor_text
216+
.. code-block:: python
217+
218+
# Imports
219+
import os
220+
from khiops import core as kh
221+
222+
# Set the file paths
223+
dictionary_file_path = os.path.join(
224+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
225+
)
226+
data_table_path = os.path.join(
227+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
228+
)
229+
report_file_path = os.path.join(
230+
"kh_samples", "train_predictor_text", "AnalysisResults.khj"
231+
)
232+
233+
# Train the predictor
234+
kh.train_predictor(
235+
dictionary_file_path,
236+
"FlightNegativeTweets",
237+
data_table_path,
238+
"negativereason",
239+
report_file_path,
240+
max_trees=5,
241+
max_text_features=1000,
242+
text_features="words",
243+
)
215244
.. autofunction:: train_predictor_error_handling
216245
.. code-block:: python
217246
@@ -948,6 +977,44 @@ Samples
948977
kh.deploy_model(
949978
model_dictionary_file_path, "SNB_Adult", data_table_path, output_data_table_path
950979
)
980+
.. autofunction:: deploy_model_text
981+
.. code-block:: python
982+
983+
# Imports
984+
import os
985+
from khiops import core as kh
986+
987+
# Set the file paths
988+
dictionary_file_path = os.path.join(
989+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
990+
)
991+
data_table_path = os.path.join(
992+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
993+
)
994+
output_dir = os.path.join("kh_samples", "deploy_model_text")
995+
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
996+
output_data_table_path = os.path.join(output_dir, "ScoresNegativeAirlineTweets.txt")
997+
998+
# Train the predictor
999+
_, model_dictionary_file_path = kh.train_predictor(
1000+
dictionary_file_path,
1001+
"FlightNegativeTweets",
1002+
data_table_path,
1003+
"negativereason",
1004+
report_file_path,
1005+
max_trees=5,
1006+
max_text_features=1000,
1007+
text_features="words",
1008+
)
1009+
1010+
# Deploy the model on the database
1011+
# It will score it according to the trained predictor
1012+
kh.deploy_model(
1013+
model_dictionary_file_path,
1014+
"SNB_FlightNegativeTweets",
1015+
data_table_path,
1016+
output_data_table_path,
1017+
)
9511018
.. autofunction:: deploy_model_mt
9521019
.. code-block:: python
9531020

khiops/core/dictionary.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def _format_name(name):
7070
def _quote_value(value):
7171
"""Double-quotes a string
7272
73-
Categorical and metadata values are quoted with this method.
73+
Categorical, Text and metadata values are quoted with this method.
7474
"""
7575
if isinstance(value, str):
7676
quoted_value = '"' + value.replace('"', '""') + '"'
@@ -1075,7 +1075,16 @@ def is_native(self):
10751075
``True`` if a variable comes directly from a data column.
10761076
10771077
"""
1078-
base_types = ["Categorical", "Numerical", "Time", "Date", "Timestamp"]
1078+
base_types = [
1079+
"Categorical",
1080+
"Numerical",
1081+
"Time",
1082+
"Date",
1083+
"Timestamp",
1084+
"TimestampTZ",
1085+
"Text",
1086+
"TextList",
1087+
]
10791088
if self.variable_block is None:
10801089
return self.rule == "" and self.type in base_types
10811090
return self.variable_block.rule == ""

khiops/samples/samples.ipynb

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,48 @@
278278
"# kh.visualize_report(report_file_path)"
279279
]
280280
},
281+
{
282+
"cell_type": "markdown",
283+
"metadata": {},
284+
"source": [
285+
"### `train_predictor_text()`\n\n",
286+
"Trains a predictor with just text-specific parameters\n"
287+
]
288+
},
289+
{
290+
"cell_type": "code",
291+
"execution_count": null,
292+
"metadata": {},
293+
"outputs": [],
294+
"source": [
295+
"# Imports\n",
296+
"import os\n",
297+
"from khiops import core as kh\n",
298+
"\n",
299+
"# Set the file paths\n",
300+
"dictionary_file_path = os.path.join(\n",
301+
" kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.kdic\"\n",
302+
")\n",
303+
"data_table_path = os.path.join(\n",
304+
" kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
305+
")\n",
306+
"report_file_path = os.path.join(\n",
307+
" \"kh_samples\", \"train_predictor_text\", \"AnalysisResults.khj\"\n",
308+
")\n",
309+
"\n",
310+
"# Train the predictor\n",
311+
"kh.train_predictor(\n",
312+
" dictionary_file_path,\n",
313+
" \"FlightNegativeTweets\",\n",
314+
" data_table_path,\n",
315+
" \"negativereason\",\n",
316+
" report_file_path,\n",
317+
" max_trees=5,\n",
318+
" max_text_features=1000,\n",
319+
" text_features=\"words\",\n",
320+
")"
321+
]
322+
},
281323
{
282324
"cell_type": "markdown",
283325
"metadata": {},
@@ -1248,6 +1290,57 @@
12481290
")"
12491291
]
12501292
},
1293+
{
1294+
"cell_type": "markdown",
1295+
"metadata": {},
1296+
"source": [
1297+
"### `deploy_model_text()`\n\n",
1298+
"Deploys a model learned on textual data\n It is a call to `~.api.deploy_model` with its mandatory parameters, plus\n text-specific parameters.\n\n In this example, a Selective Naive Bayes (SNB) model is deployed by applying its\n associated dictionary to the input database. The model predictions are written to\n the output database.\n \n"
1299+
]
1300+
},
1301+
{
1302+
"cell_type": "code",
1303+
"execution_count": null,
1304+
"metadata": {},
1305+
"outputs": [],
1306+
"source": [
1307+
"# Imports\n",
1308+
"import os\n",
1309+
"from khiops import core as kh\n",
1310+
"\n",
1311+
"# Set the file paths\n",
1312+
"dictionary_file_path = os.path.join(\n",
1313+
" kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.kdic\"\n",
1314+
")\n",
1315+
"data_table_path = os.path.join(\n",
1316+
" kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
1317+
")\n",
1318+
"output_dir = os.path.join(\"kh_samples\", \"deploy_model_text\")\n",
1319+
"report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n",
1320+
"output_data_table_path = os.path.join(output_dir, \"ScoresNegativeAirlineTweets.txt\")\n",
1321+
"\n",
1322+
"# Train the predictor\n",
1323+
"_, model_dictionary_file_path = kh.train_predictor(\n",
1324+
" dictionary_file_path,\n",
1325+
" \"FlightNegativeTweets\",\n",
1326+
" data_table_path,\n",
1327+
" \"negativereason\",\n",
1328+
" report_file_path,\n",
1329+
" max_trees=5,\n",
1330+
" max_text_features=1000,\n",
1331+
" text_features=\"words\",\n",
1332+
")\n",
1333+
"\n",
1334+
"# Deploy the model on the database\n",
1335+
"# It will score it according to the trained predictor\n",
1336+
"kh.deploy_model(\n",
1337+
" model_dictionary_file_path,\n",
1338+
" \"SNB_FlightNegativeTweets\",\n",
1339+
" data_table_path,\n",
1340+
" output_data_table_path,\n",
1341+
")"
1342+
]
1343+
},
12511344
{
12521345
"cell_type": "markdown",
12531346
"metadata": {},

khiops/samples/samples.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,36 @@ def train_predictor_file_paths():
233233
# kh.visualize_report(report_file_path)
234234

235235

236+
def train_predictor_text():
237+
"""Trains a predictor with just text-specific parameters"""
238+
# Imports
239+
import os
240+
from khiops import core as kh
241+
242+
# Set the file paths
243+
dictionary_file_path = os.path.join(
244+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
245+
)
246+
data_table_path = os.path.join(
247+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
248+
)
249+
report_file_path = os.path.join(
250+
"kh_samples", "train_predictor_text", "AnalysisResults.khj"
251+
)
252+
253+
# Train the predictor
254+
kh.train_predictor(
255+
dictionary_file_path,
256+
"FlightNegativeTweets",
257+
data_table_path,
258+
"negativereason",
259+
report_file_path,
260+
max_trees=5,
261+
max_text_features=1000,
262+
text_features="words",
263+
)
264+
265+
236266
def train_predictor_error_handling():
237267
"""Shows how to handle errors when training a predictor
238268
@@ -1059,6 +1089,52 @@ def deploy_model():
10591089
)
10601090

10611091

1092+
def deploy_model_text():
1093+
"""Deploys a model learned on textual data
1094+
It first trains a predictor with text-specific parameters and then calls
1095+
`~.api.deploy_model` with only its mandatory parameters.
1096+
1097+
In this example, a Selective Naive Bayes (SNB) model is deployed by applying its
1098+
associated dictionary to the input database. The model predictions are written to
1099+
the output database.
1100+
"""
1101+
# Imports
1102+
import os
1103+
from khiops import core as kh
1104+
1105+
# Set the file paths
1106+
dictionary_file_path = os.path.join(
1107+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
1108+
)
1109+
data_table_path = os.path.join(
1110+
kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
1111+
)
1112+
output_dir = os.path.join("kh_samples", "deploy_model_text")
1113+
report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
1114+
output_data_table_path = os.path.join(output_dir, "ScoresNegativeAirlineTweets.txt")
1115+
1116+
# Train the predictor
1117+
_, model_dictionary_file_path = kh.train_predictor(
1118+
dictionary_file_path,
1119+
"FlightNegativeTweets",
1120+
data_table_path,
1121+
"negativereason",
1122+
report_file_path,
1123+
max_trees=5,
1124+
max_text_features=1000,
1125+
text_features="words",
1126+
)
1127+
1128+
# Deploy the model on the database
1129+
# It will score it according to the trained predictor
1130+
kh.deploy_model(
1131+
model_dictionary_file_path,
1132+
"SNB_FlightNegativeTweets",
1133+
data_table_path,
1134+
output_data_table_path,
1135+
)
1136+
1137+
10621138
def deploy_model_mt():
10631139
"""Deploys a multi-table classifier in the simplest way possible
10641140
@@ -1811,6 +1887,7 @@ def build_deployed_dictionary():
18111887
export_dictionary_files,
18121888
train_predictor,
18131889
train_predictor_file_paths,
1890+
train_predictor_text,
18141891
train_predictor_error_handling,
18151892
train_predictor_mt,
18161893
train_predictor_mt_with_specific_rules,
@@ -1829,6 +1906,7 @@ def build_deployed_dictionary():
18291906
train_recoder_with_multiple_parameters,
18301907
train_recoder_mt_flatten,
18311908
deploy_model,
1909+
deploy_model_text,
18321910
deploy_model_mt,
18331911
deploy_model_mt_with_interpretation,
18341912
deploy_model_mt_snowflake,

0 commit comments

Comments
 (0)