Merge pull request #379 from KhiopsML/330-add-text-and-textlist-type-support-to-the-core-apis

popescu-v · web-flow · commit 0c8137f9c900 · 2025-04-02T19:10:57.000+02:00
330 add text and textlist type support to the core apis
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -1,13 +1,13 @@
 ---
 name: Tests
 env:
-  DEFAULT_SAMPLES_REVISION: 10.2.4
+  DEFAULT_SAMPLES_REVISION: 11.0.0
   DEFAULT_KHIOPS_DESKTOP_REVISION: 10.6.0-b.0
 on:
   workflow_dispatch:
     inputs:
       samples-revision:
-        default: 10.2.4
+        default: 11.0.0
         description: Git Tag/Branch/Commit for the khiops-samples Repo
       image-tag:
         default: 10.6.0-b.0.0
diff --git a/doc/samples/samples.rst b/doc/samples/samples.rst
@@ -212,6 +212,35 @@ Samples
 
     # If you have Khiops Visualization installed you may open the report as follows
     # kh.visualize_report(report_file_path)
+.. autofunction:: train_predictor_text
+.. code-block:: python
+
+    # Imports
+    import os
+    from khiops import core as kh
+
+    # Set the file paths
+    dictionary_file_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
+    )
+    data_table_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
+    )
+    report_file_path = os.path.join(
+        "kh_samples", "train_predictor_text", "AnalysisResults.khj"
+    )
+
+    # Train the predictor
+    kh.train_predictor(
+        dictionary_file_path,
+        "FlightNegativeTweets",
+        data_table_path,
+        "negativereason",
+        report_file_path,
+        max_trees=5,
+        max_text_features=1000,
+        text_features="words",
+    )
 .. autofunction:: train_predictor_error_handling
 .. code-block:: python
 
@@ -948,6 +977,44 @@ Samples
     kh.deploy_model(
         model_dictionary_file_path, "SNB_Adult", data_table_path, output_data_table_path
     )
+.. autofunction:: deploy_model_text
+.. code-block:: python
+
+    # Imports
+    import os
+    from khiops import core as kh
+
+    # Set the file paths
+    dictionary_file_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
+    )
+    data_table_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
+    )
+    output_dir = os.path.join("kh_samples", "deploy_model_text")
+    report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
+    output_data_table_path = os.path.join(output_dir, "ScoresNegativeAirlineTweets.txt")
+
+    # Train the predictor
+    _, model_dictionary_file_path = kh.train_predictor(
+        dictionary_file_path,
+        "FlightNegativeTweets",
+        data_table_path,
+        "negativereason",
+        report_file_path,
+        max_trees=5,
+        max_text_features=1000,
+        text_features="words",
+    )
+
+    # Deploy the model on the database
+    # It will score it according to the trained predictor
+    kh.deploy_model(
+        model_dictionary_file_path,
+        "SNB_FlightNegativeTweets",
+        data_table_path,
+        output_data_table_path,
+    )
 .. autofunction:: deploy_model_mt
 .. code-block:: python
 
diff --git a/khiops/core/dictionary.py b/khiops/core/dictionary.py
@@ -70,7 +70,7 @@ def _format_name(name):
 def _quote_value(value):
     """Double-quotes a string
 
-    Categorical and metadata values are quoted with this method.
+    Categorical, Text and metadata values are quoted with this method.
     """
     if isinstance(value, str):
         quoted_value = '"' + value.replace('"', '""') + '"'
@@ -1075,7 +1075,16 @@ def is_native(self):
             ``True`` if a variables comes directly from a data column.
 
         """
-        base_types = ["Categorical", "Numerical", "Time", "Date", "Timestamp"]
+        base_types = [
+            "Categorical",
+            "Numerical",
+            "Time",
+            "Date",
+            "Timestamp",
+            "TimestampTZ",
+            "Text",
+            "TextList",
+        ]
         if self.variable_block is None:
             return self.rule == "" and self.type in base_types
         return self.variable_block.rule == ""
diff --git a/khiops/samples/samples.ipynb b/khiops/samples/samples.ipynb
@@ -278,6 +278,48 @@
     "# kh.visualize_report(report_file_path)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `train_predictor_text()`\n\n",
+    "Trains a predictor with just text-specific parameters\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "from khiops import core as kh\n",
+    "\n",
+    "# Set the file paths\n",
+    "dictionary_file_path = os.path.join(\n",
+    "    kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.kdic\"\n",
+    ")\n",
+    "data_table_path = os.path.join(\n",
+    "    kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
+    ")\n",
+    "report_file_path = os.path.join(\n",
+    "    \"kh_samples\", \"train_predictor_text\", \"AnalysisResults.khj\"\n",
+    ")\n",
+    "\n",
+    "# Train the predictor\n",
+    "kh.train_predictor(\n",
+    "    dictionary_file_path,\n",
+    "    \"FlightNegativeTweets\",\n",
+    "    data_table_path,\n",
+    "    \"negativereason\",\n",
+    "    report_file_path,\n",
+    "    max_trees=5,\n",
+    "    max_text_features=1000,\n",
+    "    text_features=\"words\",\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1248,6 +1290,57 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `deploy_model_text()`\n\n",
+    "Deploys a model learned on textual data\n\nThe predictor is first trained with text-specific parameters; the resulting model is then deployed with a call to `~.api.deploy_model` with its mandatory parameters.\n\nIn this example, a Selective Naive Bayes (SNB) model is deployed by applying its associated dictionary to the input database. The model predictions are written to the output database.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "from khiops import core as kh\n",
+    "\n",
+    "# Set the file paths\n",
+    "dictionary_file_path = os.path.join(\n",
+    "    kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.kdic\"\n",
+    ")\n",
+    "data_table_path = os.path.join(\n",
+    "    kh.get_samples_dir(), \"NegativeAirlineTweets\", \"NegativeAirlineTweets.txt\"\n",
+    ")\n",
+    "output_dir = os.path.join(\"kh_samples\", \"deploy_model_text\")\n",
+    "report_file_path = os.path.join(output_dir, \"AnalysisResults.khj\")\n",
+    "output_data_table_path = os.path.join(output_dir, \"ScoresNegativeAirlineTweets.txt\")\n",
+    "\n",
+    "# Train the predictor\n",
+    "_, model_dictionary_file_path = kh.train_predictor(\n",
+    "    dictionary_file_path,\n",
+    "    \"FlightNegativeTweets\",\n",
+    "    data_table_path,\n",
+    "    \"negativereason\",\n",
+    "    report_file_path,\n",
+    "    max_trees=5,\n",
+    "    max_text_features=1000,\n",
+    "    text_features=\"words\",\n",
+    ")\n",
+    "\n",
+    "# Deploy the model on the database\n",
+    "# It will score it according to the trained predictor\n",
+    "kh.deploy_model(\n",
+    "    model_dictionary_file_path,\n",
+    "    \"SNB_FlightNegativeTweets\",\n",
+    "    data_table_path,\n",
+    "    output_data_table_path,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/khiops/samples/samples.py b/khiops/samples/samples.py
@@ -233,6 +233,36 @@ def train_predictor_file_paths():
     # kh.visualize_report(report_file_path)
 
 
+def train_predictor_text():
+    """Trains a predictor with just text-specific parameters"""
+    # Imports
+    import os
+    from khiops import core as kh
+
+    # Set the file paths
+    dictionary_file_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
+    )
+    data_table_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
+    )
+    report_file_path = os.path.join(
+        "kh_samples", "train_predictor_text", "AnalysisResults.khj"
+    )
+
+    # Train the predictor
+    kh.train_predictor(
+        dictionary_file_path,
+        "FlightNegativeTweets",
+        data_table_path,
+        "negativereason",
+        report_file_path,
+        max_trees=5,
+        max_text_features=1000,
+        text_features="words",
+    )
+
+
 def train_predictor_error_handling():
     """Shows how to handle errors when training a predictor
 
@@ -1059,6 +1089,52 @@ def deploy_model():
     )
 
 
+def deploy_model_text():
+    """Deploys a model learned on textual data
+
+    It calls `~.api.deploy_model` with its mandatory parameters on a text-trained model.
+
+    In this example, a Selective Naive Bayes (SNB) model is deployed by applying its
+    associated dictionary to the input database. The model predictions are written to
+    the output database.
+    """
+    # Imports
+    import os
+    from khiops import core as kh
+
+    # Set the file paths
+    dictionary_file_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.kdic"
+    )
+    data_table_path = os.path.join(
+        kh.get_samples_dir(), "NegativeAirlineTweets", "NegativeAirlineTweets.txt"
+    )
+    output_dir = os.path.join("kh_samples", "deploy_model_text")
+    report_file_path = os.path.join(output_dir, "AnalysisResults.khj")
+    output_data_table_path = os.path.join(output_dir, "ScoresNegativeAirlineTweets.txt")
+
+    # Train the predictor
+    _, model_dictionary_file_path = kh.train_predictor(
+        dictionary_file_path,
+        "FlightNegativeTweets",
+        data_table_path,
+        "negativereason",
+        report_file_path,
+        max_trees=5,
+        max_text_features=1000,
+        text_features="words",
+    )
+
+    # Deploy the model on the database
+    # It will score it according to the trained predictor
+    kh.deploy_model(
+        model_dictionary_file_path,
+        "SNB_FlightNegativeTweets",
+        data_table_path,
+        output_data_table_path,
+    )
+
+
 def deploy_model_mt():
     """Deploys a multi-table classifier in the simplest way possible
 
@@ -1811,6 +1887,7 @@ def build_deployed_dictionary():
     export_dictionary_files,
     train_predictor,
     train_predictor_file_paths,
+    train_predictor_text,
     train_predictor_error_handling,
     train_predictor_mt,
     train_predictor_mt_with_specific_rules,
@@ -1829,6 +1906,7 @@ def build_deployed_dictionary():
     train_recoder_with_multiple_parameters,
     train_recoder_mt_flatten,
     deploy_model,
+    deploy_model_text,
     deploy_model_mt,
     deploy_model_mt_with_interpretation,
     deploy_model_mt_snowflake,