Commit dc86850

Merge pull request #21 from dataiku/feature/spacy-error-handling
Improve spacy error handling
2 parents f1ad043 + c56ca63 commit dc86850

File tree: 3 files changed, +28 −13 lines changed


python-lib/spacy_tokenizer.py

Lines changed: 15 additions & 13 deletions

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Module with a class to tokenize text data in multiple languages"""
 
+
 import regex as re
 import os
 import logging
@@ -51,12 +52,13 @@
 ALL_UNITS = ORDER_UNITS | WEIGHT_UNITS | DISTANCE_SPEED_UNITS | VOLUME_UNITS | MISC_UNITS
 Token.set_extension(
     "is_measure",
-    getter=lambda token: not token.like_num  # avoid conflict with existing token attribute
+    getter=lambda token: not token.like_num
     and not getattr(token._, "is_datetime", False)
     and token.text[:1].isdigit()
-    and any([re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS]),
+    and any(re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS),
     force=True,
 )
+
 INVISIBLE_CHARS_REGEX = re.compile(
     r"(\p{C}|\p{Z}|\p{M})+"
 )  # matches unicode categories C (control chars), Z (separators) and M (marks)
@@ -95,7 +97,7 @@ class MultilingualTokenizer:
 
     DEFAULT_BATCH_SIZE = 1000
     MAX_NUM_CHARACTERS = 10 ** 7
-    DEFAULT_NUM_PROCESS = 2
+    DEFAULT_NUM_PROCESS = 1
     DEFAULT_FILTER_TOKEN_ATTRIBUTES = {
         "is_space": "Whitespace",
         "is_punct": "Punctuation",
@@ -123,7 +125,7 @@ def __init__(
         use_models: bool = False,
         hashtags_as_token: bool = True,
         batch_size: int = DEFAULT_BATCH_SIZE,
-        max_num_characters: int = MAX_NUM_CHARACTERS
+        max_num_characters: int = MAX_NUM_CHARACTERS,
     ):
         """Initialization method for the MultilingualTokenizer class, with optional arguments
@@ -258,17 +260,17 @@ def tokenize_list(self, text_list: List[AnyStr], language: AnyStr) -> List[Doc]:
         text_list = [str(t) if pd.notnull(t) else "" for t in text_list]
         try:
             self._add_spacy_tokenizer(language)
-            tokenized = list(
-                self.spacy_nlp_dict[language].pipe(
-                    text_list, batch_size=self.batch_size, n_process=self.DEFAULT_NUM_PROCESS
-                )
-            )
-            logging.info(
-                f"Tokenizing {len(tokenized)} document(s) in language '{language}': "
-                + f"done in {perf_counter() - start:.2f} seconds"
-            )
         except TokenizationError as e:
             raise TokenizationError(f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'")
+        tokenized = list(
+            self.spacy_nlp_dict[language].pipe(
+                text_list, batch_size=self.batch_size, n_process=self.DEFAULT_NUM_PROCESS
+            )
+        )
+        logging.info(
+            f"Tokenizing {len(tokenized)} document(s) in language '{language}': "
+            + f"done in {perf_counter() - start:.2f} seconds"
+        )
         return tokenized
 
     def tokenize_df(
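
Two changes stand out in this file: the pipe() call and the timing log move out of the try/except TokenizationError block in tokenize_list, so only failures while loading the spaCy tokenizer are re-raised with the truncated document list, and the is_measure getter now passes a generator expression to any() instead of building a list first. As a rough, standalone illustration of that unit check (no spaCy involved), the sketch below uses placeholder values for ALL_UNITS and NUMERIC_SEPARATOR_REGEX; the real definitions live elsewhere in the module.

import regex as re  # the plugin imports the third-party "regex" package; stdlib "re" also works here

NUMERIC_SEPARATOR_REGEX = re.compile(r"[ .,]")  # assumed: strips thousand/decimal separators
ALL_UNITS = {"kg", "km", "ml"}                  # assumed small subset of the plugin's units

def looks_like_measure(token_text: str) -> bool:
    lowered = token_text.lower()
    digits_and_unit = re.sub(NUMERIC_SEPARATOR_REGEX, "", lowered)
    return (
        lowered[:1].isdigit()
        # generator expression, as in the new code: True if stripping any one
        # known unit leaves only digits, e.g. "1,000kg" -> "1000kg" -> "1000"
        and any(digits_and_unit.replace(unit, "").isdigit() for unit in ALL_UNITS)
    )

print(looks_like_measure("1,000kg"))  # True
print(looks_like_measure("hello"))    # False: does not start with a digit

In the actual getter the check runs per spaCy token and is further guarded by not token.like_num and the is_datetime extension, which the sketch above omits.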

tests/python/integration/test_wordcloud.py

Lines changed: 5 additions & 0 deletions

@@ -53,3 +53,8 @@ def test_wordcloud_unpartitioned_folder_file(user_dss_clients):
 
 def test_wordcloud_unpartitioned_folder_sql(user_dss_clients):
     dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="unpartitionned_folder_sql")
+
+
+def test_wordcloud_long_text(user_dss_clients):
+    dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="TEST_LONGTEXT")
+

tests/python/unit/test_spacy_tokenizer.py

Lines changed: 8 additions & 0 deletions

@@ -5,6 +5,7 @@
 
 import os
 
+import pytest
 import pandas as pd
 
 from spacy_tokenizer import MultilingualTokenizer
@@ -45,3 +46,10 @@ def test_tokenize_df_multilingual():
     tokenized_documents = output_df[tokenizer.tokenized_column]
     tokenized_documents_length = [len(doc) for doc in tokenized_documents]
     assert tokenized_documents_length == [12, 8, 13, 9]
+
+
+def test_tokenize_df_long_text():
+    input_df = pd.DataFrame({"input_text": ["Long text"]})
+    tokenizer = MultilingualTokenizer(max_num_characters=1)
+    with pytest.raises(ValueError):
+        tokenizer.tokenize_df(df=input_df, text_column="input_text", language="en")
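
The added unit test pins down the expected failure mode: constructing MultilingualTokenizer(max_num_characters=1) and calling tokenize_df on a longer document should raise ValueError rather than fail deep inside spaCy. A minimal usage sketch of how calling code might surface that error; the handling and message below are illustrative, not taken from the plugin.

import pandas as pd
from spacy_tokenizer import MultilingualTokenizer

input_df = pd.DataFrame({"input_text": ["Long text"]})
tokenizer = MultilingualTokenizer(max_num_characters=1)  # deliberately tiny limit, as in the test

try:
    tokenizer.tokenize_df(df=input_df, text_column="input_text", language="en")
except ValueError as error:
    # Surface the document-length problem to the caller instead of crashing later.
    print(f"Could not tokenize column 'input_text': {error}")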
