+# -*- coding: utf-8 -*-
 """Module with a class to tokenize text data in multiple languages"""
 
 import regex as re
     getter=lambda token: any(c in UNICODE_EMOJI for c in token.text),
     force=True,
 )
-SYMBOL_CHARS_REGEX = re.compile(r"(\p{M}|\p{S})+")  # matches unicode categories M (marks) and S (symbols)
+SYMBOL_CHARS_REGEX = re.compile(
+    r"(\p{M}|\p{S})+"
+)  # matches unicode categories M (marks) and S (symbols)
 Token.set_extension(
     "is_symbol",
     getter=lambda token: not token.is_punct
     and not re.sub(SYMBOL_CHARS_REGEX, "", token.text).strip(),
     force=True,
 )
-DATETIME_REGEX = re.compile(r"(:|-|\.|\/|am|pm|hrs|hr|h|minutes|mins|min|sec|s|ms|ns|y)+", flags=re.IGNORECASE)
+DATETIME_REGEX = re.compile(
+    r"(:|-|\.|\/|am|pm|hrs|hr|h|minutes|mins|min|sec|s|ms|ns|y)+", flags=re.IGNORECASE
+)
 Token.set_extension(
     "is_datetime",
     getter=lambda token: not token.like_num  # avoid conflict with existing token attribute
     getter=lambda token: not token.like_num  # avoid conflict with existing token attribute
     and not getattr(token._, "is_datetime", False)
     and token.text[:1].isdigit()
-    and any([re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS]),
+    and any(
+        [
+            re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit()
+            for unit in ALL_UNITS
+        ]
+    ),
     force=True,
 )
 INVISIBLE_CHARS_REGEX = re.compile(
     r"(\p{C}|\p{Z}|\p{M})+"
 )  # matches unicode categories C (control chars), Z (separators) and M (marks)
 Token.set_extension(
     "is_space",
-    getter=lambda token: not getattr(token._, "is_symbol", False)  # avoid conflict with existing token attribute
+    getter=lambda token: not getattr(
+        token._, "is_symbol", False
+    )  # avoid conflict with existing token attribute
     and (
         not "".join(c for c in token.text.strip() if c.isprintable())
         or not re.sub(INVISIBLE_CHARS_REGEX, "", token.text.strip())
@@ -110,7 +122,8 @@ class MultilingualTokenizer:
 
     DEFAULT_BATCH_SIZE = 1000
     MAX_NUM_CHARACTERS = 10 ** 7
-    DEFAULT_NUM_PROCESS = 2
+    # Set to 1 to prevent pickling issues when spawning multiple processes on MacOS
+    DEFAULT_NUM_PROCESS = 1
     DEFAULT_FILTER_TOKEN_ATTRIBUTES = {
         "is_space": "Whitespace",
         "is_punct": "Punctuation",
@@ -177,10 +190,12 @@ def __init__(
         """spacy.language.DisabledPipes object initialized in _create_spacy_tokenizer()
         Contains the components of each spacy.Language object that have been disabled by the spacy.Language.select_pipes() method.
         Those components can be re-added to each spacy.Language at their initial place in the pipeline by calling _restore_pipe_components[language].restore()
-
+
         """
         if self.enable_pipe_components and self.disable_pipe_components:
-            raise ValueError("Only one of enable_pipe_components and disable_pipe_components can be specified at once.")
+            raise ValueError(
+                "Only one of enable_pipe_components and disable_pipe_components can be specified at once."
+            )
 
     def _set_use_models(self, languages: List[AnyStr]) -> bool:
         """Set the self.use_models attribute to True in case the text should be lemmatized with a spaCy pre-trained model.
@@ -247,7 +262,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
         start = perf_counter()
         logging.info(f"Loading tokenizer for language '{language}'...")
         try:
-            if language == "th":  # PyThaiNLP requires a "data directory" even if nothing needs to be downloaded
+            if (
+                language == "th"
+            ):  # PyThaiNLP requires a "data directory" even if nothing needs to be downloaded
                 os.environ["PYTHAINLP_DATA_DIR"] = mkdtemp()  # dummy temp directory
             if language in SPACY_LANGUAGE_MODELS and self.use_models:
                 nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
@@ -257,7 +274,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
             elif language == "zh":
                 nlp = Chinese.from_config({"nlp": {"tokenizer": {"segmenter": "jieba"}}})
             else:
-                nlp = spacy.blank(language)  # spaCy language without models (https://spacy.io/usage/models)
+                nlp = spacy.blank(
+                    language
+                )  # spaCy language without models (https://spacy.io/usage/models)
             nlp.max_length = self.max_num_characters
             for component in self.add_pipe_components:
                 nlp.add_pipe(
@@ -268,9 +287,13 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
             if not self.use_models:
                 nlp.initialize()
             if self.enable_pipe_components:
-                self._restore_pipe_components[language] = nlp.select_pipes(enable=self.enable_pipe_components)
+                self._restore_pipe_components[language] = nlp.select_pipes(
+                    enable=self.enable_pipe_components
+                )
             if self.disable_pipe_components:
-                self._restore_pipe_components[language] = nlp.select_pipes(disable=self.disable_pipe_components)
+                self._restore_pipe_components[language] = nlp.select_pipes(
+                    disable=self.disable_pipe_components
+                )
 
         except (ValueError, OSError) as e:
             raise TokenizationError(
@@ -286,7 +309,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
         nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(_prefixes).search
         if self.stopwords_folder_path and language in SUPPORTED_LANGUAGES_SPACY:
             self._customize_stopwords(nlp, language)
-        logging.info(f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds")
+        logging.info(
+            f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds"
+        )
         return nlp
 
     def _customize_stopwords(self, nlp: Language, language: AnyStr) -> None:
@@ -312,7 +337,9 @@ def _customize_stopwords(self, nlp: Language, language: AnyStr) -> None:
                 nlp.vocab[word.upper()].is_stop = False
             nlp.Defaults.stop_words = custom_stopwords
         except (ValueError, OSError) as e:
-            raise TokenizationError(f"Stopword file for language '{language}' not available because of error: '{e}'")
+            raise TokenizationError(
+                f"Stopword file for language '{language}' not available because of error: '{e}'"
+            )
 
     def add_spacy_tokenizer(self, language: AnyStr) -> bool:
         """Private method to add a spaCy tokenizer for a given language to the `spacy_nlp_dict` attribute
@@ -363,7 +390,9 @@ def tokenize_list(self, text_list: List[AnyStr], language: AnyStr) -> List[Doc]:
                 f"Tokenizing {len(tokenized)} document(s) in language '{language}': done in {perf_counter() - start:.2f} seconds"
             )
         except TokenizationError as e:
-            raise TokenizationError(f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'")
+            raise TokenizationError(
+                f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'"
+            )
         return tokenized
 
     def tokenize_df(
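For orientation, here is a minimal usage sketch of the class this diff touches. It is not part of the commit: the import path and the no-argument constructor are assumptions; only the MultilingualTokenizer class name and the tokenize_list(text_list, language) signature come from the code above.

# Hypothetical usage sketch -- module path and constructor defaults are assumptions
from spacy_tokenizer import MultilingualTokenizer

tokenizer = MultilingualTokenizer()
# tokenize_list returns one spaCy Doc per input text, per the signature shown in the diff
docs = tokenizer.tokenize_list(["Hello world!", "Tokenize me too."], language="en")
for doc in docs:
    print([token.text for token in doc])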