
Commit b26ff51

Merge pull request #35 from dataiku/chore/dss12-sc-132133-nlp-visualization-update-python-versions
update python versions
2 parents d87745e + cc6555e commit b26ff51

8 files changed: +65, -30 lines changed

LICENSE

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright 2021 Dataiku
+   Copyright 2023 Dataiku
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

Makefile

Lines changed: 2 additions & 2 deletions
@@ -21,8 +21,8 @@ plugin:
 unit-tests:
 	@echo "Running unit tests..."
 	@( \
-		PYTHON_VERSION=`python3 -V 2>&1 | sed 's/[^0-9]*//g' | cut -c 1,2`; \
-		PYTHON_VERSION_IS_CORRECT=`cat code-env/python/desc.json | python3 -c "import sys, json; print(str($$PYTHON_VERSION) in [x[-2:] for x in json.load(sys.stdin)['acceptedPythonInterpreters']]);"`; \
+		PYTHON_VERSION=`python3 -c "import sys; print('PYTHON{}{}'.format(sys.version_info.major, sys.version_info.minor))"`; \
+		PYTHON_VERSION_IS_CORRECT=`cat code-env/python/desc.json | python3 -c "import sys, json; print('$$PYTHON_VERSION' in json.load(sys.stdin)['acceptedPythonInterpreters']);"`; \
 		if [ $$PYTHON_VERSION_IS_CORRECT == "False" ]; then echo "Python version $$PYTHON_VERSION is not in acceptedPythonInterpreters"; exit 1; else echo "Python version $$PYTHON_VERSION is in acceptedPythonInterpreters"; fi; \
 	)
 	@( \
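Note: the recipe now builds the interpreter tag directly from sys.version_info instead of parsing `python3 -V` with sed/cut, presumably because `cut -c 1,2` truncates "310" to "31" on Python 3.10, so the old two-character comparison against the tail of each accepted interpreter name breaks for three-digit minor versions. A minimal standalone sketch of the check the updated recipe performs (same logic, run outside make from the plugin root):

    # Standalone sketch of the Makefile's version check (assumes the working
    # directory is the plugin root so that code-env/python/desc.json exists).
    import json
    import sys

    # Same "PYTHON<major><minor>" tag as the updated recipe, e.g. "PYTHON310"
    python_version = "PYTHON{}{}".format(sys.version_info.major, sys.version_info.minor)

    with open("code-env/python/desc.json") as f:
        accepted = json.load(f)["acceptedPythonInterpreters"]

    if python_version not in accepted:
        print(f"Python version {python_version} is not in acceptedPythonInterpreters")
        sys.exit(1)
    print(f"Python version {python_version} is in acceptedPythonInterpreters")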

code-env/python/desc.json

Lines changed: 5 additions & 2 deletions
@@ -1,10 +1,13 @@
 {
     "acceptedPythonInterpreters": [
         "PYTHON36",
-        "PYTHON37"
+        "PYTHON37",
+        "PYTHON38",
+        "PYTHON39",
+        "PYTHON310"
     ],
     "forceConda": false,
     "installCorePackages": true,
     "installJupyterSupport": true,
-    "corePackagesSet": "PANDAS10"
+    "corePackagesSet": "AUTO"
 }

code-env/python/spec/requirements.txt

Lines changed: 6 additions & 4 deletions
@@ -3,12 +3,14 @@ pymorphy2-dicts-uk==2.4.1.1.1460299261
 pymorphy2==0.9.1
 jieba==0.42.1
 pyvi==0.1
-regex==2020.11.13
-spacy[lookups,ja,th]==3.0.7
+regex==2021.4.4
+spacy[lookups,ja,th]==3.0.7; python_version < '3.9'
+spacy[lookups,ja,th]==3.5.2; python_version >= '3.9'
 emoji==1.2.0
-tqdm==4.50.2
+tqdm==4.60.0
 matplotlib==3.3.1
-wordcloud==1.8.0
+wordcloud==1.8.0; python_version < '3.9'
+wordcloud==1.8.2.2; python_version >= '3.9'
 fonttools==4.14.0
 pathvalidate==2.3.0
 fastcore==1.3.19
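Note: the new spacy and wordcloud pins are split with PEP 508 environment markers so that one requirements file serves both the older (3.6–3.8) and the newly accepted (3.9+) interpreters. A rough illustration of how such markers are evaluated, using the third-party packaging library (an assumption for illustration only; it is not part of this plugin's requirements):

    # Rough illustration of PEP 508 marker evaluation; the "packaging" library
    # used here is not a dependency of the plugin itself.
    from packaging.requirements import Requirement

    for line in (
        "spacy[lookups,ja,th]==3.0.7; python_version < '3.9'",
        "spacy[lookups,ja,th]==3.5.2; python_version >= '3.9'",
        "wordcloud==1.8.0; python_version < '3.9'",
        "wordcloud==1.8.2.2; python_version >= '3.9'",
    ):
        req = Requirement(line)
        # marker.evaluate() tests the marker against the running interpreter,
        # so exactly one pin of each pair is kept at install time.
        print(req.name, str(req.specifier), req.marker.evaluate())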

python-lib/spacy_tokenizer.py

Lines changed: 43 additions & 14 deletions
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 """Module with a class to tokenize text data in multiple languages"""
 
 import regex as re
@@ -36,7 +37,9 @@
     getter=lambda token: any(c in UNICODE_EMOJI for c in token.text),
     force=True,
 )
-SYMBOL_CHARS_REGEX = re.compile(r"(\p{M}|\p{S})+") # matches unicode categories M (marks) and S (symbols)
+SYMBOL_CHARS_REGEX = re.compile(
+    r"(\p{M}|\p{S})+"
+) # matches unicode categories M (marks) and S (symbols)
 Token.set_extension(
     "is_symbol",
     getter=lambda token: not token.is_punct
@@ -45,7 +48,9 @@
     and not re.sub(SYMBOL_CHARS_REGEX, "", token.text).strip(),
     force=True,
 )
-DATETIME_REGEX = re.compile(r"(:|-|\.|\/|am|pm|hrs|hr|h|minutes|mins|min|sec|s|ms|ns|y)+", flags=re.IGNORECASE)
+DATETIME_REGEX = re.compile(
+    r"(:|-|\.|\/|am|pm|hrs|hr|h|minutes|mins|min|sec|s|ms|ns|y)+", flags=re.IGNORECASE
+)
 Token.set_extension(
     "is_datetime",
     getter=lambda token: not token.like_num # avoid conflict with existing token attribute
@@ -65,15 +70,22 @@
     getter=lambda token: not token.like_num # avoid conflict with existing token attribute
     and not getattr(token._, "is_datetime", False)
     and token.text[:1].isdigit()
-    and any([re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS]),
+    and any(
+        [
+            re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit()
+            for unit in ALL_UNITS
+        ]
+    ),
     force=True,
 )
 INVISIBLE_CHARS_REGEX = re.compile(
     r"(\p{C}|\p{Z}|\p{M})+"
 ) # matches unicode categories C (control chars), Z (separators) and M (marks)
 Token.set_extension(
     "is_space",
-    getter=lambda token: not getattr(token._, "is_symbol", False) # avoid conflict with existing token attribute
+    getter=lambda token: not getattr(
+        token._, "is_symbol", False
+    ) # avoid conflict with existing token attribute
     and (
         not "".join(c for c in token.text.strip() if c.isprintable())
         or not re.sub(INVISIBLE_CHARS_REGEX, "", token.text.strip())
@@ -110,7 +122,8 @@ class MultilingualTokenizer:
 
     DEFAULT_BATCH_SIZE = 1000
     MAX_NUM_CHARACTERS = 10 ** 7
-    DEFAULT_NUM_PROCESS = 2
+    # Set to 1 to prevent pickling issues when spawning multiple processes on MacOS
+    DEFAULT_NUM_PROCESS = 1
     DEFAULT_FILTER_TOKEN_ATTRIBUTES = {
         "is_space": "Whitespace",
         "is_punct": "Punctuation",
@@ -177,10 +190,12 @@ def __init__(
         """spacy.language.DisabledPipes object initialized in create_spacy_tokenizer()
         Contains the components of each SpaCy.Language object that have been disabled by spacy.Languages.select_pipes() method.
         Those components can be re-added to each SpaCy.Language at their initial place in the pipeline, by calling restore_pipe_components[language].restore()
-
+
         """
         if self.enable_pipe_components and self.disable_pipe_components:
-            raise ValueError("Only one of enable_pipe_components and disable_pipe_components can be specified at once.")
+            raise ValueError(
+                "Only one of enable_pipe_components and disable_pipe_components can be specified at once."
+            )
 
     def _set_use_models(self, languages: List[AnyStr]) -> bool:
         """Set self.use_models attribute to True in case the text should be lemmatize with a SpaCy pre-trained model.
@@ -247,7 +262,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
         start = perf_counter()
         logging.info(f"Loading tokenizer for language '{language}'...")
         try:
-            if language == "th": # PyThaiNLP requires a "data directory" even if nothing needs to be downloaded
+            if (
+                language == "th"
+            ): # PyThaiNLP requires a "data directory" even if nothing needs to be downloaded
                 os.environ["PYTHAINLP_DATA_DIR"] = mkdtemp() # dummy temp directory
             if language in SPACY_LANGUAGE_MODELS and self.use_models:
                 nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
@@ -257,7 +274,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
             elif language == "zh":
                 nlp = Chinese.from_config({"nlp": {"tokenizer": {"segmenter": "jieba"}}})
             else:
-                nlp = spacy.blank(language) # spaCy language without models (https://spacy.io/usage/models)
+                nlp = spacy.blank(
+                    language
+                ) # spaCy language without models (https://spacy.io/usage/models)
             nlp.max_length = self.max_num_characters
             for component in self.add_pipe_components:
                 nlp.add_pipe(
@@ -268,9 +287,13 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
             if not self.use_models:
                 nlp.initialize()
             if self.enable_pipe_components:
-                self._restore_pipe_components[language] = nlp.select_pipes(enable=self.enable_pipe_components)
+                self._restore_pipe_components[language] = nlp.select_pipes(
+                    enable=self.enable_pipe_components
+                )
             if self.disable_pipe_components:
-                self._restore_pipe_components[language] = nlp.select_pipes(disable=self.disable_pipe_components)
+                self._restore_pipe_components[language] = nlp.select_pipes(
+                    disable=self.disable_pipe_components
+                )
 
         except (ValueError, OSError) as e:
             raise TokenizationError(
@@ -286,7 +309,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
         nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(_prefixes).search
         if self.stopwords_folder_path and language in SUPPORTED_LANGUAGES_SPACY:
             self._customize_stopwords(nlp, language)
-        logging.info(f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds")
+        logging.info(
+            f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds"
+        )
         return nlp
 
     def _customize_stopwords(self, nlp: Language, language: AnyStr) -> None:
@@ -312,7 +337,9 @@ def _customize_stopwords(self, nlp: Language, language: AnyStr) -> None:
                 nlp.vocab[word.upper()].is_stop = False
             nlp.Defaults.stop_words = custom_stopwords
         except (ValueError, OSError) as e:
-            raise TokenizationError(f"Stopword file for language '{language}' not available because of error: '{e}'")
+            raise TokenizationError(
+                f"Stopword file for language '{language}' not available because of error: '{e}'"
+            )
 
     def add_spacy_tokenizer(self, language: AnyStr) -> bool:
         """Private method to add a spaCy tokenizer for a given language to the `spacy_nlp_dict` attribute
@@ -363,7 +390,9 @@ def tokenize_list(self, text_list: List[AnyStr], language: AnyStr) -> List[Doc]:
                 f"Tokenizing {len(tokenized)} document(s) in language '{language}': done in {perf_counter() - start:.2f} seconds"
             )
         except TokenizationError as e:
-            raise TokenizationError(f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'")
+            raise TokenizationError(
+                f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'"
+            )
         return tokenized
 
     def tokenize_df(
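Note: apart from the mechanical line-wrapping, the functional change in this file is DEFAULT_NUM_PROCESS dropping from 2 to 1. On MacOS, Python 3.8+ defaults to the "spawn" multiprocessing start method, which pickles the work sent to worker processes and can fail on non-picklable pipeline state. A hypothetical sketch of what the setting controls, assuming the tokenizer forwards this value to spacy.Language.pipe() (that call is not shown in this diff, and the names below are illustrative, not the plugin's API):

    # Hypothetical sketch, assuming the process count ends up in nlp.pipe().
    import spacy

    texts = ["I hope nothing.", "I fear nothing.", "I am free."]
    nlp = spacy.blank("en")

    # n_process=1 keeps tokenization in the main process, avoiding the pickling
    # that the "spawn" start method requires; batch_size mirrors
    # MultilingualTokenizer.DEFAULT_BATCH_SIZE.
    docs = list(nlp.pipe(texts, batch_size=1000, n_process=1))
    print([len(doc) for doc in docs])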

tests/python/unit/requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-pandas>=1.0,<1.1
-pytest==6.2.1
+pandas>=1.0,<1.4
+pytest==6.2.5;
 allure-pytest==2.8.29
 Pillow==8.3.2

tests/python/unit/test_spacy_tokenizer.py

Lines changed: 5 additions & 4 deletions
@@ -36,16 +36,17 @@ def test_tokenize_df_multilingual():
                 "I hope nothing. I fear nothing. I am free.",
                 " Les sanglots longs des violons d'automne",
                 "子曰:“學而不思則罔,思而不學則殆。”",
-                "期一会。 異体同心。 そうです。",
             ],
-            "language": ["en", "fr", "zh", "ja"],
+            "language": ["en", "fr", "zh"],
         }
     )
     tokenizer = MultilingualTokenizer(stopwords_folder_path=stopwords_folder_path)
-    output_df = tokenizer.tokenize_df(df=input_df, text_column="input_text", language_column="language")
+    output_df = tokenizer.tokenize_df(
+        df=input_df, text_column="input_text", language_column="language"
+    )
     tokenized_documents = output_df[tokenizer.tokenized_column]
     tokenized_documents_length = [len(doc) for doc in tokenized_documents]
-    assert tokenized_documents_length == [12, 8, 13, 9]
+    assert tokenized_documents_length == [12, 8, 13]
 
 
 def test_tokenize_df_long_text():

tests/python/unit/test_wordcloud_visualizer.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ def test_tokenize_and_count_multilingual():
     assert frequencies == [
         ("en", Counter({"hope": 1, "Nothing": 3, "fear": 1, "free": 1})),
         ("fr", Counter({"sanglots": 1, "longs": 1, "violons": 1, "automne": 1})),
-        ("zh", Counter({"子": 1, "曰": 1, "學而": 1, "不思則": 1, "罔": 1, "思而": 1, "不學則": 1}),),
+        ('zh', Counter({'不學則': 1, '不思則': 1, '子': 1, '學而': 1, '思而': 1, '曰': 1, '罔': 1}))
     ]
 
 
