+# -*- coding: utf-8 -*-
 """Module with a class to tokenize text data in multiple languages"""
 
 import regex as re
     getter=lambda token: any(c in UNICODE_EMOJI for c in token.text),
     force=True,
 )
-SYMBOL_CHARS_REGEX = re.compile(r"(\p{M}|\p{S})+")  # matches unicode categories M (marks) and S (symbols)
+SYMBOL_CHARS_REGEX = re.compile(
+    r"(\p{M}|\p{S})+"
+)  # matches unicode categories M (marks) and S (symbols)
 Token.set_extension(
     "is_symbol",
     getter=lambda token: not token.is_punct
     and not re.sub(SYMBOL_CHARS_REGEX, "", token.text).strip(),
     force=True,
 )
-DATETIME_REGEX = re.compile(r"(:|-|\.|\/|am|pm|hrs|hr|h|minutes|mins|min|sec|s|ms|ns|y)+", flags=re.IGNORECASE)
+DATETIME_REGEX = re.compile(
+    r"(:|-|\.|\/|am|pm|hrs|hr|h|minutes|mins|min|sec|s|ms|ns|y)+", flags=re.IGNORECASE
+)
 Token.set_extension(
     "is_datetime",
     getter=lambda token: not token.like_num  # avoid conflict with existing token attribute
     getter=lambda token: not token.like_num  # avoid conflict with existing token attribute
     and not getattr(token._, "is_datetime", False)
     and token.text[:1].isdigit()
-    and any([re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS]),
+    and any(
+        [
+            re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit()
+            for unit in ALL_UNITS
+        ]
+    ),
     force=True,
 )
 INVISIBLE_CHARS_REGEX = re.compile(
     r"(\p{C}|\p{Z}|\p{M})+"
 )  # matches unicode categories C (control chars), Z (separators) and M (marks)
 Token.set_extension(
     "is_space",
-    getter=lambda token: not getattr(token._, "is_symbol", False)  # avoid conflict with existing token attribute
+    getter=lambda token: not getattr(
+        token._, "is_symbol", False
+    )  # avoid conflict with existing token attribute
     and (
         not "".join(c for c in token.text.strip() if c.isprintable())
         or not re.sub(INVISIBLE_CHARS_REGEX, "", token.text.strip())
@@ -110,7 +122,8 @@ class MultilingualTokenizer:
 
     DEFAULT_BATCH_SIZE = 1000
     MAX_NUM_CHARACTERS = 10 ** 7
-    DEFAULT_NUM_PROCESS = 2
+    # Set to 1 to prevent pickling issues when spawning multiple processes on MacOS
+    DEFAULT_NUM_PROCESS = 1
     DEFAULT_FILTER_TOKEN_ATTRIBUTES = {
         "is_space": "Whitespace",
         "is_punct": "Punctuation",
@@ -177,10 +190,12 @@ def __init__(
         """spacy.language.DisabledPipes object initialized in _create_spacy_tokenizer()
         Contains the components of each spacy.Language object that have been disabled by the spacy.Language.select_pipes() method.
         Those components can be re-added to each spacy.Language at their initial place in the pipeline by calling _restore_pipe_components[language].restore()
-
+
         """
         if self.enable_pipe_components and self.disable_pipe_components:
-            raise ValueError("Only one of enable_pipe_components and disable_pipe_components can be specified at once.")
+            raise ValueError(
+                "Only one of enable_pipe_components and disable_pipe_components can be specified at once."
+            )
 
     def _set_use_models(self, languages: List[AnyStr]) -> bool:
         """Set the self.use_models attribute to True in case the text should be lemmatized with a spaCy pre-trained model.
@@ -247,7 +262,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
         start = perf_counter()
         logging.info(f"Loading tokenizer for language '{language}'...")
         try:
-            if language == "th":  # PyThaiNLP requires a "data directory" even if nothing needs to be downloaded
+            if (
+                language == "th"
+            ):  # PyThaiNLP requires a "data directory" even if nothing needs to be downloaded
                 os.environ["PYTHAINLP_DATA_DIR"] = mkdtemp()  # dummy temp directory
             if language in SPACY_LANGUAGE_MODELS and self.use_models:
                 nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
@@ -257,7 +274,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
             elif language == "zh":
                 nlp = Chinese.from_config({"nlp": {"tokenizer": {"segmenter": "jieba"}}})
             else:
-                nlp = spacy.blank(language)  # spaCy language without models (https://spacy.io/usage/models)
+                nlp = spacy.blank(
+                    language
+                )  # spaCy language without models (https://spacy.io/usage/models)
             nlp.max_length = self.max_num_characters
             for component in self.add_pipe_components:
                 nlp.add_pipe(
@@ -268,9 +287,13 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
             if not self.use_models:
                 nlp.initialize()
             if self.enable_pipe_components:
-                self._restore_pipe_components[language] = nlp.select_pipes(enable=self.enable_pipe_components)
+                self._restore_pipe_components[language] = nlp.select_pipes(
+                    enable=self.enable_pipe_components
+                )
             if self.disable_pipe_components:
-                self._restore_pipe_components[language] = nlp.select_pipes(disable=self.disable_pipe_components)
+                self._restore_pipe_components[language] = nlp.select_pipes(
+                    disable=self.disable_pipe_components
+                )
 
         except (ValueError, OSError) as e:
             raise TokenizationError(
@@ -286,7 +309,9 @@ def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
         nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(_prefixes).search
         if self.stopwords_folder_path and language in SUPPORTED_LANGUAGES_SPACY:
             self._customize_stopwords(nlp, language)
-        logging.info(f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds")
+        logging.info(
+            f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds"
+        )
         return nlp
 
     def _customize_stopwords(self, nlp: Language, language: AnyStr) -> None:
@@ -312,7 +337,9 @@ def _customize_stopwords(self, nlp: Language, language: AnyStr) -> None:
                 nlp.vocab[word.upper()].is_stop = False
             nlp.Defaults.stop_words = custom_stopwords
         except (ValueError, OSError) as e:
-            raise TokenizationError(f"Stopword file for language '{language}' not available because of error: '{e}'")
+            raise TokenizationError(
+                f"Stopword file for language '{language}' not available because of error: '{e}'"
+            )
 
     def add_spacy_tokenizer(self, language: AnyStr) -> bool:
         """Private method to add a spaCy tokenizer for a given language to the `spacy_nlp_dict` attribute
@@ -363,7 +390,9 @@ def tokenize_list(self, text_list: List[AnyStr], language: AnyStr) -> List[Doc]:
                 f"Tokenizing {len(tokenized)} document(s) in language '{language}': done in {perf_counter() - start:.2f} seconds"
             )
         except TokenizationError as e:
-            raise TokenizationError(f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'")
+            raise TokenizationError(
+                f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'"
+            )
         return tokenized
 
     def tokenize_df(
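For orientation, here is a minimal usage sketch of the class this diff touches. It is not part of the commit: the import path and the no-argument constructor are assumptions; only the MultilingualTokenizer class name and the tokenize_list(text_list, language) signature come from the code above.

# Hypothetical usage sketch -- module path and constructor defaults are assumptions
from spacy_tokenizer import MultilingualTokenizer

tokenizer = MultilingualTokenizer()
# tokenize_list returns one spaCy Doc per input text, per the signature shown in the diff
docs = tokenizer.tokenize_list(["Hello world!", "Tokenize me too."], language="en")
for doc in docs:
    print([token.text for token in doc])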