@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Module with a class to tokenize text data in multiple languages"""
 
+
 import regex as re
 import os
 import logging
@@ -51,12 +52,13 @@
 ALL_UNITS = ORDER_UNITS | WEIGHT_UNITS | DISTANCE_SPEED_UNITS | VOLUME_UNITS | MISC_UNITS
 Token.set_extension(
     "is_measure",
-    getter=lambda token: not token.like_num  # avoid conflict with existing token attribute
+    getter=lambda token: not token.like_num
     and not getattr(token._, "is_datetime", False)
     and token.text[:1].isdigit()
-    and any([re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS]),
+    and any(re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS),
     force=True,
 )
+
 INVISIBLE_CHARS_REGEX = re.compile(
     r"(\p{C}|\p{Z}|\p{M})+"
 )  # matches unicode categories C (control chars), Z (separators) and M (marks)
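
For reference (outside the diff itself), here is a minimal standalone sketch of the check the is_measure getter performs. It is an illustration only: the value of NUMERIC_SEPARATOR_REGEX is assumed to match "," and ".", a tiny stand-in replaces the ALL_UNITS union built above, and the helper name looks_like_measure is hypothetical.

# Minimal sketch (illustration only, not from the repository) of the predicate
# behind the "is_measure" getter, with assumed separator pattern and unit set.
import regex as re

NUMERIC_SEPARATOR_REGEX = re.compile(r"[.,]")  # assumed separator pattern
ALL_UNITS = {"kg", "km", "ml"}  # stand-in for the combined unit sets above


def looks_like_measure(text: str) -> bool:
    """Return True if the text starts with a digit and reduces to digits
    once numeric separators and one known unit are stripped."""
    lower = text.lower()
    return text[:1].isdigit() and any(
        re.sub(NUMERIC_SEPARATOR_REGEX, "", lower).replace(unit, "").isdigit()
        for unit in ALL_UNITS
    )


print(looks_like_measure("1,5kg"))  # True: "15" remains after stripping "," and "kg"
print(looks_like_measure("kg15"))   # False: does not start with a digit

Dropping the square brackets inside any(), as the diff does, turns the list comprehension into a generator, so any() can stop at the first unit that yields an all-digit remainder instead of building the whole list first.
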
@@ -95,7 +97,7 @@ class MultilingualTokenizer:
 
     DEFAULT_BATCH_SIZE = 1000
     MAX_NUM_CHARACTERS = 10 ** 7
-    DEFAULT_NUM_PROCESS = 2
+    DEFAULT_NUM_PROCESS = 1
     DEFAULT_FILTER_TOKEN_ATTRIBUTES = {
         "is_space": "Whitespace",
         "is_punct": "Punctuation",
@@ -123,7 +125,7 @@ def __init__(
         use_models: bool = False,
         hashtags_as_token: bool = True,
         batch_size: int = DEFAULT_BATCH_SIZE,
-        max_num_characters: int = MAX_NUM_CHARACTERS
+        max_num_characters: int = MAX_NUM_CHARACTERS,
     ):
         """Initialization method for the MultilingualTokenizer class, with optional arguments
 
@@ -258,17 +260,17 @@ def tokenize_list(self, text_list: List[AnyStr], language: AnyStr) -> List[Doc]:
         text_list = [str(t) if pd.notnull(t) else "" for t in text_list]
         try:
             self._add_spacy_tokenizer(language)
-            tokenized = list(
-                self.spacy_nlp_dict[language].pipe(
-                    text_list, batch_size=self.batch_size, n_process=self.DEFAULT_NUM_PROCESS
-                )
-            )
-            logging.info(
-                f"Tokenizing {len(tokenized)} document(s) in language '{language}': "
-                + f"done in {perf_counter() - start:.2f} seconds"
-            )
         except TokenizationError as e:
             raise TokenizationError(f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'")
+        tokenized = list(
+            self.spacy_nlp_dict[language].pipe(
+                text_list, batch_size=self.batch_size, n_process=self.DEFAULT_NUM_PROCESS
+            )
+        )
+        logging.info(
+            f"Tokenizing {len(tokenized)} document(s) in language '{language}': "
+            + f"done in {perf_counter() - start:.2f} seconds"
+        )
         return tokenized
 
     def tokenize_df(
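
A hypothetical usage sketch of the method changed above. Only the constructor keywords and the tokenize_list(text_list, language) signature come from this diff; the import path and the language code are assumptions.

# Hypothetical usage sketch; the import path below is an assumption.
from multilingual_tokenizer import MultilingualTokenizer

tokenizer = MultilingualTokenizer(use_models=False, hashtags_as_token=True)
docs = tokenizer.tokenize_list(["10kg of rice", "Hello world!"], language="en")
print([token.text for token in docs[0]])  # one spaCy Doc per input string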