Commit dc86850

Merge pull request #21 from dataiku/feature/spacy-error-handling
Improve spacy error handling
2 parents f1ad043 + c56ca63 commit dc86850

File tree: 3 files changed, +28 −13 lines changed


python-lib/spacy_tokenizer.py

Lines changed: 15 additions & 13 deletions

@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Module with a class to tokenize text data in multiple languages"""
 
+
 import regex as re
 import os
 import logging
@@ -51,12 +52,13 @@
 ALL_UNITS = ORDER_UNITS | WEIGHT_UNITS | DISTANCE_SPEED_UNITS | VOLUME_UNITS | MISC_UNITS
 Token.set_extension(
     "is_measure",
-    getter=lambda token: not token.like_num  # avoid conflict with existing token attribute
+    getter=lambda token: not token.like_num
     and not getattr(token._, "is_datetime", False)
     and token.text[:1].isdigit()
-    and any([re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS]),
+    and any(re.sub(NUMERIC_SEPARATOR_REGEX, "", token.lower_).replace(unit, "").isdigit() for unit in ALL_UNITS),
     force=True,
 )
+
 INVISIBLE_CHARS_REGEX = re.compile(
     r"(\p{C}|\p{Z}|\p{M})+"
 )  # matches unicode categories C (control chars), Z (separators) and M (marks)
@@ -95,7 +97,7 @@ class MultilingualTokenizer:
 
     DEFAULT_BATCH_SIZE = 1000
     MAX_NUM_CHARACTERS = 10 ** 7
-    DEFAULT_NUM_PROCESS = 2
+    DEFAULT_NUM_PROCESS = 1
     DEFAULT_FILTER_TOKEN_ATTRIBUTES = {
         "is_space": "Whitespace",
         "is_punct": "Punctuation",
@@ -123,7 +125,7 @@ def __init__(
         use_models: bool = False,
         hashtags_as_token: bool = True,
         batch_size: int = DEFAULT_BATCH_SIZE,
-        max_num_characters: int = MAX_NUM_CHARACTERS
+        max_num_characters: int = MAX_NUM_CHARACTERS,
     ):
         """Initialization method for the MultilingualTokenizer class, with optional arguments
@@ -258,17 +260,17 @@ def tokenize_list(self, text_list: List[AnyStr], language: AnyStr) -> List[Doc]:
         text_list = [str(t) if pd.notnull(t) else "" for t in text_list]
         try:
             self._add_spacy_tokenizer(language)
-            tokenized = list(
-                self.spacy_nlp_dict[language].pipe(
-                    text_list, batch_size=self.batch_size, n_process=self.DEFAULT_NUM_PROCESS
-                )
-            )
-            logging.info(
-                f"Tokenizing {len(tokenized)} document(s) in language '{language}': "
-                + f"done in {perf_counter() - start:.2f} seconds"
-            )
         except TokenizationError as e:
             raise TokenizationError(f"Tokenization error: {e} for document(s): '{truncate_text_list(text_list)}'")
+        tokenized = list(
+            self.spacy_nlp_dict[language].pipe(
+                text_list, batch_size=self.batch_size, n_process=self.DEFAULT_NUM_PROCESS
+            )
+        )
+        logging.info(
+            f"Tokenizing {len(tokenized)} document(s) in language '{language}': "
+            + f"done in {perf_counter() - start:.2f} seconds"
+        )
         return tokenized
 
     def tokenize_df(
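
Two changes stand out in this file: the pipe() call and the timing log move out of the try/except TokenizationError block in tokenize_list, so only failures while loading the spaCy tokenizer are re-raised with the truncated document list, and the is_measure getter now passes a generator expression to any() instead of building a list first. As a rough, standalone illustration of that unit check (no spaCy involved), the sketch below uses placeholder values for ALL_UNITS and NUMERIC_SEPARATOR_REGEX; the real definitions live elsewhere in the module.

import regex as re  # the plugin imports the third-party "regex" package; stdlib "re" also works here

NUMERIC_SEPARATOR_REGEX = re.compile(r"[ .,]")  # assumed: strips thousand/decimal separators
ALL_UNITS = {"kg", "km", "ml"}                  # assumed small subset of the plugin's units

def looks_like_measure(token_text: str) -> bool:
    lowered = token_text.lower()
    digits_and_unit = re.sub(NUMERIC_SEPARATOR_REGEX, "", lowered)
    return (
        lowered[:1].isdigit()
        # generator expression, as in the new code: True if stripping any one
        # known unit leaves only digits, e.g. "1,000kg" -> "1000kg" -> "1000"
        and any(digits_and_unit.replace(unit, "").isdigit() for unit in ALL_UNITS)
    )

print(looks_like_measure("1,000kg"))  # True
print(looks_like_measure("hello"))    # False: does not start with a digit

In the actual getter the check runs per spaCy token and is further guarded by not token.like_num and the is_datetime extension, which the sketch above omits.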

tests/python/integration/test_wordcloud.py

Lines changed: 5 additions & 0 deletions

@@ -53,3 +53,8 @@ def test_wordcloud_unpartitioned_folder_file(user_dss_clients):
 
 def test_wordcloud_unpartitioned_folder_sql(user_dss_clients):
     dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="unpartitionned_folder_sql")
+
+
+def test_wordcloud_long_text(user_dss_clients):
+    dss_scenario.run(user_dss_clients, project_key=TEST_PROJECT_KEY, scenario_id="TEST_LONGTEXT")
+

tests/python/unit/test_spacy_tokenizer.py

Lines changed: 8 additions & 0 deletions

@@ -5,6 +5,7 @@
 
 import os
 
+import pytest
 import pandas as pd
 
 from spacy_tokenizer import MultilingualTokenizer
@@ -45,3 +46,10 @@ def test_tokenize_df_multilingual():
     tokenized_documents = output_df[tokenizer.tokenized_column]
     tokenized_documents_length = [len(doc) for doc in tokenized_documents]
     assert tokenized_documents_length == [12, 8, 13, 9]
+
+
+def test_tokenize_df_long_text():
+    input_df = pd.DataFrame({"input_text": ["Long text"]})
+    tokenizer = MultilingualTokenizer(max_num_characters=1)
+    with pytest.raises(ValueError):
+        tokenizer.tokenize_df(df=input_df, text_column="input_text", language="en")
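
The added unit test pins down the expected failure mode: constructing MultilingualTokenizer(max_num_characters=1) and calling tokenize_df on a longer document should raise ValueError rather than fail deep inside spaCy. A minimal usage sketch of how calling code might surface that error; the handling and message below are illustrative, not taken from the plugin.

import pandas as pd
from spacy_tokenizer import MultilingualTokenizer

input_df = pd.DataFrame({"input_text": ["Long text"]})
tokenizer = MultilingualTokenizer(max_num_characters=1)  # deliberately tiny limit, as in the test

try:
    tokenizer.tokenize_df(df=input_df, text_column="input_text", language="en")
except ValueError as error:
    # Surface the document-length problem to the caller instead of crashing later.
    print(f"Could not tokenize column 'input_text': {error}")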
