-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
151 lines (122 loc) · 4.25 KB
/
preprocess.py
File metadata and controls
151 lines (122 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from pathlib import Path
from typing import List, Tuple
from dataset import SlavNERDataset
from enum import Enum
import typer
class Tokenizers(str, Enum):
    """Tokenizer backends selectable from the CLI (str subclass so typer can parse the value)."""
    nltk = "nltk"
    stanza = "stanza"
    trankit = "trankit"
def get_nltk_tokenizers() -> Tuple:
    """Build ``(word_tokenizer, sent_tokenizer)`` callables backed by NLTK.

    The word tokenizer (Toktok) is language-agnostic; the sentence
    tokenizer selects a Punkt model per language code and falls back to
    the English model for languages NLTK has no model for.

    Returns:
        Tuple of two callables, each accepting ``(text, lang)``.
    """
    import nltk
    nltk.download('punkt')
    import nltk.tokenize
    word = nltk.tokenize.toktok.ToktokTokenizer()
    # Built once here instead of on every sent_tokenizer call.
    lang_map = {
        'en': 'english',
        'sl': 'slovene',
        'cs': 'czech',
        'cz': 'czech',
        'pl': 'polish',
        # NLTK ships no Punkt models for Ukrainian, Bulgarian or Russian,
        # so those languages fall back to the English model.
        'uk': 'english',
        'ua': 'english',
        'bg': 'english',
        'ru': 'english',
    }

    def word_tokenizer(text: str, lang: str) -> List[str]:
        # Toktok is language-independent; `lang` is accepted only to keep
        # the signature uniform across tokenizer backends.
        return word.tokenize(text)

    def sent_tokenizer(text: str, lang: str = 'en') -> List[str]:
        # .get() avoids a KeyError on unmapped language codes; English is
        # the same fallback already used for uk/ua/bg/ru above.
        return nltk.tokenize.sent_tokenize(text, lang_map.get(lang, 'english'))

    return word_tokenizer, sent_tokenizer
def get_stanza_tokenizers() -> Tuple:
    """Build ``(word_tokenizer, sent_tokenizer)`` callables backed by Stanza.

    Returns:
        Tuple of two callables, each accepting ``(text, lang)``.
    """
    import stanza

    # Single-slot cache: keep only the most recently used pipeline, since
    # constructing a stanza Pipeline on every call is extremely expensive.
    _cache = {'lang': None, 'pipeline': None}

    def _pipeline_for(lang: str):
        if _cache['pipeline'] is None or _cache['lang'] != lang:
            stanza.download(lang)
            _cache['pipeline'] = stanza.Pipeline(lang=lang, processors='tokenize')
            _cache['lang'] = lang
        return _cache['pipeline']

    def word_tokenizer(text: str, lang: str = 'en') -> List[str]:
        doc = _pipeline_for(lang)(text)
        return [
            token.text
            for sentence in doc.sentences
            for token in sentence.tokens
            if token.text
        ]

    def sent_tokenizer(text: str, lang: str = 'en') -> List[str]:
        doc = _pipeline_for(lang)(text)
        return [sentence.text for sentence in doc.sentences]

    return word_tokenizer, sent_tokenizer
def get_trankit_tokenizers() -> Tuple:
    """Build ``(word_tokenizer, sent_tokenizer)`` callables backed by Trankit.

    Returns:
        Tuple of two callables, each accepting ``(text, lang)``.
    """
    from trankit import Pipeline
    lang_map = {
        'bg': 'bulgarian',
        'cs': 'czech',
        'pl': 'polish',
        'ru': 'russian',
        'sl': 'slovenian',
        'uk': 'ukrainian'
    }
    # Czech is the base model; the remaining languages are attached so a
    # single multilingual pipeline can serve all of them.
    pipeline = Pipeline('czech')
    for full_name in lang_map.values():
        pipeline.add(full_name)

    def _activate(lang: str) -> None:
        # Trankit requires the target language to be selected explicitly
        # before tokenizing; skip the switch when it is already active.
        target = lang_map[lang]
        if pipeline.active_lang != target:
            pipeline.set_active(target)

    def word_tokenizer(text: str, lang: str) -> List[str]:
        if not text:
            return ['']
        _activate(lang)
        tokenized = pipeline.tokenize(text, is_sent=True)
        return [token['text'] for token in tokenized['tokens']]

    def sent_tokenizer(text: str, lang: str = 'en') -> List[str]:
        if not text:
            return ['']
        _activate(lang)
        split = pipeline.ssplit(text)
        return [sentence['text'] for sentence in split['sentences']]

    return word_tokenizer, sent_tokenizer
def main(
    output_path: Path = typer.Argument(
        ...,
        exists=False,
        dir_okay=False,
        readable=True
    ),
    input_path: Path = typer.Option(
        "./bsnlp2021_train_r1/",
        exists=True,
        dir_okay=True,
        file_okay=False
    ),
    tokenizer: Tokenizers = typer.Option(
        Tokenizers.nltk,
        case_sensitive=False
    ),
    output_file_suffix: str = typer.Option(
        ".out",
    ),
):
    """Tokenize the BSNLP dataset under `input_path` with the selected
    backend and write the resulting dataframe to `output_path` as CSV.
    """
    # Dispatch table keyed on the enum member; raising on an unknown value
    # replaces the opaque NameError the old if/elif chain produced when no
    # branch matched (e.g. a new Tokenizers member without a branch).
    factories = {
        Tokenizers.nltk: get_nltk_tokenizers,
        Tokenizers.stanza: get_stanza_tokenizers,
        Tokenizers.trankit: get_trankit_tokenizers,
    }
    try:
        word_tokenizer, sent_tokenizer = factories[tokenizer]()
    except KeyError:
        raise ValueError(f"Unsupported tokenizer: {tokenizer}") from None
    dataset = SlavNERDataset(
        data_dir=input_path,  # typer already delivers this as a Path
        word_tokenizer=word_tokenizer,
        sent_tokenizer=sent_tokenizer,
        output_file_suffix=output_file_suffix
    )
    dataset.to_df().to_csv(output_path, index=False)


if __name__ == '__main__':
    typer.run(main)