diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv new file mode 100644 index 000000000..5ff302fb6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv @@ -0,0 +1,35 @@ +박사 Dr. +박사 dr. +씨 Mr. +씨 mr. +양 Ms. +양 ms. +여사 Mrs. +여사 mrs. +산 mt. +산 Mt. +교수 Prof. +교수 prof. +시니어 sr. +시니어 Sr. +주니어 jr. +주니어 Jr. +대로 Ave. +대로 ave. +번호 no. +번호 No. +왼쪽 괄호 ( +오른쪽 괄호 ) +더하기 + +마이너스 - +시그마 Σ +에타 η +카파 κ +오메가 ω +시그마 σ +알파 α +뉴 ν +델타 δ +이오타 ι +박사학위 Ph.D. +등 etc. \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py index b9de5c299..b02e80984 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py @@ -34,27 +34,25 @@ def __init__(self, cardinal: GraphFst): cardinal = cardinal.just_cardinals month = pynini.string_file(get_abs_path("data/months.tsv")) - spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) - year_suffix = pynini.cross("년", "") month_suffix = pynini.cross("월", "") day_suffix = pynini.cross("일", "") - year_component = ( - pynutil.insert("year: \"") + cardinal + pynini.closure(year_suffix, 0, 1) + pynutil.insert("\"") - ) + delete_space = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1) + between_fields = delete_space + pynutil.insert(NEMO_SPACE) - month_component = ( - pynutil.insert("month: \"") + spacing + month + pynini.closure(month_suffix, 0, 1) + pynutil.insert("\"") - ) + year_component = pynutil.insert("year: \"") + cardinal + year_suffix + pynutil.insert("\"") + month_component = pynutil.insert("month: \"") + month + month_suffix + pynutil.insert("\"") + day_component = pynutil.insert("day: \"") + cardinal + day_suffix + pynutil.insert("\"") - day_component = pynutil.insert("day: \"") + spacing + cardinal + 
day_suffix + spacing + pynutil.insert("\"") + graph_component = year_component | month_component - graph_component = year_component | month_component | day_component graph_date = ( - pynini.closure(year_component, 0, 1) - + pynini.closure((pynutil.insert(NEMO_SPACE)) + month_component, 0, 1) - + pynini.closure((pynutil.insert(NEMO_SPACE)) + day_component, 0, 1) + year_component + | month_component + | (year_component + between_fields + month_component) + | (month_component + between_fields + day_component) + | (year_component + between_fields + month_component + between_fields + day_component) ) final_graph = graph_component | graph_date diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index ecb92df1d..a51596e94 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -41,7 +41,7 @@ def __init__(self, cardinal: GraphFst): cardinals = cardinal.just_cardinals graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - decimal_part = pynini.closure(graph_zero | graph_digit) + decimal_part = pynini.closure(graph_zero | graph_digit, 1) decimal_point = pynutil.delete("점") integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") @@ -55,7 +55,7 @@ def __init__(self, cardinal: GraphFst): ) # If decimal is used to express big numbers like 15000 -> "1.5만" self.decimal = graph_decimal_regular | graph_deicimal_larger - self.just_decimal = cardinals + pynini.cross("점", ".") + decimal_part + self.just_decimal = cardinals | (cardinals + pynini.cross("점", ".") + decimal_part) graph_sign = ( pynutil.insert("negative: \"") + (pynini.cross("마이너스", "-") | pynini.accep("-")) + pynutil.insert("\"") diff --git 
a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index f890e477e..719fa168f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -28,17 +28,16 @@ class MoneyFst(GraphFst): cardinal: CardinalFst """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") cardinals = cardinal.just_cardinals + decimals = decimal.just_decimal currency = pynini.string_file(get_abs_path("data/currency.tsv")) # Accepting space if there are one between integer and currency spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) - - graph_integer = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") + spacing - + graph_integer = pynutil.insert('integer_part: "') + decimals + pynutil.insert('"') + spacing graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"") graph_final = graph_integer + graph_unit diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py index e99a1b083..205737043 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -73,27 +73,25 @@ def __init__(self): pynutil.insert("seconds: \"") + (graph_0_to_59 + spacing + second_suffix) + pynutil.insert("\"") ) - hour = pynini.closure(hour_component, 0, 1) - minute = pynini.closure(delete_space + minute_component, 0, 1) - second = pynini.closure(delete_space + second_component, 0, 1) + hm_opt = pynini.closure(delete_space + minute_component, 0, 1) + hs_opt = pynini.closure(delete_space + second_component, 0, 1) - graph_regular = hour + minute + second + hms = hour_component + hm_opt + hs_opt + ms = minute_component + 
pynini.closure(delete_space + second_component, 0, 1) + s_only = second_component + + graph_regular = pynini.union(hms, ms, s_only).optimize() # 오전 = AM, 오후 = PM - prefix_words = ( - pynini.union( - (pynini.accep("오전")), (pynini.accep("오후")), (pynini.accep("새벽")), (pynini.accep("아침")) - ) - + spacing - ) - prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"") + ampm_words = pynini.union("오전", "오후", "새벽", "아침") + ampm_tag = pynutil.insert('suffix: "') + ampm_words + pynutil.insert('"') # 전 = before, 후 = after suffix_words = pynini.accep("전") | pynini.accep("후") suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"") time_graph = ( - pynini.closure(delete_space + prefix_tag, 0, 1) + pynini.closure(delete_space + ampm_tag, 0, 1) + graph_regular + pynini.closure(delete_space + suffix_tag, 0, 1) ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 3620f07ac..8e5d39c85 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -18,7 +18,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + INPUT_LOWER_CASED, + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst @@ -28,6 +35,7 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst 
from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst @@ -56,7 +64,7 @@ def __init__( far_file = None if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) - far_file = os.path.join(cache_dir, f"jp_itn_{input_case}.far") + far_file = os.path.join(cache_dir, f"ko_itn_{input_case}_tokenize.far") if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] logging.info(f"ClassifyFst.fst was restored from {far_file}.") @@ -81,7 +89,7 @@ def __init__( date = DateFst(cardinal) date_graph = date.fst - money = MoneyFst(cardinal) + money = MoneyFst(cardinal, decimal) money_graph = money.fst telephone = TelephoneFst() @@ -91,6 +99,7 @@ def __init__( measure_graph = measure.fst word_graph = WordFst().fst + whitelist_graph = WhiteListFst().fst classify = ( pynutil.add_weight(cardinal_graph, 1.1) @@ -103,12 +112,15 @@ def __init__( | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(word_graph, 100) + | pynutil.add_weight(whitelist_graph, 1.01) ) - token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") - tagger = pynini.closure(token, 1) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + space = NEMO_WHITE_SPACE @ delete_extra_space + space_opt = pynini.closure(space, 0, 1) - self.fst = tagger + graph = delete_space + token + pynini.closure(space_opt + token) + delete_space + self.fst = graph.optimize() if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py 
b/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py new file mode 100644 index 000000000..fd443c287 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens, e.g. 박사학위 -> name: "Ph.D." + This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv". 
+ """ + + def __init__(self): + super().__init__(name="whitelist", kind="classify") + + whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")) + graph = (pynutil.insert('name: "')) + (whitelist) + pynutil.insert('"') + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py index 449789448..cedf4703d 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py @@ -68,11 +68,13 @@ def __init__(self): graph_fraction = ( pynutil.delete("fraction {") + delete_space - + pynutil.delete('denominator: "') + + optional_sign + + delete_space + + pynutil.delete('numerator: "') + measurement + pynutil.delete('"') + delete_space - + pynutil.delete('numerator: "') + + pynutil.delete('denominator: "') + pynutil.insert("/") + measurement + pynutil.delete('"') diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 10aae347e..cdfac05d0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -25,6 +25,7 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst @@ -67,6 +68,8 @@ def __init__(self): word = WordFst() word_graph = word.fst + whitelist_graph = WhiteListFst().fst + graph = pynini.union( 
cardinal_graph, ordinal_graph, @@ -78,5 +81,6 @@ def __init__(self): telephone_graph, measure_graph, word_graph, + whitelist_graph, ) self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py new file mode 100644 index 000000000..395423017 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + + +class WhiteListFst(GraphFst): + ''' + tokens { name: "Dr." } -> Dr. + ''' + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic) + + whitelist = pynutil.delete("name: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + graph = whitelist + self.fst = graph.optimize() diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt index ecad6dc19..89fa75eb6 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt @@ -1,16 +1,15 @@ 이천이십사년팔월이십팔일~2024년8월28일 -이천이십삼년 구월 오일~2023년 9월 5일 +이천이십삼년 구월 오일~2023년9월5일 천구백구십구년십이월삼십일일~1999년12월31일 -이천년 이월 이십구일~2000년 2월 29일 +이천년 이월 이십구일~2000년2월29일 이천십년시월십일~2010년10월10일 이천이십일년유월십육일~2021년6월16일 이천삼십년삼월십사일~2030년3월14일 -천구백팔십팔년 오월 이십일~1988년 5월 20일 -이천일년 칠월 구일~2001년 7월 9일 +천구백팔십팔년 오월 이십일~1988년5월20일 +이천일년 칠월 구일~2001년7월9일 이천십팔년사월삼십일~2018년4월30일 삼천년팔월십오일~3000년8월15일 -이천구년 일월이십일~2009년 1월20일 +이천구년 일월이십일~2009년1월20일 이천삼십오년~2035년 오월~5월 -이십사일~24일 구천구백구십구년삼월일일~9999년3월1일 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt index 8dfc77823..96fbb7005 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt @@ -14,12 +14,4 @@ 사십번째~40번째 오십번째~50번째 오십삼번째~53번째 -백번째~100번째 -한개~1개 -한마리~1마리 -열병~10병 -스물한송이~21송이 -사십그루~40그루 -여섯사람~6사람 -열다섯장~15장 -서른일곱권~37권 \ No newline at end 
of file +백번째~100번째 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt index 450039132..2700b6ccc 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt @@ -6,8 +6,8 @@ 열두시 반~12:30 두시 오초~2:00:05 두시 삼십분 오초~2:30:05 -오전두시~오전2:00 -오후네시반~오후4:30 +오전 두시~오전 2:00 +오후 네시 반~오후 4:30 두시전~2:00 전 두시십분후~2:10 후 한시 십오분 삼십초~1:15:30