From 22c6918b9fa4f6ca749107f50db90da37d6a670f Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Tue, 17 Feb 2026 15:20:00 -0800 Subject: [PATCH 1/5] Korean ITN fixes Signed-off-by: Jinwoo Bae --- .../ko/data/whitelist.tsv | 35 ++++++++++++++++++ .../ko/taggers/date.py | 24 +++++++------ .../ko/taggers/decimal.py | 2 +- .../ko/taggers/money.py | 6 ++-- .../ko/taggers/time.py | 26 +++++++------- .../ko/taggers/tokenize_and_classify.py | 18 ++++++---- .../ko/taggers/whitelist.py | 36 +++++++++++++++++++ .../ko/verbalizers/measure.py | 6 ++-- .../ko/verbalizers/verbalize.py | 4 +++ .../ko/verbalizers/whitelist.py | 32 +++++++++++++++++ .../test_cases_date.txt | 11 +++--- .../test_cases_ordinal.txt | 10 +----- .../test_cases_time.txt | 4 +-- 13 files changed, 161 insertions(+), 53 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv new file mode 100644 index 000000000..5ff302fb6 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/whitelist.tsv @@ -0,0 +1,35 @@ +박사 Dr. +박사 dr. +씨 Mr. +씨 mr. +양 Ms. +양 ms. +여사 Mrs. +여사 mrs. +산 mt. +산 Mt. +교수 Prof. +교수 prof. +시니어 sr. +시니어 Sr. +주니어 jr. +주니어 Jr. +대로 Ave. +대로 ave. +번호 no. +번호 No. +왼쪽 괄호 ( +오른쪽 괄호 ) +더하기 + +마이너스 - +시그마 Σ +에타 η +카파 κ +오메가 ω +시그마 σ +알파 α +뉴 ν +델타 δ +이오타 ι +박사학위 Ph.D. +등 etc. \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py index b9de5c299..78d7cf6b7 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py @@ -34,27 +34,31 @@ def __init__(self, cardinal: GraphFst): cardinal = cardinal.just_cardinals month = pynini.string_file(get_abs_path("data/months.tsv")) - spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) - year_suffix = pynini.cross("년", "") month_suffix = pynini.cross("월", "") day_suffix = pynini.cross("일", "") + + delete_space = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1) + between_fields = delete_space + pynutil.insert(NEMO_SPACE) year_component = ( - pynutil.insert("year: \"") + cardinal + pynini.closure(year_suffix, 0, 1) + pynutil.insert("\"") + pynutil.insert("year: \"") + cardinal + year_suffix + pynutil.insert("\"") ) - month_component = ( - pynutil.insert("month: \"") + spacing + month + pynini.closure(month_suffix, 0, 1) + pynutil.insert("\"") + pynutil.insert("month: \"") + month + month_suffix + pynutil.insert("\"") + ) + day_component = ( + pynutil.insert("day: \"") + cardinal + day_suffix + pynutil.insert("\"") ) - day_component = pynutil.insert("day: \"") + spacing + cardinal + day_suffix + spacing + pynutil.insert("\"") + graph_component = year_component | month_component - graph_component = year_component | month_component | day_component graph_date = ( - pynini.closure(year_component, 0, 1) - + pynini.closure((pynutil.insert(NEMO_SPACE)) + month_component, 0, 1) - + pynini.closure((pynutil.insert(NEMO_SPACE)) + day_component, 0, 1) + year_component + | month_component + | (year_component + between_fields + month_component) + | (month_component + between_fields + day_component) + | (year_component + between_fields + month_component + between_fields + day_component) ) final_graph = graph_component | graph_date diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index ecb92df1d..dcf4cede4 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -41,7 +41,7 @@ def __init__(self, cardinal: GraphFst): cardinals = cardinal.just_cardinals graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - decimal_part = pynini.closure(graph_zero | graph_digit) + decimal_part = (graph_zero | graph_digit) + pynini.closure(graph_zero | graph_digit) decimal_point = pynutil.delete("점") integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index f890e477e..74bfe7562 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -28,16 +28,18 @@ class MoneyFst(GraphFst): cardinal: CardinalFst """ - def __init__(self, cardinal: GraphFst): + def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="money", kind="classify") cardinals = cardinal.just_cardinals + decimals = decimal.just_decimal currency = pynini.string_file(get_abs_path("data/currency.tsv")) # Accepting space if there are one between integer and currency spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) - graph_integer = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") + spacing + number = decimals | cardinals + graph_integer = pynutil.insert("integer_part: \"") + number + pynutil.insert("\"") + spacing graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py index e99a1b083..d6697340c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -73,31 +73,29 @@ def __init__(self): pynutil.insert("seconds: \"") + (graph_0_to_59 + spacing + second_suffix) + pynutil.insert("\"") ) - hour = pynini.closure(hour_component, 0, 1) - minute = pynini.closure(delete_space + minute_component, 0, 1) - second = pynini.closure(delete_space + second_component, 0, 1) + hm_opt = pynini.closure(delete_space + minute_component, 0, 1) + hs_opt = pynini.closure(delete_space + second_component, 0, 1) - graph_regular = hour + minute + second + hms = hour_component + hm_opt + hs_opt + ms = minute_component + pynini.closure(delete_space + second_component, 0, 1) + s_only = second_component - # 오전 = AM, 오후 = PM - prefix_words = ( - pynini.union( - (pynini.accep("오전")), (pynini.accep("오후")), (pynini.accep("새벽")), (pynini.accep("아침")) - ) - + spacing - ) - prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"") + graph_regular = pynini.union(hms, ms, s_only).optimize() + # 오전 = AM, 오후 = PM + ampm_words = pynini.union("오전", "오후", "새벽", "아침") + ampm_tag = pynutil.insert('suffix: "') + ampm_words + pynutil.insert('"') + # 전 = before, 후 = after suffix_words = pynini.accep("전") | pynini.accep("후") suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"") time_graph = ( - pynini.closure(delete_space + prefix_tag, 0, 1) + pynini.closure(delete_space + ampm_tag, 0, 1) + graph_regular + pynini.closure(delete_space + suffix_tag, 0, 1) ) - + # Adding cardinal graph to prevent processing out of range numbers final_graph = time_graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 3620f07ac..abcbd2745 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -18,7 +18,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main, NEMO_WHITE_SPACE, delete_extra_space, delete_space from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst @@ -29,7 +29,7 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst - +from nemo_text_processing.inverse_text_normalization.ko.taggers.whitelist import WhiteListFst class ClassifyFst(GraphFst): """ @@ -56,7 +56,7 @@ def __init__( far_file = None if cache_dir is not None and cache_dir != "None": os.makedirs(cache_dir, exist_ok=True) - far_file = os.path.join(cache_dir, f"jp_itn_{input_case}.far") + far_file = os.path.join(cache_dir, f"ko_itn_{input_case}_tokenize.far") if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] logging.info(f"ClassifyFst.fst was restored from {far_file}.") @@ -81,7 +81,7 @@ def __init__( date = DateFst(cardinal) date_graph = date.fst - money = MoneyFst(cardinal) + money = MoneyFst(cardinal, decimal) money_graph = money.fst telephone = TelephoneFst() @@ -91,6 +91,7 @@ def __init__( measure_graph = measure.fst word_graph = WordFst().fst + whitelist_graph = WhiteListFst().fst classify = ( pynutil.add_weight(cardinal_graph, 1.1) @@ -103,12 +104,15 @@ def __init__( | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(word_graph, 100) + | pynutil.add_weight(whitelist_graph, 1.01) ) - token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") - tagger = pynini.closure(token, 1) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + space = NEMO_WHITE_SPACE @ delete_extra_space + space_opt = pynini.closure(space, 0, 1) - self.fst = tagger + graph = delete_space + token + pynini.closure(space_opt + token) + delete_space + self.fst = graph.optimize() if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py new file mode 100644 index 000000000..fd443c287 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/whitelist.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens + This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv". + """ + + def __init__(self): + super().__init__(name="whitelist", kind="classify") + + whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")) + graph = (pynutil.insert('name: "')) + (whitelist) + pynutil.insert('"') + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py index 449789448..cedf4703d 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/measure.py @@ -68,11 +68,13 @@ def __init__(self): graph_fraction = ( pynutil.delete("fraction {") + delete_space - + pynutil.delete('denominator: "') + + optional_sign + + delete_space + + pynutil.delete('numerator: "') + measurement + pynutil.delete('"') + delete_space - + pynutil.delete('numerator: "') + + pynutil.delete('denominator: "') + pynutil.insert("/") + measurement + pynutil.delete('"') diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 10aae347e..25f832442 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -26,6 +26,7 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.whitelist import WhiteListFst class VerbalizeFst(GraphFst): @@ -67,6 +68,8 @@ def __init__(self): word = WordFst() word_graph = word.fst + whitelist_graph = WhiteListFst().fst + graph = pynini.union( cardinal_graph, ordinal_graph, @@ -78,5 +81,6 @@ def __init__(self): telephone_graph, measure_graph, word_graph, + whitelist_graph, ) self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py new file mode 100644 index 000000000..9a66518ac --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_NOT_QUOTE + + +class WhiteListFst(GraphFst): + ''' + tokens { whitelist: "ATM" } -> A T M + ''' + + def __init__(self, deterministic: bool = True, lm: bool = False): + super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic) + + whitelist = pynutil.delete("name: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + graph = whitelist + self.fst = graph.optimize() diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt index ecad6dc19..89fa75eb6 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_date.txt @@ -1,16 +1,15 @@ 이천이십사년팔월이십팔일~2024년8월28일 -이천이십삼년 구월 오일~2023년 9월 5일 +이천이십삼년 구월 오일~2023년9월5일 천구백구십구년십이월삼십일일~1999년12월31일 -이천년 이월 이십구일~2000년 2월 29일 +이천년 이월 이십구일~2000년2월29일 이천십년시월십일~2010년10월10일 이천이십일년유월십육일~2021년6월16일 이천삼십년삼월십사일~2030년3월14일 -천구백팔십팔년 오월 이십일~1988년 5월 20일 -이천일년 칠월 구일~2001년 7월 9일 +천구백팔십팔년 오월 이십일~1988년5월20일 +이천일년 칠월 구일~2001년7월9일 이천십팔년사월삼십일~2018년4월30일 삼천년팔월십오일~3000년8월15일 -이천구년 일월이십일~2009년 1월20일 +이천구년 일월이십일~2009년1월20일 이천삼십오년~2035년 오월~5월 -이십사일~24일 구천구백구십구년삼월일일~9999년3월1일 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt index 8dfc77823..96fbb7005 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_ordinal.txt @@ -14,12 +14,4 @@ 사십번째~40번째 오십번째~50번째 오십삼번째~53번째 -백번째~100번째 -한개~1개 -한마리~1마리 -열병~10병 -스물한송이~21송이 -사십그루~40그루 -여섯사람~6사람 -열다섯장~15장 -서른일곱권~37권 \ No newline at end of file +백번째~100번째 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt index 450039132..2700b6ccc 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_time.txt @@ -6,8 +6,8 @@ 열두시 반~12:30 두시 오초~2:00:05 두시 삼십분 오초~2:30:05 -오전두시~오전2:00 -오후네시반~오후4:30 +오전 두시~오전 2:00 +오후 네시 반~오후 4:30 두시전~2:00 전 두시십분후~2:10 후 한시 십오분 삼십초~1:15:30 From eb78144869352da962650f69d85d7a01358fc441 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Feb 2026 23:32:07 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/ko/taggers/date.py | 14 ++++---------- .../inverse_text_normalization/ko/taggers/time.py | 4 ++-- .../ko/taggers/tokenize_and_classify.py | 12 ++++++++++-- .../ko/verbalizers/verbalize.py | 4 ++-- .../ko/verbalizers/whitelist.py | 2 +- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py index 78d7cf6b7..b02e80984 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/date.py @@ -37,19 +37,13 @@ def __init__(self, cardinal: GraphFst): year_suffix = pynini.cross("년", "") month_suffix = pynini.cross("월", "") day_suffix = pynini.cross("일", "") - + delete_space = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1) between_fields = delete_space + pynutil.insert(NEMO_SPACE) - year_component = ( - pynutil.insert("year: \"") + cardinal + year_suffix + pynutil.insert("\"") - ) - month_component = ( - pynutil.insert("month: \"") + month + month_suffix + pynutil.insert("\"") - ) - day_component = ( - pynutil.insert("day: \"") + cardinal + day_suffix + pynutil.insert("\"") - ) + year_component = pynutil.insert("year: \"") + cardinal + year_suffix + pynutil.insert("\"") + month_component = pynutil.insert("month: \"") + month + month_suffix + pynutil.insert("\"") + day_component = pynutil.insert("day: \"") + cardinal + day_suffix + pynutil.insert("\"") graph_component = year_component | month_component diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py index d6697340c..205737043 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/time.py @@ -85,7 +85,7 @@ def __init__(self): # 오전 = AM, 오후 = PM ampm_words = pynini.union("오전", "오후", "새벽", "아침") ampm_tag = pynutil.insert('suffix: "') + ampm_words + pynutil.insert('"') - + # 전 = before, 후 = after suffix_words = pynini.accep("전") | pynini.accep("후") suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"") @@ -95,7 +95,7 @@ def __init__(self): + graph_regular + pynini.closure(delete_space + suffix_tag, 0, 1) ) - + # Adding cardinal graph to prevent processing out of range numbers final_graph = time_graph diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index abcbd2745..8e5d39c85 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -18,7 +18,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main, NEMO_WHITE_SPACE, delete_extra_space, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + INPUT_LOWER_CASED, + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst @@ -28,8 +35,9 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst + class ClassifyFst(GraphFst): """ diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 25f832442..cdfac05d0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -25,8 +25,8 @@ from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst class VerbalizeFst(GraphFst): @@ -69,7 +69,7 @@ def __init__(self): word_graph = word.fst whitelist_graph = WhiteListFst().fst - + graph = pynini.union( cardinal_graph, ordinal_graph, diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py index 9a66518ac..395423017 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/whitelist.py @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, NEMO_NOT_QUOTE +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst class WhiteListFst(GraphFst): From b40c2bbfe338853be92ac5be537bdbb9294e23dd Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Wed, 18 Feb 2026 16:01:38 -0800 Subject: [PATCH 3/5] Fix KO ITN decimal and money graph cleanup Signed-off-by: Jinwoo Bae --- .../inverse_text_normalization/ko/taggers/decimal.py | 2 +- .../inverse_text_normalization/ko/taggers/money.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index dcf4cede4..6fec26164 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -41,7 +41,7 @@ def __init__(self, cardinal: GraphFst): cardinals = cardinal.just_cardinals graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - decimal_part = (graph_zero | graph_digit) + pynini.closure(graph_zero | graph_digit) + decimal_part = pynini.closure(graph_zero | graph_digit, 1) decimal_point = pynutil.delete("점") integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index 74bfe7562..4cdbb6e21 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -37,10 +37,12 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # Accepting space if there are one between integer and currency spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) - - number = decimals | cardinals - graph_integer = pynutil.insert("integer_part: \"") + number + pynutil.insert("\"") + spacing - + graph_integer = ( + pynutil.insert("integer_part: \"") + + (decimals | cardinals) + + pynutil.insert("\"") + + spacing + ) graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"") graph_final = graph_integer + graph_unit From 45a54fa463120129949c0033a1c8358c45c69d30 Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Fri, 20 Feb 2026 09:09:54 -0800 Subject: [PATCH 4/5] Fix KO ITN decimal-money ambiguity Signed-off-by: Jinwoo Bae --- .../inverse_text_normalization/ko/taggers/decimal.py | 2 +- .../inverse_text_normalization/ko/taggers/money.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py index 6fec26164..a51596e94 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/decimal.py @@ -55,7 +55,7 @@ def __init__(self, cardinal: GraphFst): ) # If decimal is used to express big numbers like 15000 -> "1.5만" self.decimal = graph_decimal_regular | graph_deicimal_larger - self.just_decimal = cardinals + pynini.cross("점", ".") + decimal_part + self.just_decimal = cardinals | (cardinals + pynini.cross("점", ".") + decimal_part) graph_sign = ( pynutil.insert("negative: \"") + (pynini.cross("마이너스", "-") | pynini.accep("-")) + pynutil.insert("\"") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index 4cdbb6e21..72506615d 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -38,11 +38,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # Accepting space if there are one between integer and currency spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) graph_integer = ( - pynutil.insert("integer_part: \"") - + (decimals | cardinals) - + pynutil.insert("\"") + pynutil.insert('integer_part: "') + + decimals + + pynutil.insert('"') + spacing - ) + ) graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"") graph_final = graph_integer + graph_unit From 15fc35663f746426891fecd1517004abcfefae2b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Feb 2026 19:30:12 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/ko/taggers/money.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py index 72506615d..719fa168f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/money.py @@ -37,12 +37,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # Accepting space if there are one between integer and currency spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1) - graph_integer = ( - pynutil.insert('integer_part: "') - + decimals - + pynutil.insert('"') - + spacing - ) + graph_integer = pynutil.insert('integer_part: "') + decimals + pynutil.insert('"') + spacing graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"") graph_final = graph_integer + graph_unit