Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
박사 Dr.
박사 dr.
씨 Mr.
씨 mr.
양 Ms.
양 ms.
여사 Mrs.
여사 mrs.
산 mt.
산 Mt.
교수 Prof.
교수 prof.
시니어 sr.
시니어 Sr.
주니어 jr.
주니어 Jr.
대로 Ave.
대로 ave.
번호 no.
번호 No.
왼쪽 괄호 (
오른쪽 괄호 )
더하기 +
마이너스 -
시그마 Σ
에타 η
카파 κ
오메가 ω
시그마 σ
알파 α
뉴 ν
델타 δ
이오타 ι
박사학위 Ph.D.
등 etc.
24 changes: 11 additions & 13 deletions nemo_text_processing/inverse_text_normalization/ko/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,27 +34,25 @@ def __init__(self, cardinal: GraphFst):
cardinal = cardinal.just_cardinals
month = pynini.string_file(get_abs_path("data/months.tsv"))

spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)

year_suffix = pynini.cross("년", "")
month_suffix = pynini.cross("월", "")
day_suffix = pynini.cross("일", "")

year_component = (
pynutil.insert("year: \"") + cardinal + pynini.closure(year_suffix, 0, 1) + pynutil.insert("\"")
)
delete_space = pynini.closure(pynutil.delete(NEMO_SPACE), 0, 1)
between_fields = delete_space + pynutil.insert(NEMO_SPACE)

month_component = (
pynutil.insert("month: \"") + spacing + month + pynini.closure(month_suffix, 0, 1) + pynutil.insert("\"")
)
year_component = pynutil.insert("year: \"") + cardinal + year_suffix + pynutil.insert("\"")
month_component = pynutil.insert("month: \"") + month + month_suffix + pynutil.insert("\"")
day_component = pynutil.insert("day: \"") + cardinal + day_suffix + pynutil.insert("\"")

day_component = pynutil.insert("day: \"") + spacing + cardinal + day_suffix + spacing + pynutil.insert("\"")
graph_component = year_component | month_component

graph_component = year_component | month_component | day_component
graph_date = (
pynini.closure(year_component, 0, 1)
+ pynini.closure((pynutil.insert(NEMO_SPACE)) + month_component, 0, 1)
+ pynini.closure((pynutil.insert(NEMO_SPACE)) + day_component, 0, 1)
year_component
| month_component
| (year_component + between_fields + month_component)
| (month_component + between_fields + day_component)
| (year_component + between_fields + month_component + between_fields + day_component)
)

final_graph = graph_component | graph_date
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, cardinal: GraphFst):
cardinals = cardinal.just_cardinals
graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
decimal_part = pynini.closure(graph_zero | graph_digit)
decimal_part = pynini.closure(graph_zero | graph_digit, 1)

decimal_point = pynutil.delete("점")
integer_part = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"")
Expand All @@ -55,7 +55,7 @@ def __init__(self, cardinal: GraphFst):
) # If decimal is used to express big numbers like 15000 -> "1.5만"

self.decimal = graph_decimal_regular | graph_deicimal_larger
self.just_decimal = cardinals + pynini.cross("점", ".") + decimal_part
self.just_decimal = cardinals | (cardinals + pynini.cross("점", ".") + decimal_part)

graph_sign = (
pynutil.insert("negative: \"") + (pynini.cross("마이너스", "-") | pynini.accep("-")) + pynutil.insert("\"")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,16 @@ class MoneyFst(GraphFst):
cardinal: CardinalFst
"""

def __init__(self, cardinal: GraphFst):
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
super().__init__(name="money", kind="classify")

cardinals = cardinal.just_cardinals
decimals = decimal.just_decimal
currency = pynini.string_file(get_abs_path("data/currency.tsv"))

# Accept an optional space between the amount and the currency, if one is present
spacing = pynini.closure(pynini.accep(NEMO_SPACE), 0, 1)

graph_integer = pynutil.insert("integer_part: \"") + cardinals + pynutil.insert("\"") + spacing

graph_integer = pynutil.insert('integer_part: "') + decimals + pynutil.insert('"') + spacing
graph_unit = pynutil.insert(" currency: \"") + currency + pynutil.insert("\"")

graph_final = graph_integer + graph_unit
Expand Down
22 changes: 10 additions & 12 deletions nemo_text_processing/inverse_text_normalization/ko/taggers/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,27 +73,25 @@ def __init__(self):
pynutil.insert("seconds: \"") + (graph_0_to_59 + spacing + second_suffix) + pynutil.insert("\"")
)

hour = pynini.closure(hour_component, 0, 1)
minute = pynini.closure(delete_space + minute_component, 0, 1)
second = pynini.closure(delete_space + second_component, 0, 1)
hm_opt = pynini.closure(delete_space + minute_component, 0, 1)
hs_opt = pynini.closure(delete_space + second_component, 0, 1)

graph_regular = hour + minute + second
hms = hour_component + hm_opt + hs_opt
ms = minute_component + pynini.closure(delete_space + second_component, 0, 1)
s_only = second_component

graph_regular = pynini.union(hms, ms, s_only).optimize()

# 오전 = AM, 오후 = PM
prefix_words = (
pynini.union(
(pynini.accep("오전")), (pynini.accep("오후")), (pynini.accep("새벽")), (pynini.accep("아침"))
)
+ spacing
)
prefix_tag = pynutil.insert("prefix: \"") + prefix_words + pynutil.insert("\"")
ampm_words = pynini.union("오전", "오후", "새벽", "아침")
ampm_tag = pynutil.insert('suffix: "') + ampm_words + pynutil.insert('"')

# 전 = before, 후 = after
suffix_words = pynini.accep("전") | pynini.accep("후")
suffix_tag = pynutil.insert("suffix: \"") + suffix_words + pynutil.insert("\"")

time_graph = (
pynini.closure(delete_space + prefix_tag, 0, 1)
pynini.closure(delete_space + ampm_tag, 0, 1)
+ graph_regular
+ pynini.closure(delete_space + suffix_tag, 0, 1)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,14 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main
from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
INPUT_LOWER_CASED,
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.ko.taggers.date import DateFst
from nemo_text_processing.inverse_text_normalization.ko.taggers.decimal import DecimalFst
Expand All @@ -28,6 +35,7 @@
from nemo_text_processing.inverse_text_normalization.ko.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.ko.taggers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.ko.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.ko.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst


Expand Down Expand Up @@ -56,7 +64,7 @@ def __init__(
far_file = None
if cache_dir is not None and cache_dir != "None":
os.makedirs(cache_dir, exist_ok=True)
far_file = os.path.join(cache_dir, f"jp_itn_{input_case}.far")
far_file = os.path.join(cache_dir, f"ko_itn_{input_case}_tokenize.far")
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
logging.info(f"ClassifyFst.fst was restored from {far_file}.")
Expand All @@ -81,7 +89,7 @@ def __init__(
date = DateFst(cardinal)
date_graph = date.fst

money = MoneyFst(cardinal)
money = MoneyFst(cardinal, decimal)
money_graph = money.fst

telephone = TelephoneFst()
Expand All @@ -91,6 +99,7 @@ def __init__(
measure_graph = measure.fst

word_graph = WordFst().fst
whitelist_graph = WhiteListFst().fst

classify = (
pynutil.add_weight(cardinal_graph, 1.1)
Expand All @@ -103,12 +112,15 @@ def __init__(
| pynutil.add_weight(telephone_graph, 1.1)
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
| pynutil.add_weight(whitelist_graph, 1.01)
)

token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ")
tagger = pynini.closure(token, 1)
token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
space = NEMO_WHITE_SPACE @ delete_extra_space
space_opt = pynini.closure(space, 0, 1)

self.fst = tagger
graph = delete_space + token + pynini.closure(space_opt + token) + delete_space
self.fst = graph.optimize()

if far_file:
generator_main(far_file, {"tokenize_and_classify": self.fst})
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Copyright 2015 and onwards Google, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path


class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelisted tokens.

    The entries are loaded from "data/whitelist.tsv" via pynini.string_file and the
    matched output is wrapped as:
        name: "<output from whitelist.tsv>"

    NOTE(review): this grammar is intended to take priority over the other classifier
    grammars; the actual priority is set by the weight assigned where the classifiers
    are combined, not here -- confirm against that caller.
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")

        # Transducer built from the TSV file of whitelisted entries.
        entries = pynini.string_file(get_abs_path("data/whitelist.tsv"))

        # Surround the whitelist output with the `name: "..."` field markers.
        open_field = pynutil.insert('name: "')
        close_field = pynutil.insert('"')
        tagged = open_field + entries + close_field

        self.fst = tagged.optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,13 @@ def __init__(self):
graph_fraction = (
pynutil.delete("fraction {")
+ delete_space
+ pynutil.delete('denominator: "')
+ optional_sign
+ delete_space
+ pynutil.delete('numerator: "')
+ measurement
+ pynutil.delete('"')
+ delete_space
+ pynutil.delete('numerator: "')
+ pynutil.delete('denominator: "')
+ pynutil.insert("/")
+ measurement
+ pynutil.delete('"')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.telephone import TelephoneFst
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst


Expand Down Expand Up @@ -67,6 +68,8 @@ def __init__(self):
word = WordFst()
word_graph = word.fst

whitelist_graph = WhiteListFst().fst

graph = pynini.union(
cardinal_graph,
ordinal_graph,
Expand All @@ -78,5 +81,6 @@ def __init__(self):
telephone_graph,
measure_graph,
word_graph,
whitelist_graph,
)
self.fst = graph
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst


class WhiteListFst(GraphFst):
    """
    Finite state transducer for verbalizing whitelisted tokens, e.g.
        name: "Dr." -> Dr.

    The leading `name: "` and the trailing `"` are deleted; the run of
    NEMO_NOT_QUOTE symbols between them (possibly empty) is passed through.

    Args:
        deterministic: forwarded unchanged to GraphFst.__init__
        lm: accepted for interface compatibility; not used by this grammar
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="whitelist", kind="verbalize", deterministic=deterministic)

        # Strip the field wrapper and keep only the quoted payload.
        strip_open = pynutil.delete("name: \"")
        payload = pynini.closure(NEMO_NOT_QUOTE)
        strip_close = pynutil.delete("\"")

        self.fst = (strip_open + payload + strip_close).optimize()
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
이천이십사년팔월이십팔일~2024년8월28일
이천이십삼년 구월 오일~2023년 9월 5일
이천이십삼년 구월 오일~2023년9월5일
천구백구십구년십이월삼십일일~1999년12월31일
이천년 이월 이십구일~2000년 2월 29일
이천년 이월 이십구일~2000년2월29일
이천십년시월십일~2010년10월10일
이천이십일년유월십육일~2021년6월16일
이천삼십년삼월십사일~2030년3월14일
천구백팔십팔년 오월 이십일~1988년 5월 20일
이천일년 칠월 구일~2001년 7월 9일
천구백팔십팔년 오월 이십일~1988년5월20일
이천일년 칠월 구일~2001년7월9일
이천십팔년사월삼십일~2018년4월30일
삼천년팔월십오일~3000년8월15일
이천구년 일월이십일~2009년 1월20일
이천구년 일월이십일~2009년1월20일
이천삼십오년~2035년
오월~5월
이십사일~24일
구천구백구십구년삼월일일~9999년3월1일
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,4 @@
사십번째~40번째
오십번째~50번째
오십삼번째~53번째
백번째~100번째
한개~1개
한마리~1마리
열병~10병
스물한송이~21송이
사십그루~40그루
여섯사람~6사람
열다섯장~15장
서른일곱권~37권
백번째~100번째
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
열두시 반~12:30
두시 오초~2:00:05
두시 삼십분 오초~2:30:05
오전두시~오전2:00
오후네시반~오후4:30
오전 두시~오전 2:00
오후 네시 반~오후 4:30
두시전~2:00 전
두시십분후~2:10 후
한시 십오분 삼십초~1:15:30
Expand Down