From 935647935162403f89bb10a880b204ff7dc43eef Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Mon, 6 Jan 2025 15:52:08 -0800 Subject: [PATCH 1/3] initial commit Signed-off-by: tbartley94 --- .../zh/taggers/cardinal.py | 29 +++++++++++-------- .../zh/taggers/tokenize_and_classify.py | 2 +- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index f3b30238c..d1bc42151 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst, NEMO_CHAR from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path @@ -53,6 +53,7 @@ def __init__(self): # grammar for tens, not the output for Cardinal grammar but for pure Arabic digits (used in other grammars) graph_tens = (ties + graph_digits) | (pynini.cross(pynini.accep("零"), "0") + graph_digits) graph_all = graph_tens | graph_teens | pynutil.insert("00") + graph_all = graph_all.optimize() # grammar for hundreds 百 graph_hundreds_complex = ( @@ -336,7 +337,7 @@ def __init__(self): graph_teens, graph_digits, zero, - ) + ).optimize() # combining grammar; output consists only arabic numbers graph_just_cardinals = pynini.union( @@ -354,29 +355,33 @@ def __init__(self): graph_teens, graph_digits, zero, - ) + ).optimize() # delete unnecessary leading zero delete_leading_zeros = pynutil.delete(pynini.closure("0")) stop_at_non_zero = pynini.difference(NEMO_DIGIT, "0") - rest_of_cardinal = pynini.closure(NEMO_DIGIT) | pynini.closure(NEMO_SIGMA) + rest_of_cardinal = (pynini.closure(NEMO_DIGIT) + pynini.closure(NEMO_CHAR, 1)) | (pynini.closure(NEMO_DIGIT)) # general use cases for other graphs + rest_of_cardinal_2 = (pynini.closure(NEMO_DIGIT) + pynini.closure(NEMO_CHAR, 1)) | (pynini.closure(NEMO_DIGIT, 2)) # for normal cardinal graph # output for cardinal grammar without leading zero clean_cardinal = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal - clean_cardinal = clean_cardinal | "0" - graph = graph @ clean_cardinal # output for regular cardinals - self.for_ordinals = graph # used for ordinal grammars + clean_cardinal_2 = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal_2 + clean_just_cardinal = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal + + # union zero with graph to stop overproduced 0's from the inserts. + # TODO: Rewrite digits graphs so that we don't have free floating zero inserts. + self.for_ordinals = (graph | zero) @ clean_cardinal # used for ordinal grammars + self.for_ordinals = self.for_ordinals.optimize() # output for pure arabic number without leading zero - clean_just_cardinal = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal - clean_just_cardinal = clean_just_cardinal | "0" graph_just_cardinals = graph_just_cardinals @ clean_just_cardinal # output for other grammars - self.just_cardinals = graph_just_cardinals # used for other grammars + self.just_cardinals = graph_just_cardinals | zero # used for other grammars + self.just_cardinals = self.just_cardinals.optimize() # final grammar for cardinal output; tokenization optional_minus_graph = (pynini.closure(pynutil.insert("negative: ") + pynini.cross("负", '"-"'))) | ( pynini.closure(pynutil.insert("negative: ") + pynini.cross("負", '"-"')) ) - final_graph = optional_minus_graph + pynutil.insert('integer: "') + graph + pynutil.insert('"') + final_graph = optional_minus_graph + pynutil.insert('integer: "') + ((graph | zero) @ clean_cardinal_2) + pynutil.insert('"') final_graph = self.add_tokens(final_graph) - self.fst = final_graph + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py index 2877d4160..3de2cf768 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/tokenize_and_classify.py @@ -98,7 +98,7 @@ def __init__( pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) ) - graph = token_plus_punct + pynini.closure(delete_zero_or_one_space + token_plus_punct) + graph = token_plus_punct + pynini.closure(delete_space + token_plus_punct) graph = delete_space + graph + delete_space self.fst = graph.optimize() From c040b9bfa31ea39c3a5f64150e2ac3e31530a066 Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Mon, 6 Jan 2025 15:53:04 -0800 Subject: [PATCH 2/3] initial commit, style fix Signed-off-by: tbartley94 --- .../text_normalization/zh/taggers/cardinal.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index 21437e82f..a0c3b587d 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -35,7 +35,7 @@ def __init__(self, deterministic: bool = True): graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) graph_teen_alt = pynini.string_file(get_abs_path("data/number/teen_alt.tsv")) - alls = NEMO_DIGIT ** 2 | NEMO_DIGIT ** 1 + alls = NEMO_DIGIT**2 | NEMO_DIGIT**1 graph_all = ( (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen_alt | graph_digit ) # graph_all when within a larger number e.g., 316-> 三百一十六 instead of 三百十六 @@ -46,7 +46,7 @@ def __init__(self, deterministic: bool = True): ) # graph_all when at the head of the larger numbere.g., 13万 -> 十三万 instead of 一十三万 graph_all_alt = alls @ graph_all_alt - hundreds = NEMO_DIGIT ** 3 + hundreds = NEMO_DIGIT**3 graph_hundred_component = (graph_digit + pynutil.insert('百')) + pynini.union( pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0') + pynutil.insert('零')) + graph_all), @@ -56,7 +56,7 @@ def __init__(self, deterministic: bool = True): self.digit = graph_digit.optimize() self.all = graph_all.optimize() - thousands = NEMO_DIGIT ** 4 + thousands = NEMO_DIGIT**4 graph_thousand_component = (graph_digit_alt + pynutil.insert('千')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_hundred_component, @@ -64,7 +64,7 @@ def __init__(self, deterministic: bool = True): ) graph_thousand = thousands @ graph_thousand_component - ten_thousands = NEMO_DIGIT ** 5 + ten_thousands = NEMO_DIGIT**5 graph_ten_thousand_component = (graph_digit_alt + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, @@ -73,8 +73,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_thousand = ten_thousands @ graph_ten_thousand_component - hundred_thousands = NEMO_DIGIT ** 6 - hundred_thousands_position = NEMO_DIGIT ** 2 + hundred_thousands = NEMO_DIGIT**6 + hundred_thousands_position = NEMO_DIGIT**2 hundred_thousands_position = hundred_thousands_position @ graph_all_alt graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - millions = NEMO_DIGIT ** 7 - million_position = NEMO_DIGIT ** 3 + millions = NEMO_DIGIT**7 + million_position = NEMO_DIGIT**3 million_position = million_position @ graph_hundred_component graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -95,8 +95,8 @@ def __init__(self, deterministic: bool = True): ) graph_million = millions @ graph_million_component - ten_millions = NEMO_DIGIT ** 8 - ten_million_position = NEMO_DIGIT ** 4 + ten_millions = NEMO_DIGIT**8 + ten_million_position = NEMO_DIGIT**4 ten_million_position = ten_million_position @ graph_thousand_component graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -106,7 +106,7 @@ def __init__(self, deterministic: bool = True): ) graph_ten_million = ten_millions @ graph_ten_million_component - hundred_millions = NEMO_DIGIT ** 9 + hundred_millions = NEMO_DIGIT**9 graph_hundred_million_component = (graph_digit_alt + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -119,8 +119,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_million = hundred_millions @ graph_hundred_million_component - thousand_millions = NEMO_DIGIT ** 10 - thousand_millions_position = NEMO_DIGIT ** 2 + thousand_millions = NEMO_DIGIT**10 + thousand_millions_position = NEMO_DIGIT**2 thousand_millions_position = thousand_millions_position @ graph_all_alt graph_thousand_million_component = (thousand_millions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -134,8 +134,8 @@ def __init__(self, deterministic: bool = True): ) graph_thousand_million = thousand_millions @ graph_thousand_million_component - ten_billions = NEMO_DIGIT ** 11 - ten_billions_position = NEMO_DIGIT ** 3 + ten_billions = NEMO_DIGIT**11 + ten_billions_position = NEMO_DIGIT**3 ten_billions_position = ten_billions_position @ graph_hundred_component graph_ten_billions_component = (ten_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -149,8 +149,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_billions = ten_billions @ graph_ten_billions_component - hundred_billions = NEMO_DIGIT ** 12 - hundred_billions_position = NEMO_DIGIT ** 4 + hundred_billions = NEMO_DIGIT**12 + hundred_billions_position = NEMO_DIGIT**4 hundred_billions_position = hundred_billions_position @ graph_thousand_component graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), From 04c35799d7d9e9b80048d26c15471fc44bdf900c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 00:03:40 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../zh/taggers/cardinal.py | 17 +++++++--- .../text_normalization/zh/taggers/cardinal.py | 34 +++++++++---------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index d1bc42151..d1f2eb29b 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst, NEMO_CHAR +from nemo_text_processing.inverse_text_normalization.zh.graph_utils import NEMO_CHAR, NEMO_DIGIT, NEMO_SIGMA, GraphFst from nemo_text_processing.inverse_text_normalization.zh.utils import get_abs_path @@ -360,8 +360,12 @@ def __init__(self): # delete unnecessary leading zero delete_leading_zeros = pynutil.delete(pynini.closure("0")) stop_at_non_zero = pynini.difference(NEMO_DIGIT, "0") - rest_of_cardinal = (pynini.closure(NEMO_DIGIT) + pynini.closure(NEMO_CHAR, 1)) | (pynini.closure(NEMO_DIGIT)) # general use cases for other graphs - rest_of_cardinal_2 = (pynini.closure(NEMO_DIGIT) + pynini.closure(NEMO_CHAR, 1)) | (pynini.closure(NEMO_DIGIT, 2)) # for normal cardinal graph + rest_of_cardinal = (pynini.closure(NEMO_DIGIT) + pynini.closure(NEMO_CHAR, 1)) | ( + pynini.closure(NEMO_DIGIT) + ) # general use cases for other graphs + rest_of_cardinal_2 = (pynini.closure(NEMO_DIGIT) + pynini.closure(NEMO_CHAR, 1)) | ( + pynini.closure(NEMO_DIGIT, 2) + ) # for normal cardinal graph # output for cardinal grammar without leading zero clean_cardinal = delete_leading_zeros + stop_at_non_zero + rest_of_cardinal @@ -382,6 +386,11 @@ def __init__(self): optional_minus_graph = (pynini.closure(pynutil.insert("negative: ") + pynini.cross("负", '"-"'))) | ( pynini.closure(pynutil.insert("negative: ") + pynini.cross("負", '"-"')) ) - final_graph = optional_minus_graph + pynutil.insert('integer: "') + ((graph | zero) @ clean_cardinal_2) + pynutil.insert('"') + final_graph = ( + optional_minus_graph + + pynutil.insert('integer: "') + + ((graph | zero) @ clean_cardinal_2) + + pynutil.insert('"') + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py index a0c3b587d..21437e82f 100644 --- a/nemo_text_processing/text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/zh/taggers/cardinal.py @@ -35,7 +35,7 @@ def __init__(self, deterministic: bool = True): graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv")) graph_teen_alt = pynini.string_file(get_abs_path("data/number/teen_alt.tsv")) - alls = NEMO_DIGIT**2 | NEMO_DIGIT**1 + alls = NEMO_DIGIT ** 2 | NEMO_DIGIT ** 1 graph_all = ( (graph_ties + (graph_digit | pynutil.delete('0'))) | graph_teen_alt | graph_digit ) # graph_all when within a larger number e.g., 316-> 三百一十六 instead of 三百十六 @@ -46,7 +46,7 @@ def __init__(self, deterministic: bool = True): ) # graph_all when at the head of the larger numbere.g., 13万 -> 十三万 instead of 一十三万 graph_all_alt = alls @ graph_all_alt - hundreds = NEMO_DIGIT**3 + hundreds = NEMO_DIGIT ** 3 graph_hundred_component = (graph_digit + pynutil.insert('百')) + pynini.union( pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0') + pynutil.insert('零')) + graph_all), @@ -56,7 +56,7 @@ def __init__(self, deterministic: bool = True): self.digit = graph_digit.optimize() self.all = graph_all.optimize() - thousands = NEMO_DIGIT**4 + thousands = NEMO_DIGIT ** 4 graph_thousand_component = (graph_digit_alt + pynutil.insert('千')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_hundred_component, @@ -64,7 +64,7 @@ def __init__(self, deterministic: bool = True): ) graph_thousand = thousands @ graph_thousand_component - ten_thousands = NEMO_DIGIT**5 + ten_thousands = NEMO_DIGIT ** 5 graph_ten_thousand_component = (graph_digit_alt + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, @@ -73,8 +73,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_thousand = ten_thousands @ graph_ten_thousand_component - hundred_thousands = NEMO_DIGIT**6 - hundred_thousands_position = NEMO_DIGIT**2 + hundred_thousands = NEMO_DIGIT ** 6 + hundred_thousands_position = NEMO_DIGIT ** 2 hundred_thousands_position = hundred_thousands_position @ graph_all_alt graph_hundred_thousand_component = (hundred_thousands_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -84,8 +84,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component - millions = NEMO_DIGIT**7 - million_position = NEMO_DIGIT**3 + millions = NEMO_DIGIT ** 7 + million_position = NEMO_DIGIT ** 3 million_position = million_position @ graph_hundred_component graph_million_component = (million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -95,8 +95,8 @@ def __init__(self, deterministic: bool = True): ) graph_million = millions @ graph_million_component - ten_millions = NEMO_DIGIT**8 - ten_million_position = NEMO_DIGIT**4 + ten_millions = NEMO_DIGIT ** 8 + ten_million_position = NEMO_DIGIT ** 4 ten_million_position = ten_million_position @ graph_thousand_component graph_ten_million_component = (ten_million_position + pynutil.insert('万')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -106,7 +106,7 @@ def __init__(self, deterministic: bool = True): ) graph_ten_million = ten_millions @ graph_ten_million_component - hundred_millions = NEMO_DIGIT**9 + hundred_millions = NEMO_DIGIT ** 9 graph_hundred_million_component = (graph_digit_alt + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), graph_ten_million_component, @@ -119,8 +119,8 @@ def __init__(self, deterministic: bool = True): ) graph_hundred_million = hundred_millions @ graph_hundred_million_component - thousand_millions = NEMO_DIGIT**10 - thousand_millions_position = NEMO_DIGIT**2 + thousand_millions = NEMO_DIGIT ** 10 + thousand_millions_position = NEMO_DIGIT ** 2 thousand_millions_position = thousand_millions_position @ graph_all_alt graph_thousand_million_component = (thousand_millions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -134,8 +134,8 @@ def __init__(self, deterministic: bool = True): ) graph_thousand_million = thousand_millions @ graph_thousand_million_component - ten_billions = NEMO_DIGIT**11 - ten_billions_position = NEMO_DIGIT**3 + ten_billions = NEMO_DIGIT ** 11 + ten_billions_position = NEMO_DIGIT ** 3 ten_billions_position = ten_billions_position @ graph_hundred_component graph_ten_billions_component = (ten_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')), @@ -149,8 +149,8 @@ def __init__(self, deterministic: bool = True): ) graph_ten_billions = ten_billions @ graph_ten_billions_component - hundred_billions = NEMO_DIGIT**12 - hundred_billions_position = NEMO_DIGIT**4 + hundred_billions = NEMO_DIGIT ** 12 + hundred_billions_position = NEMO_DIGIT ** 4 hundred_billions_position = hundred_billions_position @ graph_thousand_component graph_hundred_billions_component = (hundred_billions_position + pynutil.insert('亿')) + pynini.union( pynini.closure(pynutil.delete('0')),