diff --git a/Jenkinsfile b/Jenkinsfile index 6edad14a2..3e0140224 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-15-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/inverse_text_normalization/hy/utils.py b/nemo_text_processing/inverse_text_normalization/hy/utils.py index 1f1349115..dbae6cfe1 100644 --- a/nemo_text_processing/inverse_text_normalization/hy/utils.py +++ b/nemo_text_processing/inverse_text_normalization/hy/utils.py @@ -22,7 +22,6 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/inverse_text_normalization/mr/utils.py b/nemo_text_processing/inverse_text_normalization/mr/utils.py index 1f1349115..dbae6cfe1 100644 --- a/nemo_text_processing/inverse_text_normalization/mr/utils.py +++ b/nemo_text_processing/inverse_text_normalization/mr/utils.py @@ -22,7 +22,6 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path diff --git a/nemo_text_processing/text_normalization/en/data/money/per_unit.tsv b/nemo_text_processing/text_normalization/en/data/money/per_unit.tsv index 40804ca56..8562f12de 100644 --- a/nemo_text_processing/text_normalization/en/data/money/per_unit.tsv +++ b/nemo_text_processing/text_normalization/en/data/money/per_unit.tsv @@ -67,4 +67,4 @@ /shift per shift /project per project /class per class -/session per session \ No newline at end of file +/session per session diff --git a/nemo_text_processing/text_normalization/fr/data/whitelist.tsv b/nemo_text_processing/text_normalization/fr/data/whitelist.tsv index dc563bdab..e3cbaaf6e 100644 --- a/nemo_text_processing/text_normalization/fr/data/whitelist.tsv +++ b/nemo_text_processing/text_normalization/fr/data/whitelist.tsv @@ -10,4 +10,4 @@ apr. J.-C. après jésus-christ av. J.-C. avant Jésus-Christ le hon. l’honorable le très hon. le très hononrable -% pour cent \ No newline at end of file +% pour cent diff --git a/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv new file mode 100644 index 000000000..7da791489 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv @@ -0,0 +1,3 @@ +ई. पू. ईसा पूर्व +ई. 
ईसवी +तक तक \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 0bf561379..189512687 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -141,14 +141,16 @@ month महीना months महीने ct कैरेट pH पीएच +km/h किलोमीटर प्रति घंटा km/hr किलोमीटर प्रति घंटा km/min किलोमीटर प्रति मिनट +m/h मीटर प्रति घंटा m/hr मीटर प्रति घंटा mi/s मील प्रति सेकंड +mi/h मील प्रति घंटा mi/hr मील प्रति घंटा mi/min मील प्रति मिनट ₹/ac रुपए प्रति एकड़ x बाई X बाई * बाई -- से diff --git a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv index 88633ec7c..8f4a955cc 100644 --- a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv +++ b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv @@ -1,5 +1,4 @@ ₹ रुपए -P पैसे £ पाउंड ₩ वॉन $ डॉलर @@ -7,4 +6,4 @@ $ डॉलर ৳ टका ¥ येन ₦ नाइरा -€ यूरो +€ यूरो \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv new file mode 100644 index 000000000..cf62891d1 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv @@ -0,0 +1,9 @@ +रुपए पैसे +पाउंड पेंस +वॉन जिओन +डॉलर सेंट +लीरा कुरस +टका पैसे +येन सेन +नाइरा कोबो +यूरो सेंट diff --git a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv index d5e85a784..dd8623284 100644 --- a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv +++ b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv @@ -1,3 +1,4 @@ +० शून्य १ एक २ दो ३ तीन diff --git a/nemo_text_processing/text_normalization/hi/graph_utils.py b/nemo_text_processing/text_normalization/hi/graph_utils.py index ced1b8949..37b145918 100644 --- a/nemo_text_processing/text_normalization/hi/graph_utils.py +++ b/nemo_text_processing/text_normalization/hi/graph_utils.py @@ -21,6 +21,7 @@ import pynini from pynini import Far +from pynini.examples import plurals from pynini.export import export from pynini.lib import byte, pynutil, utf8 @@ -99,6 +100,30 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): logging.info(f'Created {file_name}') +def get_plurals(fst): + """ + Given singular returns plurals + + Args: + fst: Fst + + Returns plurals to given singular forms + """ + return SINGULAR_TO_PLURAL @ fst + + +def get_singulars(fst): + """ + Given plural returns singulars + + Args: + fst: Fst + + Returns singulars to given plural forms + """ + return PLURAL_TO_SINGULAR @ fst + + def convert_space(fst) -> 'pynini.FstLike': """ Converts space to nonbreaking space. 
@@ -113,6 +138,44 @@ def convert_space(fst) -> 'pynini.FstLike': return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # add pairs with all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. "BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logging.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + class GraphFst: """ Base class for all grammar fsts. diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 19aaf3139..1a7b0f97e 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -14,7 +14,6 @@ import pynini from pynini.lib import pynutil - from nemo_text_processing.text_normalization.hi.graph_utils import ( NEMO_HI_DIGIT, NEMO_HI_NON_ZERO, @@ -26,6 +25,7 @@ days = pynini.string_file(get_abs_path("data/date/days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) +year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) class DateFst(GraphFst): @@ -62,12 +62,17 @@ def __init__(self, cardinal: GraphFst): years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - graph_dd_mm = days_graph + delete_dash + months_graph + graph_dd_mm = days_graph + (delete_dash | pynini.accep("")) + months_graph - graph_mm_dd = months_graph + delete_dash + days_graph + graph_mm_dd = months_graph + (delete_dash | pynini.accep("")) + days_graph graph_mm_dd += pynutil.insert(" preserve_order: true ") + # Graph for era + era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + + range_graph = pynini.cross("-", "से") + graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph ) @@ -78,7 +83,22 @@ graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = months_graph + delete_dash + years_graph + graph_mm_yyyy = ( + months_graph + (delete_dash | pynini.accep("")) + years_graph + pynutil.insert(" preserve_order: true ") + ) + + graph_year_suffix = era_graph + + graph_range = ( + pynutil.insert("text: \"") + + (cardinal.final_graph | graph_year) + + insert_space + + range_graph + + insert_space + + (cardinal.final_graph | graph_year) + + pynutil.insert("\"") + + pynutil.insert(" preserve_order: true ") + ) # default assume dd_mm_yyyy @@ -88,6 +108,8 @@ | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) |
graph_mm_dd_yyyy | graph_mm_yyyy + | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_range, -0.005) ) self.final_graph = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 7434fd70f..a8cc3fad3 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -44,7 +44,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) # Define the unit handling - self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + + # Handling symbols like x, X, * + symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),]) graph_measurements = ( pynutil.insert("decimal { ") + optional_graph_negative + decimal_graph + pynutil.insert(" }") + delete_space - + self.unit + + unit ) + graph_measurements |= ( pynutil.insert("cardinal { ") + optional_graph_negative + pynutil.insert("integer: \"") + cardinal_graph + pynutil.insert("\"") + pynutil.insert(" }") + delete_space - + self.unit + + unit ) + + # Handle a cardinal combined with a symbol as a single token + graph_measurements |= ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynutil.insert(" units: \"") + + symbol_graph + + pynutil.insert("\" ") + + pynutil.insert("} }") + + insert_space + + pynutil.insert("tokens { cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") ) graph = graph_measurements diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index c44d6d346..3de3017ed 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -24,9 +24,11 @@ class MoneyFst(GraphFst): """ Finite state transducer for classifying money, suppletive aware, e.g.
- ₹1 -> money { currency: "रुपए" integer_part: "एक" } - ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" } - + ₹५० -> money { currency_maj: "रुपए" integer_part: "पचास" } + ₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" } + ₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } + Note that the 'centiles' string is a placeholder that the verbalizer replaces with the corresponding minor currency denomination + Args: cardinal: CardinalFst decimal: DecimalFst for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self, cardinal: GraphFst): super().__init__(name="money", kind="classify") cardinal_graph = cardinal.final_graph - optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, - ) - self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") - self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ") - self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ") - - graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger - graph_currencies |= ( - optional_graph_negative - + self.currency - + insert_space - + self.interger - + pynutil.delete(".") - + insert_space - + self.fraction + currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"') + integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"') + currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"') + + graph_major_only = currency_major + insert_space + integer + graph_major_and_minor = ( + currency_major + insert_space + integer + pynini.cross(".", " ") + fraction + insert_space + currency_minor ) - graph = graph_currencies - self.graph = graph.optimize() + + graph_currencies = graph_major_only | graph_major_and_minor + + graph = graph_currencies.optimize() final_graph = self.add_tokens(graph) self.fst = final_graph diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 48ee97ef3..bdec90c06 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -43,7 +43,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input.
deterministic: if True will provide a single transduction option, @@ -68,11 +68,11 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far" + cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - logging.info(f'ClassifyFst.fst was restored from {far_file}.') + logging.info(f"ClassifyFst.fst was restored from {far_file}.") else: logging.info(f"Creating ClassifyFst grammars.") @@ -107,7 +107,7 @@ def __init__( logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") start_time = time.time() - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst(cardinal=cardinal) money_graph = money.fst logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/hi/taggers/word.py b/nemo_text_processing/text_normalization/hi/taggers/word.py index bc354232b..151a72e99 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/word.py +++ b/nemo_text_processing/text_normalization/hi/taggers/word.py @@ -43,7 +43,6 @@ def __init__(self, punctuation: PunctuationFst, deterministic: bool = True): *[chr(i) for i in range(ord("ऀ"), ord("ः") + 1)], # Hindi vowels and consonants *[chr(i) for i in range(ord("अ"), ord("ह") + 1)], # More Hindi characters *[chr(i) for i in range(ord("ा"), ord("्") + 1)], # Hindi diacritics - *[chr(i) for i in range(ord("०"), ord("९") + 1)], # Hindi digits ).optimize() # Include punctuation in the graph diff --git a/nemo_text_processing/text_normalization/hi/utils.py b/nemo_text_processing/text_normalization/hi/utils.py index 102212183..d21135e42 100644 --- a/nemo_text_processing/text_normalization/hi/utils.py +++ b/nemo_text_processing/text_normalization/hi/utils.py @@ -40,7 +40,6 @@ def load_labels(abs_path): """ label_tsv = open(abs_path, encoding="utf-8") labels = list(csv.reader(label_tsv, delimiter="\t")) - label_tsv.close() return labels diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/hi/verbalizers/cardinal.py index 9882aa4cf..9f80a9eae 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/cardinal.py @@ -13,9 +13,11 @@ # limitations under the License. 
import pynini -from pynini.lib import pynutil +from pynini.lib import pynutil, rewrite from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hi.utils import apply_fst class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index 1265fcec6..9e1bdd3b9 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -39,6 +39,10 @@ def __init__(self): year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + range = pynutil.delete("text: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_dd_mm = day + NEMO_SPACE + month graph_mm_dd = month + NEMO_SPACE + day @@ -49,6 +53,10 @@ def __init__(self): graph_mm_yyyy = month + NEMO_SPACE + year + graph_era = era + + graph_range = range + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -60,7 +68,7 @@ def __init__(self): ) self.graph = ( - (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy) + (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era | graph_range) + delete_space + optional_preserve_order ) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/money.py b/nemo_text_processing/text_normalization/hi/verbalizers/money.py index d5cab33d8..048140295 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/money.py @@ -15,14 +15,26 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +major_minor_currencies = { + "रुपए": "पैसे", + "पाउंड": "पेंस", + "वॉन": "जिओन", + "डॉलर": "सेंट", + "लीरा": "कुरस", + "टका": "पैसे", + "येन": "सेन", + "नाइरा": "कोबो", + "यूरो": "सेंट", +} +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst class MoneyFst(GraphFst): """ Finite state transducer for verbalizing money, e.g. 
- money { integer_part: "बारह" currency: "रुपए" } -> बारह रुपए - money { integer_part: "बारह" currency: "रुपए" fractional_part: "पचास" currency: "पैसे" } -> बारह रुपए पचास पैसे + money { integer_part: "बारह" currency_maj: "रुपए" } -> बारह रुपए + money { integer_part: "बारह" currency_maj: "रुपए" fractional_part: "पचास" currency_min: "centiles" } -> बारह रुपए पचास पैसे + money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } -> पचास पैसे Args: cardinal: CardinalFst @@ -31,33 +43,58 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self): super().__init__(name="money", kind="verbalize") - insert_paise = pynutil.insert("पैसे") + currency_major = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') - currency = ( - pynutil.delete('currency: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) - - integer_part = ( - pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) + integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') fractional_part = ( - pynutil.delete('fractional_part: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('" ') - + insert_space + pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - graph_integer = integer_part + delete_space + currency + # Handles major denominations only + graph_major_only = integer_part + pynini.accep(NEMO_SPACE) + currency_major - graph_interger_fraction = ( - integer_part + delete_space + currency + delete_space + fractional_part + delete_space + insert_paise - ) + # Handles both major and minor denominations + major_minor_graphs = [] + + # Handles minor denominations only + minor_graphs = [] + + # Logic for handling minor denominations + for major, minor in major_minor_currencies.items(): + graph_major = pynutil.delete('currency_maj: "') + pynini.accep(major) + pynutil.delete('"') + graph_minor = pynutil.delete('currency_min: "') + pynini.cross("centiles", minor) + pynutil.delete('"') + graph_major_minor_partial = ( + integer_part + + pynini.accep(NEMO_SPACE) + + graph_major + + pynini.accep(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + major_minor_graphs.append(graph_major_minor_partial) + + graph_minor_partial = ( + pynutil.delete('integer_part: "शून्य"') + + pynutil.delete(NEMO_SPACE) + + pynutil.delete('currency_maj: "') + + pynutil.delete(major) + + pynutil.delete('"') + + pynutil.delete(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + minor_graphs.append(graph_minor_partial) + + graph_major_minor = pynini.union(*major_minor_graphs) + graph_minor_only = pynini.union(*minor_graphs) - graph = graph_integer | graph_interger_fraction + graph = graph_major_only | graph_major_minor | pynutil.add_weight(graph_minor_only, -0.1) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index ca06fc9c3..e91f0d9f6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -20,8 +20,7 @@ from 
nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst - -# from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst class VerbalizeFst(GraphFst): @@ -56,11 +55,20 @@ def __init__(self, deterministic: bool = True): measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst() money_graph = money.fst - # whitelist_graph = WhiteListFst(deterministic=deterministic).fst - - graph = cardinal_graph | decimal_graph | fraction_graph | date_graph | time_graph | measure_graph | money_graph + whitelist_graph = WhiteListFst(deterministic=deterministic).fst + + graph = ( + cardinal_graph + | decimal_graph + | fraction_graph + | date_graph + | time_graph + | measure_graph + | money_graph + | whitelist_graph + ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py index 3f478a2d2..ed419f2f7 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil diff --git a/nemo_text_processing/text_normalization/hy/utils.py b/nemo_text_processing/text_normalization/hy/utils.py index 26c9f5119..92a8c8e96 100644 --- a/nemo_text_processing/text_normalization/hy/utils.py +++ b/nemo_text_processing/text_normalization/hy/utils.py @@ -22,7 +22,6 @@ def get_abs_path(rel_path): Args: rel_path: relative path to this file - Returns absolute path """ return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path @@ -34,7 +33,6 @@ def load_labels(abs_path): Args: abs_path: absolute path - Returns dictionary of mappings """ label_tsv = open(abs_path) diff --git a/tests/nemo_text_processing/de/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/de/data_text_normalization/test_cases_electronic.txt index 2c70d8932..75a82e10e 100644 --- a/tests/nemo_text_processing/de/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/de/data_text_normalization/test_cases_electronic.txt @@ -16,4 +16,4 @@ at w e z y r eins neun acht sechs~@wezyr1986 zwei-D-Mammogram~2D-Mammogram zwei-D-Mammogram~2-D-Mammogram drei-D-Drucker~3D-Drucker -drei-D-Drucker~3-D-Drucker \ No newline at end of file +drei-D-Drucker~3-D-Drucker diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt index 3a306158b..46fc6b615 100644 --- a/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/en/data_text_normalization/test_cases_electronic.txt @@ -41,4 +41,4 @@ https://www.nvidia.com/dgx-basepod/~HTTPS colon slash slash WWW dot NVIDIA dot c i can use your card ending in 8876~i can use your card ending in eight eight seven six upgrade/update~upgrade slash update upgrade / update~upgrade slash 
update -upgrade/update/downgrade~upgrade slash update slash downgrade \ No newline at end of file +upgrade/update/downgrade~upgrade slash update slash downgrade diff --git a/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt b/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt index 82dd7a63a..24604345a 100644 --- a/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt +++ b/tests/nemo_text_processing/es/data_text_normalization/test_cases_ordinal.txt @@ -117,4 +117,4 @@ todo mi reconocimiento~todo mi reconocimiento V~quinto El texto de Li Qin en este libro ahora está disponible en forma de libro electrónico.~El texto de Li Qin en este libro ahora está disponible en forma de libro electrónico. Xi Jinping es el actual presidente de China.~Xi Jinping es el actual presidente de China. -Matías fue el XI apóstol.~Matías fue el undécimo apóstol. \ No newline at end of file +Matías fue el XI apóstol.~Matías fue el undécimo apóstol. diff --git a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt index 6a6b179dc..9939d7cf9 100644 --- a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt +++ b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_whitelist.txt @@ -4,4 +4,4 @@ Mᵐᵉ~madame Mᵐᵉˢ~mesdames Mˡˡᵉ~mademoiselle Mˡˡᵉˢ~mademoiselles -18%~dix-huit pour cent \ No newline at end of file +18%~dix-huit pour cent diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index d92a53852..b45b40a9e 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -14,6 +14,7 @@ १०-२९-२०००~अक्टूबर उनतीस दो हज़ार ११-१४-११००~नवंबर चौदह ग्यारह सौ ०३-२०१०~मार्च दो हज़ार दस -११-२०२४~नवंबर दो हज़ार चौबीस २०७०~दो हज़ार सत्तर २०२४~दो हज़ार चौबीस +१२० ई. पू.~एक सौ बीस ईसा पूर्व +२९७-२७२ ई. 
पू.~दो सौ सत्तानबे से दो सौ बहत्तर ईसा पूर्व diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 453369f82..86a824f72 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -60,3 +60,7 @@ ९९.५ oz~निन्यानबे दशमलव पाँच आउन्स ८५ q~पचासी क्विंटल ८५.९९ q~पचासी दशमलव नौ नौ क्विंटल +२००x१० के गद्दे~दो सौ बाई दस के गद्दे +५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा +२x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब +१३x१३ का घर~तेरह बाई तेरह का घर diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt index c7b32628b..b576dac38 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt @@ -97,4 +97,22 @@ $२८२१~दो हज़ार आठ सौ इक्कीस डॉल ₹५४५~पाँच सौ पैंतालीस रुपए ₹१८४५~एक हज़ार आठ सौ पैंतालीस रुपए ₹३७२~तीन सौ बहत्तर रुपए -$९८~अट्ठानबे डॉलर \ No newline at end of file +$९८~अट्ठानबे डॉलर +₹१२३.५७~एक सौ तेईस रुपए सत्तावन पैसे +₹९९९.५०~नौ सौ निन्यानबे रुपए पचास पैसे +£१५०.२९~एक सौ पचास पाउंड उनतीस पेंस +£८०.३१~अस्सी पाउंड इकतीस पेंस +₩२३४५.१०~दो हज़ार तीन सौ पैंतालीस वॉन दस जिओन +₩१००.२५~एक सौ वॉन पच्चीस जिओन +$१२५.७०~एक सौ पच्चीस डॉलर सत्तर सेंट +$९.९९~नौ डॉलर निन्यानबे सेंट +₺८०.३६~अस्सी लीरा छत्तीस कुरस +₺१२३४.७८~एक हज़ार दो सौ चौंतीस लीरा अठहत्तर कुरस +৳१००.४२~एक सौ टका बयालीस पैसे +৳३०२५.८७~तीन हज़ार पच्चीस टका सत्तासी पैसे +¥१००.४८~एक सौ येन अड़तालीस सेन +¥७७७.२३~सात सौ सतहत्तर येन तेईस सेन +₦८७६.५३~आठ सौ छिहत्तर नाइरा तिरेपन कोबो +₦१०.२७~दस नाइरा सत्ताईस कोबो +€२००.९०~दो सौ यूरो नब्बे सेंट +€१२३४.७५~एक हज़ार दो सौ चौंतीस यूरो पचहत्तर सेंट diff --git a/tests/nemo_text_processing/hi/test_cardinal.py b/tests/nemo_text_processing/hi/test_cardinal.py index 8298ec0e3..ca218ca02 100644 --- a/tests/nemo_text_processing/hi/test_cardinal.py +++ b/tests/nemo_text_processing/hi/test_cardinal.py @@ -33,10 +33,3 @@ class TestCardinal: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_cardinal.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected diff --git a/tests/nemo_text_processing/hi/test_date.py b/tests/nemo_text_processing/hi/test_date.py index df12e9874..2dacb7b80 100644 --- a/tests/nemo_text_processing/hi/test_date.py +++ b/tests/nemo_text_processing/hi/test_date.py @@ -33,10 +33,3 @@ class TestDate: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_date.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_decimal.py b/tests/nemo_text_processing/hi/test_decimal.py index 582b59422..2e907bc37 100644 --- 
a/tests/nemo_text_processing/hi/test_decimal.py +++ b/tests/nemo_text_processing/hi/test_decimal.py @@ -33,10 +33,3 @@ class TestDecimal: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_decimal.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_fraction.py b/tests/nemo_text_processing/hi/test_fraction.py index bedf9d0f7..9e2a728a6 100644 --- a/tests/nemo_text_processing/hi/test_fraction.py +++ b/tests/nemo_text_processing/hi/test_fraction.py @@ -33,10 +33,3 @@ class TestFraction: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_fraction.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_measure.py b/tests/nemo_text_processing/hi/test_measure.py index 71352cdc8..01fdaeb2f 100644 --- a/tests/nemo_text_processing/hi/test_measure.py +++ b/tests/nemo_text_processing/hi/test_measure.py @@ -33,10 +33,3 @@ class TestMeasure: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_measure.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_money.py b/tests/nemo_text_processing/hi/test_money.py index 0665146a6..d68089769 100644 --- a/tests/nemo_text_processing/hi/test_money.py +++ b/tests/nemo_text_processing/hi/test_money.py @@ -33,10 +33,3 @@ class TestMoney: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_money.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index 498443f71..ce823ec54 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -14,12 +14,8 @@ runtest () { denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') # trim white space - # spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" - # denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 
's/[[:space:]]*$//')" - - # trim white space and remove space before punctuation - spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/ \([!?.]\)/\1/g')" - denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/ \([!?.]\)/\1/g')" + spoken="$(echo "${spoken}" | tr -d ' ')" + denorm_pred="$(echo "${denorm_pred}" | tr -d ' ')" # input expected actual assertEquals "$written" "$spoken" "$denorm_pred" diff --git a/tests/nemo_text_processing/hi/test_time.py b/tests/nemo_text_processing/hi/test_time.py index 402faf414..5ae2a04e6 100644 --- a/tests/nemo_text_processing/hi/test_time.py +++ b/tests/nemo_text_processing/hi/test_time.py @@ -33,10 +33,3 @@ class TestTime: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_time.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hy/test_decimal.py b/tests/nemo_text_processing/hy/test_decimal.py index aaa65a0b7..69f0693ab 100644 --- a/tests/nemo_text_processing/hy/test_decimal.py +++ b/tests/nemo_text_processing/hy/test_decimal.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import pytest from parameterized import parameterized diff --git a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt index cc8e7667c..31197871e 100644 --- a/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt +++ b/tests/nemo_text_processing/it/data_text_normalization/test_cases_time.txt @@ -3,4 +3,4 @@ 17:15:26~diciassette e quindici minuti e ventisei secondi~diciassette e un quarto e ventisei secondi 23:45~ventitre e quarantacinque minuti 03:38~tre e trentotto minuti -l'evento inizia alle 16:00~l'evento inizia alle sedici \ No newline at end of file +l'evento inizia alle 16:00~l'evento inizia alle sedici diff --git a/tests/nemo_text_processing/ja/test_cardinal.py b/tests/nemo_text_processing/ja/test_cardinal.py index 0c8faf728..99d3cfd95 100644 --- a/tests/nemo_text_processing/ja/test_cardinal.py +++ b/tests/nemo_text_processing/ja/test_cardinal.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import pytest from parameterized import parameterized diff --git a/tools/text_processing_deployment/export_grammars.sh b/tools/text_processing_deployment/export_grammars.sh index 017472ae9..c292c60fb 100644 --- a/tools/text_processing_deployment/export_grammars.sh +++ b/tools/text_processing_deployment/export_grammars.sh @@ -70,6 +70,13 @@ else WHITELIST="" fi +# check if WHITELIST file exists +if [[ ${WHITELIST} != "" ]] && [[ -f $WHITELIST ]]; then + WHITELIST="--whitelist=${WHITELIST} " +else + echo "[I] Whitelist file wasn't provided or doesn't exist, using default" + WHITELIST="" +fi if [[ ${OVERWRITE_CACHE,,} == "true" ]] ; then OVERWRITE_CACHE="--overwrite_cache "