From 7ede1d163d6fe8053b5a6664d958131ef9faa0f7 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Fri, 24 Jan 2025 12:42:14 +0530 Subject: [PATCH 01/30] Future Implementations for classes - Measure, Money, and Date Signed-off-by: Namrata Gachchi --- .../hi/data/date/year_suffix.tsv | 3 + .../hi/data/measure/unit.tsv | 4 +- .../hi/data/money/currency.tsv | 3 +- .../hi/data/money/major_minor_currencies.tsv | 9 +++ .../text_normalization/hi/data/time/hours.tsv | 1 + .../text_normalization/hi/taggers/date.py | 29 ++++++- .../text_normalization/hi/taggers/measure.py | 30 ++++++- .../text_normalization/hi/taggers/money.py | 40 +++++----- .../hi/taggers/tokenize_and_classify.py | 8 +- .../text_normalization/hi/verbalizers/date.py | 10 ++- .../hi/verbalizers/money.py | 79 ++++++++++++++----- .../hi/verbalizers/verbalize.py | 20 +++-- .../hi/verbalizers/whitelist.py | 2 + .../test_cases_date.txt | 3 +- .../test_cases_measure.txt | 4 + .../test_cases_money.txt | 20 ++++- 16 files changed, 200 insertions(+), 65 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv diff --git a/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv new file mode 100644 index 000000000..7da791489 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv @@ -0,0 +1,3 @@ +ई. पू. ईसा पूर्व +ई. ईसवी +तक तक \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 0bf561379..189512687 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -141,14 +141,16 @@ month महीना months महीने ct कैरेट pH पीएच +km/h किलोमीटर प्रति घंटा km/hr किलोमीटर प्रति घंटा km/min किलोमीटर प्रति मिनट +m/h मीटर प्रति घंटा m/hr मीटर प्रति घंटा mi/s मील प्रति सेकंड +mi/h मील प्रति घंटा mi/hr मील प्रति घंटा mi/min मील प्रति मिनट ₹/ac रुपए प्रति एकड़ x बाई X बाई * बाई -- से diff --git a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv index 88633ec7c..8f4a955cc 100644 --- a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv +++ b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv @@ -1,5 +1,4 @@ ₹ रुपए -P पैसे £ पाउंड ₩ वॉन $ डॉलर @@ -7,4 +6,4 @@ $ डॉलर ৳ टका ¥ येन ₦ नाइरा -€ यूरो +€ यूरो \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv new file mode 100644 index 000000000..cf62891d1 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.tsv @@ -0,0 +1,9 @@ +रुपए पैसे +पाउंड पेंस +वॉन जिओन +डॉलर सेंट +लीरा कुरस +टका पैसे +येन सेन +नाइरा कोबो +यूरो सेंट diff --git a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv index d5e85a784..dd8623284 100644 --- a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv +++ b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv @@ -1,3 +1,4 @@ +० शून्य १ एक २ दो ३ तीन diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 19aaf3139..3b96b8425 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -26,6 +26,7 @@ days = pynini.string_file(get_abs_path("data/date/days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) +year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) class DateFst(GraphFst): @@ -62,12 +63,17 @@ def __init__(self, cardinal: GraphFst): years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - graph_dd_mm = days_graph + delete_dash + months_graph + graph_dd_mm = days_graph + (delete_dash | pynini.accep("")) + months_graph - graph_mm_dd = months_graph + delete_dash + days_graph + graph_mm_dd = months_graph + (delete_dash | pynini.accep("")) + days_graph graph_mm_dd += pynutil.insert(" preserve_order: true ") + # Graph for era + era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + + range_graph = pynini.cross("-", "से") + graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph ) @@ -78,7 +84,22 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = months_graph + delete_dash + years_graph + graph_mm_yyyy = ( + months_graph + (delete_dash | pynini.accep("")) + years_graph + pynutil.insert(" preserve_order: true ") + ) + + graph_year_suffix = era_graph + + graph_range = ( + pynutil.insert("text: \"") + + (cardinal.final_graph | graph_year) + + insert_space + + range_graph + + insert_space + + (cardinal.final_graph | graph_year) + + pynutil.insert("\"") + + pynutil.insert(" preserve_order: true ") + ) # default assume dd_mm_yyyy @@ -88,6 +109,8 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy | graph_mm_yyyy + | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_range, -0.005) ) self.final_graph = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 7434fd70f..a8cc3fad3 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -44,7 +44,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) # Define the unit handling - self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + + # Handling symbols like x, X, * + symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),]) graph_measurements = ( pynutil.insert("decimal { ") @@ -52,8 +55,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + decimal_graph + pynutil.insert(" }") + delete_space - + self.unit + + unit ) + graph_measurements |= ( pynutil.insert("cardinal { ") + optional_graph_negative @@ -62,7 +66,27 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("\"") + pynutil.insert(" }") + delete_space - + self.unit + + unit + ) + + # Handling cardinal clubbed with symbol as single token + graph_measurements |= ( + pynutil.insert("cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") + + pynutil.insert(" }") + + pynutil.insert(" units: \"") + + symbol_graph + + pynutil.insert("\" ") + + pynutil.insert("} }") + + insert_space + + pynutil.insert("tokens { cardinal { ") + + optional_graph_negative + + pynutil.insert("integer: \"") + + cardinal_graph + + pynutil.insert("\"") ) graph = graph_measurements diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index c44d6d346..3de3017ed 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -24,9 +24,11 @@ class MoneyFst(GraphFst): """ Finite state transducer for classifying money, suppletive aware, e.g. - ₹1 -> money { currency: "रुपए" integer_part: "एक" } - ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" } - + ₹५० -> money { money { currency_maj: "रुपए" integer_part: "पचास" } + ₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" } + ₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } + Note that the 'centiles' string is a placeholder to handle by the verbalizer by applying the corresponding minor currency denomination + Args: cardinal: CardinalFst decimal: DecimalFst @@ -34,29 +36,23 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self, cardinal: GraphFst): super().__init__(name="money", kind="classify") cardinal_graph = cardinal.final_graph - optional_graph_negative = pynini.closure( - pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, - ) - self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ") - self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ") - self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ") - - graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger - graph_currencies |= ( - optional_graph_negative - + self.currency - + insert_space - + self.interger - + pynutil.delete(".") - + insert_space - + self.fraction + currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"') + integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') + fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"') + currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"') + + graph_major_only = currency_major + insert_space + integer + graph_major_and_minor = ( + currency_major + insert_space + integer + pynini.cross(".", " ") + fraction + insert_space + currency_minor ) - graph = graph_currencies - self.graph = graph.optimize() + + graph_currencies = graph_major_only | graph_major_and_minor + + graph = graph_currencies.optimize() final_graph = self.add_tokens(graph) self.fst = final_graph diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 48ee97ef3..bdec90c06 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -43,7 +43,7 @@ class ClassifyFst(GraphFst): Final class that composes all other classification grammars. This class can process an entire sentence including punctuation. For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. More details to deployment at NeMo/tools/text_processing_deployment. - + Args: input_case: accepting either "lower_cased" or "cased" input. deterministic: if True will provide a single transduction option, @@ -68,11 +68,11 @@ def __init__( os.makedirs(cache_dir, exist_ok=True) whitelist_file = os.path.basename(whitelist) if whitelist else "" far_file = os.path.join( - cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far" + cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far", ) if not overwrite_cache and far_file and os.path.exists(far_file): self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] - logging.info(f'ClassifyFst.fst was restored from {far_file}.') + logging.info(f"ClassifyFst.fst was restored from {far_file}.") else: logging.info(f"Creating ClassifyFst grammars.") @@ -107,7 +107,7 @@ def __init__( logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes") start_time = time.time() - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst(cardinal=cardinal) money_graph = money.fst logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes") diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index 1265fcec6..9e1bdd3b9 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -39,6 +39,10 @@ def __init__(self): year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + range = pynutil.delete("text: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_dd_mm = day + NEMO_SPACE + month graph_mm_dd = month + NEMO_SPACE + day @@ -49,6 +53,10 @@ def __init__(self): graph_mm_yyyy = month + NEMO_SPACE + year + graph_era = era + + graph_range = range + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -60,7 +68,7 @@ def __init__(self): ) self.graph = ( - (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy) + (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era | graph_range) + delete_space + optional_preserve_order ) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/money.py b/nemo_text_processing/text_normalization/hi/verbalizers/money.py index d5cab33d8..048140295 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/money.py @@ -15,14 +15,26 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +major_minor_currencies = { + "रुपए": "पैसे", + "पाउंड": "पेंस", + "वॉन": "जिओन", + "डॉलर": "सेंट", + "लीरा": "कुरस", + "टका": "पैसे", + "येन": "सेन", + "नाइरा": "कोबो", + "यूरो": "सेंट", +} +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst class MoneyFst(GraphFst): """ Finite state transducer for verbalizing money, e.g. - money { integer_part: "बारह" currency: "रुपए" } -> बारह रुपए - money { integer_part: "बारह" currency: "रुपए" fractional_part: "पचास" currency: "पैसे" } -> बारह रुपए पचास पैसे + money { integer_part: "बारह" currency_maj: "रुपए" } -> बारह रुपए + money { integer_part: "बारह" currency_maj: "रुपए" fractional_part: "पचास" currency_min: "centiles" } -> बारह रुपए पचास पैसे + money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } -> पचास पैसे Args: cardinal: CardinalFst @@ -31,33 +43,58 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self): super().__init__(name="money", kind="verbalize") - insert_paise = pynutil.insert("पैसे") + currency_major = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') - currency = ( - pynutil.delete('currency: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) - - integer_part = ( - pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) + integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') fractional_part = ( - pynutil.delete('fractional_part: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('" ') - + insert_space + pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - graph_integer = integer_part + delete_space + currency + # Handles major denominations only + graph_major_only = integer_part + pynini.accep(NEMO_SPACE) + currency_major - graph_interger_fraction = ( - integer_part + delete_space + currency + delete_space + fractional_part + delete_space + insert_paise - ) + # Handles both major and minor denominations + major_minor_graphs = [] + + # Handles minor denominations only + minor_graphs = [] + + # Logic for handling minor denominations + for major, minor in major_minor_currencies.items(): + graph_major = pynutil.delete('currency_maj: "') + pynini.accep(major) + pynutil.delete('"') + graph_minor = pynutil.delete('currency_min: "') + pynini.cross("centiles", minor) + pynutil.delete('"') + graph_major_minor_partial = ( + integer_part + + pynini.accep(NEMO_SPACE) + + graph_major + + pynini.accep(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + major_minor_graphs.append(graph_major_minor_partial) + + graph_minor_partial = ( + pynutil.delete('integer_part: "शून्य"') + + pynutil.delete(NEMO_SPACE) + + pynutil.delete('currency_maj: "') + + pynutil.delete(major) + + pynutil.delete('"') + + pynutil.delete(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + minor_graphs.append(graph_minor_partial) + + graph_major_minor = pynini.union(*major_minor_graphs) + graph_minor_only = pynini.union(*minor_graphs) - graph = graph_integer | graph_interger_fraction + graph = graph_major_only | graph_major_minor | pynutil.add_weight(graph_minor_only, -0.1) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index ca06fc9c3..e91f0d9f6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -20,8 +20,7 @@ from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst - -# from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst class VerbalizeFst(GraphFst): @@ -56,11 +55,20 @@ def __init__(self, deterministic: bool = True): measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst() money_graph = money.fst - # whitelist_graph = WhiteListFst(deterministic=deterministic).fst - - graph = cardinal_graph | decimal_graph | fraction_graph | date_graph | time_graph | measure_graph | money_graph + whitelist_graph = WhiteListFst(deterministic=deterministic).fst + + graph = ( + cardinal_graph + | decimal_graph + | fraction_graph + | date_graph + | time_graph + | measure_graph + | money_graph + | whitelist_graph + ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py index 3f478a2d2..ed419f2f7 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/whitelist.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + + import pynini from pynini.lib import pynutil diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index d92a53852..b45b40a9e 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -14,6 +14,7 @@ १०-२९-२०००~अक्टूबर उनतीस दो हज़ार ११-१४-११००~नवंबर चौदह ग्यारह सौ ०३-२०१०~मार्च दो हज़ार दस -११-२०२४~नवंबर दो हज़ार चौबीस २०७०~दो हज़ार सत्तर २०२४~दो हज़ार चौबीस +१२० ई. पू.~एक सौ बीस ईसा पूर्व +२९७-२७२ ई. पू.~दो सौ सत्तानबे से दो सौ बहत्तर ईसा पूर्व diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 453369f82..86a824f72 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -60,3 +60,7 @@ ९९.५ oz~निन्यानबे दशमलव पाँच आउन्स ८५ q~पचासी क्विंटल ८५.९९ q~पचासी दशमलव नौ नौ क्विंटल +२००x१० के गद्दे~दो सौ बाई दस के गद्दे +५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा +२x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब +१३x१३ का घर~तेरह बाई तेरह का घर diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt index c7b32628b..b576dac38 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt @@ -97,4 +97,22 @@ $२८२१~दो हज़ार आठ सौ इक्कीस डॉल ₹५४५~पाँच सौ पैंतालीस रुपए ₹१८४५~एक हज़ार आठ सौ पैंतालीस रुपए ₹३७२~तीन सौ बहत्तर रुपए -$९८~अट्ठानबे डॉलर \ No newline at end of file +$९८~अट्ठानबे डॉलर +₹१२३.५७~एक सौ तेईस रुपए सत्तावन पैसे +₹९९९.५०~नौ सौ निन्यानबे रुपए पचास पैसे +£१५०.२९~एक सौ पचास पाउंड उनतीस पेंस +£८०.३१~अस्सी पाउंड इकतीस पेंस +₩२३४५.१०~दो हज़ार तीन सौ पैंतालीस वॉन दस जिओन +₩१००.२५~एक सौ वॉन पच्चीस जिओन +$१२५.७०~एक सौ पच्चीस डॉलर सत्तर सेंट +$९.९९~नौ डॉलर निन्यानबे सेंट +₺८०.३६~अस्सी लीरा छत्तीस कुरस +₺१२३४.७८~एक हज़ार दो सौ चौंतीस लीरा अठहत्तर कुरस +৳१००.४२~एक सौ टका बयालीस पैसे +৳३०२५.८७~तीन हज़ार पच्चीस टका सत्तासी पैसे +¥१००.४८~एक सौ येन अड़तालीस सेन +¥७७७.२३~सात सौ सतहत्तर येन तेईस सेन +₦८७६.५३~आठ सौ छिहत्तर नाइरा तिरेपन कोबो +₦१०.२७~दस नाइरा सत्ताईस कोबो +€२००.९०~दो सौ यूरो नब्बे सेंट +€१२३४.७५~एक हज़ार दो सौ चौंतीस यूरो पचहत्तर सेंट From d3ac9f03bca8801a56804bf50e559cd3521f9c40 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Mon, 27 Jan 2025 16:59:41 +0530 Subject: [PATCH 02/30] Resolved the conflicts with mm_yyyy and date ranges and added the previously removed failing test cases. Signed-off-by: Namrata Gachchi --- .../text_normalization/hi/taggers/date.py | 4 ++-- .../text_normalization/hi/verbalizers/date.py | 8 ++------ .../hi/data_text_normalization/test_cases_date.txt | 1 + 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 3b96b8425..3c41bfb24 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -85,7 +85,7 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") graph_mm_yyyy = ( - months_graph + (delete_dash | pynini.accep("")) + years_graph + pynutil.insert(" preserve_order: true ") + months_graph + delete_dash + insert_space + years_graph ) graph_year_suffix = era_graph @@ -108,7 +108,7 @@ def __init__(self, cardinal: GraphFst): | graph_mm_dd | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy - | graph_mm_yyyy + | pynutil.add_weight(graph_mm_yyyy, -0.2) | pynutil.add_weight(graph_year_suffix, -0.001) | pynutil.add_weight(graph_range, -0.005) ) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index 9e1bdd3b9..a754a8e62 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -39,9 +39,9 @@ def __init__(self): year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - range = pynutil.delete("text: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_range = pynutil.delete("text: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") graph_dd_mm = day + NEMO_SPACE + month @@ -53,10 +53,6 @@ def __init__(self): graph_mm_yyyy = month + NEMO_SPACE + year - graph_era = era - - graph_range = range - optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index b45b40a9e..069ad7211 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -14,6 +14,7 @@ १०-२९-२०००~अक्टूबर उनतीस दो हज़ार ११-१४-११००~नवंबर चौदह ग्यारह सौ ०३-२०१०~मार्च दो हज़ार दस +११-२०२४~नवंबर दो हज़ार चौबीस २०७०~दो हज़ार सत्तर २०२४~दो हज़ार चौबीस १२० ई. पू.~एक सौ बीस ईसा पूर्व From 5c67c52b958eed3ceb09cc655bf5e126704b08d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:38:33 +0000 Subject: [PATCH 03/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/hi/taggers/date.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 3c41bfb24..7fb8994f8 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -84,9 +84,7 @@ def __init__(self, cardinal: GraphFst): graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = ( - months_graph + delete_dash + insert_space + years_graph - ) + graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph graph_year_suffix = era_graph From 767b56e7484be426b44cd4b182da13339f42cbfe Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Tue, 28 Jan 2025 10:23:31 +0530 Subject: [PATCH 04/30] removed the unused empty string implementation Signed-off-by: Namrata Gachchi --- nemo_text_processing/text_normalization/hi/taggers/date.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 7fb8994f8..4029bb9ef 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -63,9 +63,9 @@ def __init__(self, cardinal: GraphFst): years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - graph_dd_mm = days_graph + (delete_dash | pynini.accep("")) + months_graph + graph_dd_mm = days_graph + delete_dash + months_graph - graph_mm_dd = months_graph + (delete_dash | pynini.accep("")) + days_graph + graph_mm_dd = months_graph + delete_dash + days_graph graph_mm_dd += pynutil.insert(" preserve_order: true ") From a2be3f943d98f5932b3306cd9583a16458e8d821 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Jan 2025 04:55:01 +0000 Subject: [PATCH 05/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/hi/taggers/date.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 4029bb9ef..aed7c3a81 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -63,9 +63,9 @@ def __init__(self, cardinal: GraphFst): years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - graph_dd_mm = days_graph + delete_dash + months_graph + graph_dd_mm = days_graph + delete_dash + months_graph - graph_mm_dd = months_graph + delete_dash + days_graph + graph_mm_dd = months_graph + delete_dash + days_graph graph_mm_dd += pynutil.insert(" preserve_order: true ") From a5a68006514d5bc87a17591ec6c47ecd8b52e35a Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Wed, 29 Jan 2025 16:19:00 +0530 Subject: [PATCH 06/30] minor fixes for the tagger files Signed-off-by: Namrata Gachchi --- .../text_normalization/hi/taggers/measure.py | 7 ++++++- .../text_normalization/hi/taggers/money.py | 7 +++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index a8cc3fad3..1eccb6e2f 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -19,6 +19,11 @@ from nemo_text_processing.text_normalization.hi.utils import get_abs_path +digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) +teens_and_ties = pynutil.add_weight(teens_ties, -0.1) + + class MeasureFst(GraphFst): """ Finite state transducer for classifying measure, suppletive aware, e.g. @@ -35,7 +40,7 @@ class MeasureFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") - cardinal_graph = cardinal.final_graph + cardinal_graph = digit | teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands decimal_graph = decimal.final_graph_wo_negative unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index 3de3017ed..28f00f832 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -41,14 +41,17 @@ def __init__(self, cardinal: GraphFst): cardinal_graph = cardinal.final_graph + optional_graph_negative = pynini.closure( + pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, + ) currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"') integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"') fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"') currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"') - graph_major_only = currency_major + insert_space + integer + graph_major_only = optional_graph_negative + currency_major + insert_space + integer graph_major_and_minor = ( - currency_major + insert_space + integer + pynini.cross(".", " ") + fraction + insert_space + currency_minor + optional_graph_negative + currency_major + insert_space + integer + pynini.cross(".", " ") + fraction + insert_space + currency_minor ) graph_currencies = graph_major_only | graph_major_and_minor From 0b840db6f576ee22e7cb622df93ae1f7a9072d25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 29 Jan 2025 10:49:59 +0000 Subject: [PATCH 07/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/money.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py index 28f00f832..6d9ac6dcc 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/money.py +++ b/nemo_text_processing/text_normalization/hi/taggers/money.py @@ -51,7 +51,14 @@ def __init__(self, cardinal: GraphFst): graph_major_only = optional_graph_negative + currency_major + insert_space + integer graph_major_and_minor = ( - optional_graph_negative + currency_major + insert_space + integer + pynini.cross(".", " ") + fraction + insert_space + currency_minor + optional_graph_negative + + currency_major + + insert_space + + integer + + pynini.cross(".", " ") + + fraction + + insert_space + + currency_minor ) graph_currencies = graph_major_only | graph_major_and_minor From dbd2cb3c6e0d58cdf53d60e87a4c6a1a804a0b0d Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Wed, 12 Feb 2025 09:39:34 +0530 Subject: [PATCH 08/30] reformatted decimal final graph Signed-off-by: Namrata Gachchi --- Jenkinsfile | 2 +- nemo_text_processing/text_normalization/hi/taggers/measure.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index c94c107c6..53c784920 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/02-12-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 1eccb6e2f..744a9314f 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -41,7 +41,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") cardinal_graph = digit | teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands - decimal_graph = decimal.final_graph_wo_negative + point = pynutil.delete(".") + decimal_graph = decimal.graph_integer + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) optional_graph_negative = pynini.closure( From 2f9564d63fc6ebabb23b8274de1d51f3cc492eb7 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Thu, 13 Feb 2025 17:58:41 +0530 Subject: [PATCH 09/30] incorporated the suggestion for decimal graph Signed-off-by: Namrata Gachchi --- .../text_normalization/hi/taggers/cardinal.py | 3 +++ .../text_normalization/hi/taggers/measure.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py index fe3ad9a1d..05d7a4ee4 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py @@ -80,6 +80,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 1, teens_ties) graph_ten_thousands |= create_larger_number_graph(teens_and_ties, suffix_thousands, 0, graph_hundreds) graph_ten_thousands.optimize() + self.graph_ten_thousands = graph_ten_thousands # Lakhs graph and ten lakhs graph suffix_lakhs = pynutil.insert(" लाख") @@ -90,6 +91,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 1, graph_thousands) graph_lakhs |= create_larger_number_graph(digit, suffix_lakhs, 0, graph_ten_thousands) graph_lakhs.optimize() + self.graph_lakhs = graph_lakhs graph_ten_lakhs = create_graph_suffix(teens_and_ties, suffix_lakhs, 5) graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 4, digit) @@ -98,6 +100,7 @@ def create_larger_number_graph(digit_graph, suffix, zeros_counts, sub_graph): graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 1, graph_thousands) graph_ten_lakhs |= create_larger_number_graph(teens_and_ties, suffix_lakhs, 0, graph_ten_thousands) graph_ten_lakhs.optimize() + self.graph_ten_lakhs = graph_ten_lakhs # Crores graph ten crores graph suffix_crores = pynutil.insert(" करोड़") diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 744a9314f..0456e64e9 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -40,9 +40,10 @@ class MeasureFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") - cardinal_graph = digit | teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands + cardinal_graph = digit | teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands | cardinal.graph_ten_thousands | cardinal.graph_lakhs | cardinal.graph_ten_lakhs point = pynutil.delete(".") - decimal_graph = decimal.graph_integer + point + insert_space + decimal.graph_fractional + decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) optional_graph_negative = pynini.closure( From b3dc83a00571bf3a3a7024211be55e9152adc5ee Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Feb 2025 12:29:37 +0000 Subject: [PATCH 10/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/measure.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 0456e64e9..fff6a4057 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -40,10 +40,18 @@ class MeasureFst(GraphFst): def __init__(self, cardinal: GraphFst, decimal: GraphFst): super().__init__(name="measure", kind="classify") - cardinal_graph = digit | teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands | cardinal.graph_ten_thousands | cardinal.graph_lakhs | cardinal.graph_ten_lakhs + cardinal_graph = ( + digit + | teens_and_ties + | cardinal.graph_hundreds + | cardinal.graph_thousands + | cardinal.graph_ten_thousands + | cardinal.graph_lakhs + | cardinal.graph_ten_lakhs + ) point = pynutil.delete(".") decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") - decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional + decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) optional_graph_negative = pynini.closure( From 8cab4927742093a2fe73ed8b619206c11e6f3471 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Wed, 5 Mar 2025 18:13:43 +0530 Subject: [PATCH 11/30] Century implementations Signed-off-by: Namrata Gachchi --- nemo_text_processing/text_normalization/hi/taggers/date.py | 5 +++++ .../hi/data_text_normalization/test_cases_date.txt | 3 +++ 2 files changed, 8 insertions(+) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index aed7c3a81..12c5bdad5 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -74,6 +74,10 @@ def __init__(self, cardinal: GraphFst): range_graph = pynini.cross("-", "से") + # Graph for century + century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal.final_graph) + pynini.accep("वीं") + century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space + graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph ) @@ -109,6 +113,7 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(graph_mm_yyyy, -0.2) | pynutil.add_weight(graph_year_suffix, -0.001) | pynutil.add_weight(graph_range, -0.005) + | pynutil.add_weight(century_text, -0.001) ) self.final_graph = final_graph.optimize() diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index 069ad7211..a86514091 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -19,3 +19,6 @@ २०२४~दो हज़ार चौबीस १२० ई. पू.~एक सौ बीस ईसा पूर्व २९७-२७२ ई. पू.~दो सौ सत्तानबे से दो सौ बहत्तर ईसा पूर्व +३२७वीं सदी~तीन सौ सत्ताईसवीं सदी +१८वीं शताब्दी~अठारहवीं शताब्दी +१९वीं दशक~उन्नीसवीं दशक From 9143b0bf3403f93a1cd2a9c188fdd64a61ad3b17 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Mon, 10 Mar 2025 17:50:42 +0530 Subject: [PATCH 12/30] Working on the yyyy format for the date class Signed-off-by: Namrata Gachchi --- .../hi/data/numbers/teens_and_ties.tsv | 16 ++++++++-------- .../text_normalization/hi/taggers/date.py | 16 +++++++++++++++- .../data_text_normalization/test_cases_date.txt | 3 +++ 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv index 1d61c77b7..fbf248266 100644 --- a/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv +++ b/nemo_text_processing/text_normalization/hi/data/numbers/teens_and_ties.tsv @@ -79,12 +79,12 @@ ८८ अट्ठासी ८९ नवासी ९० नब्बे -९१ इक्यानबे -९२ बानबे -९३ तिरानबे -९४ चौरानबे -९५ पंचानबे -९६ छियानबे -९७ सत्तानबे -९८ अट्ठानबे +९१ इक्यानबे +९२ बानबे +९३ तिरानबे +९४ चौरानबे +९५ पंचानबे +९६ छियानबे +९७ सत्तानबे +९८ अट्ठानबे ९९ निन्यानबे diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 12c5bdad5..1c9cd01e1 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -27,6 +27,9 @@ days = pynini.string_file(get_abs_path("data/date/days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) +digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) +teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) +teens_and_ties = pynutil.add_weight(teens_ties, -0.1) class DateFst(GraphFst): @@ -52,6 +55,14 @@ def __init__(self, cardinal: GraphFst): (NEMO_HI_DIGIT + NEMO_HI_NON_ZERO + NEMO_HI_DIGIT + NEMO_HI_DIGIT), cardinal.graph_hundreds_as_thousand ) + cardinal_graph = ( + digit + | teens_and_ties + | cardinal.graph_hundreds + | graph_year_thousands + | graph_year_hundreds_as_thousands + ) + graph_year = graph_year_thousands | graph_year_hundreds_as_thousands delete_dash = pynutil.delete("-") @@ -75,7 +86,7 @@ def __init__(self, cardinal: GraphFst): range_graph = pynini.cross("-", "से") # Graph for century - century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal.final_graph) + pynini.accep("वीं") + century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space graph_dd_mm_yyyy = ( @@ -86,6 +97,8 @@ def __init__(self, cardinal: GraphFst): months_graph + (delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph ) + graph_yyyy = pynutil.insert("text: \"") + pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynutil.insert("\"") + insert_space + pynutil.insert(" preserve_order: true ") + graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph @@ -114,6 +127,7 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(graph_year_suffix, -0.001) | pynutil.add_weight(graph_range, -0.005) | pynutil.add_weight(century_text, -0.001) + | pynutil.add_weight(graph_yyyy, -0.01) ) self.final_graph = final_graph.optimize() diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index a86514091..ca506ec36 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -22,3 +22,6 @@ ३२७वीं सदी~तीन सौ सत्ताईसवीं सदी १८वीं शताब्दी~अठारहवीं शताब्दी १९वीं दशक~उन्नीसवीं दशक +१८२३ में~अठारह सौ तेईस में +१९९२ का दशक~उन्नीस सौ बानबे का दशक +१९३२ शताब्दी~उन्नीस सौ बत्तीस शताब्दी From d8ea2466bce987a985743a35bf2518510d6634dc Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Wed, 12 Mar 2025 17:43:03 +0530 Subject: [PATCH 13/30] reverted yyyy code Signed-off-by: Namrata Gachchi --- nemo_text_processing/text_normalization/hi/taggers/date.py | 3 --- .../hi/data_text_normalization/test_cases_date.txt | 3 --- 2 files changed, 6 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 1c9cd01e1..db598ea53 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -97,8 +97,6 @@ def __init__(self, cardinal: GraphFst): months_graph + (delete_dash | delete_slash) + days_graph + (delete_dash | delete_slash) + years_graph ) - graph_yyyy = pynutil.insert("text: \"") + pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynutil.insert("\"") + insert_space + pynutil.insert(" preserve_order: true ") - graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph @@ -127,7 +125,6 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(graph_year_suffix, -0.001) | pynutil.add_weight(graph_range, -0.005) | pynutil.add_weight(century_text, -0.001) - | pynutil.add_weight(graph_yyyy, -0.01) ) self.final_graph = final_graph.optimize() diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index ca506ec36..a86514091 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -22,6 +22,3 @@ ३२७वीं सदी~तीन सौ सत्ताईसवीं सदी १८वीं शताब्दी~अठारहवीं शताब्दी १९वीं दशक~उन्नीसवीं दशक -१८२३ में~अठारह सौ तेईस में -१९९२ का दशक~उन्नीस सौ बानबे का दशक -१९३२ शताब्दी~उन्नीस सौ बत्तीस शताब्दी From 751f7ff38778e92521e25ed1e761166499b4a6ac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 12 Mar 2025 12:15:29 +0000 Subject: [PATCH 14/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/hi/taggers/date.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index db598ea53..b3b1c4973 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -56,11 +56,7 @@ def __init__(self, cardinal: GraphFst): ) cardinal_graph = ( - digit - | teens_and_ties - | cardinal.graph_hundreds - | graph_year_thousands - | graph_year_hundreds_as_thousands + digit | teens_and_ties | cardinal.graph_hundreds | graph_year_thousands | graph_year_hundreds_as_thousands ) graph_year = graph_year_thousands | graph_year_hundreds_as_thousands From a76a6ac20294521e153b6fda91ac2be8825a50b0 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Mon, 17 Mar 2025 17:42:45 +0530 Subject: [PATCH 15/30] working on future implementations Signed-off-by: Namrata Gachchi --- nemo_text_processing/text_normalization/hi/taggers/date.py | 7 ++++++- .../hi/data_text_normalization/test_cases_date.txt | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index b3b1c4973..f1a9d24b8 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -81,10 +81,14 @@ def __init__(self, cardinal: GraphFst): range_graph = pynini.cross("-", "से") - # Graph for century + # Graph for year century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space + # Graph for year + year_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep(" में") + year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space + graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph ) @@ -121,6 +125,7 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(graph_year_suffix, -0.001) | pynutil.add_weight(graph_range, -0.005) | pynutil.add_weight(century_text, -0.001) + | pynutil.add_weight(year_text, -0.001) ) self.final_graph = final_graph.optimize() diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index a86514091..b4bf8bf34 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -22,3 +22,4 @@ ३२७वीं सदी~तीन सौ सत्ताईसवीं सदी १८वीं शताब्दी~अठारहवीं शताब्दी १९वीं दशक~उन्नीसवीं दशक +१९९९ में~उन्नीस सौ निन्यानबे में From 95c22375dcf3511379856a12106ac1628902bb8e Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Tue, 18 Mar 2025 18:00:16 +0530 Subject: [PATCH 16/30] working on improving the date class accuracy Signed-off-by: Namrata Gachchi --- nemo_text_processing/text_normalization/hi/taggers/date.py | 2 +- .../hi/data_text_normalization/test_cases_date.txt | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index f1a9d24b8..3e709d507 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -86,7 +86,7 @@ def __init__(self, cardinal: GraphFst): century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space # Graph for year - year_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep(" में") + year_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.union(" में", " का", " की", " के", " से", " तक") year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space graph_dd_mm_yyyy = ( diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index b4bf8bf34..c067ff81e 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -23,3 +23,9 @@ १८वीं शताब्दी~अठारहवीं शताब्दी १९वीं दशक~उन्नीसवीं दशक १९९९ में~उन्नीस सौ निन्यानबे में +१९९० का~उन्नीस सौ नब्बे का +१९९२ की~उन्नीस सौ बानबे की +१९६० के अभिनेता है~उन्नीस सौ साठ के अभिनेता है +१७८८ से~सत्रह सौ अट्ठासी से +१९५४ तक~उन्नीस सौ चौवन तक + From 6736fe79e5e3462b57ddedcb85f6cb6729b05a32 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 18 Mar 2025 12:38:09 +0000 Subject: [PATCH 17/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/hi/taggers/date.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 3e709d507..12bff71c1 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -86,7 +86,9 @@ def __init__(self, cardinal: GraphFst): century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space # Graph for year - year_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.union(" में", " का", " की", " के", " से", " तक") + year_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.union( + " में", " का", " की", " के", " से", " तक" + ) year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space graph_dd_mm_yyyy = ( From 3376e15cebeabdf3a749bbeae463178f6ade2afe Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Thu, 20 Mar 2025 10:44:52 +0530 Subject: [PATCH 18/30] added year prefix for the date class Signed-off-by: Namrata Gachchi --- nemo_text_processing/text_normalization/hi/taggers/date.py | 4 ++++ .../hi/data_text_normalization/test_cases_date.txt | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 12bff71c1..9c189e0f7 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -63,6 +63,7 @@ def __init__(self, cardinal: GraphFst): delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space @@ -91,6 +92,8 @@ def __init__(self, cardinal: GraphFst): ) year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space + year_prefix = pynutil.insert("text: \"") + pynini.union("सन् ", "सन ", "साल ") + insert_space + graph_year + pynutil.insert("\"") + graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph ) @@ -128,6 +131,7 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(graph_range, -0.005) | pynutil.add_weight(century_text, -0.001) | pynutil.add_weight(year_text, -0.001) + | year_prefix ) self.final_graph = final_graph.optimize() diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index c067ff81e..31a66f54f 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -28,4 +28,6 @@ १९६० के अभिनेता है~उन्नीस सौ साठ के अभिनेता है १७८८ से~सत्रह सौ अट्ठासी से १९५४ तक~उन्नीस सौ चौवन तक - +सन १९९९~सन उन्नीस सौ निन्यानबे +सन् १९२०~सन् उन्नीस सौ बीस +साल १९७१~साल उन्नीस सौ इकहत्तर From 8d3db99838c96885fb0319ca7ba8d7583bfc9aa5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Mar 2025 05:15:44 +0000 Subject: [PATCH 19/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/date.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 9c189e0f7..992d36807 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -92,7 +92,13 @@ def __init__(self, cardinal: GraphFst): ) year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space - year_prefix = pynutil.insert("text: \"") + pynini.union("सन् ", "सन ", "साल ") + insert_space + graph_year + pynutil.insert("\"") + year_prefix = ( + pynutil.insert("text: \"") + + pynini.union("सन् ", "सन ", "साल ") + + insert_space + + graph_year + + pynutil.insert("\"") + ) graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph From 89fcd6cbc743f971ef8a93198ed12ee336f47e93 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Thu, 20 Mar 2025 12:40:25 +0530 Subject: [PATCH 20/30] working on the commma cases for date class Signed-off-by: Namrata Gachchi --- nemo_text_processing/text_normalization/hi/taggers/date.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 992d36807..bc644eb40 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -114,6 +114,8 @@ def __init__(self, cardinal: GraphFst): graph_year_suffix = era_graph + comma_graph = pynutil.insert("text: \"") + delete_comma + insert_space + graph_year + pynutil.insert("\"") + graph_range = ( pynutil.insert("text: \"") + (cardinal.final_graph | graph_year) @@ -138,6 +140,7 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(century_text, -0.001) | pynutil.add_weight(year_text, -0.001) | year_prefix + | comma_graph ) self.final_graph = final_graph.optimize() From ea26dd312bafed18f734054ebadb172daee6b43e Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Fri, 21 Mar 2025 09:47:21 +0530 Subject: [PATCH 21/30] minor fixes Signed-off-by: Namrata Gachchi --- .../text_normalization/hi/taggers/date.py | 8 ++++---- .../hi/data_text_normalization/test_cases_date.txt | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index bc644eb40..a1512db7f 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -87,7 +87,7 @@ def __init__(self, cardinal: GraphFst): century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space # Graph for year - year_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.union( + year_number = graph_year + pynini.union( " में", " का", " की", " के", " से", " तक" ) year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space @@ -118,11 +118,11 @@ def __init__(self, cardinal: GraphFst): graph_range = ( pynutil.insert("text: \"") - + (cardinal.final_graph | graph_year) + + cardinal_graph + insert_space + range_graph + insert_space - + (cardinal.final_graph | graph_year) + + cardinal_graph + pynutil.insert("\"") + pynutil.insert(" preserve_order: true ") ) @@ -139,7 +139,7 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(graph_range, -0.005) | pynutil.add_weight(century_text, -0.001) | pynutil.add_weight(year_text, -0.001) - | year_prefix + | pynutil.add_weight(year_prefix, -0.009) | comma_graph ) diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index 31a66f54f..a4b3caf07 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -31,3 +31,4 @@ सन १९९९~सन उन्नीस सौ निन्यानबे सन् १९२०~सन् उन्नीस सौ बीस साल १९७१~साल उन्नीस सौ इकहत्तर +१९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक \ No newline at end of file From 60d7fe35512290b88789198e2413dd4b7a6a27aa Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Mar 2025 04:18:27 +0000 Subject: [PATCH 22/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/hi/taggers/date.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index a1512db7f..623413820 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -87,9 +87,7 @@ def __init__(self, cardinal: GraphFst): century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space # Graph for year - year_number = graph_year + pynini.union( - " में", " का", " की", " के", " से", " तक" - ) + year_number = graph_year + pynini.union(" में", " का", " की", " के", " से", " तक") year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space year_prefix = ( From 6a2b8c2d6a0ff494d48e53533419dfd6b2718fff Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Tue, 25 Mar 2025 17:49:14 +0530 Subject: [PATCH 23/30] implemented mixed fractions Signed-off-by: Namrata Gachchi --- .../text_normalization/hi/data/date/year_suffix.tsv | 3 +-- nemo_text_processing/text_normalization/hi/taggers/date.py | 2 +- .../text_normalization/hi/verbalizers/fraction.py | 3 ++- .../hi/data_text_normalization/test_cases_fraction.txt | 2 ++ 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv index 7da791489..acb37d534 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv @@ -1,3 +1,2 @@ ई. पू. ईसा पूर्व -ई. ईसवी -तक तक \ No newline at end of file +ई. ईसवी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 623413820..cbc4bbee4 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -87,7 +87,7 @@ def __init__(self, cardinal: GraphFst): century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space # Graph for year - year_number = graph_year + pynini.union(" में", " का", " की", " के", " से", " तक") + year_number = graph_year + pynini.union(" में", " का", " की", " के", " से", " तक", " ईस्वी", " शताब्दी", " दशक", " सदी") year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space year_prefix = ( diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index e4cfae302..a1dd00ce5 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -39,10 +39,11 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): numerator = pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ") denominator = pynutil.delete("denominator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") insert_bata = pynutil.insert(" बटा ") + insert_aur = pynutil.insert(" और ") fraction_default = numerator + insert_bata + denominator - self.graph = optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space) + fraction_default + self.graph = optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + fraction_default graph = self.graph diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt index 25c18b777..a2c9fb519 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -19,3 +19,5 @@ १०००००००००००००/३~एक नील बटा तीन १०००००००००००००००/८~एक पद्म बटा आठ १०००००००००००००००००/४१२~एक शंख बटा चार सौ बारह +२ २/७~दो और दो बटा सात +१२० ७५/९०~एक सौ बीस और पचहत्तर बटा नब्बे \ No newline at end of file From 15c411ebb819f7981021a2b91294811b292785f9 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Tue, 25 Mar 2025 17:51:24 +0530 Subject: [PATCH 24/30] rectified the test case Signed-off-by: Namrata Gachchi --- .../hi/data_text_normalization/test_cases_fraction.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt index a2c9fb519..d1473412e 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_fraction.txt @@ -1,5 +1,5 @@ ९९/९९~निन्यानबे बटा निन्यानबे -२२ ३१/१७~बाईस इकतीस बटा सत्रह +२२ ३१/१७~बाईस और इकतीस बटा सत्रह ९७/०~सत्तानबे बटा शून्य २५६३/४१२~दो हज़ार पाँच सौ तिरेसठ बटा चार सौ बारह ७२८६०/७०~बहत्तर हज़ार आठ सौ साठ बटा सत्तर From 6d367342699cc454b644b2b59c33adfbcd92996a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 25 Mar 2025 12:19:59 +0000 Subject: [PATCH 25/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- nemo_text_processing/text_normalization/hi/taggers/date.py | 4 +++- .../text_normalization/hi/verbalizers/fraction.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index cbc4bbee4..b2c0ac717 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -87,7 +87,9 @@ def __init__(self, cardinal: GraphFst): century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space # Graph for year - year_number = graph_year + pynini.union(" में", " का", " की", " के", " से", " तक", " ईस्वी", " शताब्दी", " दशक", " सदी") + year_number = graph_year + pynini.union( + " में", " का", " की", " के", " से", " तक", " ईस्वी", " शताब्दी", " दशक", " सदी" + ) year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space year_prefix = ( diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index a1dd00ce5..cba534e61 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -43,7 +43,11 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): fraction_default = numerator + insert_bata + denominator - self.graph = optional_sign + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + fraction_default + self.graph = ( + optional_sign + + pynini.closure(pynini.closure(integer, 0, 1) + insert_space + insert_aur) + + fraction_default + ) graph = self.graph From 6e724fe38bb192bf388092483eb1288479a1a5f6 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Thu, 3 Apr 2025 16:48:29 +0530 Subject: [PATCH 26/30] working on quarterly measurements Signed-off-by: Namrata Gachchi --- .../hi/data/measure/quarterly_units.tsv | 12 ++++++++ .../text_normalization/hi/taggers/measure.py | 28 ++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv diff --git a/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv new file mode 100644 index 000000000..eaddf930a --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv @@ -0,0 +1,12 @@ +s सेकंड +hr घंटा +h घंटे +min मिनट +doz दर्जन +yr साल +yr वर्ष +hp हॉर्सपॉवर +d दिन +month महीना +months महीने +हफ़्ते हफ़्ते \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index fff6a4057..9eddd7f31 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -53,18 +53,24 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv")) + quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv")) optional_graph_negative = pynini.closure( pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1, ) + # Define the quarterly measurements + quarter = pynini.string_map([(".५", "साढ़े"), ("१.५", "डेढ़"), ("२.५", "ढाई"),]) + quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\" ") + # Define the unit handling unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") + units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") # Handling symbols like x, X, * symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),]) - graph_measurements = ( + graph_decimal = ( pynutil.insert("decimal { ") + optional_graph_negative + decimal_graph @@ -73,7 +79,16 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + unit ) - graph_measurements |= ( + graph_quarter = ( + pynutil.insert("decimal { ") + + optional_graph_negative + + quarter_graph + + pynutil.insert(" }") + + delete_space + + units + ) + + graph_cardinal = ( pynutil.insert("cardinal { ") + optional_graph_negative + pynutil.insert("integer: \"") @@ -85,7 +100,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) # Handling cardinal clubbed with symbol as single token - graph_measurements |= ( + graph_exceptions = ( pynutil.insert("cardinal { ") + optional_graph_negative + pynutil.insert("integer: \"") @@ -104,7 +119,12 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): + pynutil.insert("\"") ) - graph = graph_measurements + graph = ( + pynutil.add_weight(graph_decimal, 0.01) + | pynutil.add_weight(graph_quarter, 0.001) + | pynutil.add_weight(graph_cardinal, 0.01) + | pynutil.add_weight(graph_exceptions, 0.01) + ) self.graph = graph.optimize() final_graph = self.add_tokens(graph) From 990c25af58d0e3a9a2a5abd82621478412b44f37 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Thu, 17 Apr 2025 15:12:45 +0530 Subject: [PATCH 27/30] reformatted the prefixes and suffixes for date tagger class Signed-off-by: Namrata Gachchi --- .../hi/data/date/prefixes.tsv | 3 +++ .../hi/data/date/suffixes.tsv | 10 ++++++++++ .../text_normalization/hi/taggers/date.py | 18 +++++++++++++----- .../text_normalization/hi/taggers/measure.py | 8 ++++---- 4 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv create mode 100644 nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv diff --git a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv new file mode 100644 index 000000000..d4c1ca0b1 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv @@ -0,0 +1,3 @@ +सन् +सन +साल \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv new file mode 100644 index 000000000..6806d3f12 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/suffixes.tsv @@ -0,0 +1,10 @@ + में + का + की + के + से + तक + ईस्वी + शताब्दी + दशक + सदी \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index b2c0ac717..1727f0403 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -31,6 +31,15 @@ teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) +# Read suffixes from file into a list +with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: + suffixes_list = f.read().splitlines() +with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: + prefixes_list = f.read().splitlines() + +# Create union of suffixes and prefixes +suffix_union = pynini.union(*suffixes_list) +prefix_union = pynini.union(*prefixes_list) class DateFst(GraphFst): """ @@ -86,15 +95,14 @@ def __init__(self, cardinal: GraphFst): century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space - # Graph for year - year_number = graph_year + pynini.union( - " में", " का", " की", " के", " से", " तक", " ईस्वी", " शताब्दी", " दशक", " सदी" - ) + # Updated logic to use suffix_union + year_number = graph_year + suffix_union year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space + # Updated logic to use prefix_union year_prefix = ( pynutil.insert("text: \"") - + pynini.union("सन् ", "सन ", "साल ") + + prefix_union + insert_space + graph_year + pynutil.insert("\"") diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index 9eddd7f31..acf2c12ac 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -61,11 +61,11 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): # Define the quarterly measurements quarter = pynini.string_map([(".५", "साढ़े"), ("१.५", "डेढ़"), ("२.५", "ढाई"),]) - quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\" ") + quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") # Define the unit handling unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") - units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") + units = pynutil.insert("units: \"") + quarterly_units_graph + pynutil.insert("\" ") # Handling symbols like x, X, * symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),]) @@ -80,7 +80,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): ) graph_quarter = ( - pynutil.insert("decimal { ") + pynutil.insert("cardinal { ") + optional_graph_negative + quarter_graph + pynutil.insert(" }") @@ -121,7 +121,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): graph = ( pynutil.add_weight(graph_decimal, 0.01) - | pynutil.add_weight(graph_quarter, 0.001) + | pynutil.add_weight(graph_quarter, 0.005) | pynutil.add_weight(graph_cardinal, 0.01) | pynutil.add_weight(graph_exceptions, 0.01) ) From 4c3f426e02d843459c9bdbadc01e4848e6b8aae7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Apr 2025 09:44:12 +0000 Subject: [PATCH 28/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/hi/taggers/date.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index 1727f0403..a0777ecda 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -41,6 +41,7 @@ suffix_union = pynini.union(*suffixes_list) prefix_union = pynini.union(*prefixes_list) + class DateFst(GraphFst): """ Finite state transducer for classifying date, e.g. @@ -96,17 +97,11 @@ def __init__(self, cardinal: GraphFst): century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space # Updated logic to use suffix_union - year_number = graph_year + suffix_union + year_number = graph_year + suffix_union year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space # Updated logic to use prefix_union - year_prefix = ( - pynutil.insert("text: \"") - + prefix_union - + insert_space - + graph_year - + pynutil.insert("\"") - ) + year_prefix = pynutil.insert("text: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph From 7fe10ccf787d55bc5dc218cf1c27d866c9764f71 Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Mon, 21 Apr 2025 17:56:16 +0530 Subject: [PATCH 29/30] replaced text tag with era tag for the date class Signed-off-by: Namrata Gachchi --- .../text_normalization/hi/taggers/date.py | 11 ++++------- .../text_normalization/hi/taggers/measure.py | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index a0777ecda..468753e23 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -94,14 +94,14 @@ def __init__(self, cardinal: GraphFst): # Graph for year century_number = pynini.compose(pynini.closure(NEMO_HI_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") - century_text = pynutil.insert("text: \"") + century_number + pynutil.insert("\"") + insert_space + century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space # Updated logic to use suffix_union year_number = graph_year + suffix_union - year_text = pynutil.insert("text: \"") + year_number + pynutil.insert("\"") + insert_space + year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space # Updated logic to use prefix_union - year_prefix = pynutil.insert("text: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") + year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") graph_dd_mm_yyyy = ( days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph @@ -117,10 +117,8 @@ def __init__(self, cardinal: GraphFst): graph_year_suffix = era_graph - comma_graph = pynutil.insert("text: \"") + delete_comma + insert_space + graph_year + pynutil.insert("\"") - graph_range = ( - pynutil.insert("text: \"") + pynutil.insert("era: \"") + cardinal_graph + insert_space + range_graph @@ -143,7 +141,6 @@ def __init__(self, cardinal: GraphFst): | pynutil.add_weight(century_text, -0.001) | pynutil.add_weight(year_text, -0.001) | pynutil.add_weight(year_prefix, -0.009) - | comma_graph ) self.final_graph = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py index acf2c12ac..954215771 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/measure.py +++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py @@ -64,8 +64,8 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst): quarter_graph = pynutil.insert("integer_part: \"") + quarter + pynutil.insert("\"") # Define the unit handling - unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ") - units = pynutil.insert("units: \"") + quarterly_units_graph + pynutil.insert("\" ") + unit = pynutil.insert(" units: \"") + unit_graph + pynutil.insert("\" ") + units = pynutil.insert(" units: \"") + quarterly_units_graph + pynutil.insert("\" ") # Handling symbols like x, X, * symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),]) From f0dcc0b707c09027a5717993227fdf1892a56eda Mon Sep 17 00:00:00 2001 From: Namrata Gachchi Date: Tue, 22 Apr 2025 21:28:31 +0530 Subject: [PATCH 30/30] Removed the text tag reference from date class verbalizer Signed-off-by: Namrata Gachchi --- .../text_normalization/hi/verbalizers/date.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index a754a8e62..187acf7d6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -41,8 +41,6 @@ def __init__(self): graph_era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - graph_range = pynutil.delete("text: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") - graph_dd_mm = day + NEMO_SPACE + month graph_mm_dd = month + NEMO_SPACE + day @@ -64,7 +62,7 @@ def __init__(self): ) self.graph = ( - (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era | graph_range) + (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era) + delete_space + optional_preserve_order )