From 513fff52d51376f397794c4ad9b38ed4042cfafc Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Wed, 27 Nov 2024 15:17:49 +0530 Subject: [PATCH 01/28] Addition of whitelist and word classes Signed-off-by: Tarushi V --- .../hi/data/whitelist/whitelist.tsv | 13 +++++++++++++ .../hi/data/whitelist/whitelist_fraction.tsv | 3 --- .../hi/data/whitelist/whitelist_time.tsv | 2 -- .../hi/taggers/tokenize_and_classify.py | 6 +++--- .../hi/taggers/whitelist.py | 2 +- .../hi/verbalizers/verbalize.py | 4 +++- .../test_cases_whitelist.txt | 12 ++++++++++++ .../test_cases_word.txt | 15 +++++++++++++++ ...test_sparrowhawk_inverse_text_normalization.sh | 10 ++++++++++ tests/nemo_text_processing/hi/test_whitelist.py | 9 +++++++++ tests/nemo_text_processing/hi/test_word.py | 10 ++++++++++ tools/text_processing_deployment/pynini_export.py | 2 +- 12 files changed, 77 insertions(+), 11 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv new file mode 100644 index 000000000..f9eb081b9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv @@ -0,0 +1,13 @@ +१/४ पाव +१/२ आधा +३/४ पौन +१:३० डेढ़ बजे +२:३० ढाई बजे +१.५ डेढ़ +२.५ ढाई +कु. कुमारी +स्मि. श्रीमती +श्री. श्री +श्री. श्रीमान +मा. मास्टर +डॉ. डॉक्टर \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv deleted file mode 100644 index d3596a955..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv +++ /dev/null @@ -1,3 +0,0 @@ -१/४ पाव -१/२ आधा -३/४ पौन \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv deleted file mode 100644 index aaf5baf8b..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv +++ /dev/null @@ -1,2 +0,0 @@ -१:३० डेढ़ -२:३० ढाई \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index 5267da2bb..9c8168aa0 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -35,7 +35,7 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst class ClassifyFst(GraphFst): @@ -83,7 +83,7 @@ def __init__( money = MoneyFst(cardinal, decimal) money_graph = money.fst punct_graph = PunctuationFst().fst - # whitelist_graph = WhiteListFst(input_file=whitelist).fst + whitelist_graph = WhiteListFst().fst word_graph = WordFst().fst classify = ( @@ -96,7 +96,7 @@ def __init__( | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(word_graph, 100) - # | pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(whitelist_graph, 1.01) ) punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py index 2d522c4ba..caeab03b1 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py @@ -47,7 +47,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None): super().__init__(name="whitelist", kind="classify") if input_file is None: - input_file = get_abs_path("data/whitelist.tsv") + input_file = get_abs_path("data/whitelist/whitelist.tsv") if not os.path.exists(input_file): raise ValueError(f"Whitelist file {input_file} not found") diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index b6f9bd70a..7aaef4fc3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -22,6 +22,7 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -44,12 +45,13 @@ def __init__(self): time_graph = TimeFst().fst measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst - + word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst graph = ( cardinal_graph | whitelist_graph + | word_graph | ordinal_graph | decimal_graph | fraction_graph diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..30824fced --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,12 @@ +डेढ़ बजे~१:३० +ढाई बजे~२:३० +मास्टर निखिल तनिष~मा. निखिल तनिष +पाव~१/४ +श्रीमती ज्योत्सना~स्मि. ज्योत्सना +डॉक्टर~डॉ. +आधा कप चाय~१/२ कप चाय +श्रीमान भारत कुमार~श्री. भारत कुमार +डॉक्टर प्रशांत~डॉ. प्रशांत +डेढ़~१.५ +कुमारी~कु. +ढाई~२.५ \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..ce044e7cf --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,15 @@ +नींद~नींद +याहू!~याहू! +-~- +आआआ~आआआ +आकाशगंगा~आकाशगंगा +लटरपटर~लटरपटर +कच्चा-पक्का~कच्चा-पक्का +गुब्बारा~गुब्बारा +चिट्ठी~चिट्ठी +ढूंढना~ढूंढना +लोहे का!~लोहे का! +टाटा~टाटा +~ +झ~झ +संगीत~संगीत \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh index 61093c60d..aec7299d5 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh @@ -63,6 +63,16 @@ testITNMoney() { runtest $input } +testITNWord() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt + runtest $input +} + +testITNWhiteList() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + # Load shUnit2 . $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/hi/test_whitelist.py b/tests/nemo_text_processing/hi/test_whitelist.py index 4a090d823..c6a228e6e 100644 --- a/tests/nemo_text_processing/hi/test_whitelist.py +++ b/tests/nemo_text_processing/hi/test_whitelist.py @@ -15,6 +15,7 @@ import pytest from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -24,6 +25,7 @@ class TestWhitelist: normalizer = Normalizer( input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False ) + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_whitelist.txt')) @pytest.mark.run_only_on('CPU') @@ -31,3 +33,10 @@ class TestWhitelist: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi/test_word.py b/tests/nemo_text_processing/hi/test_word.py index 4d6bd2261..30d809356 100644 --- a/tests/nemo_text_processing/hi/test_word.py +++ b/tests/nemo_text_processing/hi/test_word.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -24,6 +25,8 @@ class TestWord: normalizer = Normalizer( input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True ) + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @@ -31,3 +34,10 @@ class TestWord: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 23b1f7deb..6b82dfbec 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -21,7 +21,7 @@ import pynini -from nemo_text_processing.text_normalization.rw.graph_utils import generator_main +from nemo_text_processing.text_normalization.en.graph_utils import generator_main # This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files # tokenize_and_classify.far and verbalize.far for production purposes From 535af69bb96d376cadcd9e8f03eebddc4afc3c06 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Nov 2024 09:52:07 +0000 Subject: [PATCH 02/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/tokenize_and_classify.py | 2 +- .../inverse_text_normalization/hi/verbalizers/verbalize.py | 2 +- tests/nemo_text_processing/hi/test_whitelist.py | 2 +- tests/nemo_text_processing/hi/test_word.py | 5 ++--- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index 9c8168aa0..a5a371d90 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -34,8 +34,8 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 7aaef4fc3..d88bd25d9 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -22,8 +22,8 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst class VerbalizeFst(GraphFst): diff --git a/tests/nemo_text_processing/hi/test_whitelist.py b/tests/nemo_text_processing/hi/test_whitelist.py index c6a228e6e..1e45e6a0e 100644 --- a/tests/nemo_text_processing/hi/test_whitelist.py +++ b/tests/nemo_text_processing/hi/test_whitelist.py @@ -33,7 +33,7 @@ class TestWhitelist: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/hi/test_word.py b/tests/nemo_text_processing/hi/test_word.py index 30d809356..6fc5883cc 100644 --- a/tests/nemo_text_processing/hi/test_word.py +++ b/tests/nemo_text_processing/hi/test_word.py @@ -15,8 +15,8 @@ import pytest from parameterized import parameterized -from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -27,14 +27,13 @@ class TestWord: ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) - @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) assert pred == expected - + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit From d4e380fdc8a65a8c9847575b991cf80e6873a50c Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Wed, 27 Nov 2024 16:43:57 +0530 Subject: [PATCH 03/28] Updation of Jenkins date Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index e9cfcde12..fe6a75161 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-27-24-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From 60f87577e4418f35a06d313166864b8575873d6d Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Thu, 28 Nov 2024 15:23:19 +0530 Subject: [PATCH 04/28] Cleanup Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index fe6a75161..63fb1a01b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-27-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-28-24-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From 9aa85c0cf927a7459d0e9ee00c91c109c1df7dc8 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Fri, 29 Nov 2024 15:49:43 +0530 Subject: [PATCH 05/28] Updation Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 63fb1a01b..40dd4d626 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-28-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From bf6ebe3c5f5fd8841af8e0176abbbfc8b1116b23 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Fri, 29 Nov 2024 17:30:48 +0530 Subject: [PATCH 06/28] Updation Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 40dd4d626..4883d7169 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From ba19d36ced05b630b88a4a9c404b01a0a5208442 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Thu, 12 Dec 2024 15:54:29 +0530 Subject: [PATCH 07/28] Future implementations for date Signed-off-by: Tarushi V --- .../hi/data/date/century.tsv | 3 ++ .../hi/taggers/date.py | 48 ++++++++++++----- .../hi/verbalizers/date.py | 52 ++++++++++++++----- 3 files changed, 76 insertions(+), 27 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv new file mode 100644 index 000000000..bd188a059 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv @@ -0,0 +1,3 @@ +ई. पू. ईसा पूर्व +ई. ईस्वी +ई. ईसवी diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 61183ae72..f5c10ad7c 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -14,7 +14,7 @@ # limitations under the License. import pynini from pynini.lib import pynutil - + from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( NEMO_HI_DIGIT, GraphFst, @@ -22,9 +22,9 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path - - +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path, apply_fst + + class DateFst(GraphFst): """ Finite state transducer for classifying date, @@ -34,22 +34,25 @@ class DateFst(GraphFst): cardinal: CardinalFst date: DateFst """ - + def __init__(self, cardinal: GraphFst): super().__init__(name="date", kind="classify") - + graph_year = pynutil.add_weight( pynini.compose(cardinal.graph_no_exception, pynini.closure(NEMO_HI_DIGIT, 1, 4)), 0.03 ) - + month_graph = pynini.string_file(get_abs_path("data/date/months.tsv")) graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert() - + graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert() + + self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ") + self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") insert_comma = pynutil.insert(", ") - + graph_day_month = self.day + delete_space + self.month graph_month_day = self.month + delete_space + self.day graph_month_day += pynutil.insert(" preserve_order: true") @@ -58,9 +61,28 @@ def __init__(self, cardinal: GraphFst): graph_month_day_year += pynutil.insert(" preserve_order: true") graph_month_year = self.month + delete_space + self.year graph_saal = self.year - - graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year - self.graph = graph.optimize() - + graph_AD_BC = self.year + delete_space + self.century + graph_day_month_year_century = self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century + graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century + graph_year_range = self.year + delete_space + pynutil.delete("से") + delete_space + self.year + + + graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year | graph_saal | graph_AD_BC | graph_day_month_year_century | graph_month_year_century | graph_year_range final_graph = self.add_tokens(graph) self.fst = final_graph + +#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +#cardinal = CardinalFst() +#date = DateFst(cardinal) +#input_text = "पच्चीस मार्च दो हज़ार दस" +#input_text = "छ: मार्च उन्नीस सौ नब्बे" +#input_text = "छ: मार्च उन्नीस सौ नब्बे ईस्वी" +#input_text = "छह मार्च दो हज़ार दस" +#input_text = "तीन फ़रवरी" +#input_text = "चौवालीस सौ ईसा पूर्व" +#input_text = "फ़रवरी चौवालीस सौ ईसा पूर्व" +#input_text = "चौवालीस सौ ईस्वी" +#input_text = "उन्नीस सौ बीस से उन्नीस सौ छब्बीस" +#input_text = "उन्नीस सौ बीस से छब्बीस" +#output = apply_fst(input_text, date.fst) +#print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index 5442777da..235c2ccb1 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -11,25 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - + import pynini from pynini.lib import pynutil - + from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_space, ) - - + +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst + + class DateFst(GraphFst): """ Finite state transducer for verbalizing date, e.g. date { month: "जनवरी" day: "५" year: "२०१२" preserve_order: true } -> जनवरी ५ २०१२ date { day: "५" month: "जनवरी" year: "२०१२" preserve_order: true } -> ५ जनवरी २०१२ """ - + def __init__(self): super().__init__(name="date", kind="verbalize") month = ( @@ -61,22 +63,32 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") ) - graph_fy = period + delete_space + year + graph_fy = year + graph_fy |= period + delete_space + year + #century + graph_century = year + delete_extra_space + period # month (day) year graph_mdy = month + delete_extra_space + day + pynutil.insert(",") + delete_extra_space + year - + # (day) month year graph_dmy = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year - + # day month year century + graph_dmyc = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period + # month year graph_my = month + pynini.closure(delete_extra_space + year, 0, 1) - + # month year century + graph_myc = month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period + # month day graph_md = month + pynini.closure(delete_extra_space + day, 0, 1) - + # day month graph_dm = day + pynini.closure(delete_extra_space + month, 0, 1) - + # date range + graph_year_range = year + delete_extra_space + pynutil.insert("-") + delete_extra_space + year + + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -86,12 +98,24 @@ def __init__(self): + pynutil.delete("\"") + delete_space ) - + final_graph = ( - (graph_fy | graph_mdy | graph_dmy | graph_my | graph_md | graph_dm) + (graph_fy | graph_mdy | graph_dmy | graph_my | graph_md | graph_dm | graph_century | graph_dmyc | graph_myc | graph_year_range) + delete_space + optional_preserve_order ) - + delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() +date = DateFst() +#input_text = 'date { period: "सन " year: "२०१९" }' +#input_text = 'date { day: "१७"month: "अप्रैल"year: "२००२" }' +#input_text = 'date { day: "२५" month: "मार्च" year: "२०१०" }' +#input_text = 'date { day: "१७" month: "अक्टूबर" year: "२०१९" }' +#input_text = 'date { year: "४४००" }' +#input_text = 'date { year: "४४००" text: "ईस्वी" }' +#input_text = 'date { year: "४४००" text: "ई. पू." }' +#input_text = 'date { day: "२५" month: "मार्च" year: "२०१०" text: "ई. पू." }' +input_text = 'date { year: "१९२०" year: "१९२६" }' +output = apply_fst(input_text, date.fst) +print(output) From 6452e610eb75c968528b08435f4e762900650570 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Mon, 16 Dec 2024 16:12:52 +0530 Subject: [PATCH 08/28] pushing rough date code for ref Signed-off-by: Tarushi V --- .../hi/taggers/date.py | 22 ++++++++++-------- .../hi/verbalizers/date.py | 23 +++++++++++++------ .../test_cases_date.txt | 10 +++++++- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index f5c10ad7c..51c4d1298 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -49,7 +49,7 @@ def __init__(self, cardinal: GraphFst): self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") - self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ") + self.year = pynutil.insert("year: \"") + graph_year + delete_space + pynini.cross("से", "-") + delete_space + graph_year + delete_space + pynutil.insert("\" ") self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") insert_comma = pynutil.insert(", ") @@ -64,16 +64,16 @@ def __init__(self, cardinal: GraphFst): graph_AD_BC = self.year + delete_space + self.century graph_day_month_year_century = self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century - graph_year_range = self.year + delete_space + pynutil.delete("से") + delete_space + self.year + graph_year_range = self.year + graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day - - graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year | graph_saal | graph_AD_BC | graph_day_month_year_century | graph_month_year_century | graph_year_range + graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year | graph_saal | graph_AD_BC | graph_day_month_year_century | graph_month_year_century | graph_year_range | graph_date_exceptions final_graph = self.add_tokens(graph) self.fst = final_graph -#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -#cardinal = CardinalFst() -#date = DateFst(cardinal) +from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +cardinal = CardinalFst() +date = DateFst(cardinal) #input_text = "पच्चीस मार्च दो हज़ार दस" #input_text = "छ: मार्च उन्नीस सौ नब्बे" #input_text = "छ: मार्च उन्नीस सौ नब्बे ईस्वी" @@ -81,8 +81,10 @@ def __init__(self, cardinal: GraphFst): #input_text = "तीन फ़रवरी" #input_text = "चौवालीस सौ ईसा पूर्व" #input_text = "फ़रवरी चौवालीस सौ ईसा पूर्व" -#input_text = "चौवालीस सौ ईस्वी" +input_text = "चौवालीस सौ ईस्वी" #input_text = "उन्नीस सौ बीस से उन्नीस सौ छब्बीस" #input_text = "उन्नीस सौ बीस से छब्बीस" -#output = apply_fst(input_text, date.fst) -#print(output) +#input_text = "मार्च की दो" +#input_text = "फ़रवरी की बीस" +output = apply_fst(input_text, date.fst) +print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index 235c2ccb1..922bcf049 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -65,18 +65,22 @@ def __init__(self): ) graph_fy = year graph_fy |= period + delete_space + year + #century graph_century = year + delete_extra_space + period + # month (day) year graph_mdy = month + delete_extra_space + day + pynutil.insert(",") + delete_extra_space + year # (day) month year graph_dmy = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year + # day month year century graph_dmyc = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period # month year graph_my = month + pynini.closure(delete_extra_space + year, 0, 1) + # month year century graph_myc = month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period @@ -85,9 +89,12 @@ def __init__(self): # day month graph_dm = day + pynini.closure(delete_extra_space + month, 0, 1) - # date range - graph_year_range = year + delete_extra_space + pynutil.insert("-") + delete_extra_space + year - + + # year range + graph_year_range = year + + # date exceptions + #graph_date_exceptions = day + delete_extra_space + pynutil.insert("की") + delete_extra_space + month optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space @@ -107,7 +114,8 @@ def __init__(self): delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() -date = DateFst() + +#date = DateFst() #input_text = 'date { period: "सन " year: "२०१९" }' #input_text = 'date { day: "१७"month: "अप्रैल"year: "२००२" }' #input_text = 'date { day: "२५" month: "मार्च" year: "२०१०" }' @@ -116,6 +124,7 @@ def __init__(self): #input_text = 'date { year: "४४००" text: "ईस्वी" }' #input_text = 'date { year: "४४००" text: "ई. पू." }' #input_text = 'date { day: "२५" month: "मार्च" year: "२०१०" text: "ई. पू." }' -input_text = 'date { year: "१९२०" year: "१९२६" }' -output = apply_fst(input_text, date.fst) -print(output) +#input_text = 'date { year: "१९२०-२६" }' +#input_text = 'date { month: "फ़रवरी" day: "२०" }' +#output = apply_fst(input_text, date.fst) +#print(output) diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt index bdc450fdd..96d5cbadf 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt @@ -22,4 +22,12 @@ सत्ताईस जुलाई दो हज़ार ग्यारह~२७ जुलाई, २०११ जुलाई सत्ताईस~जुलाई २७ वर्ष दो हज़ार उन्नीस~वर्ष २०१९ -सन उन्नीस सौ नब्बे~सन १९९० \ No newline at end of file +सन उन्नीस सौ नब्बे~सन १९९० +उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे~१९९०-१९९१ +दो हज़ार पाँच से दो हज़ार उन्नीस~२००५-२०१९ +दो हज़ार पाँच से उन्नीस~२००५-१९ +चौंतीस सौ ईसा पूर्व~३४०० ई. पू. +उन्नीस सौ बीस ईस्वी~१९२० ई. +पच्चीस जनवरी अठारह सौ तिरेपन ईसवी~२५ जनवरी, १८५३ ई. +इकत्तीस मई उन्नीस सौ नब्बे ईसवी~३१ मई, १९९० ई. +पच्चीस ईसा पूर्व~२५ ई.पू. From 3821339e20d72763723ec4bea8abd4ed95638105 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Thu, 9 Jan 2025 14:32:34 +0530 Subject: [PATCH 09/28] Future implementations date.py Signed-off-by: Tarushi V --- .../hi/data/date/century.tsv | 2 +- .../hi/taggers/date.py | 33 +++++-------------- .../hi/verbalizers/date.py | 23 ++----------- .../test_cases_date.txt | 4 ++- tools/text_processing_deployment/Dockerfile | 13 +++++--- 5 files changed, 23 insertions(+), 52 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv index bd188a059..da69e23eb 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv +++ b/nemo_text_processing/inverse_text_normalization/hi/data/date/century.tsv @@ -1,3 +1,3 @@ -ई. पू. ईसा पूर्व +ई.पू. ईसा पूर्व ई. ईस्वी ई. ईसवी diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index 51c4d1298..d3fb48eca 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -22,7 +22,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path, apply_fst +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path class DateFst(GraphFst): @@ -46,10 +46,10 @@ def __init__(self, cardinal: GraphFst): graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert() graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert() - self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") - self.year = pynutil.insert("year: \"") + graph_year + delete_space + pynini.cross("से", "-") + delete_space + graph_year + delete_space + pynutil.insert("\" ") + self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ") + self.year_range = pynutil.insert("year: \"") + graph_year + delete_space + pynini.cross("से", "-") + delete_space + graph_year + delete_space + pynutil.insert("\" ") self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") insert_comma = pynutil.insert(", ") @@ -64,27 +64,12 @@ def __init__(self, cardinal: GraphFst): graph_AD_BC = self.year + delete_space + self.century graph_day_month_year_century = self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century - graph_year_range = self.year + graph_year_range = self.year_range + graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day - - graph = graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year | graph_saal | graph_AD_BC | graph_day_month_year_century | graph_month_year_century | graph_year_range | graph_date_exceptions + graph_date_exceptions += pynutil.insert("preserve_order: true") + + + graph = (graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year | graph_saal | graph_AD_BC | graph_day_month_year_century | graph_month_year_century | graph_year_range | graph_date_exceptions) final_graph = self.add_tokens(graph) self.fst = final_graph - -from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -cardinal = CardinalFst() -date = DateFst(cardinal) -#input_text = "पच्चीस मार्च दो हज़ार दस" -#input_text = "छ: मार्च उन्नीस सौ नब्बे" -#input_text = "छ: मार्च उन्नीस सौ नब्बे ईस्वी" -#input_text = "छह मार्च दो हज़ार दस" -#input_text = "तीन फ़रवरी" -#input_text = "चौवालीस सौ ईसा पूर्व" -#input_text = "फ़रवरी चौवालीस सौ ईसा पूर्व" -input_text = "चौवालीस सौ ईस्वी" -#input_text = "उन्नीस सौ बीस से उन्नीस सौ छब्बीस" -#input_text = "उन्नीस सौ बीस से छब्बीस" -#input_text = "मार्च की दो" -#input_text = "फ़रवरी की बीस" -output = apply_fst(input_text, date.fst) -print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index 922bcf049..1945f9e5c 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -21,9 +21,7 @@ delete_extra_space, delete_space, ) - -from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst - + class DateFst(GraphFst): """ @@ -93,9 +91,6 @@ def __init__(self): # year range graph_year_range = year - # date exceptions - #graph_date_exceptions = day + delete_extra_space + pynutil.insert("की") + delete_extra_space + month - optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -105,7 +100,7 @@ def __init__(self): + pynutil.delete("\"") + delete_space ) - + final_graph = ( (graph_fy | graph_mdy | graph_dmy | graph_my | graph_md | graph_dm | graph_century | graph_dmyc | graph_myc | graph_year_range) + delete_space @@ -114,17 +109,3 @@ def __init__(self): delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() - -#date = DateFst() -#input_text = 'date { period: "सन " year: "२०१९" }' -#input_text = 'date { day: "१७"month: "अप्रैल"year: "२००२" }' -#input_text = 'date { day: "२५" month: "मार्च" year: "२०१०" }' -#input_text = 'date { day: "१७" month: "अक्टूबर" year: "२०१९" }' -#input_text = 'date { year: "४४००" }' -#input_text = 'date { year: "४४००" text: "ईस्वी" }' -#input_text = 'date { year: "४४००" text: "ई. पू." }' -#input_text = 'date { day: "२५" month: "मार्च" year: "२०१०" text: "ई. पू." }' -#input_text = 'date { year: "१९२०-२६" }' -#input_text = 'date { month: "फ़रवरी" day: "२०" }' -#output = apply_fst(input_text, date.fst) -#print(output) diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt index 96d5cbadf..6d570a9c5 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_date.txt @@ -26,8 +26,10 @@ उन्नीस सौ नब्बे से उन्नीस सौ इक्यानबे~१९९०-१९९१ दो हज़ार पाँच से दो हज़ार उन्नीस~२००५-२०१९ दो हज़ार पाँच से उन्नीस~२००५-१९ -चौंतीस सौ ईसा पूर्व~३४०० ई. पू. +चौंतीस सौ ईसा पूर्व~३४०० ई.पू. उन्नीस सौ बीस ईस्वी~१९२० ई. पच्चीस जनवरी अठारह सौ तिरेपन ईसवी~२५ जनवरी, १८५३ ई. इकत्तीस मई उन्नीस सौ नब्बे ईसवी~३१ मई, १९९० ई. पच्चीस ईसा पूर्व~२५ ई.पू. +मार्च की दो~मार्च २ +फ़रवरी की बीस~फ़रवरी २० diff --git a/tools/text_processing_deployment/Dockerfile b/tools/text_processing_deployment/Dockerfile index 22c2b8b92..be6fedcda 100644 --- a/tools/text_processing_deployment/Dockerfile +++ b/tools/text_processing_deployment/Dockerfile @@ -16,22 +16,25 @@ # Dockerfile for C++ (inverse) text normalization backend Sparrowhawk https://github.com/google/sparrowhawk # set base image (host OS) -FROM conda/miniconda3 +FROM continuumio/miniconda3 + # set the working directory in the container WORKDIR /workspace # install dependencies RUN echo "deb http://archive.debian.org/debian stretch main contrib non-free" > /etc/apt/sources.list +RUN apt-get update && apt-get upgrade -y && apt-get install -y --reinstall build-essential pkg-config git make wget RUN conda install conda-build -y -RUN apt-get update && apt-get install -y --reinstall build-essential pkg-config && apt-get upgrade -y && apt-get install -y git && apt-get install make +RUN conda install -c conda-forge thrax=1.3.4 -y RUN git clone https://github.com/google/re2 RUN cd re2 && git checkout tags/2022-02-01 && make && make install -RUN apt-get install build-essential -y && apt-get install wget -y RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz RUN tar xzvf protobuf-2.5.0.tar.gz RUN cd protobuf-2.5.0 && ./configure && make && make install && ldconfig -RUN conda install -c conda-forge thrax=1.3.4 -y +RUN printf "# Conda lib path \n/opt/conda/lib" > /etc/ld.so.conf.d/conda.so.conf +ENV CPPFLAGS="-I/opt/conda/include" +ENV LDFLAGS="-L/opt/conda/lib" RUN git clone https://github.com/anand-nv/sparrowhawk.git && cd sparrowhawk && git checkout nemo_tests && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig RUN git clone https://github.com/kward/shunit2.git -RUN echo "DONE" \ No newline at end of file +RUN echo "DONE" From 6ece14bf3c3f0340df1b69768eaaca39e86f0557 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Thu, 9 Jan 2025 14:35:13 +0530 Subject: [PATCH 10/28] Cleanup Signed-off-by: Tarushi V --- .../inverse_text_normalization/hi/taggers/date.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index d3fb48eca..e1070fd71 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -69,7 +69,6 @@ def __init__(self, cardinal: GraphFst): graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day graph_date_exceptions += pynutil.insert("preserve_order: true") - graph = (graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year | graph_saal | graph_AD_BC | graph_day_month_year_century | graph_month_year_century | graph_year_range | graph_date_exceptions) final_graph = self.add_tokens(graph) self.fst = final_graph From 6ec714c8a394845c4ba275a9620d310d6529a412 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 09:08:17 +0000 Subject: [PATCH 11/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/date.py | 45 ++++++++++---- .../hi/verbalizers/date.py | 58 +++++++++++++------ 2 files changed, 73 insertions(+), 30 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py index e1070fd71..6859f0834 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/date.py @@ -14,7 +14,7 @@ # limitations under the License. import pynini from pynini.lib import pynutil - + from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( NEMO_HI_DIGIT, GraphFst, @@ -23,8 +23,8 @@ insert_space, ) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path - - + + class DateFst(GraphFst): """ Finite state transducer for classifying date, @@ -34,25 +34,34 @@ class DateFst(GraphFst): cardinal: CardinalFst date: DateFst """ - + def __init__(self, cardinal: GraphFst): super().__init__(name="date", kind="classify") - + graph_year = pynutil.add_weight( pynini.compose(cardinal.graph_no_exception, pynini.closure(NEMO_HI_DIGIT, 1, 4)), 0.03 ) - + month_graph = pynini.string_file(get_abs_path("data/date/months.tsv")) graph_date_days = pynini.string_file(get_abs_path("data/date/date_days.tsv")).invert() graph_century = pynini.string_file(get_abs_path("data/date/century.tsv")).invert() - + self.day = pynutil.insert("day: \"") + graph_date_days + pynutil.insert("\" ") self.month = pynutil.insert("month: \"") + month_graph + pynutil.insert("\" ") self.year = pynutil.insert("year: \"") + graph_year + pynutil.insert("\" ") - self.year_range = pynutil.insert("year: \"") + graph_year + delete_space + pynini.cross("से", "-") + delete_space + graph_year + delete_space + pynutil.insert("\" ") + self.year_range = ( + pynutil.insert("year: \"") + + graph_year + + delete_space + + pynini.cross("से", "-") + + delete_space + + graph_year + + delete_space + + pynutil.insert("\" ") + ) self.century = pynutil.insert("text: \"") + graph_century + pynutil.insert("\" ") insert_comma = pynutil.insert(", ") - + graph_day_month = self.day + delete_space + self.month graph_month_day = self.month + delete_space + self.day graph_month_day += pynutil.insert(" preserve_order: true") @@ -62,13 +71,27 @@ def __init__(self, cardinal: GraphFst): graph_month_year = self.month + delete_space + self.year graph_saal = self.year graph_AD_BC = self.year + delete_space + self.century - graph_day_month_year_century = self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century + graph_day_month_year_century = ( + self.day + delete_space + self.month + delete_space + self.year + delete_space + self.century + ) graph_month_year_century = self.month + delete_space + self.year + delete_space + self.century graph_year_range = self.year_range graph_date_exceptions = self.month + delete_space + pynutil.delete("की") + delete_space + self.day graph_date_exceptions += pynutil.insert("preserve_order: true") - graph = (graph_day_month | graph_month_day | graph_day_month_year | graph_month_day_year | graph_month_year | graph_saal | graph_AD_BC | graph_day_month_year_century | graph_month_year_century | graph_year_range | graph_date_exceptions) + graph = ( + graph_day_month + | graph_month_day + | graph_day_month_year + | graph_month_day_year + | graph_month_year + | graph_saal + | graph_AD_BC + | graph_day_month_year_century + | graph_month_year_century + | graph_year_range + | graph_date_exceptions + ) final_graph = self.add_tokens(graph) self.fst = final_graph diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py index 1945f9e5c..eacfb5765 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/date.py @@ -11,25 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - + import pynini from pynini.lib import pynutil - + from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( NEMO_NOT_QUOTE, GraphFst, delete_extra_space, delete_space, ) - - + + class DateFst(GraphFst): """ Finite state transducer for verbalizing date, e.g. date { month: "जनवरी" day: "५" year: "२०१२" preserve_order: true } -> जनवरी ५ २०१२ date { day: "५" month: "जनवरी" year: "२०१२" preserve_order: true } -> ५ जनवरी २०१२ """ - + def __init__(self): super().__init__(name="date", kind="verbalize") month = ( @@ -63,34 +63,43 @@ def __init__(self): ) graph_fy = year graph_fy |= period + delete_space + year - - #century + + # century graph_century = year + delete_extra_space + period - + # month (day) year graph_mdy = month + delete_extra_space + day + pynutil.insert(",") + delete_extra_space + year - + # (day) month year graph_dmy = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year - + # day month year century - graph_dmyc = day + delete_extra_space + month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period - + graph_dmyc = ( + day + + delete_extra_space + + month + + pynutil.insert(",") + + delete_extra_space + + year + + delete_extra_space + + period + ) + # month year graph_my = month + pynini.closure(delete_extra_space + year, 0, 1) - + # month year century graph_myc = month + pynutil.insert(",") + delete_extra_space + year + delete_extra_space + period - + # month day graph_md = month + pynini.closure(delete_extra_space + day, 0, 1) - + # day month graph_dm = day + pynini.closure(delete_extra_space + month, 0, 1) - + # year range graph_year_range = year - + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -102,10 +111,21 @@ def __init__(self): ) final_graph = ( - (graph_fy | graph_mdy | graph_dmy | graph_my | graph_md | graph_dm | graph_century | graph_dmyc | graph_myc | graph_year_range) + ( + graph_fy + | graph_mdy + | graph_dmy + | graph_my + | graph_md + | graph_dm + | graph_century + | graph_dmyc + | graph_myc + | graph_year_range + ) + delete_space + optional_preserve_order ) - + delete_tokens = self.delete_tokens(final_graph) self.fst = delete_tokens.optimize() From 2adeee40617353c86cdb208f828c25a95f114a43 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Wed, 15 Jan 2025 09:52:52 +0530 Subject: [PATCH 12/28] Updation of Jenkinsfile Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6edad14a2..5e3916ce2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-15-25-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From b5ede2f95817b7971af7b19a82eab2bebb2eaf79 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Mon, 20 Jan 2025 12:19:53 +0530 Subject: [PATCH 13/28] Telephone.py-hindi itn Signed-off-by: Tarushi V --- .../hi/data/telephone/eng_to_hindi_digit.tsv | 10 ++ .../telephone/teens_and_ties_eng_to_hin.tsv | 90 +++++++++++++++++ .../hi/taggers/telephone.py | 99 +++++++++++++++++++ .../hi/taggers/tokenize_and_classify.py | 4 + .../hi/verbalizers/telephone.py | 73 ++++++++++++++ .../hi/verbalizers/verbalize.py | 3 + .../test_cases_telephone.txt | 25 +++++ ..._sparrowhawk_inverse_text_normalization.sh | 5 + .../nemo_text_processing/hi/test_telephone.py | 31 ++++++ 9 files changed, 340 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py create mode 100644 nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt create mode 100644 tests/nemo_text_processing/hi/test_telephone.py diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv new file mode 100644 index 000000000..53c5e36cb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv @@ -0,0 +1,10 @@ +० zero +१ one +२ two +३ three +४ four +५ five +६ six +७ seven +८ eight +९ nine diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv new file mode 100644 index 000000000..ac37b55f2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/teens_and_ties_eng_to_hin.tsv @@ -0,0 +1,90 @@ +१० ten +११ eleven +१२ twelve +१३ thirteen +१४ fourteen +१५ fifteen +१६ sixteen +१७ seventeen +१८ eighteen +१९ nineteen +२० twenty +२१ twenty one +२२ twenty two +२३ twenty three +२४ twenty four +२५ twenty five +२६ twenty six +२७ twenty seven +२८ twenty eight +२९ twenty nine +३० thirty +३१ thirty one +३२ thirty two +३३ thirty three +३४ thirty four +३५ thirty five +३६ thirty six +३७ thirty seven +३८ thirty eight +३९ thirty nine +४० forty +४१ forty one +४२ forty two +४३ forty three +४४ forty four +४५ forty five +४६ forty six +४७ forty seven +४८ forty eight +४९ forty nine +५० fifty +५१ fifty one +५२ fifty two +५३ fifty three +५४ fifty four +५५ fifty five +५६ fifty six +५७ fifty seven +५८ fifty eight +५९ fifty nine +६० sixty +६१ sixty one +६२ sixty two +६३ sixty three +६४ sixty four +६५ sixty five +६६ sixty six +६७ sixty seven +६८ sixty eight +६९ sixty nine +७० seventy +७१ seventy one +७२ seventy two +७३ seventy three +७४ seventy four +७५ seventy five +७६ seventy six +७७ seventy seven +७८ seventy eight +७९ seventy nine +८० eighty +८१ eighty one +८२ eighty two +८३ eighty three +८४ eighty four +८५ eighty five +८६ eighty six +८७ eighty seven +८८ eighty eight +८९ eighty nine +९० ninety +९१ ninety one +९२ ninety two +९३ ninety three +९४ ninety four +९५ ninety five +९६ ninety six +९७ ninety seven +९८ ninety eight +९९ ninety nine diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py new file mode 100644 index 000000000..63136e472 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -0,0 +1,99 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + NEMO_HI_DIGIT, + GraphFst, + delete_extra_space, + delete_space, + insert_space, +) +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path, apply_fst + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for classifying telephone numbers, e.g. + e.g. प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" } + + Args: + Cardinal: CardinalFst + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="telephone", kind="classify") + + hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + + english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() + + country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() + country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() + country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() + + country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() + country_code_graph_double_digits |= pynini.string_file(get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")).invert() + + self.hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + hindi_digit_graph + pynutil.insert("\" ") + self.english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 9) + english_digit_graph + delete_space + pynutil.insert("\" ") + + self.country_code_with_single_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") + self.country_code_with_double_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") + self.country_code = (self.country_code_with_single_digits | self.country_code_with_double_digits) + + self.city_code_with_single_digits = pynutil.insert("city_code: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") + self.city_code_with_double_digits = pynutil.insert("city_code: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") + self.city_code = (self.city_code_with_single_digits | self.city_code_with_double_digits) + + self.landline_hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 6) + hindi_digit_graph + pynutil.insert("\" ") + self.landline_english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 6) + english_digit_graph + pynutil.insert("\" ") + + delete_plus = pynini.union( + pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") + ) + + delete_zero = pynini.union( + pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") + ) + + graph_number_with_hindi_digit = delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit + + graph_landline_with_hindi_digit = delete_zero + delete_space + self.city_code + delete_space + self.landline_hindi_digit + graph_landline_with_english_digit = delete_zero + delete_space + self.city_code + delete_space + self.landline_english_digit + + graph = (graph_number_with_hindi_digit | graph_number_with_english_digit | graph_landline_with_hindi_digit | graph_landline_with_english_digit) + final_graph = self.add_tokens(graph) + self.fst = final_graph + +#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +#cardinal = CardinalFst() +#telephone = TelephoneFst(cardinal) +#input_text = "प्लस इक्यानवे nine four one one one two three four one two" +#input_text = "प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य" +#input_text = "plus nine eight zero nine four one one one two three four one" +#input_text = "plus sixty two nine four one one one two three" +#input_text = "प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य" +#input_text = 'Plus ninety one नौ सात छह चार एक zero five eight two three' +#input_text = "plus eleven nine four one one one two three" +#input_text = "zero eight zero two nine four one one one two" #landline example of bangalore +#input_text = "zero eleven two nine four one one one two" #Delhi +#input_text = "zero four zero two seven eight one eight three nine" #hyd +#input_text = "शून्य सात नौ एक नौ आठ सात छह पांच चार" +#output = apply_fst(input_text, telephone.fst) +#print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index a5a371d90..2fda42cc6 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -34,6 +34,7 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst @@ -82,6 +83,8 @@ def __init__( measure_graph = measure.fst money = MoneyFst(cardinal, decimal) money_graph = money.fst + telephone = TelephoneFst(cardinal) + telephone_graph = telephone.fst punct_graph = PunctuationFst().fst whitelist_graph = WhiteListFst().fst word_graph = WordFst().fst @@ -95,6 +98,7 @@ def __init__( | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) + | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(word_graph, 100) | pynutil.add_weight(whitelist_graph, 1.01) ) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py new file mode 100644 index 000000000..5a475414d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -0,0 +1,73 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2025 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst + + +class TelephoneFst(GraphFst): + """ + Finite state transducer for verbalizing telephone, e.g. + telephone { number_part: "123-123-5678" } + -> 123-123-5678 + """ + + def __init__(self, cardinal: GraphFst): + super().__init__(name="telephone", kind="verbalize") + + number_part = pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + optional_country_code = pynini.closure( + pynutil.delete("country_code: \"") + + pynutil.insert("+") + + delete_space + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + pynini.accep(" "), + 0, + 1, + ) + optional_city_code = pynini.closure( + pynutil.delete("city_code: \"") + + pynutil.insert("०") + + delete_space + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + + pynini.accep(" "), + 0, + 1, + ) + delete_tokens = self.delete_tokens(optional_country_code + number_part) + delete_tokens |= self.delete_tokens(optional_city_code + number_part) + self.fst = delete_tokens.optimize() + +#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +#cardinal = CardinalFst() +#telephone = TelephoneFst(cardinal) +#input_text = 'telephone { country_code: "९१" number_part: "९८७६५४३२१०" }' +#input_text = 'telephone { country_code: "९१" number_part: "९४१११२३४१२" }' +#input_text = 'telephone { country_code: "९१" number_part: "९४२२२२२२२" }' +#input_text = 'telephone{ country_code: "९१" number_part: "११२३४५६७८९" }' +#input_text = 'telephone{ country_code: "९१" number_part: "९८७६५४३२११" }' +#input_text = 'telephone{ country_code: "९१" number_part: "९४५६७८९०१२" }' +#input_text = 'telephone{ country_code: "९१" number_part: "९५६७८९०१२३" }' +#input_text = 'telephone { city_code: "७९" number_part: "१९८७६५४" }' +#input_text = 'telephone { city_code: "४०" number_part: "२७८१८३९" }' +#input_text = 'telephone { city_code: "११" number_part: "२९४१११२" }' +#input_text = 'telephone { city_code: "८०" number_part: "२९४१११२" }' +#output = apply_fst(input_text, telephone.fst) +#print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index d88bd25d9..411b08863 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -22,6 +22,7 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst @@ -45,6 +46,7 @@ def __init__(self): time_graph = TimeFst().fst measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst + telephone_graph = TelephoneFst(cardinal).fst word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst @@ -59,5 +61,6 @@ def __init__(self): | time_graph | measure_graph | money_graph + | telephone_graph ) self.fst = graph diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt new file mode 100644 index 000000000..0c001b20f --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt @@ -0,0 +1,25 @@ +प्लस इक्यानवे nine four one one one two three four one two~+९१ ९४१११२३४१२ +प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +plus nine eight nine four one one one two three four zero one~+९८ ९४१११२३४०१ +plus sixty two nine four one one one two three~+६२ ९४१११२३ +प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० +plus eleven nine four one one one two three~+११ ९४१११२३ +zero eleven nine four one one one two three~०११ ९४१११२३ +शून्य ग्यारह नौ चार एक एक एक दो तीन~०११ ९४१११२३ +zero eight zero two nine four one one one two~०८० २९४१११२ +शून्य आठ शून्य दो नौ चार एक एक एक दो~०८० २९४१११२ +zero eleven two nine four one one one two~०११ २९४१११२ +शून्य ग्यारह दो नौ चार एक एक एक दो~०११ २९४१११२ +zero four zero two seven eight one eight three nine~०४० २७८१८३९ +शून्य चार शून्य दो सात आठ एक आठ तीन नौ~०४० २७८१८३९ +शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ +प्लस नौ एक नौ तीन आठ दो सात एक चार छह पांच शून्य~+९१ ९३८२७१४६५० +प्लस नौ एक नौ शून्य पांच एक तीन चार आठ दो सात छह~+९१ ९०५१३४८२७६ +प्लस नौ एक नौ चार तीन सात दो शून्य पांच छह एक आठ~+९१ ९४३७२०५६१८ +PLUS ninety one nine three eight two seven one four six five zero~+९१ ९३८२७१४६५० +plus nine one nine zero five one three four eight two seven six~+९१ ९०५१३४८२७६ +plus ninety one nine four three seven two zero five six one eight~+९१ ९४३७२०५६१८ +ZERO seven three चार पाँच छह सात आठ नौ शून्य~०७३ ४५६७८९० +शून्य चार शून्य पाँच चार एक दो सात तीन आठ~०४० ५४१२७३८ +ZERO seven three four five six seven eight nine zero~०७३ ४५६७८९० +zero two eight seven six five four three two seven~०२८ ७६५४३२७ diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh index aec7299d5..a365a834d 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh @@ -63,6 +63,11 @@ testITNMoney() { runtest $input } +testITNTelephone() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_telephone.txt + runtest $input +} + testITNWord() { input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt runtest $input diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py new file mode 100644 index 000000000..895f042b0 --- /dev/null +++ b/tests/nemo_text_processing/hi/test_telephone.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTelephone: + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred.strip() == expected.strip() From 461962bf9459af23a771c2354a18b38f237a8525 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 06:50:39 +0000 Subject: [PATCH 14/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/telephone.py | 143 ++++++++++++------ .../hi/taggers/tokenize_and_classify.py | 2 +- .../hi/verbalizers/telephone.py | 37 ++--- .../hi/verbalizers/verbalize.py | 2 +- .../nemo_text_processing/hi/test_telephone.py | 1 + 5 files changed, 120 insertions(+), 65 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 63136e472..04f7a8b23 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -22,7 +22,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path, apply_fst +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path class TelephoneFst(GraphFst): @@ -36,64 +36,117 @@ class TelephoneFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="telephone", kind="classify") - + hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() - + english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() - + country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() - country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() - + country_code_graph_single_digits |= pynini.string_file( + get_abs_path("data/telephone/eng_to_hindi_digit.tsv") + ).invert() + country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() - country_code_graph_double_digits |= pynini.string_file(get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")).invert() - - self.hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + hindi_digit_graph + pynutil.insert("\" ") - self.english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 9) + english_digit_graph + delete_space + pynutil.insert("\" ") - - self.country_code_with_single_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") - self.country_code_with_double_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") - self.country_code = (self.country_code_with_single_digits | self.country_code_with_double_digits) - - self.city_code_with_single_digits = pynutil.insert("city_code: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") - self.city_code_with_double_digits = pynutil.insert("city_code: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") - self.city_code = (self.city_code_with_single_digits | self.city_code_with_double_digits) - - self.landline_hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 6) + hindi_digit_graph + pynutil.insert("\" ") - self.landline_english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 6) + english_digit_graph + pynutil.insert("\" ") - + country_code_graph_double_digits |= pynini.string_file( + get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") + ).invert() + + self.hindi_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.english_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 9) + + english_digit_graph + + delete_space + + pynutil.insert("\" ") + ) + + self.country_code_with_single_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + + pynutil.insert("\" ") + ) + self.country_code_with_double_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + + pynutil.insert("\" ") + ) + self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits + + self.city_code_with_single_digits = ( + pynutil.insert("city_code: \"") + + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + + pynutil.insert("\" ") + ) + self.city_code_with_double_digits = ( + pynutil.insert("city_code: \"") + + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + + pynutil.insert("\" ") + ) + self.city_code = self.city_code_with_single_digits | self.city_code_with_double_digits + + self.landline_hindi_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 6) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.landline_english_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 6) + + english_digit_graph + + pynutil.insert("\" ") + ) + delete_plus = pynini.union( pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") ) - + delete_zero = pynini.union( pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") ) - - graph_number_with_hindi_digit = delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + + graph_number_with_hindi_digit = ( + delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + ) graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit - - graph_landline_with_hindi_digit = delete_zero + delete_space + self.city_code + delete_space + self.landline_hindi_digit - graph_landline_with_english_digit = delete_zero + delete_space + self.city_code + delete_space + self.landline_english_digit - graph = (graph_number_with_hindi_digit | graph_number_with_english_digit | graph_landline_with_hindi_digit | graph_landline_with_english_digit) + graph_landline_with_hindi_digit = ( + delete_zero + delete_space + self.city_code + delete_space + self.landline_hindi_digit + ) + graph_landline_with_english_digit = ( + delete_zero + delete_space + self.city_code + delete_space + self.landline_english_digit + ) + + graph = ( + graph_number_with_hindi_digit + | graph_number_with_english_digit + | graph_landline_with_hindi_digit + | graph_landline_with_english_digit + ) final_graph = self.add_tokens(graph) self.fst = final_graph -#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -#cardinal = CardinalFst() -#telephone = TelephoneFst(cardinal) -#input_text = "प्लस इक्यानवे nine four one one one two three four one two" -#input_text = "प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य" -#input_text = "plus nine eight zero nine four one one one two three four one" -#input_text = "plus sixty two nine four one one one two three" -#input_text = "प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य" -#input_text = 'Plus ninety one नौ सात छह चार एक zero five eight two three' -#input_text = "plus eleven nine four one one one two three" -#input_text = "zero eight zero two nine four one one one two" #landline example of bangalore -#input_text = "zero eleven two nine four one one one two" #Delhi -#input_text = "zero four zero two seven eight one eight three nine" #hyd -#input_text = "शून्य सात नौ एक नौ आठ सात छह पांच चार" -#output = apply_fst(input_text, telephone.fst) -#print(output) + +# from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +# cardinal = CardinalFst() +# telephone = TelephoneFst(cardinal) +# input_text = "प्लस इक्यानवे nine four one one one two three four one two" +# input_text = "प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य" +# input_text = "plus nine eight zero nine four one one one two three four one" +# input_text = "plus sixty two nine four one one one two three" +# input_text = "प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य" +# input_text = 'Plus ninety one नौ सात छह चार एक zero five eight two three' +# input_text = "plus eleven nine four one one one two three" +# input_text = "zero eight zero two nine four one one one two" #landline example of bangalore +# input_text = "zero eleven two nine four one one one two" #Delhi +# input_text = "zero four zero two seven eight one eight three nine" #hyd +# input_text = "शून्य सात नौ एक नौ आठ सात छह पांच चार" +# output = apply_fst(input_text, telephone.fst) +# print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index 2fda42cc6..62554bd14 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -33,8 +33,8 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst -from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.taggers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py index 5a475414d..682c9416a 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -16,8 +16,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class TelephoneFst(GraphFst): @@ -54,20 +54,21 @@ def __init__(self, cardinal: GraphFst): delete_tokens = self.delete_tokens(optional_country_code + number_part) delete_tokens |= self.delete_tokens(optional_city_code + number_part) self.fst = delete_tokens.optimize() - -#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -#cardinal = CardinalFst() -#telephone = TelephoneFst(cardinal) -#input_text = 'telephone { country_code: "९१" number_part: "९८७६५४३२१०" }' -#input_text = 'telephone { country_code: "९१" number_part: "९४१११२३४१२" }' -#input_text = 'telephone { country_code: "९१" number_part: "९४२२२२२२२" }' -#input_text = 'telephone{ country_code: "९१" number_part: "११२३४५६७८९" }' -#input_text = 'telephone{ country_code: "९१" number_part: "९८७६५४३२११" }' -#input_text = 'telephone{ country_code: "९१" number_part: "९४५६७८९०१२" }' -#input_text = 'telephone{ country_code: "९१" number_part: "९५६७८९०१२३" }' -#input_text = 'telephone { city_code: "७९" number_part: "१९८७६५४" }' -#input_text = 'telephone { city_code: "४०" number_part: "२७८१८३९" }' -#input_text = 'telephone { city_code: "११" number_part: "२९४१११२" }' -#input_text = 'telephone { city_code: "८०" number_part: "२९४१११२" }' -#output = apply_fst(input_text, telephone.fst) -#print(output) + + +# from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +# cardinal = CardinalFst() +# telephone = TelephoneFst(cardinal) +# input_text = 'telephone { country_code: "९१" number_part: "९८७६५४३२१०" }' +# input_text = 'telephone { country_code: "९१" number_part: "९४१११२३४१२" }' +# input_text = 'telephone { country_code: "९१" number_part: "९४२२२२२२२" }' +# input_text = 'telephone{ country_code: "९१" number_part: "११२३४५६७८९" }' +# input_text = 'telephone{ country_code: "९१" number_part: "९८७६५४३२११" }' +# input_text = 'telephone{ country_code: "९१" number_part: "९४५६७८९०१२" }' +# input_text = 'telephone{ country_code: "९१" number_part: "९५६७८९०१२३" }' +# input_text = 'telephone { city_code: "७९" number_part: "१९८७६५४" }' +# input_text = 'telephone { city_code: "४०" number_part: "२७८१८३९" }' +# input_text = 'telephone { city_code: "११" number_part: "२९४१११२" }' +# input_text = 'telephone { city_code: "८०" number_part: "२९४१११२" }' +# output = apply_fst(input_text, telephone.fst) +# print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 411b08863..165fe7a7e 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -21,8 +21,8 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst -from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.telephone import TelephoneFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py index 895f042b0..145f554a4 100644 --- a/tests/nemo_text_processing/hi/test_telephone.py +++ b/tests/nemo_text_processing/hi/test_telephone.py @@ -23,6 +23,7 @@ class TestTelephone: inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_telephone.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit From 3a011b9ff10efb191b0d73ec32924bf1319e1457 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Mon, 20 Jan 2025 17:44:11 +0530 Subject: [PATCH 15/28] Telephone.py - Hindi ITN Signed-off-by: Tarushi V --- .../hi/taggers/telephone.py | 22 ++---------------- .../hi/verbalizers/telephone.py | 23 ++----------------- .../test_cases_telephone.txt | 2 +- .../nemo_text_processing/hi/test_telephone.py | 2 +- 4 files changed, 6 insertions(+), 43 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 04f7a8b23..97df99eae 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -22,7 +22,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path class TelephoneFst(GraphFst): @@ -131,22 +131,4 @@ def __init__(self, cardinal: GraphFst): | graph_landline_with_english_digit ) final_graph = self.add_tokens(graph) - self.fst = final_graph - - -# from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -# cardinal = CardinalFst() -# telephone = TelephoneFst(cardinal) -# input_text = "प्लस इक्यानवे nine four one one one two three four one two" -# input_text = "प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य" -# input_text = "plus nine eight zero nine four one one one two three four one" -# input_text = "plus sixty two nine four one one one two three" -# input_text = "प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य" -# input_text = 'Plus ninety one नौ सात छह चार एक zero five eight two three' -# input_text = "plus eleven nine four one one one two three" -# input_text = "zero eight zero two nine four one one one two" #landline example of bangalore -# input_text = "zero eleven two nine four one one one two" #Delhi -# input_text = "zero four zero two seven eight one eight three nine" #hyd -# input_text = "शून्य सात नौ एक नौ आठ सात छह पांच चार" -# output = apply_fst(input_text, telephone.fst) -# print(output) + self.fst = final_graph \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py index 682c9416a..e0c721dc5 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -16,7 +16,6 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space @@ -42,7 +41,7 @@ def __init__(self, cardinal: GraphFst): 1, ) optional_city_code = pynini.closure( - pynutil.delete("city_code: \"") + pynutil.delete("extension: \"") + pynutil.insert("०") + delete_space + pynini.closure(NEMO_NOT_QUOTE, 1) @@ -53,22 +52,4 @@ def __init__(self, cardinal: GraphFst): ) delete_tokens = self.delete_tokens(optional_country_code + number_part) delete_tokens |= self.delete_tokens(optional_city_code + number_part) - self.fst = delete_tokens.optimize() - - -# from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -# cardinal = CardinalFst() -# telephone = TelephoneFst(cardinal) -# input_text = 'telephone { country_code: "९१" number_part: "९८७६५४३२१०" }' -# input_text = 'telephone { country_code: "९१" number_part: "९४१११२३४१२" }' -# input_text = 'telephone { country_code: "९१" number_part: "९४२२२२२२२" }' -# input_text = 'telephone{ country_code: "९१" number_part: "११२३४५६७८९" }' -# input_text = 'telephone{ country_code: "९१" number_part: "९८७६५४३२११" }' -# input_text = 'telephone{ country_code: "९१" number_part: "९४५६७८९०१२" }' -# input_text = 'telephone{ country_code: "९१" number_part: "९५६७८९०१२३" }' -# input_text = 'telephone { city_code: "७९" number_part: "१९८७६५४" }' -# input_text = 'telephone { city_code: "४०" number_part: "२७८१८३९" }' -# input_text = 'telephone { city_code: "११" number_part: "२९४१११२" }' -# input_text = 'telephone { city_code: "८०" number_part: "२९४१११२" }' -# output = apply_fst(input_text, telephone.fst) -# print(output) + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt index 0c001b20f..34d031b41 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt @@ -1,7 +1,6 @@ प्लस इक्यानवे nine four one one one two three four one two~+९१ ९४१११२३४१२ प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० plus nine eight nine four one one one two three four zero one~+९८ ९४१११२३४०१ -plus sixty two nine four one one one two three~+६२ ९४१११२३ प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० plus eleven nine four one one one two three~+११ ९४१११२३ zero eleven nine four one one one two three~०११ ९४१११२३ @@ -23,3 +22,4 @@ ZERO seven three चार पाँच छह सात आठ नौ शून शून्य चार शून्य पाँच चार एक दो सात तीन आठ~०४० ५४१२७३८ ZERO seven three four five six seven eight nine zero~०७३ ४५६७८९० zero two eight seven six five four three two seven~०२८ ७६५४३२७ +PLUS eighty one nine seven four seven two zero zero one one eight~+८१ ९७४७२००११८ diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py index 145f554a4..b01b11871 100644 --- a/tests/nemo_text_processing/hi/test_telephone.py +++ b/tests/nemo_text_processing/hi/test_telephone.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From a5cf0500eadc33dc51a8e93d7d0c681b9400dfcd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Jan 2025 04:19:39 +0000 Subject: [PATCH 16/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/hi/taggers/telephone.py | 2 +- .../inverse_text_normalization/hi/verbalizers/telephone.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 97df99eae..1b938f241 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -131,4 +131,4 @@ def __init__(self, cardinal: GraphFst): | graph_landline_with_english_digit ) final_graph = self.add_tokens(graph) - self.fst = final_graph \ No newline at end of file + self.fst = final_graph diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py index e0c721dc5..3f4b4de1f 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -52,4 +52,4 @@ def __init__(self, cardinal: GraphFst): ) delete_tokens = self.delete_tokens(optional_country_code + number_part) delete_tokens |= self.delete_tokens(optional_city_code + number_part) - self.fst = delete_tokens.optimize() \ No newline at end of file + self.fst = delete_tokens.optimize() From 1c506e26c56b5abb3cd83e28b4c311b87249fc35 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Tue, 21 Jan 2025 09:58:06 +0530 Subject: [PATCH 17/28] Telephone modified tagger and verbalizer Signed-off-by: Tarushi V --- .../hi/taggers/telephone.py | 108 +++++------------- 1 file changed, 28 insertions(+), 80 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 1b938f241..2449eaff5 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -36,99 +36,47 @@ class TelephoneFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="telephone", kind="classify") - + hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() - + english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() - + country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() - country_code_graph_single_digits |= pynini.string_file( - get_abs_path("data/telephone/eng_to_hindi_digit.tsv") - ).invert() - + country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() + country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() - country_code_graph_double_digits |= pynini.string_file( - get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") - ).invert() - - self.hindi_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 9) - + hindi_digit_graph - + pynutil.insert("\" ") - ) - self.english_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 9) - + english_digit_graph - + delete_space - + pynutil.insert("\" ") - ) - - self.country_code_with_single_digits = ( - pynutil.insert("country_code: \"") - + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) - + pynutil.insert("\" ") - ) - self.country_code_with_double_digits = ( - pynutil.insert("country_code: \"") - + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) - + pynutil.insert("\" ") - ) - self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits - - self.city_code_with_single_digits = ( - pynutil.insert("city_code: \"") - + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) - + pynutil.insert("\" ") - ) - self.city_code_with_double_digits = ( - pynutil.insert("city_code: \"") - + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) - + pynutil.insert("\" ") - ) - self.city_code = self.city_code_with_single_digits | self.city_code_with_double_digits - - self.landline_hindi_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 6) - + hindi_digit_graph - + pynutil.insert("\" ") - ) - self.landline_english_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 6) - + english_digit_graph - + pynutil.insert("\" ") - ) - + country_code_graph_double_digits |= pynini.string_file(get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")).invert() + + self.hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + hindi_digit_graph + pynutil.insert("\" ") + self.english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 9) + english_digit_graph + delete_space + pynutil.insert("\" ") + + self.country_code_with_single_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") + self.country_code_with_double_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") + self.country_code = (self.country_code_with_single_digits | self.country_code_with_double_digits) + + self.city_code_with_single_digits = pynutil.insert("extension: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") + self.city_code_with_double_digits = pynutil.insert("extension: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") + self.city_code = (self.city_code_with_single_digits | self.city_code_with_double_digits) + + self.landline_hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 6) + hindi_digit_graph + pynutil.insert("\" ") + self.landline_english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 6) + english_digit_graph + pynutil.insert("\" ") + delete_plus = pynini.union( pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") ) - + delete_zero = pynini.union( pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") ) - - graph_number_with_hindi_digit = ( - delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit - ) + + graph_number_with_hindi_digit = delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit + + graph_landline_with_hindi_digit = delete_zero + delete_space + self.city_code + delete_space + self.landline_hindi_digit + graph_landline_with_english_digit = delete_zero + delete_space + self.city_code + delete_space + self.landline_english_digit - graph_landline_with_hindi_digit = ( - delete_zero + delete_space + self.city_code + delete_space + self.landline_hindi_digit - ) - graph_landline_with_english_digit = ( - delete_zero + delete_space + self.city_code + delete_space + self.landline_english_digit - ) - - graph = ( - graph_number_with_hindi_digit - | graph_number_with_english_digit - | graph_landline_with_hindi_digit - | graph_landline_with_english_digit - ) + graph = (graph_number_with_hindi_digit | graph_number_with_english_digit | graph_landline_with_hindi_digit | graph_landline_with_english_digit) final_graph = self.add_tokens(graph) self.fst = final_graph From 4503378b6a8d50901135437bb39a77de0b98b168 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Jan 2025 04:29:18 +0000 Subject: [PATCH 18/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/telephone.py | 108 +++++++++++++----- 1 file changed, 80 insertions(+), 28 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 2449eaff5..52960efcc 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -36,47 +36,99 @@ class TelephoneFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="telephone", kind="classify") - + hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() - + english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() - + country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() - country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() - + country_code_graph_single_digits |= pynini.string_file( + get_abs_path("data/telephone/eng_to_hindi_digit.tsv") + ).invert() + country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() - country_code_graph_double_digits |= pynini.string_file(get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")).invert() - - self.hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + hindi_digit_graph + pynutil.insert("\" ") - self.english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 9) + english_digit_graph + delete_space + pynutil.insert("\" ") - - self.country_code_with_single_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") - self.country_code_with_double_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") - self.country_code = (self.country_code_with_single_digits | self.country_code_with_double_digits) - - self.city_code_with_single_digits = pynutil.insert("extension: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") - self.city_code_with_double_digits = pynutil.insert("extension: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") - self.city_code = (self.city_code_with_single_digits | self.city_code_with_double_digits) - - self.landline_hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 6) + hindi_digit_graph + pynutil.insert("\" ") - self.landline_english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 6) + english_digit_graph + pynutil.insert("\" ") - + country_code_graph_double_digits |= pynini.string_file( + get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") + ).invert() + + self.hindi_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.english_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 9) + + english_digit_graph + + delete_space + + pynutil.insert("\" ") + ) + + self.country_code_with_single_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + + pynutil.insert("\" ") + ) + self.country_code_with_double_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + + pynutil.insert("\" ") + ) + self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits + + self.city_code_with_single_digits = ( + pynutil.insert("extension: \"") + + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + + pynutil.insert("\" ") + ) + self.city_code_with_double_digits = ( + pynutil.insert("extension: \"") + + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + + pynutil.insert("\" ") + ) + self.city_code = self.city_code_with_single_digits | self.city_code_with_double_digits + + self.landline_hindi_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 6) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.landline_english_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 6) + + english_digit_graph + + pynutil.insert("\" ") + ) + delete_plus = pynini.union( pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") ) - + delete_zero = pynini.union( pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") ) - - graph_number_with_hindi_digit = delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + + graph_number_with_hindi_digit = ( + delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + ) graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit - - graph_landline_with_hindi_digit = delete_zero + delete_space + self.city_code + delete_space + self.landline_hindi_digit - graph_landline_with_english_digit = delete_zero + delete_space + self.city_code + delete_space + self.landline_english_digit - graph = (graph_number_with_hindi_digit | graph_number_with_english_digit | graph_landline_with_hindi_digit | graph_landline_with_english_digit) + graph_landline_with_hindi_digit = ( + delete_zero + delete_space + self.city_code + delete_space + self.landline_hindi_digit + ) + graph_landline_with_english_digit = ( + delete_zero + delete_space + self.city_code + delete_space + self.landline_english_digit + ) + + graph = ( + graph_number_with_hindi_digit + | graph_number_with_english_digit + | graph_landline_with_hindi_digit + | graph_landline_with_english_digit + ) final_graph = self.add_tokens(graph) self.fst = final_graph From eb269ef28cf77c6dc78fd7df2b1f5f0bbc3a244c Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Fri, 24 Jan 2025 12:13:01 +0530 Subject: [PATCH 19/28] telephone tagger with 3,4,5 digit std codes Signed-off-by: Tarushi V --- .../hi/taggers/telephone.py | 146 +++++++++--------- 1 file changed, 72 insertions(+), 74 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 52960efcc..4ec0889aa 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -22,7 +22,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path, apply_fst class TelephoneFst(GraphFst): @@ -49,61 +49,49 @@ def __init__(self, cardinal: GraphFst): ).invert() country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() - country_code_graph_double_digits |= pynini.string_file( - get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") - ).invert() - - self.hindi_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 9) - + hindi_digit_graph - + pynutil.insert("\" ") - ) - self.english_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 9) - + english_digit_graph - + delete_space - + pynutil.insert("\" ") - ) - - self.country_code_with_single_digits = ( - pynutil.insert("country_code: \"") - + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) - + pynutil.insert("\" ") - ) - self.country_code_with_double_digits = ( - pynutil.insert("country_code: \"") - + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) - + pynutil.insert("\" ") - ) - self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits - - self.city_code_with_single_digits = ( - pynutil.insert("extension: \"") - + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) - + pynutil.insert("\" ") - ) - self.city_code_with_double_digits = ( - pynutil.insert("extension: \"") - + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) - + pynutil.insert("\" ") - ) - self.city_code = self.city_code_with_single_digits | self.city_code_with_double_digits - - self.landline_hindi_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 6) - + hindi_digit_graph - + pynutil.insert("\" ") - ) - self.landline_english_digit = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 6) - + english_digit_graph - + pynutil.insert("\" ") - ) - + country_code_graph_double_digits |= pynini.string_file(get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")).invert() + + self.hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + hindi_digit_graph + pynutil.insert("\" ") + self.english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 9) + english_digit_graph + delete_space + pynutil.insert("\" ") + + self.country_code_with_single_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") + self.country_code_with_double_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") + self.country_code = (self.country_code_with_single_digits | self.country_code_with_double_digits) + + #two, three, four-digit extension code with zero + self.city_two_digit_code_hindi = pynutil.insert("extension: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 2) + pynutil.insert("\" ") + self.city_two_digit_code_english = pynutil.insert("extension: \"") + pynini.closure(english_digit_graph + delete_space, 0, 2) + pynutil.insert("\" ") + self.city_three_digit_code_hindi = pynutil.insert("extension: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + pynutil.insert("\" ") + self.city_three_digit_code_english = pynutil.insert("extension: \"") + pynini.closure(english_digit_graph + delete_space, 0, 3) + pynutil.insert("\" ") + self.city_four_digit_code_hindi = pynutil.insert("extension: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 4) + pynutil.insert("\" ") + self.city_four_digit_code_english = pynutil.insert("extension: \"") + pynini.closure(english_digit_graph + delete_space, 0, 4) + pynutil.insert("\" ") + + #concise extensions graphs - 2,3,4-digit + self.city_two_digit_extension = self.city_two_digit_code_hindi | self.city_two_digit_code_english + self.city_three_digit_extension = self.city_three_digit_code_hindi | self.city_three_digit_code_english + self.city_four_digit_extension = (self.city_four_digit_code_hindi | self.city_four_digit_code_english) + + #7-digit landline graph for 2-digit extension in hindi and english digits + self.landline_with_extension_two_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 6) + pynutil.insert("\" ") + self.landline_with_extension_two_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 6) + pynutil.insert("\" ") + self.landline_two = (self.landline_with_extension_two_hindi | self.landline_with_extension_two_english) + + #7-digit landline graph for 3-digit extension in hindi and english digits + self.landline_with_extension_three_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 7) + pynutil.insert("\" ") + self.landline_with_extension_three_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 7) + pynutil.insert("\" ") + self.landline_three = (self.landline_with_extension_three_hindi | self.landline_with_extension_three_english) + + #7-digit landline graph for 4-digit extension in hindi and english digits + self.landline_with_extension_four_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 8) + pynutil.insert("\" ") + self.landline_with_extension_four_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 8) + pynutil.insert("\" ") + self.landline_four = (self.landline_with_extension_four_hindi | self.landline_with_extension_four_english) + + self.pincode_in_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 5) + hindi_digit_graph + pynutil.insert("\" ") + self.pincode_in_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 5) + english_digit_graph + pynutil.insert("\" ") + + self.credit_card_last_four_digits_in_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + hindi_digit_graph + pynutil.insert("\" ") + self.credit_card_last_four_digits_in_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 3) + english_digit_graph + pynutil.insert("\" ") + delete_plus = pynini.union( pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") ) @@ -111,24 +99,34 @@ def __init__(self, cardinal: GraphFst): delete_zero = pynini.union( pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") ) - - graph_number_with_hindi_digit = ( - delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit - ) + + graph_number_with_hindi_digit = delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit - - graph_landline_with_hindi_digit = ( - delete_zero + delete_space + self.city_code + delete_space + self.landline_hindi_digit - ) - graph_landline_with_english_digit = ( - delete_zero + delete_space + self.city_code + delete_space + self.landline_english_digit - ) - - graph = ( - graph_number_with_hindi_digit - | graph_number_with_english_digit - | graph_landline_with_hindi_digit - | graph_landline_with_english_digit - ) + + graph_landline_with_two_digit_extension = delete_zero + delete_space + self.city_two_digit_extension + delete_space + self.landline_two + graph_landline_with_three_digit_extension = delete_zero + delete_space + self.city_three_digit_extension + delete_space + self.landline_three + graph_landline_with_four_digit_extension = delete_zero + delete_space + self.city_four_digit_extension + delete_space + self.landline_four + + graph_pincode = self.pincode_in_hindi | self.pincode_in_english + + graph_credit_card_last_four_digits = self.credit_card_last_four_digits_in_hindi | self.credit_card_last_four_digits_in_english + + graph = (graph_number_with_hindi_digit | graph_number_with_english_digit | graph_landline_with_two_digit_extension | graph_landline_with_three_digit_extension | graph_landline_with_three_digit_extension | graph_pincode | graph_credit_card_last_four_digits) + final_graph = self.add_tokens(graph) self.fst = final_graph + +#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +#cardinal = CardinalFst() +#telephone = TelephoneFst(cardinal) + +#input_text = "zero eight zero two two nine four one one one" # zero+ two digit extension + landline in english +#input_text = "zero eight zero nine two two nine four one one one" # zero + three digit extension + landline in english +#input_text = "zero eight zero nine one two two nine four one one one" #zero + four digit extension + landline in english + +#input_text = "शून्य सात नौ एक नौ आठ सात छह पांच चार" #zero + two digit extension + landline in hindi +#input_text = "शून्य सात नौ नौ एक नौ आठ सात छह पांच चार" #zero + three digit extension + landline in hindi +#input_text = "शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार" #zero+ four digit digit extension + landline in hindi + +#output = apply_fst(input_text, telephone.fst) +#print(output) From 26e9d7f50c7c27e1dbd946b0c2b16b241441e4fb Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Fri, 24 Jan 2025 12:17:50 +0530 Subject: [PATCH 20/28] Further additions - telephone.py Signed-off-by: Tarushi V --- .../hi/verbalizers/telephone.py | 11 +++++++++++ .../test_cases_telephone.txt | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py index 3f4b4de1f..66ccb7927 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -17,6 +17,8 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst + class TelephoneFst(GraphFst): @@ -53,3 +55,12 @@ def __init__(self, cardinal: GraphFst): delete_tokens = self.delete_tokens(optional_country_code + number_part) delete_tokens |= self.delete_tokens(optional_city_code + number_part) self.fst = delete_tokens.optimize() + +#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +#cardinal = CardinalFst() +#telephone = TelephoneFst(cardinal) +#input_text = 'telephone { number_part: "१९८७६५" }' +#input_text ='telephone { number_part: "३४०१" }' +#input_text = 'telephone { number_part: "०७९१" }' +#output = apply_fst(input_text, telephone.fst) +#print(output) diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt index 34d031b41..c5a2d574e 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt @@ -23,3 +23,9 @@ ZERO seven three चार पाँच छह सात आठ नौ शून ZERO seven three four five six seven eight nine zero~०७३ ४५६७८९० zero two eight seven six five four three two seven~०२८ ७६५४३२७ PLUS eighty one nine seven four seven two zero zero one one eight~+८१ ९७४७२००११८ +एक एक शून्य शून्य सात शून्य दिल्ली के वसंत कुंज का पिनकोड है~११००७० दिल्ली के वसंत कुंज का पिनकोड है +बंगलौर के बैयापानहली का पिनकोड पाँच छह शून्य शून्य तीन आठ है~बंगलौर के बैयापानहली का पिनकोड ५६००३८ है +दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है +five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है +मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं +क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं \ No newline at end of file From b743170959fe81fed20cc2d3a7c108e07dc5a9b0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Jan 2025 06:51:26 +0000 Subject: [PATCH 21/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/telephone.py | 240 +++++++++++++----- .../hi/verbalizers/telephone.py | 22 +- 2 files changed, 185 insertions(+), 77 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 4ec0889aa..dd03551dc 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -22,7 +22,7 @@ delete_space, insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path, apply_fst +from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path class TelephoneFst(GraphFst): @@ -49,49 +49,138 @@ def __init__(self, cardinal: GraphFst): ).invert() country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() - country_code_graph_double_digits |= pynini.string_file(get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")).invert() - - self.hindi_digit = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + hindi_digit_graph + pynutil.insert("\" ") - self.english_digit = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 9) + english_digit_graph + delete_space + pynutil.insert("\" ") - - self.country_code_with_single_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + pynutil.insert("\" ") - self.country_code_with_double_digits = pynutil.insert("country_code: \"") + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + pynutil.insert("\" ") - self.country_code = (self.country_code_with_single_digits | self.country_code_with_double_digits) - - #two, three, four-digit extension code with zero - self.city_two_digit_code_hindi = pynutil.insert("extension: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 2) + pynutil.insert("\" ") - self.city_two_digit_code_english = pynutil.insert("extension: \"") + pynini.closure(english_digit_graph + delete_space, 0, 2) + pynutil.insert("\" ") - self.city_three_digit_code_hindi = pynutil.insert("extension: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + pynutil.insert("\" ") - self.city_three_digit_code_english = pynutil.insert("extension: \"") + pynini.closure(english_digit_graph + delete_space, 0, 3) + pynutil.insert("\" ") - self.city_four_digit_code_hindi = pynutil.insert("extension: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 4) + pynutil.insert("\" ") - self.city_four_digit_code_english = pynutil.insert("extension: \"") + pynini.closure(english_digit_graph + delete_space, 0, 4) + pynutil.insert("\" ") - - #concise extensions graphs - 2,3,4-digit + country_code_graph_double_digits |= pynini.string_file( + get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") + ).invert() + + self.hindi_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 9) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.english_digit = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 9) + + english_digit_graph + + delete_space + + pynutil.insert("\" ") + ) + + self.country_code_with_single_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) + + pynutil.insert("\" ") + ) + self.country_code_with_double_digits = ( + pynutil.insert("country_code: \"") + + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) + + pynutil.insert("\" ") + ) + self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits + + # two, three, four-digit extension code with zero + self.city_two_digit_code_hindi = ( + pynutil.insert("extension: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 2) + + pynutil.insert("\" ") + ) + self.city_two_digit_code_english = ( + pynutil.insert("extension: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 2) + + pynutil.insert("\" ") + ) + self.city_three_digit_code_hindi = ( + pynutil.insert("extension: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + + pynutil.insert("\" ") + ) + self.city_three_digit_code_english = ( + pynutil.insert("extension: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 3) + + pynutil.insert("\" ") + ) + self.city_four_digit_code_hindi = ( + pynutil.insert("extension: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 4) + + pynutil.insert("\" ") + ) + self.city_four_digit_code_english = ( + pynutil.insert("extension: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 4) + + pynutil.insert("\" ") + ) + + # concise extensions graphs - 2,3,4-digit self.city_two_digit_extension = self.city_two_digit_code_hindi | self.city_two_digit_code_english self.city_three_digit_extension = self.city_three_digit_code_hindi | self.city_three_digit_code_english - self.city_four_digit_extension = (self.city_four_digit_code_hindi | self.city_four_digit_code_english) - - #7-digit landline graph for 2-digit extension in hindi and english digits - self.landline_with_extension_two_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 6) + pynutil.insert("\" ") - self.landline_with_extension_two_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 6) + pynutil.insert("\" ") - self.landline_two = (self.landline_with_extension_two_hindi | self.landline_with_extension_two_english) - - #7-digit landline graph for 3-digit extension in hindi and english digits - self.landline_with_extension_three_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 7) + pynutil.insert("\" ") - self.landline_with_extension_three_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 7) + pynutil.insert("\" ") - self.landline_three = (self.landline_with_extension_three_hindi | self.landline_with_extension_three_english) - - #7-digit landline graph for 4-digit extension in hindi and english digits - self.landline_with_extension_four_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 8) + pynutil.insert("\" ") - self.landline_with_extension_four_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 8) + pynutil.insert("\" ") - self.landline_four = (self.landline_with_extension_four_hindi | self.landline_with_extension_four_english) - - self.pincode_in_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 5) + hindi_digit_graph + pynutil.insert("\" ") - self.pincode_in_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 5) + english_digit_graph + pynutil.insert("\" ") - - self.credit_card_last_four_digits_in_hindi = pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + hindi_digit_graph + pynutil.insert("\" ") - self.credit_card_last_four_digits_in_english = pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 3) + english_digit_graph + pynutil.insert("\" ") - + self.city_four_digit_extension = self.city_four_digit_code_hindi | self.city_four_digit_code_english + + # 7-digit landline graph for 2-digit extension in hindi and english digits + self.landline_with_extension_two_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 6) + + pynutil.insert("\" ") + ) + self.landline_with_extension_two_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 6) + + pynutil.insert("\" ") + ) + self.landline_two = self.landline_with_extension_two_hindi | self.landline_with_extension_two_english + + # 7-digit landline graph for 3-digit extension in hindi and english digits + self.landline_with_extension_three_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 7) + + pynutil.insert("\" ") + ) + self.landline_with_extension_three_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 7) + + pynutil.insert("\" ") + ) + self.landline_three = self.landline_with_extension_three_hindi | self.landline_with_extension_three_english + + # 7-digit landline graph for 4-digit extension in hindi and english digits + self.landline_with_extension_four_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 8) + + pynutil.insert("\" ") + ) + self.landline_with_extension_four_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 8) + + pynutil.insert("\" ") + ) + self.landline_four = self.landline_with_extension_four_hindi | self.landline_with_extension_four_english + + self.pincode_in_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 5) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.pincode_in_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 5) + + english_digit_graph + + pynutil.insert("\" ") + ) + + self.credit_card_last_four_digits_in_hindi = ( + pynutil.insert("number_part: \"") + + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + + hindi_digit_graph + + pynutil.insert("\" ") + ) + self.credit_card_last_four_digits_in_english = ( + pynutil.insert("number_part: \"") + + pynini.closure(english_digit_graph + delete_space, 0, 3) + + english_digit_graph + + pynutil.insert("\" ") + ) + delete_plus = pynini.union( pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") ) @@ -99,34 +188,53 @@ def __init__(self, cardinal: GraphFst): delete_zero = pynini.union( pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") ) - - graph_number_with_hindi_digit = delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + + graph_number_with_hindi_digit = ( + delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit + ) graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit - - graph_landline_with_two_digit_extension = delete_zero + delete_space + self.city_two_digit_extension + delete_space + self.landline_two - graph_landline_with_three_digit_extension = delete_zero + delete_space + self.city_three_digit_extension + delete_space + self.landline_three - graph_landline_with_four_digit_extension = delete_zero + delete_space + self.city_four_digit_extension + delete_space + self.landline_four - + + graph_landline_with_two_digit_extension = ( + delete_zero + delete_space + self.city_two_digit_extension + delete_space + self.landline_two + ) + graph_landline_with_three_digit_extension = ( + delete_zero + delete_space + self.city_three_digit_extension + delete_space + self.landline_three + ) + graph_landline_with_four_digit_extension = ( + delete_zero + delete_space + self.city_four_digit_extension + delete_space + self.landline_four + ) + graph_pincode = self.pincode_in_hindi | self.pincode_in_english - - graph_credit_card_last_four_digits = self.credit_card_last_four_digits_in_hindi | self.credit_card_last_four_digits_in_english - - graph = (graph_number_with_hindi_digit | graph_number_with_english_digit | graph_landline_with_two_digit_extension | graph_landline_with_three_digit_extension | graph_landline_with_three_digit_extension | graph_pincode | graph_credit_card_last_four_digits) - + + graph_credit_card_last_four_digits = ( + self.credit_card_last_four_digits_in_hindi | self.credit_card_last_four_digits_in_english + ) + + graph = ( + graph_number_with_hindi_digit + | graph_number_with_english_digit + | graph_landline_with_two_digit_extension + | graph_landline_with_three_digit_extension + | graph_landline_with_three_digit_extension + | graph_pincode + | graph_credit_card_last_four_digits + ) + final_graph = self.add_tokens(graph) self.fst = final_graph - -#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -#cardinal = CardinalFst() -#telephone = TelephoneFst(cardinal) -#input_text = "zero eight zero two two nine four one one one" # zero+ two digit extension + landline in english -#input_text = "zero eight zero nine two two nine four one one one" # zero + three digit extension + landline in english -#input_text = "zero eight zero nine one two two nine four one one one" #zero + four digit extension + landline in english -#input_text = "शून्य सात नौ एक नौ आठ सात छह पांच चार" #zero + two digit extension + landline in hindi -#input_text = "शून्य सात नौ नौ एक नौ आठ सात छह पांच चार" #zero + three digit extension + landline in hindi -#input_text = "शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार" #zero+ four digit digit extension + landline in hindi +# from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +# cardinal = CardinalFst() +# telephone = TelephoneFst(cardinal) + +# input_text = "zero eight zero two two nine four one one one" # zero+ two digit extension + landline in english +# input_text = "zero eight zero nine two two nine four one one one" # zero + three digit extension + landline in english +# input_text = "zero eight zero nine one two two nine four one one one" #zero + four digit extension + landline in english + +# input_text = "शून्य सात नौ एक नौ आठ सात छह पांच चार" #zero + two digit extension + landline in hindi +# input_text = "शून्य सात नौ नौ एक नौ आठ सात छह पांच चार" #zero + three digit extension + landline in hindi +# input_text = "शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार" #zero+ four digit digit extension + landline in hindi -#output = apply_fst(input_text, telephone.fst) -#print(output) +# output = apply_fst(input_text, telephone.fst) +# print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py index 66ccb7927..a59a2ca97 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -16,9 +16,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst - +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class TelephoneFst(GraphFst): @@ -55,12 +54,13 @@ def __init__(self, cardinal: GraphFst): delete_tokens = self.delete_tokens(optional_country_code + number_part) delete_tokens |= self.delete_tokens(optional_city_code + number_part) self.fst = delete_tokens.optimize() - -#from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -#cardinal = CardinalFst() -#telephone = TelephoneFst(cardinal) -#input_text = 'telephone { number_part: "१९८७६५" }' -#input_text ='telephone { number_part: "३४०१" }' -#input_text = 'telephone { number_part: "०७९१" }' -#output = apply_fst(input_text, telephone.fst) -#print(output) + + +# from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst +# cardinal = CardinalFst() +# telephone = TelephoneFst(cardinal) +# input_text = 'telephone { number_part: "१९८७६५" }' +# input_text ='telephone { number_part: "३४०१" }' +# input_text = 'telephone { number_part: "०७९१" }' +# output = apply_fst(input_text, telephone.fst) +# print(output) From ab5d886f951f34a2b2e8f9a3aa89450df6e2297c Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Mon, 27 Jan 2025 15:29:34 +0530 Subject: [PATCH 22/28] Jenkins update Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5e3916ce2..bbad2c3da 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-15-25-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-27-25-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From a0ff72e2426953ec63a63be411f959c6f529d4e5 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Tue, 28 Jan 2025 12:09:58 +0530 Subject: [PATCH 23/28] Telephone.py Signed-off-by: Tarushi V --- .../hi/taggers/telephone.py | 115 +++--------------- .../hi/verbalizers/telephone.py | 11 -- .../test_cases_telephone.txt | 16 ++- .../nemo_text_processing/hi/test_telephone.py | 1 - 4 files changed, 31 insertions(+), 112 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index dd03551dc..cbfef8393 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -16,13 +16,10 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( - NEMO_HI_DIGIT, GraphFst, - delete_extra_space, delete_space, - insert_space, ) -from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst, get_abs_path +from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path class TelephoneFst(GraphFst): @@ -80,80 +77,32 @@ def __init__(self, cardinal: GraphFst): self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits # two, three, four-digit extension code with zero - self.city_two_digit_code_hindi = ( + self.city_code_hindi = ( pynutil.insert("extension: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 2) + + pynini.closure(hindi_digit_graph + delete_space, 2, 5) + pynutil.insert("\" ") ) - self.city_two_digit_code_english = ( + self.city_code_english = ( pynutil.insert("extension: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 2) + + pynini.closure(english_digit_graph + delete_space, 2, 5) + pynutil.insert("\" ") ) - self.city_three_digit_code_hindi = ( - pynutil.insert("extension: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 3) - + pynutil.insert("\" ") - ) - self.city_three_digit_code_english = ( - pynutil.insert("extension: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 3) - + pynutil.insert("\" ") - ) - self.city_four_digit_code_hindi = ( - pynutil.insert("extension: \"") - + pynini.closure(hindi_digit_graph + delete_space, 0, 4) - + pynutil.insert("\" ") - ) - self.city_four_digit_code_english = ( - pynutil.insert("extension: \"") - + pynini.closure(english_digit_graph + delete_space, 0, 4) - + pynutil.insert("\" ") - ) - - # concise extensions graphs - 2,3,4-digit - self.city_two_digit_extension = self.city_two_digit_code_hindi | self.city_two_digit_code_english - self.city_three_digit_extension = self.city_three_digit_code_hindi | self.city_three_digit_code_english - self.city_four_digit_extension = self.city_four_digit_code_hindi | self.city_four_digit_code_english - - # 7-digit landline graph for 2-digit extension in hindi and english digits - self.landline_with_extension_two_hindi = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 6) - + pynutil.insert("\" ") - ) - self.landline_with_extension_two_english = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 6) - + pynutil.insert("\" ") - ) - self.landline_two = self.landline_with_extension_two_hindi | self.landline_with_extension_two_english + + self.city_extension = self.city_code_hindi | self.city_code_english - # 7-digit landline graph for 3-digit extension in hindi and english digits - self.landline_with_extension_three_hindi = ( + # 7-digit landline graph in hindi and english digits + self.landline_hindi = ( pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 7) + pynutil.insert("\" ") ) - self.landline_with_extension_three_english = ( + self.landline_english = ( pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 7) + pynutil.insert("\" ") ) - self.landline_three = self.landline_with_extension_three_hindi | self.landline_with_extension_three_english - - # 7-digit landline graph for 4-digit extension in hindi and english digits - self.landline_with_extension_four_hindi = ( - pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 8) - + pynutil.insert("\" ") - ) - self.landline_with_extension_four_english = ( - pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 8) - + pynutil.insert("\" ") - ) - self.landline_four = self.landline_with_extension_four_hindi | self.landline_with_extension_four_english + + self.landline = self.landline_hindi | self.landline_english self.pincode_in_hindi = ( pynutil.insert("number_part: \"") @@ -168,13 +117,13 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert("\" ") ) - self.credit_card_last_four_digits_in_hindi = ( + self.credit_card_last_digits_hindi = ( pynutil.insert("number_part: \"") + pynini.closure(hindi_digit_graph + delete_space, 0, 3) + hindi_digit_graph + pynutil.insert("\" ") ) - self.credit_card_last_four_digits_in_english = ( + self.credit_card_last_digits_english = ( pynutil.insert("number_part: \"") + pynini.closure(english_digit_graph + delete_space, 0, 3) + english_digit_graph @@ -194,47 +143,23 @@ def __init__(self, cardinal: GraphFst): ) graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit - graph_landline_with_two_digit_extension = ( - delete_zero + delete_space + self.city_two_digit_extension + delete_space + self.landline_two - ) - graph_landline_with_three_digit_extension = ( - delete_zero + delete_space + self.city_three_digit_extension + delete_space + self.landline_three - ) - graph_landline_with_four_digit_extension = ( - delete_zero + delete_space + self.city_four_digit_extension + delete_space + self.landline_four + graph_landline_with_extension = ( + delete_zero + delete_space + self.city_extension + delete_space + self.landline ) graph_pincode = self.pincode_in_hindi | self.pincode_in_english - graph_credit_card_last_four_digits = ( - self.credit_card_last_four_digits_in_hindi | self.credit_card_last_four_digits_in_english + graph_credit_card_last_digits = ( + self.credit_card_last_digits_hindi | self.credit_card_last_digits_english ) graph = ( graph_number_with_hindi_digit | graph_number_with_english_digit - | graph_landline_with_two_digit_extension - | graph_landline_with_three_digit_extension - | graph_landline_with_three_digit_extension + | graph_landline_with_extension | graph_pincode - | graph_credit_card_last_four_digits + | graph_credit_card_last_digits ) final_graph = self.add_tokens(graph) self.fst = final_graph - - -# from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -# cardinal = CardinalFst() -# telephone = TelephoneFst(cardinal) - -# input_text = "zero eight zero two two nine four one one one" # zero+ two digit extension + landline in english -# input_text = "zero eight zero nine two two nine four one one one" # zero + three digit extension + landline in english -# input_text = "zero eight zero nine one two two nine four one one one" #zero + four digit extension + landline in english - -# input_text = "शून्य सात नौ एक नौ आठ सात छह पांच चार" #zero + two digit extension + landline in hindi -# input_text = "शून्य सात नौ नौ एक नौ आठ सात छह पांच चार" #zero + three digit extension + landline in hindi -# input_text = "शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार" #zero+ four digit digit extension + landline in hindi - -# output = apply_fst(input_text, telephone.fst) -# print(output) diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py index a59a2ca97..3f4b4de1f 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py @@ -16,7 +16,6 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.utils import apply_fst from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space @@ -54,13 +53,3 @@ def __init__(self, cardinal: GraphFst): delete_tokens = self.delete_tokens(optional_country_code + number_part) delete_tokens |= self.delete_tokens(optional_city_code + number_part) self.fst = delete_tokens.optimize() - - -# from nemo_text_processing.inverse_text_normalization.hi.taggers.cardinal import CardinalFst -# cardinal = CardinalFst() -# telephone = TelephoneFst(cardinal) -# input_text = 'telephone { number_part: "१९८७६५" }' -# input_text ='telephone { number_part: "३४०१" }' -# input_text = 'telephone { number_part: "०७९१" }' -# output = apply_fst(input_text, telephone.fst) -# print(output) diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt index c5a2d574e..0c51d8df0 100644 --- a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt @@ -3,12 +3,8 @@ plus nine eight nine four one one one two three four zero one~+९८ ९४१११२३४०१ प्लस नौ एक नौ आठ सात छह पांच चार तीन दो एक शून्य~+९१ ९८७६५४३२१० plus eleven nine four one one one two three~+११ ९४१११२३ -zero eleven nine four one one one two three~०११ ९४१११२३ -शून्य ग्यारह नौ चार एक एक एक दो तीन~०११ ९४१११२३ zero eight zero two nine four one one one two~०८० २९४१११२ शून्य आठ शून्य दो नौ चार एक एक एक दो~०८० २९४१११२ -zero eleven two nine four one one one two~०११ २९४१११२ -शून्य ग्यारह दो नौ चार एक एक एक दो~०११ २९४१११२ zero four zero two seven eight one eight three nine~०४० २७८१८३९ शून्य चार शून्य दो सात आठ एक आठ तीन नौ~०४० २७८१८३९ शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ @@ -23,9 +19,19 @@ ZERO seven three चार पाँच छह सात आठ नौ शून ZERO seven three four five six seven eight nine zero~०७३ ४५६७८९० zero two eight seven six five four three two seven~०२८ ७६५४३२७ PLUS eighty one nine seven four seven two zero zero one one eight~+८१ ९७४७२००११८ +zero eight zero two two nine four one one one~०८० २२९४१११ +शून्य सात नौ एक नौ आठ सात छह पांच चार~०७९ १९८७६५४ +zero eight zero nine two two nine four one one one~०८०९ २२९४१११ +शून्य सात नौ नौ एक नौ आठ सात छह पांच चार~०७९९ १९८७६५४ +zero three one nine two two two nine four one one one~०३१९२ २२९४१११ +शून्य सात नौ एक एक एक नौ आठ सात छह पांच चार~०७९११ १९८७६५४ एक एक शून्य शून्य सात शून्य दिल्ली के वसंत कुंज का पिनकोड है~११००७० दिल्ली के वसंत कुंज का पिनकोड है बंगलौर के बैयापानहली का पिनकोड पाँच छह शून्य शून्य तीन आठ है~बंगलौर के बैयापानहली का पिनकोड ५६००३८ है दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं -क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं \ No newline at end of file +क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं +दिल्ली के वसंत कुंज का पिनकोड one one zero zero seven zero है~दिल्ली के वसंत कुंज का पिनकोड ११००७० है +five six zero zero three eight बंगलौर के बैयापानहली का पिनकोड है~५६००३८ बंगलौर के बैयापानहली का पिनकोड है +मेरे क्रेडिट कार्ड के आखिरी डिजिट शून्य शून्य तीन आठ हैं~मेरे क्रेडिट कार्ड के आखिरी डिजिट ००३८ हैं +क्रेडिट कार्ड के आखिरी डिजिट four three seven two हैं~क्रेडिट कार्ड के आखिरी डिजिट ४३७२ हैं diff --git a/tests/nemo_text_processing/hi/test_telephone.py b/tests/nemo_text_processing/hi/test_telephone.py index b01b11871..7e43f7e82 100644 --- a/tests/nemo_text_processing/hi/test_telephone.py +++ b/tests/nemo_text_processing/hi/test_telephone.py @@ -16,7 +16,6 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file From cac9be65f313670f22a149dc2b049b4cab67d722 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 28 Jan 2025 06:40:52 +0000 Subject: [PATCH 24/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/telephone.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index cbfef8393..0162dacc4 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -15,10 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( - GraphFst, - delete_space, -) +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -87,7 +84,7 @@ def __init__(self, cardinal: GraphFst): + pynini.closure(english_digit_graph + delete_space, 2, 5) + pynutil.insert("\" ") ) - + self.city_extension = self.city_code_hindi | self.city_code_english # 7-digit landline graph in hindi and english digits @@ -101,7 +98,7 @@ def __init__(self, cardinal: GraphFst): + pynini.closure(english_digit_graph + delete_space, 7) + pynutil.insert("\" ") ) - + self.landline = self.landline_hindi | self.landline_english self.pincode_in_hindi = ( @@ -143,15 +140,11 @@ def __init__(self, cardinal: GraphFst): ) graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit - graph_landline_with_extension = ( - delete_zero + delete_space + self.city_extension + delete_space + self.landline - ) + graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline graph_pincode = self.pincode_in_hindi | self.pincode_in_english - graph_credit_card_last_digits = ( - self.credit_card_last_digits_hindi | self.credit_card_last_digits_english - ) + graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english graph = ( graph_number_with_hindi_digit From e23887149980d3a19ba6036ea338c62ba640d884 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Wed, 29 Jan 2025 17:39:39 +0530 Subject: [PATCH 25/28] Updated tagger-telephone.py Signed-off-by: Tarushi V --- .../inverse_text_normalization/hi/taggers/telephone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 0162dacc4..1d1d3c875 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -90,12 +90,12 @@ def __init__(self, cardinal: GraphFst): # 7-digit landline graph in hindi and english digits self.landline_hindi = ( pynutil.insert("number_part: \"") - + pynini.closure(hindi_digit_graph + delete_space, 7) + + pynini.closure(hindi_digit_graph + delete_space, 7, 7) + pynutil.insert("\" ") ) self.landline_english = ( pynutil.insert("number_part: \"") - + pynini.closure(english_digit_graph + delete_space, 7) + + pynini.closure(english_digit_graph + delete_space, 7, 7) + pynutil.insert("\" ") ) From d4d27da30b975dac821929ae7e92da70ba3349a7 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Thu, 30 Jan 2025 17:01:42 +0530 Subject: [PATCH 26/28] Telephone and Jenkinsfile cleanup Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- .../hi/taggers/telephone.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index bbad2c3da..ba381f535 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-27-25-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-30-25-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 1d1d3c875..6e695f997 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -15,7 +15,10 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( + GraphFst, + delete_space, +) from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -84,7 +87,7 @@ def __init__(self, cardinal: GraphFst): + pynini.closure(english_digit_graph + delete_space, 2, 5) + pynutil.insert("\" ") ) - + self.city_extension = self.city_code_hindi | self.city_code_english # 7-digit landline graph in hindi and english digits @@ -98,7 +101,7 @@ def __init__(self, cardinal: GraphFst): + pynini.closure(english_digit_graph + delete_space, 7, 7) + pynutil.insert("\" ") ) - + self.landline = self.landline_hindi | self.landline_english self.pincode_in_hindi = ( @@ -140,11 +143,15 @@ def __init__(self, cardinal: GraphFst): ) graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit - graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline + graph_landline_with_extension = ( + delete_zero + delete_space + self.city_extension + delete_space + self.landline + ) graph_pincode = self.pincode_in_hindi | self.pincode_in_english - graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english + graph_credit_card_last_digits = ( + self.credit_card_last_digits_hindi | self.credit_card_last_digits_english + ) graph = ( graph_number_with_hindi_digit From e072a016093c4013bdc09603e2aa8eec515ca63f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 30 Jan 2025 11:33:24 +0000 Subject: [PATCH 27/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/telephone.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py index 6e695f997..1d1d3c875 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py @@ -15,10 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( - GraphFst, - delete_space, -) +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path @@ -87,7 +84,7 @@ def __init__(self, cardinal: GraphFst): + pynini.closure(english_digit_graph + delete_space, 2, 5) + pynutil.insert("\" ") ) - + self.city_extension = self.city_code_hindi | self.city_code_english # 7-digit landline graph in hindi and english digits @@ -101,7 +98,7 @@ def __init__(self, cardinal: GraphFst): + pynini.closure(english_digit_graph + delete_space, 7, 7) + pynutil.insert("\" ") ) - + self.landline = self.landline_hindi | self.landline_english self.pincode_in_hindi = ( @@ -143,15 +140,11 @@ def __init__(self, cardinal: GraphFst): ) graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit - graph_landline_with_extension = ( - delete_zero + delete_space + self.city_extension + delete_space + self.landline - ) + graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline graph_pincode = self.pincode_in_hindi | self.pincode_in_english - graph_credit_card_last_digits = ( - self.credit_card_last_digits_hindi | self.credit_card_last_digits_english - ) + graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english graph = ( graph_number_with_hindi_digit From f6084c3e95263b5d95def5aeac488b79fe2cf19e Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Thu, 3 Apr 2025 15:16:18 +0530 Subject: [PATCH 28/28] Update Jenkins Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index ba381f535..82a0a4799 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-30-25-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages {