From 6aac2446e18166043eb94491ffa908f92496e5f5 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sat, 9 May 2020 16:22:48 +0100 Subject: [PATCH] Add test cases --- pythainlp/util/normalize.py | 7 +++---- tests/test_util.py | 32 +++++++++++++++++--------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/pythainlp/util/normalize.py b/pythainlp/util/normalize.py index 92e27fee4..b434315a8 100644 --- a/pythainlp/util/normalize.py +++ b/pythainlp/util/normalize.py @@ -204,6 +204,8 @@ def normalize(text: str) -> str: If a user wants to customize the selection or the order of rules to be applied, they can choose to call those functions by themselves. + Note: for Unicode normalization, see unicodedata.normalize(). + :param str text: input text :return: normalized text according to the fules :rtype: str @@ -213,10 +215,7 @@ def normalize(text: str) -> str: from pythainlp.util import normalize - normalize('สระะน้ำ') - # output: สระน้ำ - - normalize('เเปลก') + normalize('เเปลก') # starts with two Sara E # output: แปลก normalize('นานาาา') diff --git a/tests/test_util.py b/tests/test_util.py index cdd020a5c..34d2491e7 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -15,6 +15,7 @@ bahttext, collate, countthai, + delete_tone, dict_trie, digit_to_text, eng_to_thai, @@ -278,38 +279,38 @@ def test_trie(self): def test_normalize(self): self.assertIsNotNone(normalize("พรรค์จันทร์ab์")) - # sara e + sara e + # normalize sara e + sara e self.assertEqual(normalize("เเปลก"), "แปลก") - # consonant + follow vowel + tone mark - self.assertEqual(normalize("\u0e01\u0e30\u0e48"), "\u0e01\u0e48\u0e30") - - # consonant + nikhahit + sara aa + # normalize consonant + nikhahit + sara aa self.assertEqual(normalize("นํา"), "นำ") self.assertEqual(normalize("\u0e01\u0e4d\u0e32"), "\u0e01\u0e33") - # consonant + nikhahit + tone mark + sara aa + # normalize consonant + tone mark + nikhahit + sara aa self.assertEqual( - normalize("\u0e01\u0e4d\u0e48\u0e32"), 
"\u0e01\u0e48\u0e33" + normalize("\u0e01\u0e48\u0e4d\u0e32"), "\u0e01\u0e48\u0e33" ) - # consonant + tone mark + nikhahit + sara aa + # reorder consonant + follow vowel + tone mark + self.assertEqual(normalize("\u0e01\u0e30\u0e48"), "\u0e01\u0e48\u0e30") + + # reorder consonant + nikhahit + tone mark + sara aa self.assertEqual( - normalize("\u0e01\u0e48\u0e4d\u0e32"), "\u0e01\u0e48\u0e33" + normalize("\u0e01\u0e4d\u0e48\u0e32"), "\u0e01\u0e48\u0e33" ) - # consonant + follow vowel + tone mark + # reorder consonant + follow vowel + tone mark self.assertEqual(normalize("\u0e01\u0e32\u0e48"), "\u0e01\u0e48\u0e32") - # repeating following vowels + # remove repeating following vowels self.assertEqual(normalize("กาา"), "กา") self.assertEqual(normalize("กา า า า"), "กา") self.assertEqual(normalize("กา าาะา"), "กาะา") - # repeating tone marks + # remove repeating tone marks self.assertEqual(normalize("\u0e01\u0e48\u0e48"), "\u0e01\u0e48") - # repeating different ton emarks + # remove repeating different tone marks self.assertEqual(normalize("\u0e01\u0e48\u0e49"), "\u0e01\u0e49") self.assertEqual( normalize("\u0e01\u0e48\u0e49\u0e48\u0e49"), "\u0e01\u0e49" @@ -325,11 +326,12 @@ def test_normalize(self): self.assertEqual(remove_dup_spaces(" ab c d "), "ab c d") self.assertEqual(remove_dup_spaces("\nab c \n d \n"), "ab c\nd") - # removing tone marks + # remove tone marks self.assertEqual(remove_tonemark("จิ้น"), "จิน") self.assertEqual(remove_tonemark("เก๋า"), "เกา") + self.assertEqual(delete_tone("เจ๋งเป้ง"), remove_tonemark("เจ๋งเป้ง")) - # removing zero width chars + # remove zero width chars self.assertEqual(remove_zw("กา\u200b"), "กา") self.assertEqual(remove_zw("ก\u200cา"), "กา") self.assertEqual(remove_zw("\u200bกา"), "กา")