From e94d65ec227769fb7bb1dadf51a7f7cbc5d60540 Mon Sep 17 00:00:00 2001 From: Pat Chormai Date: Fri, 1 Oct 2021 22:46:05 +0200 Subject: [PATCH 1/5] Add misspell module --- pythainlp/tools/__init__.py | 3 + pythainlp/tools/misspell.py | 137 ++++++++++++++++++++++++++++++++++++ tests/test_misspell.py | 69 ++++++++++++++++++ 3 files changed, 209 insertions(+) create mode 100644 pythainlp/tools/misspell.py create mode 100644 tests/test_misspell.py diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py index 7488f4207..317dd065d 100644 --- a/pythainlp/tools/__init__.py +++ b/pythainlp/tools/__init__.py @@ -4,6 +4,7 @@ "get_full_data_path", "get_pythainlp_data_path", "get_pythainlp_path", + "misspell", ] from pythainlp.tools.path import ( @@ -12,3 +13,5 @@ get_pythainlp_data_path, get_pythainlp_path, ) + +from pythainlp.tools.misspell import misspell diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py new file mode 100644 index 000000000..0b6997fca --- /dev/null +++ b/pythainlp/tools/misspell.py @@ -0,0 +1,137 @@ +import numpy as np +from typing import List + +THAI_CHARACTERS_WITHOUT_SHIFT = [ + "ผปแอิืทมใฝ", + "ฟหกดเ้่าสวง", + "ๆไำพะัีรนยบลฃ", + "ๅ/_ภถุึคตจขช", +] + +THAI_CHARACTERS_WITH_SHIFT = [ + "()ฉฮฺ์?ฒฬฦ", + "ฤฆฏโฌ็๋ษศซ.", + '๐"ฎฑธํ๊ณฯญฐ,', + "+๑๒๓๔ู฿๕๖๗๘๙", +] + +ENGLISH_CHARACTERS_WITHOUT_SHIFT = [ + "1234567890-=", + "qwertyuiop[]\\", + "asdfghjkl;'", + "zxcvbnm,./", +] + +ENGLISH_CHARACTERS_WITH_SHIFT = [ + "!@#$%^&*()_+", + "QWERTYUIOP{}|", + 'ASDFGHJKL:"', + "ZXCVBNM<>?", +] + + +ALL_CHARACTERS = [ + THAI_CHARACTERS_WITHOUT_SHIFT + THAI_CHARACTERS_WITH_SHIFT, + ENGLISH_CHARACTERS_WITHOUT_SHIFT + ENGLISH_CHARACTERS_WITH_SHIFT, +] + + +def search_location_of_character(char: str): + for language_ix in [0, 1]: + for ix, row in enumerate(ALL_CHARACTERS[language_ix]): + if char in row: + return (language_ix, ix // 4, ix % 4, row.index(char)) + + +def find_neighbour_locations( + loc: tuple, + char: str, + kernel: List = [(-1, -1), (-1, 0), (1, 1), (0, 1), (0, -1), (1, 0)], +): + language_ix, is_shift, row, pos = loc + + valid_neighbours = [] + for kr, ks in kernel: + _row, _pos = row + kr, pos + ks + if 0 <= _row <= 3 and 0 <= _pos <= len( + ALL_CHARACTERS[language_ix][is_shift * 4 + _row] + ): + valid_neighbours.append((language_ix, is_shift, _row, _pos, char)) + + return valid_neighbours + + +def find_potential_misspell_candidates(char: str, verbose: bool = False): + loc = search_location_of_character(char) + if loc is None: + return None + + valid_neighbours = find_neighbour_locations(loc, char) + + chars = [] + printing_locations = ["▐"] * 3 + [char] + ["​▐"] * 3 + + for language_ix, is_shift, row, pos, char in valid_neighbours: + try: + char = ALL_CHARACTERS[language_ix][is_shift * 4 + row][pos] + chars.append(char) + kernel = (row - loc[1], pos - loc[2]) + + if kernel == (-1, -1): + ix = 5 + elif kernel == (-1, 0): + ix = 6 + elif kernel[0] == 0: + ix = 3 + kernel[1] + elif kernel == (1, 0): + ix = 0 + elif kernel == (1, 1): + ix = 1 + else: + continue + printing_locations[ix] = char + except IndexError as e: + continue + except Exception as e: + print("Something wrong with: ", char) + raise e + + return chars + + +def misspell(sentence: str, ratio: float = 0.05): + """ + Simulate some mispellings for the input sentence. + The number of mispelled locations is governed by ratio. + + :params str sentence: sentence to be mispelled + :params float ratio: number of misspell per 100 characters. Defaults to 0.5. + + :return: sentence containing some misspelled + :rtype: str + + :Example: + :: + from pythainlp.tools import misspell + + sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826 โดยพ่อขุนรามคำแหง" + + misspell(sent, ratio=0.1) + # output: + ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727 โดยพ่อจุสรามคำแหง + """ + num_misspells = np.floor(len(sentence) * ratio).astype(int) + positions = np.random.choice(len(sentence), size=num_misspells, replace=False) + + # convert strings to array of characters + misspelled = list(sentence) + for pos in positions: + potential_candidates = find_potential_misspell_candidates(sentence[pos]) + if potential_candidates is None: + continue + + candidate = np.random.choice(potential_candidates) + + misspelled[pos] = candidate + + return "".join(misspelled) diff --git a/tests/test_misspell.py b/tests/test_misspell.py new file mode 100644 index 000000000..cdbbad2c2 --- /dev/null +++ b/tests/test_misspell.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- + +import unittest +import numpy as np +from pythainlp.tools import misspell + + +def _count_difference(st1, st2): + # this assumes len(st1) == len(st2) + + count = 0 + for i in range(len(st1)): + if st1[i] != st2[i]: + count += 1 + + return count + + +class TestTextMisspellPackage(unittest.TestCase): + def setUp(self): + self.texts = ["เรารักคุณมากที่สุดในโลก", "เราอยู่ที่มหาวิทยาลัยขอนแก่น"] + + def test_misspell_naive(self): + for text in self.texts: + misspelled = misspell(text, ratio=0.1) + + self.assertEqual(len(text), len(misspelled)) + + diff = _count_difference(text, misspelled) + + self.assertGreater(diff, 0, "we have some misspells.") + + def test_misspell_with_ratio_0_percent(self): + for text in self.texts: + misspelled = misspell(text, ratio=0.0) + + self.assertEqual(len(text), len(misspelled)) + + diff = _count_difference(text, misspelled) + + self.assertEqual(diff, 0, "we shouldn't have any misspell with ratio=0.") + + def test_misspell_with_ratio_50_percent(self): + for text in self.texts: + misspelled = misspell(text, ratio=0.5) + + self.assertEqual(len(text), len(misspelled)) + + diff = _count_difference(text, misspelled) + + self.assertLessEqual( + np.abs(diff - 0.5 * len(text)), + 2, + f"we should have around 0.5*len(text)±2 number misspell with ratio=0.5. (diff: {diff})", + ) + + def test_misspell_with_ratio_100_percent(self): + for text in self.texts: + misspelled = misspell(text, ratio=1) + + self.assertEqual(len(text), len(misspelled)) + + diff = _count_difference(text, misspelled) + + self.assertLessEqual( + np.abs(diff - len(text)), + 2, + f"we should have around len(text)-2 number misspell with ratio=1.0. (diff: {diff})", + ) From ae647d8cee7cb298589ef41a63c43ef0a9128a2d Mon Sep 17 00:00:00 2001 From: Pat Chormai Date: Sat, 2 Oct 2021 20:05:15 +0200 Subject: [PATCH 2/5] add document --- docs/api/tools.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api/tools.rst b/docs/api/tools.rst index 611b27e8c..135b66dde 100644 --- a/docs/api/tools.rst +++ b/docs/api/tools.rst @@ -10,3 +10,4 @@ Modules .. autofunction:: get_full_data_path .. autofunction:: get_pythainlp_data_path .. autofunction:: get_pythainlp_path +.. autofunction:: misspell From c79567b4df281945091aa388d501f53c23b40675 Mon Sep 17 00:00:00 2001 From: Pat Chormai Date: Sat, 2 Oct 2021 20:14:07 +0200 Subject: [PATCH 3/5] fix format --- pythainlp/tools/misspell.py | 6 +++--- tests/test_misspell.py | 14 ++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py index 0b6997fca..f87f7aaab 100644 --- a/pythainlp/tools/misspell.py +++ b/pythainlp/tools/misspell.py @@ -61,7 +61,7 @@ def find_neighbour_locations( return valid_neighbours -def find_potential_misspell_candidates(char: str, verbose: bool = False): +def find_misspell_candidates(char: str, verbose: bool = False): loc = search_location_of_character(char) if loc is None: return None @@ -105,7 +105,7 @@ def misspell(sentence: str, ratio: float = 0.05): The number of mispelled locations is governed by ratio. :params str sentence: sentence to be mispelled - :params float ratio: number of misspell per 100 characters. Defaults to 0.5. + :params float ratio: number of misspells per 100 chars. Defaults to 0.5. :return: sentence containing some misspelled :rtype: str @@ -126,7 +126,7 @@ def misspell(sentence: str, ratio: float = 0.05): # convert strings to array of characters misspelled = list(sentence) for pos in positions: - potential_candidates = find_potential_misspell_candidates(sentence[pos]) + potential_candidates = find_misspell_candidates(sentence[pos]) if potential_candidates is None: continue diff --git a/tests/test_misspell.py b/tests/test_misspell.py index cdbbad2c2..5b039324d 100644 --- a/tests/test_misspell.py +++ b/tests/test_misspell.py @@ -18,7 +18,10 @@ def _count_difference(st1, st2): class TestTextMisspellPackage(unittest.TestCase): def setUp(self): - self.texts = ["เรารักคุณมากที่สุดในโลก", "เราอยู่ที่มหาวิทยาลัยขอนแก่น"] + self.texts = [ + "เรารักคุณมากที่สุดในโลก", + "เราอยู่ที่มหาวิทยาลัยขอนแก่น" + ] def test_misspell_naive(self): for text in self.texts: @@ -38,7 +41,10 @@ def test_misspell_with_ratio_0_percent(self): diff = _count_difference(text, misspelled) - self.assertEqual(diff, 0, "we shouldn't have any misspell with ratio=0.") + self.assertEqual( + diff, 0, + "we shouldn't have any misspell with ratio=0." + ) def test_misspell_with_ratio_50_percent(self): for text in self.texts: @@ -51,7 +57,7 @@ def test_misspell_with_ratio_50_percent(self): self.assertLessEqual( np.abs(diff - 0.5 * len(text)), 2, - f"we should have around 0.5*len(text)±2 number misspell with ratio=0.5. (diff: {diff})", + f"expect 0.5*len(text)±2 misspells with ratio=0.5. (Δ={diff})", ) def test_misspell_with_ratio_100_percent(self): @@ -65,5 +71,5 @@ def test_misspell_with_ratio_100_percent(self): self.assertLessEqual( np.abs(diff - len(text)), 2, - f"we should have around len(text)-2 number misspell with ratio=1.0. (diff: {diff})", + f"expect len(text)-2 misspells with ratio=1.5. (Δ={diff})", ) From 1dd5d8e13a1eef3a0b13083e2dcdb0936376618a Mon Sep 17 00:00:00 2001 From: Pat Chormai Date: Sat, 2 Oct 2021 20:14:45 +0200 Subject: [PATCH 4/5] fix format --- pythainlp/tools/misspell.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py index f87f7aaab..7f3716158 100644 --- a/pythainlp/tools/misspell.py +++ b/pythainlp/tools/misspell.py @@ -121,7 +121,11 @@ def misspell(sentence: str, ratio: float = 0.05): ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727 โดยพ่อจุสรามคำแหง """ num_misspells = np.floor(len(sentence) * ratio).astype(int) - positions = np.random.choice(len(sentence), size=num_misspells, replace=False) + positions = np.random.choice( + len(sentence), + size=num_misspells, + replace=False + ) # convert strings to array of characters misspelled = list(sentence) From 3a508236d3040ef5c866022617b15da62f4bd038 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 7 Oct 2021 06:51:24 +0100 Subject: [PATCH 5/5] Shorten the example for brevity and char limit --- pythainlp/tools/misspell.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py index 7f3716158..186f1895e 100644 --- a/pythainlp/tools/misspell.py +++ b/pythainlp/tools/misspell.py @@ -114,11 +114,11 @@ def misspell(sentence: str, ratio: float = 0.05): :: from pythainlp.tools import misspell - sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826 โดยพ่อขุนรามคำแหง" + sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826" misspell(sent, ratio=0.1) # output: - ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727 โดยพ่อจุสรามคำแหง + ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727 """ num_misspells = np.floor(len(sentence) * ratio).astype(int) positions = np.random.choice(