From e94d65ec227769fb7bb1dadf51a7f7cbc5d60540 Mon Sep 17 00:00:00 2001
From: Pat Chormai <pat.chormai@gmail.com>
Date: Fri, 1 Oct 2021 22:46:05 +0200
Subject: [PATCH 1/5] Add misspell module

---
 pythainlp/tools/__init__.py |   3 +
 pythainlp/tools/misspell.py | 137 ++++++++++++++++++++++++++++++++++++
 tests/test_misspell.py      |  69 ++++++++++++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 pythainlp/tools/misspell.py
 create mode 100644 tests/test_misspell.py

diff --git a/pythainlp/tools/__init__.py b/pythainlp/tools/__init__.py
index 7488f4207..317dd065d 100644
--- a/pythainlp/tools/__init__.py
+++ b/pythainlp/tools/__init__.py
@@ -4,6 +4,7 @@
     "get_full_data_path",
     "get_pythainlp_data_path",
     "get_pythainlp_path",
+    "misspell",
 ]
 
 from pythainlp.tools.path import (
@@ -12,3 +13,5 @@
     get_pythainlp_data_path,
     get_pythainlp_path,
 )
+
+from pythainlp.tools.misspell import misspell
diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py
new file mode 100644
index 000000000..0b6997fca
--- /dev/null
+++ b/pythainlp/tools/misspell.py
@@ -0,0 +1,137 @@
+import numpy as np
+from typing import List
+
+THAI_CHARACTERS_WITHOUT_SHIFT = [
+    "ผปแอิืทมใฝ",
+    "ฟหกดเ้่าสวง",
+    "ๆไำพะัีรนยบลฃ",
+    "ๅ/_ภถุึคตจขช",
+]
+
+THAI_CHARACTERS_WITH_SHIFT = [
+    "()ฉฮฺ์?ฒฬฦ",
+    "ฤฆฏโฌ็๋ษศซ.",
+    '๐"ฎฑธํ๊ณฯญฐ,',
+    "+๑๒๓๔ู฿๕๖๗๘๙",
+]
+
+ENGLISH_CHARACTERS_WITHOUT_SHIFT = [
+    "1234567890-=",
+    "qwertyuiop[]\\",
+    "asdfghjkl;'",
+    "zxcvbnm,./",
+]
+
+ENGLISH_CHARACTERS_WITH_SHIFT = [
+    "!@#$%^&*()_+",
+    "QWERTYUIOP{}|",
+    'ASDFGHJKL:"',
+    "ZXCVBNM<>?",
+]
+
+
+ALL_CHARACTERS = [
+    THAI_CHARACTERS_WITHOUT_SHIFT + THAI_CHARACTERS_WITH_SHIFT,
+    ENGLISH_CHARACTERS_WITHOUT_SHIFT + ENGLISH_CHARACTERS_WITH_SHIFT,
+]
+
+
+def search_location_of_character(char: str):
+    """Return (language, is_shift, row, column) of char, or None if absent."""
+    for language_ix, layout in enumerate(ALL_CHARACTERS):
+        for ix, keys in ((i, k) for i, k in enumerate(layout) if char in k):
+            return (language_ix, *divmod(ix, 4), keys.index(char))
+
+
+def find_neighbour_locations(
+    loc: tuple,
+    char: str,
+    kernel: List = [(-1, -1), (-1, 0), (1, 1), (0, 1), (0, -1), (1, 0)],
+):
+    """Return in-bounds neighbours of loc as (lang, shift, row, pos, char)."""
+    language_ix, is_shift, row, pos = loc
+    rows = ALL_CHARACTERS[language_ix]
+    valid_neighbours = []
+    for kr, ks in kernel:
+        _row, _pos = row + kr, pos + ks
+        # strict `<`: a column equal to the row length is past its last key
+        if 0 <= _row <= 3 and 0 <= _pos < len(rows[is_shift * 4 + _row]):
+            valid_neighbours.append((language_ix, is_shift, _row, _pos, char))
+
+    return valid_neighbours
+
+
+def find_potential_misspell_candidates(char: str, verbose: bool = False):
+    loc = search_location_of_character(char)
+    if loc is None:
+        return None
+
+    # Purpose:
+    #   List the characters on the keys adjacent to ``char`` within the same
+    #   language and shift layer, i.e. the plausible slips of the finger.
+    #
+    # :param str char: character to look up in the keyboard layouts
+    # :param bool verbose: accepted for compatibility; has no effect
+    #
+    # :return: neighbouring characters, or None (returned above) when
+    #          ``char`` is not found in any layout
+    # :rtype: list of str, or None
+    #
+    # NOTE(review): the neighbourhood sketch ``verbose`` presumably controlled
+    # was built but never printed, and took its offsets from the shift/row
+    # fields of ``loc`` rather than row/column, so it is omitted here;
+    # confirm the intended output before wiring it back in.
+    valid_neighbours = find_neighbour_locations(loc, char)
+
+    chars = []
+    for language_ix, is_shift, row, pos, _ in valid_neighbours:
+        # Each language stores its four unshifted rows first, then its four
+        # shifted rows, hence the ``is_shift * 4`` offset.
+        keys = ALL_CHARACTERS[language_ix][is_shift * 4 + row]
+        try:
+            neighbour = keys[pos]
+        except IndexError:
+            # The reported column can lie past the end of a shorter row;
+            # there is no key there, so skip it.
+            continue
+        chars.append(neighbour)
+
+    return chars
+
+
+def misspell(sentence: str, ratio: float = 0.05):
+    """
+    Simulate some misspellings for the input sentence.
+    The number of mispelled locations is governed by ratio.
+
+    :params str sentence: sentence to be mispelled
+    :params float ratio: number of misspell per 100 characters. Defaults to 0.5.
+
+    :return: sentence containing some misspelled
+    :rtype: str
+
+    :Example:
+        ::
+            from pythainlp.tools import misspell
+
+            sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826 โดยพ่อขุนรามคำแหง"
+
+            misspell(sent, ratio=0.1)
+            # output:
+            ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727 โดยพ่อจุสรามคำแหง
+    """
+    num_misspells = np.floor(len(sentence) * ratio).astype(int)
+    positions = np.random.choice(len(sentence), size=num_misspells, replace=False)
+
+    # convert strings to array of characters
+    misspelled = list(sentence)
+    for pos in positions:
+        potential_candidates = find_potential_misspell_candidates(sentence[pos])
+        if potential_candidates is None:
+            continue
+
+        candidate = np.random.choice(potential_candidates)
+
+        misspelled[pos] = candidate
+
+    return "".join(misspelled)
diff --git a/tests/test_misspell.py b/tests/test_misspell.py
new file mode 100644
index 000000000..cdbbad2c2
--- /dev/null
+++ b/tests/test_misspell.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import numpy as np
+from pythainlp.tools import misspell
+
+
+def _count_difference(st1, st2):
+    # this assumes len(st1) == len(st2)
+
+    count = 0
+    for i in range(len(st1)):
+        if st1[i] != st2[i]:
+            count += 1
+
+    return count
+
+
+class TestTextMisspellPackage(unittest.TestCase):
+    def setUp(self):
+        self.texts = ["เรารักคุณมากที่สุดในโลก", "เราอยู่ที่มหาวิทยาลัยขอนแก่น"]
+
+    def test_misspell_naive(self):
+        for text in self.texts:
+            misspelled = misspell(text, ratio=0.1)
+
+            self.assertEqual(len(text), len(misspelled))
+
+            diff = _count_difference(text, misspelled)
+
+            self.assertGreater(diff, 0, "we have some misspells.")
+
+    def test_misspell_with_ratio_0_percent(self):
+        for text in self.texts:
+            misspelled = misspell(text, ratio=0.0)
+
+            self.assertEqual(len(text), len(misspelled))
+
+            diff = _count_difference(text, misspelled)
+
+            self.assertEqual(diff, 0, "we shouldn't have any  misspell with ratio=0.")
+
+    def test_misspell_with_ratio_50_percent(self):
+        for text in self.texts:
+            misspelled = misspell(text, ratio=0.5)
+
+            self.assertEqual(len(text), len(misspelled))
+
+            diff = _count_difference(text, misspelled)
+
+            self.assertLessEqual(
+                np.abs(diff - 0.5 * len(text)),
+                2,
+                f"we should have around 0.5*len(text)±2 number misspell with ratio=0.5. (diff: {diff})",
+            )
+
+    def test_misspell_with_ratio_100_percent(self):
+        for text in self.texts:
+            misspelled = misspell(text, ratio=1)
+
+            self.assertEqual(len(text), len(misspelled))
+
+            diff = _count_difference(text, misspelled)
+
+            self.assertLessEqual(
+                np.abs(diff - len(text)),
+                2,
+                f"we should have around len(text)-2 number misspell with ratio=1.0. (diff: {diff})",
+            )

From ae647d8cee7cb298589ef41a63c43ef0a9128a2d Mon Sep 17 00:00:00 2001
From: Pat Chormai <pat.chormai@gmail.com>
Date: Sat, 2 Oct 2021 20:05:15 +0200
Subject: [PATCH 2/5] add document

---
 docs/api/tools.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/api/tools.rst b/docs/api/tools.rst
index 611b27e8c..135b66dde 100644
--- a/docs/api/tools.rst
+++ b/docs/api/tools.rst
@@ -10,3 +10,4 @@ Modules
 .. autofunction:: get_full_data_path
 .. autofunction:: get_pythainlp_data_path
 .. autofunction:: get_pythainlp_path
+.. autofunction:: misspell

From c79567b4df281945091aa388d501f53c23b40675 Mon Sep 17 00:00:00 2001
From: Pat Chormai <pat.chormai@gmail.com>
Date: Sat, 2 Oct 2021 20:14:07 +0200
Subject: [PATCH 3/5] fix format

---
 pythainlp/tools/misspell.py |  6 +++---
 tests/test_misspell.py      | 14 ++++++++++----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py
index 0b6997fca..f87f7aaab 100644
--- a/pythainlp/tools/misspell.py
+++ b/pythainlp/tools/misspell.py
@@ -61,7 +61,7 @@ def find_neighbour_locations(
     return valid_neighbours
 
 
-def find_potential_misspell_candidates(char: str, verbose: bool = False):
+def find_misspell_candidates(char: str, verbose: bool = False):
     loc = search_location_of_character(char)
     if loc is None:
         return None
@@ -105,7 +105,7 @@ def misspell(sentence: str, ratio: float = 0.05):
     The number of mispelled locations is governed by ratio.
 
     :params str sentence: sentence to be mispelled
-    :params float ratio: number of misspell per 100 characters. Defaults to 0.5.
+    :params float ratio: fraction of chars to misspell. Defaults to 0.05.
 
     :return: sentence containing some misspelled
     :rtype: str
@@ -126,7 +126,7 @@ def misspell(sentence: str, ratio: float = 0.05):
     # convert strings to array of characters
     misspelled = list(sentence)
     for pos in positions:
-        potential_candidates = find_potential_misspell_candidates(sentence[pos])
+        potential_candidates = find_misspell_candidates(sentence[pos])
         if potential_candidates is None:
             continue
 
diff --git a/tests/test_misspell.py b/tests/test_misspell.py
index cdbbad2c2..5b039324d 100644
--- a/tests/test_misspell.py
+++ b/tests/test_misspell.py
@@ -18,7 +18,10 @@ def _count_difference(st1, st2):
 
 class TestTextMisspellPackage(unittest.TestCase):
     def setUp(self):
-        self.texts = ["เรารักคุณมากที่สุดในโลก", "เราอยู่ที่มหาวิทยาลัยขอนแก่น"]
+        self.texts = [
+            "เรารักคุณมากที่สุดในโลก",
+            "เราอยู่ที่มหาวิทยาลัยขอนแก่น"
+        ]
 
     def test_misspell_naive(self):
         for text in self.texts:
@@ -38,7 +41,10 @@ def test_misspell_with_ratio_0_percent(self):
 
             diff = _count_difference(text, misspelled)
 
-            self.assertEqual(diff, 0, "we shouldn't have any  misspell with ratio=0.")
+            self.assertEqual(
+                diff, 0,
+                "we shouldn't have any  misspell with ratio=0."
+            )
 
     def test_misspell_with_ratio_50_percent(self):
         for text in self.texts:
@@ -51,7 +57,7 @@ def test_misspell_with_ratio_50_percent(self):
             self.assertLessEqual(
                 np.abs(diff - 0.5 * len(text)),
                 2,
-                f"we should have around 0.5*len(text)±2 number misspell with ratio=0.5. (diff: {diff})",
+                f"expect 0.5*len(text)±2 misspells with ratio=0.5. (Δ={diff})",
             )
 
     def test_misspell_with_ratio_100_percent(self):
@@ -65,5 +71,5 @@ def test_misspell_with_ratio_100_percent(self):
             self.assertLessEqual(
                 np.abs(diff - len(text)),
                 2,
-                f"we should have around len(text)-2 number misspell with ratio=1.0. (diff: {diff})",
+                f"expect len(text)-2 misspells with ratio=1.0. (Δ={diff})",
             )

From 1dd5d8e13a1eef3a0b13083e2dcdb0936376618a Mon Sep 17 00:00:00 2001
From: Pat Chormai <pat.chormai@gmail.com>
Date: Sat, 2 Oct 2021 20:14:45 +0200
Subject: [PATCH 4/5] fix format

---
 pythainlp/tools/misspell.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py
index f87f7aaab..7f3716158 100644
--- a/pythainlp/tools/misspell.py
+++ b/pythainlp/tools/misspell.py
@@ -121,7 +121,11 @@ def misspell(sentence: str, ratio: float = 0.05):
             ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727 โดยพ่อจุสรามคำแหง
     """
     num_misspells = np.floor(len(sentence) * ratio).astype(int)
-    positions = np.random.choice(len(sentence), size=num_misspells, replace=False)
+    positions = np.random.choice(
+        len(sentence),
+        size=num_misspells,
+        replace=False
+    )
 
     # convert strings to array of characters
     misspelled = list(sentence)

From 3a508236d3040ef5c866022617b15da62f4bd038 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Thu, 7 Oct 2021 06:51:24 +0100
Subject: [PATCH 5/5] Shorten the example for brevity and char limit

---
 pythainlp/tools/misspell.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py
index 7f3716158..186f1895e 100644
--- a/pythainlp/tools/misspell.py
+++ b/pythainlp/tools/misspell.py
@@ -114,11 +114,11 @@ def misspell(sentence: str, ratio: float = 0.05):
         ::
             from pythainlp.tools import misspell
 
-            sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826 โดยพ่อขุนรามคำแหง"
+            sent = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826"
 
             misspell(sent, ratio=0.1)
             # output:
-            ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727 โดยพ่อจุสรามคำแหง
+            ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727
     """
     num_misspells = np.floor(len(sentence) * ratio).astype(int)
     positions = np.random.choice(