diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index 1a7c104b5..33c83769b 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -33,6 +33,7 @@ "text_to_arabic_digit", "text_to_thai_digit", "thai_digit_to_arabic_digit", + "thai_keyboard_dist", "thai_strftime", "thai_time", "thai_to_eng", @@ -55,6 +56,11 @@ text_to_thai_digit, thai_digit_to_arabic_digit, ) +from pythainlp.util.keyboard import ( + eng_to_thai, + thai_keyboard_dist, + thai_to_eng, +) from pythainlp.util.emojiconv import emoji_to_thai from pythainlp.util.keyboard import eng_to_thai, thai_to_eng from pythainlp.util.keywords import find_keyword, rank diff --git a/pythainlp/util/keyboard.py b/pythainlp/util/keyboard.py index 3008081c3..eacb656d1 100644 --- a/pythainlp/util/keyboard.py +++ b/pythainlp/util/keyboard.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- """ -Correct text in one language that is incorrectly-typed -with a keyboard layout in another language. +Functions related to keyboard layout. 
""" + EN_TH_KEYB_PAIRS = { "Z": "(", "z": "ผ", @@ -103,6 +103,19 @@ EN_TH_TRANSLATE_TABLE = str.maketrans(EN_TH_KEYB_PAIRS) TH_EN_TRANSLATE_TABLE = str.maketrans(TH_EN_KEYB_PAIRS) +TIS_820_2531_MOD = [ + ["-", "ๅ", "/", "", "_", "ภ", "ถ", "ุ", "ึ", "ค", "ต", "จ", "ข", "ช"], + ["ๆ", "ไ", "ำ", "พ", "ะ", "ั", "ี", "ร", "น", "ย", "บ", "ล", "ฃ"], + ["ฟ", "ห", "ก", "ด", "เ", "้", "่", "า", "ส", "ว", "ง"], + ["ผ", "ป", "แ", "อ", "ิ", "ื", "ท", "ม", "ใ", "ฝ"], +] +TIS_820_2531_MOD_SHIFT = [ + ["%", "+", "๑", "๒", "๓", "๔", "ู", "฿", "๕", "๖", "๗", "๘", "๙"], + ["๐", "\"", "ฎ", "ฑ", "ธ", "ํ", "๊", "ณ", "ฯ", "ญ", "ฐ", ",", "ฅ"], + ["ฤ", "ฆ", "ฏ", "โ", "ฌ", "็", "๋", "ษ", "ศ", "ซ", "."], + ["(", ")", "ฉ", "ฮ", "ฺ", "์", "?", "ฒ", "ฬ", "ฦ"], +] + def eng_to_thai(text: str) -> str: """ @@ -148,3 +161,63 @@ def thai_to_eng(text: str) -> str: # output: 'Bank of Thailand' """ return text.translate(TH_EN_TRANSLATE_TABLE) + + +def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float: + """ + Calculate euclidean distance between two Thai characters + according to their location on a Thai keyboard layout. + + A modified TIS 820-2531 standard keyboard layout, which is developed + from Kedmanee layout and is the most commonly used Thai keyboard layout, + is used in distance calculation. + + The modified TIS 820-2531 is TIS 820-2531 with few key extensions + proposed in TIS 820-2536 draft. See Figure 4, notice grey keys, in + https://www.nectec.or.th/it-standards/keyboard_layout/thai-key.html + + Noted that the latest TIS 820-2538 has slight changes in layout from + TIS 820-2531. See Figure 2, notice the Thai Baht sign and ฅ-ฃ pair, in + https://www.nectec.or.th/it-standards/std820/std820.html + Since TIS 820-2538 is not widely adopted by keyboard manufacturer, + this function uses the de facto standard modified TIS 820-2531 instead. 
+ + :param str c1: first character + :param str c2: second character + :param float shift_dist: distance to return if both are on the same key + :return: Euclidean distance between two characters + :rtype: float + + :Example: + + from pythainlp.util import thai_keyboard_dist + thai_keyboard_dist("ด", "ะ") + # output: 1.4142135623730951 + thai_keyboard_dist("ฟ", "ฤ") + # output: 0.0 + thai_keyboard_dist("ฟ", "ห") + # output: 1.0 + thai_keyboard_dist("ฟ", "ก") + # output: 2.0 + thai_keyboard_dist("ฟ", "ฤ", 0.5) + # output: 0.5 + """ + def get_char_coord( + ch: str, layouts=[TIS_820_2531_MOD, TIS_820_2531_MOD_SHIFT] + ): + for layout in layouts: + for row in layout: + if ch in row: + r = layout.index(row) + c = row.index(ch) + return (r, c) + raise ValueError(ch + " not found in given keyboard layout") + + coord1 = get_char_coord(c1) + coord2 = get_char_coord(c2) + distance = ( + (coord1[0] - coord2[0]) ** 2 + (coord1[1] - coord2[1]) ** 2 + ) ** (0.5) + if distance == 0 and c1 != c2: + return shift_dist + return distance diff --git a/tests/test_util.py b/tests/test_util.py index 54c7b07f3..cefe15bae 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -44,6 +44,7 @@ time_to_thaiword, thai_to_eng, thaiword_to_num, + thai_keyboard_dist, ) @@ -157,6 +158,17 @@ def test_rank(self): rank(["แมว", "คน", "แมว"], exclude_stopwords=True) ) + # ### pythainlp.util.keyboard + + def test_thai_keyboard_dist(self): + self.assertEqual(thai_keyboard_dist("ฟ", "ฤ"), 0.0) + self.assertEqual(thai_keyboard_dist("ฟ", "ห"), 1.0) + self.assertEqual(thai_keyboard_dist("ฟ", "ก"), 2.0) + self.assertEqual(thai_keyboard_dist("ฟ", "ฤ", 0.5), 0.5) + self.assertNotEqual( + thai_keyboard_dist("๘", "๙"), thai_keyboard_dist("๙", "๐") + ) + # ### pythainlp.util.date def test_date(self): @@ -238,7 +250,8 @@ def test_time_to_thaiword(self): time_to_thaiword(time(12, 3, 0)), "สิบสองนาฬิกาสามนาที" ) self.assertEqual( - time_to_thaiword(time(12, 3, 1)), "สิบสองนาฬิกาสามนาทีหนึ่งวินาที", + time_to_thaiword(time(12, 3, 
1)), + "สิบสองนาฬิกาสามนาทีหนึ่งวินาที", ) self.assertEqual( time_to_thaiword(datetime(2014, 5, 22, 12, 3, 0), precision="s"), @@ -353,13 +366,16 @@ def test_thaiword_to_date(self): now + timedelta(days=0), thaiword_to_date("วันนี้", now) ) self.assertEqual( - now + timedelta(days=1), thaiword_to_date("พรุ่งนี้", now), + now + timedelta(days=1), + thaiword_to_date("พรุ่งนี้", now), ) self.assertEqual( - now + timedelta(days=2), thaiword_to_date("มะรืนนี้", now), + now + timedelta(days=2), + thaiword_to_date("มะรืนนี้", now), ) self.assertEqual( - now + timedelta(days=-1), thaiword_to_date("เมื่อวาน", now), + now + timedelta(days=-1), + thaiword_to_date("เมื่อวาน", now), ) self.assertEqual( now + timedelta(days=-2), thaiword_to_date("วานซืน", now) @@ -538,14 +554,16 @@ def test_emoji_to_thai(self): emoji_to_thai( "จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀" ), - ("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ " - "ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:") + ( + "จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ " + "ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:" + ), ) self.assertEqual( emoji_to_thai("หิวข้าวอยากกินอาหารญี่ปุ่น 🍣"), - "หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:" + "หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:", ) self.assertEqual( emoji_to_thai("🇹🇭 นี่คิือธงประเทศไทย"), - ":ธง_ไทย: นี่คิือธงประเทศไทย" + ":ธง_ไทย: นี่คิือธงประเทศไทย", )