Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"text_to_arabic_digit",
"text_to_thai_digit",
"thai_digit_to_arabic_digit",
"thai_keyboard_dist",
"thai_strftime",
"thai_time",
"thai_to_eng",
Expand All @@ -55,6 +56,11 @@
text_to_thai_digit,
thai_digit_to_arabic_digit,
)
from pythainlp.util.keyboard import (
eng_to_thai,
thai_keyboard_dist,
thai_to_eng,
)
from pythainlp.util.emojiconv import emoji_to_thai
from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
from pythainlp.util.keywords import find_keyword, rank
Expand Down
77 changes: 75 additions & 2 deletions pythainlp/util/keyboard.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*-
"""
Correct text in one language that is incorrectly-typed
with a keyboard layout in another language.
Functions related to keyboard layout.
"""

EN_TH_KEYB_PAIRS = {
"Z": "(",
"z": "ผ",
Expand Down Expand Up @@ -103,6 +103,19 @@
# Character translation tables built once at import time from the
# key-pair mappings. They are meant to be passed to str.translate(),
# as thai_to_eng() does with TH_EN_TRANSLATE_TABLE.
EN_TH_TRANSLATE_TABLE = str.maketrans(EN_TH_KEYB_PAIRS)
TH_EN_TRANSLATE_TABLE = str.maketrans(TH_EN_KEYB_PAIRS)

# Keyboard layout tables used by thai_keyboard_dist().
# Each inner list is one keyboard row; a character's coordinate is
# (index of its row, index of the character within that row).
#
# TIS_820_2531_MOD holds the unshifted layer and TIS_820_2531_MOD_SHIFT the
# shifted layer, so characters at the same (row, column) in both tables
# are treated as sharing one physical key.
#
# NOTE(review): the first row of TIS_820_2531_MOD has 14 entries (one of
# them an empty string, presumably a placeholder) while the first row of
# TIS_820_2531_MOD_SHIFT has 13, so columns of the two layers may not line
# up on that row -- confirm against the layout standard.
TIS_820_2531_MOD = [
["-", "ๅ", "/", "", "_", "ภ", "ถ", "ุ", "ึ", "ค", "ต", "จ", "ข", "ช"],
["ๆ", "ไ", "ำ", "พ", "ะ", "ั", "ี", "ร", "น", "ย", "บ", "ล", "ฃ"],
["ฟ", "ห", "ก", "ด", "เ", "้", "่", "า", "ส", "ว", "ง"],
["ผ", "ป", "แ", "อ", "ิ", "ื", "ท", "ม", "ใ", "ฝ"],
]
TIS_820_2531_MOD_SHIFT = [
["%", "+", "๑", "๒", "๓", "๔", "ู", "฿", "๕", "๖", "๗", "๘", "๙"],
["๐", "\"", "ฎ", "ฑ", "ธ", "ํ", "๊", "ณ", "ฯ", "ญ", "ฐ", ",", "ฅ"],
["ฤ", "ฆ", "ฏ", "โ", "ฌ", "็", "๋", "ษ", "ศ", "ซ", "."],
["(", ")", "ฉ", "ฮ", "ฺ", "์", "?", "ฒ", "ฬ", "ฦ"],
]


def eng_to_thai(text: str) -> str:
"""
Expand Down Expand Up @@ -148,3 +161,63 @@ def thai_to_eng(text: str) -> str:
# output: 'Bank of Thailand'
"""
return text.translate(TH_EN_TRANSLATE_TABLE)


def thai_keyboard_dist(c1: str, c2: str, shift_dist: float = 0.0) -> float:
    """
    Calculate euclidean distance between two Thai characters
    according to their location on a Thai keyboard layout.

    A modified TIS 820-2531 standard keyboard layout, which is developed
    from Kedmanee layout and is the most commonly used Thai keyboard layout,
    is used in distance calculation.

    The modified TIS 820-2531 is TIS 820-2531 with few key extensions
    proposed in TIS 820-2536 draft. See Figure 4, notice grey keys, in
    https://www.nectec.or.th/it-standards/keyboard_layout/thai-key.html

    Note that the latest TIS 820-2538 has slight changes in layout from
    TIS 820-2531. See Figure 2, notice the Thai Baht sign and ฅ-ฃ pair, in
    https://www.nectec.or.th/it-standards/std820/std820.html
    Since TIS 820-2538 is not widely adopted by keyboard manufacturers,
    this function uses the de facto standard modified TIS 820-2531 instead.

    The coordinate of a character is its (row, column) position in the
    layout tables. The unshifted and shifted layers share the same
    coordinate space, so two different characters on the same key have a
    geometric distance of zero; ``shift_dist`` is returned for that case.

    :param str c1: first character
    :param str c2: second character
    :param float shift_dist: value to return when the two characters are
        different but located at the same key position (i.e. they differ
        only by the shift layer)
    :return: euclidean distance between two characters
    :rtype: float
    :raises ValueError: if a character is empty or is not found in the
        keyboard layout

    :Example:
    ::

        from pythainlp.util import thai_keyboard_dist
        thai_keyboard_dist("ด", "ะ")
        # output: 1.4142135623730951
        thai_keyboard_dist("ฟ", "ฤ")
        # output: 0.0
        thai_keyboard_dist("ฟ", "ห")
        # output: 1.0
        thai_keyboard_dist("ฟ", "ก")
        # output: 2.0
        thai_keyboard_dist("ฟ", "ฤ", 0.5)
        # output: 0.5
    """
    # Search order matters only for characters present in both layers:
    # the unshifted layer is consulted first.
    layouts = (TIS_820_2531_MOD, TIS_820_2531_MOD_SHIFT)

    def get_char_coord(ch: str):
        # The layout tables contain an empty-string entry, so an empty
        # argument would otherwise be "found" and silently given a
        # coordinate. Treat it as not found instead.
        if ch:
            for layout in layouts:
                for row_idx, row in enumerate(layout):
                    if ch in row:
                        return (row_idx, row.index(ch))
        raise ValueError(ch + " not found in given keyboard layout")

    row1, col1 = get_char_coord(c1)
    row2, col2 = get_char_coord(c2)
    distance = ((row1 - row2) ** 2 + (col1 - col2) ** 2) ** (0.5)

    # Same position but different characters: they sit on the same key in
    # different shift layers, so report the caller-supplied shift distance.
    if distance == 0 and c1 != c2:
        return shift_dist
    return distance
34 changes: 26 additions & 8 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
time_to_thaiword,
thai_to_eng,
thaiword_to_num,
thai_keyboard_dist,
)


Expand Down Expand Up @@ -157,6 +158,17 @@ def test_rank(self):
rank(["แมว", "คน", "แมว"], exclude_stopwords=True)
)

# ### pythainlp.util.keyboard

def test_thai_keyboard_dist(self):
    """Check thai_keyboard_dist against known key positions."""
    # (first character, second character, expected distance) using the
    # default shift_dist.
    expectations = [
        ("ฟ", "ฤ", 0.0),
        ("ฟ", "ห", 1.0),
        ("ฟ", "ก", 2.0),
    ]
    for first, second, expected in expectations:
        self.assertEqual(thai_keyboard_dist(first, second), expected)

    # An explicit shift_dist is returned for the same-key pair.
    self.assertEqual(thai_keyboard_dist("ฟ", "ฤ", 0.5), 0.5)

    # These two digit pairs must not yield the same distance.
    dist_eight_nine = thai_keyboard_dist("๘", "๙")
    dist_nine_zero = thai_keyboard_dist("๙", "๐")
    self.assertNotEqual(dist_eight_nine, dist_nine_zero)

# ### pythainlp.util.date

def test_date(self):
Expand Down Expand Up @@ -238,7 +250,8 @@ def test_time_to_thaiword(self):
time_to_thaiword(time(12, 3, 0)), "สิบสองนาฬิกาสามนาที"
)
self.assertEqual(
time_to_thaiword(time(12, 3, 1)), "สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
time_to_thaiword(time(12, 3, 1)),
"สิบสองนาฬิกาสามนาทีหนึ่งวินาที",
)
self.assertEqual(
time_to_thaiword(datetime(2014, 5, 22, 12, 3, 0), precision="s"),
Expand Down Expand Up @@ -353,13 +366,16 @@ def test_thaiword_to_date(self):
now + timedelta(days=0), thaiword_to_date("วันนี้", now)
)
self.assertEqual(
now + timedelta(days=1), thaiword_to_date("พรุ่งนี้", now),
now + timedelta(days=1),
thaiword_to_date("พรุ่งนี้", now),
)
self.assertEqual(
now + timedelta(days=2), thaiword_to_date("มะรืนนี้", now),
now + timedelta(days=2),
thaiword_to_date("มะรืนนี้", now),
)
self.assertEqual(
now + timedelta(days=-1), thaiword_to_date("เมื่อวาน", now),
now + timedelta(days=-1),
thaiword_to_date("เมื่อวาน", now),
)
self.assertEqual(
now + timedelta(days=-2), thaiword_to_date("วานซืน", now)
Expand Down Expand Up @@ -538,14 +554,16 @@ def test_emoji_to_thai(self):
emoji_to_thai(
"จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ ใกล้ชิดประชาชนดี 😀"
),
("จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ "
"ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:")
(
"จะมานั่งรถเมล์เหมือนผมก็ได้นะครับ "
"ใกล้ชิดประชาชนดี :หน้ายิ้มยิงฟัน:"
),
)
self.assertEqual(
emoji_to_thai("หิวข้าวอยากกินอาหารญี่ปุ่น 🍣"),
"หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:"
"หิวข้าวอยากกินอาหารญี่ปุ่น :ซูชิ:",
)
self.assertEqual(
emoji_to_thai("🇹🇭 นี่คิือธงประเทศไทย"),
":ธง_ไทย: นี่คิือธงประเทศไทย"
":ธง_ไทย: นี่คิือธงประเทศไทย",
)