From 27b7732fd8ef307eb89b9bd30bb6d863066a818e Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Mon, 5 Nov 2018 20:22:47 +0700
Subject: [PATCH 1/8] remove WordNet wrapper (pythainlp/corpus/wordnet.py) -
 the entire set of functions is exactly the same as nltk.wordnet; users can
 call nltk.wordnet by themselves.

---
 pythainlp/corpus/wordnet.py | 69 -------------------------------------
 tests/__init__.py           |  7 ----
 2 files changed, 76 deletions(-)
 delete mode 100644 pythainlp/corpus/wordnet.py

diff --git a/pythainlp/corpus/wordnet.py b/pythainlp/corpus/wordnet.py
deleted file mode 100644
index d3e23922f..000000000
--- a/pythainlp/corpus/wordnet.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-WordNet
-"""
-import nltk
-
-try:
-    nltk.data.find("corpora/omw")
-except LookupError:
-    nltk.download("omw")
-
-try:
-    nltk.data.find("corpora/wordnet")
-except LookupError:
-    nltk.download("wordnet")
-
-from nltk.corpus import wordnet
-
-
-def synsets(word, pos=None, lang="tha"):
-    return wordnet.synsets(lemma=word, pos=pos, lang=lang)
-
-
-def synset(name_synsets):
-    return wordnet.synset(name_synsets)
-
-
-def all_lemma_names(pos=None, lang="tha"):
-    return wordnet.all_lemma_names(pos=pos, lang=lang)
-
-
-def all_synsets(pos=None):
-    return wordnet.all_synsets(pos=pos)
-
-
-def langs():
-    return wordnet.langs()
-
-
-def lemmas(word, pos=None, lang="tha"):
-    return wordnet.lemmas(word, pos=pos, lang=lang)
-
-
-def lemma(name_synsets):
-    return wordnet.lemma(name_synsets)
-
-
-def lemma_from_key(key):
-    return wordnet.lemma_from_key(key)
-
-
-def path_similarity(synsets1, synsets2):
-    return wordnet.path_similarity(synsets1, synsets2)
-
-
-def lch_similarity(synsets1, synsets2):
-    return wordnet.lch_similarity(synsets1, synsets2)
-
-
-def wup_similarity(synsets1, synsets2):
-    return wordnet.wup_similarity(synsets1, synsets2)
-
-
-def morphy(form, pos=None):
-    return wordnet.morphy(form, pos=None)
-
-
-def custom_lemmas(tab_file, lang):
-    return wordnet.custom_lemmas(tab_file, lang)
diff --git a/tests/__init__.py b/tests/__init__.py
index 4cae93fd6..2e8f89385 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -14,7 +14,6 @@
     thai_words,
     tnc,
     ttc,
-    wordnet,
 )
 from pythainlp.date import now, now_reign_year, reign_year_to_ad
 from pythainlp.g2p import ipa
@@ -85,12 +84,6 @@ def test_tnc(self):
     def test_ttc(self):
         self.assertIsNotNone(ttc.word_freqs())
 
-    def test_wordnet(self):
-        self.assertEqual(
-            wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"]
-        )
-        self.assertIsNotNone(wordnet.langs())
-
     # ### pythainlp.date
 
     def test_date(self):

From 7b3d077bf44325d110d336889eb52c7fe7b3b1c5 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Tue, 6 Nov 2018 14:30:30 +0700
Subject: [PATCH 2/8] - rank() handles empty list or None case
 - add more test cases
 - revert wordnet.py removal

---
 docs/api/tokenize.rst          |  2 +-
 pythainlp/corpus/wordnet.py    | 72 ++++++++++++++++++++++++++++++++++
 pythainlp/rank/__init__.py     | 11 ++++--
 pythainlp/tokenize/__init__.py | 16 ++++----
 tests/__init__.py              | 11 ++++++
 5 files changed, 99 insertions(+), 13 deletions(-)
 create mode 100644 pythainlp/corpus/wordnet.py

diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index c7ed5941e..f91e5a2da 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -9,4 +9,4 @@ The :class:`pythainlp.tokenize` contains multiple functions for tokenizing a chu
 .. autofunction:: dict_word_tokenize
 .. autofunction:: subword_tokenize
 .. autofunction:: sent_tokenize
-.. autofunction:: create_custom_dict_trie
+.. autofunction:: dict_trie
diff --git a/pythainlp/corpus/wordnet.py b/pythainlp/corpus/wordnet.py
new file mode 100644
index 000000000..07ec0f526
--- /dev/null
+++ b/pythainlp/corpus/wordnet.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+"""
+NLTK WordNet wrapper
+
+The API here is exactly the same as the NLTK API,
+except that the lang (language) argument is "tha" (Thai) by default.
+"""
+import nltk
+
+try:
+    nltk.data.find("corpora/omw")
+except LookupError:
+    nltk.download("omw")
+
+try:
+    nltk.data.find("corpora/wordnet")
+except LookupError:
+    nltk.download("wordnet")
+
+from nltk.corpus import wordnet
+
+
+def synsets(word, pos=None, lang="tha"):
+    return wordnet.synsets(lemma=word, pos=pos, lang=lang)
+
+
+def synset(name_synsets):
+    return wordnet.synset(name_synsets)
+
+
+def all_lemma_names(pos=None, lang="tha"):
+    return wordnet.all_lemma_names(pos=pos, lang=lang)
+
+
+def all_synsets(pos=None):
+    return wordnet.all_synsets(pos=pos)
+
+
+def langs():
+    return wordnet.langs()
+
+
+def lemmas(word, pos=None, lang="tha"):
+    return wordnet.lemmas(word, pos=pos, lang=lang)
+
+
+def lemma(name_synsets):
+    return wordnet.lemma(name_synsets)
+
+
+def lemma_from_key(key):
+    return wordnet.lemma_from_key(key)
+
+
+def path_similarity(synsets1, synsets2):
+    return wordnet.path_similarity(synsets1, synsets2)
+
+
+def lch_similarity(synsets1, synsets2):
+    return wordnet.lch_similarity(synsets1, synsets2)
+
+
+def wup_similarity(synsets1, synsets2):
+    return wordnet.wup_similarity(synsets1, synsets2)
+
+
+def morphy(form, pos=None):
+    return wordnet.morphy(form, pos=pos)
+
+
+def custom_lemmas(tab_file, lang):
+    return wordnet.custom_lemmas(tab_file, lang)
diff --git a/pythainlp/rank/__init__.py b/pythainlp/rank/__init__.py
index dbf5781b7..12b3c04be 100644
--- a/pythainlp/rank/__init__.py
+++ b/pythainlp/rank/__init__.py
@@ -7,15 +7,18 @@
 
 
 # เรียงจำนวนคำของประโยค
-def rank(data, stopword=False):
+def rank(words, stopword=False):
     """
     Sort words by frequency
-    รับค่าเป็น ''list'' คืนค่าเป็น ''dict'' [(คำ, จำนวน), (คำ, จำนวน), ...]
+    รับค่าเป็น ''list'' คืนค่าเป็น ''Counter'' Counter({"คำ": จำนวน, "คำ": จำนวน})
     """
+    if not words:
+        return None
+
     if stopword:
-        data = [word for word in data if word not in _STOPWORDS]
+        words = [word for word in words if word not in _STOPWORDS]
 
-    rankdata = Counter(data)
+    rankdata = Counter(words)
 
     return rankdata
 
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 804d11e23..e81c3214d 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -59,18 +59,18 @@ def segment(text):
     return segment(text)
 
 
-def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
+def dict_word_tokenize(text, custom_dict, engine="newmm"):
     """
     :meth:`dict_word_tokenize` tokenizes word based on the dictionary you provide. The format has to be in trie data structure.
     :param str text: text to be tokenized
-    :param dict custom_dict_trie: a dictionary trie
+    :param dict custom_dict: a dictionary trie
     :param str engine: choose between different options of engine to token (newmm, longest)
     :return: list of words
     **Example**::
-        >>> from pythainlp.tokenize import dict_word_tokenize,create_custom_dict_trie
-        >>> listword = ["แมว", "ดี"]
-        >>> data_dict = create_custom_dict_trie(listword)
-        >>> dict_word_tokenize("แมวดีดีแมว", data_dict)
+        >>> from pythainlp.tokenize import dict_word_tokenize, dict_trie
+        >>> words = ["แมว", "ดี"]
+        >>> trie = dict_trie(words)
+        >>> dict_word_tokenize("แมวดีดีแมว", trie)
         ['แมว', 'ดี', 'ดี', 'แมว']
     """
     if engine == "newmm" or engine == "onecut":
@@ -82,7 +82,7 @@ def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
     else:  # default, use "newmm" engine
         from .newmm import mmcut as segment
 
-    return segment(text, custom_dict_trie)
+    return segment(text, custom_dict)
 
 
 def sent_tokenize(text, engine="whitespace+newline"):
@@ -126,7 +126,7 @@ def syllable_tokenize(text):
     words = word_tokenize(text)
     trie = dict_trie(dict_source=thai_syllables())
     for word in words:
-        tokens.extend(dict_word_tokenize(text=word, custom_dict_trie=trie))
+        tokens.extend(dict_word_tokenize(text=word, custom_dict=trie))
 
     return tokens
 
diff --git a/tests/__init__.py b/tests/__init__.py
index 2e8f89385..2b66eacc7 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -14,6 +14,7 @@
     thai_words,
     tnc,
     ttc,
+    wordnet,
 )
 from pythainlp.date import now, now_reign_year, reign_year_to_ad
 from pythainlp.g2p import ipa
@@ -84,6 +85,12 @@ def test_tnc(self):
     def test_ttc(self):
         self.assertIsNotNone(ttc.word_freqs())
 
+    def test_wordnet(self):
+        self.assertEqual(
+            wordnet.synset("spy.n.01").lemma_names("tha"), ["สปาย", "สายลับ"]
+        )
+        self.assertIsNotNone(wordnet.langs())
+
     # ### pythainlp.date
 
     def test_date(self):
@@ -197,7 +204,9 @@ def test_number(self):
     # ### pythainlp.rank
 
     def test_rank(self):
+        self.assertEqual(rank([]), None)
         self.assertEqual(rank(["แมว", "คน", "แมว"]), Counter({"แมว": 2, "คน": 1}))
+        self.assertIsNotNone(rank(["แมว", "คน", "แมว"], stopword=True))
 
     # ### pythainlp.romanization
 
@@ -207,6 +216,7 @@ def test_romanization(self):
 
     def test_romanization_royin(self):
         engine = "royin"
+        self.assertIsNone(romanize("กก", engine=engine))
         self.assertEqual(romanize("แมว", engine=engine), "maeo")
         self.assertEqual(romanize("เดือน", engine=engine), "duean")
         self.assertEqual(romanize("ดู", engine=engine), "du")
@@ -355,6 +365,7 @@ def test_is_thaiword(self):
 
     def test_normalize(self):
         self.assertEqual(normalize("เเปลก"), "แปลก")
+        self.assertIsNotNone(normalize("พรรค์จันทร์ab์"))
 
     def test_keyboard(self):
         self.assertEqual(eng_to_thai("l;ylfu8iy["), "สวัสดีครับ")

From 7b188229451b18f1b9f4020e23fdfcd098ad2226 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Tue, 6 Nov 2018 14:57:08 +0700
Subject: [PATCH 3/8] more test cases

---
 pythainlp/rank/__init__.py | 4 +---
 tests/__init__.py          | 3 ++-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/pythainlp/rank/__init__.py b/pythainlp/rank/__init__.py
index 12b3c04be..9b831391d 100644
--- a/pythainlp/rank/__init__.py
+++ b/pythainlp/rank/__init__.py
@@ -18,9 +18,7 @@ def rank(words, stopword=False):
     if stopword:
         words = [word for word in words if word not in _STOPWORDS]
 
-    rankdata = Counter(words)
-
-    return rankdata
+    return Counter(words)
 
 
 if __name__ == "__main__":
diff --git a/tests/__init__.py b/tests/__init__.py
index 2b66eacc7..b1085122b 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -216,7 +216,7 @@ def test_romanization(self):
 
     def test_romanization_royin(self):
         engine = "royin"
-        self.assertIsNone(romanize("กก", engine=engine))
+        self.assertIsNotNone(romanize("กก", engine=engine))
         self.assertEqual(romanize("แมว", engine=engine), "maeo")
         self.assertEqual(romanize("เดือน", engine=engine), "duean")
         self.assertEqual(romanize("ดู", engine=engine), "du")
@@ -352,6 +352,7 @@ def test_deletetone(self):
 
     def test_is_thai(self):
         self.assertEqual(is_thai("ประเทศไทย"), {"thai": 100.0})
+        self.assertIsNotNone(is_thai("เผือก", check_all=True))
 
     def test_is_thaichar(self):
         self.assertEqual(is_thaichar("ก"), True)

From c6edbf479153c6267112b64bb6a24118223b4e84 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Tue, 6 Nov 2018 15:13:03 +0700
Subject: [PATCH 4/8] Soundex functions handle empty or None case

---
 pythainlp/soundex/lk82.py      |  3 +++
 pythainlp/soundex/metasound.py |  3 +++
 pythainlp/soundex/udom83.py    |  3 +++
 tests/__init__.py              | 13 ++++++++++++-
 4 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/pythainlp/soundex/lk82.py b/pythainlp/soundex/lk82.py
index e9a401b04..b46e65b01 100644
--- a/pythainlp/soundex/lk82.py
+++ b/pythainlp/soundex/lk82.py
@@ -28,6 +28,9 @@ def lk82(text):
     :param str text: Thai word
     :return: LK82 soundex
     """
+    if not text:
+        return ""
+
     res = []
     text = _RE_1.sub("", text)  # 4.ลบวรรณยุกต์
     text = _RE_2.sub("", text)  # 4.ลบตัวการันต์
diff --git a/pythainlp/soundex/metasound.py b/pythainlp/soundex/metasound.py
index 0a6a9e4ba..91f813b13 100644
--- a/pythainlp/soundex/metasound.py
+++ b/pythainlp/soundex/metasound.py
@@ -34,6 +34,9 @@ def metasound(text, length=4):
         metasound("รักษ์") # 'ร100'
         metasound("บูรณการ", 5)) # 'บ5515'
     """
+    if not text:
+        return ""
+
     # keep only consonants and thanthakhat
     chars = []
     for ch in text:
diff --git a/pythainlp/soundex/udom83.py b/pythainlp/soundex/udom83.py
index 8d4f464ff..fc4d3c949 100644
--- a/pythainlp/soundex/udom83.py
+++ b/pythainlp/soundex/udom83.py
@@ -37,6 +37,9 @@ def udom83(text):
     :return: Udom83 soundex
     """
 
+    if not text:
+        return ""
+
     text = _RE_1.sub("ัน\\1", text)
     text = _RE_2.sub("ั\\1", text)
     text = _RE_3.sub("ัน\\1", text)
diff --git a/tests/__init__.py b/tests/__init__.py
index b1085122b..7ad8151a2 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -34,7 +34,7 @@
 from pythainlp.rank import rank
 from pythainlp.romanization import romanize
 from pythainlp.sentiment import sentiment
-from pythainlp.soundex import lk82, metasound, udom83
+from pythainlp.soundex import lk82, metasound, soundex, udom83
 from pythainlp.spell import correct, spell
 from pythainlp.summarize import summarize
 from pythainlp.tag import pos_tag, pos_tag_sents
@@ -233,13 +233,24 @@ def test_sentiment(self):
 
     # ### pythainlp.soundex
 
     def test_soundex(self):
+        self.assertIsNotNone(soundex("a", engine="lk82"))
+        self.assertIsNotNone(soundex("a", engine="udom83"))
+        self.assertIsNotNone(soundex("a", engine="metasound"))
+        self.assertIsNotNone(soundex("a", engine="XXX"))
+
         self.assertEqual(lk82("รถ"), "ร3000")
+        self.assertIsNotNone(lk82("เกาะกูร์"))
+        self.assertEqual(lk82(""), "")
+
         self.assertEqual(udom83("รถ"), "ร800000")
+        self.assertEqual(udom83(None), "")
+
         self.assertEqual(metasound("บูรณะ"), "บ550")
         self.assertEqual(metasound("คน"), "ค500")
         self.assertEqual(metasound("คนA"), "ค500")
         self.assertEqual(metasound("ดา"), "ด000")
         self.assertEqual(metasound("รักษ์"), metasound("รัก"))
+        self.assertEqual(metasound(""), "")
 
     # ### pythainlp.spell

From a09044846bfca9a29c7ce309bcd355a8c6edfe09 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Tue, 6 Nov 2018 15:21:18 +0700
Subject: [PATCH 5/8] check length before accessing text index

---
 pythainlp/soundex/lk82.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pythainlp/soundex/lk82.py b/pythainlp/soundex/lk82.py
index b46e65b01..4369e1342 100644
--- a/pythainlp/soundex/lk82.py
+++ b/pythainlp/soundex/lk82.py
@@ -41,7 +41,8 @@ def lk82(text):
         res.append(text[0].translate(_TRANS1))
         text = text[1:]
     else:
-        res.append(text[1].translate(_TRANS1))
+        if len(text) > 0:
+            res.append(text[1].translate(_TRANS1))
         res.append(text[0].translate(_TRANS2))
         text = text[2:]
 

From f4ffbbd0cb5f4146a5ae37b0fdbbf9ca478aa7f2 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Tue, 6 Nov 2018 15:39:48 +0700
Subject: [PATCH 6/8] Handle empty text

---
 pythainlp/soundex/lk82.py   | 9 ++++++---
 pythainlp/soundex/udom83.py | 3 +++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/pythainlp/soundex/lk82.py b/pythainlp/soundex/lk82.py
index 4369e1342..b659ef3a7 100644
--- a/pythainlp/soundex/lk82.py
+++ b/pythainlp/soundex/lk82.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 """
-Thai soundex - lk82 system
+Thai soundex - LK82 system
 
 Python implementation: Korakot Chaovavanich
 https://gist.github.com/korakot/0b772e09340cac2f493868da035597e8
@@ -23,7 +23,7 @@
 
 def lk82(text):
     """
-    LK82 - It's a thai soundex rule.
+    LK82 - It's a Thai soundex rule.
 
     :param str text: Thai word
     :return: LK82 soundex
@@ -31,12 +31,15 @@ def lk82(text):
     if not text:
         return ""
 
-    res = []
     text = _RE_1.sub("", text)  # 4.ลบวรรณยุกต์
     text = _RE_2.sub("", text)  # 4.ลบตัวการันต์
     text = _RE_3.sub("", text)  # 5.ทิ้งไม้ไต่คู่ ฯลฯ
 
+    if not text:
+        return ""
+
     # 6.เข้ารหัสตัวแรก
+    res = []
     if "ก" <= text[0] <= "ฮ":
         res.append(text[0].translate(_TRANS1))
         text = text[1:]
diff --git a/pythainlp/soundex/udom83.py b/pythainlp/soundex/udom83.py
index fc4d3c949..bf7ec5bba 100644
--- a/pythainlp/soundex/udom83.py
+++ b/pythainlp/soundex/udom83.py
@@ -52,6 +52,9 @@ def udom83(text):
     text = _RE_10.sub("", text)
     text = _RE_11.sub("", text)
 
+    if not text:
+        return ""
+
     sd = text[0].translate(_TRANS1)
     sd += text[1:].translate(_TRANS2)

From 9ab4d94c7ff1f8f884e3276e5af80eef3ededf9a Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Tue, 6 Nov 2018 16:00:03 +0700
Subject: [PATCH 7/8] remove if __name__ == "__main__"

---
 pythainlp/number/numtoword.py   | 4 ----
 pythainlp/rank/__init__.py      | 4 ----
 pythainlp/romanization/royin.py | 4 ----
 pythainlp/sentiment/__init__.py | 5 -----
 pythainlp/tokenize/longest.py   | 5 -----
 pythainlp/tokenize/multi_cut.py | 7 -------
 pythainlp/tokenize/pyicu.py     | 4 ----
 pythainlp/util/keyboard.py      | 5 -----
 tests/__init__.py               | 4 +++-
 9 files changed, 3 insertions(+), 39 deletions(-)

diff --git a/pythainlp/number/numtoword.py b/pythainlp/number/numtoword.py
index 174b7a768..394984d70 100644
--- a/pythainlp/number/numtoword.py
+++ b/pythainlp/number/numtoword.py
@@ -95,7 +95,3 @@ def num_to_thaiword(number):
         pos += 1
 
     return ret
-
-
-if __name__ == "__main__":
-    print(bahttext(4000.0))
diff --git a/pythainlp/rank/__init__.py b/pythainlp/rank/__init__.py
index 9b831391d..84a7a7271 100644
--- a/pythainlp/rank/__init__.py
+++ b/pythainlp/rank/__init__.py
@@ -19,7 +19,3 @@ def rank(words, stopword=False):
         words = [word for word in words if word not in _STOPWORDS]
 
     return Counter(words)
-
-
-if __name__ == "__main__":
-    print(rank(["แมว", "ชอบ", "ปลา", "แมว", "ชอบ", "นอน", "คน", "เป็น", "ทาส", "แมว"]))
diff --git a/pythainlp/romanization/royin.py b/pythainlp/romanization/royin.py
index 87776b8e2..69a3671d9 100644
--- a/pythainlp/romanization/royin.py
+++ b/pythainlp/romanization/royin.py
@@ -177,7 +177,3 @@ def romanize(word):
     word2 = "".join(word2)
     word2 = _replace_consonants(word2, res)
     return word2
-
-
-if __name__ == "__main__":
-    print(romanize("กร") == romanize("กอน"))
diff --git a/pythainlp/sentiment/__init__.py b/pythainlp/sentiment/__init__.py
index c2b018382..d0af157be 100644
--- a/pythainlp/sentiment/__init__.py
+++ b/pythainlp/sentiment/__init__.py
@@ -49,8 +49,3 @@ def sentiment(text, engine="old"):
         featurized_test_sentence = {i: (i in text) for i in vocabulary}
 
     return classifier.classify(featurized_test_sentence)
-
-
-if __name__ == "__main__":
-    text = "เสียใจแย่มากเลย"
-    print(sentiment(text))
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index 1b50e41cb..483685da2 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -138,8 +138,3 @@ def segment(text, trie=None):
     if not trie:
         trie = DEFAULT_DICT_TRIE
     return Tokenizer(trie).tokenize(text)
-
-
-if __name__ == "__main__":
-    text = "รถประจำทาง ลำปาง-แม่เมาะ AB-2390-30"
-    print(segment(text))
diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 0b7e115f4..80f621c27 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -128,10 +128,3 @@ def find_all_segment(text, trie=None):
     """
     ww = list(multicut(text, trie=trie))
     return list(combine(ww))
-
-
-if __name__ == "__main__":
-    text = "ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด"
-    print(mmcut(text))
-    for one in find_all_segment(text):
-        print(one)
diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py
index 4a02b9f4e..9a2ffa581 100644
--- a/pythainlp/tokenize/pyicu.py
+++ b/pythainlp/tokenize/pyicu.py
@@ -28,7 +28,3 @@ def _gen_words(text):
 def segment(text):
     text = re.sub("([^\u0E00-\u0E7F\n ]+)", " \\1 ", text)
     return list(_gen_words(text))
-
-
-if __name__ == "__main__":
-    print(segment("พูดไทย2คำEnglishคำ"))
diff --git a/pythainlp/util/keyboard.py b/pythainlp/util/keyboard.py
index 7cdbe2aa0..9615d53a1 100644
--- a/pythainlp/util/keyboard.py
+++ b/pythainlp/util/keyboard.py
@@ -121,8 +121,3 @@ def thai_to_eng(text):
     return "".join(
         [TH_EN_KEYB_PAIRS[ch] if (ch in TH_EN_KEYB_PAIRS) else ch for ch in text]
     )
-
-
-if __name__ == "__main__":
-    print(eng_to_thai("l;ylfu8iy["))
-    print(thai_to_eng("นามรสนอำันี"))
diff --git a/tests/__init__.py b/tests/__init__.py
index 7ad8151a2..d80cc21d0 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -239,7 +239,9 @@ def test_soundex(self):
         self.assertIsNotNone(soundex("a", engine="XXX"))
 
         self.assertEqual(lk82("รถ"), "ร3000")
-        self.assertIsNotNone(lk82("เกาะกูร์"))
+        self.assertIsNotNone(lk82("เกาะ"))
+        self.assertIsNotNone(lk82("อุยกูร์"))
+        self.assertIsNotNone(lk82("หยากไย่"))
         self.assertEqual(lk82(""), "")
 
         self.assertEqual(udom83("รถ"), "ร800000")

From 3ed552e01df3e57a054a20ad60936d781c47ab07 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul
Date: Tue, 6 Nov 2018 16:04:52 +0700
Subject: [PATCH 8/8] check str length

---
 pythainlp/soundex/lk82.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/soundex/lk82.py b/pythainlp/soundex/lk82.py
index b659ef3a7..f7b21a764 100644
--- a/pythainlp/soundex/lk82.py
+++ b/pythainlp/soundex/lk82.py
@@ -44,7 +44,7 @@ def lk82(text):
         res.append(text[0].translate(_TRANS1))
         text = text[1:]
     else:
-        if len(text) > 0:
+        if len(text) > 1:
             res.append(text[1].translate(_TRANS1))
         res.append(text[0].translate(_TRANS2))
         text = text[2:]
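
---
Usage note (not part of the patches above): a minimal sketch of the empty/None-input
behavior this series establishes, assuming a PyThaiNLP checkout with all eight patches
applied. Every name and expected value below is taken from the diffs and the test cases
added in PATCH 2/8 and PATCH 4/8.

    from pythainlp.rank import rank
    from pythainlp.soundex import lk82, metasound, udom83

    # PATCH 2/8: rank() returns None for an empty list (or None),
    # and a collections.Counter of word frequencies otherwise.
    assert rank([]) is None
    assert rank(["แมว", "คน", "แมว"])["แมว"] == 2

    # PATCH 4/8 through 6/8: soundex functions return "" for empty
    # or None input (per the tests added in PATCH 4/8).
    assert lk82("") == ""
    assert udom83(None) == ""
    assert metasound("") == ""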