@@ -48,7 +48,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
 
 def word_detokenize(
     segments: Union[List[List[str]], List[str]], output: str = "str"
-) -> Union[str, List[str]]:
+) -> Union[List[str], str]:
     """
     Word detokenizer.
 
@@ -65,16 +65,18 @@ def word_detokenize(
         print(word_detokenize(["เรา", "เล่น"]))
         # output: เราเล่น
     """
-    _list_all = []
+    list_all = []
+
     if isinstance(segments[0], str):
         segments = [segments]
+
     from pythainlp import thai_characters
 
     for i, s in enumerate(segments):
-        _list_sents = []
-        _add_index = []
-        _space_index = []
-        _mark_index = []
+        list_sents = []
+        add_index = []
+        space_index = []
+        mark_index = []
         for j, w in enumerate(s):
             if j > 0:
                 # previous word
@@ -85,35 +87,36 @@ def word_detokenize(
                     and not w.isspace()
                     and not p_w.isspace()
                 ):
-                    _list_sents.append(" ")
-                    _add_index.append(j)
+                    list_sents.append(" ")
+                    add_index.append(j)
                 # if previous word is number or other language and is not space
                 elif p_w[0] not in thai_characters and not p_w.isspace():
-                    _list_sents.append(" ")
-                    _add_index.append(j)
+                    list_sents.append(" ")
+                    add_index.append(j)
                 # if word is Thai iteration mark
                 elif w == "ๆ":
                     if not p_w.isspace():
-                        _list_sents.append(" ")
-                    _mark_index.append(j)
-                elif w.isspace() and j - 1 not in _space_index:
-                    _space_index.append(j)
-                elif j - 1 in _mark_index:
-                    _list_sents.append(" ")
-            _list_sents.append(w)
-        _list_all.append(_list_sents)
+                        list_sents.append(" ")
+                    mark_index.append(j)
+                elif w.isspace() and j - 1 not in space_index:
+                    space_index.append(j)
+                elif j - 1 in mark_index:
+                    list_sents.append(" ")
+            list_sents.append(w)
+        list_all.append(list_sents)
+
     if output == "list":
-        return _list_all
-    else:
-        _text = []
-        for i in _list_all:
-            _text.append("".join(i))
-        return " ".join(_text)
+        return list_all
+
+    text = []
+    for i in list_all:
+        text.append("".join(i))
+    return " ".join(text)
 
 
 def word_tokenize(
     text: str,
-    custom_dict: Trie = None,
+    custom_dict: Trie = Trie([]),
     engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
     keep_whitespace: bool = True,
     join_broken_num: bool = True,
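Reviewer note: the hunks above only rename locals (dropping the underscore prefix) and reorder the `Union` members; `word_detokenize` behaves the same. A minimal usage sketch of the public API — the joined output comes from the function's own docstring, while the `"list"` output shape is an assumption from reading the loop above:

```python
from pythainlp.tokenize import word_detokenize

tokens = ["เรา", "เล่น"]
print(word_detokenize(tokens))  # เราเล่น — adjacent Thai words join with no space
# With output="list", each input sentence should come back as a token list,
# with any inserted spaces kept as items of their own.
print(word_detokenize(tokens, output="list"))
```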
@@ -290,7 +293,7 @@ def word_tokenize(
 
         if isinstance(custom_dict, str):
             segments = segment(text, custom_dict=custom_dict)
-        elif not isinstance(custom_dict, str) and custom_dict is not None:
+        elif not isinstance(custom_dict, str) and not custom_dict:
            raise ValueError(
                 f"""Tokenizer \"{engine}\":
                 custom_dict must be a str.
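The validation flip from `custom_dict is not None` to `not custom_dict` matches the new falsy `Trie([])` default in the signature above. For the trie-backed engines such as newmm, callers typically build the dictionary with `dict_trie`; a sketch, where the added word is only an illustrative entry:

```python
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util import dict_trie

# Extend the stock word list with one custom entry so newmm keeps it whole.
words = set(thai_words()) | {"ปัญญาประดิษฐ์"}  # "artificial intelligence"
trie = dict_trie(dict_source=words)

print(word_tokenize("ฉันชอบปัญญาประดิษฐ์", custom_dict=trie, engine="newmm"))
```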
@@ -415,11 +418,12 @@ def sent_tokenize(
         segments = segment.split_into_sentences(text)
     elif engine.startswith("wtp"):
         if "-" not in engine:
-            _size = "mini"
+            _size = "mini"
         else:
             _size = engine.split("-")[-1]
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(text,size=_size,tokenize="sentence")
+
+        segments = segment(text, size=_size, tokenize="sentence")
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
@@ -435,8 +439,8 @@
 def paragraph_tokenize(
     text: str,
     engine: str = "wtp-mini",
-    paragraph_threshold:float = 0.5,
-    style:str = 'newline',
+    paragraph_threshold: float = 0.5,
+    style: str = "newline",
 ) -> List[List[str]]:
     """
     Paragraph tokenizer.
@@ -479,23 +483,25 @@ def paragraph_tokenize(
     """
     if engine.startswith("wtp"):
         if "-" not in engine:
-            _size = "mini"
+            size = "mini"
         else:
-            _size = engine.split("-")[-1]
+            size = engine.split("-")[-1]
+
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(
-            text,
-            size=_size,
-            tokenize="paragraph",
-            paragraph_threshold=paragraph_threshold,
-            style=style,
-        )
 
+        segments = segment(
+            text,
+            size=size,
+            tokenize="paragraph",
+            paragraph_threshold=paragraph_threshold,
+            style=style,
+        )
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
             It might be a typo; if not, please consult our document."""
         )
+
     return segments
 
 
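paragraph_tokenize follows the same wtp dispatch, forwarding `paragraph_threshold` and `style` untouched. A sketch under the same wtpsplit assumption — the sample text and the expected list-of-paragraphs shape are illustrative, not taken from the source:

```python
from pythainlp.tokenize import paragraph_tokenize

text = "ฉันไปโรงเรียน เธอไปตลาด"
# Requires the optional wtpsplit package and a model download on first use:
# paragraphs = paragraph_tokenize(text, engine="wtp-mini", paragraph_threshold=0.5)
# Each element should be one paragraph, itself a list of sentences.
```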
@@ -622,7 +628,7 @@ def subword_tokenize(
 
 def syllable_tokenize(
     text: str,
-    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
     keep_whitespace: bool = True,
 ) -> List[str]:
     """
@@ -652,9 +658,7 @@ def syllable_tokenize(
             It might be a typo; if not, please consult our document."""
         )
     return subword_tokenize(
-        text=text,
-        engine=engine,
-        keep_whitespace=keep_whitespace
+        text=text, engine=engine, keep_whitespace=keep_whitespace
     )
 
 
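syllable_tokenize stays a thin wrapper over subword_tokenize; the hunk only collapses the keyword arguments onto one line. Usage is unchanged — the split shown is indicative and depends on the default engine:

```python
from pythainlp.tokenize import syllable_tokenize

print(syllable_tokenize("ภาษาไทย"))  # e.g. ['ภา', 'ษา', 'ไทย']
```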
@@ -727,7 +731,7 @@ class Tokenizer:
 
     def __init__(
         self,
-        custom_dict: Union[Trie, Iterable[str], str] = None,
+        custom_dict: Union[Trie, Iterable[str], str] = [],
         engine: str = "newmm",
         keep_whitespace: bool = True,
         join_broken_num: bool = True,
@@ -743,7 +747,7 @@ def __init__(
         :param bool keep_whitespace: True to keep whitespace, a common mark
             for end of phrase in Thai
         """
-        self.__trie_dict = None
+        self.__trie_dict = Trie([])
        if custom_dict:
             self.__trie_dict = dict_trie(custom_dict)
         else:
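Tokenizer now starts from an empty `Trie([])` instead of `None`, so an omitted or empty `custom_dict` falls through to the `else` branch and the engine's stock dictionary. A usage sketch — the two dictionary entries are illustrative only:

```python
from pythainlp.tokenize import Tokenizer

# Reusable tokenizer bound to a tiny custom dictionary.
tokenizer = Tokenizer(custom_dict={"ปัญญา", "ประดิษฐ์"}, engine="newmm")
print(tokenizer.word_tokenize("ปัญญาประดิษฐ์"))  # expect ['ปัญญา', 'ประดิษฐ์']

# Omitting custom_dict now means "empty" rather than None, and the
# constructor falls back to the engine's default dictionary.
default_tokenizer = Tokenizer(engine="newmm")
```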