@@ -48,7 +48,7 @@ def clause_tokenize(doc: List[str]) -> List[List[str]]:
 
 def word_detokenize(
     segments: Union[List[List[str]], List[str]], output: str = "str"
-) -> Union[str, List[str]]:
+) -> Union[List[str], str]:
     """
     Word detokenizer.
 
@@ -65,16 +65,18 @@ def word_detokenize(
         print(word_detokenize(["เรา", "เล่น"]))
         # output: เราเล่น
     """
-    _list_all = []
+    list_all = []
+
     if isinstance(segments[0], str):
         segments = [segments]
+
     from pythainlp import thai_characters
 
     for i, s in enumerate(segments):
-        _list_sents = []
-        _add_index = []
-        _space_index = []
-        _mark_index = []
+        list_sents = []
+        add_index = []
+        space_index = []
+        mark_index = []
         for j, w in enumerate(s):
             if j > 0:
                 # previous word
@@ -85,35 +87,36 @@ def word_detokenize(
                     and not w.isspace()
                     and not p_w.isspace()
                 ):
-                    _list_sents.append(" ")
-                    _add_index.append(j)
+                    list_sents.append(" ")
+                    add_index.append(j)
                 # if previous word is number or other language and is not space
                 elif p_w[0] not in thai_characters and not p_w.isspace():
-                    _list_sents.append(" ")
-                    _add_index.append(j)
+                    list_sents.append(" ")
+                    add_index.append(j)
                 # if word is Thai iteration mark
                 elif w == "ๆ":
                     if not p_w.isspace():
-                        _list_sents.append(" ")
-                    _mark_index.append(j)
-                elif w.isspace() and j - 1 not in _space_index:
-                    _space_index.append(j)
-                elif j - 1 in _mark_index:
-                    _list_sents.append(" ")
-            _list_sents.append(w)
-        _list_all.append(_list_sents)
+                        list_sents.append(" ")
+                    mark_index.append(j)
+                elif w.isspace() and j - 1 not in space_index:
+                    space_index.append(j)
+                elif j - 1 in mark_index:
+                    list_sents.append(" ")
+            list_sents.append(w)
+        list_all.append(list_sents)
+
     if output == "list":
-        return _list_all
-    else:
-        _text = []
-        for i in _list_all:
-            _text.append("".join(i))
-        return " ".join(_text)
+        return list_all
+
+    text = []
+    for i in list_all:
+        text.append("".join(i))
+    return " ".join(text)
 
 
 def word_tokenize(
     text: str,
-    custom_dict: Trie = None,
+    custom_dict: Trie = Trie([]),
     engine: str = DEFAULT_WORD_TOKENIZE_ENGINE,
     keep_whitespace: bool = True,
     join_broken_num: bool = True,
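Reviewer note: the hunks above only rename locals (dropping the underscore prefix) and reorder the `Union` members; `word_detokenize` behaves the same. A minimal usage sketch of the public API — the joined output comes from the function's own docstring, while the `"list"` output shape is an assumption from reading the loop above:

```python
from pythainlp.tokenize import word_detokenize

tokens = ["เรา", "เล่น"]
print(word_detokenize(tokens))  # เราเล่น — adjacent Thai words join with no space
# With output="list", each input sentence should come back as a token list,
# with any inserted spaces kept as items of their own.
print(word_detokenize(tokens, output="list"))
```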
@@ -290,7 +293,7 @@ def word_tokenize(
 
         if isinstance(custom_dict, str):
             segments = segment(text, custom_dict=custom_dict)
-        elif not isinstance(custom_dict, str) and custom_dict is not None:
+        elif not isinstance(custom_dict, str) and not custom_dict:
            raise ValueError(
                 f"""Tokenizer \"{engine}\":
                 custom_dict must be a str.
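The validation flip from `custom_dict is not None` to `not custom_dict` matches the new falsy `Trie([])` default in the signature above. For the trie-backed engines such as newmm, callers typically build the dictionary with `dict_trie`; a sketch, where the added word is only an illustrative entry:

```python
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util import dict_trie

# Extend the stock word list with one custom entry so newmm keeps it whole.
words = set(thai_words()) | {"ปัญญาประดิษฐ์"}  # "artificial intelligence"
trie = dict_trie(dict_source=words)

print(word_tokenize("ฉันชอบปัญญาประดิษฐ์", custom_dict=trie, engine="newmm"))
```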
@@ -415,11 +418,12 @@ def sent_tokenize(
         segments = segment.split_into_sentences(text)
     elif engine.startswith("wtp"):
         if "-" not in engine:
-            _size = "mini"
+            _size = "mini"
         else:
             _size = engine.split("-")[-1]
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(text,size=_size,tokenize="sentence")
+
+        segments = segment(text, size=_size, tokenize="sentence")
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
@@ -435,8 +439,8 @@
 def paragraph_tokenize(
     text: str,
     engine: str = "wtp-mini",
-    paragraph_threshold:float = 0.5,
-    style:str = 'newline',
+    paragraph_threshold: float = 0.5,
+    style: str = "newline",
 ) -> List[List[str]]:
     """
     Paragraph tokenizer.
@@ -479,23 +483,25 @@ def paragraph_tokenize(
     """
     if engine.startswith("wtp"):
         if "-" not in engine:
-            _size = "mini"
+            size = "mini"
         else:
-            _size = engine.split("-")[-1]
+            size = engine.split("-")[-1]
+
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(
-            text,
-            size=_size,
-            tokenize="paragraph",
-            paragraph_threshold=paragraph_threshold,
-            style=style,
-        )
 
+        segments = segment(
+            text,
+            size=size,
+            tokenize="paragraph",
+            paragraph_threshold=paragraph_threshold,
+            style=style,
+        )
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
             It might be a typo; if not, please consult our document."""
         )
+
     return segments
 
 
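paragraph_tokenize follows the same wtp dispatch, forwarding `paragraph_threshold` and `style` untouched. A sketch under the same wtpsplit assumption — the sample text and the expected list-of-paragraphs shape are illustrative, not taken from the source:

```python
from pythainlp.tokenize import paragraph_tokenize

text = "ฉันไปโรงเรียน เธอไปตลาด"
# Requires the optional wtpsplit package and a model download on first use:
# paragraphs = paragraph_tokenize(text, engine="wtp-mini", paragraph_threshold=0.5)
# Each element should be one paragraph, itself a list of sentences.
```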
@@ -622,7 +628,7 @@ def subword_tokenize(
 
 def syllable_tokenize(
     text: str,
-    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
+    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
     keep_whitespace: bool = True,
 ) -> List[str]:
     """
@@ -652,9 +658,7 @@ def syllable_tokenize(
             It might be a typo; if not, please consult our document."""
         )
     return subword_tokenize(
-        text=text,
-        engine=engine,
-        keep_whitespace=keep_whitespace
+        text=text, engine=engine, keep_whitespace=keep_whitespace
     )
 
 
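syllable_tokenize stays a thin wrapper over subword_tokenize; the hunk only collapses the keyword arguments onto one line. Usage is unchanged — the split shown is indicative and depends on the default engine:

```python
from pythainlp.tokenize import syllable_tokenize

print(syllable_tokenize("ภาษาไทย"))  # e.g. ['ภา', 'ษา', 'ไทย']
```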
@@ -727,7 +731,7 @@ class Tokenizer:
 
     def __init__(
         self,
-        custom_dict: Union[Trie, Iterable[str], str] = None,
+        custom_dict: Union[Trie, Iterable[str], str] = [],
         engine: str = "newmm",
         keep_whitespace: bool = True,
         join_broken_num: bool = True,
@@ -743,7 +747,7 @@ def __init__(
         :param bool keep_whitespace: True to keep whitespace, a common mark
             for end of phrase in Thai
         """
-        self.__trie_dict = None
+        self.__trie_dict = Trie([])
        if custom_dict:
             self.__trie_dict = dict_trie(custom_dict)
         else:
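Tokenizer now starts from an empty `Trie([])` instead of `None`, so an omitted or empty `custom_dict` falls through to the `else` branch and the engine's stock dictionary. A usage sketch — the two dictionary entries are illustrative only:

```python
from pythainlp.tokenize import Tokenizer

# Reusable tokenizer bound to a tiny custom dictionary.
tokenizer = Tokenizer(custom_dict={"ปัญญา", "ประดิษฐ์"}, engine="newmm")
print(tokenizer.word_tokenize("ปัญญาประดิษฐ์"))  # expect ['ปัญญา', 'ประดิษฐ์']

# Omitting custom_dict now means "empty" rather than None, and the
# constructor falls back to the engine's default dictionary.
default_tokenizer = Tokenizer(engine="newmm")
```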