diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 23c08a472..ea33d00df 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -446,7 +446,12 @@ def sent_tokenize(
     return segments
 
 
-def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:float=0.5) -> List[List[str]]:
+def paragraph_tokenize(
+    text: str,
+    engine: str = "wtp-mini",
+    paragraph_threshold:float=0.5,
+    style:str='newline',
+    ) -> List[List[str]]:
     """
     Paragraph tokenizer.
 
@@ -492,7 +497,13 @@ def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:
     else:
         _size = engine.split("-")[-1]
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(text,size=_size,tokenize="paragraph",paragraph_threshold=paragraph_threshold)
+        segments = segment(
+            text,
+            size=_size,
+            tokenize="paragraph",
+            paragraph_threshold=paragraph_threshold,
+            style=style,
+        )
     else:
         raise ValueError(
diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py
index 6364aeffa..2bcbd4183 100644
--- a/pythainlp/tokenize/wtsplit.py
+++ b/pythainlp/tokenize/wtsplit.py
@@ -30,6 +30,7 @@ def _tokenize(
     model:str="wtp-bert-mini",
     tokenize:str="sentence",
     paragraph_threshold:float=0.5,
+    style:str='newline',
 )-> List[str]:
     global _MODEL_NAME,_MODEL
     if _MODEL_NAME != model:
@@ -38,15 +39,34 @@ def _tokenize(
     if tokenize=="sentence":
         return _MODEL.split(text,lang_code=lang_code)
     else: # Paragraph
-        return _MODEL.split(
-            text,
-            lang_code=lang_code,
-            do_paragraph_segmentation=True,
-            paragraph_threshold=paragraph_threshold
+        if style=='newline':
+            return _MODEL.split(
+                text,
+                lang_code=lang_code,
+                do_paragraph_segmentation=True,
+                paragraph_threshold=paragraph_threshold
+            )
+        elif style=='opus100':
+            return _MODEL.split(
+                text,
+                lang_code=lang_code,
+                do_paragraph_segmentation=True,
+                threshold=paragraph_threshold,
+                style=style,
+            )
+        else:
+            raise ValueError(
+                f"""Segmentation style \"{style}\" not found.
+                It might be a typo; if not, please consult our document."""
             )
-
-def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_threshold:float=0.5)-> List[str]:
+def tokenize(
+    text:str,
+    size:str="mini",
+    tokenize:str="sentence",
+    paragraph_threshold:float=0.5,
+    style:str='newline',
+    )-> List[str]:
     _model_load=""
     if size=="tiny":
         _model_load="wtp-bert-tiny"
@@ -56,4 +76,10 @@ def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_thres
         _model_load="wtp-canine-s-12l"
     else: # mini
         _model_load="wtp-bert-mini"
-    return _tokenize(text, model=_model_load,tokenize=tokenize,paragraph_threshold=paragraph_threshold)
+    return _tokenize(
+        text,
+        model=_model_load,
+        tokenize=tokenize,
+        paragraph_threshold=paragraph_threshold,
+        style=style,
+    )
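
For reviewers, a minimal sketch of how the new style parameter is exercised end to end. This is not part of the patch: it assumes the optional wtpsplit dependency is installed and that the wtp-bert-mini model can be downloaded on first use; the Thai sample text and printed output are illustrative only.

    # Sketch only: exercises paragraph_tokenize() with both styles wired
    # through by this patch. Assumes `pip install wtpsplit`; the model is
    # downloaded on first use.
    from pythainlp.tokenize import paragraph_tokenize

    text = "ผมชอบอ่านหนังสือมาก อ่านทุกวันหลังเลิกงาน วันนี้ฝนตกหนัก เลยอยู่บ้านทั้งวัน"

    # Default behaviour (unchanged): WtP paragraph segmentation, with
    # paragraph_threshold forwarded to WtP's paragraph_threshold argument.
    paragraphs = paragraph_tokenize(text, engine="wtp-mini", style="newline")

    # New in this patch: opus100-style segmentation, where
    # paragraph_threshold is forwarded to WtP as threshold instead.
    paragraphs_opus = paragraph_tokenize(
        text,
        engine="wtp-mini",
        paragraph_threshold=0.5,
        style="opus100",
    )

    # Both calls return List[List[str]]: paragraphs as lists of sentences.
    print(paragraphs)
    print(paragraphs_opus)

    # Any other value reaches the new else branch in wtsplit._tokenize
    # and raises ValueError.
    try:
        paragraph_tokenize(text, style="no-such-style")
    except ValueError as err:
        print(err)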