From 3f3b74ca00779ded08ccf7a166ca22bc1072dcf8 Mon Sep 17 00:00:00 2001
From: Pavarissy <69553539+pavaris-pm@users.noreply.github.com>
Date: Sat, 17 Jun 2023 13:52:41 +0700
Subject: [PATCH 1/2] Update core.py by adding paragraph_threshold

---
 pythainlp/tokenize/core.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 73b98a88a..587c15eb9 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -439,7 +439,7 @@ def sent_tokenize(
     return segments


-def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
+def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:float=0.5) -> List[List[str]]:
     """
     Paragraph tokenizer.

@@ -485,7 +485,8 @@ def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
         else:
             _size = engine.split("-")[-1]
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(text,size=_size,tokenize="paragraph")
+        segments = segment(text,size=_size,tokenize="paragraph",paragraph_threshold=paragraph_threshold)
+
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.

From 211b13bd1b3657d1d036190820b8f3d5f9daa28c Mon Sep 17 00:00:00 2001
From: Pavarissy <69553539+pavaris-pm@users.noreply.github.com>
Date: Sat, 17 Jun 2023 13:53:17 +0700
Subject: [PATCH 2/2] Update wtsplit.py by adding paragraph_threshold

---
 pythainlp/tokenize/wtsplit.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py
index 20c8a8eb1..6364aeffa 100644
--- a/pythainlp/tokenize/wtsplit.py
+++ b/pythainlp/tokenize/wtsplit.py
@@ -28,7 +28,8 @@ def _tokenize(
     text:str,
     lang_code:str="th",
     model:str="wtp-bert-mini",
-    tokenize:str="sentence"
+    tokenize:str="sentence",
+    paragraph_threshold:float=0.5,
 )-> List[str]:
     global _MODEL_NAME,_MODEL
     if _MODEL_NAME != model:
@@ -40,11 +41,12 @@ def _tokenize(
         return _MODEL.split(
             text,
             lang_code=lang_code,
-            do_paragraph_segmentation=True
+            do_paragraph_segmentation=True,
+            paragraph_threshold=paragraph_threshold
         )


-def tokenize(text:str, size:str="mini", tokenize:str="sentence")-> List[str]:
+def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_threshold:float=0.5)-> List[str]:
    _model_load=""
    if size=="tiny":
        _model_load="wtp-bert-tiny"
@@ -54,4 +56,4 @@ def tokenize(text:str, size:str="mini", tokenize:str="sentence")-> List[str]:
        _model_load="wtp-canine-s-12l"
    else: # mini
        _model_load="wtp-bert-mini"
-    return _tokenize(text, model=_model_load,tokenize=tokenize)
+    return _tokenize(text, model=_model_load,tokenize=tokenize,paragraph_threshold=paragraph_threshold)
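
A minimal usage sketch of the new parameter, assuming both patches are applied
on a PyThaiNLP checkout with the wtpsplit dependency installed, and assuming
paragraph_tokenize is re-exported from pythainlp.tokenize as the other
tokenizers in core.py are; the Thai sample text and the threshold values are
illustrative only:

    # Sketch only: assumes these two patches are applied and wtpsplit is
    # installed (pip install wtpsplit). The first call downloads the
    # wtp-bert-mini model.
    from pythainlp.tokenize import paragraph_tokenize

    text = (
        "วันนี้อากาศดีมาก จึงออกไปเดินเล่นที่สวนสาธารณะ "
        "กลับถึงบ้านตอนเย็นแล้วทำอาหารกินเอง"
    )

    # Default behaviour: paragraph_threshold=0.5, matching the value the
    # patches use as the keyword default.
    paragraphs = paragraph_tokenize(text, engine="wtp-mini")

    # A higher threshold requires higher model confidence before starting a
    # new paragraph, so the text should be grouped into fewer paragraphs.
    fewer = paragraph_tokenize(text, engine="wtp-mini", paragraph_threshold=0.9)

Exposing the threshold as a keyword argument with a fixed default (rather than
inheriting wtpsplit's own default) keeps existing calls to paragraph_tokenize
reproducible across wtpsplit versions while letting callers tune how eagerly
paragraph breaks are inserted.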