From 3f3b74ca00779ded08ccf7a166ca22bc1072dcf8 Mon Sep 17 00:00:00 2001
From: Pavarissy <69553539+pavaris-pm@users.noreply.github.com>
Date: Sat, 17 Jun 2023 13:52:41 +0700
Subject: [PATCH 1/2] Update core.py by adding paragraph_threshold

---
 pythainlp/tokenize/core.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 73b98a88a..587c15eb9 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -439,7 +439,7 @@ def sent_tokenize(
     return segments


-def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
+def paragraph_tokenize(text: str, engine: str = "wtp-mini", paragraph_threshold:float=0.5) -> List[List[str]]:
     """
     Paragraph tokenizer.

@@ -485,7 +485,8 @@ def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
         else:
             _size = engine.split("-")[-1]
         from pythainlp.tokenize.wtsplit import tokenize as segment
-        segments = segment(text,size=_size,tokenize="paragraph")
+        segments = segment(text,size=_size,tokenize="paragraph",paragraph_threshold=paragraph_threshold)
+
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.

From 211b13bd1b3657d1d036190820b8f3d5f9daa28c Mon Sep 17 00:00:00 2001
From: Pavarissy <69553539+pavaris-pm@users.noreply.github.com>
Date: Sat, 17 Jun 2023 13:53:17 +0700
Subject: [PATCH 2/2] Update wtsplit.py by adding paragraph_threshold

---
 pythainlp/tokenize/wtsplit.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py
index 20c8a8eb1..6364aeffa 100644
--- a/pythainlp/tokenize/wtsplit.py
+++ b/pythainlp/tokenize/wtsplit.py
@@ -28,7 +28,8 @@ def _tokenize(
     text:str,
     lang_code:str="th",
     model:str="wtp-bert-mini",
-    tokenize:str="sentence"
+    tokenize:str="sentence",
+    paragraph_threshold:float=0.5,
 )-> List[str]:
     global _MODEL_NAME,_MODEL
     if _MODEL_NAME != model:
@@ -40,11 +41,12 @@ def _tokenize(
         return _MODEL.split(
             text,
             lang_code=lang_code,
-            do_paragraph_segmentation=True
+            do_paragraph_segmentation=True,
+            paragraph_threshold=paragraph_threshold
         )


-def tokenize(text:str, size:str="mini", tokenize:str="sentence")-> List[str]:
+def tokenize(text:str, size:str="mini", tokenize:str="sentence", paragraph_threshold:float=0.5)-> List[str]:
    _model_load=""
    if size=="tiny":
        _model_load="wtp-bert-tiny"
@@ -54,4 +56,4 @@ def tokenize(text:str, size:str="mini", tokenize:str="sentence")-> List[str]:
        _model_load="wtp-canine-s-12l"
    else: # mini
        _model_load="wtp-bert-mini"
-    return _tokenize(text, model=_model_load,tokenize=tokenize)
+    return _tokenize(text, model=_model_load,tokenize=tokenize,paragraph_threshold=paragraph_threshold)
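
A minimal usage sketch of the new parameter, assuming both patches are applied
on a PyThaiNLP checkout with the wtpsplit dependency installed, and assuming
paragraph_tokenize is re-exported from pythainlp.tokenize as the other
tokenizers in core.py are; the Thai sample text and the threshold values are
illustrative only:

    # Sketch only: assumes these two patches are applied and wtpsplit is
    # installed (pip install wtpsplit). The first call downloads the
    # wtp-bert-mini model.
    from pythainlp.tokenize import paragraph_tokenize

    text = (
        "วันนี้อากาศดีมาก จึงออกไปเดินเล่นที่สวนสาธารณะ "
        "กลับถึงบ้านตอนเย็นแล้วทำอาหารกินเอง"
    )

    # Default behaviour: paragraph_threshold=0.5, matching the value the
    # patches use as the keyword default.
    paragraphs = paragraph_tokenize(text, engine="wtp-mini")

    # A higher threshold requires higher model confidence before starting a
    # new paragraph, so the text should be grouped into fewer paragraphs.
    fewer = paragraph_tokenize(text, engine="wtp-mini", paragraph_threshold=0.9)

Exposing the threshold as a keyword argument with a fixed default (rather than
inheriting wtpsplit's own default) keeps existing calls to paragraph_tokenize
reproducible across wtpsplit versions while letting callers tune how eagerly
paragraph breaks are inserted.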