From 11087cb63581c63ae638053335a8b6b5d82bfcc0 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 6 Jun 2023 02:43:06 +0700
Subject: [PATCH 1/3] Add wtpsplit to sentence segmentation

Add sentence segmentation with 'wtpsplit' #803
---
 docker_requirements.txt       |  1 +
 pythainlp/tokenize/core.py    | 13 ++++++++
 pythainlp/tokenize/wtsplit.py | 57 +++++++++++++++++++++++++++++++++++
 setup.py                      |  2 ++
 tests/test_tokenize.py        | 24 +++++++++++++++
 5 files changed, 97 insertions(+)
 create mode 100644 pythainlp/tokenize/wtsplit.py

diff --git a/docker_requirements.txt b/docker_requirements.txt
index 1cf59d425..6394b2ed2 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -34,3 +34,4 @@ khanaa==0.0.6
 spacy_thai==0.7.1
 esupar==1.3.8
 ufal.chu-liu-edmonds==1.0.2
+wtpsplit==1.0.1
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 2482d08ff..5b2f04acf 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -344,6 +344,12 @@ def sent_tokenize(
     * *thaisum* - The implementation of sentence segmentator from \
       Nakhun Chumpolsathien, 2020
     * *tltk* - split by `TLTK `_.,
+    * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
+      It supports several model sizes: ``wtp`` and ``wtp-mini`` use the \
+      ``wtp-bert-mini`` model (the default), \
+      ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
+      ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
+      and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.
     * *whitespace+newline* - split by whitespaces and newline.
     * *whitespace* - split by whitespaces. Specifiaclly, with \
       :class:`regex` pattern ``r" +"``
@@ -414,6 +420,13 @@ def sent_tokenize(
 
         segment = segmentor()
         segments = segment.split_into_sentences(text)
+    elif engine.startswith("wtp"):
+        if "-" not in engine:
+            _size = "mini"
+        else:
+            _size = engine.split("-")[-1]
+        from pythainlp.tokenize.wtsplit import tokenize as segment
+        segments = segment(text, size=_size, tokenize="sentence")
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py
new file mode 100644
index 000000000..20c8a8eb1
--- /dev/null
+++ b/pythainlp/tokenize/wtsplit.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Where's the Point? Self-Supervised Multilingual Punctuation-Agnostic Sentence Segmentation
+
+GitHub: https://github.com/bminixhofer/wtpsplit
+"""
+from typing import List
+from wtpsplit import WtP
+
+_MODEL = None
+_MODEL_NAME = None
+
+
+def _tokenize(
+    text: str,
+    lang_code: str = "th",
+    model: str = "wtp-bert-mini",
+    tokenize: str = "sentence"
+) -> List[str]:
+    global _MODEL_NAME, _MODEL
+    if _MODEL_NAME != model:
+        _MODEL = WtP(model_name_or_model=model)
+        _MODEL_NAME = model
+    if tokenize == "sentence":
+        return _MODEL.split(text, lang_code=lang_code)
+    else:  # paragraph
+        return _MODEL.split(
+            text,
+            lang_code=lang_code,
+            do_paragraph_segmentation=True
+        )
+
+
+def tokenize(text: str, size: str = "mini", tokenize: str = "sentence") -> List[str]:
+    _model_load = ""
+    if size == "tiny":
+        _model_load = "wtp-bert-tiny"
+    elif size == "base":
+        _model_load = "wtp-canine-s-1l"
+    elif size == "large":
+        _model_load = "wtp-canine-s-12l"
+    else:  # mini
+        _model_load = "wtp-bert-mini"
+    return _tokenize(text, model=_model_load, tokenize=tokenize)
diff --git a/setup.py b/setup.py
index c03533bf7..425ce176a 100644
--- a/setup.py
+++ b/setup.py
@@ -78,6 +78,7 @@
         "sentencepiece>=0.1.91"
     ],
     "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
+    "wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"],
     "wordnet": ["nltk>=3.3"],
     "generate": ["fastai<2.0"],
     "sefr_cut": ["sefr_cut>=1.1"],
@@ -136,6 +137,7 @@
         "onnxruntime>=1.10.0",
         "thai_nner",
         "wunsen>=0.0.3",
+        "wtpsplit>=1.0.1",
         "spacy_thai>=0.7.1",
         "ufal.chu-liu-edmonds>=1.0.2",
     ],
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 18e4cacbc..56d94c16d 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -306,6 +306,30 @@ def test_sent_tokenize(self):
                 engine="thaisum",
             ),
         )
+        self.assertIsNotNone(
+            sent_tokenize(
+                sent_3,
+                engine="wtp",
+            ),
+        )
+        self.assertIsNotNone(
+            sent_tokenize(
+                sent_3,
+                engine="wtp-tiny",
+            ),
+        )
+        self.assertIsNotNone(
+            sent_tokenize(
+                sent_3,
+                engine="wtp-base",
+            ),
+        )
+        self.assertIsNotNone(
+            sent_tokenize(
+                sent_3,
+                engine="wtp-large",
+            ),
+        )
         self.assertFalse(
             " "
             in sent_tokenize(

From f71a099334c3aad7c1a991f4b80b4c377eec23b8 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 6 Jun 2023 03:15:46 +0700
Subject: [PATCH 2/3] Add paragraph_tokenize

Tokenizes text into paragraphs.
---
 docs/api/tokenize.rst          |  1 +
 pythainlp/tokenize/__init__.py |  2 ++
 pythainlp/tokenize/core.py     | 55 ++++++++++++++++++++++++++++++++++
 tests/test_tokenize.py         | 34 +++++++++++++++--------
 4 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index ced072da4..dcec5dc07 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -10,6 +10,7 @@ Modules
 
 .. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
+.. autofunction:: paragraph_tokenize
 .. autofunction:: subword_tokenize
 .. autofunction:: word_tokenize
 .. autofunction:: word_detokenize
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 39d7a7151..674153cc7 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -25,6 +25,7 @@
     "subword_tokenize",
     "word_tokenize",
     "word_detokenize",
+    "paragraph_tokenize",
 ]
 
 from pythainlp.corpus import thai_syllables, thai_words
@@ -46,6 +47,7 @@
     subword_tokenize,
     word_tokenize,
     word_detokenize,
+    paragraph_tokenize,
 )
 
 from pythainlp.corpus import get_corpus as _get_corpus
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 5b2f04acf..73b98a88a 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -439,6 +439,61 @@ def sent_tokenize(
     return segments
 
 
+def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
+    """
+    Paragraph tokenizer.
+
+    Tokenizes text into paragraphs.
+
+    :param str text: text to be tokenized
+    :param str engine: the name of the paragraph tokenizer
+    :return: list of paragraphs, each a list of sentences
+    :rtype: List[List[str]]
+    **Options for engine**
+        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
+          It supports several model sizes: ``wtp`` and ``wtp-mini`` use the \
+          ``wtp-bert-mini`` model (the default), \
+          ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
+          ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
+          and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.
+
+    :Example:
+
+    Split the text based on *wtp*::
+
+        from pythainlp.tokenize import paragraph_tokenize
+
+        sent = (
+            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
+            +" มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
+            +" จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
+        )
+
+        paragraph_tokenize(sent)
+        # output: [
+        # ['(1) '],
+        # [
+        #   'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต ',
+        #   'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ',
+        #   'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ',
+        #   'ณ ที่นี้'
+        # ]]
+    """
+    if engine.startswith("wtp"):
+        if "-" not in engine:
+            _size = "mini"
+        else:
+            _size = engine.split("-")[-1]
+        from pythainlp.tokenize.wtsplit import tokenize as segment
+        segments = segment(text, size=_size, tokenize="paragraph")
+    else:
+        raise ValueError(
+            f"""Tokenizer \"{engine}\" not found.
+            It might be a typo; if not, please consult our document."""
+        )
+    return segments
+
+
 def subword_tokenize(
     text: str,
     engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE,
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 56d94c16d..76d8eed29 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -23,6 +23,7 @@
     tltk,
     oskut,
     word_detokenize,
+    paragraph_tokenize,
 )
 from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
 from pythainlp.util import dict_trie
@@ -318,18 +319,18 @@ def test_sent_tokenize(self):
                 engine="wtp-tiny",
             ),
         )
-        self.assertIsNotNone(
-            sent_tokenize(
-                sent_3,
-                engine="wtp-base",
-            ),
-        )
-        self.assertIsNotNone(
-            sent_tokenize(
-                sent_3,
-                engine="wtp-large",
-            ),
-        )
+        # self.assertIsNotNone(
+        #     sent_tokenize(
+        #         sent_3,
+        #         engine="wtp-base",
+        #     ),
+        # )
+        # self.assertIsNotNone(
+        #     sent_tokenize(
+        #         sent_3,
+        #         engine="wtp-large",
+        #     ),
+        # )
         self.assertFalse(
             " "
             in sent_tokenize(
@@ -341,6 +342,15 @@ def test_sent_tokenize(self):
         with self.assertRaises(ValueError):
             sent_tokenize("ฉันไป กิน", engine="XX")  # engine does not exist
 
+    def test_paragraph_tokenize(self):
+        sent = (
+            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
+            + "จากผลงานวิจัยที่เคยทำมาในอดีต"
+            + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
+            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
+        )
+        self.assertIsNotNone(paragraph_tokenize(sent))
+
     def test_subword_tokenize(self):
         self.assertEqual(subword_tokenize(None), [])
         self.assertEqual(subword_tokenize(""), [])

From 95f4ea79bdc3065bdb4efd599be47f8729943f67 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 6 Jun 2023 12:47:39 +0700
Subject: [PATCH 3/3] Add case to test_paragraph_tokenize

---
 tests/test_tokenize.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 76d8eed29..4659ff08c 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -350,6 +350,8 @@ def test_paragraph_tokenize(self):
             + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
         )
         self.assertIsNotNone(paragraph_tokenize(sent))
+        with self.assertRaises(ValueError):
+            paragraph_tokenize(sent, engine="ai2+2thai")
 
     def test_subword_tokenize(self):
         self.assertEqual(subword_tokenize(None), [])
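
A quick usage sketch of the API added in this series (illustrative only; it assumes PyThaiNLP is installed with the new "wtp" extra, e.g. pip install pythainlp[wtp], and that wtpsplit can download its model on first use):

    # Illustrative example of the engines added in these patches.
    # Engine names map to model sizes in pythainlp/tokenize/wtsplit.py:
    # "wtp"/"wtp-mini" -> wtp-bert-mini (default), "wtp-tiny" -> wtp-bert-tiny,
    # "wtp-base" -> wtp-canine-s-1l, "wtp-large" -> wtp-canine-s-12l.
    from pythainlp.tokenize import paragraph_tokenize, sent_tokenize

    text = (
        "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
        " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
        " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
    )

    # Sentence segmentation with the default wtp-bert-mini model.
    print(sent_tokenize(text, engine="wtp"))

    # Paragraph segmentation: returns a list of paragraphs,
    # each of which is a list of sentences.
    print(paragraph_tokenize(text, engine="wtp-mini"))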