From 11087cb63581c63ae638053335a8b6b5d82bfcc0 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 6 Jun 2023 02:43:06 +0700
Subject: [PATCH 1/3] Add wtpsplit to sentence segmentation

Add sentence segmentation with 'wtpsplit' #803
---
 docker_requirements.txt       |  1 +
 pythainlp/tokenize/core.py    | 13 ++++++++
 pythainlp/tokenize/wtsplit.py | 57 +++++++++++++++++++++++++++++++++++
 setup.py                      |  2 ++
 tests/test_tokenize.py        | 24 +++++++++++++++
 5 files changed, 97 insertions(+)
 create mode 100644 pythainlp/tokenize/wtsplit.py

diff --git a/docker_requirements.txt b/docker_requirements.txt
index 1cf59d425..6394b2ed2 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -34,3 +34,4 @@ khanaa==0.0.6
 spacy_thai==0.7.1
 esupar==1.3.8
 ufal.chu-liu-edmonds==1.0.2
+wtpsplit==1.0.1
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 2482d08ff..5b2f04acf 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -344,6 +344,12 @@ def sent_tokenize(
     * *thaisum* - The implementation of sentence segmentator from \
       Nakhun Chumpolsathien, 2020
     * *tltk* - split by `TLTK `_.,
+    * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
+      It supports several model sizes: ``wtp`` and ``wtp-mini`` use the \
+      ``wtp-bert-mini`` model (the default), \
+      ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
+      ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
+      and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.
     * *whitespace+newline* - split by whitespaces and newline.
     * *whitespace* - split by whitespaces. Specifiaclly, with \
       :class:`regex` pattern ``r" +"``
@@ -414,6 +420,13 @@ def sent_tokenize(
 
         segment = segmentor()
         segments = segment.split_into_sentences(text)
+    elif engine.startswith("wtp"):
+        if "-" not in engine:
+            _size = "mini"
+        else:
+            _size = engine.split("-")[-1]
+        from pythainlp.tokenize.wtsplit import tokenize as segment
+        segments = segment(text, size=_size, tokenize="sentence")
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
diff --git a/pythainlp/tokenize/wtsplit.py b/pythainlp/tokenize/wtsplit.py
new file mode 100644
index 000000000..20c8a8eb1
--- /dev/null
+++ b/pythainlp/tokenize/wtsplit.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Where's the Point? Self-Supervised Multilingual Punctuation-Agnostic Sentence Segmentation
+
+GitHub: https://github.com/bminixhofer/wtpsplit
+"""
+from typing import List
+from wtpsplit import WtP
+
+_MODEL = None
+_MODEL_NAME = None
+
+
+def _tokenize(
+    text: str,
+    lang_code: str = "th",
+    model: str = "wtp-bert-mini",
+    tokenize: str = "sentence"
+) -> List[str]:
+    global _MODEL_NAME, _MODEL
+    if _MODEL_NAME != model:
+        _MODEL = WtP(model_name_or_model=model)
+        _MODEL_NAME = model
+    if tokenize == "sentence":
+        return _MODEL.split(text, lang_code=lang_code)
+    else:  # paragraph
+        return _MODEL.split(
+            text,
+            lang_code=lang_code,
+            do_paragraph_segmentation=True
+        )
+
+
+def tokenize(text: str, size: str = "mini", tokenize: str = "sentence") -> List[str]:
+    _model_load = ""
+    if size == "tiny":
+        _model_load = "wtp-bert-tiny"
+    elif size == "base":
+        _model_load = "wtp-canine-s-1l"
+    elif size == "large":
+        _model_load = "wtp-canine-s-12l"
+    else:  # mini
+        _model_load = "wtp-bert-mini"
+    return _tokenize(text, model=_model_load, tokenize=tokenize)
diff --git a/setup.py b/setup.py
index c03533bf7..425ce176a 100644
--- a/setup.py
+++ b/setup.py
@@ -78,6 +78,7 @@
         "sentencepiece>=0.1.91"
     ],
     "mt5": ["transformers>=4.6.0", "sentencepiece>=0.1.91"],
+    "wtp": ["transformers>=4.6.0", "wtpsplit>=1.0.1"],
     "wordnet": ["nltk>=3.3"],
     "generate": ["fastai<2.0"],
     "sefr_cut": ["sefr_cut>=1.1"],
@@ -136,6 +137,7 @@
         "onnxruntime>=1.10.0",
         "thai_nner",
         "wunsen>=0.0.3",
+        "wtpsplit>=1.0.1",
         "spacy_thai>=0.7.1",
         "ufal.chu-liu-edmonds>=1.0.2",
     ],
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 18e4cacbc..56d94c16d 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -306,6 +306,30 @@ def test_sent_tokenize(self):
                 engine="thaisum",
             ),
         )
+        self.assertIsNotNone(
+            sent_tokenize(
+                sent_3,
+                engine="wtp",
+            ),
+        )
+        self.assertIsNotNone(
+            sent_tokenize(
+                sent_3,
+                engine="wtp-tiny",
+            ),
+        )
+        self.assertIsNotNone(
+            sent_tokenize(
+                sent_3,
+                engine="wtp-base",
+            ),
+        )
+        self.assertIsNotNone(
+            sent_tokenize(
+                sent_3,
+                engine="wtp-large",
+            ),
+        )
         self.assertFalse(
             " "
             in sent_tokenize(

From f71a099334c3aad7c1a991f4b80b4c377eec23b8 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 6 Jun 2023 03:15:46 +0700
Subject: [PATCH 2/3] Add paragraph_tokenize

Tokenizes text into paragraphs.
---
 docs/api/tokenize.rst          |  1 +
 pythainlp/tokenize/__init__.py |  2 ++
 pythainlp/tokenize/core.py     | 55 ++++++++++++++++++++++++++++++++++
 tests/test_tokenize.py         | 34 +++++++++++++++--------
 4 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index ced072da4..dcec5dc07 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -10,6 +10,7 @@ Modules
 
 .. autofunction:: clause_tokenize
 .. autofunction:: sent_tokenize
+.. autofunction:: paragraph_tokenize
 .. autofunction:: subword_tokenize
 .. autofunction:: word_tokenize
 .. autofunction:: word_detokenize
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 39d7a7151..674153cc7 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -25,6 +25,7 @@
     "subword_tokenize",
     "word_tokenize",
     "word_detokenize",
+    "paragraph_tokenize",
 ]
 
 from pythainlp.corpus import thai_syllables, thai_words
@@ -46,6 +47,7 @@
     subword_tokenize,
     word_tokenize,
     word_detokenize,
+    paragraph_tokenize,
 )
 
 from pythainlp.corpus import get_corpus as _get_corpus
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 5b2f04acf..73b98a88a 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -439,6 +439,61 @@ def sent_tokenize(
     return segments
 
 
+def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
+    """
+    Paragraph tokenizer.
+
+    Tokenizes text into paragraphs.
+
+    :param str text: text to be tokenized
+    :param str engine: the name of the paragraph tokenizer
+    :return: list of paragraphs, each a list of sentences
+    :rtype: List[List[str]]
+    **Options for engine**
+        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
+          It supports several model sizes: ``wtp`` and ``wtp-mini`` use the \
+          ``wtp-bert-mini`` model (the default), \
+          ``wtp-tiny`` uses the ``wtp-bert-tiny`` model, \
+          ``wtp-base`` uses the ``wtp-canine-s-1l`` model, \
+          and ``wtp-large`` uses the ``wtp-canine-s-12l`` model.
+
+    :Example:
+
+    Split the text based on *wtp*::
+
+        from pythainlp.tokenize import paragraph_tokenize
+
+        sent = (
+            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
+            +" มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
+            +" จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
+        )
+
+        paragraph_tokenize(sent)
+        # output: [
+        # ['(1) '],
+        # [
+        #   'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต ',
+        #   'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ',
+        #   'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ',
+        #   'ณ ที่นี้'
+        # ]]
+    """
+    if engine.startswith("wtp"):
+        if "-" not in engine:
+            _size = "mini"
+        else:
+            _size = engine.split("-")[-1]
+        from pythainlp.tokenize.wtsplit import tokenize as segment
+        segments = segment(text, size=_size, tokenize="paragraph")
+    else:
+        raise ValueError(
+            f"""Tokenizer \"{engine}\" not found.
+            It might be a typo; if not, please consult our document."""
+        )
+    return segments
+
+
 def subword_tokenize(
     text: str,
     engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE,
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 56d94c16d..76d8eed29 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -23,6 +23,7 @@
     tltk,
     oskut,
     word_detokenize,
+    paragraph_tokenize,
 )
 from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
 from pythainlp.util import dict_trie
@@ -318,18 +319,18 @@ def test_sent_tokenize(self):
                 engine="wtp-tiny",
             ),
         )
-        self.assertIsNotNone(
-            sent_tokenize(
-                sent_3,
-                engine="wtp-base",
-            ),
-        )
-        self.assertIsNotNone(
-            sent_tokenize(
-                sent_3,
-                engine="wtp-large",
-            ),
-        )
+        # self.assertIsNotNone(
+        #     sent_tokenize(
+        #         sent_3,
+        #         engine="wtp-base",
+        #     ),
+        # )
+        # self.assertIsNotNone(
+        #     sent_tokenize(
+        #         sent_3,
+        #         engine="wtp-large",
+        #     ),
+        # )
         self.assertFalse(
             " "
             in sent_tokenize(
@@ -341,6 +342,15 @@ def test_sent_tokenize(self):
         with self.assertRaises(ValueError):
             sent_tokenize("ฉันไป กิน", engine="XX")  # engine does not exist
 
+    def test_paragraph_tokenize(self):
+        sent = (
+            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
+            + "จากผลงานวิจัยที่เคยทำมาในอดีต"
+            + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
+            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
+        )
+        self.assertIsNotNone(paragraph_tokenize(sent))
+
     def test_subword_tokenize(self):
         self.assertEqual(subword_tokenize(None), [])
         self.assertEqual(subword_tokenize(""), [])

From 95f4ea79bdc3065bdb4efd599be47f8729943f67 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 6 Jun 2023 12:47:39 +0700
Subject: [PATCH 3/3] Add case to test_paragraph_tokenize

---
 tests/test_tokenize.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 76d8eed29..4659ff08c 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -350,6 +350,8 @@ def test_paragraph_tokenize(self):
             + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
         )
         self.assertIsNotNone(paragraph_tokenize(sent))
+        with self.assertRaises(ValueError):
+            paragraph_tokenize(sent, engine="ai2+2thai")
 
     def test_subword_tokenize(self):
         self.assertEqual(subword_tokenize(None), [])
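
A quick usage sketch of the API added in this series (illustrative only; it assumes PyThaiNLP is installed with the new "wtp" extra, e.g. pip install pythainlp[wtp], and that wtpsplit can download its model on first use):

    # Illustrative example of the engines added in these patches.
    # Engine names map to model sizes in pythainlp/tokenize/wtsplit.py:
    # "wtp"/"wtp-mini" -> wtp-bert-mini (default), "wtp-tiny" -> wtp-bert-tiny,
    # "wtp-base" -> wtp-canine-s-1l, "wtp-large" -> wtp-canine-s-12l.
    from pythainlp.tokenize import paragraph_tokenize, sent_tokenize

    text = (
        "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
        " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
        " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
    )

    # Sentence segmentation with the default wtp-bert-mini model.
    print(sent_tokenize(text, engine="wtp"))

    # Paragraph segmentation: returns a list of paragraphs,
    # each of which is a list of sentences.
    print(paragraph_tokenize(text, engine="wtp-mini"))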