From 9b56c558cf6464aaf67d870daaad835c4962cb93 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 14 Nov 2023 06:06:30 +0700
Subject: [PATCH 1/9] Update macos-test.yml

---
 .github/workflows/macos-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
index 8beac057c..0618ad4b7 100644
--- a/.github/workflows/macos-test.yml
+++ b/.github/workflows/macos-test.yml
@@ -49,7 +49,7 @@ jobs:
         conda info
         conda list
         python -m pip install --upgrade pip
-        pip install pytest coverage coveralls
+        pip install pytest coverage coveralls typing_extensions==4.5.0
         conda install -c conda-forge icu
         conda install -c conda-forge pyicu
         SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt

From b3113b21a65c3744baf859fff948de4968cbf2b8 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 14 Nov 2023 06:30:34 +0700
Subject: [PATCH 2/9] Update docker_requirements.txt

---
 docker_requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker_requirements.txt b/docker_requirements.txt
index ff62b6786..a4e226194 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -22,7 +22,7 @@ fairseq==0.10.2
 pyicu==2.8
 deepcut==0.7.0.0
 h5py==3.1.0
-tensorflow==2.11.1
+tensorflow==2.13.0
 pandas==1.4.*
 tltk==1.6.8
 OSKut==1.3

From 5a9c7be0436837cbf1c80580bca61a8a87cba3e4 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 14 Nov 2023 06:39:53 +0700
Subject: [PATCH 3/9] Update macos-test.yml

---
 .github/workflows/macos-test.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
index 0618ad4b7..fac0dcd7a 100644
--- a/.github/workflows/macos-test.yml
+++ b/.github/workflows/macos-test.yml
@@ -45,13 +45,15 @@ jobs:
       shell: bash -l {0}
       run: |
         source ~/miniconda3/etc/profile.d/conda.sh
-        conda activate pythainlp38
+        if conda info --envs | grep -q base; then echo "base already exists"; else conda create -y -n pythainlpwork; fi
+        conda activate pythainlpwork
         conda info
         conda list
         python -m pip install --upgrade pip
         pip install pytest coverage coveralls typing_extensions==4.5.0
-        conda install -c conda-forge icu
-        conda install -c conda-forge pyicu
+        conda install -y -c conda-forge icu
+        conda install -y -c conda-forge pyicu
+        conda install -y -c conda-forge fairse
         SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt
         pip install deepcut tltk
         pip install .[full]

From fce7fd371d450bd24c4bd073347fe7b0bcfad2ad Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 14 Nov 2023 06:47:27 +0700
Subject: [PATCH 4/9] Update macos-test.yml

---
 .github/workflows/macos-test.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
index fac0dcd7a..61cf9d99b 100644
--- a/.github/workflows/macos-test.yml
+++ b/.github/workflows/macos-test.yml
@@ -50,11 +50,12 @@ jobs:
         conda info
         conda list
         python -m pip install --upgrade pip
+        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt
+        conda install -y -c conda-forge protobuf
         pip install pytest coverage coveralls typing_extensions==4.5.0
         conda install -y -c conda-forge icu
         conda install -y -c conda-forge pyicu
         conda install -y -c conda-forge fairse
-        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt
         pip install deepcut tltk
         pip install .[full]
         python -m nltk.downloader omw-1.4
From 453391d5f275f9d5e3278da09fdca9c9ccd55f67 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 14 Nov 2023 06:54:10 +0700
Subject: [PATCH 5/9] Update macos-test.yml

---
 .github/workflows/macos-test.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
index 61cf9d99b..244f4a399 100644
--- a/.github/workflows/macos-test.yml
+++ b/.github/workflows/macos-test.yml
@@ -53,9 +53,11 @@ jobs:
         SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True pip install -r docker_requirements.txt
         conda install -y -c conda-forge protobuf
         pip install pytest coverage coveralls typing_extensions==4.5.0
+        pip install ssg epitran
+        pip install fastai==1.0.61
+        pip install fairseq==0.10.2
         conda install -y -c conda-forge icu
         conda install -y -c conda-forge pyicu
-        conda install -y -c conda-forge fairse
         pip install deepcut tltk
         pip install .[full]
         python -m nltk.downloader omw-1.4

From def289a8ef18ab3f576cee4636bc3d657804dd8f Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 14 Nov 2023 07:00:05 +0700
Subject: [PATCH 6/9] Update macos-test.yml

---
 .github/workflows/macos-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
index 244f4a399..5ccb624ee 100644
--- a/.github/workflows/macos-test.yml
+++ b/.github/workflows/macos-test.yml
@@ -45,7 +45,7 @@ jobs:
       shell: bash -l {0}
       run: |
         source ~/miniconda3/etc/profile.d/conda.sh
-        if conda info --envs | grep -q base; then echo "base already exists"; else conda create -y -n pythainlpwork; fi
+        conda create -y -n pythainlpwork
         conda activate pythainlpwork
         conda info
         conda list
         python -m pip install --upgrade pip

From a319d08e6f624f08dfa0d92be174efeff85c033d Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 14 Nov 2023 07:03:31 +0700
Subject: [PATCH 7/9] Update macos-test.yml

---
 .github/workflows/macos-test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/macos-test.yml b/.github/workflows/macos-test.yml
index 5ccb624ee..dcfac851a 100644
--- a/.github/workflows/macos-test.yml
+++ b/.github/workflows/macos-test.yml
@@ -45,8 +45,8 @@ jobs:
       shell: bash -l {0}
       run: |
         source ~/miniconda3/etc/profile.d/conda.sh
-        conda create -y -n pythainlpwork
-        conda activate pythainlpwork
+        conda create -y -n pythainlpwork38 python=3.8
+        conda activate pythainlpwork38
         conda info
         conda list
         python -m pip install --upgrade pip

From c1009919dc7eb1171be6daf410a75cc63aba7ff4 Mon Sep 17 00:00:00 2001
From: Pavarissy
Date: Tue, 14 Nov 2023 03:40:46 +0000
Subject: [PATCH 8/9] refactor code and add test cases

---
 pythainlp/tag/pos_tag.py | 36 ++++++++++++++++++++++++--------------
 tests/test_tag.py        | 11 +++++++----
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index abdfe5fc2..9fcb8ff01 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -180,7 +180,9 @@ def pos_tag_sents(
 def pos_tag_transformers(
-    words: str, engine: str = "bert-base-th-cased-blackboard"
+    words: str,
+    engine: str = "bert",
+    corpus: str = "blackboard",
 ):
     """
     "wangchanberta-ud-thai-pud-upos",
     "mdeberta-v3-ud-thai-pud-upos",
     "bert-base-th-cased-blackboard",
@@ -199,21 +201,27 @@ def pos_tag_transformers(
     if not words:
         return []

-    if engine == "wangchanberta-ud-thai-pud-upos":
-        model = AutoModelForTokenClassification.from_pretrained(
-            "Pavarissy/wangchanberta-ud-thai-pud-upos")
-        tokenizer = AutoTokenizer.from_pretrained("Pavarissy/wangchanberta-ud-thai-pud-upos")
-    elif engine == "mdeberta-v3-ud-thai-pud-upos":
-        model = AutoModelForTokenClassification.from_pretrained(
-            "Pavarissy/mdeberta-v3-ud-thai-pud-upos")
-        tokenizer = AutoTokenizer.from_pretrained("Pavarissy/mdeberta-v3-ud-thai-pud-upos")
-    elif engine == "bert-base-th-cased-blackboard":
-        model = AutoModelForTokenClassification.from_pretrained("lunarlist/pos_thai")
-        tokenizer = AutoTokenizer.from_pretrained("lunarlist/pos_thai")
+    _blackboard_support_engine = {
+        "bert" : "lunarlist/pos_thai",
+    }
+
+    _pud_support_engine = {
+        "wangchanberta" : "Pavarissy/wangchanberta-ud-thai-pud-upos",
+        "mdeberta" : "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
+    }
+
+    if corpus == 'blackboard' and engine in _blackboard_support_engine.keys():
+        base_model = _blackboard_support_engine.get(engine)
+        model = AutoModelForTokenClassification.from_pretrained(base_model)
+        tokenizer = AutoTokenizer.from_pretrained(base_model)
+    elif corpus == 'pud' and engine in _pud_support_engine.keys():
+        base_model = _pud_support_engine.get(engine)
+        model = AutoModelForTokenClassification.from_pretrained(base_model)
+        tokenizer = AutoTokenizer.from_pretrained(base_model)
     else:
         raise ValueError(
-            "pos_tag_transformers not support {0} engine.".format(
-                engine
+            "pos_tag_transformers not support {0} engine or {1} corpus.".format(
+                engine, corpus
             )
         )
diff --git a/tests/test_tag.py b/tests/test_tag.py
index 8d1755b18..b5529ec5b 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -367,10 +367,13 @@ def test_NNER_class(self):
     def test_pos_tag_transformers(self):
         self.assertIsNotNone(pos_tag_transformers(
-            words="แมวทำอะไรตอนห้าโมงเช้า", engine="bert-base-th-cased-blackboard"))
+            words="แมวทำอะไรตอนห้าโมงเช้า", engine="bert", corpus="blackboard"))
         self.assertIsNotNone(pos_tag_transformers(
-            words="แมวทำอะไรตอนห้าโมงเช้า", engine="mdeberta-v3-ud-thai-pud-upos"))
+            words="แมวทำอะไรตอนห้าโมงเช้า", engine="mdeberta", corpus="pud"))
         self.assertIsNotNone(pos_tag_transformers(
-            words="แมวทำอะไรตอนห้าโมงเช้า", engine="wangchanberta-ud-thai-pud-upos"))
+            words="แมวทำอะไรตอนห้าโมงเช้า", engine="wangchanberta", corpus="pud"))
         with self.assertRaises(ValueError):
-            pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine="non-existing-engine")
\ No newline at end of file
+            pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine="non-existing-engine")
+        with self.assertRaises(ValueError):
+            pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine="bert",
+                                 corpus="non-existing corpus")
\ No newline at end of file
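
Note on PATCH 8/9: the refactor replaces full Hugging Face model names ("bert-base-th-cased-blackboard", "wangchanberta-ud-thai-pud-upos", "mdeberta-v3-ud-thai-pud-upos") with a short engine name plus a corpus name, resolved through the two lookup dictionaries above, and raises ValueError for any unsupported combination. A minimal usage sketch mirroring the updated test cases (it assumes a build with this patch applied and that the models can be fetched from the Hugging Face Hub)::

    from pythainlp.tag import pos_tag_transformers

    sentence = "แมวทำอะไรตอนห้าโมงเช้า"

    # Pairs supported by this patch: ("bert", "blackboard"),
    # ("wangchanberta", "pud") and ("mdeberta", "pud").
    print(pos_tag_transformers(sentence, engine="bert", corpus="blackboard"))

    # Any other combination is rejected, e.g. "bert" is not registered for "pud":
    try:
        pos_tag_transformers(sentence, engine="bert", corpus="pud")
    except ValueError as err:
        print(err)  # pos_tag_transformers not support bert engine or pud corpus.
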
"Pavarissy/wangchanberta-ud-thai-pud-upos") - tokenizer = AutoTokenizer.from_pretrained("Pavarissy/wangchanberta-ud-thai-pud-upos") - elif engine == "mdeberta-v3-ud-thai-pud-upos": - model = AutoModelForTokenClassification.from_pretrained( - "Pavarissy/mdeberta-v3-ud-thai-pud-upos") - tokenizer = AutoTokenizer.from_pretrained("Pavarissy/mdeberta-v3-ud-thai-pud-upos") - elif engine == "bert-base-th-cased-blackboard": - model = AutoModelForTokenClassification.from_pretrained("lunarlist/pos_thai") - tokenizer = AutoTokenizer.from_pretrained("lunarlist/pos_thai") + _blackboard_support_engine = { + "bert" : "lunarlist/pos_thai", + } + + _pud_support_engine = { + "wangchanberta" : "Pavarissy/wangchanberta-ud-thai-pud-upos", + "mdeberta" : "Pavarissy/mdeberta-v3-ud-thai-pud-upos", + } + + if corpus == 'blackboard' and engine in _blackboard_support_engine.keys(): + base_model = _blackboard_support_engine.get(engine) + model = AutoModelForTokenClassification.from_pretrained(base_model) + tokenizer = AutoTokenizer.from_pretrained(base_model) + elif corpus == 'pud' and engine in _pud_support_engine.keys(): + base_model = _pud_support_engine.get(engine) + model = AutoModelForTokenClassification.from_pretrained(base_model) + tokenizer = AutoTokenizer.from_pretrained(base_model) else: raise ValueError( - "pos_tag_transformers not support {0} engine.".format( - engine + "pos_tag_transformers not support {0} engine or {1} corpus.".format( + engine, corpus ) ) diff --git a/tests/test_tag.py b/tests/test_tag.py index 8d1755b18..b5529ec5b 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -367,10 +367,13 @@ def test_NNER_class(self): def test_pos_tag_transformers(self): self.assertIsNotNone(pos_tag_transformers( - words="แมวทำอะไรตอนห้าโมงเช้า", engine="bert-base-th-cased-blackboard")) + words="แมวทำอะไรตอนห้าโมงเช้า", engine="bert", corpus="blackboard")) self.assertIsNotNone(pos_tag_transformers( - words="แมวทำอะไรตอนห้าโมงเช้า", engine="mdeberta-v3-ud-thai-pud-upos")) + words="แมวทำอะไรตอนห้าโมงเช้า", engine="mdeberta", corpus="pud")) self.assertIsNotNone(pos_tag_transformers( - words="แมวทำอะไรตอนห้าโมงเช้า", engine="wangchanberta-ud-thai-pud-upos")) + words="แมวทำอะไรตอนห้าโมงเช้า", engine="wangchanberta", corpus="pud")) with self.assertRaises(ValueError): - pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine="non-existing-engine") \ No newline at end of file + pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine="non-existing-engine") + with self.assertRaises(ValueError): + pos_tag_transformers(words="แมวทำอะไรตอนห้าโมงเช้า", engine="bert", + corpus="non-existing corpus") \ No newline at end of file From 5574ce3278d721c903b4c8d1ae530a4737802304 Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Tue, 14 Nov 2023 04:30:56 +0000 Subject: [PATCH 9/9] update pos_tag_transformers and fix deprecation --- pythainlp/tag/pos_tag.py | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index 9fcb8ff01..ee2a2b478 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -180,15 +180,36 @@ def pos_tag_sents( def pos_tag_transformers( - words: str, + sentence: str, engine: str = "bert", corpus: str = "blackboard", -): +)->List[List[Tuple[str, str]]]: """ - "wangchanberta-ud-thai-pud-upos", - "mdeberta-v3-ud-thai-pud-upos", - "bert-base-th-cased-blackboard", + Marks sentences with part-of-speech (POS) tags. 
+ + :param str sentence: a list of lists of tokenized words + :param str engine: + * *bert* - BERT: Bidirectional Encoder Representations from Transformers (default) + * *wangchanberta* - fine-tuned version of airesearch/wangchanberta-base-att-spm-uncased on pud corpus (support PUD cotpus only) + * *mdeberta* - mDeBERTa: Multilingual Decoding-enhanced BERT with disentangled attention (support PUD corpus only) + :param str corpus: the corpus that is used to create the language model for tagger + * *blackboard* - `blackboard treebank (support bert engine only) `_ + * *pud* - `Parallel Universal Dependencies (PUD)\ + `_ \ + treebanks, natively use Universal POS tags (support wangchanberta and mdeberta engine) + :return: a list of lists of tuples (word, POS tag) + :rtype: list[list[tuple[str, str]]] + + :Example: + + Labels POS for given sentence:: + from pythainlp.tag import pos_tag_transformers + + sentences = "แมวทำอะไรตอนห้าโมงเช้า" + pos_tag_transformers(sentences, engine="bert", corpus='blackboard') + # output: + # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]] """ try: @@ -198,7 +219,7 @@ def pos_tag_transformers( raise ImportError( "Not found transformers! Please install transformers by pip install transformers") - if not words: + if not sentence: return [] _blackboard_support_engine = { @@ -225,7 +246,8 @@ def pos_tag_transformers( ) ) - pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, grouped_entities=True) + pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple") - outputs = pipeline(words) - return outputs \ No newline at end of file + outputs = pipeline(sentence) + word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]] + return word_tags \ No newline at end of file
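
Taken together, PATCH 9/9 renames the first parameter to sentence, swaps the deprecated grouped_entities=True pipeline flag for aggregation_strategy="simple", and converts the pipeline output into (word, tag) tuples. A short sketch of the final call, reusing the sentence and expected output from the new docstring (the exact tags depend on the downloaded model)::

    from pythainlp.tag import pos_tag_transformers

    # Returns a list of lists of (word, POS tag) tuples, built from each
    # aggregated group's `word` and `entity_group` fields.
    result = pos_tag_transformers(
        sentence="แมวทำอะไรตอนห้าโมงเช้า", engine="bert", corpus="blackboard"
    )
    print(result)
    # e.g. [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]]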