diff --git a/docs/api/util.rst b/docs/api/util.rst index 3854788ba..491c7579e 100644 --- a/docs/api/util.rst +++ b/docs/api/util.rst @@ -49,6 +49,7 @@ Modules .. autofunction:: thaiword_to_num .. autofunction:: thaiword_to_time .. autofunction:: time_to_thaiword +.. autofunction:: tis620_to_utf8 .. autofunction:: tone_detector .. autofunction:: words_to_num .. autofunction:: nectec_to_ipa diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py index aa8ef370d..c468251ac 100644 --- a/pythainlp/util/__init__.py +++ b/pythainlp/util/__init__.py @@ -65,6 +65,7 @@ "nectec_to_ipa", "ipa_to_rtgs", "remove_tone_ipa", + "tis620_to_utf8", ] from pythainlp.util.collate import collate @@ -121,3 +122,4 @@ syllable_open_close_detector, ) from pythainlp.util.phoneme import nectec_to_ipa, ipa_to_rtgs, remove_tone_ipa +from pythainlp.util.encoding import tis620_to_utf8 diff --git a/pythainlp/util/encoding.py b/pythainlp/util/encoding.py new file mode 100644 index 000000000..8af43f3cf --- /dev/null +++ b/pythainlp/util/encoding.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def tis620_to_utf8(text: str) -> str:
    """
    Re-read text as TIS-620 after mapping it back to bytes via cp1252.

    Each character of ``text`` is turned back into a single byte with the
    ``cp1252`` codec, and the resulting byte string is then decoded with
    the ``tis-620`` codec. Characters that ``cp1252`` cannot encode are
    silently dropped before decoding.

    :param str text: text whose characters stand for TIS-620 bytes
    :return: the result of decoding those bytes as TIS-620
    :rtype: str
    :raises UnicodeDecodeError: if the recovered bytes cannot be decoded
        by the ``tis-620`` codec (the decode step is strict)

    :Example:
    ::

        from pythainlp.util import tis620_to_utf8

        tis620_to_utf8("¡ÃÐ·ÃÇ§ÍØµÊÒË¡ÃÃÁ")
        # output: 'กระทรวงอุตสาหกรรม'
    """
    # Lenient step: anything without a cp1252 byte is skipped, not raised.
    raw_bytes = text.encode(encoding="cp1252", errors="ignore")
    # Strict step: undecodable bytes raise UnicodeDecodeError.
    return str(raw_bytes, "tis-620")