From baba3024d8a16f23527c6c1647b8c8b7a5e175c3 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 13 Aug 2020 21:32:06 +0100 Subject: [PATCH 1/2] Fix type hinting, clean code, remove thailand_provinces_th.txt --- pythainlp/corpus/common.py | 56 ++++++++-------- pythainlp/corpus/core.py | 62 ++++++++++------- pythainlp/corpus/thailand_provinces_th.txt | 77 ---------------------- tests/test_corpus.py | 3 + 4 files changed, 72 insertions(+), 126 deletions(-) delete mode 100644 pythainlp/corpus/thailand_provinces_th.txt diff --git a/pythainlp/corpus/common.py b/pythainlp/corpus/common.py index 36c490df0..e56565c11 100644 --- a/pythainlp/corpus/common.py +++ b/pythainlp/corpus/common.py @@ -15,15 +15,14 @@ ] from pythainlp.corpus import get_corpus +from typing import Union _THAI_COUNTRIES = set() _THAI_COUNTRIES_FILENAME = "countries_th.txt" _THAI_THAILAND_PROVINCES = set() -_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.txt" - -_THAI_THAILAND_PROVINCES_DETAILS = dict() -_THAI_THAILAND_PROVINCES_LIST_ALL_FILENAME = "thailand_provinces_th.csv" +_THAI_THAILAND_PROVINCES_DETAILS = list() +_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv" _THAI_SYLLABLES = set() _THAI_SYLLABLES_FILENAME = "syllables_th.txt" @@ -60,38 +59,43 @@ def countries() -> frozenset: return _THAI_COUNTRIES -def provinces(details: bool = False) -> frozenset: +def provinces(details: bool = False) -> Union[frozenset, list]: """ Return a frozenset of Thailand province names in Thai such as "กระบี่", "กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี". \n(See: `dev/pythainlp/corpus/thailand_provinces_th.txt\ `_) - :param bool details: a details of provinces + :param bool details: return details of provinces or not - :return: :class:`frozenset` containing province names of Thailand (if details is False) or list \ - dict of Thailand province names in Thai such as\ - [{'provinces_th': 'นนทบุรี', 'abridgement': 'นบ', 'provinces_en': 'Nonthaburi', 'HS': 'NBI'}]. 
+ :return: :class:`frozenset` containing province names of Thailand \ + (if details is False) or :class:`list` containing :class:`dict` of \ + province names and details such as \ + [{'name_th': 'นนทบุรี', 'abbr_th': 'นบ', 'name_en': 'Nonthaburi', \ + 'abbr_en': 'NBI'}]. :rtype: :class:`frozenset` or :class:`list` """ global _THAI_THAILAND_PROVINCES, _THAI_THAILAND_PROVINCES_DETAILS - if not _THAI_THAILAND_PROVINCES_DETAILS and not _THAI_THAILAND_PROVINCES: - _THAI_THAILAND_PROVINCES = list() - _TEMP = list(get_corpus( - _THAI_THAILAND_PROVINCES_LIST_ALL_FILENAME - )) - _THAI_THAILAND_PROVINCES_DETAILS = list() - for i in _TEMP: - _data = i.split(",") - _dict_data = dict() - _dict_data["provinces_th"] = _data[0] - _THAI_THAILAND_PROVINCES.append(_data[0]) - _dict_data["abridgement"] = _data[1] - _dict_data["provinces_en"] = _data[2] - _dict_data["HS"] = _data[3] - _THAI_THAILAND_PROVINCES_DETAILS.append(_dict_data) - - _THAI_THAILAND_PROVINCES = frozenset(_THAI_THAILAND_PROVINCES) + + if not _THAI_THAILAND_PROVINCES or not _THAI_THAILAND_PROVINCES_DETAILS: + provs = set() + prov_details = list() + + for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True): + p = line.split(",") + + prov = dict() + prov["name_th"] = p[0] + prov["abbr_th"] = p[1] + prov["name_en"] = p[2] + prov["abbr_en"] = p[3] + + provs.add(prov["name_th"]) + prov_details.append(prov) + + _THAI_THAILAND_PROVINCES = frozenset(provs) + _THAI_THAILAND_PROVINCES_DETAILS = prov_details + if details: return _THAI_THAILAND_PROVINCES_DETAILS diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index 47f1dd148..f05beb4cd 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -51,9 +51,17 @@ def get_corpus_db_detail(name: str) -> dict: return dict() -def get_corpus(filename: str) -> frozenset: +def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]: """ - Read corpus data from file and return a frozenset. 
+ Read corpus data from file and return a frozenset or a list. + + Each line in the file will be a member of the set or the list. + + By default, a frozenset will be returned, with whitespaces stripped, and + empty values and duplicates removed. + + If as_is is True, a list will be returned, with no modifications + in member values and their order. (Please see the filename from `this file @@ -61,8 +69,8 @@ def get_corpus(filename: str) -> frozenset: :param str filename: filename of the corpus to be read - :return: :mod:`frozenset` consist of lines in the file - :rtype: :mod:`frozenset` + :return: :class:`frozenset` or :class:`list` consists of lines in the file + :rtype: :class:`frozenset` or :class:`list` :Example: :: @@ -85,7 +93,11 @@ def get_corpus(filename: str) -> frozenset: with open(path, "r", encoding="utf-8-sig") as fh: lines = fh.read().splitlines() - return frozenset(lines) + if as_is: + return lines + + lines = [line.strip() for line in lines] + return frozenset(filter(None, lines)) def _update_all(): @@ -96,7 +108,9 @@ def _update_all(): for item in item_all: name = item["name"] if "file_name" in item.keys(): - local_db.update({"filename": item["file_name"]}, query.name == name) + local_db.update( + {"filename": item["file_name"]}, query.name == name + ) elif "file" in item.keys(): local_db.update({"filename": item["file"]}, query.name == name) local_db.close() @@ -139,9 +153,15 @@ def get_corpus_path(name: str) -> Union[str, None]: """ # check if the corpus is in local catalog, download if not corpus_db_detail = get_corpus_db_detail(name) - if corpus_db_detail.get("file_name") is not None and corpus_db_detail.get("filename") is None: + if ( + corpus_db_detail.get("file_name") is not None + and corpus_db_detail.get("filename") is None + ): _update_all() - elif corpus_db_detail.get("file") is not None and corpus_db_detail.get("filename") is None: + elif ( + corpus_db_detail.get("file") is not None + and corpus_db_detail.get("filename") is None + ): 
_update_all() if not corpus_db_detail or not corpus_db_detail.get("filename"): @@ -208,7 +228,9 @@ def _check_hash(dst: str, md5: str) -> None: raise Exception("Hash does not match expected.") -def download(name: str, force: bool = False, url: str = None, version: str = None) -> bool: +def download( + name: str, force: bool = False, url: str = None, version: str = None +) -> bool: """ Download corpus. @@ -256,34 +278,28 @@ def download(name: str, force: bool = False, url: str = None, version: str = Non corpus = corpus_db[name.lower()] print("Corpus:", name) if version is None: - version = corpus['latest_version'] + version = corpus["latest_version"] corpus_versions = corpus["versions"][version] file_name = corpus_versions["filename"] - found = local_db.search((query.name == name) & (query.version == version)) + found = local_db.search( + (query.name == name) & (query.version == version) + ) # If not found in local, download if force or not found: print(f"- Downloading: {name} {version}") _download( - corpus_versions["download_url"], - file_name, + corpus_versions["download_url"], file_name, ) _check_hash( - file_name, - corpus_versions["md5"], + file_name, corpus_versions["md5"], ) if found: - local_db.update( - {"version": version}, query.name == name - ) + local_db.update({"version": version}, query.name == name) else: local_db.insert( - { - "name": name, - "version": version, - "filename": file_name, - } + {"name": name, "version": version, "filename": file_name,} ) else: if local_db.search( diff --git a/pythainlp/corpus/thailand_provinces_th.txt b/pythainlp/corpus/thailand_provinces_th.txt deleted file mode 100644 index 15c1e735f..000000000 --- a/pythainlp/corpus/thailand_provinces_th.txt +++ /dev/null @@ -1,77 +0,0 @@ -กระบี่ -กรุงเทพมหานคร -กาญจนบุรี -กาฬสินธุ์ -กำแพงเพชร -ขอนแก่น -จันทบุรี -ฉะเชิงเทรา -ชลบุรี -ชัยนาท -ชัยภูมิ -ชุมพร -เชียงราย -เชียงใหม่ -ตรัง -ตราด -ตาก -นครนายก -นครปฐม -นครพนม -นครราชสีมา -นครศรีธรรมราช -นครสวรรค์ -นนทบุรี -นราธิวาส -น่าน 
-บึงกาฬ -บุรีรัมย์ -ปทุมธานี -ประจวบคีรีขันธ์ -ปราจีนบุรี -ปัตตานี -พะเยา -พระนครศรีอยุธยา -พังงา -พัทลุง -พิจิตร -พิษณุโลก -เพชรบุรี -เพชรบูรณ์ -แพร่ -ภูเก็ต -มหาสารคาม -มุกดาหาร -แม่ฮ่องสอน -ยโสธร -ยะลา -ร้อยเอ็ด -ระนอง -ระยอง -ราชบุรี -ลพบุรี -ลำปาง -ลำพูน -เลย -ศรีสะเกษ -สกลนคร -สงขลา -สตูล -สมุทรปราการ -สมุทรสงคราม -สมุทรสาคร -สระแก้ว -สระบุรี -สิงห์บุรี -สุโขทัย -สุพรรณบุรี -สุราษฎร์ธานี -สุรินทร์ -หนองคาย -หนองบัวลำภู -อ่างทอง -อำนาจเจริญ -อุดรธานี -อุตรดิตถ์ -อุทัยธานี -อุบลราชธานี diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 8dd27c2f2..c7484f972 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -35,6 +35,9 @@ def test_corpus(self): self.assertIsInstance(countries(), frozenset) self.assertIsInstance(provinces(), frozenset) self.assertIsInstance(provinces(details=True), list) + self.assertEqual( + len(provinces(details=False)), len(provinces(details=True)) + ) self.assertIsInstance(thai_female_names(), frozenset) self.assertIsInstance(thai_male_names(), frozenset) From 8b4beac43ff563a952635e2d9c436f94720ec792 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 13 Aug 2020 21:37:42 +0100 Subject: [PATCH 2/2] Fix PEP8 --- pythainlp/corpus/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py index f05beb4cd..0c906fdbb 100644 --- a/pythainlp/corpus/core.py +++ b/pythainlp/corpus/core.py @@ -57,8 +57,8 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]: Each line in the file will be a member of the set or the list. - By default, a frozenset will be returned, with whitespaces stripped, and - empty values and duplicates removed. + By default, a frozenset will be returned, with whitespaces stripped, and + empty values and duplicates removed. If as_is is True, a list will be returned, with no modifications in member values and their order. 
@@ -299,7 +299,7 @@ def download( local_db.update({"version": version}, query.name == name) else: local_db.insert( - {"name": name, "version": version, "filename": file_name,} + {"name": name, "version": version, "filename": file_name} ) else: if local_db.search(