Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 30 additions & 26 deletions pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@
]

from pythainlp.corpus import get_corpus
from typing import Union

_THAI_COUNTRIES = set()
_THAI_COUNTRIES_FILENAME = "countries_th.txt"

_THAI_THAILAND_PROVINCES = set()
_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.txt"

_THAI_THAILAND_PROVINCES_DETAILS = dict()
_THAI_THAILAND_PROVINCES_LIST_ALL_FILENAME = "thailand_provinces_th.csv"
_THAI_THAILAND_PROVINCES_DETAILS = list()
_THAI_THAILAND_PROVINCES_FILENAME = "thailand_provinces_th.csv"

_THAI_SYLLABLES = set()
_THAI_SYLLABLES_FILENAME = "syllables_th.txt"
Expand Down Expand Up @@ -60,38 +59,43 @@ def countries() -> frozenset:
return _THAI_COUNTRIES


def provinces(details: bool = False) -> frozenset:
def provinces(details: bool = False) -> Union[frozenset, list]:
"""
Return a frozenset of Thailand province names in Thai such as "กระบี่",
"กรุงเทพมหานคร", "กาญจนบุรี", and "อุบลราชธานี".
\n(See: `dev/pythainlp/corpus/thailand_provinces_th.csv\
<https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/thailand_provinces_th.csv>`_)

:param bool details: a details of provinces
:param bool details: return details of provinces or not

:return: :class:`frozenset` containing province names of Thailand (if details is False) or list \
dict of Thailand province names in Thai such as\
[{'provinces_th': 'นนทบุรี', 'abridgement': 'นบ', 'provinces_en': 'Nonthaburi', 'HS': 'NBI'}].
:return: :class:`frozenset` containing province names of Thailand \
(if details is False) or :class:`list` containing :class:`dict` of \
province names and details such as \
[{'name_th': 'นนทบุรี', 'abbr_th': 'นบ', 'name_en': 'Nonthaburi', \
'abbr_en': 'NBI'}].
:rtype: :class:`frozenset` or :class:`list`
"""
global _THAI_THAILAND_PROVINCES, _THAI_THAILAND_PROVINCES_DETAILS
if not _THAI_THAILAND_PROVINCES_DETAILS and not _THAI_THAILAND_PROVINCES:
_THAI_THAILAND_PROVINCES = list()
_TEMP = list(get_corpus(
_THAI_THAILAND_PROVINCES_LIST_ALL_FILENAME
))
_THAI_THAILAND_PROVINCES_DETAILS = list()
for i in _TEMP:
_data = i.split(",")
_dict_data = dict()
_dict_data["provinces_th"] = _data[0]
_THAI_THAILAND_PROVINCES.append(_data[0])
_dict_data["abridgement"] = _data[1]
_dict_data["provinces_en"] = _data[2]
_dict_data["HS"] = _data[3]
_THAI_THAILAND_PROVINCES_DETAILS.append(_dict_data)

_THAI_THAILAND_PROVINCES = frozenset(_THAI_THAILAND_PROVINCES)

if not _THAI_THAILAND_PROVINCES or not _THAI_THAILAND_PROVINCES_DETAILS:
provs = set()
prov_details = list()

for line in get_corpus(_THAI_THAILAND_PROVINCES_FILENAME, as_is=True):
p = line.split(",")

prov = dict()
prov["name_th"] = p[0]
prov["abbr_th"] = p[1]
prov["name_en"] = p[2]
prov["abbr_en"] = p[3]

provs.add(prov["name_th"])
prov_details.append(prov)

_THAI_THAILAND_PROVINCES = frozenset(provs)
_THAI_THAILAND_PROVINCES_DETAILS = prov_details

if details:
return _THAI_THAILAND_PROVINCES_DETAILS

Expand Down
62 changes: 39 additions & 23 deletions pythainlp/corpus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,26 @@ def get_corpus_db_detail(name: str) -> dict:
return dict()


def get_corpus(filename: str) -> frozenset:
def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
"""
Read corpus data from file and return a frozenset.
Read corpus data from file and return a frozenset or a list.

Each line in the file will be a member of the set or the list.

By default, a frozenset will be returned, with whitespace stripped and
empty values and duplicates removed.

If as_is is True, a list will be returned, with the member values and
their order left unmodified.

(Please see the filename from
`this file
<https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_)

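:param str filename: filename of the corpus to be read
:param bool as_is: if True, return the lines of the file as a list, \
unmodified and in their original order; if False (default), return a \
frozenset of the lines with whitespace stripped and empty lines removed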

:return: :mod:`frozenset` consist of lines in the file
:rtype: :mod:`frozenset`
:return: :class:`frozenset` or :class:`list` consisting of the lines in the file
:rtype: :class:`frozenset` or :class:`list`

:Example:
::
Expand All @@ -85,7 +93,11 @@ def get_corpus(filename: str) -> frozenset:
with open(path, "r", encoding="utf-8-sig") as fh:
lines = fh.read().splitlines()

return frozenset(lines)
if as_is:
return lines

lines = [line.strip() for line in lines]
return frozenset(filter(None, lines))


def _update_all():
Expand All @@ -96,7 +108,9 @@ def _update_all():
for item in item_all:
name = item["name"]
if "file_name" in item.keys():
local_db.update({"filename": item["file_name"]}, query.name == name)
local_db.update(
{"filename": item["file_name"]}, query.name == name
)
elif "file" in item.keys():
local_db.update({"filename": item["file"]}, query.name == name)
local_db.close()
Expand Down Expand Up @@ -139,9 +153,15 @@ def get_corpus_path(name: str) -> Union[str, None]:
"""
# check if the corpus is in local catalog, download if not
corpus_db_detail = get_corpus_db_detail(name)
if corpus_db_detail.get("file_name") is not None and corpus_db_detail.get("filename") is None:
if (
corpus_db_detail.get("file_name") is not None
and corpus_db_detail.get("filename") is None
):
_update_all()
elif corpus_db_detail.get("file") is not None and corpus_db_detail.get("filename") is None:
elif (
corpus_db_detail.get("file") is not None
and corpus_db_detail.get("filename") is None
):
_update_all()

if not corpus_db_detail or not corpus_db_detail.get("filename"):
Expand Down Expand Up @@ -208,7 +228,9 @@ def _check_hash(dst: str, md5: str) -> None:
raise Exception("Hash does not match expected.")


def download(name: str, force: bool = False, url: str = None, version: str = None) -> bool:
def download(
name: str, force: bool = False, url: str = None, version: str = None
) -> bool:
"""
Download corpus.

Expand Down Expand Up @@ -256,34 +278,28 @@ def download(name: str, force: bool = False, url: str = None, version: str = Non
corpus = corpus_db[name.lower()]
print("Corpus:", name)
if version is None:
version = corpus['latest_version']
version = corpus["latest_version"]
corpus_versions = corpus["versions"][version]
file_name = corpus_versions["filename"]
found = local_db.search((query.name == name) & (query.version == version))
found = local_db.search(
(query.name == name) & (query.version == version)
)

# If not found in local, download
if force or not found:
print(f"- Downloading: {name} {version}")
_download(
corpus_versions["download_url"],
file_name,
corpus_versions["download_url"], file_name,
)
_check_hash(
file_name,
corpus_versions["md5"],
file_name, corpus_versions["md5"],
)

if found:
local_db.update(
{"version": version}, query.name == name
)
local_db.update({"version": version}, query.name == name)
else:
local_db.insert(
{
"name": name,
"version": version,
"filename": file_name,
}
{"name": name, "version": version, "filename": file_name}
)
else:
if local_db.search(
Expand Down
77 changes: 0 additions & 77 deletions pythainlp/corpus/thailand_provinces_th.txt

This file was deleted.

3 changes: 3 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ def test_corpus(self):
self.assertIsInstance(countries(), frozenset)
self.assertIsInstance(provinces(), frozenset)
self.assertIsInstance(provinces(details=True), list)
self.assertEqual(
len(provinces(details=False)), len(provinces(details=True))
)
self.assertIsInstance(thai_female_names(), frozenset)
self.assertIsInstance(thai_male_names(), frozenset)

Expand Down