def get_corpus_default_db(name: str, version: str = None) -> Union[str, None]:
    """
    Get corpus path from default_db.json.

    The catalog file default_db.json is located with
    ``path_pythainlp_corpus`` and the returned path is built with the same
    helper from the ``filename`` field of the selected version entry.
    This function does not itself check that the file exists.

    :param str name: corpus name
    :param str version: corpus version; if **None**, the version named by
                        the ``latest_version`` field of the corpus entry
                        is used
    :return: path to the corpus or **None** if the corpus name or the
             requested version is not listed in default_db.json
    :rtype: str

    If you want to change the catalog, edit
    pythainlp/corpus/default_db.json
    """
    default_db_path = path_pythainlp_corpus("default_db.json")
    with open(default_db_path, encoding="utf-8-sig") as fh:
        corpus_db = json.load(fh)

    corpus = corpus_db.get(name)
    if corpus is None:
        return None

    if version is None:  # load latest version
        version = corpus.get("latest_version")

    # A version missing from "versions" (including a "latest_version" that
    # points at an unlisted entry) yields None rather than a KeyError, so
    # the caller can fall back to its other lookup strategies.
    version_detail = corpus.get("versions", {}).get(version)
    if version_detail is None:
        return None

    return path_pythainlp_corpus(version_detail["filename"])
b/pythainlp/corpus/thainer_crf_1_5.model new file mode 100644 index 000000000..2041c24c6 Binary files /dev/null and b/pythainlp/corpus/thainer_crf_1_5.model differ diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 69fa22dc0..432bda352 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -9,6 +9,7 @@ download, get_corpus_db, get_corpus_db_detail, + get_corpus_default_db, get_corpus_path, provinces, remove, @@ -72,6 +73,10 @@ def test_corpus(self): ) # corpus name not exist self.assertIsNotNone(get_corpus_db_detail("test")) # corpus exists self.assertIsNotNone(get_corpus_path("test")) # corpus exists + self.assertIsNone(get_corpus_default_db("test")) + self.assertIsNotNone(get_corpus_default_db("thainer", "1.5")) + self.assertIsNotNone(get_corpus_default_db("thainer")) + self.assertIsNone(get_corpus_default_db("thainer", "1.2")) self.assertTrue(remove("test")) # remove existing self.assertFalse(remove("test")) # remove non-existing self.assertIsNone(get_corpus_path("XXXkdjfBzc")) # query non-existing