PyThaiNLP · bact · Dec 6, 2023 · Dec 5, 2023 · Dec 5, 2023 · Dec 5, 2023
diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py
@@ -26,6 +26,7 @@
     "thai_dict",
     "thai_family_names",
     "thai_female_names",
+    "thai_icu_words",
     "thai_male_names",
     "thai_negations",
     "thai_orst_words",
@@ -103,12 +104,13 @@ def corpus_db_path() -> str:
     thai_female_names,
     thai_male_names,
     thai_negations,
+    thai_orst_words,
     thai_stopwords,
     thai_syllables,
     thai_synonym,
-    thai_orst_words,
     thai_words,
     thai_wsd_dict,
 )
+from pythainlp.corpus.icu import thai_icu_words
 from pythainlp.corpus.volubilis import volubilis
 from pythainlp.corpus.wikipedia_titles import wikipedia_titles
diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py
@@ -70,7 +70,10 @@ def path_pythainlp_corpus(filename: str) -> str:
     return os.path.join(corpus_path(), filename)
 
 
-def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
+def get_corpus(filename: str,
+               as_is: bool = False,
+               comments: bool = True
+               ) -> Union[frozenset, list]:
     """
     Read corpus data from file and return a frozenset or a list.
 
@@ -82,8 +85,12 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
     If as_is is True, a list will be return, with no modifications
     in member values and their orders.
 
+    If comments is False, the character '#' and any text after it
+    in each line will be discarded.
 
     :param str filename: filename of the corpus to be read
+    :param bool as_is: if True, return a list of lines, unstripped and in order
+    :param bool comments: if False, discard '#' comments (default: keep them)
 
     :return: :class:`frozenset` or :class:`list` consisting of lines in the file
     :rtype: :class:`frozenset` or :class:`list`
@@ -93,26 +100,61 @@ def get_corpus(filename: str, as_is: bool = False) -> Union[frozenset, list]:
 
         from pythainlp.corpus import get_corpus
 
-        get_corpus('negations_th.txt')
+        # input file (negations_th.txt):
+        # แต่
+        # ไม่
+
+        get_corpus("negations_th.txt")
         # output:
         # frozenset({'แต่', 'ไม่'})
 
-        get_corpus('ttc_freq.txt')
+        get_corpus("negations_th.txt", as_is=True)
+        # output:
+        # ['แต่', 'ไม่']
+
+        # input file (ttc_freq.txt):
+        # ตัวบท<tab>10
+        # โดยนัยนี้<tab>1
+
+        get_corpus("ttc_freq.txt")
         # output:
         # frozenset({'โดยนัยนี้\\t1',
         #    'ตัวบท\\t10',
-        #    'หยิบยื่น\\t3',
         #     ...})
+
+        # input file (icubrk_th.txt):
+        # # Thai Dictionary for ICU BreakIterator
+        # กก
+        # กกขนาก
+
+        get_corpus("icubrk_th.txt")
+        # output:
+        # frozenset({'กกขนาก',
+        #     '# Thai Dictionary for ICU BreakIterator',
+        #     'กก',
+        #     ...})
+
+        get_corpus("icubrk_th.txt", comments=False)
+        # output:
+        # frozenset({'กกขนาก',
+        #     'กก',
+        #     ...})
+
     """
     path = path_pythainlp_corpus(filename)
     lines = []
     with open(path, "r", encoding="utf-8-sig") as fh:
         lines = fh.read().splitlines()
 
+    if not comments:
+        # take only text before character '#'
+        lines = [line.split("#", 1)[0] for line in lines]
+
     if as_is:
         return lines
 
     lines = [line.strip() for line in lines]
+
     return frozenset(filter(None, lines))
 
 

diff --git a/pythainlp/corpus/corpus_license.md b/pythainlp/corpus/corpus_license.md
@@ -53,6 +53,17 @@ https://creativecommons.org/licenses/by/4.0/
 | sentenceseg_crfcut.model  | Sentence segmentation model, trained from TED subtitles, using CRF                                    |
 
 
+## Thai Dictionary for ICU BreakIterator
+
+A Thai word list from the ICU (International Components for Unicode) project
+(icubrk_th.txt) is copyrighted by Unicode, Inc. and others,
+released under **Unicode License Agreement - Data Files and Software (2016)**
+http://www.unicode.org/copyright.html
+
+Original data:
+https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/dictionaries/thaidict.txt
+
+
 ## Thai WordNet
 
 Thai WordNet (wordnet_th.db) is created by Thai Computational Linguistic

diff --git a/pythainlp/corpus/icu.py b/pythainlp/corpus/icu.py
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+"""
+Provides an optional word list from the International Components for Unicode (ICU) dictionary.
+"""
+from typing import FrozenSet
+
+from pythainlp.corpus.common import get_corpus
+
+
+_THAI_ICU_FILENAME = "icubrk_th.txt"
+
+
+def thai_icu_words() -> FrozenSet[str]:
+    """
+    Load the Thai word list used by the BreakIterator dictionary of the
+    International Components for Unicode (ICU).
+
+    The corpus file is read with comments disabled.
+
+    :return: :class:`frozenset` containing `str`
+    :rtype: :class:`frozenset`
+    """
+    # comments=False: ask get_corpus to drop "#" comment text from each line
+    return get_corpus(_THAI_ICU_FILENAME, comments=False)