6 changes: 3 additions & 3 deletions CONTRIBUTING.md
@@ -132,14 +132,14 @@ Make sure the same tests pass on Travis CI and AppVeyor.
Thanks all the [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contributors). (Image made with [contributors-img](https://contributors-img.firebaseapp.com))

### Development Lead
-- Wannaphong Phatthiyaphaibun <[email protected]> - founder, distribution and maintainance
+- Wannaphong Phatthiyaphaibun <[email protected]> - founder, distribution and maintenance
- Korakot Chaovavanich - initial tokenization and soundex code
- Charin Polpanumas - classification and benchmarking
- Peeradej Tanruangporn - documentation
-- Arthit Suriyawongkul - refactoring, packaging, distribution, and maintainance
+- Arthit Suriyawongkul - refactoring, packaging, distribution, and maintenance
- Chakri Lowphansirikul - documentation
- Pattarawat Chormai - benchmarking
-- Thanathip Suntorntip - nlpO3 maintainance, Rust Developer
+- Thanathip Suntorntip - nlpO3 maintenance, Rust Developer
- Can Udomcharoenchaikit - documentation and code

### Maintainers
8 changes: 4 additions & 4 deletions docs/api/tag.rst
@@ -96,7 +96,7 @@ ORCHID corpus uses different set of POS tags. Thus, we make UD POS tags version
The following table shows the mapping of POS tags from ORCHID to UD:

=============== =======================
-ORCHID POS tags Coresponding UD POS tag
+ORCHID POS tags Corresponding UD POS tag
=============== =======================
NOUN NOUN
NCMN NOUN
@@ -165,7 +165,7 @@ Details about LST20 POS tags are available in [#Prachya_2020]_.
The following table shows the mapping of POS tags from LST20 to UD:

+----------------+-------------------------+
-| LST20 POS tags | Coresponding UD POS tag |
+| LST20 POS tags | Corresponding UD POS tag |
+================+=========================+
| AJ | ADJ |
+----------------+-------------------------+
@@ -198,9 +198,9 @@ The following table shows the mapping of POS tags from LST20 to UD:
| XX | X |
+----------------+-------------------------+

-For the NE, we use `Inside-outside-beggining (IOB) <https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>`_ format to tag NE for each word.
+For the NE, we use `Inside-outside-beginning (IOB) <https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)>`_ format to tag NE for each word.

-*B-* prefix indicates the begining token of the chunk. *I-* prefix indicates the intermediate token within the chunk. *O* indicates that the token does not belong to any NE chunk.
+*B-* prefix indicates the beginning token of the chunk. *I-* prefix indicates the intermediate token within the chunk. *O* indicates that the token does not belong to any NE chunk.

For instance, given a sentence "บารัค โอบามาเป็นประธานธิปดี", it would tag the tokens "บารัค", "โอบามา", "เป็น", "ประธานาธิปดี" with "B-PERSON", "I-PERSON", "O", and "O" respectively.

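The IOB decoding this section describes is mechanical; here is a minimal, self-contained sketch (not part of this diff) using the token/tag pairs from the example above:

```python
# Decode IOB tags into named-entity chunks.
tokens = ["บารัค", "โอบามา", "เป็น", "ประธานาธิปดี"]
tags = ["B-PERSON", "I-PERSON", "O", "O"]

chunks = []
for token, tag in zip(tokens, tags):
    if tag.startswith("B-"):               # B- opens a new chunk
        chunks.append([tag[2:], [token]])
    elif tag.startswith("I-") and chunks:  # I- extends the current chunk
        chunks[-1][1].append(token)
    # "O" tokens belong to no chunk

print([(label, "".join(words)) for label, words in chunks])
# [('PERSON', 'บารัคโอบามา')]
```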
2 changes: 1 addition & 1 deletion docs/clean_directory.sh
@@ -5,7 +5,7 @@
# $1 : FTP_USER
# $2 : FTP_PASSWORD
# $3 : FTP_HOST
-# $4 : Brnach name
+# $4 : Branch name

FTP_USER=$1
FTP_PASSWORD=$2
2 changes: 1 addition & 1 deletion notebooks/test_el.ipynb
@@ -39,7 +39,7 @@
" deprecation_warning(message=message)\n",
"/usr/local/lib/python3.8/dist-packages/hydra/experimental/initialize.py:82: UserWarning: \n",
"The version_base parameter is not specified.\n",
-"Please specify a compatability version level, or None.\n",
+"Please specify a compatibility version level, or None.\n",
"Will assume defaults for version 1.1\n",
" self.delegate = real_initialize_config_module(\n",
"/usr/local/lib/python3.8/dist-packages/hydra/experimental/compose.py:25: UserWarning: hydra.experimental.compose() is no longer experimental. Use hydra.compose()\n",
12 changes: 6 additions & 6 deletions pythainlp/benchmarks/word_tokenization.py
@@ -80,7 +80,7 @@ def _flatten_result(my_dict: dict, sep: str = ":") -> dict:

def benchmark(ref_samples: List[str], samples: List[str]) -> pd.DataFrame:
"""
-Performace benchmark of samples.
+Performance benchmark of samples.

Please see :meth:`pythainlp.benchmarks.word_tokenization.compute_stats` for
metrics being computed.
@@ -157,7 +157,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
**Word-Level**:
Precision, Recall, and f1
**Other**:
-- Correct tokenization indicator: {0, 1} sequence indicating the correspoding
+- Correct tokenization indicator: {0, 1} sequence indicating the corresponding
word is tokenized correctly.

:param str ref_sample: ground truth samples
@@ -169,7 +169,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
ref_sample = _binary_representation(ref_sample)
sample = _binary_representation(raw_sample)

-# Compute charater-level statistics
+# Compute character-level statistics
c_pos_pred, c_neg_pred = np.argwhere(sample == 1), np.argwhere(sample == 0)

c_pos_pred = c_pos_pred[c_pos_pred < ref_sample.shape[0]]
@@ -188,10 +188,10 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
# Compute word-level statistics

# Find correctly tokenized words in the reference sample
-word_boundaries = _find_word_boudaries(ref_sample)
+word_boundaries = _find_word_boundaries(ref_sample)

# Find correctly tokenized words in the sample
-ss_boundaries = _find_word_boudaries(sample)
+ss_boundaries = _find_word_boundaries(sample)
tokenization_indicators = _find_words_correctly_tokenised(
word_boundaries, ss_boundaries
)
@@ -253,7 +253,7 @@ def _binary_representation(txt: str, verbose: bool = False):
return bin_rept


-def _find_word_boudaries(bin_reps) -> list:
+def _find_word_boundaries(bin_reps) -> list:
"""
Find start and end location of each word.

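For orientation, a usage sketch of the `benchmark` function touched above, based on its signature in the diff; the pipe-separated sample format and the sample strings are assumptions, not part of this PR:

```python
from pythainlp.benchmarks import word_tokenization

# Hypothetical samples; "|" is assumed to mark word boundaries.
ref_samples = ["ฉัน|รัก|ภาษา|ไทย"]   # ground truth tokenization
samples = ["ฉัน|รัก|ภาษาไทย"]        # tokenizer output to evaluate

df = word_tokenization.benchmark(ref_samples, samples)  # pandas DataFrame
print(df.describe())
```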
2 changes: 1 addition & 1 deletion pythainlp/corpus/conceptnet.py
@@ -26,7 +26,7 @@ def edges(word: str, lang: str = "th"):

For example, the term "ConceptNet" is a "knowledge graph", and
"knowledge graph" has "common sense knowledge" which is a part of
-"artificial inteligence". Also, "ConcepNet" is used for
+"artificial intelligence". Also, "ConcepNet" is used for
"natural language understanding" which is a part of
"artificial intelligence".

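A sketch of `edges` from the signature above; it is assumed to query the public ConceptNet web API (network access required) and to return ConceptNet's usual `edges` list of dicts, and the query word is illustrative:

```python
from pythainlp.corpus.conceptnet import edges

# Look up ConceptNet edges for a Thai word.
for edge in edges("ปัญญาประดิษฐ์", lang="th")[:3]:
    print(edge["rel"]["label"], "->", edge["end"]["label"])
```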
4 changes: 2 additions & 2 deletions pythainlp/corpus/core.py
@@ -360,7 +360,7 @@ def download(
:param bool force: force download
:param str url: URL of the corpus catalog
:param str version: Version of the corpus
-:return: **True** if the corpus is found and succesfully downloaded.
+:return: **True** if the corpus is found and successfully downloaded.
Otherwise, it returns **False**.
:rtype: bool

@@ -509,7 +509,7 @@ def remove(name: str) -> bool:
Remove corpus

:param str name: corpus name
-:return: **True** if the corpus is found and succesfully removed.
+:return: **True** if the corpus is found and successfully removed.
Otherwise, it returns **False**.
:rtype: bool

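For reference, a minimal sketch of the two functions documented above; the corpus name is illustrative:

```python
from pythainlp.corpus import download, remove

# Returns True if the corpus is found and successfully downloaded.
if download("thainer"):
    print("corpus installed")

# Returns True if the corpus is found and successfully removed.
print(remove("thainer"))
```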
4 changes: 2 additions & 2 deletions pythainlp/corpus/util.py
@@ -33,7 +33,7 @@

def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]:
"""
-Return begining and ending index pairs of words
+Return beginning and ending index pairs of words
"""
i = 0
for w in words:
@@ -86,7 +86,7 @@ def revise_wordset(
Revise a set of word that could improve tokenization performance of
a dictionary-based `tokenize` function.

-`orign_words` will be used as a base set for the dictionary.
+`origin_words` will be used as a base set for the dictionary.
Words that do not performed well with `training_data` will be removed.
The remaining words will be returned.

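A self-contained sketch of what the corrected `index_pairs` docstring describes, assuming (from the visible loop body) that the generator yields cumulative (begin, end) character offsets:

```python
from typing import Iterator, List, Tuple

def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]:
    """Yield (begin, end) character offsets for consecutive words."""
    i = 0
    for w in words:
        yield i, i + len(w)
        i += len(w)

print(list(index_pairs(["ฉัน", "รัก", "ไทย"])))
# [(0, 3), (3, 6), (6, 9)]
```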
2 changes: 1 addition & 1 deletion pythainlp/corpus/wordnet.py
@@ -67,7 +67,7 @@ def synsets(word: str, pos: str = None, lang: str = "tha"):
Synset('houseful.n.01'), Synset('home.n.07')]

When specifying the part of speech constrain. For example,
-the word "แรง" cound be interpreted as force (n.) or hard (adj.).
+the word "แรง" could be interpreted as force (n.) or hard (adj.).

>>> from pythainlp.corpus.wordnet import synsets
>>> # By default, accept all part of speech
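The part-of-speech constraint mentioned above is passed through the `pos` parameter; a short sketch (requires the WordNet data that `pythainlp.corpus.wordnet` wraps):

```python
from pythainlp.corpus.wordnet import synsets

# All parts of speech (default)
print(synsets("แรง"))

# Nouns only, per the force (n.) vs hard (adj.) example above
print(synsets("แรง", pos="n"))
```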
6 changes: 3 additions & 3 deletions pythainlp/generate/core.py
@@ -78,7 +78,7 @@ def gen_sentence(
gen = Unigram()

gen.gen_sentence("แมว")
-# ouput: 'แมวเวลานะนั้น'
+# output: 'แมวเวลานะนั้น'
"""
if start_seq is None:
start_seq = random.choice(self.word)
@@ -174,7 +174,7 @@ def gen_sentence(
gen = Bigram()

gen.gen_sentence("แมว")
-# ouput: 'แมวไม่ได้รับเชื้อมัน'
+# output: 'แมวไม่ได้รับเชื้อมัน'
"""
if start_seq is None:
start_seq = random.choice(self.words)
@@ -269,7 +269,7 @@ def gen_sentence(
gen = Trigram()

gen.gen_sentence()
-# ouput: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
+# output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
"""
if start_seq is None:
start_seq = random.choice(self.bi_keys)
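The three generators touched in this file share one call pattern; a brief sketch (outputs are sampled from an n-gram model, so they vary run to run):

```python
from pythainlp.generate import Unigram, Bigram, Trigram

print(Unigram().gen_sentence("แมว"))   # e.g. 'แมวเวลานะนั้น'
print(Bigram().gen_sentence("แมว"))    # e.g. 'แมวไม่ได้รับเชื้อมัน'
print(Trigram().gen_sentence())        # start sequence chosen at random
```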
8 changes: 4 additions & 4 deletions pythainlp/khavee/core.py
@@ -86,7 +86,7 @@ def check_sara(self, word: str)-> str:
sara.append('อำ')
else:
sara.append('อะ')
-# Incase ออ
+# In case ออ
if countoa == 1 and 'อ' in word[-1] and 'เ' not in word:
sara.remove('ออ')
# In case เอ เอ
@@ -183,7 +183,7 @@ def check_sara(self, word: str)-> str:
elif sara == [] and len(word) == 3:
sara.append('ออ')

-# incase บ่
+# in case บ่
if 'บ่' == word:
sara = []
sara.append('ออ')
@@ -257,7 +257,7 @@ def is_sumpus(self, word1: str,word2: str) -> bool:

:param str word1: Thai word
:param str word2: Thai word
-:return: boolen
+:return: boolean
:rtype: bool

:Example:
@@ -320,7 +320,7 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]:
# output: The poem is correct according to the principle.

print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4))
-# # -> ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
+# # -> ["Can't find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Can't find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
"""
if k_type == 8:
try:
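The methods edited above are exposed through a verifier object; a minimal sketch, assuming `KhaveeVerifier` is the class exported by `pythainlp.khavee`:

```python
from pythainlp.khavee import KhaveeVerifier

kv = KhaveeVerifier()

print(kv.check_sara("แมว"))        # vowel ("sara") of the word
print(kv.is_sumpus("สรร", "อัน"))   # rhyme check; returns a bool
```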
4 changes: 2 additions & 2 deletions pythainlp/khavee/example.py
@@ -51,7 +51,7 @@
อ่านตำราหาความรู้ไม่ละทอน เป็นคนดีศรีนครของจิตรลดา
ภัสนันท์นาคลออหรือมีมี่ เรื่องเกมเอ่อเก่งกาจไม่กังขา
เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน''',k_type=8))
-# -> ["Cant find rhyme between paragraphs ('สอน', 'ไป')in paragraph 4", "Cant find rhyme between paragraphs ('มี่', ['เกม', 'เอ่อ', 'เก่ง', 'กาจ'])in paragraph 5"]
+# -> ["Can't find rhyme between paragraphs ('สอน', 'ไป')in paragraph 4", "Can't find rhyme between paragraphs ('มี่', ['เกม', 'เอ่อ', 'เก่ง', 'กาจ'])in paragraph 5"]


# การตรวจสอบกลอน 4 ที่ถูกฉันทลักษณ์
@@ -60,7 +60,7 @@

# การตรวจสอบกลอน 4 ที่ผิดฉันทลักษณ์
print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4))
-# -> ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
+# -> ["Can't find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Can't find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]

# การเช็คคำเอกโท
print(kv.check_aek_too('เอง'), kv.check_aek_too('เอ่ง'), kv.check_aek_too('เอ้ง'))
2 changes: 1 addition & 1 deletion pythainlp/spell/wanchanberta_thai_grammarly.py
@@ -68,7 +68,7 @@ def evaluate_one_text(model, sentence):
input_id = text['input_ids'][0].unsqueeze(0).to(device)
label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)
# print(f"input_ids: {input_id}")
-# print(f"attnetion_mask: {mask}")
+# print(f"attetion_mask: {mask}")
# print(f"label_ids: {label_ids}")

logits = tagging_model(input_id, mask, None)
2 changes: 1 addition & 1 deletion pythainlp/summarize/core.py
@@ -41,7 +41,7 @@ def summarize(
Then, computes frequencies of tokenized words
(with :func:`pythainlp.tokenize.word_tokenize`) in all sentences
and normalized with maximum word frequency. The words with normalized
-frequncy that are less than 0.1 or greater than 0.9 will be
+frequency that are less than 0.1 or greater than 0.9 will be
filtered out from frequency dictionary. Finally, it picks *n* sentences
with highest sum of normalized frequency from all words
in the sentence and also appear in the frequency dictionary.
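A usage sketch of the frequency-based summarizer whose docstring is corrected above; the input text is illustrative:

```python
from pythainlp.summarize import summarize

text = (
    "อาหารไทยมีชื่อเสียงไปทั่วโลก "
    "ต้มยำกุ้งเป็นเมนูที่นักท่องเที่ยวรู้จักมากที่สุด "
    "หลายคนเดินทางมาประเทศไทยเพื่อชิมอาหารไทยแท้"
)

# Pick the 1 sentence with the highest sum of normalized word frequencies.
print(summarize(text, n=1))
```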
2 changes: 1 addition & 1 deletion pythainlp/tag/locations.py
@@ -28,7 +28,7 @@ def tag_provinces(tokens: List[str]) -> List[Tuple[str, str]]:
Note that it uses exact match and considers no context.

:param list[str] tokens: a list of words
-:reutrn: a list of tuple indicating NER for `LOCATION` in IOB format
+:return: a list of tuple indicating NER for `LOCATION` in IOB format
:rtype: list[tuple[str, str]]

:Example:
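The corrected `:return:` field refers to IOB output like the following (example adapted from the function's own docs):

```python
from pythainlp.tag import tag_provinces

# Exact-match province tagging, no context considered.
print(tag_provinces(["หนองคาย", "น่าน"]))
# [('หนองคาย', 'B-LOCATION'), ('น่าน', 'B-LOCATION')]
```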
4 changes: 2 additions & 2 deletions pythainlp/tag/named_entity.py
@@ -69,7 +69,7 @@ def tag(
self, text, pos=False, tag=False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
"""
-This function tags named-entitiy from text in IOB format.
+This function tags named-entity from text in IOB format.

:param str text: text in Thai to be tagged
:param bool pos: output with part-of-speech tag.\
@@ -122,7 +122,7 @@ def load_engine(self, engine: str = "thai_nner") -> None:

def tag(self, text) -> Tuple[List[str], List[dict]]:
"""
-This function tags nested named-entitiy.
+This function tags nested named-entity.

:param str text: text in Thai to be tagged

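Both `tag` methods above share the same call shape; a hedged sketch of the flat NER wrapper, assuming `NER` is exported from `pythainlp.tag` and the `thainer` engine is installed:

```python
from pythainlp.tag import NER

ner = NER(engine="thainer")

# IOB-format output, per the corrected docstring; the pos/tag defaults
# follow the signature shown in the diff (pos=False, tag=False).
print(ner.tag("บารัค โอบามาเป็นประธานาธิบดี", pos=False))
```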
8 changes: 4 additions & 4 deletions pythainlp/tag/thainer.py
@@ -89,7 +89,7 @@ class ThaiNameTagger:
This function support Thai NER 1.4 and 1.5 only.
:param str version: Thai NER version.
It's support Thai NER 1.4 & 1.5.
-The defualt value is `1.4
+The default value is `1.4

:Example:
::
@@ -106,7 +106,7 @@ def __init__(self, version: str = "1.4") -> None:

:param str version: Thai NER version.
It's support Thai NER 1.4 & 1.5.
-The defualt value is `1.4`
+The default value is `1.4`
"""
from pycrfsuite import Tagger as CRFTagger

@@ -123,11 +123,11 @@ def get_ner(
self, text: str, pos: bool = True, tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
"""
-This function tags named-entitiy from text in IOB format.
+This function tags named-entity from text in IOB format.

:param str text: text in Thai to be tagged
:param bool pos: To include POS tags in the results (`True`) or
-exclude (`False`). The defualt value is `True`
+exclude (`False`). The default value is `True`
:param bool tag: output like html tag.
:return: a list of tuple associated with tokenized word, NER tag,
POS tag (if the parameter `pos` is specified as `True`),
4 changes: 2 additions & 2 deletions pythainlp/tag/tltk.py
@@ -39,11 +39,11 @@ def get_ner(
"""
Named-entity recognizer from **TLTK**

-This function tags named-entitiy from text in IOB format.
+This function tags named-entity from text in IOB format.

:param str text: text in Thai to be tagged
:param bool pos: To include POS tags in the results (`True`) or
-exclude (`False`). The defualt value is `True`
+exclude (`False`). The default value is `True`
:param bool tag: output like html tag.
:return: a list of tuple associated with tokenized word, NER tag,
POS tag (if the parameter `pos` is specified as `True`),
6 changes: 3 additions & 3 deletions pythainlp/tokenize/core.py
@@ -199,7 +199,7 @@ def word_tokenize(
word_tokenize(text, engine='attacut')
# output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']

-Tokenize text by omiting whitespaces::
+Tokenize text by omitting whitespaces::

text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว "

@@ -344,7 +344,7 @@ def sent_tokenize(
:param str text: the text to be tokenized
:param str engine: choose among *'crfcut'*, *'whitespace'*, \
*'whitespace+newline'*
-:return: list of splited sentences
+:return: list of split sentences
:rtype: list[str]
**Options for engine**
* *crfcut* - (default) split by CRF trained on TED dataset
@@ -722,7 +722,7 @@ class Tokenizer:

text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"

-# initate an object from file with `attacut` as tokenizer
+# initiate an object from file with `attacut` as tokenizer
_tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY, \\
engine='attacut')

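For context, the three APIs touched in this file compose as below; a sketch (the `attacut` engine needs an optional install, so `newmm` is used here, and the custom dictionary contents are illustrative):

```python
from pythainlp.tokenize import Tokenizer, sent_tokenize, word_tokenize

text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"

print(word_tokenize(text, engine="newmm"))       # dictionary-based default engine
print(sent_tokenize(text, engine="whitespace"))  # split on whitespace

# Initiate a tokenizer with a custom dictionary (an iterable of words);
# words outside this small dictionary will fall back to finer splits.
_tokenizer = Tokenizer(custom_dict={"อะเฟเซีย", "อาการ", "การพูด"}, engine="newmm")
print(_tokenizer.word_tokenize(text))
```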
2 changes: 1 addition & 1 deletion pythainlp/tokenize/multi_cut.py
@@ -94,7 +94,7 @@ def serialize(p, p2): # helper function
last_p = q0
elif len_q == 0: # len(q) == 0 means not found in dictionary
m = _PAT_NONTHAI.match(text[p:])
-if m: # non-Thai toekn
+if m: # non-Thai token
i = p + m.span()[1]
else: # non-Thai token, find minimum skip
for i in range(p, len_text):
4 changes: 2 additions & 2 deletions pythainlp/tokenize/nercut.py
@@ -46,8 +46,8 @@ def segment(
parts of the same named-entity.

:param str text: text to be tokenized to words
-:parm list taglist: a list of named-entity tags to be used
-:parm class tagger: ner tagger engine
+:param list taglist: a list of named-entity tags to be used
+:param class tagger: ner tagger engine
:return: list of words, tokenized from the text
"""
if not isinstance(text, str):
2 changes: 1 addition & 1 deletion pythainlp/tokenize/nlpo3.py
@@ -29,7 +29,7 @@
def load_dict(file_path: str, dict_name: str) -> bool:
"""Load a dictionary file into an in-memory dictionary collection.

-The loaded dictionary will be accessible throught the assigned dict_name.
+The loaded dictionary will be accessible throughout the assigned dict_name.
*** This function does not override an existing dict name. ***

:param file_path: Path to a dictionary file
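Finally, a hedged sketch pairing `load_dict` with the nlpO3 segmenter; the dictionary path is a placeholder, and passing the registered dict_name to `segment` via `custom_dict` is an assumption about this module's API:

```python
from pythainlp.tokenize.nlpo3 import load_dict, segment

# Register a word-per-line dictionary file under a name (placeholder path).
load_dict("/path/to/words_th.txt", "my_dict")

# Tokenize with the named dictionary.
print(segment("ถ้ารู้สึกดีก็บอกว่าดี", custom_dict="my_dict"))
```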