From 24f29353c8dd78511047d5368a2a0b75c1c89de7 Mon Sep 17 00:00:00 2001 From: Kian-Meng Ang Date: Tue, 17 Oct 2023 12:51:00 +0800 Subject: [PATCH] Fix typos Found via: - `codespell -S ./pythainlp/corpus -L etcc,tha,fo,ue,ist,ans` - `typos --format brief` --- CONTRIBUTING.md | 6 +++--- docs/api/tag.rst | 8 ++++---- docs/clean_directory.sh | 2 +- notebooks/test_el.ipynb | 2 +- pythainlp/benchmarks/word_tokenization.py | 12 ++++++------ pythainlp/corpus/conceptnet.py | 2 +- pythainlp/corpus/core.py | 4 ++-- pythainlp/corpus/util.py | 4 ++-- pythainlp/corpus/wordnet.py | 2 +- pythainlp/generate/core.py | 6 +++--- pythainlp/khavee/core.py | 8 ++++---- pythainlp/khavee/example.py | 4 ++-- pythainlp/spell/wanchanberta_thai_grammarly.py | 2 +- pythainlp/summarize/core.py | 2 +- pythainlp/tag/locations.py | 2 +- pythainlp/tag/named_entity.py | 4 ++-- pythainlp/tag/thainer.py | 8 ++++---- pythainlp/tag/tltk.py | 4 ++-- pythainlp/tokenize/core.py | 6 +++--- pythainlp/tokenize/multi_cut.py | 2 +- pythainlp/tokenize/nercut.py | 4 ++-- pythainlp/tokenize/nlpo3.py | 2 +- pythainlp/tools/misspell.py | 6 +++--- pythainlp/ulmfit/core.py | 4 ++-- pythainlp/ulmfit/preprocess.py | 2 +- pythainlp/ulmfit/tokenizer.py | 2 +- pythainlp/util/abbreviation.py | 2 +- pythainlp/util/keywords.py | 2 +- pythainlp/util/strftime.py | 4 ++-- pythainlp/util/syllable.py | 8 ++++---- pythainlp/wangchanberta/core.py | 8 ++++---- pythainlp/word_vector/core.py | 4 ++-- tests/test_benchmarks.py | 4 ++-- tests/test_khavee.py | 2 +- tests/test_tag.py | 8 ++++---- tokenization-benchmark.md | 2 +- 36 files changed, 77 insertions(+), 77 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 159a576b6..1a9e19c4c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -132,14 +132,14 @@ Make sure the same tests pass on Travis CI and AppVeyor. Thanks all the [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contributors). (Image made with [contributors-img](https://contributors-img.firebaseapp.com)) ### Development Lead -- Wannaphong Phatthiyaphaibun - founder, distribution and maintainance +- Wannaphong Phatthiyaphaibun - founder, distribution and maintenance - Korakot Chaovavanich - initial tokenization and soundex code - Charin Polpanumas - classification and benchmarking - Peeradej Tanruangporn - documentation -- Arthit Suriyawongkul - refactoring, packaging, distribution, and maintainance +- Arthit Suriyawongkul - refactoring, packaging, distribution, and maintenance - Chakri Lowphansirikul - documentation - Pattarawat Chormai - benchmarking -- Thanathip Suntorntip - nlpO3 maintainance, Rust Developer +- Thanathip Suntorntip - nlpO3 maintenance, Rust Developer - Can Udomcharoenchaikit - documentation and code ### Maintainers diff --git a/docs/api/tag.rst b/docs/api/tag.rst index e8769a225..4b9bdf11b 100644 --- a/docs/api/tag.rst +++ b/docs/api/tag.rst @@ -96,7 +96,7 @@ ORCHID corpus uses different set of POS tags. Thus, we make UD POS tags version The following table shows the mapping of POS tags from ORCHID to UD: =============== ======================= -ORCHID POS tags Coresponding UD POS tag +ORCHID POS tags Corresponding UD POS tag =============== ======================= NOUN NOUN NCMN NOUN @@ -165,7 +165,7 @@ Details about LST20 POS tags are available in [#Prachya_2020]_. 
The following table shows the mapping of POS tags from LST20 to UD: +----------------+-------------------------+ -| LST20 POS tags | Coresponding UD POS tag | +| LST20 POS tags | Corresponding UD POS tag | +================+=========================+ | AJ | ADJ | +----------------+-------------------------+ @@ -198,9 +198,9 @@ The following table shows the mapping of POS tags from LST20 to UD: | XX | X | +----------------+-------------------------+ -For the NE, we use `Inside-outside-beggining (IOB) `_ format to tag NE for each word. +For the NE, we use `Inside-outside-beginning (IOB) `_ format to tag NE for each word. -*B-* prefix indicates the begining token of the chunk. *I-* prefix indicates the intermediate token within the chunk. *O* indicates that the token does not belong to any NE chunk. +*B-* prefix indicates the beginning token of the chunk. *I-* prefix indicates the intermediate token within the chunk. *O* indicates that the token does not belong to any NE chunk. For instance, given a sentence "บารัค โอบามาเป็นประธานธิปดี", it would tag the tokens "บารัค", "โอบามา", "เป็น", "ประธานาธิปดี" with "B-PERSON", "I-PERSON", "O", and "O" respectively. diff --git a/docs/clean_directory.sh b/docs/clean_directory.sh index 5f7cb555e..a39243d2a 100644 --- a/docs/clean_directory.sh +++ b/docs/clean_directory.sh @@ -5,7 +5,7 @@ # $1 : FTP_USER # $2 : FTP_PASSWORD # $3 : FTP_HOST -# $4 : Brnach name +# $4 : Branch name FTP_USER=$1 FTP_PASSWORD=$2 diff --git a/notebooks/test_el.ipynb b/notebooks/test_el.ipynb index da6060583..71f72650d 100644 --- a/notebooks/test_el.ipynb +++ b/notebooks/test_el.ipynb @@ -39,7 +39,7 @@ " deprecation_warning(message=message)\n", "/usr/local/lib/python3.8/dist-packages/hydra/experimental/initialize.py:82: UserWarning: \n", "The version_base parameter is not specified.\n", - "Please specify a compatability version level, or None.\n", + "Please specify a compatibility version level, or None.\n", "Will assume defaults for version 1.1\n", " self.delegate = real_initialize_config_module(\n", "/usr/local/lib/python3.8/dist-packages/hydra/experimental/compose.py:25: UserWarning: hydra.experimental.compose() is no longer experimental. Use hydra.compose()\n", diff --git a/pythainlp/benchmarks/word_tokenization.py b/pythainlp/benchmarks/word_tokenization.py index acee19cd4..d9ad37029 100644 --- a/pythainlp/benchmarks/word_tokenization.py +++ b/pythainlp/benchmarks/word_tokenization.py @@ -80,7 +80,7 @@ def _flatten_result(my_dict: dict, sep: str = ":") -> dict: def benchmark(ref_samples: List[str], samples: List[str]) -> pd.DataFrame: """ - Performace benchmark of samples. + Performance benchmark of samples. Please see :meth:`pythainlp.benchmarks.word_tokenization.compute_stats` for metrics being computed. @@ -157,7 +157,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict: **Word-Level**: Precision, Recall, and f1 **Other**: - - Correct tokenization indicator: {0, 1} sequence indicating the correspoding + - Correct tokenization indicator: {0, 1} sequence indicating the corresponding word is tokenized correctly. 
:param str ref_sample: ground truth samples
@@ -169,7 +169,7 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
ref_sample = _binary_representation(ref_sample)
sample = _binary_representation(raw_sample)
- # Compute charater-level statistics
+ # Compute character-level statistics
c_pos_pred, c_neg_pred = np.argwhere(sample == 1), np.argwhere(sample == 0)
c_pos_pred = c_pos_pred[c_pos_pred < ref_sample.shape[0]]
@@ -188,10 +188,10 @@ def compute_stats(ref_sample: str, raw_sample: str) -> dict:
# Compute word-level statistics
# Find correctly tokenized words in the reference sample
- word_boundaries = _find_word_boudaries(ref_sample)
+ word_boundaries = _find_word_boundaries(ref_sample)
# Find correctly tokenized words in the sample
- ss_boundaries = _find_word_boudaries(sample)
+ ss_boundaries = _find_word_boundaries(sample)
tokenization_indicators = _find_words_correctly_tokenised(
word_boundaries, ss_boundaries
)
@@ -253,7 +253,7 @@ def _binary_representation(txt: str, verbose: bool = False):
return bin_rept
-def _find_word_boudaries(bin_reps) -> list:
+def _find_word_boundaries(bin_reps) -> list:
"""
Find start and end location of each word.
diff --git a/pythainlp/corpus/conceptnet.py b/pythainlp/corpus/conceptnet.py
index 943a9bf43..05aa021f9 100644
--- a/pythainlp/corpus/conceptnet.py
+++ b/pythainlp/corpus/conceptnet.py
@@ -26,7 +26,7 @@ def edges(word: str, lang: str = "th"):
For example, the term "ConceptNet" is a "knowledge graph", and
"knowledge graph" has "common sense knowledge" which is a part of
- "artificial inteligence". Also, "ConcepNet" is used for
+ "artificial intelligence". Also, "ConceptNet" is used for
"natural language understanding" which is a part of
"artificial intelligence".
diff --git a/pythainlp/corpus/core.py b/pythainlp/corpus/core.py
index 148728ec7..b7ed4eed1 100644
--- a/pythainlp/corpus/core.py
+++ b/pythainlp/corpus/core.py
@@ -360,7 +360,7 @@ def download(
:param bool force: force download
:param str url: URL of the corpus catalog
:param str version: Version of the corpus
- :return: **True** if the corpus is found and succesfully downloaded.
+ :return: **True** if the corpus is found and successfully downloaded.
Otherwise, it returns **False**.
:rtype: bool
@@ -509,7 +509,7 @@ def remove(name: str) -> bool:
Remove corpus
:param str name: corpus name
- :return: **True** if the corpus is found and succesfully removed.
+ :return: **True** if the corpus is found and successfully removed.
Otherwise, it returns **False**.
:rtype: bool
diff --git a/pythainlp/corpus/util.py b/pythainlp/corpus/util.py
index 75a917db5..c8816b4eb 100644
--- a/pythainlp/corpus/util.py
+++ b/pythainlp/corpus/util.py
@@ -33,7 +33,7 @@ def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]:
"""
- Return begining and ending index pairs of words
+ Return beginning and ending index pairs of words
"""
i = 0
for w in words:
@@ -86,7 +86,7 @@ def revise_wordset(
Revise a set of word that could improve tokenization performance of
a dictionary-based `tokenize` function.
- `orign_words` will be used as a base set for the dictionary.
+ `origin_words` will be used as a base set for the dictionary.
Words that do not performed well with `training_data` will be removed.
The remaining words will be returned.
diff --git a/pythainlp/corpus/wordnet.py b/pythainlp/corpus/wordnet.py index 07a470d0e..4b030f852 100644 --- a/pythainlp/corpus/wordnet.py +++ b/pythainlp/corpus/wordnet.py @@ -67,7 +67,7 @@ def synsets(word: str, pos: str = None, lang: str = "tha"): Synset('houseful.n.01'), Synset('home.n.07')] When specifying the part of speech constrain. For example, - the word "แรง" cound be interpreted as force (n.) or hard (adj.). + the word "แรง" could be interpreted as force (n.) or hard (adj.). >>> from pythainlp.corpus.wordnet import synsets >>> # By default, accept all part of speech diff --git a/pythainlp/generate/core.py b/pythainlp/generate/core.py index 49331b21b..fd5f1a6d7 100644 --- a/pythainlp/generate/core.py +++ b/pythainlp/generate/core.py @@ -78,7 +78,7 @@ def gen_sentence( gen = Unigram() gen.gen_sentence("แมว") - # ouput: 'แมวเวลานะนั้น' + # output: 'แมวเวลานะนั้น' """ if start_seq is None: start_seq = random.choice(self.word) @@ -174,7 +174,7 @@ def gen_sentence( gen = Bigram() gen.gen_sentence("แมว") - # ouput: 'แมวไม่ได้รับเชื้อมัน' + # output: 'แมวไม่ได้รับเชื้อมัน' """ if start_seq is None: start_seq = random.choice(self.words) @@ -269,7 +269,7 @@ def gen_sentence( gen = Trigram() gen.gen_sentence() - # ouput: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ' + # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ' """ if start_seq is None: start_seq = random.choice(self.bi_keys) diff --git a/pythainlp/khavee/core.py b/pythainlp/khavee/core.py index 536e1f1c4..3ed01a46e 100644 --- a/pythainlp/khavee/core.py +++ b/pythainlp/khavee/core.py @@ -86,7 +86,7 @@ def check_sara(self, word: str)-> str: sara.append('อำ') else: sara.append('อะ') - # Incase ออ + # In case ออ if countoa == 1 and 'อ' in word[-1] and 'เ' not in word: sara.remove('ออ') # In case เอ เอ @@ -183,7 +183,7 @@ def check_sara(self, word: str)-> str: elif sara == [] and len(word) == 3: sara.append('ออ') - # incase บ่ + # in case บ่ if 'บ่' == word: sara = [] sara.append('ออ') @@ -257,7 +257,7 @@ def is_sumpus(self, word1: str,word2: str) -> bool: :param str word1: Thai word :param str word2: Thai word - :return: boolen + :return: boolean :rtype: bool :Example: @@ -320,7 +320,7 @@ def check_klon(self, text: str,k_type: int=8) -> Union[List[str], str]: # output: The poem is correct according to the principle. 
print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4))
- # # -> ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
+ # # -> ["Can't find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Can't find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
"""
if k_type == 8:
try:
diff --git a/pythainlp/khavee/example.py b/pythainlp/khavee/example.py
index b6dfba79c..21ea2c377 100644
--- a/pythainlp/khavee/example.py
+++ b/pythainlp/khavee/example.py
@@ -51,7 +51,7 @@
อ่านตำราหาความรู้ไม่ละทอน เป็นคนดีศรีนครของจิตรลดา ภัสนันท์นาคลออหรือมีมี่ เรื่องเกมเอ่อเก่งกาจไม่กังขา เกมอะไรก็เล่นได้ไม่ลดวา สุดฉลาดมากปัญญามาครบครัน''',k_type=8))
-# -> ["Cant find rhyme between paragraphs ('สอน', 'ไป')in paragraph 4", "Cant find rhyme between paragraphs ('มี่', ['เกม', 'เอ่อ', 'เก่ง', 'กาจ'])in paragraph 5"]
+# -> ["Can't find rhyme between paragraphs ('สอน', 'ไป')in paragraph 4", "Can't find rhyme between paragraphs ('มี่', ['เกม', 'เอ่อ', 'เก่ง', 'กาจ'])in paragraph 5"]
# การตรวจสอบกลอน 4 ที่ถูกฉันทลักษณ์
@@ -60,7 +60,7 @@
# การตรวจสอบกลอน 4 ที่ผิดฉันทลักษณ์
print(kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4))
-# -> ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
+# -> ["Can't find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Can't find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"]
# การเช็คคำเอกโท
print(kv.check_aek_too('เอง'), kv.check_aek_too('เอ่ง'), kv.check_aek_too('เอ้ง'))
diff --git a/pythainlp/spell/wanchanberta_thai_grammarly.py b/pythainlp/spell/wanchanberta_thai_grammarly.py
index 9004d8838..f284f2133 100644
--- a/pythainlp/spell/wanchanberta_thai_grammarly.py
+++ b/pythainlp/spell/wanchanberta_thai_grammarly.py
@@ -68,7 +68,7 @@ def evaluate_one_text(model, sentence):
input_id = text['input_ids'][0].unsqueeze(0).to(device)
label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)
# print(f"input_ids: {input_id}")
- # print(f"attnetion_mask: {mask}")
+ # print(f"attention_mask: {mask}")
# print(f"label_ids: {label_ids}")
logits = tagging_model(input_id, mask, None)
diff --git a/pythainlp/summarize/core.py b/pythainlp/summarize/core.py
index 26995dcc1..57971d19a 100644
--- a/pythainlp/summarize/core.py
+++ b/pythainlp/summarize/core.py
@@ -41,7 +41,7 @@ def summarize(
Then, computes frequencies of tokenized words (with :func:`pythainlp.tokenize.word_tokenize`) in all sentences and normalized with maximum word frequency. The words with normalized
- frequncy that are less than 0.1 or greater than 0.9 will be
+ frequency that are less than 0.1 or greater than 0.9 will be
filtered out from frequency dictionary. Finally, it picks *n* sentences with highest sum of normalized frequency from all words in the sentence and also appear in the frequency dictionary.
diff --git a/pythainlp/tag/locations.py b/pythainlp/tag/locations.py
index c44a2ee8f..09a984e18 100644
--- a/pythainlp/tag/locations.py
+++ b/pythainlp/tag/locations.py
@@ -28,7 +28,7 @@ def tag_provinces(tokens: List[str]) -> List[Tuple[str, str]]:
Note that it uses exact match and considers no context.
:param list[str] tokens: a list of words - :reutrn: a list of tuple indicating NER for `LOCATION` in IOB format + :return: a list of tuple indicating NER for `LOCATION` in IOB format :rtype: list[tuple[str, str]] :Example: diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index da9fab0f1..a253cb019 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -69,7 +69,7 @@ def tag( self, text, pos=False, tag=False ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ - This function tags named-entitiy from text in IOB format. + This function tags named-entity from text in IOB format. :param str text: text in Thai to be tagged :param bool pos: output with part-of-speech tag.\ @@ -122,7 +122,7 @@ def load_engine(self, engine: str = "thai_nner") -> None: def tag(self, text) -> Tuple[List[str], List[dict]]: """ - This function tags nested named-entitiy. + This function tags nested named-entity. :param str text: text in Thai to be tagged diff --git a/pythainlp/tag/thainer.py b/pythainlp/tag/thainer.py index 62e2453ee..cd8bb4e97 100644 --- a/pythainlp/tag/thainer.py +++ b/pythainlp/tag/thainer.py @@ -89,7 +89,7 @@ class ThaiNameTagger: This function support Thai NER 1.4 and 1.5 only. :param str version: Thai NER version. It's support Thai NER 1.4 & 1.5. - The defualt value is `1.4 + The default value is `1.4 :Example: :: @@ -106,7 +106,7 @@ def __init__(self, version: str = "1.4") -> None: :param str version: Thai NER version. It's support Thai NER 1.4 & 1.5. - The defualt value is `1.4` + The default value is `1.4` """ from pycrfsuite import Tagger as CRFTagger @@ -123,11 +123,11 @@ def get_ner( self, text: str, pos: bool = True, tag: bool = False ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]: """ - This function tags named-entitiy from text in IOB format. + This function tags named-entity from text in IOB format. :param str text: text in Thai to be tagged :param bool pos: To include POS tags in the results (`True`) or - exclude (`False`). The defualt value is `True` + exclude (`False`). The default value is `True` :param bool tag: output like html tag. :return: a list of tuple associated with tokenized word, NER tag, POS tag (if the parameter `pos` is specified as `True`), diff --git a/pythainlp/tag/tltk.py b/pythainlp/tag/tltk.py index b729629f8..38f23c722 100644 --- a/pythainlp/tag/tltk.py +++ b/pythainlp/tag/tltk.py @@ -39,11 +39,11 @@ def get_ner( """ Named-entity recognizer from **TLTK** - This function tags named-entitiy from text in IOB format. + This function tags named-entity from text in IOB format. :param str text: text in Thai to be tagged :param bool pos: To include POS tags in the results (`True`) or - exclude (`False`). The defualt value is `True` + exclude (`False`). The default value is `True` :param bool tag: output like html tag. 
:return: a list of tuple associated with tokenized word, NER tag, POS tag (if the parameter `pos` is specified as `True`),
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index ea33d00df..471036085 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -199,7 +199,7 @@ def word_tokenize(
word_tokenize(text, engine='attacut')
# output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
- Tokenize text by omiting whitespaces::
+ Tokenize text by omitting whitespaces::
text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว "
@@ -344,7 +344,7 @@ def sent_tokenize(
:param str text: the text to be tokenized
:param str engine: choose among *'crfcut'*, *'whitespace'*, \ *'whitespace+newline'*
- :return: list of splited sentences
+ :return: list of split sentences
:rtype: list[str]
**Options for engine**
* *crfcut* - (default) split by CRF trained on TED dataset
@@ -722,7 +722,7 @@ class Tokenizer:
text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"
- # initate an object from file with `attacut` as tokenizer
+ # initiate an object from file with `attacut` as tokenizer
_tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY, \\ engine='attacut')
diff --git a/pythainlp/tokenize/multi_cut.py b/pythainlp/tokenize/multi_cut.py
index 45c077ce0..51ca3ae66 100644
--- a/pythainlp/tokenize/multi_cut.py
+++ b/pythainlp/tokenize/multi_cut.py
@@ -94,7 +94,7 @@ def serialize(p, p2): # helper function
last_p = q0
elif len_q == 0: # len(q) == 0 means not found in dictionary
m = _PAT_NONTHAI.match(text[p:])
- if m: # non-Thai toekn
+ if m: # non-Thai token
i = p + m.span()[1]
else: # non-Thai token, find minimum skip
for i in range(p, len_text):
diff --git a/pythainlp/tokenize/nercut.py b/pythainlp/tokenize/nercut.py
index 70baaccaa..dc664ba74 100644
--- a/pythainlp/tokenize/nercut.py
+++ b/pythainlp/tokenize/nercut.py
@@ -46,8 +46,8 @@ def segment(
parts of the same named-entity.
:param str text: text to be tokenized to words
- :parm list taglist: a list of named-entity tags to be used
- :parm class tagger: ner tagger engine
+ :param list taglist: a list of named-entity tags to be used
+ :param class tagger: ner tagger engine
:return: list of words, tokenized from the text
"""
if not isinstance(text, str):
diff --git a/pythainlp/tokenize/nlpo3.py b/pythainlp/tokenize/nlpo3.py
index 314a59dfa..fd80e20fd 100644
--- a/pythainlp/tokenize/nlpo3.py
+++ b/pythainlp/tokenize/nlpo3.py
@@ -29,7 +29,7 @@ def load_dict(file_path: str, dict_name: str) -> bool:
"""Load a dictionary file into an in-memory dictionary collection.
- The loaded dictionary will be accessible throught the assigned dict_name.
+ The loaded dictionary will be accessible through the assigned dict_name.
*** This function does not override an existing dict name. ***
:param file_path: Path to a dictionary file
diff --git a/pythainlp/tools/misspell.py b/pythainlp/tools/misspell.py
index 359a181e6..35c2bfc58 100644
--- a/pythainlp/tools/misspell.py
+++ b/pythainlp/tools/misspell.py
@@ -115,10 +115,10 @@ def find_misspell_candidates(char: str, verbose: bool = False):
def misspell(sentence: str, ratio: float = 0.05):
"""
- Simulate some mispellings for the input sentence.
- The number of mispelled locations is governed by ratio.
+ Simulate some misspellings for the input sentence.
+ The number of misspelled locations is governed by ratio.
- :params str sentence: sentence to be mispelled
+ :params str sentence: sentence to be misspelled
:params float ratio: number of misspells per 100 chars. Defaults to 0.5.
:return: sentence containing some misspelled diff --git a/pythainlp/ulmfit/core.py b/pythainlp/ulmfit/core.py index 85d09e044..65183a37c 100644 --- a/pythainlp/ulmfit/core.py +++ b/pythainlp/ulmfit/core.py @@ -116,7 +116,7 @@ def process_thai( [บ้าน', 'xxrep', ' ', 'อยู่', 'xxwrep', 'นาน', '😂', '🤣', '😃', '😄', '😅', 'pythainlp', '&'] - 2. Modify pre_rules and post_rules arugments with + 2. Modify pre_rules and post_rules arguments with rules provided in :mod:`pythainlp.ulmfit`: >>> from pythainlp.ulmfit import ( @@ -161,7 +161,7 @@ def document_vector(text: str, learn, data, agg: str = "mean"): :param learn: :class:`fastai` language model learner :param data: :class:`fastai` data bunch :param str agg: name of aggregation methods for word embeddings - The avialable methods are "mean" and "sum" + The available methods are "mean" and "sum" :return: :class:`numpy.array` of document vector sized 400 based on the encoder of the model diff --git a/pythainlp/ulmfit/preprocess.py b/pythainlp/ulmfit/preprocess.py index 60974dad0..b725ef778 100644 --- a/pythainlp/ulmfit/preprocess.py +++ b/pythainlp/ulmfit/preprocess.py @@ -169,7 +169,7 @@ def rm_brackets(text: str) -> str: new_line = re.sub(r"\(\)", "", text) new_line = re.sub(r"\{\}", "", new_line) new_line = re.sub(r"\[\]", "", new_line) - # brakets with only punctuations + # brackets with only punctuations new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line) new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line) new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line) diff --git a/pythainlp/ulmfit/tokenizer.py b/pythainlp/ulmfit/tokenizer.py index d738c9d86..e4e9af716 100644 --- a/pythainlp/ulmfit/tokenizer.py +++ b/pythainlp/ulmfit/tokenizer.py @@ -48,7 +48,7 @@ def tokenizer(text: str) -> List[str]: """ This function tokenizes text with *newmm* engine and the dictionary specifically for `ulmfit` related functions - (see: `Dictonary file (.txt) \ + (see: `Dictionary file (.txt) \ `_). :meth: tokenize text with a frozen newmm engine :param str text: text to tokenize diff --git a/pythainlp/util/abbreviation.py b/pythainlp/util/abbreviation.py index 9e5b1dc18..24042ea9a 100644 --- a/pythainlp/util/abbreviation.py +++ b/pythainlp/util/abbreviation.py @@ -48,7 +48,7 @@ def abbreviation_to_full_text(text: str, top_k: int=2) -> List[Tuple[str, Union[ except ImportError: raise ImportError( """ - This funtion need to use khamyo. + This function need to use khamyo. You can install by pip install khamyo or pip install pythainlp[abbreviation]. """ diff --git a/pythainlp/util/keywords.py b/pythainlp/util/keywords.py index 6fda66048..53cfe4fd5 100644 --- a/pythainlp/util/keywords.py +++ b/pythainlp/util/keywords.py @@ -88,7 +88,7 @@ def find_keyword(word_list: List[str], min_len: int = 3) -> Dict[str, int]: where stopword is excluded and returns as a frequency dictionary. :param list word_list: a list of words - :param int min_len: the mininum frequency for words to obtain + :param int min_len: the minimum frequency for words to obtain :return: a dictionary object with key-value pair as word and its raw count :rtype: dict[str, int] diff --git a/pythainlp/util/strftime.py b/pythainlp/util/strftime.py index 197cf68b0..cb7c688be 100644 --- a/pythainlp/util/strftime.py +++ b/pythainlp/util/strftime.py @@ -193,13 +193,13 @@ def thai_strftime( * The Thai Buddhist Era (BE) year is simply converted from AD by adding 543. This is certainly not accurate for years before 1941 AD, due to the change in Thai New Year's Day. 
- * This meant to be an interrim solution, since + * This meant to be an interim solution, since Python standard's locale module (which relied on C's strftime()) does not support "th" or "th_TH" locale yet. If supported, we can just locale.setlocale(locale.LC_TIME, "th_TH") and then use native datetime.strftime(). - We trying to make this platform-independent and support extentions + We trying to make this platform-independent and support extensions as many as possible. See these links for strftime() extensions in POSIX, BSD, and GNU libc: diff --git a/pythainlp/util/syllable.py b/pythainlp/util/syllable.py index ca894fcc8..b38288d7d 100644 --- a/pythainlp/util/syllable.py +++ b/pythainlp/util/syllable.py @@ -244,7 +244,7 @@ def tone_detector(syllable: str) -> str: initial_consonant = consonants[0] tone_mark = _tone_mark_detector(syllable) syllable_check = syllable_open_close_detector(syllable) - syllable_check_lenght = syllable_length(syllable) + syllable_check_length = syllable_length(syllable) initial_consonant_type = thai_initial_consonant_to_type[initial_consonant] # r for store value r = "" @@ -281,21 +281,21 @@ def tone_detector(syllable: str) -> str: r = "r" elif ( initial_consonant_type == "low" - and syllable_check_lenght == "short" + and syllable_check_length == "short" and syllable_check == "close" and s == "dead" ): r = "h" elif ( initial_consonant_type == "low" - and syllable_check_lenght == "long" + and syllable_check_length == "long" and syllable_check == "close" and s == "dead" ): r = "f" elif ( initial_consonant_type == "low" - and syllable_check_lenght == "short" + and syllable_check_length == "short" and syllable_check == "open" ): r = "h" diff --git a/pythainlp/wangchanberta/core.py b/pythainlp/wangchanberta/core.py index 0abfe2066..0e57bee1c 100644 --- a/pythainlp/wangchanberta/core.py +++ b/pythainlp/wangchanberta/core.py @@ -34,7 +34,7 @@ def __init__( self, dataset_name: str = "thainer", grouped_entities: bool = True ): """ - This function tags named-entitiy from text in IOB format. + This function tags named-entity from text in IOB format. Powered by wangchanberta from VISTEC-depa\ AI Research Institute of Thailand @@ -66,7 +66,7 @@ def get_ner( self, text: str, pos: bool= False,tag: bool = False ) -> Union[List[Tuple[str, str]], str]: """ - This function tags named-entitiy from text in IOB format. + This function tags named-entity from text in IOB format. Powered by wangchanberta from VISTEC-depa\ AI Research Institute of Thailand @@ -141,7 +141,7 @@ def get_ner( class NamedEntityRecognition: def __init__(self, model: str ="pythainlp/thainer-corpus-v2-base-model") -> None: """ - This function tags named-entitiy from text in IOB format. + This function tags named-entity from text in IOB format. Powered by wangchanberta from VISTEC-depa\ AI Research Institute of Thailand @@ -169,7 +169,7 @@ def get_ner( self, text: str, pos: bool= False,tag: bool = False ) -> Union[List[Tuple[str, str]], str]: """ - This function tags named-entitiy from text in IOB format. + This function tags named-entity from text in IOB format. Powered by wangchanberta from VISTEC-depa\ AI Research Institute of Thailand diff --git a/pythainlp/word_vector/core.py b/pythainlp/word_vector/core.py index b542da713..56955d547 100644 --- a/pythainlp/word_vector/core.py +++ b/pythainlp/word_vector/core.py @@ -135,7 +135,7 @@ def most_similar_cosmul( :mod:`gensim`. 
:param list positive: a list of words to add - :param list negative: a list of words to substract + :param list negative: a list of words to subtract :raises KeyError: if there is any word in `positive` or `negative` not in the vocabulary of the model. @@ -270,7 +270,7 @@ def sentence_vectorizer(self, text: str, use_mean: bool = True) -> ndarray: Specifically, it first tokenize that text and map each tokenized words with the word vectors from the model. Then, word vectors are aggregatesd into one vector of 300 dimension - by calulating either mean, or summation of all word vectors. + by calculating either mean, or summation of all word vectors. :param str text: text input :param bool use_mean: if `True` aggregate word vectors with mean of all diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index be6e33dde..bde4a4612 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -59,8 +59,8 @@ def test_count_correctly_tokenised_words(self): sample = np.array(list(d["actual"])).astype(int) ref_sample = np.array(list(d["expected"])).astype(int) - sb = list(word_tokenization._find_word_boudaries(sample)) - rb = list(word_tokenization._find_word_boudaries(ref_sample)) + sb = list(word_tokenization._find_word_boundaries(sample)) + rb = list(word_tokenization._find_word_boundaries(ref_sample)) # in binary [{0, 1}, ...] correctly_tokenized_words = word_tokenization._find_words_correctly_tokenised( diff --git a/tests/test_khavee.py b/tests/test_khavee.py index f2cbd2e0a..821bcea9f 100644 --- a/tests/test_khavee.py +++ b/tests/test_khavee.py @@ -23,7 +23,7 @@ def test_check_klon(self): ) self.assertEqual( kv.check_klon('''ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร''',k_type=4), - ["Cant find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Cant find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"] + ["Can't find rhyme between paragraphs ('หมา', 'จอง')in paragraph 2", "Can't find rhyme between paragraphs ('หมา', 'ทอง')in paragraph 2"] ) def test_check_aek_too(self): diff --git a/tests/test_tag.py b/tests/test_tag.py index f8172446d..b16621918 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -212,7 +212,7 @@ def test_ner(self): ) ) - # arguement `tag` is True + # argument `tag` is True self.assertIsNotNone( ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True) ) @@ -239,7 +239,7 @@ def test_ner(self): ) ) - # arguement `tag` is True + # argument `tag` is True self.assertEqual( ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", tag=True), "วันที่ 15 ก.ย. 61 " @@ -297,13 +297,13 @@ def test_ner(self): ner.get_ner("บางแสนกรุงเทพ", pos=False, tag=True) ) - # arguement `tag` is False and `pos` is True + # argument `tag` is False and `pos` is True self.assertEqual( ner.get_ner("ไทย", pos=True, tag=False), [('ไทย', 'PROPN', 'B-LOCATION')], ) - # arguement `tag` is False and `pos` is False + # argument `tag` is False and `pos` is False self.assertIsNotNone( ner.get_ner( "วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", diff --git a/tokenization-benchmark.md b/tokenization-benchmark.md index 42ae22df0..bf64e8d84 100644 --- a/tokenization-benchmark.md +++ b/tokenization-benchmark.md @@ -1,7 +1,7 @@ # Word Tokenisation Benchmark for Thai (obsolete) A framework for benchmarking tokenisation algorithms for Thai. 
-It has a command-line interface that allows users to conviniently execute the benchmarks +It has a command-line interface that allows users to conveniently execute the benchmarks as well as a module interface for later use in their development pipelines.
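For reference, the module interface mentioned in the last hunk can be exercised roughly as in the minimal sketch below (not part of the patch). The sample sentences and the `|` word-separator convention are illustrative assumptions; only the `benchmark()` function and its DataFrame return type come from the docstrings touched above.

```python
# Minimal sketch of the pythainlp.benchmarks.word_tokenization module interface.
# The sample strings and the "|" token separator are illustrative assumptions.
from pythainlp.benchmarks import word_tokenization

ref_samples = ["ผม|ไม่|ชอบ|กิน|ผัก"]  # hypothetical ground-truth segmentation
samples = ["ผม|ไม่|ชอบ|กินผัก"]       # hypothetical tokenizer output to evaluate

# benchmark() returns a pandas DataFrame of per-sample statistics
# (character- and word-level precision/recall/f1; see compute_stats() above).
df = word_tokenization.benchmark(ref_samples, samples)
print(df.describe())
```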