From 318bc34dfc5f5575be3b029a92b0ddf5eff8ffa8 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 8 Jan 2025 19:56:18 +0700 Subject: [PATCH 1/4] Add misspell command to CLI Fixes #615 Add command line interface for generating misspelled texts. * Add `misspell` command to `pythainlp/cli/__init__.py` and `pythainlp/__main__.py`. * Implement `App` class in `pythainlp/cli/misspell.py` to handle misspelling generation. * Add argument parsing for `--file`, `--seed`, `--misspell-ratio`, and `--output`. * Read input file, apply misspelling, and save output file. * Update `docs/notes/command_line.rst` to include documentation for the new `misspell` command. * Add tests for the new command in `tests/core/test_cli.py`. --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/PyThaiNLP/pythainlp/issues/615?shareId=XXXX-XXXX-XXXX-XXXX). --- docs/notes/command_line.rst | 9 +++++ pythainlp/__main__.py | 2 +- pythainlp/cli/__init__.py | 3 +- pythainlp/cli/misspell.py | 69 +++++++++++++++++++++++++++++++++++++ tests/core/test_cli.py | 23 +++++++++++++ 5 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 pythainlp/cli/misspell.py diff --git a/docs/notes/command_line.rst b/docs/notes/command_line.rst index 1deb5edf0..a0e91569b 100644 --- a/docs/notes/command_line.rst +++ b/docs/notes/command_line.rst @@ -94,6 +94,15 @@ You can use some thainlp functions directly from command line. 
word_level:precision 0.8173 word_level:recall 0.8314 +**Misspell**:: + + thainlp misspell --file <input_file> [--seed <seed>] [--misspell-ratio <ratio>] [--output <output_file>] + +*Example*:: + + $ thainlp misspell --file ./some/data.txt --seed=1 --misspell-ratio 0.05 + # output file: ./some/data-misspelled-r0.05-seed1.txt + **Help**:: thainlp --help diff --git a/pythainlp/__main__.py b/pythainlp/__main__.py index 75ae3a0eb..2c1c2b31c 100644 --- a/pythainlp/__main__.py +++ b/pythainlp/__main__.py @@ -26,7 +26,7 @@ def main(argv=None): parser.add_argument( "command", type=str, - choices=cli.COMMANDS, + choices=cli.COMMANDS + ["misspell"], help="text processing action", ) diff --git a/pythainlp/cli/__init__.py b/pythainlp/cli/__init__.py index da2b8775c..197bd9887 100644 --- a/pythainlp/cli/__init__.py +++ b/pythainlp/cli/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project # SPDX-FileType: SOURCE +# SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 """Command line helpers.""" @@ -12,7 +13,7 @@ sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8") # a command should start with a verb when possible -COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"]) +COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark", "misspell"]) CLI_NAME = "thainlp" diff --git a/pythainlp/cli/misspell.py b/pythainlp/cli/misspell.py new file mode 100644 index 000000000..4becb55ed --- /dev/null +++ b/pythainlp/cli/misspell.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +import random + +from pythainlp.tools.misspell import misspell + + +class App: + def __init__(self, argv): + parser = argparse.ArgumentParser( + prog="misspell", + description="Generate misspelled texts from a given file.", + usage=( + "thainlp misspell --file [--seed ] " + "[--misspell-ratio ] [--output ]\n\n" + 
"Example:\n\n" + "thainlp misspell --file ./some/data.txt --seed=1 " + "--misspell-ratio 0.05\n\n" + "--" + ), + ) + parser.add_argument( + "--file", + type=str, + required=True, + help="Path to the input file", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Random seed for reproducibility", + ) + parser.add_argument( + "--misspell-ratio", + type=float, + default=0.05, + help="Ratio of misspells per 100 characters", + ) + parser.add_argument( + "--output", + type=str, + default=None, + help="Path to the output file", + ) + + args = parser.parse_args(argv[2:]) + + if args.seed is not None: + random.seed(args.seed) + + with open(args.file, "r", encoding="utf-8") as f: + lines = f.readlines() + + misspelled_lines = [ + misspell(line, ratio=args.misspell_ratio) for line in lines + ] + + if args.output is None: + base, ext = os.path.splitext(args.file) + args.output = f"{base}-misspelled-r{args.misspell_ratio}-seed{args.seed}{ext}" + + with open(args.output, "w", encoding="utf-8") as f: + f.writelines(misspelled_lines) diff --git a/tests/core/test_cli.py b/tests/core/test_cli.py index 5fc3ed7f9..2470e4912 100644 --- a/tests/core/test_cli.py +++ b/tests/core/test_cli.py @@ -11,6 +11,7 @@ from pythainlp.cli.soundex import App as SoundexApp from pythainlp.cli.tag import App as TagApp from pythainlp.cli.tokenize import App as TokenizeApp +from pythainlp.cli.misspell import App as MisspellApp class CliTestCase(unittest.TestCase): @@ -139,3 +140,25 @@ def test_cli_tokenize(self): ] ) ) + + def test_cli_misspell(self): + self.assertTrue(hasattr(cli, "misspell")) + + with self.assertRaises(SystemExit) as ex: + MisspellApp(["thainlp", "misspell"]) + self.assertEqual(ex.exception.code, 2) + + self.assertIsNotNone( + MisspellApp( + [ + "thainlp", + "misspell", + "--file", + "./some/data.txt", + "--seed", + "1", + "--misspell-ratio", + "0.05", + ] + ) + ) From eae6478e607f309569e696ae38e61c1dacc45657 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Wed, 8 
Jan 2025 20:07:02 +0700 Subject: [PATCH 2/4] Update code --- pythainlp/cli/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pythainlp/cli/__init__.py b/pythainlp/cli/__init__.py index 197bd9887..27edb8708 100644 --- a/pythainlp/cli/__init__.py +++ b/pythainlp/cli/__init__.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project # SPDX-FileType: SOURCE -# SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 """Command line helpers.""" From 2b446af16fb5553eea4c4fb23fa843ab6ada272e Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Wed, 8 Jan 2025 20:12:12 +0700 Subject: [PATCH 3/4] Update test misspell cli --- tests/core/test_cli.py | 2 +- tests/data/text.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 tests/data/text.txt diff --git a/tests/core/test_cli.py b/tests/core/test_cli.py index 2470e4912..e12ce91c4 100644 --- a/tests/core/test_cli.py +++ b/tests/core/test_cli.py @@ -154,7 +154,7 @@ def test_cli_misspell(self): "thainlp", "misspell", "--file", - "./some/data.txt", + "./tests/data/text.txt", "--seed", "1", "--misspell-ratio", diff --git a/tests/data/text.txt b/tests/data/text.txt new file mode 100644 index 000000000..bed37d4b4 --- /dev/null +++ b/tests/data/text.txt @@ -0,0 +1 @@ +ผมไม่ชอบกินผัก ดังนั้นผมจึงมักจะเลือกทานอาหารที่มีเนื้อสัตว์เป็นส่วนใหญ่ อย่างไรก็ตาม ผมก็รู้ว่าการทานผักมีประโยชน์ต่อสุขภาพ ดังนั้นผมจึงพยายามทานผักบ้างในบางมื้อ แต่ผมก็ยังคงเลือกทานผักที่ผมชอบเท่านั้น อย่างเช่น ถั่วฝักยาว หรือ ถั่วฝักยาว ซึ่งผมคิดว่ามันก็เป็นผักที่อร่อยและมีประโยชน์ด้วย \ No newline at end of file From 3367c0334ba01f9e8da887581729d24c37094062 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Wed, 8 Jan 2025 20:15:23 +0700 Subject: [PATCH 4/4] Update cli --- pythainlp/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/__main__.py b/pythainlp/__main__.py index 2c1c2b31c..75ae3a0eb 100644 --- a/pythainlp/__main__.py +++ b/pythainlp/__main__.py @@ 
-26,7 +26,7 @@ def main(argv=None): parser.add_argument( "command", type=str, - choices=cli.COMMANDS + ["misspell"], + choices=cli.COMMANDS, help="text processing action", )