class App:
    """Command-line handler for ``thainlp misspell``.

    Instantiating the class runs the whole command: it parses ``argv``,
    reads the input file, passes every line through ``misspell`` and writes
    the resulting lines to the output file.
    """

    def __init__(self, argv):
        """Parse ``argv`` and write a misspelled copy of the input file.

        :param argv: full command line, e.g.
            ``["thainlp", "misspell", "--file", "data.txt"]``; the first two
            items (program name and command name) are skipped.
        :raises SystemExit: raised by ``argparse`` when the required
            ``--file`` option is missing or an option value cannot be parsed.
        """
        parser = argparse.ArgumentParser(
            prog="misspell",
            description="Generate misspelled texts from a given file.",
            usage=(
                "thainlp misspell --file [--seed ] "
                "[--misspell-ratio ] [--output ]\n\n"
                "Example:\n\n"
                "thainlp misspell --file ./some/data.txt --seed=1 "
                "--misspell-ratio 0.05\n\n"
                "--"
            ),
        )
        parser.add_argument(
            "--file",
            type=str,
            required=True,
            help="Path to the input file",
        )
        parser.add_argument(
            "--seed",
            type=int,
            default=None,
            help="Random seed for reproducibility",
        )
        parser.add_argument(
            "--misspell-ratio",
            type=float,
            default=0.05,
            help="Ratio of misspells per 100 characters",
        )
        parser.add_argument(
            "--output",
            type=str,
            default=None,
            help="Path to the output file",
        )

        args = parser.parse_args(argv[2:])

        if args.seed is not None:
            # NOTE(review): this seeds the stdlib generator only; confirm
            # that misspell() draws from `random`, otherwise --seed does not
            # make the output reproducible.
            random.seed(args.seed)

        with open(args.file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Lines keep their trailing newline, so writelines() below restores
        # the original line structure.
        misspelled_lines = [
            misspell(line, ratio=args.misspell_ratio) for line in lines
        ]

        output_path = args.output
        if output_path is None:
            output_path = self._default_output_path(
                args.file, args.misspell_ratio, args.seed
            )

        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(misspelled_lines)

    @staticmethod
    def _default_output_path(file, misspell_ratio, seed=None):
        """Build the output path used when ``--output`` is not given.

        The name is derived from the input path by inserting
        ``-misspelled-r<ratio>`` before the extension, followed by
        ``-seed<seed>`` only when a seed was supplied, so an unseeded run
        no longer produces a file named ``...-seedNone``.

        :param file: path to the input file
        :param misspell_ratio: ratio value embedded in the file name
        :param seed: random seed, or None when no seed was given
        :return: path to the output file
        """
        base, ext = os.path.splitext(file)
        suffix = f"-misspelled-r{misspell_ratio}"
        if seed is not None:
            suffix += f"-seed{seed}"
        return f"{base}{suffix}{ext}"
ถั่วฝักยาว หรือ ถั่วฝักยาว ซึ่งผมคิดว่ามันก็เป็นผักที่อร่อยและมีประโยชน์ด้วย \ No newline at end of file