Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/notes/command_line.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ You can use some thainlp functions directly from command line.
word_level:precision 0.8173
word_level:recall 0.8314

**Misspell**::

thainlp misspell --file <input_file> [--seed <seed>] [--misspell-ratio <ratio>] [--output <output_file>]

*Example*::

$ thainlp misspell --file ./some/data.txt --seed=1 --misspell-ratio 0.05
# output file: ./some/data-misspelled-r0.05-seed1.txt

**Help**::

thainlp --help
2 changes: 1 addition & 1 deletion pythainlp/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8")

# a command should start with a verb when possible
# Names of the available sub-commands, kept in sorted order.
# The second assignment supersedes the first; it is the same list with
# the "misspell" sub-command added.
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark", "misspell"])

# Name of the command-line program (usage strings refer to it as "thainlp").
CLI_NAME = "thainlp"

Expand Down
69 changes: 69 additions & 0 deletions pythainlp/cli/misspell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

import argparse
import os
import random

from pythainlp.tools.misspell import misspell


class App:
    """Command-line handler for ``thainlp misspell``.

    Instantiating the class does all the work: it parses the arguments,
    reads the input file, passes every line through ``misspell`` and
    writes the result to the output file.
    """

    def __init__(self, argv):
        """Run the ``misspell`` sub-command.

        :param argv: full argument vector; the first two items (program
            name and sub-command name) are skipped before parsing.
        :raises SystemExit: raised by ``argparse`` (exit code 2) when the
            required ``--file`` option is missing or an option is invalid.
        """
        parser = argparse.ArgumentParser(
            prog="misspell",
            description="Generate misspelled texts from a given file.",
            usage=(
                "thainlp misspell --file <input_file> [--seed <seed>] "
                "[--misspell-ratio <ratio>] [--output <output_file>]\n\n"
                "Example:\n\n"
                "thainlp misspell --file ./some/data.txt --seed=1 "
                "--misspell-ratio 0.05\n\n"
                "--"
            ),
        )
        parser.add_argument(
            "--file",
            type=str,
            required=True,
            help="Path to the input file",
        )
        parser.add_argument(
            "--seed",
            type=int,
            default=None,
            help="Random seed for reproducibility",
        )
        parser.add_argument(
            "--misspell-ratio",
            type=float,
            default=0.05,
            help="Ratio of misspells per 100 characters",
        )
        parser.add_argument(
            "--output",
            type=str,
            default=None,
            help="Path to the output file",
        )

        args = parser.parse_args(argv[2:])

        if args.seed is not None:
            # NOTE(review): this seeds only the standard-library ``random``
            # generator. If ``misspell`` draws from another RNG (for
            # example NumPy's), ``--seed`` will not make runs
            # reproducible -- confirm against its implementation.
            random.seed(args.seed)

        with open(args.file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        # Lines keep their trailing newline, so writelines() below
        # reproduces the original line structure.
        misspelled_lines = [
            misspell(line, ratio=args.misspell_ratio) for line in lines
        ]

        output_path = args.output
        if output_path is None:
            output_path = self._default_output_path(
                args.file, args.misspell_ratio, args.seed
            )

        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(misspelled_lines)

    @staticmethod
    def _default_output_path(input_path, ratio, seed):
        """Derive the output path used when ``--output`` is not given.

        The name is built next to the input file as
        ``<base>-misspelled-r<ratio>[-seed<seed>]<ext>``. The seed suffix
        is added only when a seed was supplied, so an unseeded run no
        longer produces a file name containing the text ``seedNone``.
        """
        base, ext = os.path.splitext(input_path)
        seed_suffix = "" if seed is None else f"-seed{seed}"
        return f"{base}-misspelled-r{ratio}{seed_suffix}{ext}"
23 changes: 23 additions & 0 deletions tests/core/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pythainlp.cli.soundex import App as SoundexApp
from pythainlp.cli.tag import App as TagApp
from pythainlp.cli.tokenize import App as TokenizeApp
from pythainlp.cli.misspell import App as MisspellApp


class CliTestCase(unittest.TestCase):
Expand Down Expand Up @@ -139,3 +140,25 @@ def test_cli_tokenize(self):
]
)
)

def test_cli_misspell(self):
    """Check the ``misspell`` sub-command's argument handling and output.

    The fixture is copied into a temporary directory before the command
    runs, so the generated file is removed with the directory and never
    left behind in ``tests/data``.
    """
    import os
    import shutil
    import tempfile

    self.assertTrue(hasattr(cli, "misspell"))

    # The required --file option is missing: argparse exits with code 2.
    with self.assertRaises(SystemExit) as ex:
        MisspellApp(["thainlp", "misspell"])
    self.assertEqual(ex.exception.code, 2)

    with tempfile.TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, "text.txt")
        shutil.copyfile("./tests/data/text.txt", input_path)

        self.assertIsNotNone(
            MisspellApp(
                [
                    "thainlp",
                    "misspell",
                    "--file",
                    input_path,
                    "--seed",
                    "1",
                    "--misspell-ratio",
                    "0.05",
                ]
            )
        )

        # No --output was given, so the default name is derived from the
        # input path, the ratio and the seed.
        self.assertTrue(
            os.path.isfile(
                os.path.join(tmp_dir, "text-misspelled-r0.05-seed1.txt")
            )
        )
1 change: 1 addition & 0 deletions tests/data/text.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ผมไม่ชอบกินผัก ดังนั้นผมจึงมักจะเลือกทานอาหารที่มีเนื้อสัตว์เป็นส่วนใหญ่ อย่างไรก็ตาม ผมก็รู้ว่าการทานผักมีประโยชน์ต่อสุขภาพ ดังนั้นผมจึงพยายามทานผักบ้างในบางมื้อ แต่ผมก็ยังคงเลือกทานผักที่ผมชอบเท่านั้น อย่างเช่น ถั่วฝักยาว หรือ ถั่วฝักยาว ซึ่งผมคิดว่ามันก็เป็นผักที่อร่อยและมีประโยชน์ด้วย
Loading