From daa5b9c59f2a345db89bf6ad7c26b7418e67ffab Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Nov 2024 16:52:29 -0500 Subject: [PATCH 1/3] Do not allow for a word to start with punctuation symbols The use case that inspired me to look into this - https://github.com/INTERSECT-SDK/python-sdk/pull/19/files/33da9ff31d6162caa0dfc1a1155f321e6d68b1cc#diff-10380fd6e5ecb84c1ae11e135982739946c5aff1a50499378db397cf5034f54e And then I found the issue for this - Close #3305 Although maybe I am missing the use-cases/problems @DimitriPapadopoulos and @mdeweerd discussed back then --- codespell_lib/_codespell.py | 2 +- codespell_lib/tests/test_basic.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 8e9165ff95..08087f30af 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -53,7 +53,7 @@ __version__ as VERSION, # noqa: N812 ) -word_regex_def = r"[\w\-'’]+" # noqa: RUF001 +word_regex_def = r"\w[\w\-'’]*" # noqa: RUF001 # While we want to treat characters like ( or " as okay for a starting break, # these may occur unescaped in URIs, and so we are more restrictive on the # endpoint. Emails are more restrictive, so the endpoint remains flexible. 
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 74e10404e1..4176274f3f 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -51,6 +51,7 @@ def main( capsys = frame.f_locals["capsys"] stdout, stderr = capsys.readouterr() assert code in (EX_OK, EX_USAGE, EX_DATAERR, EX_CONFIG) + # print(f"stderr: {stderr}") if code == EX_DATAERR: # have some misspellings code = int(stderr.split("\n")[-2]) elif code == EX_OK and count: @@ -117,7 +118,7 @@ def test_basic( assert cs.main("--builtin", "clear,rare,names,informal", fname) == 4 with fname.open("w") as f: # overwrite the file f.write("var = 'nwe must check codespell likes escapes nin strings'\n") - assert cs.main(fname) == 1, "checking our string escape test word is bad" + assert cs.main(fname) == 2, "checking our string escape test word is bad" # the first one is missed because the apostrophe means its not currently # treated as a word on its own with fname.open("w") as f: # overwrite the file From 609f26960e74ca4e436aeb24cf66f84bb9b989df Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Nov 2024 17:12:06 -0500 Subject: [PATCH 2/3] Disallow trailing hyphens or apostrophes (and do not care about single character "words") --- codespell_lib/_codespell.py | 4 +++- codespell_lib/tests/test_basic.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 08087f30af..286f48003f 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -53,7 +53,9 @@ __version__ as VERSION, # noqa: N812 ) -word_regex_def = r"\w[\w\-'’]*" # noqa: RUF001 +# We do not care about single character words, and words should not +# have leading or trailing hyphens or apostrophes. 
+word_regex_def = r"\w[\w\-'’]*\w" # noqa: RUF001 # While we want to treat characters like ( or " as okay for a starting break, # these may occur unescaped in URIs, and so we are more restrictive on the # endpoint. Emails are more restrictive, so the endpoint remains flexible. diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 4176274f3f..62d1c58a82 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -119,6 +119,9 @@ def test_basic( with fname.open("w") as f: # overwrite the file f.write("var = 'nwe must check codespell likes escapes nin strings'\n") assert cs.main(fname) == 2, "checking our string escape test word is bad" + with fname.open("w") as f: # overwrite the file + f.write("fully 'nwe' quoted, or end nwe' quoted\n") + assert cs.main(fname) == 2, "fully or end quoted should be detected" # the first one is missed because the apostrophe means its not currently # treated as a word on its own with fname.open("w") as f: # overwrite the file From 7282837a4f3d20eb7e2e6adb4a8ff1da359da8c3 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Nov 2024 17:58:15 -0500 Subject: [PATCH 3/3] RF: make it explicit to allow words in single quotes and allow trailing quote --- codespell_lib/_codespell.py | 6 +++--- codespell_lib/tests/test_basic.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 286f48003f..be2a559b95 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -53,9 +53,9 @@ __version__ as VERSION, # noqa: N812 ) -# We do not care about single character words, and words should not -# have leading or trailing hyphens or apostrophes. -word_regex_def = r"\w[\w\-'’]*\w" # noqa: RUF001 +# Words could be surrounded in quotes, so we allow for that, but no nested quotes. +# Cannot have leading but can have trailing hyphens or apostrophes. 
+word_regex_def = r"(?<=')[\w\-’]+(?=')|(?<=’)[\w\-']+(?=’)|\w[\w\-'’]*" # noqa: RUF001 # While we want to treat characters like ( or " as okay for a starting break, # these may occur unescaped in URIs, and so we are more restrictive on the # endpoint. Emails are more restrictive, so the endpoint remains flexible. diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 62d1c58a82..3106b04b2b 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -120,8 +120,11 @@ def test_basic( f.write("var = 'nwe must check codespell likes escapes nin strings'\n") assert cs.main(fname) == 2, "checking our string escape test word is bad" with fname.open("w") as f: # overwrite the file - f.write("fully 'nwe' quoted, or end nwe' quoted\n") - assert cs.main(fname) == 2, "fully or end quoted should be detected" + f.write("fully 'nwe' quoted\n") + assert cs.main(fname) == 1, "fully quoted" + with fname.open("w") as f: # overwrite the file + f.write("only end nwe' quoted\n") + assert cs.main(fname) == 0, "only end quoted should be ok since we have werent'" # the first one is missed because the apostrophe means its not currently # treated as a word on its own with fname.open("w") as f: # overwrite the file