From 411b6e40d04e942b30704b643e7b6fd8d7e10702 Mon Sep 17 00:00:00 2001 From: un-pogaz <46523284+un-pogaz@users.noreply.github.com> Date: Mon, 24 Mar 2025 08:03:43 +0100 Subject: [PATCH 1/3] New option: per-file-ignores (#3673) --- README.rst | 26 +++ codespell_lib/_codespell.py | 253 ++++++++++++++++++++++-------- codespell_lib/tests/test_basic.py | 67 +++++++- 3 files changed, 283 insertions(+), 63 deletions(-) diff --git a/README.rst b/README.rst index 8e256dc923..f5cbfe7903 100644 --- a/README.rst +++ b/README.rst @@ -156,6 +156,26 @@ Words should be separated by a comma. def wrod(wrods) # codespell:ignore pass +Per-file ignores +---------------- + +To give a finer control, is possible to specified a additional set of words to ignore into a specific file only. + +1. ``--per-file-ignores``: A pair of arguments into the command line. The first provide a file, or a glob, and the second a comma-separated list of word to ignore for the given file: + + .. code-block:: sh + + codespell --per-file-ignores "*.ext" word1,word2,word3 + +2. A comment anywhere in the file, preferably at the top. Words should be separated by a comma: + + .. 
code-block:: python + + # codespell:file-ignore wrod + + def wrod(wrods) + pass + Using a config file ------------------- @@ -173,6 +193,9 @@ be specified in this file (without the preceding dashes), for example: skip = *.po,*.ts,./src/3rdParty,./src/Test count = quiet-level = 3 + [codespell.per-file-ignores] + *.ext1 = word1,word2,word3 + *.ext2 = word4 Python's `configparser `_ @@ -191,6 +214,9 @@ previous config file: skip = '*.po,*.ts,./src/3rdParty,./src/Test' count = true quiet-level = 3 + [tool.codespell.per-file-ignores] + "*.ext1" = word1,word2,word3 + "*.ext2" = word4 The above INI and TOML files are equivalent to running: diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index dee6a63ee8..f30e754b52 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -61,6 +61,9 @@ r"(\b(?:https?|[ts]?ftp|file|git|smb)://[^\s]+(?=$|\s)|\b[\w.%+-]+@[\w.-]+\b)" ) inline_ignore_regex = re.compile(r"[^\w\s]\s*codespell:ignore\b(\s+(?P<words>[\w,]*))?") +inside_file_ignore_regex = re.compile( + r"[^\w\s]\s*codespell:file-ignore\b(\s+(?P<words>[\w,]*))" +) USAGE = """ \t%prog [OPTIONS] [file1 file2 ... fileN] """ @@ -333,7 +336,13 @@ def _split_lines(self, text: str, width: int) -> List[str]: def _toml_to_parseconfig(toml_dict: Dict[str, Any]) -> Dict[str, Any]: """Convert a dict read from a TOML file to the parseconfig.read_dict() format.""" return { - k: "" if v is True else ",".join(v) if isinstance(v, list) else v + k: "" + if v is True + else ",".join(v) + if isinstance(v, list) + else _toml_to_parseconfig(v) + if isinstance(v, dict) + else v for k, v in toml_dict.items() if v is not False } @@ -476,6 +485,18 @@ def parse_options( 'the dictionary file. If set to "*", all ' "misspelling in URIs and emails will be ignored.", ) + parser.add_argument( + "--per-file-ignores", + action="append", + nargs=2, + help="Require a pair of arguments. 
The first argument " + "is a file to apply the second argument, a " + "comma-separated list of words to be ignored for " + "this file only. The first argument accepts globs " + "as well. The words in the second argument are case " + "sensitive based on how they are written in the " + "dictionary file.", + ) parser.add_argument( "-r", "--regex", @@ -660,7 +681,11 @@ def parse_options( with open(toml_file, "rb") as f: data = tomllib.load(f).get("tool", {}) if "codespell" in data: - data["codespell"] = _toml_to_parseconfig(data["codespell"]) + data_toml = _toml_to_parseconfig(data["codespell"]) + for k in list(data_toml.keys()): + if isinstance(data_toml[k], dict): + data[f"codespell.{k}"] = data_toml.pop(k) + data["codespell"] = data_toml config.read_dict(data) # Collect which config files are going to be used @@ -673,9 +698,9 @@ def parse_options( # Use config files config.read(used_cfg_files) + # Build a "fake" argv list using option name and value. + cfg_args = [] if config.has_section("codespell"): - # Build a "fake" argv list using option name and value. - cfg_args = [] for key in config["codespell"]: # Add option as arg. cfg_args.append(f"--{key}") @@ -684,6 +709,20 @@ def parse_options( if val: cfg_args.append(val) + # Iter dict arguments + for key in ["per-file-ignores"]: + section = f"codespell.{key}" + if config.has_section(section): + for name in config[section]: + # If value is blank, skip. + val = config[section][name] + if val: + # Add option as pair args. + cfg_args.append(f"--{key}") + cfg_args.append(name) + cfg_args.append(val) + + if cfg_args: # Parse config file options. 
options = parser.parse_args(cfg_args) @@ -722,6 +761,50 @@ def parse_ignore_words_option( return (ignore_words, ignore_words_cased) +def parse_per_file_ignores_option( + per_file_ignores_option: List[Tuple[str, str]], +) -> Dict[GlobMatch, Set[str]]: + per_file_ignores_cased: Dict[GlobMatch, Set[str]] = {} + if per_file_ignores_option: + for file, comma_separated_words in per_file_ignores_option: + per_file_ignores_cased[GlobMatch([file])] = { + word.strip() for word in comma_separated_words.split(",") + } + return per_file_ignores_cased + + +def parse_dictionary_option( + parser: argparse.ArgumentParser, + dictionary_option: List[str], + builtin_option: str, +) -> Tuple[int, List[str]]: + use_dictionaries = [] + for dictionary in flatten_clean_comma_separated_arguments(dictionary_option): + if dictionary == "-": + # figure out which builtin dictionaries to use + use = sorted(set(builtin_option.split(","))) + for u in use: + for builtin in _builtin_dictionaries: + if builtin[0] == u: + use_dictionaries.append( + os.path.join(_data_root, f"dictionary{builtin[2]}.txt") + ) + break + else: + return _usage_error( + parser, + f"ERROR: Unknown builtin dictionary: {u}", + ), [] + else: + if not os.path.isfile(dictionary): + return _usage_error( + parser, + f"ERROR: cannot find dictionary file: {dictionary}", + ), [] + use_dictionaries.append(dictionary) + return 0, use_dictionaries + + def build_exclude_hashes(filename: str, exclude_lines: Set[str]) -> None: with open(filename, encoding="utf-8") as f: exclude_lines.update(line.rstrip() for line in f) @@ -736,6 +819,19 @@ def build_ignore_words( ) +def build_ignore_words_for_file( + ignore_words_cased: Set[str], + per_file_ignores: Dict[GlobMatch, Set[str]], + file_name: str, + file_path: str, +) -> Set[str]: + ignore_words_cased_for_file = set(ignore_words_cased) + for m, v in per_file_ignores.items(): + if m.match(file_name) or m.match(file_path): + ignore_words_cased_for_file.update(v) + return 
ignore_words_cased_for_file + + def is_hidden(filename: str, check_hidden: bool) -> bool: bfilename = os.path.basename(filename) @@ -894,35 +990,16 @@ def parse_file( lines = f.readlines() else: if options.check_filenames: - for word in extract_words(filename, word_regex, ignore_word_regex): - if word in ignore_words_cased: - continue - lword = word.lower() - if lword not in misspellings: - continue - fix = misspellings[lword].fix - fixword = fix_case(word, misspellings[lword].data) - - if summary and fix: - summary.update(lword) - - cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" - cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" - crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}" - - reason = misspellings[lword].reason - if reason: - if options.quiet_level & QuietLevels.DISABLED_FIXES: - continue - creason = f" | {colors.FILE}{reason}{colors.DISABLE}" - else: - if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES: - continue - creason = "" - - bad_count += 1 - - print(f"{cfilename}: {cwrongword} ==> {crightword}{creason}") + bad_count += parse_filename( + filename, + colors, + summary, + misspellings, + ignore_words_cased, + word_regex, + ignore_word_regex, + options, + ) # ignore irregular files if not os.path.isfile(filename): @@ -945,6 +1022,14 @@ def parse_file( except OSError: return bad_count + inside_file_to_ignore: Set[str] = set() + for line in lines: + match = inside_file_ignore_regex.search(line) + if match: + inside_file_to_ignore.update( + filter(None, (match.group("words") or "").split(",")) + ) + for i, line in enumerate(lines): if line.rstrip() in exclude_lines: continue @@ -982,7 +1067,11 @@ def parse_file( if word in ignore_words_cased: continue lword = word.lower() - if lword in misspellings and lword not in extra_words_to_ignore: + if ( + lword in misspellings + and lword not in extra_words_to_ignore + and lword not in inside_file_to_ignore + ): # Sometimes we find a 'misspelling' which is actually a valid word # preceded 
by a string escape sequence. Ignore such cases as # they're usually false alarms; see issue #17 among others. @@ -1082,6 +1171,44 @@ def parse_file( return bad_count +def parse_filename( + filename: str, + colors: TermColors, + summary: Optional[Summary], + misspellings: Dict[str, Misspelling], + ignore_words_cased: Set[str], + word_regex: Pattern[str], + ignore_word_regex: Optional[Pattern[str]], + options: argparse.Namespace, +) -> int: + bad_count = 0 + for word in extract_words(filename, word_regex, ignore_word_regex): + if word in ignore_words_cased: + continue + lword = word.lower() + if lword not in misspellings: + continue + fix = misspellings[lword].fix + fixword = fix_case(word, misspellings[lword].data) + if summary and fix: + summary.update(lword) + cfilename = f"{colors.FILE}{filename}{colors.DISABLE}" + cwrongword = f"{colors.WWORD}{word}{colors.DISABLE}" + crightword = f"{colors.FWORD}{fixword}{colors.DISABLE}" + reason = misspellings[lword].reason + if reason: + if options.quiet_level & QuietLevels.DISABLED_FIXES: + continue + creason = f" | {colors.FILE}{reason}{colors.DISABLE}" + else: + if options.quiet_level & QuietLevels.NON_AUTOMATIC_FIXES: + continue + creason = "" + bad_count += 1 + print(f"{cfilename}: {cwrongword} ==> {crightword}{creason}") + return bad_count + + def flatten_clean_comma_separated_arguments( arguments: Iterable[str], ) -> List[str]: @@ -1187,6 +1314,17 @@ def main(*args: str) -> int: ) build_ignore_words(ignore_words_file, ignore_words, ignore_words_cased) + per_file_ignores = parse_per_file_ignores_option(options.per_file_ignores) + try: + for match in per_file_ignores: + match.match("/random/path") # does not need a real path + except re.error: + return _usage_error( + parser, + "ERROR: --per-file-ignores has been fed an invalid glob, " + "try escaping special characters", + ) + uri_regex = options.uri_regex or uri_regex_def try: uri_regex = re.compile(uri_regex) @@ -1200,32 +1338,13 @@ def main(*args: str) -> int: 
itertools.chain(*parse_ignore_words_option(options.uri_ignore_words_list)) ) - dictionaries = flatten_clean_comma_separated_arguments(options.dictionary or ["-"]) - - use_dictionaries = [] - for dictionary in dictionaries: - if dictionary == "-": - # figure out which builtin dictionaries to use - use = sorted(set(options.builtin.split(","))) - for u in use: - for builtin in _builtin_dictionaries: - if builtin[0] == u: - use_dictionaries.append( - os.path.join(_data_root, f"dictionary{builtin[2]}.txt") - ) - break - else: - return _usage_error( - parser, - f"ERROR: Unknown builtin dictionary: {u}", - ) - else: - if not os.path.isfile(dictionary): - return _usage_error( - parser, - f"ERROR: cannot find dictionary file: {dictionary}", - ) - use_dictionaries.append(dictionary) + error, use_dictionaries = parse_dictionary_option( + parser, + options.dictionary or ["-"], + options.builtin, + ) + if error != 0: + return error misspellings: Dict[str, Misspelling] = {} for dictionary in use_dictionaries: build_dict(dictionary, misspellings, ignore_words) @@ -1305,7 +1424,12 @@ def main(*args: str) -> int: colors, summary, misspellings, - ignore_words_cased, + build_ignore_words_for_file( + ignore_words_cased, + per_file_ignores, + file_, + fname, + ), exclude_lines, file_opener, word_regex, @@ -1330,7 +1454,12 @@ def main(*args: str) -> int: colors, summary, misspellings, - ignore_words_cased, + build_ignore_words_for_file( + ignore_words_cased, + per_file_ignores, + os.path.basename(filename), + filename, + ), exclude_lines, file_opener, word_regex, diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index a6c05fc089..929964c7cc 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -396,6 +396,42 @@ def test_ignore_words_with_cases( assert cs.main("-Lmis", "-f", bad_name) == 0 +def test_per_file_ignores( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + """Test --per-file-ignores 
options.""" + text = "abandonned abondon abilty" + bad_file = tmp_path / "bad.txt" + bad_file.write_text(text) + name = "ignore.txt" + fname = tmp_path / name + fname.write_text(text) + assert cs.main(tmp_path) == 6 + + assert cs.main(fname, bad_file, "--per-file-ignores", name, "abondon") == 5 + assert cs.main(fname, bad_file, "--per-file-ignores", name, "abondon,abilty") == 4 + # case sensitive + assert cs.main(fname, bad_file, "--per-file-ignores", name, "Abilty") == 6 + assert cs.main(fname, bad_file, "--per-file-ignores", "name.txt", "abilty") == 6 + # several pair arguments + assert ( + cs.main( + fname, + bad_file, + # pair arguments 1 + "--per-file-ignores", + name, + "abondon", + # pair arguments 2 + "--per-file-ignores", + name, + "abilty", + ) + == 4 + ) + + def test_ignore_word_list( tmp_path: Path, capsys: pytest.CaptureFixture[str], @@ -453,6 +489,23 @@ def test_ignore_word_list( "You could also use line based igore (codespell:ignore igare) to igore ", 2, ), + # file-ignore + ("abandonned abondon abilty \n # codespell:file-ignore abondon", 2), + ("abandonned abondon abilty \n // codespell:file-ignore abondon,abilty", 1), + ( + "abandonned abondon abilty \n /* codespell:file-ignore abandonned,abondon,abilty", # noqa: E501 + 0, + ), + # ignore unused ignore + ("abandonned abondon abilty \n # codespell:file-ignore nomenklatur", 3), + # ignore these as they aren't valid + ("abandonned abondon abilty \n # codespell:file-ignore", 3), + ("abandonned abondon abilty \n # codespell:file-igore word", 3), + # several in the same file + ( + "// codespell:file-ignore abondon \n abandonned abondon abilty \n // codespell:file-ignore abilty", # noqa: E501 + 1, + ), ], ) def test_inline_ignores( @@ -1286,15 +1339,17 @@ def test_config_toml( (d / "bad.txt").write_text("abandonned donn\n") (d / "good.txt").write_text("good") (d / "abandonned.txt").write_text("") + (d / "per-file.txt").write_text("donn") # Should fail when checking all files. 
result = cs.main(d, "--check-filenames", count=True, std=True) assert isinstance(result, tuple) code, stdout, _ = result # Code in this case is not exit code, but count of misspellings. - assert code == 3 + assert code == 4 assert "bad.txt" in stdout assert "abandonned.txt" in stdout + assert "per-file.txt" in stdout if kind.startswith("cfg"): conffile = tmp_path / "setup.cfg" @@ -1304,6 +1359,8 @@ def test_config_toml( [codespell] skip = bad.txt, whatever.txt count = +[codespell.per-file-ignores] +per-file.txt = donn """ else: assert kind == "cfg_multiline" @@ -1314,6 +1371,8 @@ def test_config_toml( , count = +[codespell.per-file-ignores] +per-file.txt = donn """ conffile.write_text(text) else: @@ -1327,6 +1386,8 @@ def test_config_toml( skip = 'bad.txt,whatever.txt' check-filenames = false count = true +[tool.codespell.per-file-ignores] +"per-file.txt" = 'donn' """ else: assert kind == "toml_list" @@ -1335,6 +1396,8 @@ def test_config_toml( skip = ['bad.txt', 'whatever.txt'] check-filenames = false count = true +[tool.codespell.per-file-ignores] +"per-file.txt" = ['donn'] """ tomlfile.write_text(text) @@ -1345,6 +1408,7 @@ def test_config_toml( assert code == 0 assert "bad.txt" not in stdout assert "abandonned.txt" not in stdout + assert "per-file.txt" not in stdout # And both should automatically work if they're in cwd cwd = Path.cwd() @@ -1358,6 +1422,7 @@ def test_config_toml( assert code == 0 assert "bad.txt" not in stdout assert "abandonned.txt" not in stdout + assert "per-file.txt" not in stdout @contextlib.contextmanager From 406b277dd7b6575144042ee986e1e387d5812171 Mon Sep 17 00:00:00 2001 From: un-pogaz <46523284+un-pogaz@users.noreply.github.com> Date: Tue, 25 Mar 2025 07:47:20 +0100 Subject: [PATCH 2/3] test_bad_glob_per_file_ignores() --- codespell_lib/tests/test_basic.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 929964c7cc..8e95ff7c58 100644 
--- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -207,6 +207,29 @@ def test_bad_glob( assert cs.main("--skip", "[[]b-a[]].txt", g) == 0 +def test_bad_glob_per_file_ignores( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + # disregard invalid globs, properly handle escaped globs + g = tmp_path / "glob" + g.mkdir() + fname = g / "[b-a].txt" + fname.write_text("abandonned\n") + assert cs.main(g) == 1 + # bad glob is invalid + result = cs.main(g, "--per-file-ignores", "[b-a].txt", "abandonned", std=True) + assert isinstance(result, tuple) + code, _, stderr = result + if sys.hexversion < 0x030A05F0: # Python < 3.10.5 raises re.error + assert code == EX_USAGE, "invalid glob" + assert "invalid glob" in stderr + else: # Python >= 3.10.5 does not match + assert code == 1 + # properly escaped glob is valid, and matches glob-like file name + assert cs.main(g, "--per-file-ignores", "[[]b-a[]].txt", "abandonned") == 0 + + @pytest.mark.skipif(sys.platform != "linux", reason="Only supported on Linux") def test_permission_error( tmp_path: Path, From a064dcb286853da62f1c5bb5601d1973f898c92e Mon Sep 17 00:00:00 2001 From: un_pogaz <46523284+un-pogaz@users.noreply.github.com> Date: Sun, 3 Aug 2025 12:49:25 +0200 Subject: [PATCH 3/3] code review: apply TheGiraffe3 suggestions Co-authored-by: Loymdayddaud <145969603+TheGiraffe3@users.noreply.github.com> --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index f5cbfe7903..f8e2aa0b98 100644 --- a/README.rst +++ b/README.rst @@ -159,9 +159,9 @@ Words should be separated by a comma. Per-file ignores ---------------- -To give a finer control, is possible to specified a additional set of words to ignore into a specific file only. +To give a finer control, it is possible to specify an additional set of words to ignore in a specific file. -1. ``--per-file-ignores``: A pair of arguments into the command line. 
The first provide a file, or a glob, and the second a comma-separated list of word to ignore for the given file: +1. ``--per-file-ignores``: A pair of arguments in the command line. The first provides a file, or a glob, and the second a comma-separated list of words to ignore for the given file: .. code-block:: sh