From ff98e6de265b334c55b694817bf0c23e1fe62609 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Mon, 17 Mar 2025 19:55:36 +0100 Subject: [PATCH 01/26] copy `ignore` crate for easier modifications --- Cargo.lock | 20 +- crates/ignore/COPYING | 3 + crates/ignore/Cargo.toml | 44 + crates/ignore/LICENSE-MIT | 21 + crates/ignore/README.md | 59 + crates/ignore/UNLICENSE | 24 + crates/ignore/examples/walk.rs | 64 + crates/ignore/src/default_types.rs | 351 +++ crates/ignore/src/dir.rs | 1187 +++++++++ crates/ignore/src/gitignore.rs | 812 ++++++ crates/ignore/src/lib.rs | 564 ++++ crates/ignore/src/overrides.rs | 265 ++ crates/ignore/src/pathutil.rs | 141 + crates/ignore/src/types.rs | 601 +++++ crates/ignore/src/walk.rs | 2297 +++++++++++++++++ ...atched_path_or_any_parents_tests.gitignore | 216 ++ ...gnore_matched_path_or_any_parents_tests.rs | 291 +++ crates/oxide/Cargo.toml | 2 +- 18 files changed, 6959 insertions(+), 3 deletions(-) create mode 100644 crates/ignore/COPYING create mode 100644 crates/ignore/Cargo.toml create mode 100644 crates/ignore/LICENSE-MIT create mode 100644 crates/ignore/README.md create mode 100644 crates/ignore/UNLICENSE create mode 100644 crates/ignore/examples/walk.rs create mode 100644 crates/ignore/src/default_types.rs create mode 100644 crates/ignore/src/dir.rs create mode 100644 crates/ignore/src/gitignore.rs create mode 100644 crates/ignore/src/lib.rs create mode 100644 crates/ignore/src/overrides.rs create mode 100644 crates/ignore/src/pathutil.rs create mode 100644 crates/ignore/src/types.rs create mode 100644 crates/ignore/src/walk.rs create mode 100644 crates/ignore/tests/gitignore_matched_path_or_any_parents_tests.gitignore create mode 100644 crates/ignore/tests/gitignore_matched_path_or_any_parents_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 31b5529a6c21..4a6baa0a56ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -217,10 +217,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" dependencies = [ "bitflags", - "ignore", + "ignore 0.4.23 (registry+https://github.com/rust-lang/crates.io-index)", "walkdir", ] +[[package]] +name = "ignore" +version = "0.4.23" +dependencies = [ + "bstr", + "crossbeam-channel", + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.8", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "ignore" version = "0.4.23" @@ -567,7 +583,7 @@ dependencies = [ "fancy-regex", "fast-glob", "globwalk", - "ignore", + "ignore 0.4.23", "log", "rayon", "regex", diff --git a/crates/ignore/COPYING b/crates/ignore/COPYING new file mode 100644 index 000000000000..bb9c20a094e4 --- /dev/null +++ b/crates/ignore/COPYING @@ -0,0 +1,3 @@ +This project is dual-licensed under the Unlicense and MIT licenses. + +You may use this code under the terms of either license. diff --git a/crates/ignore/Cargo.toml b/crates/ignore/Cargo.toml new file mode 100644 index 000000000000..3a22a48c1bb9 --- /dev/null +++ b/crates/ignore/Cargo.toml @@ -0,0 +1,44 @@ +[package] +name = "ignore" +version = "0.4.23" #:version +authors = ["Andrew Gallant "] +description = """ +A fast library for efficiently matching ignore files such as `.gitignore` +against file paths. 
+""" +documentation = "https://docs.rs/ignore" +homepage = "https://github.com/BurntSushi/ripgrep/tree/master/crates/ignore" +repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/ignore" +readme = "README.md" +keywords = ["glob", "ignore", "gitignore", "pattern", "file"] +license = "Unlicense OR MIT" +edition = "2021" + +[lib] +name = "ignore" +bench = false + +[dependencies] +crossbeam-deque = "0.8.3" +globset = "0.4.15" +log = "0.4.20" +memchr = "2.6.3" +same-file = "1.0.6" +walkdir = "2.4.0" + +[dependencies.regex-automata] +version = "0.4.0" +default-features = false +features = ["std", "perf", "syntax", "meta", "nfa", "hybrid", "dfa-onepass"] + +[target.'cfg(windows)'.dependencies.winapi-util] +version = "0.1.2" + +[dev-dependencies] +bstr = { version = "1.6.2", default-features = false, features = ["std"] } +crossbeam-channel = "0.5.8" + +[features] +# DEPRECATED. It is a no-op. SIMD is done automatically through runtime +# dispatch. +simd-accel = [] diff --git a/crates/ignore/LICENSE-MIT b/crates/ignore/LICENSE-MIT new file mode 100644 index 000000000000..3b0a5dc09c1e --- /dev/null +++ b/crates/ignore/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/crates/ignore/README.md b/crates/ignore/README.md new file mode 100644 index 000000000000..72258e6b5824 --- /dev/null +++ b/crates/ignore/README.md @@ -0,0 +1,59 @@ +ignore +====== +The ignore crate provides a fast recursive directory iterator that respects +various filters such as globs, file types and `.gitignore` files. This crate +also provides lower level direct access to gitignore and file type matchers. + +[![Build status](https://github.com/BurntSushi/ripgrep/workflows/ci/badge.svg)](https://github.com/BurntSushi/ripgrep/actions) +[![](https://img.shields.io/crates/v/ignore.svg)](https://crates.io/crates/ignore) + +Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). + +### Documentation + +[https://docs.rs/ignore](https://docs.rs/ignore) + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +ignore = "0.4" +``` + +### Example + +This example shows the most basic usage of this crate. 
This code will +recursively traverse the current directory while automatically filtering out +files and directories according to ignore globs found in files like +`.ignore` and `.gitignore`: + + +```rust,no_run +use ignore::Walk; + +for result in Walk::new("./") { + // Each item yielded by the iterator is either a directory entry or an + // error, so either print the path or the error. + match result { + Ok(entry) => println!("{}", entry.path().display()), + Err(err) => println!("ERROR: {}", err), + } +} +``` + +### Example: advanced + +By default, the recursive directory iterator will ignore hidden files and +directories. This can be disabled by building the iterator with `WalkBuilder`: + +```rust,no_run +use ignore::WalkBuilder; + +for result in WalkBuilder::new("./").hidden(false).build() { + println!("{:?}", result); +} +``` + +See the documentation for `WalkBuilder` for many other options. diff --git a/crates/ignore/UNLICENSE b/crates/ignore/UNLICENSE new file mode 100644 index 000000000000..68a49daad8ff --- /dev/null +++ b/crates/ignore/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/crates/ignore/examples/walk.rs b/crates/ignore/examples/walk.rs
new file mode 100644
index 000000000000..5bbd10f2bc2d
--- /dev/null
+++ b/crates/ignore/examples/walk.rs
@@ -0,0 +1,64 @@
+use std::{env, io::Write, path::Path};
+
+use {bstr::ByteVec, ignore::WalkBuilder, walkdir::WalkDir};
+
+fn main() {
+    let mut path = env::args().nth(1).unwrap();
+    let mut parallel = false;
+    let mut simple = false;
+    let (tx, rx) = crossbeam_channel::bounded::<DirEntry>(100);
+    if path == "parallel" {
+        path = env::args().nth(2).unwrap();
+        parallel = true;
+    } else if path == "walkdir" {
+        path = env::args().nth(2).unwrap();
+        simple = true;
+    }
+
+    let stdout_thread = std::thread::spawn(move || {
+        let mut stdout = std::io::BufWriter::new(std::io::stdout());
+        for dent in rx {
+            stdout.write(&*Vec::from_path_lossy(dent.path())).unwrap();
+            stdout.write(b"\n").unwrap();
+        }
+    });
+
+    if parallel {
+        let walker = WalkBuilder::new(path).threads(6).build_parallel();
+        walker.run(|| {
+            let tx = tx.clone();
+            Box::new(move |result| {
+                use ignore::WalkState::*;
+
+                tx.send(DirEntry::Y(result.unwrap())).unwrap();
+                Continue
+            })
+        });
+    } else if simple {
+        let walker = WalkDir::new(path);
+        for result in walker {
+            tx.send(DirEntry::X(result.unwrap())).unwrap();
+        }
+    } else {
+        let walker = WalkBuilder::new(path).build();
+        for result in walker {
+            tx.send(DirEntry::Y(result.unwrap())).unwrap();
+        }
+    }
+    drop(tx);
+    stdout_thread.join().unwrap();
+}
+
+enum DirEntry {
+    X(walkdir::DirEntry),
+    Y(ignore::DirEntry),
+}
+
+impl DirEntry {
+    fn path(&self) -> &Path {
+        match *self {
+            DirEntry::X(ref x) => x.path(),
+            DirEntry::Y(ref y) => y.path(),
+        }
+    }
+}
diff --git a/crates/ignore/src/default_types.rs b/crates/ignore/src/default_types.rs
new file mode 100644
index 000000000000..2cf8ad80794b
--- /dev/null
+++ b/crates/ignore/src/default_types.rs
@@ -0,0 +1,351 @@
+/// This list represents the default file types that ripgrep ships with. In
+/// general, any file format is fair game, although it should generally be
+/// limited to reasonably popular open formats. For other cases, you can add
+/// types to each invocation of ripgrep with the '--type-add' flag.
+///
+/// If you would like to add or improve this list, please file a PR:
+/// <https://github.com/BurntSushi/ripgrep>.
+///
+/// Please try to keep this list sorted lexicographically and wrapped to 79
+/// columns (inclusive).
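+///
+/// Each entry pairs a list of type names (the first is the canonical name,
+/// any others are aliases) with the globs that define the type. For example,
+/// `(&["bat", "batch"], &["*.bat"])` defines a `bat` type, with `batch` as
+/// an alias, matching `*.bat` files.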
+#[rustfmt::skip] +pub(crate) const DEFAULT_TYPES: &[(&[&str], &[&str])] = &[ + (&["ada"], &["*.adb", "*.ads"]), + (&["agda"], &["*.agda", "*.lagda"]), + (&["aidl"], &["*.aidl"]), + (&["alire"], &["alire.toml"]), + (&["amake"], &["*.mk", "*.bp"]), + (&["asciidoc"], &["*.adoc", "*.asc", "*.asciidoc"]), + (&["asm"], &["*.asm", "*.s", "*.S"]), + (&["asp"], &[ + "*.aspx", "*.aspx.cs", "*.aspx.vb", "*.ascx", "*.ascx.cs", + "*.ascx.vb", "*.asp" + ]), + (&["ats"], &["*.ats", "*.dats", "*.sats", "*.hats"]), + (&["avro"], &["*.avdl", "*.avpr", "*.avsc"]), + (&["awk"], &["*.awk"]), + (&["bat", "batch"], &["*.bat"]), + (&["bazel"], &[ + "*.bazel", "*.bzl", "*.BUILD", "*.bazelrc", "BUILD", "MODULE.bazel", + "WORKSPACE", "WORKSPACE.bazel", + ]), + (&["bitbake"], &["*.bb", "*.bbappend", "*.bbclass", "*.conf", "*.inc"]), + (&["brotli"], &["*.br"]), + (&["buildstream"], &["*.bst"]), + (&["bzip2"], &["*.bz2", "*.tbz2"]), + (&["c"], &["*.[chH]", "*.[chH].in", "*.cats"]), + (&["cabal"], &["*.cabal"]), + (&["candid"], &["*.did"]), + (&["carp"], &["*.carp"]), + (&["cbor"], &["*.cbor"]), + (&["ceylon"], &["*.ceylon"]), + (&["clojure"], &["*.clj", "*.cljc", "*.cljs", "*.cljx"]), + (&["cmake"], &["*.cmake", "CMakeLists.txt"]), + (&["cmd"], &["*.bat", "*.cmd"]), + (&["cml"], &["*.cml"]), + (&["coffeescript"], &["*.coffee"]), + (&["config"], &["*.cfg", "*.conf", "*.config", "*.ini"]), + (&["coq"], &["*.v"]), + (&["cpp"], &[ + "*.[ChH]", "*.cc", "*.[ch]pp", "*.[ch]xx", "*.hh", "*.inl", + "*.[ChH].in", "*.cc.in", "*.[ch]pp.in", "*.[ch]xx.in", "*.hh.in", + ]), + (&["creole"], &["*.creole"]), + (&["crystal"], &["Projectfile", "*.cr", "*.ecr", "shard.yml"]), + (&["cs"], &["*.cs"]), + (&["csharp"], &["*.cs"]), + (&["cshtml"], &["*.cshtml"]), + (&["csproj"], &["*.csproj"]), + (&["css"], &["*.css", "*.scss"]), + (&["csv"], &["*.csv"]), + (&["cuda"], &["*.cu", "*.cuh"]), + (&["cython"], &["*.pyx", "*.pxi", "*.pxd"]), + (&["d"], &["*.d"]), + (&["dart"], &["*.dart"]), + (&["devicetree"], &["*.dts", "*.dtsi"]), + (&["dhall"], &["*.dhall"]), + (&["diff"], &["*.patch", "*.diff"]), + (&["dita"], &["*.dita", "*.ditamap", "*.ditaval"]), + (&["docker"], &["*Dockerfile*"]), + (&["dockercompose"], &["docker-compose.yml", "docker-compose.*.yml"]), + (&["dts"], &["*.dts", "*.dtsi"]), + (&["dvc"], &["Dvcfile", "*.dvc"]), + (&["ebuild"], &["*.ebuild", "*.eclass"]), + (&["edn"], &["*.edn"]), + (&["elisp"], &["*.el"]), + (&["elixir"], &["*.ex", "*.eex", "*.exs", "*.heex", "*.leex", "*.livemd"]), + (&["elm"], &["*.elm"]), + (&["erb"], &["*.erb"]), + (&["erlang"], &["*.erl", "*.hrl"]), + (&["fennel"], &["*.fnl"]), + (&["fidl"], &["*.fidl"]), + (&["fish"], &["*.fish"]), + (&["flatbuffers"], &["*.fbs"]), + (&["fortran"], &[ + "*.f", "*.F", "*.f77", "*.F77", "*.pfo", + "*.f90", "*.F90", "*.f95", "*.F95", + ]), + (&["fsharp"], &["*.fs", "*.fsx", "*.fsi"]), + (&["fut"], &["*.fut"]), + (&["gap"], &["*.g", "*.gap", "*.gi", "*.gd", "*.tst"]), + (&["gn"], &["*.gn", "*.gni"]), + (&["go"], &["*.go"]), + (&["gprbuild"], &["*.gpr"]), + (&["gradle"], &[ + "*.gradle", "*.gradle.kts", "gradle.properties", "gradle-wrapper.*", + "gradlew", "gradlew.bat", + ]), + (&["graphql"], &["*.graphql", "*.graphqls"]), + (&["groovy"], &["*.groovy", "*.gradle"]), + (&["gzip"], &["*.gz", "*.tgz"]), + (&["h"], &["*.h", "*.hh", "*.hpp"]), + (&["haml"], &["*.haml"]), + (&["hare"], &["*.ha"]), + (&["haskell"], &["*.hs", "*.lhs", "*.cpphs", "*.c2hs", "*.hsc"]), + (&["hbs"], &["*.hbs"]), + (&["hs"], &["*.hs", "*.lhs"]), + (&["html"], &["*.htm", "*.html", "*.ejs"]), + (&["hy"], 
&["*.hy"]), + (&["idris"], &["*.idr", "*.lidr"]), + (&["janet"], &["*.janet"]), + (&["java"], &["*.java", "*.jsp", "*.jspx", "*.properties"]), + (&["jinja"], &["*.j2", "*.jinja", "*.jinja2"]), + (&["jl"], &["*.jl"]), + (&["js"], &["*.js", "*.jsx", "*.vue", "*.cjs", "*.mjs"]), + (&["json"], &["*.json", "composer.lock", "*.sarif"]), + (&["jsonl"], &["*.jsonl"]), + (&["julia"], &["*.jl"]), + (&["jupyter"], &["*.ipynb", "*.jpynb"]), + (&["k"], &["*.k"]), + (&["kotlin"], &["*.kt", "*.kts"]), + (&["lean"], &["*.lean"]), + (&["less"], &["*.less"]), + (&["license"], &[ + // General + "COPYING", "COPYING[.-]*", + "COPYRIGHT", "COPYRIGHT[.-]*", + "EULA", "EULA[.-]*", + "licen[cs]e", "licen[cs]e.*", + "LICEN[CS]E", "LICEN[CS]E[.-]*", "*[.-]LICEN[CS]E*", + "NOTICE", "NOTICE[.-]*", + "PATENTS", "PATENTS[.-]*", + "UNLICEN[CS]E", "UNLICEN[CS]E[.-]*", + // GPL (gpl.txt, etc.) + "agpl[.-]*", + "gpl[.-]*", + "lgpl[.-]*", + // Other license-specific (APACHE-2.0.txt, etc.) + "AGPL-*[0-9]*", + "APACHE-*[0-9]*", + "BSD-*[0-9]*", + "CC-BY-*", + "GFDL-*[0-9]*", + "GNU-*[0-9]*", + "GPL-*[0-9]*", + "LGPL-*[0-9]*", + "MIT-*[0-9]*", + "MPL-*[0-9]*", + "OFL-*[0-9]*", + ]), + (&["lilypond"], &["*.ly", "*.ily"]), + (&["lisp"], &["*.el", "*.jl", "*.lisp", "*.lsp", "*.sc", "*.scm"]), + (&["lock"], &["*.lock", "package-lock.json"]), + (&["log"], &["*.log"]), + (&["lua"], &["*.lua"]), + (&["lz4"], &["*.lz4"]), + (&["lzma"], &["*.lzma"]), + (&["m4"], &["*.ac", "*.m4"]), + (&["make"], &[ + "[Gg][Nn][Uu]makefile", "[Mm]akefile", + "[Gg][Nn][Uu]makefile.am", "[Mm]akefile.am", + "[Gg][Nn][Uu]makefile.in", "[Mm]akefile.in", + "*.mk", "*.mak" + ]), + (&["mako"], &["*.mako", "*.mao"]), + (&["man"], &["*.[0-9lnpx]", "*.[0-9][cEFMmpSx]"]), + (&["markdown", "md"], &[ + "*.markdown", + "*.md", + "*.mdown", + "*.mdwn", + "*.mkd", + "*.mkdn", + "*.mdx", + ]), + (&["matlab"], &["*.m"]), + (&["meson"], &["meson.build", "meson_options.txt", "meson.options"]), + (&["minified"], &["*.min.html", "*.min.css", "*.min.js"]), + (&["mint"], &["*.mint"]), + (&["mk"], &["mkfile"]), + (&["ml"], &["*.ml"]), + (&["motoko"], &["*.mo"]), + (&["msbuild"], &[ + "*.csproj", "*.fsproj", "*.vcxproj", "*.proj", "*.props", "*.targets", + "*.sln", + ]), + (&["nim"], &["*.nim", "*.nimf", "*.nimble", "*.nims"]), + (&["nix"], &["*.nix"]), + (&["objc"], &["*.h", "*.m"]), + (&["objcpp"], &["*.h", "*.mm"]), + (&["ocaml"], &["*.ml", "*.mli", "*.mll", "*.mly"]), + (&["org"], &["*.org", "*.org_archive"]), + (&["pants"], &["BUILD"]), + (&["pascal"], &["*.pas", "*.dpr", "*.lpr", "*.pp", "*.inc"]), + (&["pdf"], &["*.pdf"]), + (&["perl"], &["*.perl", "*.pl", "*.PL", "*.plh", "*.plx", "*.pm", "*.t"]), + (&["php"], &[ + // note that PHP 6 doesn't exist + // See: https://wiki.php.net/rfc/php6 + "*.php", "*.php3", "*.php4", "*.php5", "*.php7", "*.php8", + "*.pht", "*.phtml" + ]), + (&["po"], &["*.po"]), + (&["pod"], &["*.pod"]), + (&["postscript"], &["*.eps", "*.ps"]), + (&["prolog"], &["*.pl", "*.pro", "*.prolog", "*.P"]), + (&["protobuf"], &["*.proto"]), + (&["ps"], &["*.cdxml", "*.ps1", "*.ps1xml", "*.psd1", "*.psm1"]), + (&["puppet"], &["*.epp", "*.erb", "*.pp", "*.rb"]), + (&["purs"], &["*.purs"]), + (&["py", "python"], &["*.py", "*.pyi"]), + (&["qmake"], &["*.pro", "*.pri", "*.prf"]), + (&["qml"], &["*.qml"]), + (&["r"], &["*.R", "*.r", "*.Rmd", "*.Rnw"]), + (&["racket"], &["*.rkt"]), + (&["raku"], &[ + "*.raku", "*.rakumod", "*.rakudoc", "*.rakutest", + "*.p6", "*.pl6", "*.pm6" + ]), + (&["rdoc"], &["*.rdoc"]), + (&["readme"], &["README*", "*README"]), + (&["reasonml"], 
&["*.re", "*.rei"]), + (&["red"], &["*.r", "*.red", "*.reds"]), + (&["rescript"], &["*.res", "*.resi"]), + (&["robot"], &["*.robot"]), + (&["rst"], &["*.rst"]), + (&["ruby"], &[ + // Idiomatic files + "config.ru", "Gemfile", ".irbrc", "Rakefile", + // Extensions + "*.gemspec", "*.rb", "*.rbw" + ]), + (&["rust"], &["*.rs"]), + (&["sass"], &["*.sass", "*.scss"]), + (&["scala"], &["*.scala", "*.sbt"]), + (&["sh"], &[ + // Portable/misc. init files + ".login", ".logout", ".profile", "profile", + // bash-specific init files + ".bash_login", "bash_login", + ".bash_logout", "bash_logout", + ".bash_profile", "bash_profile", + ".bashrc", "bashrc", "*.bashrc", + // csh-specific init files + ".cshrc", "*.cshrc", + // ksh-specific init files + ".kshrc", "*.kshrc", + // tcsh-specific init files + ".tcshrc", + // zsh-specific init files + ".zshenv", "zshenv", + ".zlogin", "zlogin", + ".zlogout", "zlogout", + ".zprofile", "zprofile", + ".zshrc", "zshrc", + // Extensions + "*.bash", "*.csh", "*.ksh", "*.sh", "*.tcsh", "*.zsh", + ]), + (&["slim"], &["*.skim", "*.slim", "*.slime"]), + (&["smarty"], &["*.tpl"]), + (&["sml"], &["*.sml", "*.sig"]), + (&["solidity"], &["*.sol"]), + (&["soy"], &["*.soy"]), + (&["spark"], &["*.spark"]), + (&["spec"], &["*.spec"]), + (&["sql"], &["*.sql", "*.psql"]), + (&["stylus"], &["*.styl"]), + (&["sv"], &["*.v", "*.vg", "*.sv", "*.svh", "*.h"]), + (&["svelte"], &["*.svelte"]), + (&["svg"], &["*.svg"]), + (&["swift"], &["*.swift"]), + (&["swig"], &["*.def", "*.i"]), + (&["systemd"], &[ + "*.automount", "*.conf", "*.device", "*.link", "*.mount", "*.path", + "*.scope", "*.service", "*.slice", "*.socket", "*.swap", "*.target", + "*.timer", + ]), + (&["taskpaper"], &["*.taskpaper"]), + (&["tcl"], &["*.tcl"]), + (&["tex"], &["*.tex", "*.ltx", "*.cls", "*.sty", "*.bib", "*.dtx", "*.ins"]), + (&["texinfo"], &["*.texi"]), + (&["textile"], &["*.textile"]), + (&["tf"], &[ + "*.tf", "*.auto.tfvars", "terraform.tfvars", "*.tf.json", + "*.auto.tfvars.json", "terraform.tfvars.json", "*.terraformrc", + "terraform.rc", "*.tfrc", "*.terraform.lock.hcl", + ]), + (&["thrift"], &["*.thrift"]), + (&["toml"], &["*.toml", "Cargo.lock"]), + (&["ts", "typescript"], &["*.ts", "*.tsx", "*.cts", "*.mts"]), + (&["twig"], &["*.twig"]), + (&["txt"], &["*.txt"]), + (&["typoscript"], &["*.typoscript", "*.ts"]), + (&["usd"], &["*.usd", "*.usda", "*.usdc"]), + (&["v"], &["*.v", "*.vsh"]), + (&["vala"], &["*.vala"]), + (&["vb"], &["*.vb"]), + (&["vcl"], &["*.vcl"]), + (&["verilog"], &["*.v", "*.vh", "*.sv", "*.svh"]), + (&["vhdl"], &["*.vhd", "*.vhdl"]), + (&["vim"], &[ + "*.vim", ".vimrc", ".gvimrc", "vimrc", "gvimrc", "_vimrc", "_gvimrc", + ]), + (&["vimscript"], &[ + "*.vim", ".vimrc", ".gvimrc", "vimrc", "gvimrc", "_vimrc", "_gvimrc", + ]), + (&["vue"], &["*.vue"]), + (&["webidl"], &["*.idl", "*.webidl", "*.widl"]), + (&["wgsl"], &["*.wgsl"]), + (&["wiki"], &["*.mediawiki", "*.wiki"]), + (&["xml"], &[ + "*.xml", "*.xml.dist", "*.dtd", "*.xsl", "*.xslt", "*.xsd", "*.xjb", + "*.rng", "*.sch", "*.xhtml", + ]), + (&["xz"], &["*.xz", "*.txz"]), + (&["yacc"], &["*.y"]), + (&["yaml"], &["*.yaml", "*.yml"]), + (&["yang"], &["*.yang"]), + (&["z"], &["*.Z"]), + (&["zig"], &["*.zig"]), + (&["zsh"], &[ + ".zshenv", "zshenv", + ".zlogin", "zlogin", + ".zlogout", "zlogout", + ".zprofile", "zprofile", + ".zshrc", "zshrc", + "*.zsh", + ]), + (&["zstd"], &["*.zst", "*.zstd"]), +]; + +#[cfg(test)] +mod tests { + use super::DEFAULT_TYPES; + + #[test] + fn default_types_are_sorted() { + let mut names = 
DEFAULT_TYPES.iter().map(|(aliases, _)| aliases[0]); + let Some(mut previous_name) = names.next() else { + return; + }; + for name in names { + assert!( + name > previous_name, + r#""{}" should be sorted before "{}" in `DEFAULT_TYPES`"#, + name, + previous_name + ); + previous_name = name; + } + } +} diff --git a/crates/ignore/src/dir.rs b/crates/ignore/src/dir.rs new file mode 100644 index 000000000000..48bc7ac1332d --- /dev/null +++ b/crates/ignore/src/dir.rs @@ -0,0 +1,1187 @@ +// This module provides a data structure, `Ignore`, that connects "directory +// traversal" with "ignore matchers." Specifically, it knows about gitignore +// semantics and precedence, and is organized based on directory hierarchy. +// Namely, every matcher logically corresponds to ignore rules from a single +// directory, and points to the matcher for its corresponding parent directory. +// In this sense, `Ignore` is a *persistent* data structure. +// +// This design was specifically chosen to make it possible to use this data +// structure in a parallel directory iterator. +// +// My initial intention was to expose this module as part of this crate's +// public API, but I think the data structure's public API is too complicated +// with non-obvious failure modes. Alas, such things haven't been documented +// well. + +use std::{ + collections::HashMap, + ffi::{OsStr, OsString}, + fs::{File, FileType}, + io::{self, BufRead}, + path::{Path, PathBuf}, + sync::{Arc, RwLock, Weak}, +}; + +use crate::{ + gitignore::{self, Gitignore, GitignoreBuilder}, + overrides::{self, Override}, + pathutil::{is_hidden, strip_prefix}, + types::{self, Types}, + walk::DirEntry, + {Error, Match, PartialErrorBuilder}, +}; + +/// IgnoreMatch represents information about where a match came from when using +/// the `Ignore` matcher. +#[derive(Clone, Debug)] +#[allow(dead_code)] +pub(crate) struct IgnoreMatch<'a>(IgnoreMatchInner<'a>); + +/// IgnoreMatchInner describes precisely where the match information came from. +/// This is private to allow expansion to more matchers in the future. +#[derive(Clone, Debug)] +#[allow(dead_code)] +enum IgnoreMatchInner<'a> { + Override(overrides::Glob<'a>), + Gitignore(&'a gitignore::Glob), + Types(types::Glob<'a>), + Hidden, +} + +impl<'a> IgnoreMatch<'a> { + fn overrides(x: overrides::Glob<'a>) -> IgnoreMatch<'a> { + IgnoreMatch(IgnoreMatchInner::Override(x)) + } + + fn gitignore(x: &'a gitignore::Glob) -> IgnoreMatch<'a> { + IgnoreMatch(IgnoreMatchInner::Gitignore(x)) + } + + fn types(x: types::Glob<'a>) -> IgnoreMatch<'a> { + IgnoreMatch(IgnoreMatchInner::Types(x)) + } + + fn hidden() -> IgnoreMatch<'static> { + IgnoreMatch(IgnoreMatchInner::Hidden) + } +} + +/// Options for the ignore matcher, shared between the matcher itself and the +/// builder. +#[derive(Clone, Copy, Debug)] +struct IgnoreOptions { + /// Whether to ignore hidden file paths or not. + hidden: bool, + /// Whether to read .ignore files. + ignore: bool, + /// Whether to respect any ignore files in parent directories. + parents: bool, + /// Whether to read git's global gitignore file. + git_global: bool, + /// Whether to read .gitignore files. + git_ignore: bool, + /// Whether to read .git/info/exclude files. + git_exclude: bool, + /// Whether to ignore files case insensitively + ignore_case_insensitive: bool, + /// Whether a git repository must be present in order to apply any + /// git-related ignore rules. + require_git: bool, +} + +/// Ignore is a matcher useful for recursively walking one or more directories. 
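+///
+/// A minimal sketch of the crate-internal call pattern (the same shape the
+/// tests in this module use): build a root matcher, add a child matcher for
+/// each directory visited, then query individual entries:
+///
+/// ```text
+/// let (ig, _err) = IgnoreBuilder::new().build().add_child("some/dir");
+/// let ignored = ig.matched("some/dir/file", false).is_ignore();
+/// ```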
+#[derive(Clone, Debug)]
+pub(crate) struct Ignore(Arc<IgnoreInner>);
+
+#[derive(Clone, Debug)]
+struct IgnoreInner {
+    /// A map of all existing directories that have already been
+    /// compiled into matchers.
+    ///
+    /// Note that this is never used during matching, only when adding new
+    /// parent directory matchers. This avoids needing to rebuild glob sets for
+    /// parent directories if many paths are being searched.
+    compiled: Arc<RwLock<HashMap<OsString, Weak<IgnoreInner>>>>,
+    /// The path to the directory that this matcher was built from.
+    dir: PathBuf,
+    /// An override matcher (default is empty).
+    overrides: Arc<Override>,
+    /// A file type matcher.
+    types: Arc<Types>,
+    /// The parent directory to match next.
+    ///
+    /// If this is the root directory or there are otherwise no more
+    /// directories to match, then `parent` is `None`.
+    parent: Option<Ignore>,
+    /// Whether this is an absolute parent matcher, as added by add_parent.
+    is_absolute_parent: bool,
+    /// The absolute base path of this matcher. Populated only if parent
+    /// directories are added.
+    absolute_base: Option<Arc<PathBuf>>,
+    /// Explicit global ignore matchers specified by the caller.
+    explicit_ignores: Arc<Vec<Gitignore>>,
+    /// Ignore files used in addition to `.ignore`
+    custom_ignore_filenames: Arc<Vec<OsString>>,
+    /// The matcher for custom ignore files
+    custom_ignore_matcher: Gitignore,
+    /// The matcher for .ignore files.
+    ignore_matcher: Gitignore,
+    /// A global gitignore matcher, usually from $XDG_CONFIG_HOME/git/ignore.
+    git_global_matcher: Arc<Gitignore>,
+    /// The matcher for .gitignore files.
+    git_ignore_matcher: Gitignore,
+    /// Special matcher for `.git/info/exclude` files.
+    git_exclude_matcher: Gitignore,
+    /// Whether this directory contains a .git sub-directory.
+    has_git: bool,
+    /// Ignore config.
+    opts: IgnoreOptions,
+}
+
+impl Ignore {
+    /// Return the directory path of this matcher.
+    pub(crate) fn path(&self) -> &Path {
+        &self.0.dir
+    }
+
+    /// Return true if this matcher has no parent.
+    pub(crate) fn is_root(&self) -> bool {
+        self.0.parent.is_none()
+    }
+
+    /// Returns true if this matcher was added via the `add_parents` method.
+    pub(crate) fn is_absolute_parent(&self) -> bool {
+        self.0.is_absolute_parent
+    }
+
+    /// Return this matcher's parent, if one exists.
+    pub(crate) fn parent(&self) -> Option<Ignore> {
+        self.0.parent.clone()
+    }
+
+    /// Create a new `Ignore` matcher with the parent directories of `dir`.
+    ///
+    /// Note that this can only be called on an `Ignore` matcher with no
+    /// parents (i.e., `is_root` returns `true`). This will panic otherwise.
+    pub(crate) fn add_parents<P: AsRef<Path>>(&self, path: P) -> (Ignore, Option<Error>) {
+        if !self.0.opts.parents
+            && !self.0.opts.git_ignore
+            && !self.0.opts.git_exclude
+            && !self.0.opts.git_global
+        {
+            // If we never need info from parent directories, then don't do
+            // anything.
+            return (self.clone(), None);
+        }
+        if !self.is_root() {
+            panic!("Ignore::add_parents called on non-root matcher");
+        }
+        let absolute_base = match path.as_ref().canonicalize() {
+            Ok(path) => Arc::new(path),
+            Err(_) => {
+                // There's not much we can do here, so just return our
+                // existing matcher. We drop the error to be consistent
+                // with our general pattern of ignoring I/O errors when
+                // processing ignore files.
+                return (self.clone(), None);
+            }
+        };
+        // List of parents, from child to root.
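+        // For example, if `absolute_base` is `/repo/src`, this collects
+        // `[/repo, /]`; the loop below then walks it in reverse, root first.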
+ let mut parents = vec![]; + let mut path = &**absolute_base; + while let Some(parent) = path.parent() { + parents.push(parent); + path = parent; + } + let mut errs = PartialErrorBuilder::default(); + let mut ig = self.clone(); + for parent in parents.into_iter().rev() { + let mut compiled = self.0.compiled.write().unwrap(); + if let Some(weak) = compiled.get(parent.as_os_str()) { + if let Some(prebuilt) = weak.upgrade() { + ig = Ignore(prebuilt); + continue; + } + } + let (mut igtmp, err) = ig.add_child_path(parent); + errs.maybe_push(err); + igtmp.is_absolute_parent = true; + igtmp.absolute_base = Some(absolute_base.clone()); + igtmp.has_git = if self.0.opts.require_git && self.0.opts.git_ignore { + parent.join(".git").exists() + } else { + false + }; + let ig_arc = Arc::new(igtmp); + ig = Ignore(ig_arc.clone()); + compiled.insert(parent.as_os_str().to_os_string(), Arc::downgrade(&ig_arc)); + } + (ig, errs.into_error_option()) + } + + /// Create a new `Ignore` matcher for the given child directory. + /// + /// Since building the matcher may require reading from multiple + /// files, it's possible that this method partially succeeds. Therefore, + /// a matcher is always returned (which may match nothing) and an error is + /// returned if it exists. + /// + /// Note that all I/O errors are completely ignored. + pub(crate) fn add_child>(&self, dir: P) -> (Ignore, Option) { + let (ig, err) = self.add_child_path(dir.as_ref()); + (Ignore(Arc::new(ig)), err) + } + + /// Like add_child, but takes a full path and returns an IgnoreInner. + fn add_child_path(&self, dir: &Path) -> (IgnoreInner, Option) { + let git_type = + if self.0.opts.require_git && (self.0.opts.git_ignore || self.0.opts.git_exclude) { + dir.join(".git").metadata().ok().map(|md| md.file_type()) + } else { + None + }; + let has_git = git_type.map(|_| true).unwrap_or(false); + + let mut errs = PartialErrorBuilder::default(); + let custom_ig_matcher = if self.0.custom_ignore_filenames.is_empty() { + Gitignore::empty() + } else { + let (m, err) = create_gitignore( + &dir, + &dir, + &self.0.custom_ignore_filenames, + self.0.opts.ignore_case_insensitive, + ); + errs.maybe_push(err); + m + }; + let ig_matcher = if !self.0.opts.ignore { + Gitignore::empty() + } else { + let (m, err) = create_gitignore( + &dir, + &dir, + &[".ignore"], + self.0.opts.ignore_case_insensitive, + ); + errs.maybe_push(err); + m + }; + let gi_matcher = if !self.0.opts.git_ignore { + Gitignore::empty() + } else { + let (m, err) = create_gitignore( + &dir, + &dir, + &[".gitignore"], + self.0.opts.ignore_case_insensitive, + ); + errs.maybe_push(err); + m + }; + let gi_exclude_matcher = if !self.0.opts.git_exclude { + Gitignore::empty() + } else { + match resolve_git_commondir(dir, git_type) { + Ok(git_dir) => { + let (m, err) = create_gitignore( + &dir, + &git_dir, + &["info/exclude"], + self.0.opts.ignore_case_insensitive, + ); + errs.maybe_push(err); + m + } + Err(err) => { + errs.maybe_push(err); + Gitignore::empty() + } + } + }; + let ig = IgnoreInner { + compiled: self.0.compiled.clone(), + dir: dir.to_path_buf(), + overrides: self.0.overrides.clone(), + types: self.0.types.clone(), + parent: Some(self.clone()), + is_absolute_parent: false, + absolute_base: self.0.absolute_base.clone(), + explicit_ignores: self.0.explicit_ignores.clone(), + custom_ignore_filenames: self.0.custom_ignore_filenames.clone(), + custom_ignore_matcher: custom_ig_matcher, + ignore_matcher: ig_matcher, + git_global_matcher: self.0.git_global_matcher.clone(), + git_ignore_matcher: 
gi_matcher, + git_exclude_matcher: gi_exclude_matcher, + has_git, + opts: self.0.opts, + }; + (ig, errs.into_error_option()) + } + + /// Returns true if at least one type of ignore rule should be matched. + fn has_any_ignore_rules(&self) -> bool { + let opts = self.0.opts; + let has_custom_ignore_files = !self.0.custom_ignore_filenames.is_empty(); + let has_explicit_ignores = !self.0.explicit_ignores.is_empty(); + + opts.ignore + || opts.git_global + || opts.git_ignore + || opts.git_exclude + || has_custom_ignore_files + || has_explicit_ignores + } + + /// Like `matched`, but works with a directory entry instead. + pub(crate) fn matched_dir_entry<'a>(&'a self, dent: &DirEntry) -> Match> { + let m = self.matched(dent.path(), dent.is_dir()); + if m.is_none() && self.0.opts.hidden && is_hidden(dent) { + return Match::Ignore(IgnoreMatch::hidden()); + } + m + } + + /// Returns a match indicating whether the given file path should be + /// ignored or not. + /// + /// The match contains information about its origin. + fn matched<'a, P: AsRef>(&'a self, path: P, is_dir: bool) -> Match> { + // We need to be careful with our path. If it has a leading ./, then + // strip it because it causes nothing but trouble. + let mut path = path.as_ref(); + if let Some(p) = strip_prefix("./", path) { + path = p; + } + // Match against the override patterns. If an override matches + // regardless of whether it's whitelist/ignore, then we quit and + // return that result immediately. Overrides have the highest + // precedence. + if !self.0.overrides.is_empty() { + let mat = self + .0 + .overrides + .matched(path, is_dir) + .map(IgnoreMatch::overrides); + if !mat.is_none() { + return mat; + } + } + let mut whitelisted = Match::None; + if self.has_any_ignore_rules() { + let mat = self.matched_ignore(path, is_dir); + if mat.is_ignore() { + return mat; + } else if mat.is_whitelist() { + whitelisted = mat; + } + } + if !self.0.types.is_empty() { + let mat = self.0.types.matched(path, is_dir).map(IgnoreMatch::types); + if mat.is_ignore() { + return mat; + } else if mat.is_whitelist() { + whitelisted = mat; + } + } + whitelisted + } + + /// Performs matching only on the ignore files for this directory and + /// all parent directories. + fn matched_ignore<'a>(&'a self, path: &Path, is_dir: bool) -> Match> { + let (mut m_custom_ignore, mut m_ignore, mut m_gi, mut m_gi_exclude, mut m_explicit) = ( + Match::None, + Match::None, + Match::None, + Match::None, + Match::None, + ); + let any_git = !self.0.opts.require_git || self.parents().any(|ig| ig.0.has_git); + let mut saw_git = false; + for ig in self.parents().take_while(|ig| !ig.0.is_absolute_parent) { + if m_custom_ignore.is_none() { + m_custom_ignore = + ig.0.custom_ignore_matcher + .matched(path, is_dir) + .map(IgnoreMatch::gitignore); + } + if m_ignore.is_none() { + m_ignore = + ig.0.ignore_matcher + .matched(path, is_dir) + .map(IgnoreMatch::gitignore); + } + if any_git && !saw_git && m_gi.is_none() { + m_gi = + ig.0.git_ignore_matcher + .matched(path, is_dir) + .map(IgnoreMatch::gitignore); + } + if any_git && !saw_git && m_gi_exclude.is_none() { + m_gi_exclude = + ig.0.git_exclude_matcher + .matched(path, is_dir) + .map(IgnoreMatch::gitignore); + } + saw_git = saw_git || ig.0.has_git; + } + if self.0.opts.parents { + if let Some(abs_parent_path) = self.absolute_base() { + // What we want to do here is take the absolute base path of + // this directory and join it with the path we're searching. 
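+            // For instance, if this matcher's `dir` is `./src` and its
+            // absolute base is `/repo/src`, a search path of `src/foo.rs`
+            // should be rewritten to `/repo/src/foo.rs`.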
+ // The main issue we want to avoid is accidentally duplicating + // directory components, so we try to strip any common prefix + // off of `path`. Overall, this seems a little ham-fisted, but + // it does fix a nasty bug. It should do fine until we overhaul + // this crate. + let dirpath = self.0.dir.as_path(); + let path_prefix = match strip_prefix("./", dirpath) { + None => dirpath, + Some(stripped_dot_slash) => stripped_dot_slash, + }; + let path = match strip_prefix(path_prefix, path) { + None => abs_parent_path.join(path), + Some(p) => { + let p = match strip_prefix("/", p) { + None => p, + Some(p) => p, + }; + abs_parent_path.join(p) + } + }; + + for ig in self.parents().skip_while(|ig| !ig.0.is_absolute_parent) { + if m_custom_ignore.is_none() { + m_custom_ignore = + ig.0.custom_ignore_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + if m_ignore.is_none() { + m_ignore = + ig.0.ignore_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + if any_git && !saw_git && m_gi.is_none() { + m_gi = + ig.0.git_ignore_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + if any_git && !saw_git && m_gi_exclude.is_none() { + m_gi_exclude = + ig.0.git_exclude_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + saw_git = saw_git || ig.0.has_git; + } + } + } + for gi in self.0.explicit_ignores.iter().rev() { + if !m_explicit.is_none() { + break; + } + m_explicit = gi.matched(&path, is_dir).map(IgnoreMatch::gitignore); + } + let m_global = if any_git { + self.0 + .git_global_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore) + } else { + Match::None + }; + + m_custom_ignore + .or(m_ignore) + .or(m_gi) + .or(m_gi_exclude) + .or(m_global) + .or(m_explicit) + } + + /// Returns an iterator over parent ignore matchers, including this one. + pub(crate) fn parents(&self) -> Parents<'_> { + Parents(Some(self)) + } + + /// Returns the first absolute path of the first absolute parent, if + /// one exists. + fn absolute_base(&self) -> Option<&Path> { + self.0.absolute_base.as_ref().map(|p| &***p) + } +} + +/// An iterator over all parents of an ignore matcher, including itself. +/// +/// The lifetime `'a` refers to the lifetime of the initial `Ignore` matcher. +pub(crate) struct Parents<'a>(Option<&'a Ignore>); + +impl<'a> Iterator for Parents<'a> { + type Item = &'a Ignore; + + fn next(&mut self) -> Option<&'a Ignore> { + match self.0.take() { + None => None, + Some(ig) => { + self.0 = ig.0.parent.as_ref(); + Some(ig) + } + } + } +} + +/// A builder for creating an Ignore matcher. +#[derive(Clone, Debug)] +pub(crate) struct IgnoreBuilder { + /// The root directory path for this ignore matcher. + dir: PathBuf, + /// An override matcher (default is empty). + overrides: Arc, + /// A type matcher (default is empty). + types: Arc, + /// Explicit global ignore matchers. + explicit_ignores: Vec, + /// Ignore files in addition to .ignore. + custom_ignore_filenames: Vec, + /// Ignore config. + opts: IgnoreOptions, +} + +impl IgnoreBuilder { + /// Create a new builder for an `Ignore` matcher. + /// + /// All relative file paths are resolved with respect to the current + /// working directory. 
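+    ///
+    /// An illustrative (crate-internal) use, combining a couple of the
+    /// options defined below:
+    ///
+    /// ```text
+    /// let ig = IgnoreBuilder::new()
+    ///     .hidden(false)      // also yield hidden entries
+    ///     .require_git(false) // apply git rules outside git repos
+    ///     .build();
+    /// ```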
+ pub(crate) fn new() -> IgnoreBuilder { + IgnoreBuilder { + dir: Path::new("").to_path_buf(), + overrides: Arc::new(Override::empty()), + types: Arc::new(Types::empty()), + explicit_ignores: vec![], + custom_ignore_filenames: vec![], + opts: IgnoreOptions { + hidden: true, + ignore: true, + parents: true, + git_global: true, + git_ignore: true, + git_exclude: true, + ignore_case_insensitive: false, + require_git: true, + }, + } + } + + /// Builds a new `Ignore` matcher. + /// + /// The matcher returned won't match anything until ignore rules from + /// directories are added to it. + pub(crate) fn build(&self) -> Ignore { + let git_global_matcher = if !self.opts.git_global { + Gitignore::empty() + } else { + let mut builder = GitignoreBuilder::new(""); + builder + .case_insensitive(self.opts.ignore_case_insensitive) + .unwrap(); + let (gi, err) = builder.build_global(); + if let Some(err) = err { + log::debug!("{}", err); + } + gi + }; + + Ignore(Arc::new(IgnoreInner { + compiled: Arc::new(RwLock::new(HashMap::new())), + dir: self.dir.clone(), + overrides: self.overrides.clone(), + types: self.types.clone(), + parent: None, + is_absolute_parent: true, + absolute_base: None, + explicit_ignores: Arc::new(self.explicit_ignores.clone()), + custom_ignore_filenames: Arc::new(self.custom_ignore_filenames.clone()), + custom_ignore_matcher: Gitignore::empty(), + ignore_matcher: Gitignore::empty(), + git_global_matcher: Arc::new(git_global_matcher), + git_ignore_matcher: Gitignore::empty(), + git_exclude_matcher: Gitignore::empty(), + has_git: false, + opts: self.opts, + })) + } + + /// Add an override matcher. + /// + /// By default, no override matcher is used. + /// + /// This overrides any previous setting. + pub(crate) fn overrides(&mut self, overrides: Override) -> &mut IgnoreBuilder { + self.overrides = Arc::new(overrides); + self + } + + /// Add a file type matcher. + /// + /// By default, no file type matcher is used. + /// + /// This overrides any previous setting. + pub(crate) fn types(&mut self, types: Types) -> &mut IgnoreBuilder { + self.types = Arc::new(types); + self + } + + /// Adds a new global ignore matcher from the ignore file path given. + pub(crate) fn add_ignore(&mut self, ig: Gitignore) -> &mut IgnoreBuilder { + self.explicit_ignores.push(ig); + self + } + + /// Add a custom ignore file name + /// + /// These ignore files have higher precedence than all other ignore files. + /// + /// When specifying multiple names, earlier names have lower precedence than + /// later names. + pub(crate) fn add_custom_ignore_filename>( + &mut self, + file_name: S, + ) -> &mut IgnoreBuilder { + self.custom_ignore_filenames + .push(file_name.as_ref().to_os_string()); + self + } + + /// Enables ignoring hidden files. + /// + /// This is enabled by default. + pub(crate) fn hidden(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.hidden = yes; + self + } + + /// Enables reading `.ignore` files. + /// + /// `.ignore` files have the same semantics as `gitignore` files and are + /// supported by search tools such as ripgrep and The Silver Searcher. + /// + /// This is enabled by default. + pub(crate) fn ignore(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.ignore = yes; + self + } + + /// Enables reading ignore files from parent directories. + /// + /// If this is enabled, then .gitignore files in parent directories of each + /// file path given are respected. Otherwise, they are ignored. + /// + /// This is enabled by default. 
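+    ///
+    /// For example, when this is enabled and `repo/.gitignore` exists,
+    /// searching `repo/src` still respects the patterns in that parent
+    /// `.gitignore` (via `Ignore::add_parents`).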
+ pub(crate) fn parents(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.parents = yes; + self + } + + /// Add a global gitignore matcher. + /// + /// Its precedence is lower than both normal `.gitignore` files and + /// `.git/info/exclude` files. + /// + /// This overwrites any previous global gitignore setting. + /// + /// This is enabled by default. + pub(crate) fn git_global(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.git_global = yes; + self + } + + /// Enables reading `.gitignore` files. + /// + /// `.gitignore` files have match semantics as described in the `gitignore` + /// man page. + /// + /// This is enabled by default. + pub(crate) fn git_ignore(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.git_ignore = yes; + self + } + + /// Enables reading `.git/info/exclude` files. + /// + /// `.git/info/exclude` files have match semantics as described in the + /// `gitignore` man page. + /// + /// This is enabled by default. + pub(crate) fn git_exclude(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.git_exclude = yes; + self + } + + /// Whether a git repository is required to apply git-related ignore + /// rules (global rules, .gitignore and local exclude rules). + /// + /// When disabled, git-related ignore rules are applied even when searching + /// outside a git repository. + pub(crate) fn require_git(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.require_git = yes; + self + } + + /// Process ignore files case insensitively + /// + /// This is disabled by default. + pub(crate) fn ignore_case_insensitive(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.ignore_case_insensitive = yes; + self + } +} + +/// Creates a new gitignore matcher for the directory given. +/// +/// The matcher is meant to match files below `dir`. +/// Ignore globs are extracted from each of the file names relative to +/// `dir_for_ignorefile` in the order given (earlier names have lower +/// precedence than later names). +/// +/// I/O errors are ignored. +pub(crate) fn create_gitignore>( + dir: &Path, + dir_for_ignorefile: &Path, + names: &[T], + case_insensitive: bool, +) -> (Gitignore, Option) { + let mut builder = GitignoreBuilder::new(dir); + let mut errs = PartialErrorBuilder::default(); + builder.case_insensitive(case_insensitive).unwrap(); + for name in names { + let gipath = dir_for_ignorefile.join(name.as_ref()); + // This check is not necessary, but is added for performance. Namely, + // a simple stat call checking for existence can often be just a bit + // quicker than actually trying to open a file. Since the number of + // directories without ignore files likely greatly exceeds the number + // with ignore files, this check generally makes sense. + // + // However, until demonstrated otherwise, we speculatively do not do + // this on Windows since Windows is notorious for having slow file + // system operations. Namely, it's not clear whether this analysis + // makes sense on Windows. + // + // For more details: https://github.com/BurntSushi/ripgrep/pull/1381 + if cfg!(windows) || gipath.exists() { + errs.maybe_push_ignore_io(builder.add(gipath)); + } + } + let gi = match builder.build() { + Ok(gi) => gi, + Err(err) => { + errs.push(err); + GitignoreBuilder::new(dir).build().unwrap() + } + }; + (gi, errs.into_error_option()) +} + +/// Find the GIT_COMMON_DIR for the given git worktree. +/// +/// This is the directory that may contain a private ignore file +/// "info/exclude". 
Unlike git, this function does *not* read environment +/// variables GIT_DIR and GIT_COMMON_DIR, because it is not clear how to use +/// them when multiple repositories are searched. +/// +/// Some I/O errors are ignored. +fn resolve_git_commondir(dir: &Path, git_type: Option) -> Result> { + let git_dir_path = || dir.join(".git"); + let git_dir = git_dir_path(); + if !git_type.map_or(false, |ft| ft.is_file()) { + return Ok(git_dir); + } + let file = match File::open(git_dir) { + Ok(file) => io::BufReader::new(file), + Err(err) => { + return Err(Some(Error::Io(err).with_path(git_dir_path()))); + } + }; + let dot_git_line = match file.lines().next() { + Some(Ok(line)) => line, + Some(Err(err)) => { + return Err(Some(Error::Io(err).with_path(git_dir_path()))); + } + None => return Err(None), + }; + if !dot_git_line.starts_with("gitdir: ") { + return Err(None); + } + let real_git_dir = PathBuf::from(&dot_git_line["gitdir: ".len()..]); + let git_commondir_file = || real_git_dir.join("commondir"); + let file = match File::open(git_commondir_file()) { + Ok(file) => io::BufReader::new(file), + Err(_) => return Err(None), + }; + let commondir_line = match file.lines().next() { + Some(Ok(line)) => line, + Some(Err(err)) => { + return Err(Some(Error::Io(err).with_path(git_commondir_file()))); + } + None => return Err(None), + }; + let commondir_abs = if commondir_line.starts_with(".") { + real_git_dir.join(commondir_line) // relative commondir + } else { + PathBuf::from(commondir_line) + }; + Ok(commondir_abs) +} + +#[cfg(test)] +mod tests { + use std::{io::Write, path::Path}; + + use crate::{dir::IgnoreBuilder, gitignore::Gitignore, tests::TempDir, Error}; + + fn wfile>(path: P, contents: &str) { + let mut file = std::fs::File::create(path).unwrap(); + file.write_all(contents.as_bytes()).unwrap(); + } + + fn mkdirp>(path: P) { + std::fs::create_dir_all(path).unwrap(); + } + + fn partial(err: Error) -> Vec { + match err { + Error::Partial(errs) => errs, + _ => panic!("expected partial error but got {:?}", err), + } + } + + fn tmpdir() -> TempDir { + TempDir::new().unwrap() + } + + #[test] + fn explicit_ignore() { + let td = tmpdir(); + wfile(td.path().join("not-an-ignore"), "foo\n!bar"); + + let (gi, err) = Gitignore::new(td.path().join("not-an-ignore")); + assert!(err.is_none()); + let (ig, err) = IgnoreBuilder::new() + .add_ignore(gi) + .build() + .add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn git_exclude() { + let td = tmpdir(); + mkdirp(td.path().join(".git/info")); + wfile(td.path().join(".git/info/exclude"), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn gitignore() { + let td = tmpdir(); + mkdirp(td.path().join(".git")); + wfile(td.path().join(".gitignore"), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn gitignore_no_git() { + let td = tmpdir(); + wfile(td.path().join(".gitignore"), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + 
assert!(err.is_none()); + assert!(ig.matched("foo", false).is_none()); + assert!(ig.matched("bar", false).is_none()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn gitignore_allowed_no_git() { + let td = tmpdir(); + wfile(td.path().join(".gitignore"), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new() + .require_git(false) + .build() + .add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn ignore() { + let td = tmpdir(); + wfile(td.path().join(".ignore"), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn custom_ignore() { + let td = tmpdir(); + let custom_ignore = ".customignore"; + wfile(td.path().join(custom_ignore), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new() + .add_custom_ignore_filename(custom_ignore) + .build() + .add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + // Tests that a custom ignore file will override an .ignore. + #[test] + fn custom_ignore_over_ignore() { + let td = tmpdir(); + let custom_ignore = ".customignore"; + wfile(td.path().join(".ignore"), "foo"); + wfile(td.path().join(custom_ignore), "!foo"); + + let (ig, err) = IgnoreBuilder::new() + .add_custom_ignore_filename(custom_ignore) + .build() + .add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_whitelist()); + } + + // Tests that earlier custom ignore files have lower precedence than later. + #[test] + fn custom_ignore_precedence() { + let td = tmpdir(); + let custom_ignore1 = ".customignore1"; + let custom_ignore2 = ".customignore2"; + wfile(td.path().join(custom_ignore1), "foo"); + wfile(td.path().join(custom_ignore2), "!foo"); + + let (ig, err) = IgnoreBuilder::new() + .add_custom_ignore_filename(custom_ignore1) + .add_custom_ignore_filename(custom_ignore2) + .build() + .add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_whitelist()); + } + + // Tests that an .ignore will override a .gitignore. + #[test] + fn ignore_over_gitignore() { + let td = tmpdir(); + wfile(td.path().join(".gitignore"), "foo"); + wfile(td.path().join(".ignore"), "!foo"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_whitelist()); + } + + // Tests that exclude has lower precedent than both .ignore and .gitignore. 
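    // Per `matched_ignore`, the precedence chain checked here is:
    // .ignore, then .gitignore, then .git/info/exclude.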
+ #[test] + fn exclude_lowest() { + let td = tmpdir(); + wfile(td.path().join(".gitignore"), "!foo"); + wfile(td.path().join(".ignore"), "!bar"); + mkdirp(td.path().join(".git/info")); + wfile(td.path().join(".git/info/exclude"), "foo\nbar\nbaz"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("baz", false).is_ignore()); + assert!(ig.matched("foo", false).is_whitelist()); + assert!(ig.matched("bar", false).is_whitelist()); + } + + #[test] + fn errored() { + let td = tmpdir(); + wfile(td.path().join(".gitignore"), "{foo"); + + let (_, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_some()); + } + + #[test] + fn errored_both() { + let td = tmpdir(); + wfile(td.path().join(".gitignore"), "{foo"); + wfile(td.path().join(".ignore"), "{bar"); + + let (_, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert_eq!(2, partial(err.expect("an error")).len()); + } + + #[test] + fn errored_partial() { + let td = tmpdir(); + mkdirp(td.path().join(".git")); + wfile(td.path().join(".gitignore"), "{foo\nbar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_some()); + assert!(ig.matched("bar", false).is_ignore()); + } + + #[test] + fn errored_partial_and_ignore() { + let td = tmpdir(); + wfile(td.path().join(".gitignore"), "{foo\nbar"); + wfile(td.path().join(".ignore"), "!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_some()); + assert!(ig.matched("bar", false).is_whitelist()); + } + + #[test] + fn not_present_empty() { + let td = tmpdir(); + + let (_, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + } + + #[test] + fn stops_at_git_dir() { + // This tests that .gitignore files beyond a .git barrier aren't + // matched, but .ignore files are. + let td = tmpdir(); + mkdirp(td.path().join(".git")); + mkdirp(td.path().join("foo/.git")); + wfile(td.path().join(".gitignore"), "foo"); + wfile(td.path().join(".ignore"), "bar"); + + let ig0 = IgnoreBuilder::new().build(); + let (ig1, err) = ig0.add_child(td.path()); + assert!(err.is_none()); + let (ig2, err) = ig1.add_child(ig1.path().join("foo")); + assert!(err.is_none()); + + assert!(ig1.matched("foo", false).is_ignore()); + assert!(ig2.matched("foo", false).is_none()); + + assert!(ig1.matched("bar", false).is_ignore()); + assert!(ig2.matched("bar", false).is_ignore()); + } + + #[test] + fn absolute_parent() { + let td = tmpdir(); + mkdirp(td.path().join(".git")); + mkdirp(td.path().join("foo")); + wfile(td.path().join(".gitignore"), "bar"); + + // First, check that the parent gitignore file isn't detected if the + // parent isn't added. This establishes a baseline. + let ig0 = IgnoreBuilder::new().build(); + let (ig1, err) = ig0.add_child(td.path().join("foo")); + assert!(err.is_none()); + assert!(ig1.matched("bar", false).is_none()); + + // Second, check that adding a parent directory actually works. 
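+        // (`add_parents` compiles matchers for every ancestor of `foo`,
+        // including the temp dir itself, so its .gitignore now applies.)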
+ let ig0 = IgnoreBuilder::new().build(); + let (ig1, err) = ig0.add_parents(td.path().join("foo")); + assert!(err.is_none()); + let (ig2, err) = ig1.add_child(td.path().join("foo")); + assert!(err.is_none()); + assert!(ig2.matched("bar", false).is_ignore()); + } + + #[test] + fn absolute_parent_anchored() { + let td = tmpdir(); + mkdirp(td.path().join(".git")); + mkdirp(td.path().join("src/llvm")); + wfile(td.path().join(".gitignore"), "/llvm/\nfoo"); + + let ig0 = IgnoreBuilder::new().build(); + let (ig1, err) = ig0.add_parents(td.path().join("src")); + assert!(err.is_none()); + let (ig2, err) = ig1.add_child("src"); + assert!(err.is_none()); + + assert!(ig1.matched("llvm", true).is_none()); + assert!(ig2.matched("llvm", true).is_none()); + assert!(ig2.matched("src/llvm", true).is_none()); + assert!(ig2.matched("foo", false).is_ignore()); + assert!(ig2.matched("src/foo", false).is_ignore()); + } + + #[test] + fn git_info_exclude_in_linked_worktree() { + let td = tmpdir(); + let git_dir = td.path().join(".git"); + mkdirp(git_dir.join("info")); + wfile(git_dir.join("info/exclude"), "ignore_me"); + mkdirp(git_dir.join("worktrees/linked-worktree")); + let commondir_path = || git_dir.join("worktrees/linked-worktree/commondir"); + mkdirp(td.path().join("linked-worktree")); + let worktree_git_dir_abs = format!( + "gitdir: {}", + git_dir.join("worktrees/linked-worktree").to_str().unwrap(), + ); + wfile( + td.path().join("linked-worktree/.git"), + &worktree_git_dir_abs, + ); + + // relative commondir + wfile(commondir_path(), "../.."); + let ib = IgnoreBuilder::new().build(); + let (ignore, err) = ib.add_child(td.path().join("linked-worktree")); + assert!(err.is_none()); + assert!(ignore.matched("ignore_me", false).is_ignore()); + + // absolute commondir + wfile(commondir_path(), git_dir.to_str().unwrap()); + let (ignore, err) = ib.add_child(td.path().join("linked-worktree")); + assert!(err.is_none()); + assert!(ignore.matched("ignore_me", false).is_ignore()); + + // missing commondir file + assert!(std::fs::remove_file(commondir_path()).is_ok()); + let (_, err) = ib.add_child(td.path().join("linked-worktree")); + // We squash the error in this case, because it occurs in repositories + // that are not linked worktrees but have submodules. + assert!(err.is_none()); + + wfile(td.path().join("linked-worktree/.git"), "garbage"); + let (_, err) = ib.add_child(td.path().join("linked-worktree")); + assert!(err.is_none()); + + wfile(td.path().join("linked-worktree/.git"), "gitdir: garbage"); + let (_, err) = ib.add_child(td.path().join("linked-worktree")); + assert!(err.is_none()); + } +} diff --git a/crates/ignore/src/gitignore.rs b/crates/ignore/src/gitignore.rs new file mode 100644 index 000000000000..30f1ccef2d7c --- /dev/null +++ b/crates/ignore/src/gitignore.rs @@ -0,0 +1,812 @@ +/*! +The gitignore module provides a way to match globs from a gitignore file +against file paths. + +Note that this module implements the specification as described in the +`gitignore` man page from scratch. That is, this module does *not* shell out to +the `git` command line tool. +*/ + +use std::{ + fs::File, + io::{BufRead, BufReader, Read}, + path::{Path, PathBuf}, + sync::Arc, +}; + +use { + globset::{Candidate, GlobBuilder, GlobSet, GlobSetBuilder}, + regex_automata::util::pool::Pool, +}; + +use crate::{ + pathutil::{is_file_name, strip_prefix}, + Error, Match, PartialErrorBuilder, +}; + +/// Glob represents a single glob in a gitignore file. 
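+/// One glob corresponds to one pattern line, e.g. `*.log` or the
+/// whitelisting form `!important.log`.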
+///
+/// This is used to report information about the highest precedent glob that
+/// matched in one or more gitignore files.
+#[derive(Clone, Debug)]
+pub struct Glob {
+    /// The file path that this glob was extracted from.
+    from: Option<PathBuf>,
+    /// The original glob string.
+    original: String,
+    /// The actual glob string used to convert to a regex.
+    actual: String,
+    /// Whether this is a whitelisted glob or not.
+    is_whitelist: bool,
+    /// Whether this glob should only match directories or not.
+    is_only_dir: bool,
+}
+
+impl Glob {
+    /// Returns the file path that defined this glob.
+    pub fn from(&self) -> Option<&Path> {
+        self.from.as_ref().map(|p| &**p)
+    }
+
+    /// The original glob as it was defined in a gitignore file.
+    pub fn original(&self) -> &str {
+        &self.original
+    }
+
+    /// The actual glob that was compiled to respect gitignore
+    /// semantics.
+    pub fn actual(&self) -> &str {
+        &self.actual
+    }
+
+    /// Whether this was a whitelisted glob or not.
+    pub fn is_whitelist(&self) -> bool {
+        self.is_whitelist
+    }
+
+    /// Whether this glob must match a directory or not.
+    pub fn is_only_dir(&self) -> bool {
+        self.is_only_dir
+    }
+
+    /// Returns true if and only if this glob has a `**/` prefix.
+    fn has_doublestar_prefix(&self) -> bool {
+        self.actual.starts_with("**/") || self.actual == "**"
+    }
+}
+
+/// Gitignore is a matcher for the globs in one or more gitignore files
+/// in the same directory.
+#[derive(Clone, Debug)]
+pub struct Gitignore {
+    set: GlobSet,
+    root: PathBuf,
+    globs: Vec<Glob>,
+    num_ignores: u64,
+    num_whitelists: u64,
+    matches: Option<Arc<Pool<Vec<usize>>>>,
+}
+
+impl Gitignore {
+    /// Creates a new gitignore matcher from the gitignore file path given.
+    ///
+    /// If it's desirable to include multiple gitignore files in a single
+    /// matcher, or read gitignore globs from a different source, then
+    /// use `GitignoreBuilder`.
+    ///
+    /// This always returns a valid matcher, even if it's empty. In particular,
+    /// a Gitignore file can be partially valid, e.g., when one glob is invalid
+    /// but the rest aren't.
+    ///
+    /// Note that I/O errors are ignored. For more granular control over
+    /// errors, use `GitignoreBuilder`.
+    pub fn new<P: AsRef<Path>>(gitignore_path: P) -> (Gitignore, Option<Error>) {
+        let path = gitignore_path.as_ref();
+        let parent = path.parent().unwrap_or(Path::new("/"));
+        let mut builder = GitignoreBuilder::new(parent);
+        let mut errs = PartialErrorBuilder::default();
+        errs.maybe_push_ignore_io(builder.add(path));
+        match builder.build() {
+            Ok(gi) => (gi, errs.into_error_option()),
+            Err(err) => {
+                errs.push(err);
+                (Gitignore::empty(), errs.into_error_option())
+            }
+        }
+    }
+
+    /// Creates a new gitignore matcher from the global ignore file, if one
+    /// exists.
+    ///
+    /// The global config file path is specified by git's `core.excludesFile`
+    /// config option.
+    ///
+    /// Git's config file location is `$HOME/.gitconfig`. If `$HOME/.gitconfig`
+    /// does not exist or does not specify `core.excludesFile`, then
+    /// `$XDG_CONFIG_HOME/git/ignore` is read. If `$XDG_CONFIG_HOME` is not
+    /// set or is empty, then `$HOME/.config/git/ignore` is used instead.
+    pub fn global() -> (Gitignore, Option<Error>) {
+        GitignoreBuilder::new("").build_global()
+    }
+
+    /// Creates a new empty gitignore matcher that never matches anything.
+    ///
+    /// Its path is empty.
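+    ///
+    /// # Example
+    ///
+    /// A minimal sketch: the empty matcher reports `Match::None` for any
+    /// path (the path below is arbitrary).
+    ///
+    /// ```
+    /// use ignore::gitignore::Gitignore;
+    ///
+    /// let gi = Gitignore::empty();
+    /// assert!(gi.is_empty());
+    /// assert!(gi.matched("src/main.rs", false).is_none());
+    /// ```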
+    pub fn empty() -> Gitignore {
+        Gitignore {
+            set: GlobSet::empty(),
+            root: PathBuf::from(""),
+            globs: vec![],
+            num_ignores: 0,
+            num_whitelists: 0,
+            matches: None,
+        }
+    }
+
+    /// Returns the directory containing this gitignore matcher.
+    ///
+    /// All matches are done relative to this path.
+    pub fn path(&self) -> &Path {
+        &*self.root
+    }
+
+    /// Returns true if and only if this gitignore has zero globs, and
+    /// therefore never matches any file path.
+    pub fn is_empty(&self) -> bool {
+        self.set.is_empty()
+    }
+
+    /// Returns the total number of globs, which should be equivalent to
+    /// `num_ignores + num_whitelists`.
+    pub fn len(&self) -> usize {
+        self.set.len()
+    }
+
+    /// Returns the total number of ignore globs.
+    pub fn num_ignores(&self) -> u64 {
+        self.num_ignores
+    }
+
+    /// Returns the total number of whitelisted globs.
+    pub fn num_whitelists(&self) -> u64 {
+        self.num_whitelists
+    }
+
+    /// Returns whether the given path (file or directory) matched a pattern
+    /// in this gitignore matcher.
+    ///
+    /// `is_dir` should be true if the path refers to a directory and false
+    /// otherwise.
+    ///
+    /// The given path is matched relative to the path given when building
+    /// the matcher. Specifically, before matching `path`, its prefix (as
+    /// determined by a common suffix of the directory containing this
+    /// gitignore) is stripped. If there is no common suffix/prefix overlap,
+    /// then `path` is assumed to be relative to this matcher.
+    pub fn matched<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> Match<&Glob> {
+        if self.is_empty() {
+            return Match::None;
+        }
+        self.matched_stripped(self.strip(path.as_ref()), is_dir)
+    }
+
+    /// Returns whether the given path (file or directory, and expected to be
+    /// under the root) or any of its parent directories (up to the root)
+    /// matched a pattern in this gitignore matcher.
+    ///
+    /// NOTE: This method is more expensive than walking the directory
+    /// hierarchy top-to-bottom and matching the entries, but it is easier
+    /// to use when a list of paths is available without a hierarchy.
+    ///
+    /// `is_dir` should be true if the path refers to a directory and false
+    /// otherwise.
+    ///
+    /// The given path is matched relative to the path given when building
+    /// the matcher. Specifically, before matching `path`, its prefix (as
+    /// determined by a common suffix of the directory containing this
+    /// gitignore) is stripped. If there is no common suffix/prefix overlap,
+    /// then `path` is assumed to be relative to this matcher.
+    ///
+    /// # Panics
+    ///
+    /// This method panics if the given file path is not under the root path
+    /// of this matcher.
+    pub fn matched_path_or_any_parents<P: AsRef<Path>>(
+        &self,
+        path: P,
+        is_dir: bool,
+    ) -> Match<&Glob> {
+        if self.is_empty() {
+            return Match::None;
+        }
+        let mut path = self.strip(path.as_ref());
+        assert!(!path.has_root(), "path is expected to be under the root");
+
+        match self.matched_stripped(path, is_dir) {
+            Match::None => (), // walk up
+            a_match => return a_match,
+        }
+        while let Some(parent) = path.parent() {
+            match self.matched_stripped(parent, /* is_dir */ true) {
+                Match::None => path = parent, // walk up
+                a_match => return a_match,
+            }
+        }
+        Match::None
+    }
+
+    /// Like matched, but takes a path that has already been stripped.
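+    ///
+    /// For example, with a matcher rooted at `/home/user/project` (a
+    /// hypothetical root), `matched("/home/user/project/target", true)`
+    /// strips the root first, so this method only ever sees `target`.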
+ fn matched_stripped>(&self, path: P, is_dir: bool) -> Match<&Glob> { + if self.is_empty() { + return Match::None; + } + let path = path.as_ref(); + let mut matches = self.matches.as_ref().unwrap().get(); + let candidate = Candidate::new(path); + self.set.matches_candidate_into(&candidate, &mut *matches); + for &i in matches.iter().rev() { + let glob = &self.globs[i]; + if !glob.is_only_dir() || is_dir { + return if glob.is_whitelist() { + Match::Whitelist(glob) + } else { + Match::Ignore(glob) + }; + } + } + Match::None + } + + /// Strips the given path such that it's suitable for matching with this + /// gitignore matcher. + fn strip<'a, P: 'a + AsRef + ?Sized>(&'a self, path: &'a P) -> &'a Path { + let mut path = path.as_ref(); + // A leading ./ is completely superfluous. We also strip it from + // our gitignore root path, so we need to strip it from our candidate + // path too. + if let Some(p) = strip_prefix("./", path) { + path = p; + } + // Strip any common prefix between the candidate path and the root + // of the gitignore, to make sure we get relative matching right. + // BUT, a file name might not have any directory components to it, + // in which case, we don't want to accidentally strip any part of the + // file name. + // + // As an additional special case, if the root is just `.`, then we + // shouldn't try to strip anything, e.g., when path begins with a `.`. + if self.root != Path::new(".") && !is_file_name(path) { + if let Some(p) = strip_prefix(&self.root, path) { + path = p; + // If we're left with a leading slash, get rid of it. + if let Some(p) = strip_prefix("/", path) { + path = p; + } + } + } + path + } +} + +/// Builds a matcher for a single set of globs from a .gitignore file. +#[derive(Clone, Debug)] +pub struct GitignoreBuilder { + builder: GlobSetBuilder, + root: PathBuf, + globs: Vec, + case_insensitive: bool, +} + +impl GitignoreBuilder { + /// Create a new builder for a gitignore file. + /// + /// The path given should be the path at which the globs for this gitignore + /// file should be matched. Note that paths are always matched relative + /// to the root path given here. Generally, the root path should correspond + /// to the *directory* containing a `.gitignore` file. + pub fn new>(root: P) -> GitignoreBuilder { + let root = root.as_ref(); + GitignoreBuilder { + builder: GlobSetBuilder::new(), + root: strip_prefix("./", root).unwrap_or(root).to_path_buf(), + globs: vec![], + case_insensitive: false, + } + } + + /// Builds a new matcher from the globs added so far. + /// + /// Once a matcher is built, no new globs can be added to it. + pub fn build(&self) -> Result { + let nignore = self.globs.iter().filter(|g| !g.is_whitelist()).count(); + let nwhite = self.globs.iter().filter(|g| g.is_whitelist()).count(); + let set = self.builder.build().map_err(|err| Error::Glob { + glob: None, + err: err.to_string(), + })?; + Ok(Gitignore { + set, + root: self.root.clone(), + globs: self.globs.clone(), + num_ignores: nignore as u64, + num_whitelists: nwhite as u64, + matches: Some(Arc::new(Pool::new(|| vec![]))), + }) + } + + /// Build a global gitignore matcher using the configuration in this + /// builder. + /// + /// This consumes ownership of the builder unlike `build` because it + /// must mutate the builder to add the global gitignore globs. + /// + /// Note that this ignores the path given to this builder's constructor + /// and instead derives the path automatically from git's global + /// configuration. 
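+    ///
+    /// # Example
+    ///
+    /// A minimal sketch; this reads the user's git configuration from the
+    /// environment, so it is marked `no_run`:
+    ///
+    /// ```no_run
+    /// use ignore::gitignore::GitignoreBuilder;
+    ///
+    /// let (gi, err) = GitignoreBuilder::new("").build_global();
+    /// if let Some(err) = err {
+    ///     eprintln!("partial error: {}", err);
+    /// }
+    /// println!("loaded {} global ignore globs", gi.len());
+    /// ```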
+ pub fn build_global(mut self) -> (Gitignore, Option) { + match gitconfig_excludes_path() { + None => (Gitignore::empty(), None), + Some(path) => { + if !path.is_file() { + (Gitignore::empty(), None) + } else { + let mut errs = PartialErrorBuilder::default(); + errs.maybe_push_ignore_io(self.add(path)); + match self.build() { + Ok(gi) => (gi, errs.into_error_option()), + Err(err) => { + errs.push(err); + (Gitignore::empty(), errs.into_error_option()) + } + } + } + } + } + } + + /// Add each glob from the file path given. + /// + /// The file given should be formatted as a `gitignore` file. + /// + /// Note that partial errors can be returned. For example, if there was + /// a problem adding one glob, an error for that will be returned, but + /// all other valid globs will still be added. + pub fn add>(&mut self, path: P) -> Option { + let path = path.as_ref(); + let file = match File::open(path) { + Err(err) => return Some(Error::Io(err).with_path(path)), + Ok(file) => file, + }; + log::debug!("opened gitignore file: {}", path.display()); + let rdr = BufReader::new(file); + let mut errs = PartialErrorBuilder::default(); + for (i, line) in rdr.lines().enumerate() { + let lineno = (i + 1) as u64; + let line = match line { + Ok(line) => line, + Err(err) => { + errs.push(Error::Io(err).tagged(path, lineno)); + break; + } + }; + if let Err(err) = self.add_line(Some(path.to_path_buf()), &line) { + errs.push(err.tagged(path, lineno)); + } + } + errs.into_error_option() + } + + /// Add each glob line from the string given. + /// + /// If this string came from a particular `gitignore` file, then its path + /// should be provided here. + /// + /// The string given should be formatted as a `gitignore` file. + #[cfg(test)] + fn add_str( + &mut self, + from: Option, + gitignore: &str, + ) -> Result<&mut GitignoreBuilder, Error> { + for line in gitignore.lines() { + self.add_line(from.clone(), line)?; + } + Ok(self) + } + + /// Add a line from a gitignore file to this builder. + /// + /// If this line came from a particular `gitignore` file, then its path + /// should be provided here. + /// + /// If the line could not be parsed as a glob, then an error is returned. + pub fn add_line( + &mut self, + from: Option, + mut line: &str, + ) -> Result<&mut GitignoreBuilder, Error> { + #![allow(deprecated)] + + if line.starts_with("#") { + return Ok(self); + } + if !line.ends_with("\\ ") { + line = line.trim_right(); + } + if line.is_empty() { + return Ok(self); + } + let mut glob = Glob { + from, + original: line.to_string(), + actual: String::new(), + is_whitelist: false, + is_only_dir: false, + }; + let mut is_absolute = false; + if line.starts_with("\\!") || line.starts_with("\\#") { + line = &line[1..]; + is_absolute = line.chars().nth(0) == Some('/'); + } else { + if line.starts_with("!") { + glob.is_whitelist = true; + line = &line[1..]; + } + if line.starts_with("/") { + // `man gitignore` says that if a glob starts with a slash, + // then the glob can only match the beginning of a path + // (relative to the location of gitignore). We achieve this by + // simply banning wildcards from matching /. + line = &line[1..]; + is_absolute = true; + } + } + // If it ends with a slash, then this should only match directories, + // but the slash should otherwise not be used while globbing. + if line.as_bytes().last() == Some(&b'/') { + glob.is_only_dir = true; + line = &line[..line.len() - 1]; + // If the slash was escaped, then remove the escape. 
+ // See: https://github.com/BurntSushi/ripgrep/issues/2236 + if line.as_bytes().last() == Some(&b'\\') { + line = &line[..line.len() - 1]; + } + } + glob.actual = line.to_string(); + // If there is a literal slash, then this is a glob that must match the + // entire path name. Otherwise, we should let it match anywhere, so use + // a **/ prefix. + if !is_absolute && !line.chars().any(|c| c == '/') { + // ... but only if we don't already have a **/ prefix. + if !glob.has_doublestar_prefix() { + glob.actual = format!("**/{}", glob.actual); + } + } + // If the glob ends with `/**`, then we should only match everything + // inside a directory, but not the directory itself. Standard globs + // will match the directory. So we add `/*` to force the issue. + if glob.actual.ends_with("/**") { + glob.actual = format!("{}/*", glob.actual); + } + let parsed = GlobBuilder::new(&glob.actual) + .literal_separator(true) + .case_insensitive(self.case_insensitive) + .backslash_escape(true) + .build() + .map_err(|err| Error::Glob { + glob: Some(glob.original.clone()), + err: err.kind().to_string(), + })?; + self.builder.add(parsed); + self.globs.push(glob); + Ok(self) + } + + /// Toggle whether the globs should be matched case insensitively or not. + /// + /// When this option is changed, only globs added after the change will be + /// affected. + /// + /// This is disabled by default. + pub fn case_insensitive(&mut self, yes: bool) -> Result<&mut GitignoreBuilder, Error> { + // TODO: This should not return a `Result`. Fix this in the next semver + // release. + self.case_insensitive = yes; + Ok(self) + } +} + +/// Return the file path of the current environment's global gitignore file. +/// +/// Note that the file path returned may not exist. +pub fn gitconfig_excludes_path() -> Option { + // git supports $HOME/.gitconfig and $XDG_CONFIG_HOME/git/config. Notably, + // both can be active at the same time, where $HOME/.gitconfig takes + // precedent. So if $HOME/.gitconfig defines a `core.excludesFile`, then + // we're done. + match gitconfig_home_contents().and_then(|x| parse_excludes_file(&x)) { + Some(path) => return Some(path), + None => {} + } + match gitconfig_xdg_contents().and_then(|x| parse_excludes_file(&x)) { + Some(path) => return Some(path), + None => {} + } + excludes_file_default() +} + +/// Returns the file contents of git's global config file, if one exists, in +/// the user's home directory. +fn gitconfig_home_contents() -> Option> { + let home = match home_dir() { + None => return None, + Some(home) => home, + }; + let mut file = match File::open(home.join(".gitconfig")) { + Err(_) => return None, + Ok(file) => BufReader::new(file), + }; + let mut contents = vec![]; + file.read_to_end(&mut contents).ok().map(|_| contents) +} + +/// Returns the file contents of git's global config file, if one exists, in +/// the user's XDG_CONFIG_HOME directory. +fn gitconfig_xdg_contents() -> Option> { + let path = std::env::var_os("XDG_CONFIG_HOME") + .and_then(|x| { + if x.is_empty() { + None + } else { + Some(PathBuf::from(x)) + } + }) + .or_else(|| home_dir().map(|p| p.join(".config"))) + .map(|x| x.join("git/config")); + let mut file = match path.and_then(|p| File::open(p).ok()) { + None => return None, + Some(file) => BufReader::new(file), + }; + let mut contents = vec![]; + file.read_to_end(&mut contents).ok().map(|_| contents) +} + +/// Returns the default file path for a global .gitignore file. +/// +/// Specifically, this respects XDG_CONFIG_HOME. 
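+///
+/// That is, the result is `$XDG_CONFIG_HOME/git/ignore` when
+/// `$XDG_CONFIG_HOME` is set and non-empty, and `$HOME/.config/git/ignore`
+/// otherwise.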
+fn excludes_file_default() -> Option { + std::env::var_os("XDG_CONFIG_HOME") + .and_then(|x| { + if x.is_empty() { + None + } else { + Some(PathBuf::from(x)) + } + }) + .or_else(|| home_dir().map(|p| p.join(".config"))) + .map(|x| x.join("git/ignore")) +} + +/// Extract git's `core.excludesfile` config setting from the raw file contents +/// given. +fn parse_excludes_file(data: &[u8]) -> Option { + use std::sync::OnceLock; + + use regex_automata::{meta::Regex, util::syntax}; + + // N.B. This is the lazy approach, and isn't technically correct, but + // probably works in more circumstances. I guess we would ideally have + // a full INI parser. Yuck. + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| { + Regex::builder() + .configure(Regex::config().utf8_empty(false)) + .syntax(syntax::Config::new().utf8(false)) + .build(r#"(?im-u)^\s*excludesfile\s*=\s*"?\s*(\S+?)\s*"?\s*$"#) + .unwrap() + }); + // We don't care about amortizing allocs here I think. This should only + // be called ~once per traversal or so? (Although it's not guaranteed...) + let mut caps = re.create_captures(); + re.captures(data, &mut caps); + let span = caps.get_group(1)?; + let candidate = &data[span]; + std::str::from_utf8(candidate) + .ok() + .map(|s| PathBuf::from(expand_tilde(s))) +} + +/// Expands ~ in file paths to the value of $HOME. +fn expand_tilde(path: &str) -> String { + let home = match home_dir() { + None => return path.to_string(), + Some(home) => home.to_string_lossy().into_owned(), + }; + path.replace("~", &home) +} + +/// Returns the location of the user's home directory. +fn home_dir() -> Option { + // We're fine with using std::env::home_dir for now. Its bugs are, IMO, + // pretty minor corner cases. + #![allow(deprecated)] + std::env::home_dir() +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use super::{Gitignore, GitignoreBuilder}; + + fn gi_from_str>(root: P, s: &str) -> Gitignore { + let mut builder = GitignoreBuilder::new(root); + builder.add_str(None, s).unwrap(); + builder.build().unwrap() + } + + macro_rules! ignored { + ($name:ident, $root:expr, $gi:expr, $path:expr) => { + ignored!($name, $root, $gi, $path, false); + }; + ($name:ident, $root:expr, $gi:expr, $path:expr, $is_dir:expr) => { + #[test] + fn $name() { + let gi = gi_from_str($root, $gi); + assert!(gi.matched($path, $is_dir).is_ignore()); + } + }; + } + + macro_rules! 
not_ignored { + ($name:ident, $root:expr, $gi:expr, $path:expr) => { + not_ignored!($name, $root, $gi, $path, false); + }; + ($name:ident, $root:expr, $gi:expr, $path:expr, $is_dir:expr) => { + #[test] + fn $name() { + let gi = gi_from_str($root, $gi); + assert!(!gi.matched($path, $is_dir).is_ignore()); + } + }; + } + + const ROOT: &'static str = "/home/foobar/rust/rg"; + + ignored!(ig1, ROOT, "months", "months"); + ignored!(ig2, ROOT, "*.lock", "Cargo.lock"); + ignored!(ig3, ROOT, "*.rs", "src/main.rs"); + ignored!(ig4, ROOT, "src/*.rs", "src/main.rs"); + ignored!(ig5, ROOT, "/*.c", "cat-file.c"); + ignored!(ig6, ROOT, "/src/*.rs", "src/main.rs"); + ignored!(ig7, ROOT, "!src/main.rs\n*.rs", "src/main.rs"); + ignored!(ig8, ROOT, "foo/", "foo", true); + ignored!(ig9, ROOT, "**/foo", "foo"); + ignored!(ig10, ROOT, "**/foo", "src/foo"); + ignored!(ig11, ROOT, "**/foo/**", "src/foo/bar"); + ignored!(ig12, ROOT, "**/foo/**", "wat/src/foo/bar/baz"); + ignored!(ig13, ROOT, "**/foo/bar", "foo/bar"); + ignored!(ig14, ROOT, "**/foo/bar", "src/foo/bar"); + ignored!(ig15, ROOT, "abc/**", "abc/x"); + ignored!(ig16, ROOT, "abc/**", "abc/x/y"); + ignored!(ig17, ROOT, "abc/**", "abc/x/y/z"); + ignored!(ig18, ROOT, "a/**/b", "a/b"); + ignored!(ig19, ROOT, "a/**/b", "a/x/b"); + ignored!(ig20, ROOT, "a/**/b", "a/x/y/b"); + ignored!(ig21, ROOT, r"\!xy", "!xy"); + ignored!(ig22, ROOT, r"\#foo", "#foo"); + ignored!(ig23, ROOT, "foo", "./foo"); + ignored!(ig24, ROOT, "target", "grep/target"); + ignored!(ig25, ROOT, "Cargo.lock", "./tabwriter-bin/Cargo.lock"); + ignored!(ig26, ROOT, "/foo/bar/baz", "./foo/bar/baz"); + ignored!(ig27, ROOT, "foo/", "xyz/foo", true); + ignored!(ig28, "./src", "/llvm/", "./src/llvm", true); + ignored!(ig29, ROOT, "node_modules/ ", "node_modules", true); + ignored!(ig30, ROOT, "**/", "foo/bar", true); + ignored!(ig31, ROOT, "path1/*", "path1/foo"); + ignored!(ig32, ROOT, ".a/b", ".a/b"); + ignored!(ig33, "./", ".a/b", ".a/b"); + ignored!(ig34, ".", ".a/b", ".a/b"); + ignored!(ig35, "./.", ".a/b", ".a/b"); + ignored!(ig36, "././", ".a/b", ".a/b"); + ignored!(ig37, "././.", ".a/b", ".a/b"); + ignored!(ig38, ROOT, "\\[", "["); + ignored!(ig39, ROOT, "\\?", "?"); + ignored!(ig40, ROOT, "\\*", "*"); + ignored!(ig41, ROOT, "\\a", "a"); + ignored!(ig42, ROOT, "s*.rs", "sfoo.rs"); + ignored!(ig43, ROOT, "**", "foo.rs"); + ignored!(ig44, ROOT, "**/**/*", "a/foo.rs"); + + not_ignored!(ignot1, ROOT, "amonths", "months"); + not_ignored!(ignot2, ROOT, "monthsa", "months"); + not_ignored!(ignot3, ROOT, "/src/*.rs", "src/grep/src/main.rs"); + not_ignored!(ignot4, ROOT, "/*.c", "mozilla-sha1/sha1.c"); + not_ignored!(ignot5, ROOT, "/src/*.rs", "src/grep/src/main.rs"); + not_ignored!(ignot6, ROOT, "*.rs\n!src/main.rs", "src/main.rs"); + not_ignored!(ignot7, ROOT, "foo/", "foo", false); + not_ignored!(ignot8, ROOT, "**/foo/**", "wat/src/afoo/bar/baz"); + not_ignored!(ignot9, ROOT, "**/foo/**", "wat/src/fooa/bar/baz"); + not_ignored!(ignot10, ROOT, "**/foo/bar", "foo/src/bar"); + not_ignored!(ignot11, ROOT, "#foo", "#foo"); + not_ignored!(ignot12, ROOT, "\n\n\n", "foo"); + not_ignored!(ignot13, ROOT, "foo/**", "foo", true); + not_ignored!( + ignot14, + "./third_party/protobuf", + "m4/ltoptions.m4", + "./third_party/protobuf/csharp/src/packages/repositories.config" + ); + not_ignored!(ignot15, ROOT, "!/bar", "foo/bar"); + not_ignored!(ignot16, ROOT, "*\n!**/", "foo", true); + not_ignored!(ignot17, ROOT, "src/*.rs", "src/grep/src/main.rs"); + not_ignored!(ignot18, ROOT, "path1/*", "path2/path1/foo"); + 
not_ignored!(ignot19, ROOT, "s*.rs", "src/foo.rs"); + + fn bytes(s: &str) -> Vec { + s.to_string().into_bytes() + } + + fn path_string>(path: P) -> String { + path.as_ref().to_str().unwrap().to_string() + } + + #[test] + fn parse_excludes_file1() { + let data = bytes("[core]\nexcludesFile = /foo/bar"); + let got = super::parse_excludes_file(&data).unwrap(); + assert_eq!(path_string(got), "/foo/bar"); + } + + #[test] + fn parse_excludes_file2() { + let data = bytes("[core]\nexcludesFile = ~/foo/bar"); + let got = super::parse_excludes_file(&data).unwrap(); + assert_eq!(path_string(got), super::expand_tilde("~/foo/bar")); + } + + #[test] + fn parse_excludes_file3() { + let data = bytes("[core]\nexcludeFile = /foo/bar"); + assert!(super::parse_excludes_file(&data).is_none()); + } + + #[test] + fn parse_excludes_file4() { + let data = bytes("[core]\nexcludesFile = \"~/foo/bar\""); + let got = super::parse_excludes_file(&data); + assert_eq!(path_string(got.unwrap()), super::expand_tilde("~/foo/bar")); + } + + #[test] + fn parse_excludes_file5() { + let data = bytes("[core]\nexcludesFile = \" \"~/foo/bar \" \""); + assert!(super::parse_excludes_file(&data).is_none()); + } + + // See: https://github.com/BurntSushi/ripgrep/issues/106 + #[test] + fn regression_106() { + gi_from_str("/", " "); + } + + #[test] + fn case_insensitive() { + let gi = GitignoreBuilder::new(ROOT) + .case_insensitive(true) + .unwrap() + .add_str(None, "*.html") + .unwrap() + .build() + .unwrap(); + assert!(gi.matched("foo.html", false).is_ignore()); + assert!(gi.matched("foo.HTML", false).is_ignore()); + assert!(!gi.matched("foo.htm", false).is_ignore()); + assert!(!gi.matched("foo.HTM", false).is_ignore()); + } + + ignored!(cs1, ROOT, "*.html", "foo.html"); + not_ignored!(cs2, ROOT, "*.html", "foo.HTML"); + not_ignored!(cs3, ROOT, "*.html", "foo.htm"); + not_ignored!(cs4, ROOT, "*.html", "foo.HTM"); +} diff --git a/crates/ignore/src/lib.rs b/crates/ignore/src/lib.rs new file mode 100644 index 000000000000..cd0af7ad1c47 --- /dev/null +++ b/crates/ignore/src/lib.rs @@ -0,0 +1,564 @@ +/*! +The ignore crate provides a fast recursive directory iterator that respects +various filters such as globs, file types and `.gitignore` files. The precise +matching rules and precedence is explained in the documentation for +`WalkBuilder`. + +Secondarily, this crate exposes gitignore and file type matchers for use cases +that demand more fine-grained control. + +# Example + +This example shows the most basic usage of this crate. This code will +recursively traverse the current directory while automatically filtering out +files and directories according to ignore globs found in files like +`.ignore` and `.gitignore`: + + +```rust,no_run +use ignore::Walk; + +for result in Walk::new("./") { + // Each item yielded by the iterator is either a directory entry or an + // error, so either print the path or the error. + match result { + Ok(entry) => println!("{}", entry.path().display()), + Err(err) => println!("ERROR: {}", err), + } +} +``` + +# Example: advanced + +By default, the recursive directory iterator will ignore hidden files and +directories. This can be disabled by building the iterator with `WalkBuilder`: + +```rust,no_run +use ignore::WalkBuilder; + +for result in WalkBuilder::new("./").hidden(false).build() { + println!("{:?}", result); +} +``` + +See the documentation for `WalkBuilder` for many other options. 
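+
+# Example: parallel walk
+
+`WalkBuilder` can also build a parallel walker. A minimal sketch (the
+visitor closure below is illustrative):
+
+```rust,no_run
+use ignore::{WalkBuilder, WalkState};
+
+WalkBuilder::new("./").build_parallel().run(|| {
+    // Called once per worker thread; returns that thread's visitor.
+    Box::new(|result| {
+        if let Ok(entry) = result {
+            println!("{}", entry.path().display());
+        }
+        WalkState::Continue
+    })
+});
+```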
+*/ + +#![deny(missing_docs)] + +use std::path::{Path, PathBuf}; + +pub use crate::walk::{ + DirEntry, ParallelVisitor, ParallelVisitorBuilder, Walk, WalkBuilder, WalkParallel, WalkState, +}; + +mod default_types; +mod dir; +pub mod gitignore; +pub mod overrides; +mod pathutil; +pub mod types; +mod walk; + +/// Represents an error that can occur when parsing a gitignore file. +#[derive(Debug)] +pub enum Error { + /// A collection of "soft" errors. These occur when adding an ignore + /// file partially succeeded. + Partial(Vec), + /// An error associated with a specific line number. + WithLineNumber { + /// The line number. + line: u64, + /// The underlying error. + err: Box, + }, + /// An error associated with a particular file path. + WithPath { + /// The file path. + path: PathBuf, + /// The underlying error. + err: Box, + }, + /// An error associated with a particular directory depth when recursively + /// walking a directory. + WithDepth { + /// The directory depth. + depth: usize, + /// The underlying error. + err: Box, + }, + /// An error that occurs when a file loop is detected when traversing + /// symbolic links. + Loop { + /// The ancestor file path in the loop. + ancestor: PathBuf, + /// The child file path in the loop. + child: PathBuf, + }, + /// An error that occurs when doing I/O, such as reading an ignore file. + Io(std::io::Error), + /// An error that occurs when trying to parse a glob. + Glob { + /// The original glob that caused this error. This glob, when + /// available, always corresponds to the glob provided by an end user. + /// e.g., It is the glob as written in a `.gitignore` file. + /// + /// (This glob may be distinct from the glob that is actually + /// compiled, after accounting for `gitignore` semantics.) + glob: Option, + /// The underlying glob error as a string. + err: String, + }, + /// A type selection for a file type that is not defined. + UnrecognizedFileType(String), + /// A user specified file type definition could not be parsed. + InvalidDefinition, +} + +impl Clone for Error { + fn clone(&self) -> Error { + match *self { + Error::Partial(ref errs) => Error::Partial(errs.clone()), + Error::WithLineNumber { line, ref err } => Error::WithLineNumber { + line, + err: err.clone(), + }, + Error::WithPath { ref path, ref err } => Error::WithPath { + path: path.clone(), + err: err.clone(), + }, + Error::WithDepth { depth, ref err } => Error::WithDepth { + depth, + err: err.clone(), + }, + Error::Loop { + ref ancestor, + ref child, + } => Error::Loop { + ancestor: ancestor.clone(), + child: child.clone(), + }, + Error::Io(ref err) => match err.raw_os_error() { + Some(e) => Error::Io(std::io::Error::from_raw_os_error(e)), + None => Error::Io(std::io::Error::new(err.kind(), err.to_string())), + }, + Error::Glob { ref glob, ref err } => Error::Glob { + glob: glob.clone(), + err: err.clone(), + }, + Error::UnrecognizedFileType(ref err) => Error::UnrecognizedFileType(err.clone()), + Error::InvalidDefinition => Error::InvalidDefinition, + } + } +} + +impl Error { + /// Returns true if this is a partial error. + /// + /// A partial error occurs when only some operations failed while others + /// may have succeeded. For example, an ignore file may contain an invalid + /// glob among otherwise valid globs. + pub fn is_partial(&self) -> bool { + match *self { + Error::Partial(_) => true, + Error::WithLineNumber { ref err, .. } => err.is_partial(), + Error::WithPath { ref err, .. } => err.is_partial(), + Error::WithDepth { ref err, .. 
} => err.is_partial(), + _ => false, + } + } + + /// Returns true if this error is exclusively an I/O error. + pub fn is_io(&self) -> bool { + match *self { + Error::Partial(ref errs) => errs.len() == 1 && errs[0].is_io(), + Error::WithLineNumber { ref err, .. } => err.is_io(), + Error::WithPath { ref err, .. } => err.is_io(), + Error::WithDepth { ref err, .. } => err.is_io(), + Error::Loop { .. } => false, + Error::Io(_) => true, + Error::Glob { .. } => false, + Error::UnrecognizedFileType(_) => false, + Error::InvalidDefinition => false, + } + } + + /// Inspect the original [`std::io::Error`] if there is one. + /// + /// [`None`] is returned if the [`Error`] doesn't correspond to an + /// [`std::io::Error`]. This might happen, for example, when the error was + /// produced because a cycle was found in the directory tree while + /// following symbolic links. + /// + /// This method returns a borrowed value that is bound to the lifetime of the [`Error`]. To + /// obtain an owned value, the [`into_io_error`] can be used instead. + /// + /// > This is the original [`std::io::Error`] and is _not_ the same as + /// > [`impl From for std::io::Error`][impl] which contains + /// > additional context about the error. + /// + /// [`None`]: https://doc.rust-lang.org/stable/std/option/enum.Option.html#variant.None + /// [`std::io::Error`]: https://doc.rust-lang.org/stable/std/io/struct.Error.html + /// [`From`]: https://doc.rust-lang.org/stable/std/convert/trait.From.html + /// [`Error`]: struct.Error.html + /// [`into_io_error`]: struct.Error.html#method.into_io_error + /// [impl]: struct.Error.html#impl-From%3CError%3E + pub fn io_error(&self) -> Option<&std::io::Error> { + match *self { + Error::Partial(ref errs) => { + if errs.len() == 1 { + errs[0].io_error() + } else { + None + } + } + Error::WithLineNumber { ref err, .. } => err.io_error(), + Error::WithPath { ref err, .. } => err.io_error(), + Error::WithDepth { ref err, .. } => err.io_error(), + Error::Loop { .. } => None, + Error::Io(ref err) => Some(err), + Error::Glob { .. } => None, + Error::UnrecognizedFileType(_) => None, + Error::InvalidDefinition => None, + } + } + + /// Similar to [`io_error`] except consumes self to convert to the original + /// [`std::io::Error`] if one exists. + /// + /// [`io_error`]: struct.Error.html#method.io_error + /// [`std::io::Error`]: https://doc.rust-lang.org/stable/std/io/struct.Error.html + pub fn into_io_error(self) -> Option { + match self { + Error::Partial(mut errs) => { + if errs.len() == 1 { + errs.remove(0).into_io_error() + } else { + None + } + } + Error::WithLineNumber { err, .. } => err.into_io_error(), + Error::WithPath { err, .. } => err.into_io_error(), + Error::WithDepth { err, .. } => err.into_io_error(), + Error::Loop { .. } => None, + Error::Io(err) => Some(err), + Error::Glob { .. } => None, + Error::UnrecognizedFileType(_) => None, + Error::InvalidDefinition => None, + } + } + + /// Returns a depth associated with recursively walking a directory (if + /// this error was generated from a recursive directory iterator). + pub fn depth(&self) -> Option { + match *self { + Error::WithPath { ref err, .. } => err.depth(), + Error::WithDepth { depth, .. } => Some(depth), + _ => None, + } + } + + /// Turn an error into a tagged error with the given file path. + fn with_path>(self, path: P) -> Error { + Error::WithPath { + path: path.as_ref().to_path_buf(), + err: Box::new(self), + } + } + + /// Turn an error into a tagged error with the given depth. 
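+    ///
+    /// For example, an error wrapped with depth `2` will report `Some(2)`
+    /// from `depth()`.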
+ fn with_depth(self, depth: usize) -> Error { + Error::WithDepth { + depth, + err: Box::new(self), + } + } + + /// Turn an error into a tagged error with the given file path and line + /// number. If path is empty, then it is omitted from the error. + fn tagged>(self, path: P, lineno: u64) -> Error { + let errline = Error::WithLineNumber { + line: lineno, + err: Box::new(self), + }; + if path.as_ref().as_os_str().is_empty() { + return errline; + } + errline.with_path(path) + } + + /// Build an error from a walkdir error. + fn from_walkdir(err: walkdir::Error) -> Error { + let depth = err.depth(); + if let (Some(anc), Some(child)) = (err.loop_ancestor(), err.path()) { + return Error::WithDepth { + depth, + err: Box::new(Error::Loop { + ancestor: anc.to_path_buf(), + child: child.to_path_buf(), + }), + }; + } + let path = err.path().map(|p| p.to_path_buf()); + let mut ig_err = Error::Io(std::io::Error::from(err)); + if let Some(path) = path { + ig_err = Error::WithPath { + path, + err: Box::new(ig_err), + }; + } + ig_err + } +} + +impl std::error::Error for Error { + #[allow(deprecated)] + fn description(&self) -> &str { + match *self { + Error::Partial(_) => "partial error", + Error::WithLineNumber { ref err, .. } => err.description(), + Error::WithPath { ref err, .. } => err.description(), + Error::WithDepth { ref err, .. } => err.description(), + Error::Loop { .. } => "file system loop found", + Error::Io(ref err) => err.description(), + Error::Glob { ref err, .. } => err, + Error::UnrecognizedFileType(_) => "unrecognized file type", + Error::InvalidDefinition => "invalid definition", + } + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match *self { + Error::Partial(ref errs) => { + let msgs: Vec = errs.iter().map(|err| err.to_string()).collect(); + write!(f, "{}", msgs.join("\n")) + } + Error::WithLineNumber { line, ref err } => { + write!(f, "line {}: {}", line, err) + } + Error::WithPath { ref path, ref err } => { + write!(f, "{}: {}", path.display(), err) + } + Error::WithDepth { ref err, .. } => err.fmt(f), + Error::Loop { + ref ancestor, + ref child, + } => write!( + f, + "File system loop found: \ + {} points to an ancestor {}", + child.display(), + ancestor.display() + ), + Error::Io(ref err) => err.fmt(f), + Error::Glob { + glob: None, + ref err, + } => write!(f, "{}", err), + Error::Glob { + glob: Some(ref glob), + ref err, + } => { + write!(f, "error parsing glob '{}': {}", glob, err) + } + Error::UnrecognizedFileType(ref ty) => { + write!(f, "unrecognized file type: {}", ty) + } + Error::InvalidDefinition => write!( + f, + "invalid definition (format is type:glob, e.g., \ + html:*.html)" + ), + } + } +} + +impl From for Error { + fn from(err: std::io::Error) -> Error { + Error::Io(err) + } +} + +#[derive(Debug, Default)] +struct PartialErrorBuilder(Vec); + +impl PartialErrorBuilder { + fn push(&mut self, err: Error) { + self.0.push(err); + } + + fn push_ignore_io(&mut self, err: Error) { + if !err.is_io() { + self.push(err); + } + } + + fn maybe_push(&mut self, err: Option) { + if let Some(err) = err { + self.push(err); + } + } + + fn maybe_push_ignore_io(&mut self, err: Option) { + if let Some(err) = err { + self.push_ignore_io(err); + } + } + + fn into_error_option(mut self) -> Option { + if self.0.is_empty() { + None + } else if self.0.len() == 1 { + Some(self.0.pop().unwrap()) + } else { + Some(Error::Partial(self.0)) + } + } +} + +/// The result of a glob match. 
+///
+/// The type parameter `T` typically refers to a type that provides more
+/// information about a particular match. For example, it might identify
+/// the specific gitignore file and the specific glob pattern that caused
+/// the match.
+#[derive(Clone, Debug)]
+pub enum Match<T> {
+    /// The path didn't match any glob.
+    None,
+    /// The highest precedent glob matched indicates the path should be
+    /// ignored.
+    Ignore(T),
+    /// The highest precedent glob matched indicates the path should be
+    /// whitelisted.
+    Whitelist(T),
+}
+
+impl<T> Match<T> {
+    /// Returns true if the match result didn't match any globs.
+    pub fn is_none(&self) -> bool {
+        match *self {
+            Match::None => true,
+            Match::Ignore(_) | Match::Whitelist(_) => false,
+        }
+    }
+
+    /// Returns true if the match result implies the path should be ignored.
+    pub fn is_ignore(&self) -> bool {
+        match *self {
+            Match::Ignore(_) => true,
+            Match::None | Match::Whitelist(_) => false,
+        }
+    }
+
+    /// Returns true if the match result implies the path should be
+    /// whitelisted.
+    pub fn is_whitelist(&self) -> bool {
+        match *self {
+            Match::Whitelist(_) => true,
+            Match::None | Match::Ignore(_) => false,
+        }
+    }
+
+    /// Inverts the match so that `Ignore` becomes `Whitelist` and
+    /// `Whitelist` becomes `Ignore`. A non-match remains the same.
+    pub fn invert(self) -> Match<T> {
+        match self {
+            Match::None => Match::None,
+            Match::Ignore(t) => Match::Whitelist(t),
+            Match::Whitelist(t) => Match::Ignore(t),
+        }
+    }
+
+    /// Return the value inside this match if it exists.
+    pub fn inner(&self) -> Option<&T> {
+        match *self {
+            Match::None => None,
+            Match::Ignore(ref t) => Some(t),
+            Match::Whitelist(ref t) => Some(t),
+        }
+    }
+
+    /// Apply the given function to the value inside this match.
+    ///
+    /// If the match has no value, then return the match unchanged.
+    pub fn map<U, F: FnOnce(T) -> U>(self, f: F) -> Match<U> {
+        match self {
+            Match::None => Match::None,
+            Match::Ignore(t) => Match::Ignore(f(t)),
+            Match::Whitelist(t) => Match::Whitelist(f(t)),
+        }
+    }
+
+    /// Return the match if it is not none. Otherwise, return other.
+    pub fn or(self, other: Self) -> Self {
+        if self.is_none() {
+            other
+        } else {
+            self
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{
+        env, fs,
+        path::{Path, PathBuf},
+    };
+
+    /// A convenient result type alias.
+    pub(crate) type Result<T> =
+        std::result::Result<T, Box<dyn std::error::Error>>;
+
+    macro_rules! err {
+        ($($tt:tt)*) => {
+            Box::<dyn std::error::Error>::from(format!($($tt)*))
+        }
+    }
+
+    /// A simple wrapper for creating a temporary directory that is
+    /// automatically deleted when it's dropped.
+    ///
+    /// We use this in lieu of tempfile because tempfile brings in too many
+    /// dependencies.
+    #[derive(Debug)]
+    pub struct TempDir(PathBuf);
+
+    impl Drop for TempDir {
+        fn drop(&mut self) {
+            fs::remove_dir_all(&self.0).unwrap();
+        }
+    }
+
+    impl TempDir {
+        /// Create a new empty temporary directory under the system's
+        /// configured temporary directory.
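+        ///
+        /// Directories are created at `rust-ignore/<N>` inside
+        /// `std::env::temp_dir()`, where `<N>` is drawn from a
+        /// process-wide counter.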
+ pub fn new() -> Result { + use std::sync::atomic::{AtomicUsize, Ordering}; + + static TRIES: usize = 100; + static COUNTER: AtomicUsize = AtomicUsize::new(0); + + let tmpdir = env::temp_dir(); + for _ in 0..TRIES { + let count = COUNTER.fetch_add(1, Ordering::SeqCst); + let path = tmpdir.join("rust-ignore").join(count.to_string()); + if path.is_dir() { + continue; + } + fs::create_dir_all(&path) + .map_err(|e| err!("failed to create {}: {}", path.display(), e))?; + return Ok(TempDir(path)); + } + Err(err!("failed to create temp dir after {} tries", TRIES)) + } + + /// Return the underlying path to this temporary directory. + pub fn path(&self) -> &Path { + &self.0 + } + } +} diff --git a/crates/ignore/src/overrides.rs b/crates/ignore/src/overrides.rs new file mode 100644 index 000000000000..693c7dd0a79b --- /dev/null +++ b/crates/ignore/src/overrides.rs @@ -0,0 +1,265 @@ +/*! +The overrides module provides a way to specify a set of override globs. +This provides functionality similar to `--include` or `--exclude` in command +line tools. +*/ + +use std::path::Path; + +use crate::{ + gitignore::{self, Gitignore, GitignoreBuilder}, + Error, Match, +}; + +/// Glob represents a single glob in an override matcher. +/// +/// This is used to report information about the highest precedent glob +/// that matched. +/// +/// Note that not all matches necessarily correspond to a specific glob. For +/// example, if there are one or more whitelist globs and a file path doesn't +/// match any glob in the set, then the file path is considered to be ignored. +/// +/// The lifetime `'a` refers to the lifetime of the matcher that produced +/// this glob. +#[derive(Clone, Debug)] +#[allow(dead_code)] +pub struct Glob<'a>(GlobInner<'a>); + +#[derive(Clone, Debug)] +#[allow(dead_code)] +enum GlobInner<'a> { + /// No glob matched, but the file path should still be ignored. + UnmatchedIgnore, + /// A glob matched. + Matched(&'a gitignore::Glob), +} + +impl<'a> Glob<'a> { + fn unmatched() -> Glob<'a> { + Glob(GlobInner::UnmatchedIgnore) + } +} + +/// Manages a set of overrides provided explicitly by the end user. +#[derive(Clone, Debug)] +pub struct Override(Gitignore); + +impl Override { + /// Returns an empty matcher that never matches any file path. + pub fn empty() -> Override { + Override(Gitignore::empty()) + } + + /// Returns the directory of this override set. + /// + /// All matches are done relative to this path. + pub fn path(&self) -> &Path { + self.0.path() + } + + /// Returns true if and only if this matcher is empty. + /// + /// When a matcher is empty, it will never match any file path. + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Returns the total number of ignore globs. + pub fn num_ignores(&self) -> u64 { + self.0.num_whitelists() + } + + /// Returns the total number of whitelisted globs. + pub fn num_whitelists(&self) -> u64 { + self.0.num_ignores() + } + + /// Returns whether the given file path matched a pattern in this override + /// matcher. + /// + /// `is_dir` should be true if the path refers to a directory and false + /// otherwise. + /// + /// If there are no overrides, then this always returns `Match::None`. + /// + /// If there is at least one whitelist override and `is_dir` is false, then + /// this never returns `Match::None`, since non-matches are interpreted as + /// ignored. + /// + /// The given path is matched to the globs relative to the path given + /// when building the override matcher. 
Specifically, before matching + /// `path`, its prefix (as determined by a common suffix of the directory + /// given) is stripped. If there is no common suffix/prefix overlap, then + /// `path` is assumed to reside in the same directory as the root path for + /// this set of overrides. + pub fn matched<'a, P: AsRef>(&'a self, path: P, is_dir: bool) -> Match> { + if self.is_empty() { + return Match::None; + } + let mat = self.0.matched(path, is_dir).invert(); + if mat.is_none() && self.num_whitelists() > 0 && !is_dir { + return Match::Ignore(Glob::unmatched()); + } + mat.map(move |giglob| Glob(GlobInner::Matched(giglob))) + } +} + +/// Builds a matcher for a set of glob overrides. +#[derive(Clone, Debug)] +pub struct OverrideBuilder { + builder: GitignoreBuilder, +} + +impl OverrideBuilder { + /// Create a new override builder. + /// + /// Matching is done relative to the directory path provided. + pub fn new>(path: P) -> OverrideBuilder { + OverrideBuilder { + builder: GitignoreBuilder::new(path), + } + } + + /// Builds a new override matcher from the globs added so far. + /// + /// Once a matcher is built, no new globs can be added to it. + pub fn build(&self) -> Result { + Ok(Override(self.builder.build()?)) + } + + /// Add a glob to the set of overrides. + /// + /// Globs provided here have precisely the same semantics as a single + /// line in a `gitignore` file, where the meaning of `!` is inverted: + /// namely, `!` at the beginning of a glob will ignore a file. Without `!`, + /// all matches of the glob provided are treated as whitelist matches. + pub fn add(&mut self, glob: &str) -> Result<&mut OverrideBuilder, Error> { + self.builder.add_line(None, glob)?; + Ok(self) + } + + /// Toggle whether the globs should be matched case insensitively or not. + /// + /// When this option is changed, only globs added after the change will be affected. + /// + /// This is disabled by default. + pub fn case_insensitive(&mut self, yes: bool) -> Result<&mut OverrideBuilder, Error> { + // TODO: This should not return a `Result`. Fix this in the next semver + // release. 
+ self.builder.case_insensitive(yes)?; + Ok(self) + } +} + +#[cfg(test)] +mod tests { + use super::{Override, OverrideBuilder}; + + const ROOT: &'static str = "/home/andrew/foo"; + + fn ov(globs: &[&str]) -> Override { + let mut builder = OverrideBuilder::new(ROOT); + for glob in globs { + builder.add(glob).unwrap(); + } + builder.build().unwrap() + } + + #[test] + fn empty() { + let ov = ov(&[]); + assert!(ov.matched("a.foo", false).is_none()); + assert!(ov.matched("a", false).is_none()); + assert!(ov.matched("", false).is_none()); + } + + #[test] + fn simple() { + let ov = ov(&["*.foo", "!*.bar"]); + assert!(ov.matched("a.foo", false).is_whitelist()); + assert!(ov.matched("a.foo", true).is_whitelist()); + assert!(ov.matched("a.rs", false).is_ignore()); + assert!(ov.matched("a.rs", true).is_none()); + assert!(ov.matched("a.bar", false).is_ignore()); + assert!(ov.matched("a.bar", true).is_ignore()); + } + + #[test] + fn only_ignores() { + let ov = ov(&["!*.bar"]); + assert!(ov.matched("a.rs", false).is_none()); + assert!(ov.matched("a.rs", true).is_none()); + assert!(ov.matched("a.bar", false).is_ignore()); + assert!(ov.matched("a.bar", true).is_ignore()); + } + + #[test] + fn precedence() { + let ov = ov(&["*.foo", "!*.bar.foo"]); + assert!(ov.matched("a.foo", false).is_whitelist()); + assert!(ov.matched("a.baz", false).is_ignore()); + assert!(ov.matched("a.bar.foo", false).is_ignore()); + } + + #[test] + fn gitignore() { + let ov = ov(&["/foo", "bar/*.rs", "baz/**"]); + assert!(ov.matched("bar/lib.rs", false).is_whitelist()); + assert!(ov.matched("bar/wat/lib.rs", false).is_ignore()); + assert!(ov.matched("wat/bar/lib.rs", false).is_ignore()); + assert!(ov.matched("foo", false).is_whitelist()); + assert!(ov.matched("wat/foo", false).is_ignore()); + assert!(ov.matched("baz", false).is_ignore()); + assert!(ov.matched("baz/a", false).is_whitelist()); + assert!(ov.matched("baz/a/b", false).is_whitelist()); + } + + #[test] + fn allow_directories() { + // This tests that directories are NOT ignored when they are unmatched. 
+ let ov = ov(&["*.rs"]); + assert!(ov.matched("foo.rs", false).is_whitelist()); + assert!(ov.matched("foo.c", false).is_ignore()); + assert!(ov.matched("foo", false).is_ignore()); + assert!(ov.matched("foo", true).is_none()); + assert!(ov.matched("src/foo.rs", false).is_whitelist()); + assert!(ov.matched("src/foo.c", false).is_ignore()); + assert!(ov.matched("src/foo", false).is_ignore()); + assert!(ov.matched("src/foo", true).is_none()); + } + + #[test] + fn absolute_path() { + let ov = ov(&["!/bar"]); + assert!(ov.matched("./foo/bar", false).is_none()); + } + + #[test] + fn case_insensitive() { + let ov = OverrideBuilder::new(ROOT) + .case_insensitive(true) + .unwrap() + .add("*.html") + .unwrap() + .build() + .unwrap(); + assert!(ov.matched("foo.html", false).is_whitelist()); + assert!(ov.matched("foo.HTML", false).is_whitelist()); + assert!(ov.matched("foo.htm", false).is_ignore()); + assert!(ov.matched("foo.HTM", false).is_ignore()); + } + + #[test] + fn default_case_sensitive() { + let ov = OverrideBuilder::new(ROOT) + .add("*.html") + .unwrap() + .build() + .unwrap(); + assert!(ov.matched("foo.html", false).is_whitelist()); + assert!(ov.matched("foo.HTML", false).is_ignore()); + assert!(ov.matched("foo.htm", false).is_ignore()); + assert!(ov.matched("foo.HTM", false).is_ignore()); + } +} diff --git a/crates/ignore/src/pathutil.rs b/crates/ignore/src/pathutil.rs new file mode 100644 index 000000000000..0ceb5a356c32 --- /dev/null +++ b/crates/ignore/src/pathutil.rs @@ -0,0 +1,141 @@ +use std::{ffi::OsStr, path::Path}; + +use crate::walk::DirEntry; + +/// Returns true if and only if this entry is considered to be hidden. +/// +/// This only returns true if the base name of the path starts with a `.`. +/// +/// On Unix, this implements a more optimized check. +#[cfg(unix)] +pub(crate) fn is_hidden(dent: &DirEntry) -> bool { + use std::os::unix::ffi::OsStrExt; + + if let Some(name) = file_name(dent.path()) { + name.as_bytes().get(0) == Some(&b'.') + } else { + false + } +} + +/// Returns true if and only if this entry is considered to be hidden. +/// +/// On Windows, this returns true if one of the following is true: +/// +/// * The base name of the path starts with a `.`. +/// * The file attributes have the `HIDDEN` property set. +#[cfg(windows)] +pub(crate) fn is_hidden(dent: &DirEntry) -> bool { + use std::os::windows::fs::MetadataExt; + use winapi_util::file; + + // This looks like we're doing an extra stat call, but on Windows, the + // directory traverser reuses the metadata retrieved from each directory + // entry and stores it on the DirEntry itself. So this is "free." + if let Ok(md) = dent.metadata() { + if file::is_hidden(md.file_attributes() as u64) { + return true; + } + } + if let Some(name) = file_name(dent.path()) { + name.to_str().map(|s| s.starts_with(".")).unwrap_or(false) + } else { + false + } +} + +/// Returns true if and only if this entry is considered to be hidden. +/// +/// This only returns true if the base name of the path starts with a `.`. +#[cfg(not(any(unix, windows)))] +pub(crate) fn is_hidden(dent: &DirEntry) -> bool { + if let Some(name) = file_name(dent.path()) { + name.to_str().map(|s| s.starts_with(".")).unwrap_or(false) + } else { + false + } +} + +/// Strip `prefix` from the `path` and return the remainder. +/// +/// If `path` doesn't have a prefix `prefix`, then return `None`. 
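+///
+/// For example, stripping the prefix `./` from `./foo/bar` yields
+/// `foo/bar`, while stripping `src/` from `foo/bar` yields `None`.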
+#[cfg(unix)] +pub(crate) fn strip_prefix<'a, P: AsRef + ?Sized>( + prefix: &'a P, + path: &'a Path, +) -> Option<&'a Path> { + use std::os::unix::ffi::OsStrExt; + + let prefix = prefix.as_ref().as_os_str().as_bytes(); + let path = path.as_os_str().as_bytes(); + if prefix.len() > path.len() || prefix != &path[0..prefix.len()] { + None + } else { + Some(&Path::new(OsStr::from_bytes(&path[prefix.len()..]))) + } +} + +/// Strip `prefix` from the `path` and return the remainder. +/// +/// If `path` doesn't have a prefix `prefix`, then return `None`. +#[cfg(not(unix))] +pub(crate) fn strip_prefix<'a, P: AsRef + ?Sized>( + prefix: &'a P, + path: &'a Path, +) -> Option<&'a Path> { + path.strip_prefix(prefix).ok() +} + +/// Returns true if this file path is just a file name. i.e., Its parent is +/// the empty string. +#[cfg(unix)] +pub(crate) fn is_file_name>(path: P) -> bool { + use std::os::unix::ffi::OsStrExt; + + use memchr::memchr; + + let path = path.as_ref().as_os_str().as_bytes(); + memchr(b'/', path).is_none() +} + +/// Returns true if this file path is just a file name. i.e., Its parent is +/// the empty string. +#[cfg(not(unix))] +pub(crate) fn is_file_name>(path: P) -> bool { + path.as_ref() + .parent() + .map(|p| p.as_os_str().is_empty()) + .unwrap_or(false) +} + +/// The final component of the path, if it is a normal file. +/// +/// If the path terminates in ., .., or consists solely of a root of prefix, +/// file_name will return None. +#[cfg(unix)] +pub(crate) fn file_name<'a, P: AsRef + ?Sized>(path: &'a P) -> Option<&'a OsStr> { + use memchr::memrchr; + use std::os::unix::ffi::OsStrExt; + + let path = path.as_ref().as_os_str().as_bytes(); + if path.is_empty() { + return None; + } else if path.len() == 1 && path[0] == b'.' { + return None; + } else if path.last() == Some(&b'.') { + return None; + } else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] { + return None; + } + let last_slash = memrchr(b'/', path).map(|i| i + 1).unwrap_or(0); + Some(OsStr::from_bytes(&path[last_slash..])) +} + +/// The final component of the path, if it is a normal file. +/// +/// If the path terminates in ., .., or consists solely of a root of prefix, +/// file_name will return None. +#[cfg(not(unix))] +pub(crate) fn file_name<'a, P: AsRef + ?Sized>(path: &'a P) -> Option<&'a OsStr> { + path.as_ref().file_name() +} diff --git a/crates/ignore/src/types.rs b/crates/ignore/src/types.rs new file mode 100644 index 000000000000..814ee4a0a5f7 --- /dev/null +++ b/crates/ignore/src/types.rs @@ -0,0 +1,601 @@ +/*! +The types module provides a way of associating globs on file names to file +types. + +This can be used to match specific types of files. For example, among +the default file types provided, the Rust file type is defined to be `*.rs` +with name `rust`. Similarly, the C file type is defined to be `*.{c,h}` with +name `c`. + +Note that the set of default types may change over time. + +# Example + +This shows how to create and use a simple file type matcher using the default +file types defined in this crate. + +``` +use ignore::types::TypesBuilder; + +let mut builder = TypesBuilder::new(); +builder.add_defaults(); +builder.select("rust"); +let matcher = builder.build().unwrap(); + +assert!(matcher.matched("foo.rs", false).is_whitelist()); +assert!(matcher.matched("foo.c", false).is_ignore()); +``` + +# Example: negation + +This is like the previous example, but shows how negating a file type works. 
+That is, this will let us match file paths that *don't* correspond to a +particular file type. + +``` +use ignore::types::TypesBuilder; + +let mut builder = TypesBuilder::new(); +builder.add_defaults(); +builder.negate("c"); +let matcher = builder.build().unwrap(); + +assert!(matcher.matched("foo.rs", false).is_none()); +assert!(matcher.matched("foo.c", false).is_ignore()); +``` + +# Example: custom file type definitions + +This shows how to extend this library default file type definitions with +your own. + +``` +use ignore::types::TypesBuilder; + +let mut builder = TypesBuilder::new(); +builder.add_defaults(); +builder.add("foo", "*.foo"); +// Another way of adding a file type definition. +// This is useful when accepting input from an end user. +builder.add_def("bar:*.bar"); +// Note: we only select `foo`, not `bar`. +builder.select("foo"); +let matcher = builder.build().unwrap(); + +assert!(matcher.matched("x.foo", false).is_whitelist()); +// This is ignored because we only selected the `foo` file type. +assert!(matcher.matched("x.bar", false).is_ignore()); +``` + +We can also add file type definitions based on other definitions. + +``` +use ignore::types::TypesBuilder; + +let mut builder = TypesBuilder::new(); +builder.add_defaults(); +builder.add("foo", "*.foo"); +builder.add_def("bar:include:foo,cpp"); +builder.select("bar"); +let matcher = builder.build().unwrap(); + +assert!(matcher.matched("x.foo", false).is_whitelist()); +assert!(matcher.matched("y.cpp", false).is_whitelist()); +``` +*/ + +use std::{collections::HashMap, path::Path, sync::Arc}; + +use { + globset::{GlobBuilder, GlobSet, GlobSetBuilder}, + regex_automata::util::pool::Pool, +}; + +use crate::{default_types::DEFAULT_TYPES, pathutil::file_name, Error, Match}; + +/// Glob represents a single glob in a set of file type definitions. +/// +/// There may be more than one glob for a particular file type. +/// +/// This is used to report information about the highest precedent glob +/// that matched. +/// +/// Note that not all matches necessarily correspond to a specific glob. +/// For example, if there are one or more selections and a file path doesn't +/// match any of those selections, then the file path is considered to be +/// ignored. +/// +/// The lifetime `'a` refers to the lifetime of the underlying file type +/// definition, which corresponds to the lifetime of the file type matcher. +#[derive(Clone, Debug)] +pub struct Glob<'a>(GlobInner<'a>); + +#[derive(Clone, Debug)] +enum GlobInner<'a> { + /// No glob matched, but the file path should still be ignored. + UnmatchedIgnore, + /// A glob matched. + Matched { + /// The file type definition which provided the glob. + def: &'a FileTypeDef, + }, +} + +impl<'a> Glob<'a> { + fn unmatched() -> Glob<'a> { + Glob(GlobInner::UnmatchedIgnore) + } + + /// Return the file type definition that matched, if one exists. A file type + /// definition always exists when a specific definition matches a file + /// path. + pub fn file_type_def(&self) -> Option<&FileTypeDef> { + match self { + Glob(GlobInner::UnmatchedIgnore) => None, + Glob(GlobInner::Matched { def, .. }) => Some(def), + } + } +} + +/// A single file type definition. +/// +/// File type definitions can be retrieved in aggregate from a file type +/// matcher. File type definitions are also reported when its responsible +/// for a match. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct FileTypeDef { + name: String, + globs: Vec, +} + +impl FileTypeDef { + /// Return the name of this file type. 
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Return the globs used to recognize this file type.
+    pub fn globs(&self) -> &[String] {
+        &self.globs
+    }
+}
+
+/// Types is a file type matcher.
+#[derive(Clone, Debug)]
+pub struct Types {
+    /// All of the file type definitions, sorted lexicographically by name.
+    defs: Vec<FileTypeDef>,
+    /// All of the selections made by the user.
+    selections: Vec<Selection<FileTypeDef>>,
+    /// Whether there is at least one Selection::Select in our selections.
+    /// When this is true, a Match::None is converted to Match::Ignore.
+    has_selected: bool,
+    /// A mapping from glob index in the set to two indices. The first is an
+    /// index into `selections` and the second is an index into the
+    /// corresponding file type definition's list of globs.
+    glob_to_selection: Vec<(usize, usize)>,
+    /// The set of all glob selections, used for actual matching.
+    set: GlobSet,
+    /// Temporary storage for globs that match.
+    matches: Arc<Pool<Vec<usize>>>,
+}
+
+/// Indicates the type of a selection for a particular file type.
+#[derive(Clone, Debug)]
+enum Selection<T> {
+    Select(String, T),
+    Negate(String, T),
+}
+
+impl<T> Selection<T> {
+    fn is_negated(&self) -> bool {
+        match *self {
+            Selection::Select(..) => false,
+            Selection::Negate(..) => true,
+        }
+    }
+
+    fn name(&self) -> &str {
+        match *self {
+            Selection::Select(ref name, _) => name,
+            Selection::Negate(ref name, _) => name,
+        }
+    }
+
+    fn map<U, F: FnOnce(T) -> U>(self, f: F) -> Selection<U> {
+        match self {
+            Selection::Select(name, inner) => Selection::Select(name, f(inner)),
+            Selection::Negate(name, inner) => Selection::Negate(name, f(inner)),
+        }
+    }
+
+    fn inner(&self) -> &T {
+        match *self {
+            Selection::Select(_, ref inner) => inner,
+            Selection::Negate(_, ref inner) => inner,
+        }
+    }
+}
+
+impl Types {
+    /// Creates a new file type matcher that never matches any path and
+    /// contains no file type definitions.
+    pub fn empty() -> Types {
+        Types {
+            defs: vec![],
+            selections: vec![],
+            has_selected: false,
+            glob_to_selection: vec![],
+            set: GlobSetBuilder::new().build().unwrap(),
+            matches: Arc::new(Pool::new(|| vec![])),
+        }
+    }
+
+    /// Returns true if and only if this matcher has zero selections.
+    pub fn is_empty(&self) -> bool {
+        self.selections.is_empty()
+    }
+
+    /// Returns the number of selections used in this matcher.
+    pub fn len(&self) -> usize {
+        self.selections.len()
+    }
+
+    /// Return the set of current file type definitions.
+    ///
+    /// Definitions and globs are sorted.
+    pub fn definitions(&self) -> &[FileTypeDef] {
+        &self.defs
+    }
+
+    /// Returns a match for the given path against this file type matcher.
+    ///
+    /// The path is considered whitelisted if it matches a selected file type.
+    /// The path is considered ignored if it matches a negated file type.
+    /// If at least one file type is selected and `path` doesn't match, then
+    /// the path is also considered ignored.
+    pub fn matched<'a, P: AsRef<Path>>(&'a self, path: P, is_dir: bool) -> Match<Glob<'a>> {
+        // File types don't apply to directories, and we can't do anything
+        // if our glob set is empty.
+        if is_dir || self.set.is_empty() {
+            return Match::None;
+        }
+        // We only want to match against the file name, so extract it.
+        // If one doesn't exist, then we can't match it.
+        let name = match file_name(path.as_ref()) {
+            Some(name) => name,
+            None if self.has_selected => {
+                return Match::Ignore(Glob::unmatched());
+            }
+            None => {
+                return Match::None;
+            }
+        };
+        let mut matches = self.matches.get();
+        self.set.matches_into(name, &mut *matches);
+        // The highest precedent match is the last one.
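+        // (`GlobSet::matches_into` reports matching glob indices in
+        // ascending order, and `TypesBuilder::build` adds globs in
+        // selection order, so the last index belongs to the most recently
+        // added selection that matched.)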
+        if let Some(&i) = matches.last() {
+            let (isel, _) = self.glob_to_selection[i];
+            let sel = &self.selections[isel];
+            let glob = Glob(GlobInner::Matched { def: sel.inner() });
+            return if sel.is_negated() {
+                Match::Ignore(glob)
+            } else {
+                Match::Whitelist(glob)
+            };
+        }
+        if self.has_selected {
+            Match::Ignore(Glob::unmatched())
+        } else {
+            Match::None
+        }
+    }
+}
+
+/// TypesBuilder builds a type matcher from a set of file type definitions and
+/// a set of file type selections.
+pub struct TypesBuilder {
+    types: HashMap<String, FileTypeDef>,
+    selections: Vec<Selection<()>>,
+}
+
+impl TypesBuilder {
+    /// Create a new builder for a file type matcher.
+    ///
+    /// The builder contains *no* type definitions to start with. A set
+    /// of default type definitions can be added with `add_defaults`, and
+    /// additional type definitions can be added with `select` and `negate`.
+    pub fn new() -> TypesBuilder {
+        TypesBuilder {
+            types: HashMap::new(),
+            selections: vec![],
+        }
+    }
+
+    /// Build the current set of file type definitions *and* selections into
+    /// a file type matcher.
+    pub fn build(&self) -> Result<Types, Error> {
+        let defs = self.definitions();
+        let has_selected = self.selections.iter().any(|s| !s.is_negated());
+
+        let mut selections = vec![];
+        let mut glob_to_selection = vec![];
+        let mut build_set = GlobSetBuilder::new();
+        for (isel, selection) in self.selections.iter().enumerate() {
+            let def = match self.types.get(selection.name()) {
+                Some(def) => def.clone(),
+                None => {
+                    let name = selection.name().to_string();
+                    return Err(Error::UnrecognizedFileType(name));
+                }
+            };
+            for (iglob, glob) in def.globs.iter().enumerate() {
+                build_set.add(
+                    GlobBuilder::new(glob)
+                        .literal_separator(true)
+                        .build()
+                        .map_err(|err| Error::Glob {
+                            glob: Some(glob.to_string()),
+                            err: err.kind().to_string(),
+                        })?,
+                );
+                glob_to_selection.push((isel, iglob));
+            }
+            selections.push(selection.clone().map(move |_| def));
+        }
+        let set = build_set.build().map_err(|err| Error::Glob {
+            glob: None,
+            err: err.to_string(),
+        })?;
+        Ok(Types {
+            defs,
+            selections,
+            has_selected,
+            glob_to_selection,
+            set,
+            matches: Arc::new(Pool::new(|| vec![])),
+        })
+    }
+
+    /// Return the set of current file type definitions.
+    ///
+    /// Definitions and globs are sorted.
+    pub fn definitions(&self) -> Vec<FileTypeDef> {
+        let mut defs = vec![];
+        for def in self.types.values() {
+            let mut def = def.clone();
+            def.globs.sort();
+            defs.push(def);
+        }
+        defs.sort_by(|def1, def2| def1.name().cmp(def2.name()));
+        defs
+    }
+
+    /// Select the file type given by `name`.
+    ///
+    /// If `name` is `all`, then all file types currently defined are selected.
+    pub fn select(&mut self, name: &str) -> &mut TypesBuilder {
+        if name == "all" {
+            for name in self.types.keys() {
+                self.selections
+                    .push(Selection::Select(name.to_string(), ()));
+            }
+        } else {
+            self.selections
+                .push(Selection::Select(name.to_string(), ()));
+        }
+        self
+    }
+
+    /// Ignore the file type given by `name`.
+    ///
+    /// If `name` is `all`, then all file types currently defined are negated.
+    pub fn negate(&mut self, name: &str) -> &mut TypesBuilder {
+        if name == "all" {
+            for name in self.types.keys() {
+                self.selections
+                    .push(Selection::Negate(name.to_string(), ()));
+            }
+        } else {
+            self.selections
+                .push(Selection::Negate(name.to_string(), ()));
+        }
+        self
+    }
+
+    /// Clear any file type definitions for the type name given.
+    pub fn clear(&mut self, name: &str) -> &mut TypesBuilder {
+        self.types.remove(name);
+        self
+    }
+
+    /// Add a new file type definition.
`name` can be arbitrary and `pat` + /// should be a glob recognizing file paths belonging to the `name` type. + /// + /// If `name` is `all` or otherwise contains any character that is not a + /// Unicode letter or number, then an error is returned. + pub fn add(&mut self, name: &str, glob: &str) -> Result<(), Error> { + if name == "all" || !name.chars().all(|c| c.is_alphanumeric()) { + return Err(Error::InvalidDefinition); + } + let (key, glob) = (name.to_string(), glob.to_string()); + self.types + .entry(key) + .or_insert_with(|| FileTypeDef { + name: name.to_string(), + globs: vec![], + }) + .globs + .push(glob); + Ok(()) + } + + /// Add a new file type definition specified in string form. There are two + /// valid formats: + /// 1. `{name}:{glob}`. This defines a 'root' definition that associates the + /// given name with the given glob. + /// 2. `{name}:include:{comma-separated list of already defined names}. + /// This defines an 'include' definition that associates the given name + /// with the definitions of the given existing types. + /// Names may not include any characters that are not + /// Unicode letters or numbers. + pub fn add_def(&mut self, def: &str) -> Result<(), Error> { + let parts: Vec<&str> = def.split(':').collect(); + match parts.len() { + 2 => { + let name = parts[0]; + let glob = parts[1]; + if name.is_empty() || glob.is_empty() { + return Err(Error::InvalidDefinition); + } + self.add(name, glob) + } + 3 => { + let name = parts[0]; + let types_string = parts[2]; + if name.is_empty() || parts[1] != "include" || types_string.is_empty() { + return Err(Error::InvalidDefinition); + } + let types = types_string.split(','); + // Check ahead of time to ensure that all types specified are + // present and fail fast if not. + if types.clone().any(|t| !self.types.contains_key(t)) { + return Err(Error::InvalidDefinition); + } + for type_name in types { + let globs = self.types.get(type_name).unwrap().globs.clone(); + for glob in globs { + self.add(name, &glob)?; + } + } + Ok(()) + } + _ => Err(Error::InvalidDefinition), + } + } + + /// Add a set of default file type definitions. + pub fn add_defaults(&mut self) -> &mut TypesBuilder { + static MSG: &'static str = "adding a default type should never fail"; + for &(names, exts) in DEFAULT_TYPES { + for name in names { + for ext in exts { + self.add(name, ext).expect(MSG); + } + } + } + self + } +} + +#[cfg(test)] +mod tests { + use super::TypesBuilder; + + macro_rules! 
matched { + ($name:ident, $types:expr, $sel:expr, $selnot:expr, + $path:expr) => { + matched!($name, $types, $sel, $selnot, $path, true); + }; + (not, $name:ident, $types:expr, $sel:expr, $selnot:expr, + $path:expr) => { + matched!($name, $types, $sel, $selnot, $path, false); + }; + ($name:ident, $types:expr, $sel:expr, $selnot:expr, + $path:expr, $matched:expr) => { + #[test] + fn $name() { + let mut btypes = TypesBuilder::new(); + for tydef in $types { + btypes.add_def(tydef).unwrap(); + } + for sel in $sel { + btypes.select(sel); + } + for selnot in $selnot { + btypes.negate(selnot); + } + let types = btypes.build().unwrap(); + let mat = types.matched($path, false); + assert_eq!($matched, !mat.is_ignore()); + } + }; + } + + fn types() -> Vec<&'static str> { + vec![ + "html:*.html", + "html:*.htm", + "rust:*.rs", + "js:*.js", + "py:*.py", + "python:*.py", + "foo:*.{rs,foo}", + "combo:include:html,rust", + ] + } + + matched!(match1, types(), vec!["rust"], vec![], "lib.rs"); + matched!(match2, types(), vec!["html"], vec![], "index.html"); + matched!(match3, types(), vec!["html"], vec![], "index.htm"); + matched!(match4, types(), vec!["html", "rust"], vec![], "main.rs"); + matched!(match5, types(), vec![], vec![], "index.html"); + matched!(match6, types(), vec![], vec!["rust"], "index.html"); + matched!(match7, types(), vec!["foo"], vec!["rust"], "main.foo"); + matched!(match8, types(), vec!["combo"], vec![], "index.html"); + matched!(match9, types(), vec!["combo"], vec![], "lib.rs"); + matched!(match10, types(), vec!["py"], vec![], "main.py"); + matched!(match11, types(), vec!["python"], vec![], "main.py"); + + matched!(not, matchnot1, types(), vec!["rust"], vec![], "index.html"); + matched!(not, matchnot2, types(), vec![], vec!["rust"], "main.rs"); + matched!( + not, + matchnot3, + types(), + vec!["foo"], + vec!["rust"], + "main.rs" + ); + matched!( + not, + matchnot4, + types(), + vec!["rust"], + vec!["foo"], + "main.rs" + ); + matched!( + not, + matchnot5, + types(), + vec!["rust"], + vec!["foo"], + "main.foo" + ); + matched!(not, matchnot6, types(), vec!["combo"], vec![], "leftpad.js"); + matched!(not, matchnot7, types(), vec!["py"], vec![], "index.html"); + matched!(not, matchnot8, types(), vec!["python"], vec![], "doc.md"); + + #[test] + fn test_invalid_defs() { + let mut btypes = TypesBuilder::new(); + for tydef in types() { + btypes.add_def(tydef).unwrap(); + } + // Preserve the original definitions for later comparison. + let original_defs = btypes.definitions(); + let bad_defs = vec![ + // Reference to type that does not exist + "combo:include:html,qwerty", + // Bad format + "combo:foobar:html,rust", + "", + ]; + for def in bad_defs { + assert!(btypes.add_def(def).is_err()); + // Ensure that nothing changed, even if some of the includes were valid. 
+ assert_eq!(btypes.definitions(), original_defs); + } + } +} diff --git a/crates/ignore/src/walk.rs b/crates/ignore/src/walk.rs new file mode 100644 index 000000000000..5a8da6e336f6 --- /dev/null +++ b/crates/ignore/src/walk.rs @@ -0,0 +1,2297 @@ +use std::{ + cmp::Ordering, + ffi::OsStr, + fs::{self, FileType, Metadata}, + io, + path::{Path, PathBuf}, + sync::atomic::{AtomicBool, AtomicUsize, Ordering as AtomicOrdering}, + sync::Arc, +}; + +use { + crossbeam_deque::{Stealer, Worker as Deque}, + same_file::Handle, + walkdir::WalkDir, +}; + +use crate::{ + dir::{Ignore, IgnoreBuilder}, + gitignore::GitignoreBuilder, + overrides::Override, + types::Types, + Error, PartialErrorBuilder, +}; + +/// A directory entry with a possible error attached. +/// +/// The error typically refers to a problem parsing ignore files in a +/// particular directory. +#[derive(Clone, Debug)] +pub struct DirEntry { + dent: DirEntryInner, + err: Option, +} + +impl DirEntry { + /// The full path that this entry represents. + pub fn path(&self) -> &Path { + self.dent.path() + } + + /// The full path that this entry represents. + /// Analogous to [`DirEntry::path`], but moves ownership of the path. + pub fn into_path(self) -> PathBuf { + self.dent.into_path() + } + + /// Whether this entry corresponds to a symbolic link or not. + pub fn path_is_symlink(&self) -> bool { + self.dent.path_is_symlink() + } + + /// Returns true if and only if this entry corresponds to stdin. + /// + /// i.e., The entry has depth 0 and its file name is `-`. + pub fn is_stdin(&self) -> bool { + self.dent.is_stdin() + } + + /// Return the metadata for the file that this entry points to. + pub fn metadata(&self) -> Result { + self.dent.metadata() + } + + /// Return the file type for the file that this entry points to. + /// + /// This entry doesn't have a file type if it corresponds to stdin. + pub fn file_type(&self) -> Option { + self.dent.file_type() + } + + /// Return the file name of this entry. + /// + /// If this entry has no file name (e.g., `/`), then the full path is + /// returned. + pub fn file_name(&self) -> &OsStr { + self.dent.file_name() + } + + /// Returns the depth at which this entry was created relative to the root. + pub fn depth(&self) -> usize { + self.dent.depth() + } + + /// Returns the underlying inode number if one exists. + /// + /// If this entry doesn't have an inode number, then `None` is returned. + #[cfg(unix)] + pub fn ino(&self) -> Option { + self.dent.ino() + } + + /// Returns an error, if one exists, associated with processing this entry. + /// + /// An example of an error is one that occurred while parsing an ignore + /// file. Errors related to traversing a directory tree itself are reported + /// as part of yielding the directory entry, and not with this method. + pub fn error(&self) -> Option<&Error> { + self.err.as_ref() + } + + /// Returns true if and only if this entry points to a directory. + pub(crate) fn is_dir(&self) -> bool { + self.dent.is_dir() + } + + fn new_stdin() -> DirEntry { + DirEntry { + dent: DirEntryInner::Stdin, + err: None, + } + } + + fn new_walkdir(dent: walkdir::DirEntry, err: Option) -> DirEntry { + DirEntry { + dent: DirEntryInner::Walkdir(dent), + err, + } + } + + fn new_raw(dent: DirEntryRaw, err: Option) -> DirEntry { + DirEntry { + dent: DirEntryInner::Raw(dent), + err, + } + } +} + +/// DirEntryInner is the implementation of DirEntry. +/// +/// It specifically represents three distinct sources of directory entries: +/// +/// 1. From the walkdir crate. +/// 2. 
Special entries that represent things like stdin. +/// 3. From a path. +/// +/// Specifically, (3) has to essentially re-create the DirEntry implementation +/// from WalkDir. +#[derive(Clone, Debug)] +enum DirEntryInner { + Stdin, + Walkdir(walkdir::DirEntry), + Raw(DirEntryRaw), +} + +impl DirEntryInner { + fn path(&self) -> &Path { + use self::DirEntryInner::*; + match *self { + Stdin => Path::new(""), + Walkdir(ref x) => x.path(), + Raw(ref x) => x.path(), + } + } + + fn into_path(self) -> PathBuf { + use self::DirEntryInner::*; + match self { + Stdin => PathBuf::from(""), + Walkdir(x) => x.into_path(), + Raw(x) => x.into_path(), + } + } + + fn path_is_symlink(&self) -> bool { + use self::DirEntryInner::*; + match *self { + Stdin => false, + Walkdir(ref x) => x.path_is_symlink(), + Raw(ref x) => x.path_is_symlink(), + } + } + + fn is_stdin(&self) -> bool { + match *self { + DirEntryInner::Stdin => true, + _ => false, + } + } + + fn metadata(&self) -> Result { + use self::DirEntryInner::*; + match *self { + Stdin => { + let err = Error::Io(io::Error::new( + io::ErrorKind::Other, + " has no metadata", + )); + Err(err.with_path("")) + } + Walkdir(ref x) => x + .metadata() + .map_err(|err| Error::Io(io::Error::from(err)).with_path(x.path())), + Raw(ref x) => x.metadata(), + } + } + + fn file_type(&self) -> Option { + use self::DirEntryInner::*; + match *self { + Stdin => None, + Walkdir(ref x) => Some(x.file_type()), + Raw(ref x) => Some(x.file_type()), + } + } + + fn file_name(&self) -> &OsStr { + use self::DirEntryInner::*; + match *self { + Stdin => OsStr::new(""), + Walkdir(ref x) => x.file_name(), + Raw(ref x) => x.file_name(), + } + } + + fn depth(&self) -> usize { + use self::DirEntryInner::*; + match *self { + Stdin => 0, + Walkdir(ref x) => x.depth(), + Raw(ref x) => x.depth(), + } + } + + #[cfg(unix)] + fn ino(&self) -> Option { + use self::DirEntryInner::*; + use walkdir::DirEntryExt; + match *self { + Stdin => None, + Walkdir(ref x) => Some(x.ino()), + Raw(ref x) => Some(x.ino()), + } + } + + /// Returns true if and only if this entry points to a directory. + fn is_dir(&self) -> bool { + self.file_type().map(|ft| ft.is_dir()).unwrap_or(false) + } +} + +/// DirEntryRaw is essentially copied from the walkdir crate so that we can +/// build `DirEntry`s from whole cloth in the parallel iterator. +#[derive(Clone)] +struct DirEntryRaw { + /// The path as reported by the `fs::ReadDir` iterator (even if it's a + /// symbolic link). + path: PathBuf, + /// The file type. Necessary for recursive iteration, so store it. + ty: FileType, + /// Is set when this entry was created from a symbolic link and the user + /// expects the iterator to follow symbolic links. + follow_link: bool, + /// The depth at which this entry was generated relative to the root. + depth: usize, + /// The underlying inode number (Unix only). + #[cfg(unix)] + ino: u64, + /// The underlying metadata (Windows only). We store this on Windows + /// because this comes for free while reading a directory. + #[cfg(windows)] + metadata: fs::Metadata, +} + +impl std::fmt::Debug for DirEntryRaw { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Leaving out FileType because it doesn't have a debug impl + // in Rust 1.9. We could add it if we really wanted to by manually + // querying each possibly file type. Meh. 
---AG + f.debug_struct("DirEntryRaw") + .field("path", &self.path) + .field("follow_link", &self.follow_link) + .field("depth", &self.depth) + .finish() + } +} + +impl DirEntryRaw { + fn path(&self) -> &Path { + &self.path + } + + fn into_path(self) -> PathBuf { + self.path + } + + fn path_is_symlink(&self) -> bool { + self.ty.is_symlink() || self.follow_link + } + + fn metadata(&self) -> Result { + self.metadata_internal() + } + + #[cfg(windows)] + fn metadata_internal(&self) -> Result { + if self.follow_link { + fs::metadata(&self.path) + } else { + Ok(self.metadata.clone()) + } + .map_err(|err| Error::Io(io::Error::from(err)).with_path(&self.path)) + } + + #[cfg(not(windows))] + fn metadata_internal(&self) -> Result { + if self.follow_link { + fs::metadata(&self.path) + } else { + fs::symlink_metadata(&self.path) + } + .map_err(|err| Error::Io(io::Error::from(err)).with_path(&self.path)) + } + + fn file_type(&self) -> FileType { + self.ty + } + + fn file_name(&self) -> &OsStr { + self.path + .file_name() + .unwrap_or_else(|| self.path.as_os_str()) + } + + fn depth(&self) -> usize { + self.depth + } + + #[cfg(unix)] + fn ino(&self) -> u64 { + self.ino + } + + fn from_entry(depth: usize, ent: &fs::DirEntry) -> Result { + let ty = ent.file_type().map_err(|err| { + let err = Error::Io(io::Error::from(err)).with_path(ent.path()); + Error::WithDepth { + depth, + err: Box::new(err), + } + })?; + DirEntryRaw::from_entry_os(depth, ent, ty) + } + + #[cfg(windows)] + fn from_entry_os( + depth: usize, + ent: &fs::DirEntry, + ty: fs::FileType, + ) -> Result { + let md = ent.metadata().map_err(|err| { + let err = Error::Io(io::Error::from(err)).with_path(ent.path()); + Error::WithDepth { + depth, + err: Box::new(err), + } + })?; + Ok(DirEntryRaw { + path: ent.path(), + ty, + follow_link: false, + depth, + metadata: md, + }) + } + + #[cfg(unix)] + fn from_entry_os( + depth: usize, + ent: &fs::DirEntry, + ty: fs::FileType, + ) -> Result { + use std::os::unix::fs::DirEntryExt; + + Ok(DirEntryRaw { + path: ent.path(), + ty, + follow_link: false, + depth, + ino: ent.ino(), + }) + } + + // Placeholder implementation to allow compiling on non-standard platforms + // (e.g. wasm32). + #[cfg(not(any(windows, unix)))] + fn from_entry_os( + depth: usize, + ent: &fs::DirEntry, + ty: fs::FileType, + ) -> Result { + Err(Error::Io(io::Error::new( + io::ErrorKind::Other, + "unsupported platform", + ))) + } + + #[cfg(windows)] + fn from_path(depth: usize, pb: PathBuf, link: bool) -> Result { + let md = fs::metadata(&pb).map_err(|err| Error::Io(err).with_path(&pb))?; + Ok(DirEntryRaw { + path: pb, + ty: md.file_type(), + follow_link: link, + depth, + metadata: md, + }) + } + + #[cfg(unix)] + fn from_path(depth: usize, pb: PathBuf, link: bool) -> Result { + use std::os::unix::fs::MetadataExt; + + let md = fs::metadata(&pb).map_err(|err| Error::Io(err).with_path(&pb))?; + Ok(DirEntryRaw { + path: pb, + ty: md.file_type(), + follow_link: link, + depth, + ino: md.ino(), + }) + } + + // Placeholder implementation to allow compiling on non-standard platforms + // (e.g. wasm32). + #[cfg(not(any(windows, unix)))] + fn from_path(depth: usize, pb: PathBuf, link: bool) -> Result { + Err(Error::Io(io::Error::new( + io::ErrorKind::Other, + "unsupported platform", + ))) + } +} + +/// WalkBuilder builds a recursive directory iterator. +/// +/// The builder supports a large number of configurable options. 
This includes +/// specific glob overrides, file type matching, toggling whether hidden +/// files are ignored or not, and of course, support for respecting gitignore +/// files. +/// +/// By default, all ignore files found are respected. This includes `.ignore`, +/// `.gitignore`, `.git/info/exclude` and even your global gitignore +/// globs, usually found in `$XDG_CONFIG_HOME/git/ignore`. +/// +/// Some standard recursive directory options are also supported, such as +/// limiting the recursive depth or whether to follow symbolic links (disabled +/// by default). +/// +/// # Ignore rules +/// +/// There are many rules that influence whether a particular file or directory +/// is skipped by this iterator. Those rules are documented here. Note that +/// the rules assume a default configuration. +/// +/// * First, glob overrides are checked. If a path matches a glob override, +/// then matching stops. The path is then only skipped if the glob that matched +/// the path is an ignore glob. (An override glob is a whitelist glob unless it +/// starts with a `!`, in which case it is an ignore glob.) +/// * Second, ignore files are checked. Ignore files currently only come from +/// git ignore files (`.gitignore`, `.git/info/exclude` and the configured +/// global gitignore file), plain `.ignore` files, which have the same format +/// as gitignore files, or explicitly added ignore files. The precedence order +/// is: `.ignore`, `.gitignore`, `.git/info/exclude`, global gitignore and +/// finally explicitly added ignore files. Note that precedence between +/// different types of ignore files is not impacted by the directory hierarchy; +/// any `.ignore` file overrides all `.gitignore` files. Within each precedence +/// level, more nested ignore files have a higher precedence than less nested +/// ignore files. +/// * Third, if the previous step yields an ignore match, then all matching +/// is stopped and the path is skipped. If it yields a whitelist match, then +/// matching continues. A whitelist match can be overridden by a later matcher. +/// * Fourth, unless the path is a directory, the file type matcher is run on +/// the path. As above, if it yields an ignore match, then all matching is +/// stopped and the path is skipped. If it yields a whitelist match, then +/// matching continues. +/// * Fifth, if the path hasn't been whitelisted and it is hidden, then the +/// path is skipped. +/// * Sixth, unless the path is a directory, the size of the file is compared +/// against the max filesize limit. If it exceeds the limit, it is skipped. +/// * Seventh, if the path has made it this far then it is yielded in the +/// iterator. 
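+///
+/// # Example
+///
+/// A minimal sketch of typical usage under the default configuration
+/// described above (the `./src` root is a placeholder path):
+///
+/// ```no_run
+/// use ignore::WalkBuilder;
+///
+/// for result in WalkBuilder::new("./src").build() {
+///     // Each item is a `Result<DirEntry, ignore::Error>`.
+///     match result {
+///         Ok(entry) => println!("{}", entry.path().display()),
+///         Err(err) => eprintln!("ERROR: {}", err),
+///     }
+/// }
+/// ```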
+#[derive(Clone)] +pub struct WalkBuilder { + paths: Vec, + ig_builder: IgnoreBuilder, + max_depth: Option, + max_filesize: Option, + follow_links: bool, + same_file_system: bool, + sorter: Option, + threads: usize, + skip: Option>, + filter: Option, +} + +#[derive(Clone)] +enum Sorter { + ByName(Arc Ordering + Send + Sync + 'static>), + ByPath(Arc Ordering + Send + Sync + 'static>), +} + +#[derive(Clone)] +struct Filter(Arc bool + Send + Sync + 'static>); + +impl std::fmt::Debug for WalkBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("WalkBuilder") + .field("paths", &self.paths) + .field("ig_builder", &self.ig_builder) + .field("max_depth", &self.max_depth) + .field("max_filesize", &self.max_filesize) + .field("follow_links", &self.follow_links) + .field("threads", &self.threads) + .field("skip", &self.skip) + .finish() + } +} + +impl WalkBuilder { + /// Create a new builder for a recursive directory iterator for the + /// directory given. + /// + /// Note that if you want to traverse multiple different directories, it + /// is better to call `add` on this builder than to create multiple + /// `Walk` values. + pub fn new>(path: P) -> WalkBuilder { + WalkBuilder { + paths: vec![path.as_ref().to_path_buf()], + ig_builder: IgnoreBuilder::new(), + max_depth: None, + max_filesize: None, + follow_links: false, + same_file_system: false, + sorter: None, + threads: 0, + skip: None, + filter: None, + } + } + + /// Build a new `Walk` iterator. + pub fn build(&self) -> Walk { + let follow_links = self.follow_links; + let max_depth = self.max_depth; + let sorter = self.sorter.clone(); + let its = self + .paths + .iter() + .map(move |p| { + if p == Path::new("-") { + (p.to_path_buf(), None) + } else { + let mut wd = WalkDir::new(p); + wd = wd.follow_links(follow_links || p.is_file()); + wd = wd.same_file_system(self.same_file_system); + if let Some(max_depth) = max_depth { + wd = wd.max_depth(max_depth); + } + if let Some(ref sorter) = sorter { + match sorter.clone() { + Sorter::ByName(cmp) => { + wd = wd.sort_by(move |a, b| cmp(a.file_name(), b.file_name())); + } + Sorter::ByPath(cmp) => { + wd = wd.sort_by(move |a, b| cmp(a.path(), b.path())); + } + } + } + (p.to_path_buf(), Some(WalkEventIter::from(wd))) + } + }) + .collect::>() + .into_iter(); + let ig_root = self.ig_builder.build(); + Walk { + its, + it: None, + ig_root: ig_root.clone(), + ig: ig_root.clone(), + max_filesize: self.max_filesize, + skip: self.skip.clone(), + filter: self.filter.clone(), + } + } + + /// Build a new `WalkParallel` iterator. + /// + /// Note that this *doesn't* return something that implements `Iterator`. + /// Instead, the returned value must be run with a closure. e.g., + /// `builder.build_parallel().run(|| |path| { println!("{path:?}"); WalkState::Continue })`. + pub fn build_parallel(&self) -> WalkParallel { + WalkParallel { + paths: self.paths.clone().into_iter(), + ig_root: self.ig_builder.build(), + max_depth: self.max_depth, + max_filesize: self.max_filesize, + follow_links: self.follow_links, + same_file_system: self.same_file_system, + threads: self.threads, + skip: self.skip.clone(), + filter: self.filter.clone(), + } + } + + /// Add a file path to the iterator. + /// + /// Each additional file path added is traversed recursively. This should + /// be preferred over building multiple `Walk` iterators since this + /// enables reusing resources across iteration. 
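+    ///
+    /// For example, to traverse two roots with a single iterator (a sketch;
+    /// the paths are placeholders):
+    ///
+    /// ```no_run
+    /// use ignore::WalkBuilder;
+    ///
+    /// let mut builder = WalkBuilder::new("./src");
+    /// builder.add("./tests");
+    /// for result in builder.build() {
+    ///     let entry = result.unwrap();
+    ///     println!("{}", entry.path().display());
+    /// }
+    /// ```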
+ pub fn add>(&mut self, path: P) -> &mut WalkBuilder { + self.paths.push(path.as_ref().to_path_buf()); + self + } + + /// The maximum depth to recurse. + /// + /// The default, `None`, imposes no depth restriction. + pub fn max_depth(&mut self, depth: Option) -> &mut WalkBuilder { + self.max_depth = depth; + self + } + + /// Whether to follow symbolic links or not. + pub fn follow_links(&mut self, yes: bool) -> &mut WalkBuilder { + self.follow_links = yes; + self + } + + /// Whether to ignore files above the specified limit. + pub fn max_filesize(&mut self, filesize: Option) -> &mut WalkBuilder { + self.max_filesize = filesize; + self + } + + /// The number of threads to use for traversal. + /// + /// Note that this only has an effect when using `build_parallel`. + /// + /// The default setting is `0`, which chooses the number of threads + /// automatically using heuristics. + pub fn threads(&mut self, n: usize) -> &mut WalkBuilder { + self.threads = n; + self + } + + /// Add a global ignore file to the matcher. + /// + /// This has lower precedence than all other sources of ignore rules. + /// + /// If there was a problem adding the ignore file, then an error is + /// returned. Note that the error may indicate *partial* failure. For + /// example, if an ignore file contains an invalid glob, all other globs + /// are still applied. + pub fn add_ignore>(&mut self, path: P) -> Option { + let mut builder = GitignoreBuilder::new(""); + let mut errs = PartialErrorBuilder::default(); + errs.maybe_push(builder.add(path)); + match builder.build() { + Ok(gi) => { + self.ig_builder.add_ignore(gi); + } + Err(err) => { + errs.push(err); + } + } + errs.into_error_option() + } + + /// Add a custom ignore file name + /// + /// These ignore files have higher precedence than all other ignore files. + /// + /// When specifying multiple names, earlier names have lower precedence than + /// later names. + pub fn add_custom_ignore_filename>( + &mut self, + file_name: S, + ) -> &mut WalkBuilder { + self.ig_builder.add_custom_ignore_filename(file_name); + self + } + + /// Add an override matcher. + /// + /// By default, no override matcher is used. + /// + /// This overrides any previous setting. + pub fn overrides(&mut self, overrides: Override) -> &mut WalkBuilder { + self.ig_builder.overrides(overrides); + self + } + + /// Add a file type matcher. + /// + /// By default, no file type matcher is used. + /// + /// This overrides any previous setting. + pub fn types(&mut self, types: Types) -> &mut WalkBuilder { + self.ig_builder.types(types); + self + } + + /// Enables all the standard ignore filters. + /// + /// This toggles, as a group, all the filters that are enabled by default: + /// + /// - [hidden()](#method.hidden) + /// - [parents()](#method.parents) + /// - [ignore()](#method.ignore) + /// - [git_ignore()](#method.git_ignore) + /// - [git_global()](#method.git_global) + /// - [git_exclude()](#method.git_exclude) + /// + /// They may still be toggled individually after calling this function. + /// + /// This is (by definition) enabled by default. + pub fn standard_filters(&mut self, yes: bool) -> &mut WalkBuilder { + self.hidden(yes) + .parents(yes) + .ignore(yes) + .git_ignore(yes) + .git_global(yes) + .git_exclude(yes) + } + + /// Enables ignoring hidden files. + /// + /// This is enabled by default. + pub fn hidden(&mut self, yes: bool) -> &mut WalkBuilder { + self.ig_builder.hidden(yes); + self + } + + /// Enables reading ignore files from parent directories. 
+ /// + /// If this is enabled, then .gitignore files in parent directories of each + /// file path given are respected. Otherwise, they are ignored. + /// + /// This is enabled by default. + pub fn parents(&mut self, yes: bool) -> &mut WalkBuilder { + self.ig_builder.parents(yes); + self + } + + /// Enables reading `.ignore` files. + /// + /// `.ignore` files have the same semantics as `gitignore` files and are + /// supported by search tools such as ripgrep and The Silver Searcher. + /// + /// This is enabled by default. + pub fn ignore(&mut self, yes: bool) -> &mut WalkBuilder { + self.ig_builder.ignore(yes); + self + } + + /// Enables reading a global gitignore file, whose path is specified in + /// git's `core.excludesFile` config option. + /// + /// Git's config file location is `$HOME/.gitconfig`. If `$HOME/.gitconfig` + /// does not exist or does not specify `core.excludesFile`, then + /// `$XDG_CONFIG_HOME/git/ignore` is read. If `$XDG_CONFIG_HOME` is not + /// set or is empty, then `$HOME/.config/git/ignore` is used instead. + /// + /// This is enabled by default. + pub fn git_global(&mut self, yes: bool) -> &mut WalkBuilder { + self.ig_builder.git_global(yes); + self + } + + /// Enables reading `.gitignore` files. + /// + /// `.gitignore` files have match semantics as described in the `gitignore` + /// man page. + /// + /// This is enabled by default. + pub fn git_ignore(&mut self, yes: bool) -> &mut WalkBuilder { + self.ig_builder.git_ignore(yes); + self + } + + /// Enables reading `.git/info/exclude` files. + /// + /// `.git/info/exclude` files have match semantics as described in the + /// `gitignore` man page. + /// + /// This is enabled by default. + pub fn git_exclude(&mut self, yes: bool) -> &mut WalkBuilder { + self.ig_builder.git_exclude(yes); + self + } + + /// Whether a git repository is required to apply git-related ignore + /// rules (global rules, .gitignore and local exclude rules). + /// + /// When disabled, git-related ignore rules are applied even when searching + /// outside a git repository. + pub fn require_git(&mut self, yes: bool) -> &mut WalkBuilder { + self.ig_builder.require_git(yes); + self + } + + /// Process ignore files case insensitively + /// + /// This is disabled by default. + pub fn ignore_case_insensitive(&mut self, yes: bool) -> &mut WalkBuilder { + self.ig_builder.ignore_case_insensitive(yes); + self + } + + /// Set a function for sorting directory entries by their path. + /// + /// If a compare function is set, the resulting iterator will return all + /// paths in sorted order. The compare function will be called to compare + /// entries from the same directory. + /// + /// This is like `sort_by_file_name`, except the comparator accepts + /// a `&Path` instead of the base file name, which permits it to sort by + /// more criteria. + /// + /// This method will override any previous sorter set by this method or + /// by `sort_by_file_name`. + /// + /// Note that this is not used in the parallel iterator. + pub fn sort_by_file_path(&mut self, cmp: F) -> &mut WalkBuilder + where + F: Fn(&Path, &Path) -> Ordering + Send + Sync + 'static, + { + self.sorter = Some(Sorter::ByPath(Arc::new(cmp))); + self + } + + /// Set a function for sorting directory entries by file name. + /// + /// If a compare function is set, the resulting iterator will return all + /// paths in sorted order. The compare function will be called to compare + /// names from entries from the same directory using only the name of the + /// entry. 
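+    ///
+    /// For example, to yield entries in lexicographic order by file name
+    /// (a sketch):
+    ///
+    /// ```no_run
+    /// use ignore::WalkBuilder;
+    ///
+    /// let mut builder = WalkBuilder::new("./");
+    /// builder.sort_by_file_name(|a, b| a.cmp(b));
+    /// ```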
+ /// + /// This method will override any previous sorter set by this method or + /// by `sort_by_file_path`. + /// + /// Note that this is not used in the parallel iterator. + pub fn sort_by_file_name(&mut self, cmp: F) -> &mut WalkBuilder + where + F: Fn(&OsStr, &OsStr) -> Ordering + Send + Sync + 'static, + { + self.sorter = Some(Sorter::ByName(Arc::new(cmp))); + self + } + + /// Do not cross file system boundaries. + /// + /// When this option is enabled, directory traversal will not descend into + /// directories that are on a different file system from the root path. + /// + /// Currently, this option is only supported on Unix and Windows. If this + /// option is used on an unsupported platform, then directory traversal + /// will immediately return an error and will not yield any entries. + pub fn same_file_system(&mut self, yes: bool) -> &mut WalkBuilder { + self.same_file_system = yes; + self + } + + /// Do not yield directory entries that are believed to correspond to + /// stdout. + /// + /// This is useful when a command is invoked via shell redirection to a + /// file that is also being read. For example, `grep -r foo ./ > results` + /// might end up trying to search `results` even though it is also writing + /// to it, which could cause an unbounded feedback loop. Setting this + /// option prevents this from happening by skipping over the `results` + /// file. + /// + /// This is disabled by default. + pub fn skip_stdout(&mut self, yes: bool) -> &mut WalkBuilder { + if yes { + self.skip = stdout_handle().map(Arc::new); + } else { + self.skip = None; + } + self + } + + /// Yields only entries which satisfy the given predicate and skips + /// descending into directories that do not satisfy the given predicate. + /// + /// The predicate is applied to all entries. If the predicate is + /// true, iteration carries on as normal. If the predicate is false, the + /// entry is ignored and if it is a directory, it is not descended into. + /// + /// Note that the errors for reading entries that may not satisfy the + /// predicate will still be yielded. + pub fn filter_entry
<P>
(&mut self, filter: P) -> &mut WalkBuilder + where + P: Fn(&DirEntry) -> bool + Send + Sync + 'static, + { + self.filter = Some(Filter(Arc::new(filter))); + self + } +} + +/// Walk is a recursive directory iterator over file paths in one or more +/// directories. +/// +/// Only file and directory paths matching the rules are returned. By default, +/// ignore files like `.gitignore` are respected. The precise matching rules +/// and precedence is explained in the documentation for `WalkBuilder`. +pub struct Walk { + its: std::vec::IntoIter<(PathBuf, Option)>, + it: Option, + ig_root: Ignore, + ig: Ignore, + max_filesize: Option, + skip: Option>, + filter: Option, +} + +impl Walk { + /// Creates a new recursive directory iterator for the file path given. + /// + /// Note that this uses default settings, which include respecting + /// `.gitignore` files. To configure the iterator, use `WalkBuilder` + /// instead. + pub fn new>(path: P) -> Walk { + WalkBuilder::new(path).build() + } + + fn skip_entry(&self, ent: &DirEntry) -> Result { + if ent.depth() == 0 { + return Ok(false); + } + // We ensure that trivial skipping is done before any other potentially + // expensive operations (stat, filesystem other) are done. This seems + // like an obvious optimization but becomes critical when filesystem + // operations even as simple as stat can result in significant + // overheads; an example of this was a bespoke filesystem layer in + // Windows that hosted files remotely and would download them on-demand + // when particular filesystem operations occurred. Users of this system + // who ensured correct file-type filters were being used could still + // get unnecessary file access resulting in large downloads. + if should_skip_entry(&self.ig, ent) { + return Ok(true); + } + if let Some(ref stdout) = self.skip { + if path_equals(ent, stdout)? { + return Ok(true); + } + } + if self.max_filesize.is_some() && !ent.is_dir() { + return Ok(skip_filesize( + self.max_filesize.unwrap(), + ent.path(), + &ent.metadata().ok(), + )); + } + if let Some(Filter(filter)) = &self.filter { + if !filter(ent) { + return Ok(true); + } + } + Ok(false) + } +} + +impl Iterator for Walk { + type Item = Result; + + #[inline(always)] + fn next(&mut self) -> Option> { + loop { + let ev = match self.it.as_mut().and_then(|it| it.next()) { + Some(ev) => ev, + None => { + match self.its.next() { + None => return None, + Some((_, None)) => { + return Some(Ok(DirEntry::new_stdin())); + } + Some((path, Some(it))) => { + self.it = Some(it); + if path.is_dir() { + let (ig, err) = self.ig_root.add_parents(path); + self.ig = ig; + if let Some(err) = err { + return Some(Err(err)); + } + } else { + self.ig = self.ig_root.clone(); + } + } + } + continue; + } + }; + match ev { + Err(err) => { + return Some(Err(Error::from_walkdir(err))); + } + Ok(WalkEvent::Exit) => { + self.ig = self.ig.parent().unwrap(); + } + Ok(WalkEvent::Dir(ent)) => { + let mut ent = DirEntry::new_walkdir(ent, None); + let should_skip = match self.skip_entry(&ent) { + Err(err) => return Some(Err(err)), + Ok(should_skip) => should_skip, + }; + if should_skip { + self.it.as_mut().unwrap().it.skip_current_dir(); + // Still need to push this on the stack because + // we'll get a WalkEvent::Exit event for this dir. + // We don't care if it errors though. 
+ let (igtmp, _) = self.ig.add_child(ent.path()); + self.ig = igtmp; + continue; + } + let (igtmp, err) = self.ig.add_child(ent.path()); + self.ig = igtmp; + ent.err = err; + return Some(Ok(ent)); + } + Ok(WalkEvent::File(ent)) => { + let ent = DirEntry::new_walkdir(ent, None); + let should_skip = match self.skip_entry(&ent) { + Err(err) => return Some(Err(err)), + Ok(should_skip) => should_skip, + }; + if should_skip { + continue; + } + return Some(Ok(ent)); + } + } + } + } +} + +impl std::iter::FusedIterator for Walk {} + +/// WalkEventIter transforms a WalkDir iterator into an iterator that more +/// accurately describes the directory tree. Namely, it emits events that are +/// one of three types: directory, file or "exit." An "exit" event means that +/// the entire contents of a directory have been enumerated. +struct WalkEventIter { + depth: usize, + it: walkdir::IntoIter, + next: Option>, +} + +#[derive(Debug)] +enum WalkEvent { + Dir(walkdir::DirEntry), + File(walkdir::DirEntry), + Exit, +} + +impl From for WalkEventIter { + fn from(it: WalkDir) -> WalkEventIter { + WalkEventIter { + depth: 0, + it: it.into_iter(), + next: None, + } + } +} + +impl Iterator for WalkEventIter { + type Item = walkdir::Result; + + #[inline(always)] + fn next(&mut self) -> Option> { + let dent = self.next.take().or_else(|| self.it.next()); + let depth = match dent { + None => 0, + Some(Ok(ref dent)) => dent.depth(), + Some(Err(ref err)) => err.depth(), + }; + if depth < self.depth { + self.depth -= 1; + self.next = dent; + return Some(Ok(WalkEvent::Exit)); + } + self.depth = depth; + match dent { + None => None, + Some(Err(err)) => Some(Err(err)), + Some(Ok(dent)) => { + if walkdir_is_dir(&dent) { + self.depth += 1; + Some(Ok(WalkEvent::Dir(dent))) + } else { + Some(Ok(WalkEvent::File(dent))) + } + } + } + } +} + +/// WalkState is used in the parallel recursive directory iterator to indicate +/// whether walking should continue as normal, skip descending into a +/// particular directory or quit the walk entirely. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum WalkState { + /// Continue walking as normal. + Continue, + /// If the directory entry given is a directory, don't descend into it. + /// In all other cases, this has no effect. + Skip, + /// Quit the entire iterator as soon as possible. + /// + /// Note that this is an inherently asynchronous action. It is possible + /// for more entries to be yielded even after instructing the iterator + /// to quit. + Quit, +} + +impl WalkState { + fn is_continue(&self) -> bool { + *self == WalkState::Continue + } + + fn is_quit(&self) -> bool { + *self == WalkState::Quit + } +} + +/// A builder for constructing a visitor when using [`WalkParallel::visit`]. +/// The builder will be called for each thread started by `WalkParallel`. The +/// visitor returned from each builder is then called for every directory +/// entry. +pub trait ParallelVisitorBuilder<'s> { + /// Create per-thread `ParallelVisitor`s for `WalkParallel`. + fn build(&mut self) -> Box; +} + +impl<'a, 's, P: ParallelVisitorBuilder<'s>> ParallelVisitorBuilder<'s> for &'a mut P { + fn build(&mut self) -> Box { + (**self).build() + } +} + +/// Receives files and directories for the current thread. +/// +/// Setup for the traversal can be implemented as part of +/// [`ParallelVisitorBuilder::build`]. Teardown when traversal finishes can be +/// implemented by implementing the `Drop` trait on your traversal type. 
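+///
+/// For example, a visitor that counts the entries seen by its thread might
+/// look like this (a sketch; it assumes these types are re-exported at the
+/// crate root, as in upstream `ignore`):
+///
+/// ```no_run
+/// use ignore::{DirEntry, Error, ParallelVisitor, ParallelVisitorBuilder, WalkState};
+///
+/// struct Counter(u64);
+///
+/// impl ParallelVisitor for Counter {
+///     fn visit(&mut self, entry: Result<DirEntry, Error>) -> WalkState {
+///         if entry.is_ok() {
+///             self.0 += 1;
+///         }
+///         WalkState::Continue
+///     }
+/// }
+///
+/// struct CounterBuilder;
+///
+/// impl<'s> ParallelVisitorBuilder<'s> for CounterBuilder {
+///     fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
+///         Box::new(Counter(0))
+///     }
+/// }
+/// ```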
+pub trait ParallelVisitor: Send { + /// Receives files and directories for the current thread. This is called + /// once for every directory entry visited by traversal. + fn visit(&mut self, entry: Result) -> WalkState; +} + +struct FnBuilder { + builder: F, +} + +impl<'s, F: FnMut() -> FnVisitor<'s>> ParallelVisitorBuilder<'s> for FnBuilder { + fn build(&mut self) -> Box { + let visitor = (self.builder)(); + Box::new(FnVisitorImp { visitor }) + } +} + +type FnVisitor<'s> = Box) -> WalkState + Send + 's>; + +struct FnVisitorImp<'s> { + visitor: FnVisitor<'s>, +} + +impl<'s> ParallelVisitor for FnVisitorImp<'s> { + fn visit(&mut self, entry: Result) -> WalkState { + (self.visitor)(entry) + } +} + +/// WalkParallel is a parallel recursive directory iterator over files paths +/// in one or more directories. +/// +/// Only file and directory paths matching the rules are returned. By default, +/// ignore files like `.gitignore` are respected. The precise matching rules +/// and precedence is explained in the documentation for `WalkBuilder`. +/// +/// Unlike `Walk`, this uses multiple threads for traversing a directory. +pub struct WalkParallel { + paths: std::vec::IntoIter, + ig_root: Ignore, + max_filesize: Option, + max_depth: Option, + follow_links: bool, + same_file_system: bool, + threads: usize, + skip: Option>, + filter: Option, +} + +impl WalkParallel { + /// Execute the parallel recursive directory iterator. `mkf` is called + /// for each thread used for iteration. The function produced by `mkf` + /// is then in turn called for each visited file path. + pub fn run<'s, F>(self, mkf: F) + where + F: FnMut() -> FnVisitor<'s>, + { + self.visit(&mut FnBuilder { builder: mkf }) + } + + /// Execute the parallel recursive directory iterator using a custom + /// visitor. + /// + /// The builder given is used to construct a visitor for every thread + /// used by this traversal. The visitor returned from each builder is then + /// called for every directory entry seen by that thread. + /// + /// Typically, creating a custom visitor is useful if you need to perform + /// some kind of cleanup once traversal is finished. This can be achieved + /// by implementing `Drop` for your builder (or for your visitor, if you + /// want to execute cleanup for every thread that is launched). + /// + /// For example, each visitor might build up a data structure of results + /// corresponding to the directory entries seen for each thread. Since each + /// visitor runs on only one thread, this build-up can be done without + /// synchronization. Then, once traversal is complete, all of the results + /// can be merged together into a single data structure. + pub fn visit(mut self, builder: &mut dyn ParallelVisitorBuilder<'_>) { + let threads = self.threads(); + let mut stack = vec![]; + { + let mut visitor = builder.build(); + let mut paths = Vec::new().into_iter(); + std::mem::swap(&mut paths, &mut self.paths); + // Send the initial set of root paths to the pool of workers. Note + // that we only send directories. For files, we send to them the + // callback directly. 
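+            // (A non-directory root still becomes a `Work` item below;
+            // `Worker::run_one` then hands it straight to the visitor
+            // instead of descending into it.)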
+ for path in paths { + let (dent, root_device) = if path == Path::new("-") { + (DirEntry::new_stdin(), None) + } else { + let root_device = if !self.same_file_system { + None + } else { + match device_num(&path) { + Ok(root_device) => Some(root_device), + Err(err) => { + let err = Error::Io(err).with_path(path); + if visitor.visit(Err(err)).is_quit() { + return; + } + continue; + } + } + }; + match DirEntryRaw::from_path(0, path, false) { + Ok(dent) => (DirEntry::new_raw(dent, None), root_device), + Err(err) => { + if visitor.visit(Err(err)).is_quit() { + return; + } + continue; + } + } + }; + stack.push(Message::Work(Work { + dent, + ignore: self.ig_root.clone(), + root_device, + })); + } + // ... but there's no need to start workers if we don't need them. + if stack.is_empty() { + return; + } + } + // Create the workers and then wait for them to finish. + let quit_now = Arc::new(AtomicBool::new(false)); + let active_workers = Arc::new(AtomicUsize::new(threads)); + let stacks = Stack::new_for_each_thread(threads, stack); + std::thread::scope(|s| { + let handles: Vec<_> = stacks + .into_iter() + .map(|stack| Worker { + visitor: builder.build(), + stack, + quit_now: quit_now.clone(), + active_workers: active_workers.clone(), + max_depth: self.max_depth, + max_filesize: self.max_filesize, + follow_links: self.follow_links, + skip: self.skip.clone(), + filter: self.filter.clone(), + }) + .map(|worker| s.spawn(|| worker.run())) + .collect(); + for handle in handles { + handle.join().unwrap(); + } + }); + } + + fn threads(&self) -> usize { + if self.threads == 0 { + 2 + } else { + self.threads + } + } +} + +/// Message is the set of instructions that a worker knows how to process. +enum Message { + /// A work item corresponds to a directory that should be descended into. + /// Work items for entries that should be skipped or ignored should not + /// be produced. + Work(Work), + /// This instruction indicates that the worker should quit. + Quit, +} + +/// A unit of work for each worker to process. +/// +/// Each unit of work corresponds to a directory that should be descended +/// into. +struct Work { + /// The directory entry. + dent: DirEntry, + /// Any ignore matchers that have been built for this directory's parents. + ignore: Ignore, + /// The root device number. When present, only files with the same device + /// number should be considered. + root_device: Option, +} + +impl Work { + /// Returns true if and only if this work item is a directory. + fn is_dir(&self) -> bool { + self.dent.is_dir() + } + + /// Returns true if and only if this work item is a symlink. + fn is_symlink(&self) -> bool { + self.dent.file_type().map_or(false, |ft| ft.is_symlink()) + } + + /// Adds ignore rules for parent directories. + /// + /// Note that this only applies to entries at depth 0. On all other + /// entries, this is a no-op. + fn add_parents(&mut self) -> Option { + if self.dent.depth() > 0 { + return None; + } + // At depth 0, the path of this entry is a root path, so we can + // use it directly to add parent ignore rules. + let (ig, err) = self.ignore.add_parents(self.dent.path()); + self.ignore = ig; + err + } + + /// Reads the directory contents of this work item and adds ignore + /// rules for this directory. + /// + /// If there was a problem with reading the directory contents, then + /// an error is returned. If there was a problem reading the ignore + /// rules for this directory, then the error is attached to this + /// work item's directory entry. 
+ fn read_dir(&mut self) -> Result { + let readdir = match fs::read_dir(self.dent.path()) { + Ok(readdir) => readdir, + Err(err) => { + let err = Error::from(err) + .with_path(self.dent.path()) + .with_depth(self.dent.depth()); + return Err(err); + } + }; + let (ig, err) = self.ignore.add_child(self.dent.path()); + self.ignore = ig; + self.dent.err = err; + Ok(readdir) + } +} + +/// A work-stealing stack. +#[derive(Debug)] +struct Stack { + /// This thread's index. + index: usize, + /// The thread-local stack. + deque: Deque, + /// The work stealers. + stealers: Arc<[Stealer]>, +} + +impl Stack { + /// Create a work-stealing stack for each thread. The given messages + /// correspond to the initial paths to start the search at. They will + /// be distributed automatically to each stack in a round-robin fashion. + fn new_for_each_thread(threads: usize, init: Vec) -> Vec { + // Using new_lifo() ensures each worker operates depth-first, not + // breadth-first. We do depth-first because a breadth first traversal + // on wide directories with a lot of gitignores is disastrous (for + // example, searching a directory tree containing all of crates.io). + let deques: Vec> = std::iter::repeat_with(Deque::new_lifo) + .take(threads) + .collect(); + let stealers = + Arc::<[Stealer]>::from(deques.iter().map(Deque::stealer).collect::>()); + let stacks: Vec = deques + .into_iter() + .enumerate() + .map(|(index, deque)| Stack { + index, + deque, + stealers: stealers.clone(), + }) + .collect(); + // Distribute the initial messages. + init.into_iter() + .zip(stacks.iter().cycle()) + .for_each(|(m, s)| s.push(m)); + stacks + } + + /// Push a message. + fn push(&self, msg: Message) { + self.deque.push(msg); + } + + /// Pop a message. + fn pop(&self) -> Option { + self.deque.pop().or_else(|| self.steal()) + } + + /// Steal a message from another queue. + fn steal(&self) -> Option { + // For fairness, try to steal from index + 1, index + 2, ... len - 1, + // then wrap around to 0, 1, ... index - 1. + let (left, right) = self.stealers.split_at(self.index); + // Don't steal from ourselves + let right = &right[1..]; + + right + .iter() + .chain(left.iter()) + .map(|s| s.steal_batch_and_pop(&self.deque)) + .find_map(|s| s.success()) + } +} + +/// A worker is responsible for descending into directories, updating the +/// ignore matchers, producing new work and invoking the caller's callback. +/// +/// Note that a worker is *both* a producer and a consumer. +struct Worker<'s> { + /// The caller's callback. + visitor: Box, + /// A work-stealing stack of work to do. + /// + /// We use a stack instead of a channel because a stack lets us visit + /// directories in depth first order. This can substantially reduce peak + /// memory usage by keeping both the number of file paths and gitignore + /// matchers in memory lower. + stack: Stack, + /// Whether all workers should terminate at the next opportunity. Note + /// that we need this because we don't want other `Work` to be done after + /// we quit. We wouldn't need this if have a priority channel. + quit_now: Arc, + /// The number of currently active workers. + active_workers: Arc, + /// The maximum depth of directories to descend. A value of `0` means no + /// descension at all. + max_depth: Option, + /// The maximum size a searched file can be (in bytes). If a file exceeds + /// this size it will be skipped. + max_filesize: Option, + /// Whether to follow symbolic links or not. When this is enabled, loop + /// detection is performed. 
+ follow_links: bool, + /// A file handle to skip, currently is either `None` or stdout, if it's + /// a file and it has been requested to skip files identical to stdout. + skip: Option>, + /// A predicate applied to dir entries. If true, the entry and all + /// children will be skipped. + filter: Option, +} + +impl<'s> Worker<'s> { + /// Runs this worker until there is no more work left to do. + /// + /// The worker will call the caller's callback for all entries that aren't + /// skipped by the ignore matcher. + fn run(mut self) { + while let Some(work) = self.get_work() { + if let WalkState::Quit = self.run_one(work) { + self.quit_now(); + } + } + } + + fn run_one(&mut self, mut work: Work) -> WalkState { + // If the work is not a directory, then we can just execute the + // caller's callback immediately and move on. + if work.is_symlink() || !work.is_dir() { + return self.visitor.visit(Ok(work.dent)); + } + if let Some(err) = work.add_parents() { + let state = self.visitor.visit(Err(err)); + if state.is_quit() { + return state; + } + } + + let descend = if let Some(root_device) = work.root_device { + match is_same_file_system(root_device, work.dent.path()) { + Ok(true) => true, + Ok(false) => false, + Err(err) => { + let state = self.visitor.visit(Err(err)); + if state.is_quit() { + return state; + } + false + } + } + } else { + true + }; + + // Try to read the directory first before we transfer ownership + // to the provided closure. Do not unwrap it immediately, though, + // as we may receive an `Err` value e.g. in the case when we do not + // have sufficient read permissions to list the directory. + // In that case we still want to provide the closure with a valid + // entry before passing the error value. + let readdir = work.read_dir(); + let depth = work.dent.depth(); + let state = self.visitor.visit(Ok(work.dent)); + if !state.is_continue() { + return state; + } + if !descend { + return WalkState::Skip; + } + + let readdir = match readdir { + Ok(readdir) => readdir, + Err(err) => { + return self.visitor.visit(Err(err)); + } + }; + + if self.max_depth.map_or(false, |max| depth >= max) { + return WalkState::Skip; + } + for result in readdir { + let state = self.generate_work(&work.ignore, depth + 1, work.root_device, result); + if state.is_quit() { + return state; + } + } + WalkState::Continue + } + + /// Decides whether to submit the given directory entry as a file to + /// search. + /// + /// If the entry is a path that should be ignored, then this is a no-op. + /// Otherwise, the entry is pushed on to the queue. (The actual execution + /// of the callback happens in `run_one`.) + /// + /// If an error occurs while reading the entry, then it is sent to the + /// caller's callback. + /// + /// `ig` is the `Ignore` matcher for the parent directory. `depth` should + /// be the depth of this entry. `result` should be the item yielded by + /// a directory iterator. 
+    fn generate_work(
+        &mut self,
+        ig: &Ignore,
+        depth: usize,
+        root_device: Option<u64>,
+        result: Result<fs::DirEntry, io::Error>,
+    ) -> WalkState {
+        let fs_dent = match result {
+            Ok(fs_dent) => fs_dent,
+            Err(err) => {
+                return self.visitor.visit(Err(Error::from(err).with_depth(depth)));
+            }
+        };
+        let mut dent = match DirEntryRaw::from_entry(depth, &fs_dent) {
+            Ok(dent) => DirEntry::new_raw(dent, None),
+            Err(err) => {
+                return self.visitor.visit(Err(err));
+            }
+        };
+        let is_symlink = dent.file_type().map_or(false, |ft| ft.is_symlink());
+        if self.follow_links && is_symlink {
+            let path = dent.path().to_path_buf();
+            dent = match DirEntryRaw::from_path(depth, path, true) {
+                Ok(dent) => DirEntry::new_raw(dent, None),
+                Err(err) => {
+                    return self.visitor.visit(Err(err));
+                }
+            };
+            if dent.is_dir() {
+                if let Err(err) = check_symlink_loop(ig, dent.path(), depth) {
+                    return self.visitor.visit(Err(err));
+                }
+            }
+        }
+        // N.B. See analogous call in the single-threaded implementation about
+        // why it's important for this to come before the checks below.
+        if should_skip_entry(ig, &dent) {
+            return WalkState::Continue;
+        }
+        if let Some(ref stdout) = self.skip {
+            let is_stdout = match path_equals(&dent, stdout) {
+                Ok(is_stdout) => is_stdout,
+                Err(err) => return self.visitor.visit(Err(err)),
+            };
+            if is_stdout {
+                return WalkState::Continue;
+            }
+        }
+        let should_skip_filesize = if self.max_filesize.is_some() && !dent.is_dir() {
+            skip_filesize(
+                self.max_filesize.unwrap(),
+                dent.path(),
+                &dent.metadata().ok(),
+            )
+        } else {
+            false
+        };
+        let should_skip_filtered = if let Some(Filter(predicate)) = &self.filter {
+            !predicate(&dent)
+        } else {
+            false
+        };
+        if !should_skip_filesize && !should_skip_filtered {
+            self.send(Work {
+                dent,
+                ignore: ig.clone(),
+                root_device,
+            });
+        }
+        WalkState::Continue
+    }
+
+    /// Returns the next directory to descend into.
+    ///
+    /// If all work has been exhausted, then this returns None. The worker
+    /// should then subsequently quit.
+    fn get_work(&mut self) -> Option<Work> {
+        let mut value = self.recv();
+        loop {
+            // Simulate a priority channel: If quit_now flag is set, we can
+            // receive only quit messages.
+            if self.is_quit_now() {
+                value = Some(Message::Quit)
+            }
+            match value {
+                Some(Message::Work(work)) => {
+                    return Some(work);
+                }
+                Some(Message::Quit) => {
+                    // Repeat quit message to wake up sleeping threads, if
+                    // any. The domino effect will ensure that every thread
+                    // will quit.
+                    self.send_quit();
+                    return None;
+                }
+                None => {
+                    if self.deactivate_worker() == 0 {
+                        // If deactivate_worker() returns 0, every worker thread
+                        // is currently within the critical section between the
+                        // acquire in deactivate_worker() and the release in
+                        // activate_worker() below. For this to happen, every
+                        // worker's local deque must be simultaneously empty,
+                        // meaning there is no more work left at all.
+                        self.send_quit();
+                        return None;
+                    }
+                    // Wait for next `Work` or `Quit` message.
+                    loop {
+                        if let Some(v) = self.recv() {
+                            self.activate_worker();
+                            value = Some(v);
+                            break;
+                        }
+                        // Our stack isn't blocking. Instead of burning the
+                        // CPU waiting, we let the thread sleep for a bit. In
+                        // general, this tends to only occur once the search is
+                        // approaching termination.
+                        let dur = std::time::Duration::from_millis(1);
+                        std::thread::sleep(dur);
+                    }
+                }
+            }
+        }
+    }
+
+    /// Indicates that all workers should quit immediately.
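+    ///
+    /// Quitting is cooperative: the flag is only consulted in `get_work`, so
+    /// a callback that is already running finishes, while any queued work is
+    /// discarded.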
+    fn quit_now(&self) {
+        self.quit_now.store(true, AtomicOrdering::SeqCst);
+    }
+
+    /// Returns true if this worker should quit immediately.
+    fn is_quit_now(&self) -> bool {
+        self.quit_now.load(AtomicOrdering::SeqCst)
+    }
+
+    /// Send work.
+    fn send(&self, work: Work) {
+        self.stack.push(Message::Work(work));
+    }
+
+    /// Send a quit message.
+    fn send_quit(&self) {
+        self.stack.push(Message::Quit);
+    }
+
+    /// Receive work.
+    fn recv(&self) -> Option<Message> {
+        self.stack.pop()
+    }
+
+    /// Deactivates a worker and returns the number of currently active workers.
+    fn deactivate_worker(&self) -> usize {
+        self.active_workers.fetch_sub(1, AtomicOrdering::Acquire) - 1
+    }
+
+    /// Reactivates a worker.
+    fn activate_worker(&self) {
+        self.active_workers.fetch_add(1, AtomicOrdering::Release);
+    }
+}
+
+fn check_symlink_loop(
+    ig_parent: &Ignore,
+    child_path: &Path,
+    child_depth: usize,
+) -> Result<(), Error> {
+    let hchild = Handle::from_path(child_path).map_err(|err| {
+        Error::from(err)
+            .with_path(child_path)
+            .with_depth(child_depth)
+    })?;
+    for ig in ig_parent
+        .parents()
+        .take_while(|ig| !ig.is_absolute_parent())
+    {
+        let h = Handle::from_path(ig.path()).map_err(|err| {
+            Error::from(err)
+                .with_path(child_path)
+                .with_depth(child_depth)
+        })?;
+        if hchild == h {
+            return Err(Error::Loop {
+                ancestor: ig.path().to_path_buf(),
+                child: child_path.to_path_buf(),
+            }
+            .with_depth(child_depth));
+        }
+    }
+    Ok(())
+}
+
+// Before calling this function, make sure that you ensure that it is really
+// necessary, as the arguments imply a file stat.
+fn skip_filesize(max_filesize: u64, path: &Path, ent: &Option<Metadata>) -> bool {
+    let filesize = match *ent {
+        Some(ref md) => Some(md.len()),
+        None => None,
+    };
+
+    if let Some(fs) = filesize {
+        if fs > max_filesize {
+            log::debug!("ignoring {}: {} bytes", path.display(), fs);
+            true
+        } else {
+            false
+        }
+    } else {
+        false
+    }
+}
+
+fn should_skip_entry(ig: &Ignore, dent: &DirEntry) -> bool {
+    let m = ig.matched_dir_entry(dent);
+    if m.is_ignore() {
+        log::debug!("ignoring {}: {:?}", dent.path().display(), m);
+        true
+    } else if m.is_whitelist() {
+        log::debug!("whitelisting {}: {:?}", dent.path().display(), m);
+        false
+    } else {
+        false
+    }
+}
+
+/// Returns a handle to stdout for filtering search.
+///
+/// A handle is returned if and only if stdout is being redirected to a file.
+/// The handle returned corresponds to that file.
+///
+/// This can be used to ensure that we do not attempt to search a file that we
+/// may also be writing to.
+fn stdout_handle() -> Option<Handle> {
+    let h = match Handle::stdout() {
+        Err(_) => return None,
+        Ok(h) => h,
+    };
+    let md = match h.as_file().metadata() {
+        Err(_) => return None,
+        Ok(md) => md,
+    };
+    if !md.is_file() {
+        return None;
+    }
+    Some(h)
+}
+
+/// Returns true if and only if the given directory entry is believed to be
+/// equivalent to the given handle. If there was a problem querying the path
+/// for information to determine equality, then that error is returned.
+fn path_equals(dent: &DirEntry, handle: &Handle) -> Result<bool, Error> {
+    #[cfg(unix)]
+    fn never_equal(dent: &DirEntry, handle: &Handle) -> bool {
+        dent.ino() != Some(handle.ino())
+    }
+
+    #[cfg(not(unix))]
+    fn never_equal(_: &DirEntry, _: &Handle) -> bool {
+        false
+    }
+
+    // If we know for sure that these two things aren't equal, then avoid
+    // the costly extra stat call to determine equality.
+    if dent.is_stdin() || never_equal(dent, handle) {
+        return Ok(false);
+    }
+    Handle::from_path(dent.path())
+        .map(|h| &h == handle)
+        .map_err(|err| Error::Io(err).with_path(dent.path()))
+}
+
+/// Returns true if the given walkdir entry corresponds to a directory.
+///
+/// This is normally just `dent.file_type().is_dir()`, but when we aren't
+/// following symlinks, the root directory entry may be a symlink to a
+/// directory that we *do* follow---by virtue of it being specified by the user
+/// explicitly. In that case, we need to follow the symlink and query whether
+/// it's a directory or not. But we only do this for root entries to avoid an
+/// additional stat check in most cases.
+fn walkdir_is_dir(dent: &walkdir::DirEntry) -> bool {
+    if dent.file_type().is_dir() {
+        return true;
+    }
+    if !dent.file_type().is_symlink() || dent.depth() > 0 {
+        return false;
+    }
+    dent.path()
+        .metadata()
+        .ok()
+        .map_or(false, |md| md.file_type().is_dir())
+}
+
+/// Returns true if and only if the given path is on the same device as the
+/// given root device.
+fn is_same_file_system(root_device: u64, path: &Path) -> Result<bool, Error> {
+    let dent_device = device_num(path).map_err(|err| Error::Io(err).with_path(path))?;
+    Ok(root_device == dent_device)
+}
+
+#[cfg(unix)]
+fn device_num<P: AsRef<Path>>(path: P) -> io::Result<u64> {
+    use std::os::unix::fs::MetadataExt;
+
+    path.as_ref().metadata().map(|md| md.dev())
+}
+
+#[cfg(windows)]
+fn device_num<P: AsRef<Path>>(path: P) -> io::Result<u64> {
+    use winapi_util::{file, Handle};
+
+    let h = Handle::from_path_any(path)?;
+    file::information(h).map(|info| info.volume_serial_number())
+}
+
+#[cfg(not(any(unix, windows)))]
+fn device_num<P: AsRef<Path>>(_: P) -> io::Result<u64> {
+    Err(io::Error::new(
+        io::ErrorKind::Other,
+        "walkdir: same_file_system option not supported on this platform",
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::ffi::OsStr;
+    use std::fs::{self, File};
+    use std::io::Write;
+    use std::path::Path;
+    use std::sync::{Arc, Mutex};
+
+    use super::{DirEntry, WalkBuilder, WalkState};
+    use crate::tests::TempDir;
+
+    fn wfile<P: AsRef<Path>>(path: P, contents: &str) {
+        let mut file = File::create(path).unwrap();
+        file.write_all(contents.as_bytes()).unwrap();
+    }
+
+    fn wfile_size<P: AsRef<Path>>(path: P, size: u64) {
+        let file = File::create(path).unwrap();
+        file.set_len(size).unwrap();
+    }
+
+    #[cfg(unix)]
+    fn symlink<P: AsRef<Path>, Q: AsRef<Path>>(src: P, dst: Q) {
+        use std::os::unix::fs::symlink;
+        symlink(src, dst).unwrap();
+    }
+
+    fn mkdirp<P: AsRef<Path>>(path: P) {
+        fs::create_dir_all(path).unwrap();
+    }
+
+    fn normal_path(unix: &str) -> String {
+        if cfg!(windows) {
+            unix.replace("\\", "/")
+        } else {
+            unix.to_string()
+        }
+    }
+
+    fn walk_collect(prefix: &Path, builder: &WalkBuilder) -> Vec<String> {
+        let mut paths = vec![];
+        for result in builder.build() {
+            let dent = match result {
+                Err(_) => continue,
+                Ok(dent) => dent,
+            };
+            let path = dent.path().strip_prefix(prefix).unwrap();
+            if path.as_os_str().is_empty() {
+                continue;
+            }
+            paths.push(normal_path(path.to_str().unwrap()));
+        }
+        paths.sort();
+        paths
+    }
+
+    fn walk_collect_parallel(prefix: &Path, builder: &WalkBuilder) -> Vec<String> {
+        let mut paths = vec![];
+        for dent in walk_collect_entries_parallel(builder) {
+            let path = dent.path().strip_prefix(prefix).unwrap();
+            if path.as_os_str().is_empty() {
+                continue;
+            }
+            paths.push(normal_path(path.to_str().unwrap()));
+        }
+        paths.sort();
+        paths
+    }
+
+    fn walk_collect_entries_parallel(builder: &WalkBuilder) -> Vec<DirEntry> {
+        let dents = Arc::new(Mutex::new(vec![]));
+        builder.build_parallel().run(|| {
+            let dents = dents.clone();
+            Box::new(move |result| {
+                if let Ok(dent) = result {
+                    dents.lock().unwrap().push(dent);
+                }
+                WalkState::Continue
+            })
+        });
+
+        let dents = dents.lock().unwrap();
+        dents.to_vec()
+    }
+
+    fn mkpaths(paths: &[&str]) -> Vec<String> {
+        let mut paths: Vec<_> = paths.iter().map(|s| s.to_string()).collect();
+        paths.sort();
+        paths
+    }
+
+    fn tmpdir() -> TempDir {
+        TempDir::new().unwrap()
+    }
+
+    fn assert_paths(prefix: &Path, builder: &WalkBuilder, expected: &[&str]) {
+        let got = walk_collect(prefix, builder);
+        assert_eq!(got, mkpaths(expected), "single threaded");
+        let got = walk_collect_parallel(prefix, builder);
+        assert_eq!(got, mkpaths(expected), "parallel");
+    }
+
+    #[test]
+    fn no_ignores() {
+        let td = tmpdir();
+        mkdirp(td.path().join("a/b/c"));
+        mkdirp(td.path().join("x/y"));
+        wfile(td.path().join("a/b/foo"), "");
+        wfile(td.path().join("x/y/foo"), "");
+
+        assert_paths(
+            td.path(),
+            &WalkBuilder::new(td.path()),
+            &["x", "x/y", "x/y/foo", "a", "a/b", "a/b/foo", "a/b/c"],
+        );
+    }
+
+    #[test]
+    fn custom_ignore() {
+        let td = tmpdir();
+        let custom_ignore = ".customignore";
+        mkdirp(td.path().join("a"));
+        wfile(td.path().join(custom_ignore), "foo");
+        wfile(td.path().join("foo"), "");
+        wfile(td.path().join("a/foo"), "");
+        wfile(td.path().join("bar"), "");
+        wfile(td.path().join("a/bar"), "");
+
+        let mut builder = WalkBuilder::new(td.path());
+        builder.add_custom_ignore_filename(&custom_ignore);
+        assert_paths(td.path(), &builder, &["bar", "a", "a/bar"]);
+    }
+
+    #[test]
+    fn custom_ignore_exclusive_use() {
+        let td = tmpdir();
+        let custom_ignore = ".customignore";
+        mkdirp(td.path().join("a"));
+        wfile(td.path().join(custom_ignore), "foo");
+        wfile(td.path().join("foo"), "");
+        wfile(td.path().join("a/foo"), "");
+        wfile(td.path().join("bar"), "");
+        wfile(td.path().join("a/bar"), "");
+
+        let mut builder = WalkBuilder::new(td.path());
+        builder.ignore(false);
+        builder.git_ignore(false);
+        builder.git_global(false);
+        builder.git_exclude(false);
+        builder.add_custom_ignore_filename(&custom_ignore);
+        assert_paths(td.path(), &builder, &["bar", "a", "a/bar"]);
+    }
+
+    #[test]
+    fn gitignore() {
+        let td = tmpdir();
+        mkdirp(td.path().join(".git"));
+        mkdirp(td.path().join("a"));
+        wfile(td.path().join(".gitignore"), "foo");
+        wfile(td.path().join("foo"), "");
+        wfile(td.path().join("a/foo"), "");
+        wfile(td.path().join("bar"), "");
+        wfile(td.path().join("a/bar"), "");
+
+        assert_paths(
+            td.path(),
+            &WalkBuilder::new(td.path()),
+            &["bar", "a", "a/bar"],
+        );
+    }
+
+    #[test]
+    fn explicit_ignore() {
+        let td = tmpdir();
+        let igpath = td.path().join(".not-an-ignore");
+        mkdirp(td.path().join("a"));
+        wfile(&igpath, "foo");
+        wfile(td.path().join("foo"), "");
+        wfile(td.path().join("a/foo"), "");
+        wfile(td.path().join("bar"), "");
+        wfile(td.path().join("a/bar"), "");
+
+        let mut builder = WalkBuilder::new(td.path());
+        assert!(builder.add_ignore(&igpath).is_none());
+        assert_paths(td.path(), &builder, &["bar", "a", "a/bar"]);
+    }
+
+    #[test]
+    fn explicit_ignore_exclusive_use() {
+        let td = tmpdir();
+        let igpath = td.path().join(".not-an-ignore");
+        mkdirp(td.path().join("a"));
+        wfile(&igpath, "foo");
+        wfile(td.path().join("foo"), "");
+        wfile(td.path().join("a/foo"), "");
+        wfile(td.path().join("bar"), "");
+        wfile(td.path().join("a/bar"), "");
+
+        let mut builder = WalkBuilder::new(td.path());
+        builder.standard_filters(false);
+        assert!(builder.add_ignore(&igpath).is_none());
+        assert_paths(
+            td.path(),
+            &builder,
+            &[".not-an-ignore", "bar", "a", "a/bar"],
+        );
+    }
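+
+    // For illustration only (not exercised by these tests): the parallel
+    // walker used by `walk_collect_entries_parallel` above is driven with a
+    // visitor-factory closure, along these lines, where `root` stands in for
+    // any directory path:
+    //
+    //     let mut builder = WalkBuilder::new(root);
+    //     builder.threads(4);
+    //     builder.build_parallel().run(|| {
+    //         Box::new(|result| {
+    //             if let Ok(dent) = result {
+    //                 println!("{}", dent.path().display());
+    //             }
+    //             WalkState::Continue
+    //         })
+    //     });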
+
+    #[test]
+    fn gitignore_parent() {
+        let td = tmpdir();
+        mkdirp(td.path().join(".git"));
+        mkdirp(td.path().join("a"));
+        wfile(td.path().join(".gitignore"), "foo");
+        wfile(td.path().join("a/foo"), "");
+        wfile(td.path().join("a/bar"), "");
+
+        let root = td.path().join("a");
+        assert_paths(&root, &WalkBuilder::new(&root), &["bar"]);
+    }
+
+    #[test]
+    fn max_depth() {
+        let td = tmpdir();
+        mkdirp(td.path().join("a/b/c"));
+        wfile(td.path().join("foo"), "");
+        wfile(td.path().join("a/foo"), "");
+        wfile(td.path().join("a/b/foo"), "");
+        wfile(td.path().join("a/b/c/foo"), "");
+
+        let mut builder = WalkBuilder::new(td.path());
+        assert_paths(
+            td.path(),
+            &builder,
+            &["a", "a/b", "a/b/c", "foo", "a/foo", "a/b/foo", "a/b/c/foo"],
+        );
+        assert_paths(td.path(), builder.max_depth(Some(0)), &[]);
+        assert_paths(td.path(), builder.max_depth(Some(1)), &["a", "foo"]);
+        assert_paths(
+            td.path(),
+            builder.max_depth(Some(2)),
+            &["a", "a/b", "foo", "a/foo"],
+        );
+    }
+
+    #[test]
+    fn max_filesize() {
+        let td = tmpdir();
+        mkdirp(td.path().join("a/b"));
+        wfile_size(td.path().join("foo"), 0);
+        wfile_size(td.path().join("bar"), 400);
+        wfile_size(td.path().join("baz"), 600);
+        wfile_size(td.path().join("a/foo"), 600);
+        wfile_size(td.path().join("a/bar"), 500);
+        wfile_size(td.path().join("a/baz"), 200);
+
+        let mut builder = WalkBuilder::new(td.path());
+        assert_paths(
+            td.path(),
+            &builder,
+            &["a", "a/b", "foo", "bar", "baz", "a/foo", "a/bar", "a/baz"],
+        );
+        assert_paths(
+            td.path(),
+            builder.max_filesize(Some(0)),
+            &["a", "a/b", "foo"],
+        );
+        assert_paths(
+            td.path(),
+            builder.max_filesize(Some(500)),
+            &["a", "a/b", "foo", "bar", "a/bar", "a/baz"],
+        );
+        assert_paths(
+            td.path(),
+            builder.max_filesize(Some(50000)),
+            &["a", "a/b", "foo", "bar", "baz", "a/foo", "a/bar", "a/baz"],
+        );
+    }
+
+    #[cfg(unix)] // because symlinks on windows are weird
+    #[test]
+    fn symlinks() {
+        let td = tmpdir();
+        mkdirp(td.path().join("a/b"));
+        symlink(td.path().join("a/b"), td.path().join("z"));
+        wfile(td.path().join("a/b/foo"), "");
+
+        let mut builder = WalkBuilder::new(td.path());
+        assert_paths(td.path(), &builder, &["a", "a/b", "a/b/foo", "z"]);
+        assert_paths(
+            td.path(),
+            &builder.follow_links(true),
+            &["a", "a/b", "a/b/foo", "z", "z/foo"],
+        );
+    }
+
+    #[cfg(unix)] // because symlinks on windows are weird
+    #[test]
+    fn first_path_not_symlink() {
+        let td = tmpdir();
+        mkdirp(td.path().join("foo"));
+
+        let dents = WalkBuilder::new(td.path().join("foo"))
+            .build()
+            .into_iter()
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+        assert_eq!(1, dents.len());
+        assert!(!dents[0].path_is_symlink());
+
+        let dents = walk_collect_entries_parallel(&WalkBuilder::new(td.path().join("foo")));
+        assert_eq!(1, dents.len());
+        assert!(!dents[0].path_is_symlink());
+    }
+
+    #[cfg(unix)] // because symlinks on windows are weird
+    #[test]
+    fn symlink_loop() {
+        let td = tmpdir();
+        mkdirp(td.path().join("a/b"));
+        symlink(td.path().join("a"), td.path().join("a/b/c"));
+
+        let mut builder = WalkBuilder::new(td.path());
+        assert_paths(td.path(), &builder, &["a", "a/b", "a/b/c"]);
+        assert_paths(td.path(), &builder.follow_links(true), &["a", "a/b"]);
+    }
+
+    // It's a little tricky to test the 'same_file_system' option since
+    // we need an environment with more than one file system. We adopt a
+    // heuristic where /sys is typically a distinct volume on Linux and roll
+    // with that.
+ #[test] + #[cfg(target_os = "linux")] + fn same_file_system() { + use super::device_num; + + // If for some reason /sys doesn't exist or isn't a directory, just + // skip this test. + if !Path::new("/sys").is_dir() { + return; + } + + // If our test directory actually isn't a different volume from /sys, + // then this test is meaningless and we shouldn't run it. + let td = tmpdir(); + if device_num(td.path()).unwrap() == device_num("/sys").unwrap() { + return; + } + + mkdirp(td.path().join("same_file")); + symlink("/sys", td.path().join("same_file").join("alink")); + + // Create a symlink to sys and enable following symlinks. If the + // same_file_system option doesn't work, then this probably will hit a + // permission error. Otherwise, it should just skip over the symlink + // completely. + let mut builder = WalkBuilder::new(td.path()); + builder.follow_links(true).same_file_system(true); + assert_paths(td.path(), &builder, &["same_file", "same_file/alink"]); + } + + #[cfg(target_os = "linux")] + #[test] + fn no_read_permissions() { + let dir_path = Path::new("/root"); + + // There's no /etc/sudoers.d, skip the test. + if !dir_path.is_dir() { + return; + } + // We're the root, so the test won't check what we want it to. + if fs::read_dir(&dir_path).is_ok() { + return; + } + + // Check that we can't descend but get an entry for the parent dir. + let builder = WalkBuilder::new(&dir_path); + assert_paths(dir_path.parent().unwrap(), &builder, &["root"]); + } + + #[test] + fn filter() { + let td = tmpdir(); + mkdirp(td.path().join("a/b/c")); + mkdirp(td.path().join("x/y")); + wfile(td.path().join("a/b/foo"), ""); + wfile(td.path().join("x/y/foo"), ""); + + assert_paths( + td.path(), + &WalkBuilder::new(td.path()), + &["x", "x/y", "x/y/foo", "a", "a/b", "a/b/foo", "a/b/c"], + ); + + assert_paths( + td.path(), + &WalkBuilder::new(td.path()).filter_entry(|entry| entry.file_name() != OsStr::new("a")), + &["x", "x/y", "x/y/foo"], + ); + } +} diff --git a/crates/ignore/tests/gitignore_matched_path_or_any_parents_tests.gitignore b/crates/ignore/tests/gitignore_matched_path_or_any_parents_tests.gitignore new file mode 100644 index 000000000000..ac09e12f7aba --- /dev/null +++ b/crates/ignore/tests/gitignore_matched_path_or_any_parents_tests.gitignore @@ -0,0 +1,216 @@ +# Based on https://github.com/behnam/gitignore-test/blob/master/.gitignore + +### file in root + +# MATCH /file_root_1 +file_root_00 + +# NO_MATCH +file_root_01/ + +# NO_MATCH +file_root_02/* + +# NO_MATCH +file_root_03/** + + +# MATCH /file_root_10 +/file_root_10 + +# NO_MATCH +/file_root_11/ + +# NO_MATCH +/file_root_12/* + +# NO_MATCH +/file_root_13/** + + +# NO_MATCH +*/file_root_20 + +# NO_MATCH +*/file_root_21/ + +# NO_MATCH +*/file_root_22/* + +# NO_MATCH +*/file_root_23/** + + +# MATCH /file_root_30 +**/file_root_30 + +# NO_MATCH +**/file_root_31/ + +# NO_MATCH +**/file_root_32/* + +# NO_MATCH +**/file_root_33/** + + +### file in sub-dir + +# MATCH /parent_dir/file_deep_1 +file_deep_00 + +# NO_MATCH +file_deep_01/ + +# NO_MATCH +file_deep_02/* + +# NO_MATCH +file_deep_03/** + + +# NO_MATCH +/file_deep_10 + +# NO_MATCH +/file_deep_11/ + +# NO_MATCH +/file_deep_12/* + +# NO_MATCH +/file_deep_13/** + + +# MATCH /parent_dir/file_deep_20 +*/file_deep_20 + +# NO_MATCH +*/file_deep_21/ + +# NO_MATCH +*/file_deep_22/* + +# NO_MATCH +*/file_deep_23/** + + +# MATCH /parent_dir/file_deep_30 +**/file_deep_30 + +# NO_MATCH +**/file_deep_31/ + +# NO_MATCH +**/file_deep_32/* + +# NO_MATCH +**/file_deep_33/** + + +### dir in root + +# MATCH 
/dir_root_00 +dir_root_00 + +# MATCH /dir_root_01 +dir_root_01/ + +# MATCH /dir_root_02 +dir_root_02/* + +# MATCH /dir_root_03 +dir_root_03/** + + +# MATCH /dir_root_10 +/dir_root_10 + +# MATCH /dir_root_11 +/dir_root_11/ + +# MATCH /dir_root_12 +/dir_root_12/* + +# MATCH /dir_root_13 +/dir_root_13/** + + +# NO_MATCH +*/dir_root_20 + +# NO_MATCH +*/dir_root_21/ + +# NO_MATCH +*/dir_root_22/* + +# NO_MATCH +*/dir_root_23/** + + +# MATCH /dir_root_30 +**/dir_root_30 + +# MATCH /dir_root_31 +**/dir_root_31/ + +# MATCH /dir_root_32 +**/dir_root_32/* + +# MATCH /dir_root_33 +**/dir_root_33/** + + +### dir in sub-dir + +# MATCH /parent_dir/dir_deep_00 +dir_deep_00 + +# MATCH /parent_dir/dir_deep_01 +dir_deep_01/ + +# NO_MATCH +dir_deep_02/* + +# NO_MATCH +dir_deep_03/** + + +# NO_MATCH +/dir_deep_10 + +# NO_MATCH +/dir_deep_11/ + +# NO_MATCH +/dir_deep_12/* + +# NO_MATCH +/dir_deep_13/** + + +# MATCH /parent_dir/dir_deep_20 +*/dir_deep_20 + +# MATCH /parent_dir/dir_deep_21 +*/dir_deep_21/ + +# MATCH /parent_dir/dir_deep_22 +*/dir_deep_22/* + +# MATCH /parent_dir/dir_deep_23 +*/dir_deep_23/** + + +# MATCH /parent_dir/dir_deep_30 +**/dir_deep_30 + +# MATCH /parent_dir/dir_deep_31 +**/dir_deep_31/ + +# MATCH /parent_dir/dir_deep_32 +**/dir_deep_32/* + +# MATCH /parent_dir/dir_deep_33 +**/dir_deep_33/** diff --git a/crates/ignore/tests/gitignore_matched_path_or_any_parents_tests.rs b/crates/ignore/tests/gitignore_matched_path_or_any_parents_tests.rs new file mode 100644 index 000000000000..b7b7c6f95087 --- /dev/null +++ b/crates/ignore/tests/gitignore_matched_path_or_any_parents_tests.rs @@ -0,0 +1,291 @@ +use std::path::Path; + +use ignore::gitignore::{Gitignore, GitignoreBuilder}; + +const IGNORE_FILE: &'static str = "tests/gitignore_matched_path_or_any_parents_tests.gitignore"; + +fn get_gitignore() -> Gitignore { + let mut builder = GitignoreBuilder::new("ROOT"); + let error = builder.add(IGNORE_FILE); + assert!(error.is_none(), "failed to open gitignore file"); + builder.build().unwrap() +} + +#[test] +#[should_panic(expected = "path is expected to be under the root")] +fn test_path_should_be_under_root() { + let gitignore = get_gitignore(); + let path = "/tmp/some_file"; + gitignore.matched_path_or_any_parents(Path::new(path), false); + assert!(false); +} + +#[test] +fn test_files_in_root() { + let gitignore = get_gitignore(); + let m = |path: &str| gitignore.matched_path_or_any_parents(Path::new(path), false); + + // 0x + assert!(m("ROOT/file_root_00").is_ignore()); + assert!(m("ROOT/file_root_01").is_none()); + assert!(m("ROOT/file_root_02").is_none()); + assert!(m("ROOT/file_root_03").is_none()); + + // 1x + assert!(m("ROOT/file_root_10").is_ignore()); + assert!(m("ROOT/file_root_11").is_none()); + assert!(m("ROOT/file_root_12").is_none()); + assert!(m("ROOT/file_root_13").is_none()); + + // 2x + assert!(m("ROOT/file_root_20").is_none()); + assert!(m("ROOT/file_root_21").is_none()); + assert!(m("ROOT/file_root_22").is_none()); + assert!(m("ROOT/file_root_23").is_none()); + + // 3x + assert!(m("ROOT/file_root_30").is_ignore()); + assert!(m("ROOT/file_root_31").is_none()); + assert!(m("ROOT/file_root_32").is_none()); + assert!(m("ROOT/file_root_33").is_none()); +} + +#[test] +fn test_files_in_deep() { + let gitignore = get_gitignore(); + let m = |path: &str| gitignore.matched_path_or_any_parents(Path::new(path), false); + + // 0x + assert!(m("ROOT/parent_dir/file_deep_00").is_ignore()); + assert!(m("ROOT/parent_dir/file_deep_01").is_none()); + 
assert!(m("ROOT/parent_dir/file_deep_02").is_none()); + assert!(m("ROOT/parent_dir/file_deep_03").is_none()); + + // 1x + assert!(m("ROOT/parent_dir/file_deep_10").is_none()); + assert!(m("ROOT/parent_dir/file_deep_11").is_none()); + assert!(m("ROOT/parent_dir/file_deep_12").is_none()); + assert!(m("ROOT/parent_dir/file_deep_13").is_none()); + + // 2x + assert!(m("ROOT/parent_dir/file_deep_20").is_ignore()); + assert!(m("ROOT/parent_dir/file_deep_21").is_none()); + assert!(m("ROOT/parent_dir/file_deep_22").is_none()); + assert!(m("ROOT/parent_dir/file_deep_23").is_none()); + + // 3x + assert!(m("ROOT/parent_dir/file_deep_30").is_ignore()); + assert!(m("ROOT/parent_dir/file_deep_31").is_none()); + assert!(m("ROOT/parent_dir/file_deep_32").is_none()); + assert!(m("ROOT/parent_dir/file_deep_33").is_none()); +} + +#[test] +fn test_dirs_in_root() { + let gitignore = get_gitignore(); + let m = + |path: &str, is_dir: bool| gitignore.matched_path_or_any_parents(Path::new(path), is_dir); + + // 00 + assert!(m("ROOT/dir_root_00", true).is_ignore()); + assert!(m("ROOT/dir_root_00/file", false).is_ignore()); + assert!(m("ROOT/dir_root_00/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_00/child_dir/file", false).is_ignore()); + + // 01 + assert!(m("ROOT/dir_root_01", true).is_ignore()); + assert!(m("ROOT/dir_root_01/file", false).is_ignore()); + assert!(m("ROOT/dir_root_01/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_01/child_dir/file", false).is_ignore()); + + // 02 + assert!(m("ROOT/dir_root_02", true).is_none()); // dir itself doesn't match + assert!(m("ROOT/dir_root_02/file", false).is_ignore()); + assert!(m("ROOT/dir_root_02/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_02/child_dir/file", false).is_ignore()); + + // 03 + assert!(m("ROOT/dir_root_03", true).is_none()); // dir itself doesn't match + assert!(m("ROOT/dir_root_03/file", false).is_ignore()); + assert!(m("ROOT/dir_root_03/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_03/child_dir/file", false).is_ignore()); + + // 10 + assert!(m("ROOT/dir_root_10", true).is_ignore()); + assert!(m("ROOT/dir_root_10/file", false).is_ignore()); + assert!(m("ROOT/dir_root_10/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_10/child_dir/file", false).is_ignore()); + + // 11 + assert!(m("ROOT/dir_root_11", true).is_ignore()); + assert!(m("ROOT/dir_root_11/file", false).is_ignore()); + assert!(m("ROOT/dir_root_11/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_11/child_dir/file", false).is_ignore()); + + // 12 + assert!(m("ROOT/dir_root_12", true).is_none()); // dir itself doesn't match + assert!(m("ROOT/dir_root_12/file", false).is_ignore()); + assert!(m("ROOT/dir_root_12/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_12/child_dir/file", false).is_ignore()); + + // 13 + assert!(m("ROOT/dir_root_13", true).is_none()); + assert!(m("ROOT/dir_root_13/file", false).is_ignore()); + assert!(m("ROOT/dir_root_13/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_13/child_dir/file", false).is_ignore()); + + // 20 + assert!(m("ROOT/dir_root_20", true).is_none()); + assert!(m("ROOT/dir_root_20/file", false).is_none()); + assert!(m("ROOT/dir_root_20/child_dir", true).is_none()); + assert!(m("ROOT/dir_root_20/child_dir/file", false).is_none()); + + // 21 + assert!(m("ROOT/dir_root_21", true).is_none()); + assert!(m("ROOT/dir_root_21/file", false).is_none()); + assert!(m("ROOT/dir_root_21/child_dir", true).is_none()); + assert!(m("ROOT/dir_root_21/child_dir/file", false).is_none()); 
+ + // 22 + assert!(m("ROOT/dir_root_22", true).is_none()); + assert!(m("ROOT/dir_root_22/file", false).is_none()); + assert!(m("ROOT/dir_root_22/child_dir", true).is_none()); + assert!(m("ROOT/dir_root_22/child_dir/file", false).is_none()); + + // 23 + assert!(m("ROOT/dir_root_23", true).is_none()); + assert!(m("ROOT/dir_root_23/file", false).is_none()); + assert!(m("ROOT/dir_root_23/child_dir", true).is_none()); + assert!(m("ROOT/dir_root_23/child_dir/file", false).is_none()); + + // 30 + assert!(m("ROOT/dir_root_30", true).is_ignore()); + assert!(m("ROOT/dir_root_30/file", false).is_ignore()); + assert!(m("ROOT/dir_root_30/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_30/child_dir/file", false).is_ignore()); + + // 31 + assert!(m("ROOT/dir_root_31", true).is_ignore()); + assert!(m("ROOT/dir_root_31/file", false).is_ignore()); + assert!(m("ROOT/dir_root_31/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_31/child_dir/file", false).is_ignore()); + + // 32 + assert!(m("ROOT/dir_root_32", true).is_none()); // dir itself doesn't match + assert!(m("ROOT/dir_root_32/file", false).is_ignore()); + assert!(m("ROOT/dir_root_32/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_32/child_dir/file", false).is_ignore()); + + // 33 + assert!(m("ROOT/dir_root_33", true).is_none()); // dir itself doesn't match + assert!(m("ROOT/dir_root_33/file", false).is_ignore()); + assert!(m("ROOT/dir_root_33/child_dir", true).is_ignore()); + assert!(m("ROOT/dir_root_33/child_dir/file", false).is_ignore()); +} + +#[test] +fn test_dirs_in_deep() { + let gitignore = get_gitignore(); + let m = + |path: &str, is_dir: bool| gitignore.matched_path_or_any_parents(Path::new(path), is_dir); + + // 00 + assert!(m("ROOT/parent_dir/dir_deep_00", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_00/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_00/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_00/child_dir/file", false).is_ignore()); + + // 01 + assert!(m("ROOT/parent_dir/dir_deep_01", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_01/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_01/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_01/child_dir/file", false).is_ignore()); + + // 02 + assert!(m("ROOT/parent_dir/dir_deep_02", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_02/file", false).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_02/child_dir", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_02/child_dir/file", false).is_none()); + + // 03 + assert!(m("ROOT/parent_dir/dir_deep_03", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_03/file", false).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_03/child_dir", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_03/child_dir/file", false).is_none()); + + // 10 + assert!(m("ROOT/parent_dir/dir_deep_10", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_10/file", false).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_10/child_dir", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_10/child_dir/file", false).is_none()); + + // 11 + assert!(m("ROOT/parent_dir/dir_deep_11", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_11/file", false).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_11/child_dir", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_11/child_dir/file", false).is_none()); + + // 12 + assert!(m("ROOT/parent_dir/dir_deep_12", true).is_none()); + 
assert!(m("ROOT/parent_dir/dir_deep_12/file", false).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_12/child_dir", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_12/child_dir/file", false).is_none()); + + // 13 + assert!(m("ROOT/parent_dir/dir_deep_13", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_13/file", false).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_13/child_dir", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_13/child_dir/file", false).is_none()); + + // 20 + assert!(m("ROOT/parent_dir/dir_deep_20", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_20/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_20/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_20/child_dir/file", false).is_ignore()); + + // 21 + assert!(m("ROOT/parent_dir/dir_deep_21", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_21/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_21/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_21/child_dir/file", false).is_ignore()); + + // 22 + // dir itself doesn't match + assert!(m("ROOT/parent_dir/dir_deep_22", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_22/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_22/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_22/child_dir/file", false).is_ignore()); + + // 23 + // dir itself doesn't match + assert!(m("ROOT/parent_dir/dir_deep_23", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_23/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_23/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_23/child_dir/file", false).is_ignore()); + + // 30 + assert!(m("ROOT/parent_dir/dir_deep_30", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_30/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_30/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_30/child_dir/file", false).is_ignore()); + + // 31 + assert!(m("ROOT/parent_dir/dir_deep_31", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_31/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_31/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_31/child_dir/file", false).is_ignore()); + + // 32 + // dir itself doesn't match + assert!(m("ROOT/parent_dir/dir_deep_32", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_32/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_32/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_32/child_dir/file", false).is_ignore()); + + // 33 + // dir itself doesn't match + assert!(m("ROOT/parent_dir/dir_deep_33", true).is_none()); + assert!(m("ROOT/parent_dir/dir_deep_33/file", false).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_33/child_dir", true).is_ignore()); + assert!(m("ROOT/parent_dir/dir_deep_33/child_dir/file", false).is_ignore()); +} diff --git a/crates/oxide/Cargo.toml b/crates/oxide/Cargo.toml index f3b67d8d75fb..448fc5c9f227 100644 --- a/crates/oxide/Cargo.toml +++ b/crates/oxide/Cargo.toml @@ -13,11 +13,11 @@ crossbeam = "0.8.4" tracing = { version = "0.1.40", features = [] } tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } walkdir = "2.5.0" -ignore = "0.4.23" dunce = "1.0.5" bexpand = "1.2.0" fast-glob = "0.4.3" classification-macros = { path = "../classification-macros" } +ignore = { path = "../ignore" } regex = "1.11.1" fancy-regex = "0.14.0" From 
18442bebae6b5db55226229bec0ba28e68ca7ae9 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Mon, 17 Mar 2025 19:58:56 +0100 Subject: [PATCH 02/26] manually apply patches to inlined `ignore` crate --- Cargo.lock | 5 +- crates/ignore/Cargo.toml | 3 +- crates/ignore/src/dir.rs | 154 +++++++++++++++++++++----------------- crates/ignore/src/walk.rs | 7 +- 4 files changed, 97 insertions(+), 72 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4a6baa0a56ce..ee6a465fcb69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -199,9 +199,9 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "globset" -version = "0.4.15" +version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15f1ce686646e7f1e19bf7d5533fe443a45dbfb990e00629110797578b42fb19" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" dependencies = [ "aho-corasick", "bstr", @@ -228,6 +228,7 @@ dependencies = [ "bstr", "crossbeam-channel", "crossbeam-deque", + "dunce", "globset", "log", "memchr", diff --git a/crates/ignore/Cargo.toml b/crates/ignore/Cargo.toml index 3a22a48c1bb9..b8ae1b1bf721 100644 --- a/crates/ignore/Cargo.toml +++ b/crates/ignore/Cargo.toml @@ -20,11 +20,12 @@ bench = false [dependencies] crossbeam-deque = "0.8.3" -globset = "0.4.15" +globset = "0.4.16" log = "0.4.20" memchr = "2.6.3" same-file = "1.0.6" walkdir = "2.4.0" +dunce = "1.0.5" [dependencies.regex-automata] version = "0.4.0" diff --git a/crates/ignore/src/dir.rs b/crates/ignore/src/dir.rs index 48bc7ac1332d..69eba476048c 100644 --- a/crates/ignore/src/dir.rs +++ b/crates/ignore/src/dir.rs @@ -176,7 +176,8 @@ impl Ignore { if !self.is_root() { panic!("Ignore::add_parents called on non-root matcher"); } - let absolute_base = match path.as_ref().canonicalize() { + // CHANGED: Use `dunce::canonicalize` as we use it everywhere else. + let absolute_base = match dunce::canonicalize(path.as_ref()) { Ok(path) => Arc::new(path), Err(_) => { // There's not much we can do here, so just return our @@ -428,60 +429,51 @@ impl Ignore { saw_git = saw_git || ig.0.has_git; } if self.0.opts.parents { - if let Some(abs_parent_path) = self.absolute_base() { - // What we want to do here is take the absolute base path of - // this directory and join it with the path we're searching. - // The main issue we want to avoid is accidentally duplicating - // directory components, so we try to strip any common prefix - // off of `path`. Overall, this seems a little ham-fisted, but - // it does fix a nasty bug. It should do fine until we overhaul - // this crate. 
- let dirpath = self.0.dir.as_path(); - let path_prefix = match strip_prefix("./", dirpath) { - None => dirpath, - Some(stripped_dot_slash) => stripped_dot_slash, - }; - let path = match strip_prefix(path_prefix, path) { - None => abs_parent_path.join(path), - Some(p) => { - let p = match strip_prefix("/", p) { - None => p, - Some(p) => p, - }; - abs_parent_path.join(p) - } - }; - - for ig in self.parents().skip_while(|ig| !ig.0.is_absolute_parent) { - if m_custom_ignore.is_none() { - m_custom_ignore = - ig.0.custom_ignore_matcher - .matched(&path, is_dir) - .map(IgnoreMatch::gitignore); - } - if m_ignore.is_none() { - m_ignore = - ig.0.ignore_matcher - .matched(&path, is_dir) - .map(IgnoreMatch::gitignore); - } - if any_git && !saw_git && m_gi.is_none() { - m_gi = - ig.0.git_ignore_matcher - .matched(&path, is_dir) - .map(IgnoreMatch::gitignore); - } - if any_git && !saw_git && m_gi_exclude.is_none() { - m_gi_exclude = - ig.0.git_exclude_matcher - .matched(&path, is_dir) - .map(IgnoreMatch::gitignore); - } - saw_git = saw_git || ig.0.has_git; + // CHANGED: We removed a code path that rewrote the `path` to be relative to + // `self.absolute_base()` because it assumed that the every path is inside the base + // which is not the case for us as we use `WalkBuilder#add` to add roots outside of the + // base. + for ig in self.parents().skip_while(|ig| !ig.0.is_absolute_parent) { + if m_custom_ignore.is_none() { + m_custom_ignore = + ig.0.custom_ignore_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore); } + if m_ignore.is_none() { + m_ignore = + ig.0.ignore_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + if any_git && !saw_git && m_gi.is_none() { + m_gi = + ig.0.git_ignore_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + if any_git && !saw_git && m_gi_exclude.is_none() { + m_gi_exclude = + ig.0.git_exclude_matcher + .matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + saw_git = saw_git || ig.0.has_git; } } for gi in self.0.explicit_ignores.iter().rev() { + // CHANGED: We need to make sure that the explicit gitignore rules apply to the path + // + // path = Is the current file/folder we are traversing + // gi.path() = Is the path of the custom gitignore file + // + // E.g.: If we have a custom rule for `/src/utils` with `**/*`, and we are looking at + // just `/src`, then the `**/*` rules do not apply to this folder, so we can + // ignore the current custom gitignore file. + // + if !path.starts_with(gi.path()) { + continue; + } if !m_explicit.is_none() { break; } @@ -496,24 +488,47 @@ impl Ignore { Match::None }; - m_custom_ignore - .or(m_ignore) - .or(m_gi) - .or(m_gi_exclude) - .or(m_global) - .or(m_explicit) + // CHANGED: We added logic to configure an order in which the ignore files are respected and + // allowed a whitelist in a later file to overrule a block on an earlier file. 
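+        //
+        // For example, if a `.gitignore` ignores `dist/` but a later
+        // `.custom-ignore` whitelists `!dist/index.html`, the whitelist wins;
+        // symmetrically, a later ignore overrules an earlier whitelist.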
+ let order = [ + // Global gitignore + &m_global, + // .git/info/exclude + &m_gi_exclude, + // .gitignore + &m_gi, + // .ignore + &m_ignore, + // .custom-ignore + &m_custom_ignore, + // Manually added ignores + &m_explicit, + ]; + + for (idx, check) in order.into_iter().enumerate() { + if check.is_none() { + continue; + } + + let remaining = &order[idx + 1..]; + if check.is_ignore() { + if remaining.iter().any(|other| other.is_whitelist()) { + continue; + } + } else if remaining.iter().any(|other| other.is_ignore()) { + continue; + } + + return check.clone(); + } + + m_explicit } /// Returns an iterator over parent ignore matchers, including this one. pub(crate) fn parents(&self) -> Parents<'_> { Parents(Some(self)) } - - /// Returns the first absolute path of the first absolute parent, if - /// one exists. - fn absolute_base(&self) -> Option<&Path> { - self.0.absolute_base.as_ref().map(|p| &***p) - } } /// An iterator over all parents of an ignore matcher, including itself. @@ -875,9 +890,10 @@ mod tests { .build() .add_child(td.path()); assert!(err.is_none()); - assert!(ig.matched("foo", false).is_ignore()); - assert!(ig.matched("bar", false).is_whitelist()); - assert!(ig.matched("baz", false).is_none()); + assert!(ig.matched(td.path().join("foo"), false).is_ignore()); + assert!(ig.matched(td.path().join("bar"), false).is_whitelist()); + assert!(ig.matched(td.path().join("baz"), false).is_none()); + assert!(ig.matched("/foo", false).is_none()); } #[test] @@ -1131,8 +1147,10 @@ mod tests { let (ig2, err) = ig1.add_child("src"); assert!(err.is_none()); - assert!(ig1.matched("llvm", true).is_none()); - assert!(ig2.matched("llvm", true).is_none()); + // CHANGED: These test cases do not make sense for us as we never call the Ignore with + // relative paths. + assert!(ig1.matched("llvm", true).is_ignore()); + assert!(ig2.matched("llvm", true).is_ignore()); assert!(ig2.matched("src/llvm", true).is_none()); assert!(ig2.matched("foo", false).is_ignore()); assert!(ig2.matched("src/foo", false).is_ignore()); diff --git a/crates/ignore/src/walk.rs b/crates/ignore/src/walk.rs index 5a8da6e336f6..9c1f7413d918 100644 --- a/crates/ignore/src/walk.rs +++ b/crates/ignore/src/walk.rs @@ -16,7 +16,7 @@ use { use crate::{ dir::{Ignore, IgnoreBuilder}, - gitignore::GitignoreBuilder, + gitignore::{Gitignore, GitignoreBuilder}, overrides::Override, types::Types, Error, PartialErrorBuilder, @@ -666,6 +666,11 @@ impl WalkBuilder { errs.into_error_option() } + /// CHANGED: Add a Gitignore to the builder. + pub fn add_gitignore(&mut self, gi: Gitignore) { + self.ig_builder.add_ignore(gi); + } + /// Add a custom ignore file name /// /// These ignore files have higher precedence than all other ignore files. 
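The precedence rework above is easiest to see in isolation. The following is a minimal, self-contained sketch (not part of the patch) of the same resolution rule applied to plain `ignore::Match` values, with `()` standing in for the glob payload and a plain `Match::None` fallback simplifying the `m_explicit` fallback used in `dir.rs`:

```rs
use ignore::Match;

/// Resolve per-source match results, ordered lowest precedence first, the way
/// the patched `Ignore::matched` does: a later whitelist overrules an earlier
/// ignore, and a later ignore overrules an earlier whitelist.
fn resolve(order: &[Match<()>]) -> Match<()> {
    for (idx, check) in order.iter().enumerate() {
        if check.is_none() {
            continue;
        }
        let remaining = &order[idx + 1..];
        if check.is_ignore() {
            // A later whitelist wins over this ignore.
            if remaining.iter().any(|other| other.is_whitelist()) {
                continue;
            }
        } else if remaining.iter().any(|other| other.is_ignore()) {
            // A later ignore wins over this whitelist.
            continue;
        }
        return check.clone();
    }
    Match::None
}

fn main() {
    // An ignore from an earlier source is overruled by a later whitelist...
    assert!(resolve(&[Match::Ignore(()), Match::Whitelist(())]).is_whitelist());
    // ...and an earlier whitelist is overruled by a later ignore.
    assert!(resolve(&[Match::Whitelist(()), Match::Ignore(())]).is_ignore());
}
```
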
From 22bd00f45b922fe9b7f92f5fe8e775fc5c99e305 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 16:58:56 +0100 Subject: [PATCH 03/26] run all CI tests when `[ci-all]` exists in PR description --- .github/workflows/ci.yml | 2 +- .github/workflows/integration-tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3844c3050a6..9544baa663bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: # Exclude windows and macos from being built on feature branches on-main-branch: - - ${{ github.ref == 'refs/heads/main' }} + - ${{ github.ref == 'refs/heads/main' || contains(github.event.pull_request.body, '[ci-all]') }} exclude: - on-main-branch: false runner: diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 55fac5cb1cd4..4fa0900ef9fd 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -34,7 +34,7 @@ jobs: # Exclude windows and macos from being built on feature branches on-main-branch: - - ${{ github.ref == 'refs/heads/main' }} + - ${{ github.ref == 'refs/heads/main' || contains(github.event.pull_request.body, '[ci-all]') }} exclude: - on-main-branch: false runner: From ba153be8700b4454cad48d12132862246d82d5b8 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 17:39:38 +0100 Subject: [PATCH 04/26] big refactor, move `Scanner` to `scanner/mod.rs` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a chunky commit which includes a refactor and all the necessary changes. Sorry that it's a bit confusing to read because so many things changed. The big picture changes: 1. We were dealing with `GlobEntry` enums, but we split this up to better model what's going on. - `GlobEntry`: literally represents a `Glob`. We still return these so our frontends like `@tailwindcss/postcss` can still use them. - `SourceEntry`: in the `node` crate, a `SourceEntry` is what used to be a `GlobEntry` but also has a negated flag now. This represents a `@source not '…'` for example. This will map to a `PublicSourceEntry` inside of the `oxide` crate. - `PublicSourceEntry`: in the `oxide` crate, this is exactly the same structure as the `SourceEntry` in the `node` crate. - `SourceEntry`: this is an enum that better represents the domain we are dealing with.: ```rs pub enum SourceEntry { Auto { base: PathBuf }, IgnoredAuto { base: PathBuf }, Pattern { base: PathBuf, pattern: String }, IgnoredPattern { base: PathBuf, pattern: String }, } ``` 2. One big realisation we had is that auto source detection and the ignored files/extensions/... _basically_ map to a giant set of `.gitignore` rules. This is also how we modeled it now. The `Scanner` will get a set of `sources` (`Vec`) and generates the necessary `.gitignore` rules. One nice benefit with this approach is that it unlocks a few features for free: 1. No need to duplicate the rules in several spots anymore (scanning files initially, scanning files in subsequent calls, scanning specific files via `scanFiles()`). They all use the same "rules". 2. No need to potentially do multiple file system walks anymore. All the rules are setup once, and we will be able to walk over the files and folders that adhere to the rules. Bonus points: _if_ you reach a folder that we know with 100% certainty is ignored by the gitignore rules, then we can skip that folder entirely. 3. 
`@source` order matters, but it allows you to explicitly ignore an allowed file, or explicitly allow an ignored file. 3. We split the internal `Scanner` API, it essentially looks like this: 1. The `Scanner::new(sources)` sets up the world and creates a `walk_builder` (the big engine behind all of this) 2. `Scanner.scan_sources(…)` — the only thing it does is use the walk_builder and stores information based on the info we found: 1. Collect `dirs` 2. Collect `files` 3. Collect `extensions` 3. `Scanner.scan_content` — this is what the `@tailwindcss/cli` calls via `scanner.scanFiles(…)`. This also just registers the changed files and content into the `scanner`. For `ChangedContent::File(_)` we verify that the file is allowed by all the rules mentioned above. 4. `Scanner.extract_candidates` — essentially take whatever is stored in `self.changed_content` and parse these files by extracting all the candidates. This is used by `scan_sources` and `scan_content`. Co-authored-by: Philipp Spiess --- crates/oxide/src/glob.rs | 151 +++- crates/oxide/src/lib.rs | 574 +-------------- crates/oxide/src/scanner/allowed_paths.rs | 128 ---- .../src/scanner/auto_source_detection.rs | 54 ++ crates/oxide/src/scanner/detect_sources.rs | 292 +++----- crates/oxide/src/scanner/mod.rs | 686 +++++++++++++++++- crates/oxide/src/scanner/sources.rs | 228 ++++++ 7 files changed, 1213 insertions(+), 900 deletions(-) delete mode 100644 crates/oxide/src/scanner/allowed_paths.rs create mode 100644 crates/oxide/src/scanner/auto_source_detection.rs create mode 100644 crates/oxide/src/scanner/sources.rs diff --git a/crates/oxide/src/glob.rs b/crates/oxide/src/glob.rs index ca73b5ee116b..7f99631c35b4 100644 --- a/crates/oxide/src/glob.rs +++ b/crates/oxide/src/glob.rs @@ -1,11 +1,117 @@ -use fast_glob::glob_match; +use crate::PublicSourceEntry; use fxhash::{FxHashMap, FxHashSet}; -use std::path::{Path, PathBuf}; -use tracing::event; +use std::path::PathBuf; +use tracing::{event, Level}; -use crate::GlobEntry; +#[derive(Debug, Clone, PartialEq)] +pub struct GlobEntry { + /// Base path of the glob + pub base: String, -pub fn hoist_static_glob_parts(entries: &Vec) -> Vec { + /// Glob pattern + pub pattern: String, +} + +/// Optimize the PublicSourceEntry by trying to move all the static parts of the pattern to the +/// base of the PublicSourceEntry. +/// +/// ```diff +/// - { base: '/', pattern: 'src/**/*.html'} +/// + { base: '/src', pattern: '**/*.html'} +/// ``` +/// +/// A file stays in the `pattern` part, because the `base` should only be a directory. +/// +/// ```diff +/// - { base: '/', pattern: 'src/examples/index.html'} +/// + { base: '/src/examples', pattern: 'index.html'} +/// ``` +/// +/// A folder will be moved to the `base` part, and the `pattern` will be set to `**/*`. +/// +/// ```diff +/// - { base: '/', pattern: 'src/examples'} +/// + { base: '/src/examples', pattern: '**/*'} +/// ``` +/// +/// In addition, we will canonicalize the base path so we always work with the correctly resolved +/// path. +pub fn optimize_public_source_entry(source: &mut PublicSourceEntry) { + // Resolve base path immediately + let Ok(base) = dunce::canonicalize(&source.base) else { + event!(Level::ERROR, "Failed to resolve base: {:?}", source.base); + return; + }; + source.base = base.to_string_lossy().to_string(); + + // No dynamic part, figure out if we are dealing with a file or a directory. 
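+    //
+    // E.g. `{ base: "/repo", pattern: "src/index.html" }` (a hypothetical
+    // layout) becomes `{ base: "/repo/src", pattern: "/index.html" }`; the
+    // leading slash anchors the file name to the base directory.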
+    if !source.pattern.contains('*') {
+        let combined_path = if source.pattern.starts_with("/") {
+            PathBuf::from(&source.pattern)
+        } else {
+            PathBuf::from(&source.base).join(&source.pattern)
+        };
+
+        match dunce::canonicalize(combined_path) {
+            Ok(resolved_path) if resolved_path.is_dir() => {
+                source.base = resolved_path.to_string_lossy().to_string();
+                source.pattern = "**/*".to_owned();
+            }
+            Ok(resolved_path) if resolved_path.is_file() => {
+                source.base = resolved_path
+                    .parent()
+                    .unwrap()
+                    .to_string_lossy()
+                    .to_string();
+                // Ensure a leading slash, otherwise it will match against all files in all folders.
+                source.pattern = format!(
+                    "/{}",
+                    resolved_path
+                        .file_name()
+                        .unwrap()
+                        .to_string_lossy()
+                        .to_string()
+                );
+            }
+            _ => {}
+        }
+        return;
+    }
+
+    // Contains dynamic part
+    let (static_part, dynamic_part) = split_pattern(&source.pattern);
+
+    let base: PathBuf = source.base.clone().into();
+    let base = match static_part {
+        Some(static_part) => base.join(static_part),
+        None => base,
+    };
+
+    // TODO: If the base does not exist on disk, try removing the last slash and try again.
+    let base = match dunce::canonicalize(&base) {
+        Ok(base) => base,
+        Err(err) => {
+            event!(tracing::Level::ERROR, "Failed to resolve glob: {:?}", err);
+            return;
+        }
+    };
+
+    let pattern = match dynamic_part {
+        Some(dynamic_part) => dynamic_part,
+        None => {
+            if base.is_dir() {
+                "**/*".to_owned()
+            } else {
+                "".to_owned()
+            }
+        }
+    };
+
+    source.base = base.to_string_lossy().to_string();
+    source.pattern = pattern;
+}
+
+pub fn hoist_static_glob_parts(entries: &Vec<GlobEntry>, emit_parent_glob: bool) -> Vec<GlobEntry> {
     let mut result = vec![];
 
     for entry in entries {
@@ -40,7 +146,7 @@ pub fn hoist_static_glob_parts(entries: &Vec<GlobEntry>) -> Vec<GlobEntry> {
         // If the base path is a file, then we want to move the file to the pattern, and point the
         // directory to the base. This is necessary for file watchers that can only listen to
         // folders.
-        if pattern.is_empty() && base.is_file() {
+        if emit_parent_glob && pattern.is_empty() && base.is_file() {
             result.push(GlobEntry {
                 // SAFETY: `parent()` will be available because we verify `base` is a file, thus a
                 // parent folder exists.
@@ -83,7 +189,7 @@ pub fn hoist_static_glob_parts(entries: &Vec<GlobEntry>) -> Vec<GlobEntry> {
 /// tailwind --pwd ./project/components --content "**/*.js"
 /// ```
 pub fn optimize_patterns(entries: &Vec<GlobEntry>) -> Vec<GlobEntry> {
-    let entries = hoist_static_glob_parts(entries);
+    let entries = hoist_static_glob_parts(entries, true);
 
     // Track all base paths and their patterns. Later we will turn them back into `GlobEntry`s.
     let mut pattern_map: FxHashMap<String, FxHashSet<String>> = FxHashMap::default();
@@ -132,10 +238,25 @@ pub fn optimize_patterns(entries: &Vec<GlobEntry>) -> Vec<GlobEntry> {
 // using `*`.
 //
 // E.g.:
-// Original input: `../project-b/**/*.{html,js}`
-// Expanded input: `../project-b/**/*.html` & `../project-b/**/*.js`
-// Split on first input: ("../project-b", "**/*.html")
-// Split on second input: ("../project-b", "**/*.js")
+//
+// Original input:
+// - `../project-b/**/*.{html,js}`
+//
+// Expanded input:
+// - `../project-b/**/*.html`
+// - `../project-b/**/*.js`
+//
+// Split results in:
+// - `("../project-b", "**/*.html")`
+// - `("../project-b", "**/*.js")`
+//
+// A static file glob should also be considered as a dynamic part.
+//
+// E.g.:
+//
+// Input: `../project-b/foo/bar.html`
+// Split results in: `("../project-b/foo", "bar.html")`
+//
 fn split_pattern(pattern: &str) -> (Option<String>, Option<String>) {
     // No dynamic parts, so we can just return the input as-is.
     if !pattern.contains('*') {
@@ -168,14 +289,6 @@ fn split_pattern(pattern: &str) -> (Option<String>, Option<String>) {
     (static_part, dynamic_part)
 }
 
-pub fn path_matches_globs(path: &Path, globs: &[GlobEntry]) -> bool {
-    let path = path.to_string_lossy();
-
-    globs
-        .iter()
-        .any(|g| glob_match(format!("{}/{}", g.base, g.pattern), path.as_bytes()))
-}
-
 #[cfg(test)]
 mod tests {
     use super::optimize_patterns;
diff --git a/crates/oxide/src/lib.rs b/crates/oxide/src/lib.rs
index 9088763c39de..d7f0321af16a 100644
--- a/crates/oxide/src/lib.rs
+++ b/crates/oxide/src/lib.rs
@@ -1,22 +1,3 @@
-use crate::glob::hoist_static_glob_parts;
-use crate::scanner::allowed_paths::resolve_paths;
-use crate::scanner::detect_sources::DetectSources;
-use bexpand::Expression;
-use bstr::ByteSlice;
-use extractor::{Extracted, Extractor};
-use fast_glob::glob_match;
-use fxhash::{FxHashMap, FxHashSet};
-use glob::optimize_patterns;
-use paths::Path;
-use rayon::prelude::*;
-use scanner::allowed_paths::read_dir;
-use std::borrow::Cow;
-use std::fs;
-use std::path::PathBuf;
-use std::sync;
-use std::time::SystemTime;
-use tracing::event;
-
 pub mod cursor;
 pub mod extractor;
 pub mod fast_skip;
@@ -25,554 +6,7 @@ pub mod paths;
 pub mod scanner;
 pub mod throughput;
 
-static SHOULD_TRACE: sync::LazyLock<bool> = sync::LazyLock::new(
-    || matches!(std::env::var("DEBUG"), Ok(value) if value.eq("*") || (value.contains("tailwindcss:oxide") && !value.contains("-tailwindcss:oxide"))),
-);
-
-fn init_tracing() {
-    if !*SHOULD_TRACE {
-        return;
-    }
-
-    _ = tracing_subscriber::fmt()
-        .with_max_level(tracing::Level::INFO)
-        .with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE)
-        .compact()
-        .try_init();
-}
-
-#[derive(Debug, Clone)]
-pub enum ChangedContent<'a> {
-    File(PathBuf, Cow<'a, str>),
-    Content(String, Cow<'a, str>),
-}
-
-#[derive(Debug, Clone)]
-pub struct ScanOptions {
-    /// Base path to start scanning from
-    pub base: Option<String>,
-    /// Glob sources
-    pub sources: Vec<GlobEntry>,
-}
-
-#[derive(Debug, Clone)]
-pub struct ScanResult {
-    pub candidates: Vec<String>,
-    pub files: Vec<String>,
-    pub globs: Vec<GlobEntry>,
-}
-
-#[derive(Debug, Clone, PartialEq)]
-pub struct GlobEntry {
-    pub base: String,
-    pub pattern: String,
-}
-
-#[derive(Debug, Clone, Default)]
-pub struct Scanner {
-    /// Glob sources
-    sources: Option<Vec<GlobEntry>>,
-
-    /// Scanner is ready to scan. We delay the file system traversal for detecting all files until
-    /// we actually need them.
-    ready: bool,
-
-    /// All files that we have to scan
-    files: Vec<PathBuf>,
-
-    /// All directories, sub-directories, etc… we saw during source detection
-    dirs: Vec<PathBuf>,
-
-    /// All generated globs
-    globs: Vec<GlobEntry>,
-
-    /// Track file modification times
-    mtimes: FxHashMap<PathBuf, SystemTime>,
-
-    /// Track unique set of candidates
-    candidates: FxHashSet<String>,
-}
-
-impl Scanner {
-    pub fn new(sources: Option<Vec<GlobEntry>>) -> Self {
-        Self {
-            sources,
-            ..Default::default()
-        }
-    }
-
-    pub fn scan(&mut self) -> Vec<String> {
-        init_tracing();
-
-        self.prepare();
-        self.compute_candidates();
-
-        let mut candidates: Vec<String> = self.candidates.clone().into_par_iter().collect();
-        candidates.par_sort_unstable();
-
-        candidates
-    }
-
-    #[tracing::instrument(skip_all)]
-    pub fn scan_content(&mut self, changed_content: Vec<ChangedContent>) -> Vec<String> {
-        self.prepare();
-        let candidates = parse_all_blobs(read_all_files(changed_content));
-
-        let mut new_candidates = vec![];
-        for candidate in candidates {
-            if self.candidates.contains(&candidate) {
-                continue;
-            }
-            self.candidates.insert(candidate.clone());
-            new_candidates.push(candidate);
-        }
-
-        new_candidates
-    }
-
-    #[tracing::instrument(skip_all)]
-    pub fn get_candidates_with_positions(
-        &mut self,
-        changed_content: ChangedContent,
-    ) -> Vec<(String, usize)> {
-        self.prepare();
-
-        let content = read_changed_content(changed_content).unwrap_or_default();
-        let original_content = &content;
-
-        // Workaround for legacy upgrades:
-        //
-        // `-[]` won't parse in the new parser (`[…]` must contain _something_), but we do need it
-        // for people using `group-[]` (which we will later replace with `in-[.group]` instead).
-        let content = content.replace("-[]", "XYZ");
-        let offset = content.as_ptr() as usize;
-
-        let mut extractor = Extractor::new(&content[..]);
-
-        extractor
-            .extract()
-            .into_par_iter()
-            .flat_map(|extracted| match extracted {
-                Extracted::Candidate(s) => {
-                    let i = s.as_ptr() as usize - offset;
-                    let original = &original_content[i..i + s.len()];
-                    if original.contains_str("-[]") {
-                        return Some(unsafe {
-                            (String::from_utf8_unchecked(original.to_vec()), i)
-                        });
-                    }
-
-                    // SAFETY: When we parsed the candidates, we already guaranteed that the byte
-                    // slices are valid, therefore we don't have to re-check here when we want to
-                    // convert it back to a string.
- Some(unsafe { (String::from_utf8_unchecked(s.to_vec()), i) }) - } - - _ => None, - }) - .collect() - } - - #[tracing::instrument(skip_all)] - pub fn get_files(&mut self) -> Vec { - self.prepare(); - - self.files - .par_iter() - .filter_map(|x| Path::from(x.clone()).canonicalize().ok()) - .map(|x| x.to_string()) - .collect() - } - - #[tracing::instrument(skip_all)] - pub fn get_globs(&mut self) -> Vec { - self.prepare(); - - self.globs.clone() - } - - #[tracing::instrument(skip_all)] - fn compute_candidates(&mut self) { - let mut changed_content = vec![]; - - let current_mtimes = self - .files - .par_iter() - .map(|path| { - fs::metadata(path) - .and_then(|m| m.modified()) - .unwrap_or(SystemTime::now()) - }) - .collect::>(); - - for (idx, path) in self.files.iter().enumerate() { - let current_time = current_mtimes[idx]; - let previous_time = self.mtimes.insert(path.clone(), current_time); - - let should_scan_file = match previous_time { - // Time has changed, so we need to re-scan the file - Some(prev) if prev != current_time => true, - - // File was in the cache, no need to re-scan - Some(_) => false, - - // File didn't exist before, so we need to scan it - None => true, - }; - - if should_scan_file { - let extension = path.extension().unwrap_or_default().to_string_lossy(); - changed_content.push(ChangedContent::File(path.to_path_buf(), extension)) - } - } - - if !changed_content.is_empty() { - let candidates = parse_all_blobs(read_all_files(changed_content)); - self.candidates.par_extend(candidates); - } - } - - // Ensures that all files/globs are resolved and the scanner is ready to scan - // content for candidates. - fn prepare(&mut self) { - if self.ready { - self.check_for_new_files(); - return; - } - - self.scan_sources(); - - self.ready = true; - } - - #[tracing::instrument(skip_all)] - fn check_for_new_files(&mut self) { - let current_mtimes = self - .dirs - .par_iter() - .map(|path| { - fs::metadata(path) - .and_then(|m| m.modified()) - .unwrap_or(SystemTime::now()) - }) - .collect::>(); - - let mut modified_dirs: Vec = vec![]; - - // Check all directories to see if they were modified - for (idx, path) in self.dirs.iter().enumerate() { - let current_time = current_mtimes[idx]; - let previous_time = self.mtimes.insert(path.clone(), current_time); - - let should_scan = match previous_time { - // Time has changed, so we need to re-scan the file - Some(prev) if prev != current_time => true, - - // File was in the cache, no need to re-scan - Some(_) => false, - - // File didn't exist before, so we need to scan it - None => true, - }; - - if should_scan { - modified_dirs.push(path.clone()); - } - } - - // Scan all modified directories for their immediate files - let mut known = FxHashSet::from_iter(self.files.iter().chain(self.dirs.iter()).cloned()); - - while !modified_dirs.is_empty() { - let new_entries = modified_dirs - .iter() - .flat_map(|dir| read_dir(dir, Some(1))) - .map(|entry| entry.path().to_owned()) - .filter(|path| !known.contains(path)) - .collect::>(); - - modified_dirs.clear(); - - for path in new_entries { - if path.is_file() { - known.insert(path.clone()); - self.files.push(path); - } else if path.is_dir() { - known.insert(path.clone()); - self.dirs.push(path.clone()); - - // Recursively scan the new directory for files - modified_dirs.push(path); - } - } - } - } - - #[tracing::instrument(skip_all)] - fn scan_sources(&mut self) { - let Some(sources) = &self.sources else { - return; - }; - - if sources.is_empty() { - return; - } - - // Expand glob patterns and create 
new `GlobEntry` instances for each expanded pattern. - let sources = sources - .iter() - .flat_map(|source| { - let expression: Result = source.pattern[..].try_into(); - let Ok(expression) = expression else { - return vec![source.clone()]; - }; - - expression - .into_iter() - .filter_map(Result::ok) - .map(move |pattern| GlobEntry { - base: source.base.clone(), - pattern: pattern.into(), - }) - .collect::>() - }) - .collect::>(); - - // Partition sources into sources that should be promoted to auto source detection and - // sources that should be resolved as globs. - let (auto_sources, glob_sources): (Vec<_>, Vec<_>) = sources.iter().partition(|source| { - // If a glob ends with `/**/*`, then we just want to register the base path as a new - // base. Essentially converting it to use auto source detection. - if source.pattern.ends_with("**/*") { - return true; - } - - // Directories should be promoted to auto source detection - if PathBuf::from(&source.base).join(&source.pattern).is_dir() { - return true; - } - - false - }); - - fn join_paths(a: &str, b: &str) -> PathBuf { - let mut tmp = a.to_owned(); - let b = b.trim_end_matches("**/*").trim_end_matches('/'); - - if b.starts_with('/') { - return PathBuf::from(b); - } - - // On Windows a path like C:/foo.txt is absolute but C:foo.txt is not - // (the 2nd is relative to the CWD) - if b.chars().nth(1) == Some(':') && b.chars().nth(2) == Some('/') { - return PathBuf::from(b); - } - - tmp += "/"; - tmp += b; - - PathBuf::from(&tmp) - } - - for path in auto_sources.iter().filter_map(|source| { - dunce::canonicalize(join_paths(&source.base, &source.pattern)).ok() - }) { - // Insert a glob for the base path, so we can see new files/folders in the directory itself. - self.globs.push(GlobEntry { - base: path.to_string_lossy().into(), - pattern: "*".into(), - }); - - // Detect all files/folders in the directory - let detect_sources = DetectSources::new(path); - - let (files, globs, dirs) = detect_sources.detect(); - self.files.extend(files); - self.globs.extend(globs); - self.dirs.extend(dirs); - } - - // Turn `Vec<&GlobEntry>` in `Vec` - let glob_sources: Vec<_> = glob_sources.into_iter().cloned().collect(); - let hoisted = hoist_static_glob_parts(&glob_sources); - - for source in &hoisted { - // If the pattern is empty, then the base points to a specific file or folder already - // if it doesn't contain any dynamic parts. In that case we can use the base as the - // pattern. - // - // Otherwise we need to combine the base and the pattern, otherwise a pattern that - // looks like `*.html`, will never match a path that looks like - // `/my-project/project-a/index.html`, because it contains `/`. - // - // We can't prepend `**/`, because then `/my-project/project-a/nested/index.html` would - // match as well. - // - // Instead we combine the base and the pattern as a single glob pattern. 
- let mut full_pattern = source.base.clone().replace('\\', "/"); - - if !source.pattern.is_empty() { - full_pattern.push('/'); - full_pattern.push_str(&source.pattern); - } - - let base = PathBuf::from(&source.base); - for entry in resolve_paths(&base) { - let Some(file_type) = entry.file_type() else { - continue; - }; - - if !file_type.is_file() { - continue; - } - - let file_path = entry.into_path(); - - let Some(file_path_str) = file_path.to_str() else { - continue; - }; - - let file_path_str = file_path_str.replace('\\', "/"); - - if glob_match(&full_pattern, &file_path_str) { - self.files.push(file_path); - } - } - } - - self.globs.extend(hoisted); - - // Re-optimize the globs to reduce the number of patterns we have to scan. - self.globs = optimize_patterns(&self.globs); - } -} - -fn read_changed_content(c: ChangedContent) -> Option> { - let (content, extension) = match c { - ChangedContent::File(file, extension) => match std::fs::read(&file) { - Ok(content) => (content, extension), - Err(e) => { - event!(tracing::Level::ERROR, "Failed to read file: {:?}", e); - return None; - } - }, - - ChangedContent::Content(contents, extension) => (contents.into_bytes(), extension), - }; - - Some(pre_process_input(&content, &extension)) -} - -pub fn pre_process_input(content: &[u8], extension: &str) -> Vec { - use crate::extractor::pre_processors::*; - - match extension { - "clj" | "cljs" | "cljc" => Clojure.process(content), - "cshtml" | "razor" => Razor.process(content), - "haml" => Haml.process(content), - "json" => Json.process(content), - "pug" => Pug.process(content), - "rb" | "erb" => Ruby.process(content), - "slim" => Slim.process(content), - "svelte" => Svelte.process(content), - "vue" => Vue.process(content), - _ => content.to_vec(), - } -} - -#[tracing::instrument(skip_all)] -fn read_all_files(changed_content: Vec) -> Vec> { - event!( - tracing::Level::INFO, - "Reading {:?} file(s)", - changed_content.len() - ); - - changed_content - .into_par_iter() - .filter_map(read_changed_content) - .collect() -} - -#[tracing::instrument(skip_all)] -fn parse_all_blobs(blobs: Vec>) -> Vec { - let mut result: Vec<_> = blobs - .par_iter() - .flat_map(|blob| blob.par_split(|x| *x == b'\n')) - .filter_map(|blob| { - if blob.is_empty() { - return None; - } - - let extracted = crate::extractor::Extractor::new(blob).extract(); - if extracted.is_empty() { - return None; - } - - Some(FxHashSet::from_iter(extracted.into_iter().map( - |x| match x { - Extracted::Candidate(bytes) => bytes, - Extracted::CssVariable(bytes) => bytes, - }, - ))) - }) - .reduce(Default::default, |mut a, b| { - a.extend(b); - a - }) - .into_iter() - .map(|s| unsafe { String::from_utf8_unchecked(s.to_vec()) }) - .collect(); - - // SAFETY: Unstable sort is faster and in this scenario it's also safe because we are - // guaranteed to have unique candidates. - result.par_sort_unstable(); - - result -} - -#[cfg(test)] -mod tests { - use crate::Scanner; - - #[test] - fn test_positions() { - let mut scanner = Scanner::new(None); - - for (input, expected) in [ - // Before migrations - ( - r#"
<div class="tw:flex! tw:sm:block! tw:bg-linear-to-t flex tw:[color:red] tw:in-[.tw\:group]:flex">
"#, - vec![ - ("class".to_string(), 5), - ("tw:flex!".to_string(), 12), - ("tw:sm:block!".to_string(), 21), - ("tw:bg-linear-to-t".to_string(), 34), - ("flex".to_string(), 52), - ("tw:[color:red]".to_string(), 57), - ("tw:in-[.tw\\:group]:flex".to_string(), 72), - ], - ), - ] { - let candidates = scanner.get_candidates_with_positions(crate::ChangedContent::Content( - input.to_string(), - "html".into(), - )); - assert_eq!(candidates, expected); - } - } -} +pub use glob::GlobEntry; +pub use scanner::sources::PublicSourceEntry; +pub use scanner::ChangedContent; +pub use scanner::Scanner; diff --git a/crates/oxide/src/scanner/allowed_paths.rs b/crates/oxide/src/scanner/allowed_paths.rs deleted file mode 100644 index b906335f8979..000000000000 --- a/crates/oxide/src/scanner/allowed_paths.rs +++ /dev/null @@ -1,128 +0,0 @@ -use ignore::{DirEntry, WalkBuilder}; -use std::{path::Path, sync}; - -static BINARY_EXTENSIONS: sync::LazyLock> = sync::LazyLock::new(|| { - include_str!("fixtures/binary-extensions.txt") - .trim() - .lines() - .collect() -}); - -static IGNORED_EXTENSIONS: sync::LazyLock> = sync::LazyLock::new(|| { - include_str!("fixtures/ignored-extensions.txt") - .trim() - .lines() - .collect() -}); - -static IGNORED_FILES: sync::LazyLock> = sync::LazyLock::new(|| { - include_str!("fixtures/ignored-files.txt") - .trim() - .lines() - .collect() -}); - -static IGNORED_CONTENT_DIRS: sync::LazyLock> = - sync::LazyLock::new(|| vec![".git"]); - -#[tracing::instrument(skip_all)] -pub fn resolve_allowed_paths(root: &Path) -> impl Iterator { - // Read the directory recursively with no depth limit - read_dir(root, None) -} - -#[tracing::instrument(skip_all)] -pub fn resolve_paths(root: &Path) -> impl Iterator { - create_walk_builder(root).build().filter_map(Result::ok) -} - -pub fn read_dir(root: &Path, depth: Option) -> impl Iterator { - create_walk_builder(root) - .max_depth(depth) - .filter_entry(move |entry| match entry.file_type() { - Some(file_type) if file_type.is_dir() => match entry.file_name().to_str() { - Some(dir) => !IGNORED_CONTENT_DIRS.contains(&dir), - None => false, - }, - Some(file_type) if file_type.is_file() || file_type.is_symlink() => { - is_allowed_content_path(entry.path()) - } - _ => false, - }) - .build() - .filter_map(Result::ok) -} - -fn create_walk_builder(root: &Path) -> WalkBuilder { - let mut builder = WalkBuilder::new(root); - - // Scan hidden files / directories - builder.hidden(false); - - // By default, allow .gitignore files to be used regardless of whether or not - // a .git directory is present. This is an optimization for when projects - // are first created and may not be in a git repo yet. - builder.require_git(false); - - // Don't descend into .git directories inside the root folder - // This is necessary when `root` contains the `.git` dir. - builder.filter_entry(|entry| entry.file_name() != ".git"); - - // If we are in a git repo then require it to ensure that only rules within - // the repo are used. For example, we don't want to consider a .gitignore file - // in the user's home folder if we're in a git repo. - // - // The alternative is using a call like `.parents(false)` but that will - // prevent looking at parent directories for .gitignore files from within - // the repo and that's not what we want. 
- // - // For example, in a project with this structure: - // - // home - // .gitignore - // my-project - // .gitignore - // apps - // .gitignore - // web - // {root} - // - // We do want to consider all .gitignore files listed: - // - home/.gitignore - // - my-project/.gitignore - // - my-project/apps/.gitignore - // - // However, if a repo is initialized inside my-project then only the following - // make sense for consideration: - // - my-project/.gitignore - // - my-project/apps/.gitignore - // - // Setting the require_git(true) flag conditionally allows us to do this. - for parent in root.ancestors() { - if parent.join(".git").exists() { - builder.require_git(true); - break; - } - } - - builder -} - -pub fn is_allowed_content_path(path: &Path) -> bool { - // Skip known ignored files - if path - .file_name() - .unwrap() - .to_str() - .map(|s| IGNORED_FILES.contains(&s)) - .unwrap_or(false) - { - return false; - } - - // Skip known ignored extensions - path.extension() - .map(|s| s.to_str().unwrap_or_default()) - .map(|ext| !IGNORED_EXTENSIONS.contains(&ext) && !BINARY_EXTENSIONS.contains(&ext)) - .unwrap_or(false) -} diff --git a/crates/oxide/src/scanner/auto_source_detection.rs b/crates/oxide/src/scanner/auto_source_detection.rs new file mode 100644 index 000000000000..9d2e3705c6f8 --- /dev/null +++ b/crates/oxide/src/scanner/auto_source_detection.rs @@ -0,0 +1,54 @@ +use ignore::gitignore::{Gitignore, GitignoreBuilder}; +use std::sync; + +/// All the default rules for auto source detection. +/// +/// This includes: +/// +/// - Ignoring common content directories like `.git` and `node_modules` +/// - Ignoring file extensions we definitely don't want to include like `.css` and `.scss` +/// - Ignoring common binary file extensions like `.png` and `.jpg` +/// - Ignoring common files like `yarn.lock` and `package-lock.json` +/// +pub static RULES: sync::LazyLock = sync::LazyLock::new(|| { + let mut builder = GitignoreBuilder::new(""); + + builder + .add_line(None, &format!("{{{}}}", IGNORED_CONTENT_DIRS.join(","))) + .unwrap(); + builder + .add_line(None, &format!("*.{{{}}}", IGNORED_EXTENSIONS.join(","))) + .unwrap(); + builder + .add_line(None, &format!("*.{{{}}}", BINARY_EXTENSIONS.join(","))) + .unwrap(); + builder + .add_line(None, &format!("{{{}}}", IGNORED_FILES.join(","))) + .unwrap(); + + builder.build().unwrap() +}); + +static BINARY_EXTENSIONS: sync::LazyLock> = sync::LazyLock::new(|| { + include_str!("fixtures/binary-extensions.txt") + .trim() + .lines() + .collect() +}); + +static IGNORED_EXTENSIONS: sync::LazyLock> = sync::LazyLock::new(|| { + include_str!("fixtures/ignored-extensions.txt") + .trim() + .lines() + .collect() +}); + +static IGNORED_FILES: sync::LazyLock> = sync::LazyLock::new(|| { + include_str!("fixtures/ignored-files.txt") + .trim() + .lines() + .collect() +}); + +static IGNORED_CONTENT_DIRS: sync::LazyLock> = + sync::LazyLock::new(|| vec![".git"]); diff --git a/crates/oxide/src/scanner/detect_sources.rs b/crates/oxide/src/scanner/detect_sources.rs index 6828e8eca7c7..3a32038ddedd 100644 --- a/crates/oxide/src/scanner/detect_sources.rs +++ b/crates/oxide/src/scanner/detect_sources.rs @@ -1,16 +1,11 @@ -use crate::scanner::allowed_paths::{is_allowed_content_path, resolve_allowed_paths}; use crate::GlobEntry; use fxhash::FxHashSet; +use globwalk::DirEntry; use std::cmp::Ordering; use std::path::PathBuf; use std::sync; use walkdir::WalkDir; -#[derive(Debug, Clone)] -pub struct DetectSources { - base: PathBuf, -} - static KNOWN_EXTENSIONS: sync::LazyLock> = 
sync::LazyLock::new(|| { include_str!("fixtures/template-extensions.txt") .trim() @@ -22,200 +17,133 @@ static KNOWN_EXTENSIONS: sync::LazyLock> = sync::LazyLock::new .collect() }); -impl DetectSources { - pub fn new(base: PathBuf) -> Self { - Self { base } - } - - pub fn detect(&self) -> (Vec, Vec, Vec) { - let (files, dirs) = self.resolve_files(); - let globs = self.resolve_globs(&dirs); - - (files, globs, dirs) +// Sorting to make sure that we always see the directories before the files. Also sorting +// alphabetically by default. +fn sort_by_dir_and_name(a: &DirEntry, z: &DirEntry) -> Ordering { + match (a.file_type().is_dir(), z.file_type().is_dir()) { + (true, false) => Ordering::Less, + (false, true) => Ordering::Greater, + _ => a.file_name().cmp(z.file_name()), } +} - fn resolve_files(&self) -> (Vec, Vec) { - let mut files: Vec = vec![]; - let mut dirs: Vec = vec![]; - - for entry in resolve_allowed_paths(&self.base) { - let Some(file_type) = entry.file_type() else { - continue; - }; - - if file_type.is_file() { - files.push(entry.into_path()); - } else if file_type.is_dir() { - dirs.push(entry.into_path()); - } +pub fn resolve_globs( + base: PathBuf, + dirs: &[PathBuf], + extensions: &FxHashSet, +) -> Vec { + let allowed_paths: FxHashSet = FxHashSet::from_iter(dirs.iter().cloned()); + + // A list of known extensions + a list of extensions we found in the project. + let mut found_extensions: FxHashSet = + FxHashSet::from_iter(KNOWN_EXTENSIONS.iter().map(|x| x.to_string())); + found_extensions.extend(extensions.iter().cloned()); + + // A list of directory names where we can't use globs, but we should track each file + // individually instead. This is because these directories are often used for both source and + // destination files. + let forced_static_directories: FxHashSet = + FxHashSet::from_iter(vec![base.join("public")]); + + // All directories where we can safely use deeply nested globs to watch all files. + // In other comments we refer to these as "deep glob directories" or similar. + // + // E.g.: `./src/**/*.{html,js}` + let mut deep_globable_directories: FxHashSet = Default::default(); + + // All directories where we can only use shallow globs to watch all direct files but not + // folders. + // In other comments we refer to these as "shallow glob directories" or similar. + // + // E.g.: `./src/*/*.{html,js}` + let mut shallow_globable_directories: FxHashSet = Default::default(); + + // Collect all valid paths from the root. This will already filter out ignored files, unknown + // extensions and binary files. + let mut it = WalkDir::new(&base) + .sort_by(sort_by_dir_and_name) + .into_iter(); + + // Figure out all the shallow globable directories. + while let Some(Ok(entry)) = it.next() { + let path = entry.path(); + if !path.is_dir() { + continue; } - (files, dirs) - } + if !allowed_paths.contains(path) { + let mut path = path; + while let Some(parent) = path.parent() { + if parent == base { + break; + } - fn resolve_globs(&self, dirs: &Vec) -> Vec { - let allowed_paths = FxHashSet::from_iter(dirs); - - // A list of directory names where we can't use globs, but we should track each file - // individually instead. This is because these directories are often used for both source and - // destination files. - let mut forced_static_directories = vec![self.base.join("public")]; - - // A list of known extensions + a list of extensions we found in the project. 
- let mut found_extensions = - FxHashSet::from_iter(KNOWN_EXTENSIONS.iter().map(|x| x.to_string())); - - // All root directories. - let mut root_directories = FxHashSet::from_iter(vec![self.base.clone()]); - - // All directories where we can safely use deeply nested globs to watch all files. - // In other comments we refer to these as "deep glob directories" or similar. - // - // E.g.: `./src/**/*.{html,js}` - let mut deep_globable_directories: FxHashSet = FxHashSet::default(); - - // All directories where we can only use shallow globs to watch all direct files but not - // folders. - // In other comments we refer to these as "shallow glob directories" or similar. - // - // E.g.: `./src/*/*.{html,js}` - let mut shallow_globable_directories: FxHashSet = FxHashSet::default(); - - // Collect all valid paths from the root. This will already filter out ignored files, unknown - // extensions and binary files. - let mut it = WalkDir::new(&self.base) - // Sorting to make sure that we always see the directories before the files. Also sorting - // alphabetically by default. - .sort_by( - |a, z| match (a.file_type().is_dir(), z.file_type().is_dir()) { - (true, false) => Ordering::Less, - (false, true) => Ordering::Greater, - _ => a.file_name().cmp(z.file_name()), - }, - ) - .into_iter(); - - // We are only interested in valid entries - while let Some(Ok(entry)) = it.next() { - // Ignore known directories that we don't want to traverse into. - if entry.file_type().is_dir() && entry.file_name() == ".git" { - it.skip_current_dir(); - continue; + shallow_globable_directories.insert(parent.to_path_buf()); + path = parent } - if entry.file_type().is_dir() { - // If we are in a directory where we know that we can't use any globs, then we have to - // track each file individually. - if forced_static_directories.contains(&entry.path().to_path_buf()) { - forced_static_directories.push(entry.path().to_path_buf()); - root_directories.insert(entry.path().to_path_buf()); - continue; - } + it.skip_current_dir(); + } + } - // Although normally very unlikely, if running inside a dockerfile - // the current directory might be "/" with no parent - if let Some(parent) = entry.path().parent() { - // If we are in a directory where the parent is a forced static directory, then this - // will become a forced static directory as well. - if forced_static_directories.contains(&parent.to_path_buf()) { - forced_static_directories.push(entry.path().to_path_buf()); - root_directories.insert(entry.path().to_path_buf()); - continue; - } - } + // Figure out all the deep globable directories. + let mut it = WalkDir::new(&base) + .sort_by(sort_by_dir_and_name) + .into_iter(); - // If we are in a directory, and the directory is git ignored, then we don't have to - // descent into the directory. However, we have to make sure that we mark the _parent_ - // directory as a shallow glob directory because using deep globs from any of the - // parent directories will include this ignored directory which should not be the case. - // - // Another important part is that if one of the ignored directories is a deep glob - // directory, then all of its parents (until the root) should be marked as shallow glob - // directories as well. 
- if !allowed_paths.contains(&entry.path().to_path_buf()) { - let mut parent = entry.path().parent(); - while let Some(parent_path) = parent { - // If the parent is already marked as a valid deep glob directory, then we have - // to mark it as a shallow glob directory instead, because we won't be able to - // use deep globs for this directory anymore. - if deep_globable_directories.contains(parent_path) { - deep_globable_directories.remove(parent_path); - shallow_globable_directories.insert(parent_path.to_path_buf()); - } - - // If we reached the root, then we can stop. - if parent_path == self.base { - break; - } - - // Mark the parent directory as a shallow glob directory and continue with its - // parent. - shallow_globable_directories.insert(parent_path.to_path_buf()); - parent = parent_path.parent(); - } - - it.skip_current_dir(); - continue; - } + while let Some(Ok(entry)) = it.next() { + let path = entry.path(); + if path.is_file() { + continue; + } - // If we are in a directory that is not git ignored, then we can mark this directory as - // a valid deep glob directory. This is only necessary if any of its parents aren't - // marked as deep glob directories already. - let mut found_deep_glob_parent = false; - let mut parent = entry.path().parent(); - while let Some(parent_path) = parent { - // If we reached the root, then we can stop. - if parent_path == self.base { - break; - } - - // If the parent is already marked as a deep glob directory, then we can stop - // because this glob will match the current directory already. - if deep_globable_directories.contains(parent_path) { - found_deep_glob_parent = true; - break; - } - - parent = parent_path.parent(); - } + if path == base { + continue; + } - // If we didn't find a deep glob directory parent, then we can mark this directory as a - // deep glob directory (unless it is the root). - if !found_deep_glob_parent && entry.path() != self.base { - deep_globable_directories.insert(entry.path().to_path_buf()); - } - } + if !allowed_paths.contains(path) { + continue; + } - // Handle allowed content paths - if is_allowed_content_path(entry.path()) - && allowed_paths.contains(&entry.path().to_path_buf()) - { - let path = entry.path(); + // Already marked as a shallow globable directory. + if shallow_globable_directories.contains(path) { + continue; + } - // Collect the extension for future use when building globs. - if let Some(extension) = path.extension().and_then(|x| x.to_str()) { - found_extensions.insert(extension.to_string()); - } - } + if forced_static_directories.contains(path) { + it.skip_current_dir(); + continue; } - let mut extension_list = found_extensions.into_iter().collect::>(); + // Track deep globable directories. + deep_globable_directories.insert(path.to_path_buf()); + it.skip_current_dir(); + } - extension_list.sort(); + let mut extension_list = found_extensions.clone().into_iter().collect::>(); - let extension_list = extension_list.join(","); + extension_list.sort(); - // Build the globs for all globable directories. - let shallow_globs = shallow_globable_directories.iter().map(|path| GlobEntry { - base: path.display().to_string(), - pattern: format!("*/*.{{{}}}", extension_list), - }); + let extension_list = extension_list.join(","); - let deep_globs = deep_globable_directories.iter().map(|path| GlobEntry { - base: path.display().to_string(), - pattern: format!("**/*.{{{}}}", extension_list), - }); + // Build the globs for all globable directories. 
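+ // For example, a shallow glob directory `src` with extensions `html,js` yields
+ // `src/*/*.{html,js}`, while a deep glob directory yields `src/**/*.{html,js}`.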
+ let shallow_globs = shallow_globable_directories.iter().map(|path| GlobEntry { + base: path.display().to_string(), + pattern: format!("*/*.{{{}}}", extension_list), + }); - shallow_globs.chain(deep_globs).collect::>() - } + let deep_globs = deep_globable_directories.iter().map(|path| GlobEntry { + base: path.display().to_string(), + pattern: format!("**/*.{{{}}}", extension_list), + }); + + shallow_globs + .chain(deep_globs) + // Insert a glob for the base path, so we can see new files/folders in the directory + // itself + .chain(vec![GlobEntry { + base: base.to_string_lossy().into(), + pattern: "*".into(), + }]) + .collect::>() } diff --git a/crates/oxide/src/scanner/mod.rs b/crates/oxide/src/scanner/mod.rs index 8ddf60fd0ef2..c5dd131a8f53 100644 --- a/crates/oxide/src/scanner/mod.rs +++ b/crates/oxide/src/scanner/mod.rs @@ -1,2 +1,686 @@ -pub mod allowed_paths; +pub mod auto_source_detection; pub mod detect_sources; +pub mod sources; + +use crate::extractor::{Extracted, Extractor}; +use crate::glob::optimize_patterns; +use crate::scanner::detect_sources::resolve_globs; +use crate::scanner::sources::{ + public_source_entries_to_private_source_entries, PublicSourceEntry, SourceEntry, Sources, +}; +use crate::GlobEntry; +use bstr::ByteSlice; +use fast_glob::glob_match; +use fxhash::{FxHashMap, FxHashSet}; +use ignore::{gitignore::GitignoreBuilder, WalkBuilder}; +use rayon::prelude::*; +use std::collections::{BTreeMap, BTreeSet}; +use std::path::Path; +use std::path::PathBuf; +use std::sync::{self, Arc, Mutex}; +use std::time::SystemTime; +use tracing::event; + +// @source "some/folder"; // This is auto source detection +// @source "some/folder/**/*"; // This is auto source detection +// @source "some/folder/*.html"; // This is just a glob, but new files matching this should be included +// @source "node_modules/my-ui-lib"; // Auto source detection but since node_modules is explicit we allow it +// // Maybe could be considered `external(…)` automatically if: +// // 1. It's git ignored but listed explicitly +// // 2. It exists outside of the current working directory (do we know that?) 
+// +// @source "do-include-me.bin"; // `.bin` is typically ignored, but now it's explicit so should be included +// @source "git-ignored.html"; // A git ignored file that is listed explicitly, should be scanned +static SHOULD_TRACE: sync::LazyLock = sync::LazyLock::new( + || matches!(std::env::var("DEBUG"), Ok(value) if value.eq("*") || (value.contains("tailwindcss:oxide") && !value.contains("-tailwindcss:oxide"))), +); + +fn init_tracing() { + if !*SHOULD_TRACE { + return; + } + + _ = tracing_subscriber::fmt() + .with_max_level(tracing::Level::INFO) + .with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE) + .compact() + .try_init(); +} + +#[derive(Debug, Clone)] +pub enum ChangedContent { + File(PathBuf, String), + Content(String, String), +} + +#[derive(Debug, Clone)] +pub struct ScanOptions { + /// Base path to start scanning from + pub base: Option, + + /// Glob sources + pub sources: Vec, +} + +#[derive(Debug, Clone)] +pub struct ScanResult { + pub candidates: Vec, + pub files: Vec, + pub globs: Vec, +} + +#[derive(Debug, Clone, Default)] +pub struct Scanner { + /// Content sources + sources: Sources, + + /// The walker to detect all files that we have to scan + walker: Option, + + /// All changed content that we have to parse + changed_content: Vec, + + /// All found extensions + extensions: FxHashSet, + + /// All files that we have to scan + files: Vec, + + /// All directories, sub-directories, etc… we saw during source detection + dirs: Vec, + + /// All generated globs, used for setting up watchers + globs: Vec, + + /// Track unique set of candidates + candidates: FxHashSet, +} + +impl Scanner { + pub fn new(sources: Vec) -> Self { + let sources = Sources::new(public_source_entries_to_private_source_entries(sources)); + + Self { + sources: sources.clone(), + walker: create_walker(sources), + ..Default::default() + } + } + + pub fn scan(&mut self) -> Vec { + init_tracing(); + self.scan_sources(); + + // TODO: performance improvement, bail early if we don't have any changed content + // if self.changed_content.is_empty() { + // return vec![]; + // } + + let _new_candidates = self.extract_candidates(); + + // Make sure we have a sorted list of candidates + let mut candidates = self.candidates.iter().cloned().collect::>(); + candidates.par_sort_unstable(); + + // Return all candidates instead of only the new ones + candidates + } + + #[tracing::instrument(skip_all)] + pub fn scan_content(&mut self, changed_content: Vec) -> Vec { + let (changed_files, changed_contents) = + changed_content + .into_iter() + .partition::, _>(|x| match x { + ChangedContent::File(_, _) => true, + ChangedContent::Content(_, _) => false, + }); + + // Raw content can be parsed directly, no need to verify if the file exists and is allowed + // to be scanned. 
+ self.changed_content.extend(changed_contents); + + // Fully resolve all files + let changed_files = changed_files + .into_iter() + .filter_map(|changed_content| match changed_content { + ChangedContent::File(file, extension) => { + let Ok(file) = dunce::canonicalize(file) else { + return None; + }; + Some(ChangedContent::File(file, extension)) + } + _ => unreachable!(), + }) + .collect::>(); + + let (known_files, mut new_unknown_files) = changed_files + .into_iter() + .partition::, _>(|changed_file| match changed_file { + ChangedContent::Content(_, _) => unreachable!(), + ChangedContent::File(file, _) => self.files.contains(file), + }); + + // All known files are allowed to be scanned + self.changed_content.extend(known_files); + + // Figure out if the new unknown files are allowed to be scanned + if !new_unknown_files.is_empty() { + if let Some(walk_builder) = &mut self.walker { + for entry in walk_builder.build().filter_map(Result::ok) { + let path = entry.path(); + if !path.is_file() { + continue; + } + + let mut drop_file_indexes = vec![]; + for (idx, changed_file) in new_unknown_files.iter().enumerate().rev() { + let ChangedContent::File(file, _) = changed_file else { + continue; + }; + + // When the file is found on disk it means that all the rules pass. We can + // extract the current file and remove it from the list of passed in files. + if file == path { + self.files.push(path.to_path_buf()); // Track for future use + self.changed_content.push(changed_file.clone()); // Track for parsing + drop_file_indexes.push(idx); + } + } + + // Remove all files that we found on disk + if !drop_file_indexes.is_empty() { + drop_file_indexes.into_iter().for_each(|idx| { + new_unknown_files.remove(idx); + }); + } + + // We can stop walking the file system if all files we are interested in have + // been found. + if new_unknown_files.is_empty() { + break; + } + } + } + } + + self.extract_candidates() + } + + #[tracing::instrument(skip_all)] + fn extract_candidates(&mut self) -> Vec { + let changed_content = self.changed_content.drain(..).collect::>(); + + let candidates = parse_all_blobs(read_all_files(changed_content)); + + // Only compute the new candidates and ignore the ones we already have. This is for + // subsequent calls to prevent serializing the entire set of candidates every time. 
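+ // In other words, repeated calls only ever return candidates that were not seen
+ // in any earlier call; the full set stays cached in `self.candidates`.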
+ let mut new_candidates = candidates + .into_par_iter() + .filter(|candidate| !self.candidates.contains(candidate)) + .collect::>(); + + new_candidates.par_sort_unstable(); + + // Track new candidates for subsequent calls + self.candidates.par_extend(new_candidates.clone()); + + new_candidates + } + + #[tracing::instrument(skip_all)] + fn scan_sources(&mut self) { + let Some(walker) = &mut self.walker else { + return; + }; + + for entry in walker.build().filter_map(Result::ok) { + let path = entry.into_path(); + let Ok(metadata) = path.metadata() else { + continue; + }; + if metadata.is_dir() { + self.dirs.push(path); + } else if metadata.is_file() { + let extension = path + .extension() + .and_then(|x| x.to_str()) + .unwrap_or_default(); // In case the file has no extension + + self.extensions.insert(extension.to_owned()); + self.changed_content.push(ChangedContent::File( + path.to_path_buf(), + extension.to_owned(), + )); + + self.files.push(path); + } + } + } + + #[tracing::instrument(skip_all)] + pub fn get_files(&mut self) -> Vec { + self.scan_sources(); + + self.files + .par_iter() + .filter_map(|x| x.clone().into_os_string().into_string().ok()) + .collect() + } + + #[tracing::instrument(skip_all)] + pub fn get_globs(&mut self) -> Vec { + self.scan_sources(); + + for source in self.sources.iter() { + if let SourceEntry::Auto { base } = source { + let globs = resolve_globs((base).to_path_buf(), &self.dirs, &self.extensions); + self.globs.extend(globs); + } else if let SourceEntry::Pattern { base, pattern } = source { + self.globs.push(GlobEntry { + base: base.to_string_lossy().to_string(), + pattern: pattern.to_string(), + }); + } + } + + // Re-optimize the globs to reduce the number of patterns we have to scan. + self.globs = optimize_patterns(&self.globs); + + self.globs.clone() + } + + #[tracing::instrument(skip_all)] + pub fn get_normalized_sources(&mut self) -> Vec { + self.sources + .iter() + .filter_map(|source| match source { + SourceEntry::Auto { base } => Some(GlobEntry { + base: base.to_string_lossy().to_string(), + pattern: "**/*".to_string(), + }), + SourceEntry::Pattern { base, pattern } => Some(GlobEntry { + base: base.to_string_lossy().to_string(), + pattern: pattern.to_string(), + }), + _ => None, + }) + .collect() + } + + #[tracing::instrument(skip_all)] + pub fn get_candidates_with_positions( + &mut self, + changed_content: ChangedContent, + ) -> Vec<(String, usize)> { + let content = read_changed_content(changed_content).unwrap_or_default(); + let original_content = &content; + + // Workaround for legacy upgrades: + // + // `-[]` won't parse in the new parser (`[…]` must contain _something_), but we do need it + // for people using `group-[]` (which we will later replace with `in-[.group]` instead). + let content = content.replace("-[]", "XYZ"); + let offset = content.as_ptr() as usize; + + let mut extractor = Extractor::new(&content[..]); + + extractor + .extract() + .into_par_iter() + .flat_map(|extracted| match extracted { + Extracted::Candidate(s) => { + let i = s.as_ptr() as usize - offset; + let original = &original_content[i..i + s.len()]; + if original.contains_str("-[]") { + return Some(unsafe { + (String::from_utf8_unchecked(original.to_vec()), i) + }); + } + + // SAFETY: When we parsed the candidates, we already guaranteed that the byte + // slices are valid, therefore we don't have to re-check here when we want to + // convert it back to a string. 
+ Some(unsafe { (String::from_utf8_unchecked(s.to_vec()), i) }) + } + + _ => None, + }) + .collect() + } +} + +fn read_changed_content(c: ChangedContent) -> Option> { + let (content, extension) = match c { + ChangedContent::File(file, extension) => match std::fs::read(&file) { + Ok(content) => (content, extension), + Err(e) => { + event!(tracing::Level::ERROR, "Failed to read file: {:?}", e); + return None; + } + }, + + ChangedContent::Content(contents, extension) => (contents.into_bytes(), extension), + }; + + Some(pre_process_input(&content, &extension)) +} + +pub fn pre_process_input(content: &[u8], extension: &str) -> Vec { + use crate::extractor::pre_processors::*; + + match extension { + "clj" | "cljs" | "cljc" => Clojure.process(content), + "cshtml" | "razor" => Razor.process(content), + "haml" => Haml.process(content), + "json" => Json.process(content), + "pug" => Pug.process(content), + "rb" | "erb" => Ruby.process(content), + "slim" => Slim.process(content), + "svelte" => Svelte.process(content), + "vue" => Vue.process(content), + _ => content.to_vec(), + } +} + +#[tracing::instrument(skip_all)] +fn read_all_files(changed_content: Vec) -> Vec> { + event!( + tracing::Level::INFO, + "Reading {:?} file(s)", + changed_content.len() + ); + + changed_content + .into_par_iter() + .filter_map(read_changed_content) + .collect() +} + +#[tracing::instrument(skip_all)] +fn parse_all_blobs(blobs: Vec>) -> Vec { + let mut result: Vec<_> = blobs + .par_iter() + .flat_map(|blob| blob.par_split(|x| *x == b'\n')) + .filter_map(|blob| { + if blob.is_empty() { + return None; + } + + let extracted = crate::extractor::Extractor::new(blob).extract(); + if extracted.is_empty() { + return None; + } + + Some(FxHashSet::from_iter(extracted.into_iter().map( + |x| match x { + Extracted::Candidate(bytes) => bytes, + Extracted::CssVariable(bytes) => bytes, + }, + ))) + }) + .reduce(Default::default, |mut a, b| { + a.extend(b); + a + }) + .into_iter() + .map(|s| unsafe { String::from_utf8_unchecked(s.to_vec()) }) + .collect(); + + // SAFETY: Unstable sort is faster and in this scenario it's also safe because we are + // guaranteed to have unique candidates. + result.par_sort_unstable(); + + result +} + +/// Create a walker for the given sources to detect all the files that we have to scan. +/// +/// The `mtimes` map is used to keep track of the last modified time of each file. This is used to +/// determine if a file or folder has changed since the last scan and we can skip folders that +/// haven't changed. +fn create_walker(sources: Sources) -> Option { + let mtimes: Arc>> = Default::default(); + let mut other_roots: FxHashSet<&PathBuf> = FxHashSet::default(); + let mut first_root: Option<&PathBuf> = None; + let mut ignores: BTreeMap<&PathBuf, BTreeSet> = Default::default(); + + let mut auto_content_roots = FxHashSet::default(); + + for source in sources.iter() { + match source { + SourceEntry::Auto { base } => { + auto_content_roots.insert(base); + if first_root.is_none() { + first_root = Some(base); + } else { + other_roots.insert(base); + } + } + SourceEntry::IgnoredAuto { base } => { + ignores.entry(base).or_default().insert("**/*".to_string()); + } + SourceEntry::Pattern { base, pattern } => { + let mut pattern = pattern.to_string(); + + if first_root.is_none() { + first_root = Some(base); + } else { + other_roots.insert(base); + } + + if !pattern.contains("**") { + // Ensure that the pattern is pinned to the base path. 
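+ // e.g. `src/*.html` becomes `/src/*.html`, which only matches relative to `base`
+ // instead of at any depth like an unanchored gitignore pattern would.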
+ if !pattern.starts_with("/") { + pattern = format!("/{pattern}"); + } + + // Specific patterns should take precedence even over git-ignored files: + ignores + .entry(base) + .or_default() + .insert(format!("!{}", pattern)); + } else { + // Assumption: the pattern we receive will already be brace expanded. So + // `*.{html,jsx}` will result in two separate patterns: `*.html` and `*.jsx`. + if let Some(extension) = Path::new(&pattern).extension() { + // Extend auto source detection to include the extension + ignores + .entry(base) + .or_default() + .insert(format!("!*.{}", extension.to_string_lossy())); + } + } + } + SourceEntry::IgnoredPattern { base, pattern } => { + let mut pattern = pattern.to_string(); + // Ensure that the pattern is pinned to the base path. + if !pattern.starts_with("/") { + pattern = format!("/{pattern}"); + } + ignores.entry(base).or_default().insert(pattern); + } + } + } + + let mut builder = WalkBuilder::new(first_root?); + + // Scan hidden files / directories + builder.hidden(false); + + // Don't respect global gitignore files + builder.git_global(false); + + // By default, allow .gitignore files to be used regardless of whether or not + // a .git directory is present. This is an optimization for when projects + // are first created and may not be in a git repo yet. + builder.require_git(false); + + // If we are in a git repo then require it to ensure that only rules within + // the repo are used. For example, we don't want to consider a .gitignore file + // in the user's home folder if we're in a git repo. + // + // The alternative is using a call like `.parents(false)` but that will + // prevent looking at parent directories for .gitignore files from within + // the repo and that's not what we want. + // + // For example, in a project with this structure: + // + // home + // .gitignore + // my-project + // .gitignore + // apps + // .gitignore + // web + // {root} + // + // We do want to consider all .gitignore files listed: + // - home/.gitignore + // - my-project/.gitignore + // - my-project/apps/.gitignore + // + // However, if a repo is initialized inside my-project then only the following + // make sense for consideration: + // - my-project/.gitignore + // - my-project/apps/.gitignore + // + // Setting the require_git(true) flag conditionally allows us to do this. + for parent in first_root?.ancestors() { + if parent.join(".git").exists() { + builder.require_git(true); + break; + } + } + + for root in other_roots { + builder.add(root); + } + + // Setup auto source detection rules + builder.add_gitignore(auto_source_detection::RULES.clone()); + + // Setup ignores based on `@source` definitions + for (base, patterns) in ignores { + let mut ignore_builder = GitignoreBuilder::new(base); + for pattern in patterns { + ignore_builder.add_line(None, &pattern).unwrap(); + } + let ignore = ignore_builder.build().unwrap(); + builder.add_gitignore(ignore); + } + + builder.filter_entry({ + move |entry| { + let path = entry.path(); + + // Ensure the entries are matching any of the provided source patterns (this is + // necessary for manual-patterns that can filter the file extension) + if path.is_file() { + let mut matches = false; + for source in sources.iter() { + match source { + SourceEntry::Auto { base } => { + if path.starts_with(base) { + matches = true; + break; + } + } + SourceEntry::Pattern { base, pattern } => { + let mut pattern = pattern.to_string(); + // Ensure that the pattern is pinned to the base path. 
+ if !pattern.starts_with("/") { + pattern = format!("/{pattern}"); + } + + // Check if path starts with base, if so, remove the prefix and check the remainder against the pattern + let remainder = path.strip_prefix(base); + if remainder.is_ok_and(|remainder| { + let mut path_str = remainder.to_string_lossy().to_string(); + if !path_str.starts_with("/") { + path_str = format!("/{path_str}"); + } + glob_match(pattern, path_str.as_bytes()) + }) { + matches = true; + break; + } + } + _ => {} + } + } + + if !matches { + return false; + } + } + + let mut mtimes = mtimes.lock().unwrap(); + let current_time = match entry.metadata() { + Ok(metadata) if metadata.is_file() => { + if let Ok(time) = metadata.modified() { + Some(time) + } else { + None + } + } + _ => None, + }; + + let previous_time = + current_time.and_then(|time| mtimes.insert(entry.clone().into_path(), time)); + + match (current_time, previous_time) { + (Some(current), Some(prev)) if prev == current => false, + _ => true, + } + } + }); + + Some(builder) +} + +#[cfg(test)] +mod tests { + use super::{ChangedContent, Scanner}; + + #[test] + fn test_positions() { + let mut scanner = Scanner::new(vec![]); + + for (input, expected) in [ + // Before migrations + ( + r#"
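<div class="tw:flex! tw:sm:block! tw:bg-linear-to-t flex tw:[color:red] tw:in-[.tw\:group]:flex">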
"#, + vec![ + ("class".to_string(), 5), + ("tw:flex!".to_string(), 12), + ("tw:sm:block!".to_string(), 21), + ("tw:bg-linear-to-t".to_string(), 34), + ("flex".to_string(), 52), + ("tw:[color:red]".to_string(), 57), + ("tw:in-[.tw\\:group]:flex".to_string(), 72), + ], + ), + ] { + let candidates = scanner.get_candidates_with_positions(ChangedContent::Content( + input.to_string(), + "html".into(), + )); + assert_eq!(candidates, expected); + } + } +} diff --git a/crates/oxide/src/scanner/sources.rs b/crates/oxide/src/scanner/sources.rs new file mode 100644 index 000000000000..cc0f8805a45b --- /dev/null +++ b/crates/oxide/src/scanner/sources.rs @@ -0,0 +1,228 @@ +use crate::glob::optimize_public_source_entry; +use crate::GlobEntry; +use bexpand::Expression; +use std::path::PathBuf; + +#[derive(Debug, Clone)] +pub struct PublicSourceEntry { + /// Base path of the glob + pub base: String, + + /// Glob pattern + pub pattern: String, + + /// Negated flag + pub negated: bool, +} + +#[derive(Debug, Clone, PartialEq)] +pub enum SourceEntry { + /// Auto source detection + /// + /// Represented by: + /// + /// ```css + /// @source "src";` + /// @source "src/**/*";` + /// ``` + Auto { base: PathBuf }, + + /// Ignored auto source detection + /// + /// Represented by: + /// + /// ```css + /// @source not "src";` + /// @source not "src/**/*";` + /// ``` + IgnoredAuto { base: PathBuf }, + + /// Explicit source pattern regardless of any auto source detection rules + /// + /// Represented by: + /// + /// ```css + /// @source "src/**/*.html";` + /// ``` + Pattern { base: PathBuf, pattern: String }, + + /// Explicit ignored source pattern regardless of any auto source detection rules + /// + /// Represented by: + /// + /// ```css + /// @source not "src/**/*.html";` + /// ``` + IgnoredPattern { base: PathBuf, pattern: String }, +} + +#[derive(Debug, Clone, Default)] +pub struct Sources { + sources: Vec, +} + +impl Sources { + pub fn new(sources: Vec) -> Self { + Self { sources } + } + + pub fn iter(&self) -> impl Iterator { + self.sources.iter() + } +} + +impl PublicSourceEntry { + pub fn from_pattern(dir: PathBuf, pattern: &str) -> Self { + let mut parts = pattern.split_whitespace(); + let _ = parts.next().unwrap_or_default(); + let not_or_pattern = parts.next().unwrap_or_default(); + if not_or_pattern == "not" { + let pattern = parts.next().unwrap_or_default(); + return Self { + base: dir.to_string_lossy().into(), + pattern: pattern[1..pattern.len() - 1].to_string(), + negated: true, + }; + } + + Self { + base: dir.to_string_lossy().into(), + pattern: not_or_pattern[1..not_or_pattern.len() - 1].to_string(), + negated: false, + } + } +} + +/// For each public source entry: +/// +/// 1. Perform brace expansion +/// +/// ```diff +/// - { base: '/', pattern: 'src/{foo,bar}.html'} +/// + { base: '/', pattern: 'src/foo.html'} +/// + { base: '/', pattern: 'src/bar.html'} +/// ``` +/// +/// 2. Hoist static parts, e.g.: +/// +/// ```diff +/// - { base: '/', pattern: 'src/**/*.html'} +/// + { base: '/src', pattern: '**/*.html'} +/// ``` +/// +/// 3. 
Convert to private SourceEntry +/// +pub fn public_source_entries_to_private_source_entries( + sources: Vec, +) -> Vec { + // Perform brace expansion + let expanded_globs = sources + .into_iter() + .flat_map(|source| { + let expression: Result = source.pattern[..].try_into(); + let Ok(expression) = expression else { + return vec![source]; + }; + + expression + .into_iter() + .filter_map(Result::ok) + .map(move |pattern| PublicSourceEntry { + base: source.base.clone(), + pattern: pattern.into(), + negated: source.negated, + }) + .collect::>() + }) + .map(|mut public_source| { + optimize_public_source_entry(&mut public_source); + public_source + }) + .collect::>(); + + // Convert from public SourceEntry to private SourceEntry + expanded_globs + .into_iter() + .map(Into::into) + .collect::>() +} + +/// Convert a public source entry to a source entry +impl From for SourceEntry { + fn from(value: PublicSourceEntry) -> Self { + let auto = value.pattern.ends_with("**/*") + || PathBuf::from(&value.base).join(&value.pattern).is_dir(); + + match (value.negated, auto) { + (false, true) => SourceEntry::Auto { + base: value.base.into(), + }, + (false, false) => SourceEntry::Pattern { + base: value.base.into(), + pattern: value.pattern, + }, + (true, true) => SourceEntry::IgnoredAuto { + base: value.base.into(), + }, + (true, false) => SourceEntry::IgnoredPattern { + base: value.base.into(), + pattern: value.pattern, + }, + } + } +} + +impl From for SourceEntry { + fn from(value: GlobEntry) -> Self { + SourceEntry::Pattern { + base: PathBuf::from(value.base), + pattern: value.pattern, + } + } +} + +impl From for GlobEntry { + fn from(value: SourceEntry) -> Self { + match value { + SourceEntry::Auto { base } => GlobEntry { + base: base.to_string_lossy().into(), + pattern: "**/*".into(), + }, + SourceEntry::Pattern { base, pattern } => GlobEntry { + base: base.to_string_lossy().into(), + pattern: pattern.clone(), + }, + SourceEntry::IgnoredAuto { base } => GlobEntry { + base: base.to_string_lossy().into(), + pattern: "**/*".into(), + }, + SourceEntry::IgnoredPattern { base, pattern } => GlobEntry { + base: base.to_string_lossy().into(), + pattern: pattern.clone(), + }, + } + } +} + +impl From<&SourceEntry> for GlobEntry { + fn from(value: &SourceEntry) -> Self { + match value { + SourceEntry::Auto { base } => GlobEntry { + base: base.to_string_lossy().into(), + pattern: "**/*".into(), + }, + SourceEntry::Pattern { base, pattern } => GlobEntry { + base: base.to_string_lossy().into(), + pattern: pattern.clone(), + }, + SourceEntry::IgnoredAuto { base } => GlobEntry { + base: base.to_string_lossy().into(), + pattern: "**/*".into(), + }, + SourceEntry::IgnoredPattern { base, pattern } => GlobEntry { + base: base.to_string_lossy().into(), + pattern: pattern.clone(), + }, + } + } +} From 72b18cd904574b2231d91d66037182334beb1a4f Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 17:30:22 +0100 Subject: [PATCH 05/26] simplify `scanner` test setup Essentially we used to combine files and globs into a shared `paths`. Now these are split again, and instead of using tuple syntax you can get them from a `ScanResult`. Additionally, we will _not_ add `**/*` (Auto Source Detection) by default, you have to be explicit about this. 
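
In practice a test changes roughly like this (a sketch using the test
helpers touched in this diff):

    // Before: tuple result, `**/*` added implicitly
    let (paths, candidates) = scan(&[("index.html", "")]);

    // After: structured result, auto source detection requested explicitly
    // (the `scan` helper now passes `@source '**/*'` itself)
    let ScanResult { files, globs, normalized_sources, candidates } = scan(&[("index.html", "")]);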
--- crates/oxide/tests/scanner.rs | 435 ++++++++++++++++++++++++---------- 1 file changed, 310 insertions(+), 125 deletions(-) diff --git a/crates/oxide/tests/scanner.rs b/crates/oxide/tests/scanner.rs index c5556aaa262e..e14dc152bc85 100644 --- a/crates/oxide/tests/scanner.rs +++ b/crates/oxide/tests/scanner.rs @@ -8,6 +8,13 @@ mod scanner { use tailwindcss_oxide::*; use tempfile::tempdir; + struct ScanResult { + files: Vec, + globs: Vec, + normalized_sources: Vec, + candidates: Vec, + } + fn create_files_in(dir: &path::Path, paths: &[(&str, &str)]) { // Create the necessary files for (path, contents) in paths { @@ -24,8 +31,8 @@ mod scanner { fn scan_with_globs( paths_with_content: &[(&str, &str)], - globs: Vec<&str>, - ) -> (Vec, Vec) { + source_directives: Vec<&str>, + ) -> ScanResult { // Create a temporary working directory let dir = tempdir().unwrap().into_path(); @@ -38,101 +45,147 @@ mod scanner { let base = format!("{}", dir.display()).replace('\\', "/"); // Resolve all content paths for the (temporary) current working directory - let mut sources: Vec = globs + let sources: Vec = source_directives .iter() - .map(|x| GlobEntry { - base: base.clone(), - pattern: x.to_string(), - }) + .map(|str| PublicSourceEntry::from_pattern(base.clone().into(), str)) .collect(); - sources.push(GlobEntry { - base: base.clone(), - pattern: "**/*".to_string(), - }); - - let mut scanner = Scanner::new(Some(sources)); + let mut scanner = Scanner::new(sources); let candidates = scanner.scan(); - let mut paths: Vec<_> = scanner.get_files(); - - for glob in scanner.get_globs() { - paths.push(format!("{}{}{}", glob.base, "/", glob.pattern)); - } - - let parent_dir = + let base_dir = format!("{}{}", dunce::canonicalize(&base).unwrap().display(), "/").replace('\\', "/"); - paths = paths - .into_iter() - .map(|x| { + // Get all scanned files as strings relative to the base directory + let mut files = scanner + .get_files() + .iter() // Normalize paths to use unix style separators - x.replace('\\', "/").replace(&parent_dir, "") - }) - .collect(); - - // Sort the output for easier comparison (depending on internal data structure the order - // _could_ be random) - paths.sort(); + .map(|file| file.replace('\\', "/").replace(&base_dir, "")) + .collect::>(); + files.sort(); - (paths, candidates) + // Get all scanned globs as strings relative to the base directory + let mut globs = scanner + .get_globs() + .iter() + .map(|glob| { + if glob.pattern.starts_with('/') { + format!("{}{}", glob.base, glob.pattern) + } else { + format!("{}/{}", glob.base, glob.pattern) + } + }) + // Normalize paths to use unix style separators + .map(|file| file.replace('\\', "/").replace(&base_dir, "")) + .collect::>(); + globs.sort(); + + // Get all normalized sources as strings relative to the base directory + let mut normalized_sources = scanner + .get_normalized_sources() + .iter() + .map(|glob| { + if glob.pattern.starts_with('/') { + format!("{}{}", glob.base, glob.pattern) + } else { + format!("{}/{}", glob.base, glob.pattern) } + }) + // Normalize paths to use unix style separators + .map(|file| file.replace('\\', "/").replace(&base_dir, "")) + .collect::>(); + normalized_sources.sort(); - fn scan(paths_with_content: &[(&str, &str)]) -> (Vec, Vec) { - scan_with_globs(paths_with_content, vec![]) + ScanResult { + files, + globs, + normalized_sources, + candidates, + } } - fn test(paths_with_content: &[(&str, &str)]) -> Vec { - scan(paths_with_content).0 + fn scan(paths_with_content: &[(&str, &str)]) -> ScanResult { + 
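+ // `**/*` is no longer added implicitly; tests opt in to auto source detection here.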
scan_with_globs(paths_with_content, vec!["@source '**/*'"]) } #[test] fn it_should_work_with_a_set_of_root_files() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. + } = scan(&[ ("index.html", ""), ("a.html", ""), ("b.html", ""), ("c.html", ""), ]); - assert_eq!(globs, vec!["*", "a.html", "b.html", "c.html", "index.html"]); + assert_eq!(files, vec!["a.html", "b.html", "c.html", "index.html"]); + assert_eq!(globs, vec!["*"]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_work_with_a_set_of_root_files_and_ignore_ignored_files() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. + } = scan(&[ (".gitignore", "b.html"), ("index.html", ""), ("a.html", ""), ("b.html", ""), ("c.html", ""), ]); - assert_eq!(globs, vec!["*", "a.html", "c.html", "index.html"]); + assert_eq!(files, vec!["a.html", "c.html", "index.html"]); + assert_eq!(globs, vec!["*"]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_list_all_files_in_the_public_folder_explicitly() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. + } = scan(&[ ("index.html", ""), ("public/a.html", ""), ("public/b.html", ""), ("public/c.html", ""), + ("public/nested/c.html", ""), + ("public/deeply/nested/c.html", ""), ]); + assert_eq!( - globs, + files, vec![ - "*", "index.html", "public/a.html", "public/b.html", "public/c.html", + "public/deeply/nested/c.html", + "public/nested/c.html", ] ); + assert_eq!(globs, vec!["*"]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_list_nested_folders_explicitly_in_the_public_folder() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. + } = scan(&[ ("index.html", ""), ("public/a.html", ""), ("public/b.html", ""), @@ -143,10 +196,10 @@ mod scanner { ("public/nested/again/a.html", ""), ("public/very/deeply/nested/a.html", ""), ]); + assert_eq!( - globs, + files, vec![ - "*", "index.html", "public/a.html", "public/b.html", @@ -158,72 +211,119 @@ mod scanner { "public/very/deeply/nested/a.html", ] ); + assert_eq!(globs, vec!["*",]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_list_all_files_in_the_public_folder_explicitly_except_ignored_files() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. + } = scan(&[ (".gitignore", "public/b.html\na.html"), ("index.html", ""), ("public/a.html", ""), ("public/b.html", ""), ("public/c.html", ""), ]); - assert_eq!(globs, vec!["*", "index.html", "public/c.html",]); + + assert_eq!(files, vec!["index.html", "public/c.html",]); + assert_eq!(globs, vec!["*"]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_use_a_glob_for_top_level_folders() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. 
+ } = scan(&[ ("index.html", ""), ("src/a.html", ""), ("src/b.html", ""), ("src/c.html", ""), ]); - assert_eq!(globs, vec!["*", - "index.html", + + assert_eq!( + files, + vec!["index.html", "src/a.html", "src/b.html", "src/c.html"] + ); + assert_eq!(globs, vec![ + "*", "src/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", - "src/a.html", - "src/b.html", - "src/c.html" ]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_ignore_binary_files() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. + } = scan(&[ ("index.html", ""), ("a.mp4", ""), ("b.png", ""), ("c.lock", ""), ]); - assert_eq!(globs, vec!["*", "index.html"]); + + assert_eq!(files, vec!["index.html"]); + assert_eq!(globs, vec!["*"]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_ignore_known_extensions() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. + } = scan(&[ ("index.html", ""), ("a.css", ""), ("b.sass", ""), ("c.less", ""), ]); - assert_eq!(globs, vec!["*", "index.html"]); + + assert_eq!(files, vec!["index.html"]); + assert_eq!(globs, vec!["*"]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_ignore_known_files() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. + } = scan(&[ ("index.html", ""), ("package-lock.json", ""), ("yarn.lock", ""), ]); - assert_eq!(globs, vec!["*", "index.html"]); + + assert_eq!(files, vec!["index.html"]); + assert_eq!(globs, vec!["*"]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_ignore_and_expand_nested_ignored_folders() { - let globs = test(&[ + let ScanResult { + files, + globs, + normalized_sources, + .. 
+ } = scan(&[ // Explicitly listed root files ("foo.html", ""), ("bar.html", ""), @@ -267,38 +367,28 @@ mod scanner { ]); assert_eq!( - globs, + files, vec![ - "*", "bar.html", "baz.html", "foo.html", - "nested-a/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", "nested-a/bar.html", "nested-a/baz.html", "nested-a/foo.html", - "nested-b/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", "nested-b/deeply-nested/bar.html", "nested-b/deeply-nested/baz.html", "nested-b/deeply-nested/foo.html", - "nested-c/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", "nested-c/bar.html", "nested-c/baz.html", "nested-c/foo.html", - "nested-c/sibling-folder/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", "nested-c/sibling-folder/bar.html", "nested-c/sibling-folder/baz.html", "nested-c/sibling-folder/foo.html", - "nested-d/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", "nested-d/bar.html", "nested-d/baz.html", "nested-d/foo.html", - "nested-d/very/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", - "nested-d/very/deeply/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", - "nested-d/very/deeply/nested/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", "nested-d/very/deeply/nested/bar.html", "nested-d/very/deeply/nested/baz.html", - "nested-d/very/deeply/nested/directory/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", "nested-d/very/deeply/nested/directory/again/foo.html", "nested-d/very/deeply/nested/directory/bar.html", "nested-d/very/deeply/nested/directory/baz.html", @@ -306,6 +396,19 @@ mod scanner { "nested-d/very/deeply/nested/foo.html", ] ); + assert_eq!(globs, vec![ + "*", + "nested-a/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "nested-b/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "nested-c/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + 
"nested-c/sibling-folder/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "nested-d/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "nested-d/very/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "nested-d/very/deeply/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "nested-d/very/deeply/nested/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "nested-d/very/deeply/nested/directory/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + ]); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] @@ -314,7 +417,11 @@ mod scanner { ignores.push_str("# md:font-bold\n"); ignores.push_str("foo.html\n"); - let candidates = scan(&[ + let ScanResult { + candidates, + normalized_sources, + .. + } = scan(&[ // The gitignore file is used to filter out files but not scanned for candidates (".gitignore", &ignores), // A file that should definitely be scanned @@ -333,8 +440,7 @@ mod scanner { ("index2.svelte", ""), ("index3.svelte", ""), ("index4.svelte", ""), - ]) - .1; + ]); assert_eq!( candidates, @@ -351,32 +457,48 @@ mod scanner { "underline" ] ); + assert_eq!(normalized_sources, vec!["**/*"]); } #[test] fn it_should_be_possible_to_scan_in_the_parent_directory() { - let candidates = scan_with_globs( + let ScanResult { + candidates, + normalized_sources, + .. + } = scan_with_globs( &[("foo/bar/baz/foo.html", "content-['foo.html']")], - vec!["./foo/bar/baz/.."], - ) - .1; + vec!["@source '**/*'", "@source './foo/bar/baz/..'"], + ); assert_eq!(candidates, vec!["content-['foo.html']"]); + assert_eq!(normalized_sources, vec!["**/*", "foo/bar/**/*"]); } #[test] fn it_should_scan_files_without_extensions() { // These look like folders, but they are files - let candidates = - scan_with_globs(&[("my-file", "content-['my-file']")], vec!["./my-file"]).1; + let ScanResult { + candidates, + normalized_sources, + .. + } = scan_with_globs( + &[("my-file", "content-['my-file']")], + vec!["@source '**/*'", "@source './my-file'"], + ); assert_eq!(candidates, vec!["content-['my-file']"]); + assert_eq!(normalized_sources, vec!["**/*", "my-file"]); } #[test] fn it_should_scan_folders_with_extensions() { // These look like files, but they are folders - let candidates = scan_with_globs( + let ScanResult { + candidates, + normalized_sources, + .. 
+ } = scan_with_globs( &[ ( "my-folder.templates/foo.html", @@ -387,9 +509,12 @@ mod scanner { "content-['my-folder.bin/foo.html']", ), ], - vec!["./my-folder.templates", "./my-folder.bin"], - ) - .1; + vec![ + "@source '**/*'", + "@source './my-folder.templates'", + "@source './my-folder.bin'", + ], + ); assert_eq!( candidates, @@ -398,26 +523,38 @@ mod scanner { "content-['my-folder.templates/foo.html']", ] ); + assert_eq!( + normalized_sources, + vec!["**/*", "my-folder.bin/**/*", "my-folder.templates/**/*"] + ); } #[test] fn it_should_scan_content_paths() { - let candidates = scan_with_globs( + let ScanResult { + candidates, + normalized_sources, + .. + } = scan_with_globs( &[ // We know that `.styl` extensions are ignored, so they are not covered by auto content // detection. ("foo.styl", "content-['foo.styl']"), ], - vec!["*.styl"], - ) - .1; + vec!["@source '**/*'", "@source '*.styl'"], + ); assert_eq!(candidates, vec!["content-['foo.styl']"]); + assert_eq!(normalized_sources, vec!["**/*", "*.styl"]); } #[test] fn it_should_scan_next_dynamic_folders() { - let candidates = scan_with_globs( + let ScanResult { + candidates, + normalized_sources, + .. + } = scan_with_globs( &[ // We know that `.styl` extensions are ignored, so they are not covered by auto content // detection. @@ -426,9 +563,8 @@ mod scanner { ("app/[[...slug]]/page.styl", "content-['[[...slug]]']"), ("app/(theme)/page.styl", "content-['(theme)']"), ], - vec!["./**/*.{styl}"], - ) - .1; + vec!["@source '**/*'", "@source './**/*.{styl}'"], + ); assert_eq!( candidates, @@ -439,6 +575,7 @@ mod scanner { "content-['[slug]']", ], ); + assert_eq!(normalized_sources, vec!["**/*", "**/*.styl"]); } #[test] @@ -461,12 +598,13 @@ mod scanner { // Get POSIX-style absolute path let full_path = format!("{}", dir.display()).replace('\\', "/"); - let sources = vec![GlobEntry { + let sources = vec![PublicSourceEntry { base: full_path.clone(), pattern: full_path.clone(), + negated: false, }]; - let mut scanner = Scanner::new(Some(sources)); + let mut scanner = Scanner::new(sources); let candidates = scanner.scan(); // We've done the initial scan and found the files @@ -481,18 +619,23 @@ mod scanner { #[test] fn it_should_scan_content_paths_even_when_they_are_git_ignored() { - let candidates = scan_with_globs( + let ScanResult { + candidates, + normalized_sources, + .. + } = scan_with_globs( &[ (".gitignore", "foo.styl"), // We know that `.styl` extensions are ignored, so they are not covered by auto content // detection. 
("foo.styl", "content-['foo.styl']"), ], - vec!["foo.styl"], - ) - .1; + // But explicitly including them should still work + vec!["@source '**/*'", "@source 'foo.styl'"], + ); assert_eq!(candidates, vec!["content-['foo.styl']"]); + assert_eq!(normalized_sources, vec!["**/*", "foo.styl"]); } #[test] @@ -513,17 +656,11 @@ mod scanner { ); let sources = vec![ - GlobEntry { - base: dir.join("project-a").to_string_lossy().to_string(), - pattern: "**/*".to_owned(), - }, - GlobEntry { - base: dir.join("project-b").to_string_lossy().to_string(), - pattern: "**/*".to_owned(), - }, + PublicSourceEntry::from_pattern(dir.join("project-a"), "@source '**/*'"), + PublicSourceEntry::from_pattern(dir.join("project-b"), "@source '**/*'"), ]; - let mut scanner = Scanner::new(Some(sources)); + let mut scanner = Scanner::new(sources); let candidates = scanner.scan(); // We've done the initial scan and found the files @@ -668,18 +805,41 @@ mod scanner { ], ); - let sources = vec![GlobEntry { - base: dir - .join("home/project/apps/web") + let sources = vec![ + PublicSourceEntry::from_pattern( + dir.join("home/project/apps/web") .to_string_lossy() - .to_string(), - pattern: "**/*".to_owned(), - }]; + .to_string() + .into(), + "@source '**/*'", + ), + PublicSourceEntry::from_pattern( + dir.join("home/project/apps/web") + .to_string_lossy() + .to_string() + .into(), + "@source '../admin'", + ), + PublicSourceEntry::from_pattern( + dir.join("home/project/apps/web") + .to_string_lossy() + .to_string() + .into(), + "@source '../dashboard/*.html'", + ), + ]; - let candidates = Scanner::new(Some(sources.clone())).scan(); + let candidates = Scanner::new(sources.clone()).scan(); // All ignore files are applied because there's no git repo - assert_eq!(candidates, vec!["content-['index.html']".to_owned(),]); + assert_eq!( + candidates, + vec![ + "content-['home/project/apps/admin/index.html']", + "content-['home/project/apps/dashboard/index.html']", + "content-['index.html']" + ] + ); // Initialize `home` as a git repository and scan again // The results should be the same as before @@ -687,9 +847,16 @@ mod scanner { .arg("init") .current_dir(dir.join("home")) .output(); - let candidates = Scanner::new(Some(sources.clone())).scan(); + let candidates = Scanner::new(sources.clone()).scan(); - assert_eq!(candidates, vec!["content-['index.html']".to_owned(),]); + assert_eq!( + candidates, + vec![ + "content-['home/project/apps/admin/index.html']", + "content-['home/project/apps/dashboard/index.html']", + "content-['index.html']" + ] + ); // Drop the .git folder fs::remove_dir_all(dir.join("home/.git")).unwrap(); @@ -699,13 +866,15 @@ mod scanner { .arg("init") .current_dir(dir.join("home/project")) .output(); - let candidates = Scanner::new(Some(sources.clone())).scan(); + let candidates = Scanner::new(sources.clone()).scan(); assert_eq!( candidates, vec![ - "content-['ignore-home.html']".to_owned(), - "content-['index.html']".to_owned(), + "content-['home/project/apps/admin/index.html']", + "content-['home/project/apps/dashboard/index.html']", + "content-['ignore-home.html']", + "content-['index.html']" ] ); @@ -717,14 +886,16 @@ mod scanner { .arg("init") .current_dir(dir.join("home/project/apps")) .output(); - let candidates = Scanner::new(Some(sources.clone())).scan(); + let candidates = Scanner::new(sources.clone()).scan(); assert_eq!( candidates, vec![ - "content-['ignore-home.html']".to_owned(), - "content-['ignore-project.html']".to_owned(), - "content-['index.html']".to_owned(), + 
"content-['home/project/apps/admin/index.html']", + "content-['home/project/apps/dashboard/index.html']", + "content-['ignore-home.html']", + "content-['ignore-project.html']", + "content-['index.html']" ] ); @@ -736,7 +907,21 @@ mod scanner { .arg("init") .current_dir(dir.join("home/project/apps/web")) .output(); - let candidates = Scanner::new(Some(sources.clone())).scan(); + + let candidates = Scanner::new(sources.clone()).scan(); + + assert_eq!( + candidates, + vec![ + "content-['home/project/apps/admin/index.html']", + "content-['home/project/apps/dashboard/index.html']", + "content-['ignore-apps.html']", + "content-['ignore-home.html']", + "content-['ignore-project.html']", + "content-['index.html']", + ] + ); + } assert_eq!( candidates, From cd6ccb2d79f71b0fbd38d065c9c26f6b2019921d Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 17:38:28 +0100 Subject: [PATCH 06/26] add new Scanner tests --- crates/oxide/tests/scanner.rs | 613 +++++++++++++++++++++++++++++++++- 1 file changed, 606 insertions(+), 7 deletions(-) diff --git a/crates/oxide/tests/scanner.rs b/crates/oxide/tests/scanner.rs index e14dc152bc85..47dceb5b5134 100644 --- a/crates/oxide/tests/scanner.rs +++ b/crates/oxide/tests/scanner.rs @@ -61,7 +61,7 @@ mod scanner { let mut files = scanner .get_files() .iter() - // Normalize paths to use unix style separators + // Normalize paths to use unix style separators .map(|file| file.replace('\\', "/").replace(&base_dir, "")) .collect::>(); files.sort(); @@ -91,7 +91,7 @@ mod scanner { format!("{}{}", glob.base, glob.pattern) } else { format!("{}/{}", glob.base, glob.pattern) - } + } }) // Normalize paths to use unix style separators .map(|file| file.replace('\\', "/").replace(&base_dir, "")) @@ -298,6 +298,20 @@ mod scanner { assert_eq!(normalized_sources, vec!["**/*"]); } + #[test] + fn it_should_find_new_extensions() { + let ScanResult { + files, + globs, + normalized_sources, + .. 
+ } = scan(&[("src/index.my-extension", "")]); + + assert_eq!(files, vec!["src/index.my-extension"]); + assert_eq!(globs, vec!["*", "src/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,my-extension,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}"]); + assert_eq!(normalized_sources, vec!["**/*"]); + } + #[test] fn it_should_ignore_known_files() { let ScanResult { @@ -769,6 +783,244 @@ mod scanner { ); } + #[test] + fn it_should_ignore_negated_custom_sources() { + let ScanResult { + candidates, + files, + globs, + normalized_sources, + } = scan_with_globs( + &[ + ("src/index.ts", "content-['src/index.ts']"), + ("src/colors/red.jsx", "content-['src/colors/red.jsx']"), + ("src/colors/blue.tsx", "content-['src/colors/blue.tsx']"), + ("src/colors/green.tsx", "content-['src/colors/green.tsx']"), + ("src/utils/string.ts", "content-['src/utils/string.ts']"), + ("src/utils/date.ts", "content-['src/utils/date.ts']"), + ("src/utils/file.ts", "content-['src/utils/file.ts']"), + ( + "src/admin/foo/template.html", + "content-['src/admin/template.html']", + ), + ( + "src/templates/index.html", + "content-['src/templates/index.html']", + ), + ], + vec![ + "@source '**/*'", + "@source not 'src/index.ts'", + "@source not '**/*.{jsx,tsx}'", + "@source not 'src/utils'", + "@source not 'dist'", + ], + ); + + assert_eq!( + candidates, + vec![ + "content-['src/admin/template.html']", + "content-['src/templates/index.html']", + ] + ); + + assert_eq!( + files, + vec![ + "src/admin/foo/template.html", + "src/templates/index.html", + // These files are ignored and thus do not need to be watched: + + // "src/colors/blue.tsx", + // "src/colors/green.tsx", + // "src/colors/red.jsx", + // "src/index.ts", + // "src/utils/date.ts", + // "src/utils/file.ts", + // "src/utils/string.ts" + ] + ); + assert_eq!( + globs, + vec![ + "*", + "src/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "src/admin/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "src/colors/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "src/templates/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + ] + ); + + assert_eq!(normalized_sources, vec!["**/*",]); + } + + #[test] + fn it_should_include_defined_extensions_that_are_ignored_by_default() { + let ScanResult { + candidates, + files, + globs, + normalized_sources, + } = scan_with_globs( + // Typically skipped + &[ + ("src/index.exe", "content-['src/index.exe']"), + ("src/index.bin", "content-['src/index.bin']"), + ("out/out.exe", "content-['out/out.exe']"), + ], + // But explicitly included + vec!["@source '**/*'", "@source 'src/**/*.{exe,bin}'"], + ); + + assert_eq!( + candidates, + vec!["content-['src/index.bin']", "content-['src/index.exe']",] + ); + assert_eq!(files, vec!["src/index.bin", "src/index.exe",]); + assert_eq!( + globs, + vec![ + "*", + // Contains `.exe` and `.bin` in the list + 
"out/**/*.{aspx,astro,bin,cjs,cts,eex,erb,exe,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", + "src/{**/*.bin,**/*.exe,**/*.{aspx,astro,bin,cjs,cts,eex,erb,exe,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}}", + ] + ); + assert_eq!( + normalized_sources, + vec!["**/*", "src/**/*.bin", "src/**/*.exe"] + ); + } + + #[test] + fn it_should_work_with_manual_glob_only() { + let ScanResult { + candidates, + files, + globs, + normalized_sources, + } = scan_with_globs( + &[ + ("index.html", "content-['index.html']"), + ("src/index.html", "content-['src/index.html']"), + ("src/ignore.html", "content-['src/ignore.html']"), + ("src/admin/index.html", "content-['src/admin/index.html']"), + ("src/admin/ignore.html", "content-['src/admin/ignore.html']"), + ( + "src/dashboard/index.html", + "content-['src/dashboard/index.html']", + ), + ( + "src/dashboard/ignore.html", + "content-['src/dashboard/ignore.html']", + ), + ("src/lib.ts", "content-['src/lib.ts']"), + ], + vec![ + "@source './src/**/*.html'", + "@source not './src/index.html'", + "@source not './src/**/ignore.html'", + ], + ); + + assert_eq!( + candidates, + vec![ + "content-['src/admin/index.html']", + "content-['src/dashboard/index.html']", + ] + ); + + assert_eq!( + files, + vec!["src/admin/index.html", "src/dashboard/index.html",] + ); + assert_eq!(globs, vec!["src/**/*.html"]); + assert_eq!(normalized_sources, vec!["src/**/*.html"]); + } + + #[test] + fn it_respects_gitignore_in_workspace_root2() { + let ScanResult { + candidates, + files, + globs, + normalized_sources, + } = scan_with_globs( + &[ + (".gitignore", "ignore-1.html\nweb/ignore-2.html"), + ("src/index.html", "content-['src/index.html']"), + ("web/index.html", "content-['web/index.html']"), + ("web/ignore-1.html", "content-['web/ignore-1.html']"), + ("web/ignore-2.html", "content-['web/ignore-2.html']"), + ], + vec!["@source './src'", "@source './web'"], + ); + + assert_eq!( + candidates, + vec!["content-['src/index.html']", "content-['web/index.html']",] + ); + + assert_eq!(files, vec!["src/index.html", "web/index.html",]); + assert_eq!(globs, vec!["src/*", "web/*",]); + assert_eq!(normalized_sources, vec!["src/**/*", "web/**/*",]); + } + + #[test] + fn it_includes_skipped_by_default_extensions_with_a_specific_source() { + let ScanResult { + candidates, + files, + globs, + normalized_sources, + } = scan_with_globs( + &[ + ("src/logo.jpg", "content-['/src/logo.jpg']"), + ("src/logo.png", "content-['/src/logo.png']"), + ], + vec!["@source './src/logo.{jpg,png}'"], + ); + + assert_eq!( + candidates, + vec!["content-['/src/logo.jpg']", "content-['/src/logo.png']"] + ); + assert_eq!(files, vec!["src/logo.jpg", "src/logo.png"]); + assert!(globs.is_empty()); + assert_eq!(normalized_sources, vec!["src/logo.jpg", "src/logo.png"]); + } + + #[test] + fn it_respects_gitignore_in_workspace_root_for_manual_globs() { + let ScanResult { + candidates, + files, + globs, + normalized_sources, + } = scan_with_globs( + &[ + (".gitignore", "ignore-1.html\n/web/ignore-2.html"), + ("web/index.html", "content-['web/index.html']"), + ("web/ignore-1.html", "content-['web/ignore-1.html']"), + ("web/ignore-2.html", "content-['web/ignore-2.html']"), + ], + vec!["@source './web'", "@source './web/ignore-1.html'"], + ); + assert_eq!( + candidates, + vec![ + "content-['web/ignore-1.html']", + 
"content-['web/index.html']", + ] + ); + + assert_eq!(files, vec!["web/ignore-1.html", "web/index.html",]); + assert_eq!(globs, vec!["web/*"]); + assert_eq!(normalized_sources, vec!["web/**/*", "web/ignore-1.html"]); + } + #[test] fn skips_ignore_files_outside_of_a_repo() { // Create a temporary working directory @@ -802,13 +1054,23 @@ mod scanner { "home/project/apps/web/ignore-web.html", "content-['ignore-web.html']", ), + // Auto content detection outside of `web/` + ( + "home/project/apps/admin/index.html", + "content-['home/project/apps/admin/index.html']", + ), + // Manual sources outside of `web/` + ( + "home/project/apps/dashboard/index.html", + "content-['home/project/apps/dashboard/index.html']", + ), ], ); let sources = vec![ PublicSourceEntry::from_pattern( dir.join("home/project/apps/web") - .to_string_lossy() + .to_string_lossy() .to_string() .into(), "@source '**/*'", @@ -923,14 +1185,351 @@ mod scanner { ); } + #[test] + fn test_explicitly_ignore_explicitly_allowed_files() { + // Create a temporary working directory + let dir = tempdir().unwrap().into_path(); + + // Create files + create_files_in( + &dir, + &[ + ("src/keep-me.html", "content-['keep-me.html']"), + ("src/ignore-me.html", "content-['ignore-me.html']"), + ], + ); + + let sources = vec![ + PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*.html'"), + PublicSourceEntry::from_pattern(dir.clone(), "@source not 'src/ignore-me.html'"), + ]; + + let candidates = Scanner::new(sources.clone()).scan(); + + assert_eq!(candidates, vec!["content-['keep-me.html']"]); + } + + #[test] + fn test_works_with_filenames_containing_glob_characters() { + // Create a temporary working directory + let dir = tempdir().unwrap().into_path(); + + // Create files + create_files_in( + &dir, + &[ + ("src/app/[foo]/ignore-me.html", "content-['ignore-me.html']"), + ("src/app/[foo]/keep-me.html", "content-['keep-me.html']"), + ], + ); + + let sources = vec![ + PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*'"), + PublicSourceEntry::from_pattern( + dir.clone(), + "@source not 'src/app/[foo]/ignore*.html'", + ), + ]; + + let candidates = Scanner::new(sources.clone()).scan(); + + assert_eq!(candidates, vec!["content-['keep-me.html']"]); + } + + #[test] + fn test_ignore_files_can_be_included_with_custom_source_rule() { + // Create a temporary working directory + let dir = tempdir().unwrap().into_path(); + + // Create files + create_files_in( + &dir, + &[("src/keep-me.html", "content-['src/keep-me.html']")], + ); + + let mut scanner = Scanner::new(vec![ + PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*.html'"), + PublicSourceEntry::from_pattern( + dir.clone(), + "@source not 'src/ignored-by-source-not.html'", + ), + ]); + + let candidates = scanner.scan(); + assert_eq!(candidates, vec!["content-['src/keep-me.html']"]); + + // Create new files that should definitely be ignored + create_files_in( + &dir, + &[ + // Create new file that matches the `@source '…'` glob + ("src/new-file.html", "content-['src/new-file.html']"), + // Create new file that is ignored based on file extension + ( + "src/ignore-by-extension.bin", + "content-['src/ignore-by-extension.bin']", + ), + // Create a file that is ignored based on the `.gitignore` file + (".gitignore", "src/ignored-by-gitignore.html"), + ( + "src/ignored-by-gitignore.html", + "content-['src/ignored-by-gitignore.html']", + ), + // Create a file that is ignored by the `@source not '…'` + ( + "src/ignored-by-source-not.html", + "content-['src/ignored-by-source-not.html']", + 
), + ], + ); + + let candidates = scanner.scan(); + + assert_eq!( + candidates, + vec![ + // Ignored by git ignore BUT included by `@source "**/*.html"` + "content-['src/ignored-by-gitignore.html']", + "content-['src/keep-me.html']", + "content-['src/new-file.html']" + ] + ); + } + + #[test] + fn test_allow_default_ignored_files() { + // Create a temporary working directory + let dir = tempdir().unwrap().into_path(); + + // Create files + create_files_in(&dir, &[("foo.styl", "content-['foo.styl']")]); + + let sources = vec![PublicSourceEntry::from_pattern( + dir.clone(), + "@source '**/*'", + )]; + + let mut scanner = Scanner::new(sources.clone()); + + let candidates = scanner.scan(); + assert!(candidates.is_empty()); + + // Explicitly allow `.styl` files + let mut scanner = Scanner::new(vec![ + PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*'"), + PublicSourceEntry::from_pattern(dir.clone(), "@source '*.styl'"), + ]); + + let candidates = scanner.scan(); + assert_eq!(candidates, vec!["content-['foo.styl']"]); + } + + #[test] + fn test_allow_default_ignored_files_via_gitignore() { + // Create a temporary working directory + let dir = tempdir().unwrap().into_path(); + + // Create files + create_files_in( + &dir, + &[ + ("index.html", "content-['index.html']"), + (".gitignore", "index.html"), + ], + ); + + let mut scanner = Scanner::new(vec![PublicSourceEntry::from_pattern( + dir.clone(), + "@source '**/*'", + )]); + + let candidates = scanner.scan(); + assert!(candidates.is_empty()); + + let mut scanner = Scanner::new(vec![ + PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*'"), + PublicSourceEntry::from_pattern(dir.clone(), "@source './*.html'"), + ]); + + let candidates = scanner.scan(); + assert_eq!(candidates, vec!["content-['index.html']"]); + } + + #[test] + fn test_allow_explicit_node_modules_paths() { + // Create a temporary working directory + let dir = tempdir().unwrap().into_path(); + + // Create files + create_files_in( + &dir, + &[ + // Current project + ("src/index.html", "content-['src/index.html']"), + // Ignore file + (".gitignore", "node_modules"), + // Library ignored by default + ( + "node_modules/my-ui-lib/index.html", + "content-['node_modules/my-ui-lib/index.html']", + ), + ], + ); + + // Default auto source detection + let sources = vec![PublicSourceEntry::from_pattern(dir.clone(), "@source './'")]; + + let mut scanner = Scanner::new(sources.clone()); + + let candidates = scanner.scan(); + assert_eq!(candidates, vec!["content-['src/index.html']"]); + + // Explicitly listing all `*.html` files, should not include `node_modules` because it's + // ignored + let sources = vec![PublicSourceEntry::from_pattern( + dir.clone(), + "@source '**/*.html'", + )]; + + let mut scanner = Scanner::new(sources.clone()); + let candidates = scanner.scan(); + assert_eq!(candidates, vec!["content-['src/index.html']"]); + + // Explicitly listing all `*.html` files + // Explicitly list the `node_modules/my-ui-lib` + // + let sources = vec![ + PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*.html'"), + PublicSourceEntry::from_pattern(dir.clone(), "@source 'node_modules/my-ui-lib'"), + ]; + + let mut scanner = Scanner::new(sources.clone()); + let candidates = scanner.scan(); assert_eq!( candidates, vec![ - "content-['ignore-apps.html']".to_owned(), - "content-['ignore-home.html']".to_owned(), - "content-['ignore-project.html']".to_owned(), - "content-['index.html']".to_owned(), + "content-['node_modules/my-ui-lib/index.html']", + "content-['src/index.html']" ] 
); } + + // TODO: external(…) so that `.gitignore` from main project doesn't apply to external projects + #[test] + #[ignore] + fn test_ignore_files_in_node_modules() { + // Create a temporary working directory + let dir = tempdir().unwrap().into_path(); + + // Create files + create_files_in( + &dir, + &[ + (".gitignore", "node_modules\ndist"), + ( + "node_modules/my-ui-lib/dist/index.html", + "content-['node_modules/my-ui-lib/dist/index.html']", + ), + ], + ); + + // Explicitly listing all `*.html` files, should not include `node_modules` because it's + // ignored + let sources = vec![ + PublicSourceEntry::from_pattern(dir.clone(), "@source './'"), + PublicSourceEntry::from_pattern(dir.clone(), "@source './node_modules/my-ui-lib'"), + ]; + + let mut scanner = Scanner::new(sources.clone()); + let candidates = scanner.scan(); + assert_eq!( + candidates, + vec!["content-['node_modules/my-ui-lib/dist/index.html']"] + ); + } + + #[test] + fn test_manually_scanning_files_should_follow_all_rules() { + // Create a temporary working directory + let dir = tempdir().unwrap().into_path(); + + // Create files + create_files_in( + &dir, + &[ + // Ignore all `.jsx` files, and all `generated` folders + (".gitignore", "*.jsx\ngenerated/"), + // .tsx files are allowed + ( + "src/components/button.tsx", + "content-['src/components/button.tsx']", + ), + // .jsx files are not allowed + ( + "src/components/button.jsx", + "content-['src/components/button.jsx']", + ), + ], + ); + + let mut scanner = Scanner::new(vec![PublicSourceEntry::from_pattern( + dir.clone(), + "@source '**/*'", + )]); + + let candidates = scanner.scan(); + assert_eq!(candidates, vec!["content-['src/components/button.tsx']"]); + + // Create 2 new files, one "good" and one "bad" file, and manually scan them. This should + // only return the "good" file because the "bad" one is ignored by a `.gitignore` file. + create_files_in( + &dir, + &[ + ( + "src/components/good.tsx", + "content-['src/components/good.tsx']", + ), + ( + "src/components/bad.jsx", + "content-['src/components/bad.jsx']", + ), + ], + ); + + let candidates = scanner.scan_content(vec![ + ChangedContent::File(dir.join("src/components/good.tsx"), "tsx".to_owned()), + ChangedContent::File(dir.join("src/components/bad.jsx"), "jsx".to_owned()), + ]); + + assert_eq!(candidates, vec!["content-['src/components/good.tsx']"]); + + // Create a generated file in a nested folder that is ignored by a `.gitignore` file higher + // up the tree. + create_files_in( + &dir, + &[ + ( + "src/components/generated/bad.tsx", + "content-['src/components/generated/bad.tsx']", + ), + ( + "src/components/generated/bad.jsx", + "content-['src/components/generated/bad.jsx']", + ), + ], + ); + + let candidates = scanner.scan_content(vec![ + ChangedContent::File( + dir.join("src/components/generated/bad.tsx"), + "tsx".to_owned(), + ), + ChangedContent::File( + dir.join("src/components/generated/bad.jsx"), + "jsx".to_owned(), + ), + ]); + + assert!(candidates.is_empty()); + } } From 324569c94e00abb2da4b531cea13f6113b10f5e2 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 17:40:41 +0100 Subject: [PATCH 07/26] update public Rust API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gist of it is that we now are dealing with `SourceEntry` instead of `GlobEntry`. A `SourceEntry` in the `node` crate maps to a `PublicSourceEntry`. This makes it easy for JS APIs to pass in the values. 
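In practice this means a scanner is now constructed from plain `{ base, pattern, negated }` records. As a minimal sketch of the flow (the base path and patterns below are illustrative, not taken from a real project):

```rs
use tailwindcss_oxide::{PublicSourceEntry, Scanner};

// Each `@source` directive becomes one entry. `from_pattern` parses the
// directive text (including the `not` keyword) relative to the given base.
let sources = vec![
    PublicSourceEntry::from_pattern("/root".into(), "@source '**/*'"),
    PublicSourceEntry::from_pattern("/root".into(), "@source not 'dist'"),
];

let mut scanner = Scanner::new(sources);
let candidates = scanner.scan();
```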
A `PublicSourceEntry` looks like: - `base` — the base path of the source - `pattern` — the glob pattern - `negated` — whether or not the pattern should be negated Internally we will map this to a `SourceEntry`, this is a proper enum that looks like this: ```rs pub enum SourceEntry { Auto { base: PathBuf }, IgnoredAuto { base: PathBuf }, Pattern { base: PathBuf, pattern: String }, IgnoredPattern { base: PathBuf, pattern: String }, } ``` Before we construct a Scanner, we will also make sure to optimize these patterns. Some optimization steps: 1. Each `pattern` will be brace-expanded and a new `SourceEntry` will be created. This allows us to always deal with simple patterns. 2. The `base` of each `SourceEntry` will be canonicalized so we are dealing with the real paths and symlinks are resolved. 3. When patterns include static parts such as `/src/*.html`, then the static part (in this case `src`) will be moved to the `base`. --- crates/node/src/lib.rs | 56 +++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/crates/node/src/lib.rs b/crates/node/src/lib.rs index 5811698c3bdd..11ff25b0aca0 100644 --- a/crates/node/src/lib.rs +++ b/crates/node/src/lib.rs @@ -28,20 +28,27 @@ pub struct GlobEntry { pub pattern: String, } -impl From for tailwindcss_oxide::ChangedContent<'_> { +#[derive(Debug, Clone)] +#[napi(object)] +pub struct SourceEntry { + /// Base path of the glob + pub base: String, + + /// Glob pattern + pub pattern: String, + + /// Negated flag + pub negated: bool, +} + +impl From for tailwindcss_oxide::ChangedContent { fn from(changed_content: ChangedContent) -> Self { if let Some(file) = changed_content.file { - return tailwindcss_oxide::ChangedContent::File( - file.into(), - changed_content.extension.into(), - ); + return tailwindcss_oxide::ChangedContent::File(file.into(), changed_content.extension); } if let Some(contents) = changed_content.content { - return tailwindcss_oxide::ChangedContent::Content( - contents, - changed_content.extension.into(), - ); + return tailwindcss_oxide::ChangedContent::Content(contents, changed_content.extension); } unreachable!() @@ -66,13 +73,23 @@ impl From for GlobEntry { } } +impl From for tailwindcss_oxide::PublicSourceEntry { + fn from(source: SourceEntry) -> Self { + Self { + base: source.base, + pattern: source.pattern, + negated: source.negated, + } + } +} + // --- #[derive(Debug, Clone)] #[napi(object)] pub struct ScannerOptions { /// Glob sources - pub sources: Option>, + pub sources: Option>, } #[derive(Debug, Clone)] @@ -96,11 +113,10 @@ impl Scanner { #[napi(constructor)] pub fn new(opts: ScannerOptions) -> Self { Self { - scanner: tailwindcss_oxide::Scanner::new( - opts - .sources - .map(|x| x.into_iter().map(Into::into).collect()), - ), + scanner: tailwindcss_oxide::Scanner::new(match opts.sources { + Some(sources) => sources.into_iter().map(Into::into).collect(), + None => vec![], + }), } } @@ -158,4 +174,14 @@ impl Scanner { .map(Into::into) .collect() } + + #[napi(getter)] + pub fn normalized_sources(&mut self) -> Vec { + self + .scanner + .get_normalized_sources() + .into_iter() + .map(Into::into) + .collect() + } } From 335e9e198f16f7a00a9750a231fccf5ac75111cd Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 17:45:36 +0100 Subject: [PATCH 08/26] update use statements due to big refactor --- crates/oxide/src/extractor/mod.rs | 2 +- crates/oxide/src/extractor/pre_processors/vue.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/crates/oxide/src/extractor/mod.rs b/crates/oxide/src/extractor/mod.rs index ce6d0aeae31b..737189104df9 100644 --- a/crates/oxide/src/extractor/mod.rs +++ b/crates/oxide/src/extractor/mod.rs @@ -202,7 +202,7 @@ mod tests { use std::hint::black_box; fn pre_process_input(input: &str, extension: &str) -> String { - let input = crate::pre_process_input(input.as_bytes(), extension); + let input = crate::scanner::pre_process_input(input.as_bytes(), extension); String::from_utf8(input).unwrap() } diff --git a/crates/oxide/src/extractor/pre_processors/vue.rs b/crates/oxide/src/extractor/pre_processors/vue.rs index 15440bb865ff..119e2a3d2079 100644 --- a/crates/oxide/src/extractor/pre_processors/vue.rs +++ b/crates/oxide/src/extractor/pre_processors/vue.rs @@ -1,5 +1,5 @@ use crate::extractor::pre_processors::pre_processor::PreProcessor; -use crate::pre_process_input; +use crate::scanner::pre_process_input; use bstr::ByteSlice; use regex::Regex; use std::sync; From 88a60489c33664894c738eb7feeb2a3725d6c897 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 17:08:02 +0100 Subject: [PATCH 09/26] rename `globs` to `sources` --- .../@tailwindcss-cli/src/commands/build/index.ts | 2 +- packages/@tailwindcss-postcss/src/index.ts | 2 +- packages/@tailwindcss-upgrade/src/index.ts | 2 +- .../src/template/prepare-config.ts | 4 ++-- packages/@tailwindcss-vite/src/index.ts | 2 +- packages/tailwindcss/src/at-import.test.ts | 4 ++-- .../tailwindcss/src/compat/apply-compat-hooks.ts | 14 +++++++------- packages/tailwindcss/src/compat/config.test.ts | 2 +- packages/tailwindcss/src/index.test.ts | 8 ++++---- packages/tailwindcss/src/index.ts | 16 ++++++++-------- 10 files changed, 28 insertions(+), 28 deletions(-) diff --git a/packages/@tailwindcss-cli/src/commands/build/index.ts b/packages/@tailwindcss-cli/src/commands/build/index.ts index 4e9c99e23459..8da8758bac3c 100644 --- a/packages/@tailwindcss-cli/src/commands/build/index.ts +++ b/packages/@tailwindcss-cli/src/commands/build/index.ts @@ -180,7 +180,7 @@ export async function handle(args: Result>) { // Use the specified root return [compiler.root] - })().concat(compiler.globs) + })().concat(compiler.sources) let scanner = new Scanner({ sources }) DEBUG && I.end('Setup compiler') diff --git a/packages/@tailwindcss-postcss/src/index.ts b/packages/@tailwindcss-postcss/src/index.ts index e21217f12b86..2e0ca8b78373 100644 --- a/packages/@tailwindcss-postcss/src/index.ts +++ b/packages/@tailwindcss-postcss/src/index.ts @@ -195,7 +195,7 @@ function tailwindcss(opts: PluginOptions = {}): AcceptedPlugin { // Use the specified root return [context.compiler.root] - })().concat(context.compiler.globs) + })().concat(context.compiler.sources) // Look for candidates used to generate the CSS context.scanner = new Scanner({ sources }) diff --git a/packages/@tailwindcss-upgrade/src/index.ts b/packages/@tailwindcss-upgrade/src/index.ts index 9069b4ea84ba..f8f97934fe03 100644 --- a/packages/@tailwindcss-upgrade/src/index.ts +++ b/packages/@tailwindcss-upgrade/src/index.ts @@ -163,7 +163,7 @@ async function run() { // Template migrations for (let config of configBySheet.values()) { let set = new Set() - for (let globEntry of config.globs.flatMap((entry) => hoistStaticGlobParts(entry))) { + for (let globEntry of config.sources.flatMap((entry) => hoistStaticGlobParts(entry))) { let files = await globby([globEntry.pattern], { absolute: true, gitignore: true, diff --git a/packages/@tailwindcss-upgrade/src/template/prepare-config.ts 
b/packages/@tailwindcss-upgrade/src/template/prepare-config.ts index e936df6a3332..c74244fa8f92 100644 --- a/packages/@tailwindcss-upgrade/src/template/prepare-config.ts +++ b/packages/@tailwindcss-upgrade/src/template/prepare-config.ts @@ -19,7 +19,7 @@ export async function prepareConfig( options: { base: string }, ): Promise<{ designSystem: DesignSystem - globs: { base: string; pattern: string }[] + sources: { base: string; pattern: string }[] userConfig: Config configFilePath: string @@ -59,7 +59,7 @@ export async function prepareConfig( return { designSystem, - globs: compiler.globs, + sources: compiler.sources, userConfig, newPrefix, configFilePath, diff --git a/packages/@tailwindcss-vite/src/index.ts b/packages/@tailwindcss-vite/src/index.ts index 1d27ee1689fd..f9e0791e00b8 100644 --- a/packages/@tailwindcss-vite/src/index.ts +++ b/packages/@tailwindcss-vite/src/index.ts @@ -273,7 +273,7 @@ class Root { // Use the specified root return [this.compiler.root] - })().concat(this.compiler.globs) + })().concat(this.compiler.sources) this.scanner = new Scanner({ sources }) DEBUG && I.end('Setup scanner') diff --git a/packages/tailwindcss/src/at-import.test.ts b/packages/tailwindcss/src/at-import.test.ts index 32a203d5166d..958cc00b1a96 100644 --- a/packages/tailwindcss/src/at-import.test.ts +++ b/packages/tailwindcss/src/at-import.test.ts @@ -474,7 +474,7 @@ test('emits the right base for @source directives inside nested files', async () { base: '/root', loadStylesheet }, ) - expect(compiler.globs).toEqual([ + expect(compiler.sources).toEqual([ { pattern: './nested/**/*.css', base: '/root/foo' }, { pattern: './root/**/*.css', base: '/root' }, ]) @@ -521,7 +521,7 @@ test('emits the right base for @source found inside JS configs and plugins from { base: '/root', loadStylesheet, loadModule }, ) - expect(compiler.globs).toEqual([ + expect(compiler.sources).toEqual([ { pattern: './nested-plugin/*.html', base: '/root/foo-plugin' }, { pattern: './root-plugin/*.html', base: '/root-plugin' }, diff --git a/packages/tailwindcss/src/compat/apply-compat-hooks.ts b/packages/tailwindcss/src/compat/apply-compat-hooks.ts index fe30f6270ba4..65265320bda8 100644 --- a/packages/tailwindcss/src/compat/apply-compat-hooks.ts +++ b/packages/tailwindcss/src/compat/apply-compat-hooks.ts @@ -21,7 +21,7 @@ export async function applyCompatibilityHooks({ base, ast, loadModule, - globs, + sources, }: { designSystem: DesignSystem base: string @@ -31,7 +31,7 @@ export async function applyCompatibilityHooks({ base: string, resourceHint: 'plugin' | 'config', ) => Promise<{ module: any; base: string }> - globs: { origin?: string; pattern: string }[] + sources: { base: string; pattern: string }[] }) { let features = Features.None let pluginPaths: [{ id: string; base: string; reference: boolean }, CssPluginOptions | null][] = @@ -145,7 +145,7 @@ export async function applyCompatibilityHooks({ designSystem, base, ast, - globs, + sources, configs: [], pluginDetails: [], }) @@ -186,7 +186,7 @@ export async function applyCompatibilityHooks({ designSystem, base, ast, - globs, + sources, configs, pluginDetails, }) @@ -198,14 +198,14 @@ function upgradeToFullPluginSupport({ designSystem, base, ast, - globs, + sources, configs, pluginDetails, }: { designSystem: DesignSystem base: string ast: AstNode[] - globs: { origin?: string; pattern: string }[] + sources: { base: string; pattern: string }[] configs: { path: string base: string @@ -362,7 +362,7 @@ function upgradeToFullPluginSupport({ ) } - globs.push(file) + sources.push(file) } 
return features } diff --git a/packages/tailwindcss/src/compat/config.test.ts b/packages/tailwindcss/src/compat/config.test.ts index f856a557d8b6..151c2c738cd9 100644 --- a/packages/tailwindcss/src/compat/config.test.ts +++ b/packages/tailwindcss/src/compat/config.test.ts @@ -15,7 +15,7 @@ test('Config files can add content', async () => { loadModule: async () => ({ module: { content: ['./file.txt'] }, base: '/root' }), }) - expect(compiler.globs).toEqual([{ base: '/root', pattern: './file.txt' }]) + expect(compiler.sources).toEqual([{ base: '/root', pattern: './file.txt' }]) }) test('Config files can change dark mode (media)', async () => { diff --git a/packages/tailwindcss/src/index.test.ts b/packages/tailwindcss/src/index.test.ts index 25d30f2ea670..85046dd31a8e 100644 --- a/packages/tailwindcss/src/index.test.ts +++ b/packages/tailwindcss/src/index.test.ts @@ -3236,18 +3236,18 @@ describe('plugins', () => { describe('@source', () => { test('emits @source files', async () => { - let { globs } = await compile( + let { sources } = await compile( css` @source "./foo/bar/*.ts"; `, { base: '/root' }, ) - expect(globs).toEqual([{ pattern: './foo/bar/*.ts', base: '/root' }]) + expect(sources).toEqual([{ pattern: './foo/bar/*.ts', base: '/root' }]) }) test('emits multiple @source files', async () => { - let { globs } = await compile( + let { sources } = await compile( css` @source "./foo/**/*.ts"; @source "./php/secr3t/smarty.php"; @@ -3255,7 +3255,7 @@ describe('@source', () => { { base: '/root' }, ) - expect(globs).toEqual([ + expect(sources).toEqual([ { pattern: './foo/**/*.ts', base: '/root' }, { pattern: './php/secr3t/smarty.php', base: '/root' }, ]) diff --git a/packages/tailwindcss/src/index.ts b/packages/tailwindcss/src/index.ts index ebf4b3875baf..9020aa04746f 100644 --- a/packages/tailwindcss/src/index.ts +++ b/packages/tailwindcss/src/index.ts @@ -128,7 +128,7 @@ async function parseCss( let firstThemeRule = null as StyleRule | null let utilitiesNode = null as AtRule | null let variantNodes: AtRule[] = [] - let globs: { base: string; pattern: string }[] = [] + let sources: { base: string; pattern: string }[] = [] let inlineCandidates: string[] = [] let ignoredCandidates: string[] = [] let root = null as Root @@ -247,7 +247,7 @@ async function parseCss( } } } else { - globs.push({ base: context.base as string, pattern: source }) + sources.push({ base: context.base as string, pattern: source }) } replaceWith([]) return @@ -552,7 +552,7 @@ async function parseCss( base, ast, loadModule, - globs, + sources, }) for (let customVariant of customVariants) { @@ -637,7 +637,7 @@ async function parseCss( return { designSystem, ast, - globs, + sources, root, utilitiesNode, features, @@ -649,12 +649,12 @@ export async function compileAst( input: AstNode[], opts: CompileOptions = {}, ): Promise<{ - globs: { base: string; pattern: string }[] + sources: { base: string; pattern: string }[] root: Root features: Features build(candidates: string[]): AstNode[] }> { - let { designSystem, ast, globs, root, utilitiesNode, features, inlineCandidates } = + let { designSystem, ast, sources, root, utilitiesNode, features, inlineCandidates } = await parseCss(input, opts) if (process.env.NODE_ENV !== 'test') { @@ -682,7 +682,7 @@ export async function compileAst( } return { - globs, + sources, root, features, build(newRawCandidates: string[]) { @@ -747,7 +747,7 @@ export async function compile( css: string, opts: CompileOptions = {}, ): Promise<{ - globs: { base: string; pattern: string }[] + sources: { base: 
string; pattern: string }[] root: Root features: Features build(candidates: string[]): string From 602a8017d26865d837f1bf762ae86b9d1df14a50 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 17:08:36 +0100 Subject: [PATCH 10/26] add `negated` flag to `sources` --- .../src/commands/build/index.ts | 4 ++-- packages/@tailwindcss-postcss/src/index.ts | 4 ++-- .../src/migrate-js-config.ts | 18 +++++++++++++++--- packages/@tailwindcss-vite/src/index.ts | 4 ++-- packages/tailwindcss/src/at-import.test.ts | 16 ++++++++-------- .../src/compat/apply-compat-hooks.ts | 11 ++++++++--- packages/tailwindcss/src/compat/config.test.ts | 2 +- packages/tailwindcss/src/index.test.ts | 6 +++--- packages/tailwindcss/src/index.ts | 8 ++++---- 9 files changed, 45 insertions(+), 28 deletions(-) diff --git a/packages/@tailwindcss-cli/src/commands/build/index.ts b/packages/@tailwindcss-cli/src/commands/build/index.ts index 8da8758bac3c..b2252afc50d9 100644 --- a/packages/@tailwindcss-cli/src/commands/build/index.ts +++ b/packages/@tailwindcss-cli/src/commands/build/index.ts @@ -175,11 +175,11 @@ export async function handle(args: Result>) { // No root specified, use the base directory if (compiler.root === null) { - return [{ base, pattern: '**/*' }] + return [{ base, pattern: '**/*', negated: false }] } // Use the specified root - return [compiler.root] + return [{ ...compiler.root, negated: false }] })().concat(compiler.sources) let scanner = new Scanner({ sources }) diff --git a/packages/@tailwindcss-postcss/src/index.ts b/packages/@tailwindcss-postcss/src/index.ts index 2e0ca8b78373..d8ef557767a8 100644 --- a/packages/@tailwindcss-postcss/src/index.ts +++ b/packages/@tailwindcss-postcss/src/index.ts @@ -190,11 +190,11 @@ function tailwindcss(opts: PluginOptions = {}): AcceptedPlugin { // No root specified, use the base directory if (context.compiler.root === null) { - return [{ base, pattern: '**/*' }] + return [{ base, pattern: '**/*', negated: false }] } // Use the specified root - return [context.compiler.root] + return [{ ...context.compiler.root, negated: false }] })().concat(context.compiler.sources) // Look for candidates used to generate the CSS diff --git a/packages/@tailwindcss-upgrade/src/migrate-js-config.ts b/packages/@tailwindcss-upgrade/src/migrate-js-config.ts index ac0a71a15e4b..bedf10f2a137 100644 --- a/packages/@tailwindcss-upgrade/src/migrate-js-config.ts +++ b/packages/@tailwindcss-upgrade/src/migrate-js-config.ts @@ -274,7 +274,11 @@ async function migrateContent( throw new Error('Unsupported content value: ' + pattern) } - let sourceFiles = patternSourceFiles({ base, pattern }) + let sourceFiles = patternSourceFiles({ + base, + pattern: pattern[0] === '!' ? 
pattern.slice(1) : pattern, + negated: pattern[0] === '!', + }) let autoContentContainsAllSourceFiles = true for (let sourceFile of sourceFiles) { @@ -375,12 +379,20 @@ function keyframesToCss(keyframes: Record): string { } function autodetectedSourceFiles(base: string) { - let scanner = new Scanner({ sources: [{ base, pattern: '**/*' }] }) + let scanner = new Scanner({ + sources: [ + { + base, + pattern: '**/*', + negated: false, + }, + ], + }) scanner.scan() return scanner.files } -function patternSourceFiles(source: { base: string; pattern: string }): string[] { +function patternSourceFiles(source: { base: string; pattern: string; negated: boolean }): string[] { let scanner = new Scanner({ sources: [source] }) scanner.scan() return scanner.files diff --git a/packages/@tailwindcss-vite/src/index.ts b/packages/@tailwindcss-vite/src/index.ts index f9e0791e00b8..de6b2cde33cd 100644 --- a/packages/@tailwindcss-vite/src/index.ts +++ b/packages/@tailwindcss-vite/src/index.ts @@ -268,11 +268,11 @@ class Root { // No root specified, auto-detect based on the `**/*` pattern if (this.compiler.root === null) { - return [{ base: this.base, pattern: '**/*' }] + return [{ base: this.base, pattern: '**/*', negated: false }] } // Use the specified root - return [this.compiler.root] + return [{ ...this.compiler.root, negated: false }] })().concat(this.compiler.sources) this.scanner = new Scanner({ sources }) diff --git a/packages/tailwindcss/src/at-import.test.ts b/packages/tailwindcss/src/at-import.test.ts index 958cc00b1a96..6b418169ff8e 100644 --- a/packages/tailwindcss/src/at-import.test.ts +++ b/packages/tailwindcss/src/at-import.test.ts @@ -475,8 +475,8 @@ test('emits the right base for @source directives inside nested files', async () ) expect(compiler.sources).toEqual([ - { pattern: './nested/**/*.css', base: '/root/foo' }, - { pattern: './root/**/*.css', base: '/root' }, + { pattern: './nested/**/*.css', base: '/root/foo', negated: false }, + { pattern: './root/**/*.css', base: '/root', negated: false }, ]) }) @@ -522,14 +522,14 @@ test('emits the right base for @source found inside JS configs and plugins from ) expect(compiler.sources).toEqual([ - { pattern: './nested-plugin/*.html', base: '/root/foo-plugin' }, - { pattern: './root-plugin/*.html', base: '/root-plugin' }, + { pattern: './nested-plugin/*.html', base: '/root/foo-plugin', negated: false }, + { pattern: './root-plugin/*.html', base: '/root-plugin', negated: false }, - { pattern: './nested-config-plugin/*.html', base: '/root/foo-config' }, - { pattern: './nested-config/*.html', base: '/root/foo-config' }, + { pattern: './nested-config-plugin/*.html', base: '/root/foo-config', negated: false }, + { pattern: './nested-config/*.html', base: '/root/foo-config', negated: false }, - { pattern: './root-config-plugin/*.html', base: '/root-config' }, - { pattern: './root-config/*.html', base: '/root-config' }, + { pattern: './root-config-plugin/*.html', base: '/root-config', negated: false }, + { pattern: './root-config/*.html', base: '/root-config', negated: false }, ]) }) diff --git a/packages/tailwindcss/src/compat/apply-compat-hooks.ts b/packages/tailwindcss/src/compat/apply-compat-hooks.ts index 65265320bda8..1cf9b1ce5c77 100644 --- a/packages/tailwindcss/src/compat/apply-compat-hooks.ts +++ b/packages/tailwindcss/src/compat/apply-compat-hooks.ts @@ -31,7 +31,7 @@ export async function applyCompatibilityHooks({ base: string, resourceHint: 'plugin' | 'config', ) => Promise<{ module: any; base: string }> - sources: { base: string; pattern: 
string }[] + sources: { base: string; pattern: string; negated: boolean }[] }) { let features = Features.None let pluginPaths: [{ id: string; base: string; reference: boolean }, CssPluginOptions | null][] = @@ -205,7 +205,7 @@ function upgradeToFullPluginSupport({ designSystem: DesignSystem base: string ast: AstNode[] - sources: { base: string; pattern: string }[] + sources: { base: string; pattern: string; negated: boolean }[] configs: { path: string base: string @@ -362,7 +362,12 @@ function upgradeToFullPluginSupport({ ) } - sources.push(file) + let negated = false + if (file.pattern[0] == '!') { + negated = true + file.pattern = file.pattern.slice(1) + } + sources.push({ ...file, negated }) } return features } diff --git a/packages/tailwindcss/src/compat/config.test.ts b/packages/tailwindcss/src/compat/config.test.ts index 151c2c738cd9..372f5f3e3ae0 100644 --- a/packages/tailwindcss/src/compat/config.test.ts +++ b/packages/tailwindcss/src/compat/config.test.ts @@ -15,7 +15,7 @@ test('Config files can add content', async () => { loadModule: async () => ({ module: { content: ['./file.txt'] }, base: '/root' }), }) - expect(compiler.sources).toEqual([{ base: '/root', pattern: './file.txt' }]) + expect(compiler.sources).toEqual([{ base: '/root', pattern: './file.txt', negated: false }]) }) test('Config files can change dark mode (media)', async () => { diff --git a/packages/tailwindcss/src/index.test.ts b/packages/tailwindcss/src/index.test.ts index 85046dd31a8e..619ed203715a 100644 --- a/packages/tailwindcss/src/index.test.ts +++ b/packages/tailwindcss/src/index.test.ts @@ -3243,7 +3243,7 @@ describe('@source', () => { { base: '/root' }, ) - expect(sources).toEqual([{ pattern: './foo/bar/*.ts', base: '/root' }]) + expect(sources).toEqual([{ pattern: './foo/bar/*.ts', base: '/root', negated: false }]) }) test('emits multiple @source files', async () => { @@ -3256,8 +3256,8 @@ describe('@source', () => { ) expect(sources).toEqual([ - { pattern: './foo/**/*.ts', base: '/root' }, - { pattern: './php/secr3t/smarty.php', base: '/root' }, + { pattern: './foo/**/*.ts', base: '/root', negated: false }, + { pattern: './php/secr3t/smarty.php', base: '/root', negated: false }, ]) }) diff --git a/packages/tailwindcss/src/index.ts b/packages/tailwindcss/src/index.ts index 9020aa04746f..e1ac3dd8dc8b 100644 --- a/packages/tailwindcss/src/index.ts +++ b/packages/tailwindcss/src/index.ts @@ -128,7 +128,7 @@ async function parseCss( let firstThemeRule = null as StyleRule | null let utilitiesNode = null as AtRule | null let variantNodes: AtRule[] = [] - let sources: { base: string; pattern: string }[] = [] + let sources: { base: string; pattern: string; negated: boolean }[] = [] let inlineCandidates: string[] = [] let ignoredCandidates: string[] = [] let root = null as Root @@ -247,7 +247,7 @@ async function parseCss( } } } else { - sources.push({ base: context.base as string, pattern: source }) + sources.push({ base: context.base as string, pattern: source, negated: not }) } replaceWith([]) return @@ -649,7 +649,7 @@ export async function compileAst( input: AstNode[], opts: CompileOptions = {}, ): Promise<{ - sources: { base: string; pattern: string }[] + sources: { base: string; pattern: string; negated: boolean }[] root: Root features: Features build(candidates: string[]): AstNode[] @@ -747,7 +747,7 @@ export async function compile( css: string, opts: CompileOptions = {}, ): Promise<{ - sources: { base: string; pattern: string }[] + sources: { base: string; pattern: string; negated: boolean }[] root: Root 
   features: Features
   build(candidates: string[]): string
 
From 9798637654d92711c506d2b5b7c5559912614cff Mon Sep 17 00:00:00 2001
From: Robin Malfait
Date: Fri, 21 Mar 2025 17:23:09 +0100
Subject: [PATCH 11/26] add missing `reference` property

This isn't strictly necessary for this PR, but I noticed missing values while
updating `globs` -> `sources`, and adding this keeps TypeScript happy.

---
 packages/@tailwindcss-upgrade/src/migrate-js-config.ts       | 1 +
 packages/@tailwindcss-upgrade/src/template/prepare-config.ts | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/@tailwindcss-upgrade/src/migrate-js-config.ts b/packages/@tailwindcss-upgrade/src/migrate-js-config.ts
index bedf10f2a137..46d19ef6c4a9 100644
--- a/packages/@tailwindcss-upgrade/src/migrate-js-config.ts
+++ b/packages/@tailwindcss-upgrade/src/migrate-js-config.ts
@@ -100,6 +100,7 @@ async function migrateTheme(
   let configToResolve: ConfigFile = {
     base,
     config: { ...unresolvedConfig, plugins: [], presets: undefined },
+    reference: false,
   }
   let { resolvedConfig, replacedThemeKeys } = resolveConfig(designSystem, [configToResolve])
 
diff --git a/packages/@tailwindcss-upgrade/src/template/prepare-config.ts b/packages/@tailwindcss-upgrade/src/template/prepare-config.ts
index c74244fa8f92..b4a6e18b31e1 100644
--- a/packages/@tailwindcss-upgrade/src/template/prepare-config.ts
+++ b/packages/@tailwindcss-upgrade/src/template/prepare-config.ts
@@ -82,7 +82,7 @@ async function createResolvedUserConfig(fullConfigPath: string): Promise
   ])
 
   return resolveConfig(noopDesignSystem, [
-    { base: dirname(fullConfigPath), config: unresolvedUserConfig },
+    { base: dirname(fullConfigPath), config: unresolvedUserConfig, reference: false },
   ]).resolvedConfig as any
 }
 
From 9958612d0d1c8de7016f8a0c9748fbe878f29939 Mon Sep 17 00:00:00 2001
From: Robin Malfait
Date: Fri, 21 Mar 2025 17:46:12 +0100
Subject: [PATCH 12/26] update integration tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These tests reflect two big changes:

1. New tests in general
2. Tests that used auto source detection together with `.gitignore` files now
   favor the `@source '…'` rules over the `.gitignore` rules.

Note: if you _don't_ want this behavior, you can use `@source not '…'` to
override these rules.
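
For example, in a hypothetical project where `.gitignore` contains
`pages/ignored.html`, the explicit pattern now wins over the ignore rule:

```css
/* Scans `./pages`, including `pages/ignored.html`, even though it is git ignored */
@source "./pages/**/*.html";

/* Opt back out explicitly, if the old behavior is what you want */
@source not "./pages/ignored.html";
```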
--- integrations/cli/index.test.ts | 29 +- integrations/postcss/index.test.ts | 587 ------------------ integrations/postcss/source.test.ts | 799 +++++++++++++++++++++++++ packages/tailwindcss/src/index.test.ts | 15 + 4 files changed, 834 insertions(+), 596 deletions(-) create mode 100644 integrations/postcss/source.test.ts diff --git a/integrations/cli/index.test.ts b/integrations/cli/index.test.ts index 487c88e40b76..dc06ecb095c0 100644 --- a/integrations/cli/index.test.ts +++ b/integrations/cli/index.test.ts @@ -665,7 +665,7 @@ test( /* (4) */ /* - './pages' should be auto-scanned */ /* - Only '.html' files should be included */ - /* - './page/ignored.html' should be ignored */ + /* - './page/ignored.html' will not be ignored because of the specific pattern */ @source "./pages/**/*.html"; `, @@ -702,7 +702,7 @@ test( // (4) 'pages/foo.html': 'content-["pages/foo.html"]', 'pages/nested/foo.html': 'content-["pages/nested/foo.html"]', - 'pages/ignored.html': 'content-["pages/ignored.html"] content-["BAD"]', + 'pages/ignored.html': 'content-["pages/ignored.html"]', 'pages/foo.jsx': 'content-["pages/foo.jsx"] content-["BAD"]', 'pages/nested/foo.jsx': 'content-["pages/nested/foo.jsx"] content-["BAD"]', }, @@ -733,6 +733,10 @@ test( --tw-content: "pages/foo.html"; content: var(--tw-content); } + .content-\\[\\"pages\\/ignored\\.html\\"\\] { + --tw-content: "pages/ignored.html"; + content: var(--tw-content); + } .content-\\[\\"pages\\/nested\\/foo\\.html\\"\\] { --tw-content: "pages/nested/foo.html"; content: var(--tw-content); @@ -893,10 +897,11 @@ test( bar.html `, - // Project D, foo.html is ignored by the gitignore file. + // Project D, foo.html is ignored by the gitignore file but the source rule is explicit about + // adding all `.html` files. 'project-d/src/foo.html': html`
`, @@ -971,6 +976,10 @@ test( --tw-content: 'project-d/src/bar.html'; content: var(--tw-content); } + .content-\\[\\'project-d\\/src\\/foo\\.html\\'\\] { + --tw-content: 'project-d/src/foo.html'; + content: var(--tw-content); + } .content-\\[\\'project-d\\/src\\/index\\.html\\'\\] { --tw-content: 'project-d/src/index.html'; content: var(--tw-content); @@ -1135,15 +1144,13 @@ test( @reference 'tailwindcss/theme'; /* (1) */ - /* - Only './src' should be auto-scanned, not the current working directory */ - /* - .gitignore'd paths should be ignored (node_modules) */ - /* - Binary extensions should be ignored (jpg, zip) */ + /* - Disable auto-source detection */ @import 'tailwindcss/utilities' source(none); /* (2) */ /* - './pages' should be auto-scanned */ /* - Only '.html' files should be included */ - /* - './page/ignored.html' should be ignored */ + /* - './page/ignored.html' will not be ignored because of the specific pattern */ @source "./pages/**/*.html"; `, @@ -1163,7 +1170,7 @@ test( // (4) 'pages/foo.html': 'content-["pages/foo.html"]', 'pages/nested/foo.html': 'content-["pages/nested/foo.html"]', - 'pages/ignored.html': 'content-["pages/ignored.html"] content-["BAD"]', + 'pages/ignored.html': 'content-["pages/ignored.html"]', 'pages/foo.jsx': 'content-["pages/foo.jsx"] content-["BAD"]', 'pages/nested/foo.jsx': 'content-["pages/nested/foo.jsx"] content-["BAD"]', }, @@ -1178,6 +1185,10 @@ test( --tw-content: "pages/foo.html"; content: var(--tw-content); } + .content-\\[\\"pages\\/ignored\\.html\\"\\] { + --tw-content: "pages/ignored.html"; + content: var(--tw-content); + } .content-\\[\\"pages\\/nested\\/foo\\.html\\"\\] { --tw-content: "pages/nested/foo.html"; content: var(--tw-content); diff --git a/integrations/postcss/index.test.ts b/integrations/postcss/index.test.ts index 524752b217f7..63a0f019eaf8 100644 --- a/integrations/postcss/index.test.ts +++ b/integrations/postcss/index.test.ts @@ -1,4 +1,3 @@ -import dedent from 'dedent' import path from 'node:path' import { candidate, css, html, js, json, test, ts, yaml } from '../utils' @@ -636,589 +635,3 @@ test( await fs.expectFileToContain('project-a/dist/out.css', [candidate`content-['c/src/index.js']`]) }, ) - -test( - 'auto source detection kitchen sink', - { - fs: { - 'package.json': json` - { - "dependencies": { - "postcss": "^8", - "postcss-cli": "^10", - "tailwindcss": "workspace:^", - "@tailwindcss/postcss": "workspace:^" - } - } - `, - 'postcss.config.js': js` - module.exports = { - plugins: { - '@tailwindcss/postcss': {}, - }, - } - `, - 'index.css': css` - @reference 'tailwindcss/theme'; - - /* (1) */ - /* - Only './src' should be auto-scanned, not the current working directory */ - /* - .gitignore'd paths should be ignored (node_modules) */ - /* - Binary extensions should be ignored (jpg, zip) */ - @import 'tailwindcss/utilities' source('./src'); - - /* (2) */ - /* - All HTML and JSX files in 'ignored/components' should be scanned */ - /* - All other extensions should be ignored */ - @source "./ignored/components/*.{html,jsx}"; - - /* (3) */ - /* - './components' should be auto-scanned in addition to './src' */ - /* - './components/ignored.html' should still be ignored */ - /* - Binary extensions in './components' should be ignored */ - @source "./components"; - - /* (4) */ - /* - './pages' should be auto-scanned */ - /* - Only '.html' files should be included */ - /* - './page/ignored.html' should be ignored */ - @source "./pages/**/*.html"; - `, - - '.gitignore': dedent` - /src/ignored - /ignored - 
/components/ignored.html - /pages/ignored.html - `, - - // (1) - 'index.html': 'content-["index.html"] content-["BAD"]', // "Root" source is in `./src` - 'src/index.html': 'content-["src/index.html"]', - 'src/nested/index.html': 'content-["src/nested/index.html"]', - 'src/index.jpg': 'content-["src/index.jpg"] content-["BAD"]', - 'src/nested/index.tar': 'content-["src/nested/index.tar"] content-["BAD"]', - 'src/ignored/index.html': 'content-["src/ignored/index.html"] content-["BAD"]', - - // (2) - 'ignored/components/my-component.html': 'content-["ignored/components/my-component.html"]', - 'ignored/components/my-component.jsx': 'content-["ignored/components/my-component.jsx"]', - - // Ignored and not explicitly listed by (2) - 'ignored/components/my-component.tsx': - 'content-["ignored/components/my-component.tsx"] content-["BAD"]', - 'ignored/components/nested/my-component.html': - 'content-["ignored/components/nested/my-component.html"] content-["BAD"]', - - // (3) - 'components/my-component.tsx': 'content-["components/my-component.tsx"]', - 'components/nested/my-component.tsx': 'content-["components/nested/my-component.tsx"]', - 'components/ignored.html': 'content-["components/ignored.html"] content-["BAD"]', - - // (4) - 'pages/foo.html': 'content-["pages/foo.html"]', - 'pages/nested/foo.html': 'content-["pages/nested/foo.html"]', - 'pages/ignored.html': 'content-["pages/ignored.html"] content-["BAD"]', - 'pages/foo.jsx': 'content-["pages/foo.jsx"] content-["BAD"]', - 'pages/nested/foo.jsx': 'content-["pages/nested/foo.jsx"] content-["BAD"]', - }, - }, - async ({ fs, exec, expect }) => { - await exec('pnpm postcss index.css --output dist/out.css') - - expect(await fs.dumpFiles('./dist/*.css')).toMatchInlineSnapshot(` - " - --- ./dist/out.css --- - .content-\\[\\"components\\/my-component\\.tsx\\"\\] { - --tw-content: "components/my-component.tsx"; - content: var(--tw-content); - } - .content-\\[\\"components\\/nested\\/my-component\\.tsx\\"\\] { - --tw-content: "components/nested/my-component.tsx"; - content: var(--tw-content); - } - .content-\\[\\"ignored\\/components\\/my-component\\.html\\"\\] { - --tw-content: "ignored/components/my-component.html"; - content: var(--tw-content); - } - .content-\\[\\"ignored\\/components\\/my-component\\.jsx\\"\\] { - --tw-content: "ignored/components/my-component.jsx"; - content: var(--tw-content); - } - .content-\\[\\"pages\\/foo\\.html\\"\\] { - --tw-content: "pages/foo.html"; - content: var(--tw-content); - } - .content-\\[\\"pages\\/nested\\/foo\\.html\\"\\] { - --tw-content: "pages/nested/foo.html"; - content: var(--tw-content); - } - .content-\\[\\"src\\/index\\.html\\"\\] { - --tw-content: "src/index.html"; - content: var(--tw-content); - } - .content-\\[\\"src\\/nested\\/index\\.html\\"\\] { - --tw-content: "src/nested/index.html"; - content: var(--tw-content); - } - @property --tw-content { - syntax: "*"; - inherits: false; - initial-value: ""; - } - " - `) - }, -) - -test( - 'auto source detection in depth, source(…) and `@source` can be configured to use auto source detection (build + watch mode)', - { - fs: { - 'package.json': json`{}`, - 'pnpm-workspace.yaml': yaml` - # - packages: - - project-a - `, - 'project-a/package.json': json` - { - "dependencies": { - "postcss": "^8", - "postcss-cli": "^10", - "tailwindcss": "workspace:^", - "@tailwindcss/postcss": "workspace:^" - } - } - `, - 'project-a/postcss.config.js': js` - module.exports = { - plugins: { - '@tailwindcss/postcss': {}, - }, - } - `, - 'project-a/src/index.css': css` - 
@reference 'tailwindcss/theme'; - - /* Run auto-content detection in ../../project-b */ - @import 'tailwindcss/utilities' source('../../project-b'); - - /* Explicitly using node_modules in the @source allows git ignored folders */ - @source '../node_modules/{my-lib-1,my-lib-2}/src/**/*.html'; - - /* We typically ignore these extensions, but now include them explicitly */ - @source './logo.{jpg,png}'; - - /* Project C should apply auto source detection */ - @source '../../project-c'; - - /* Project D should apply auto source detection rules, such as ignoring node_modules */ - @source '../../project-d/**/*.{html,js}'; - @source '../../project-d/**/*.bin'; - - /* Same as above, but my-lib-2 _should_ be includes */ - @source '../../project-d/node_modules/my-lib-2/src/*.{html,js}'; - - /* bar.html is git ignored, but explicitly listed here to scan */ - @source '../../project-d/src/bar.html'; - `, - - // Project A is the current folder, but we explicitly configured - // `source(project-b)`, therefore project-a should not be included in - // the output. - 'project-a/src/index.html': html` -
-        <div class="content-['project-a/src/index.html']"></div>
-      `,
-
-      // Project A explicitly includes an extension we usually ignore,
-      // therefore it should be included in the output.
-      'project-a/src/logo.jpg': html`
-        <div class="content-['project-a/src/logo.jpg']"></div>
-      `,
-
-      // Project A explicitly includes node_modules/{my-lib-1,my-lib-2},
-      // therefore these files should be included in the output.
-      'project-a/node_modules/my-lib-1/src/index.html': html`
-        <div class="content-['project-a/node_modules/my-lib-1/src/index.html']"></div>
-      `,
-      'project-a/node_modules/my-lib-2/src/index.html': html`
-        <div class="content-['project-a/node_modules/my-lib-2/src/index.html']"></div>
-      `,
-
-      // Project B is the configured `source(…)`, therefore auto source
-      // detection should include known extensions and folders in the output.
-      'project-b/src/index.html': html`
-        <div class="content-['project-b/src/index.html']"></div>
-      `,
-
-      // Project B is the configured `source(…)`, therefore auto source
-      // detection should apply and node_modules should not be included in the
-      // output.
-      'project-b/node_modules/my-lib-3/src/index.html': html`
-        <div class="content-['project-b/node_modules/my-lib-3/src/index.html']"></div>
-      `,
-
-      // Project C should apply auto source detection, therefore known
-      // extensions and folders should be included in the output.
-      'project-c/src/index.html': html`
-        <div class="content-['project-c/src/index.html']"></div>
-      `,
-
-      // Project C should apply auto source detection, therefore known ignored
-      // extensions should not be included in the output.
-      'project-c/src/logo.jpg': html`
-        <div class="content-['project-c/src/logo.jpg']"></div>
-      `,
-
-      // Project C should apply auto source detection, therefore node_modules
-      // should not be included in the output.
-      'project-c/node_modules/my-lib-1/src/index.html': html`
-        <div class="content-['project-c/node_modules/my-lib-1/src/index.html']"></div>
-      `,
-
-      // Project D should apply auto source detection rules, such as ignoring
-      // node_modules.
-      'project-d/node_modules/my-lib-1/src/index.html': html`
-        <div class="content-['project-d/node_modules/my-lib-1/src/index.html']"></div>
-      `,
-
-      // Project D has an explicit glob containing node_modules, thus should include the html file
-      'project-d/node_modules/my-lib-2/src/index.html': html`
-        <div class="content-['project-d/node_modules/my-lib-2/src/index.html']"></div>
-      `,
-
-      'project-d/src/.gitignore': dedent`
-        foo.html
-        bar.html
-      `,
-
-      // Project D, foo.html is ignored by the gitignore file.
-      'project-d/src/foo.html': html`
-        <div class="content-['project-d/src/foo.html']"></div>
-      `,
-
-      // Project D, bar.html is ignored by the gitignore file. But explicitly
-      // listed as a `@source` glob.
-      'project-d/src/bar.html': html`
-        <div class="content-['project-d/src/bar.html']"></div>
-      `,
-
-      // Project D should look for files with the extensions html and js.
-      'project-d/src/index.html': html`
-        <div class="content-['project-d/src/index.html']"></div>
-      `,
-
-      // Project D should have a binary file even though we ignore binary files
-      // by default, but it's explicitly listed.
-      'project-d/my-binary-file.bin': html`
-        <div class="content-['project-d/my-binary-file.bin']"></div>
- `, - }, - }, - async ({ fs, exec, spawn, root, expect }) => { - await exec('pnpm postcss src/index.css --output dist/out.css --verbose', { - cwd: path.join(root, 'project-a'), - }) - - expect(await fs.dumpFiles('./project-a/dist/*.css')).toMatchInlineSnapshot(` - " - --- ./project-a/dist/out.css --- - .content-\\[\\'project-a\\/node_modules\\/my-lib-1\\/src\\/index\\.html\\'\\] { - --tw-content: 'project-a/node modules/my-lib-1/src/index.html'; - content: var(--tw-content); - } - .content-\\[\\'project-a\\/node_modules\\/my-lib-2\\/src\\/index\\.html\\'\\] { - --tw-content: 'project-a/node modules/my-lib-2/src/index.html'; - content: var(--tw-content); - } - .content-\\[\\'project-a\\/src\\/logo\\.jpg\\'\\] { - --tw-content: 'project-a/src/logo.jpg'; - content: var(--tw-content); - } - .content-\\[\\'project-b\\/src\\/index\\.html\\'\\] { - --tw-content: 'project-b/src/index.html'; - content: var(--tw-content); - } - .content-\\[\\'project-c\\/src\\/index\\.html\\'\\] { - --tw-content: 'project-c/src/index.html'; - content: var(--tw-content); - } - .content-\\[\\'project-d\\/my-binary-file\\.bin\\'\\] { - --tw-content: 'project-d/my-binary-file.bin'; - content: var(--tw-content); - } - .content-\\[\\'project-d\\/node_modules\\/my-lib-2\\/src\\/index\\.html\\'\\] { - --tw-content: 'project-d/node modules/my-lib-2/src/index.html'; - content: var(--tw-content); - } - .content-\\[\\'project-d\\/src\\/bar\\.html\\'\\] { - --tw-content: 'project-d/src/bar.html'; - content: var(--tw-content); - } - .content-\\[\\'project-d\\/src\\/index\\.html\\'\\] { - --tw-content: 'project-d/src/index.html'; - content: var(--tw-content); - } - @property --tw-content { - syntax: "*"; - inherits: false; - initial-value: ""; - } - " - `) - - // Watch mode tests - let process = await spawn( - 'pnpm postcss src/index.css --output dist/out.css --watch --verbose', - { - cwd: path.join(root, 'project-a'), - }, - ) - await process.onStderr((message) => message.includes('Waiting for file changes...')) - - // Changes to project-a should not be included in the output, we changed the - // base folder to project-b. - await fs.write( - 'project-a/src/index.html', - html`
-        <div class="[.changed_&]:content-['project-a/src/index.html']"></div>
-      `,
-    )
-    await fs.expectFileNotToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-a/src/index.html']`,
-    ])
-
-    // Changes to this file should be included, because we explicitly listed
-    // them using `@source`.
-    await fs.write(
-      'project-a/src/logo.jpg',
-      html`<div class="[.changed_&]:content-['project-a/src/logo.jpg']"></div>`,
-    )
-    await fs.expectFileToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-a/src/logo.jpg']`,
-    ])
-
-    // Changes to these files should be included, because we explicitly listed
-    // them using `@source`.
-    await fs.write(
-      'project-a/node_modules/my-lib-1/src/index.html',
-      html`<div class="[.changed_&]:content-['project-a/node_modules/my-lib-1/src/index.html']"></div>`,
-    )
-    await fs.expectFileToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-a/node_modules/my-lib-1/src/index.html']`,
-    ])
-
-    await fs.write(
-      'project-a/node_modules/my-lib-2/src/index.html',
-      html`<div class="[.changed_&]:content-['project-a/node_modules/my-lib-2/src/index.html']"></div>`,
-    )
-    await fs.expectFileToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-a/node_modules/my-lib-2/src/index.html']`,
-    ])
-
-    // Changes to this file should be included, because we changed the base to
-    // `project-b`.
-    await fs.write(
-      'project-b/src/index.html',
-      html`<div class="[.changed_&]:content-['project-b/src/index.html']"></div>`,
-    )
-    await fs.expectFileToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-b/src/index.html']`,
-    ])
-
-    // Changes to this file should not be included. We did change the base to
-    // `project-b`, but we still apply the auto source detection rules which
-    // ignore `node_modules`.
-    await fs.write(
-      'project-b/node_modules/my-lib-3/src/index.html',
-      html`<div class="[.changed_&]:content-['project-b/node_modules/my-lib-3/src/index.html']"></div>`,
-    )
-    await fs.expectFileNotToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-b/node_modules/my-lib-3/src/index.html']`,
-    ])
-
-    // Project C was added explicitly via `@source`, therefore changes to these
-    // files should be included.
-    await fs.write(
-      'project-c/src/index.html',
-      html`<div class="[.changed_&]:content-['project-c/src/index.html']"></div>`,
-    )
-    await fs.expectFileToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-c/src/index.html']`,
-    ])
-
-    // Except for these files, since they are ignored by the default auto source
-    // detection rules.
-    await fs.write(
-      'project-c/src/logo.jpg',
-      html`<div class="[.changed_&]:content-['project-c/src/logo.jpg']"></div>`,
-    )
-    await fs.expectFileNotToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-c/src/logo.jpg']`,
-    ])
-    await fs.write(
-      'project-c/node_modules/my-lib-1/src/index.html',
-      html`<div class="[.changed_&]:content-['project-c/node_modules/my-lib-1/src/index.html']"></div>`,
-    )
-    await fs.expectFileNotToContain('./project-a/dist/out.css', [
-      candidate`[.changed_&]:content-['project-c/node_modules/my-lib-1/src/index.html']`,
-    ])
-
-    // Creating new files in the "root" of auto source detected folders
-    // We need to create the files and *then* update them because postcss-cli
-    // does not pick up new files — only changes to existing files.
-    await fs.create([
-      'project-b/new-file.html',
-      'project-b/new-folder/new-file.html',
-      'project-c/new-file.html',
-      'project-c/new-folder/new-file.html',
-    ])
-
-    // If we don't wait writes will be coalesced into a "add" event which
-    // isn't picked up by postcss-cli.
-    await new Promise((resolve) => setTimeout(resolve, 100))
-
-    await fs.write(
-      'project-b/new-file.html',
-      html`<div class="[.created_&]:content-['project-b/new-file.html']"></div>`,
-    )
-    await fs.write(
-      'project-b/new-folder/new-file.html',
-      html`<div class="[.created_&]:content-['project-b/new-folder/new-file.html']"></div>`,
-    )
-    await fs.write(
-      'project-c/new-file.html',
-      html`<div class="[.created_&]:content-['project-c/new-file.html']"></div>`,
-    )
-    await fs.write(
-      'project-c/new-folder/new-file.html',
-      html`<div class="[.created_&]:content-['project-c/new-folder/new-file.html']"></div>
`, - ) - - await fs.expectFileToContain('./project-a/dist/out.css', [ - candidate`[.created_&]:content-['project-b/new-file.html']`, - candidate`[.created_&]:content-['project-b/new-folder/new-file.html']`, - candidate`[.created_&]:content-['project-c/new-file.html']`, - candidate`[.created_&]:content-['project-c/new-folder/new-file.html']`, - ]) - }, -) - -test( - 'auto source detection disabled', - { - fs: { - 'package.json': json` - { - "dependencies": { - "postcss": "^8", - "postcss-cli": "^10", - "tailwindcss": "workspace:^", - "@tailwindcss/postcss": "workspace:^" - } - } - `, - 'postcss.config.js': js` - module.exports = { - plugins: { - '@tailwindcss/postcss': {}, - }, - } - `, - 'index.css': css` - @reference 'tailwindcss/theme'; - - /* (1) */ - /* - Only './src' should be auto-scanned, not the current working directory */ - /* - .gitignore'd paths should be ignored (node_modules) */ - /* - Binary extensions should be ignored (jpg, zip) */ - @import 'tailwindcss/utilities' source(none); - - /* (2) */ - /* - './pages' should be auto-scanned */ - /* - Only '.html' files should be included */ - /* - './page/ignored.html' should be ignored */ - @source "./pages/**/*.html"; - `, - - '.gitignore': dedent` - /src/ignored - /pages/ignored.html - `, - - // (1) - 'index.html': 'content-["index.html"] content-["BAD"]', // "Root" source is in `./src` - 'src/index.html': 'content-["src/index.html"] content-["BAD"]', - 'src/nested/index.html': 'content-["src/nested/index.html"] content-["BAD"]', - 'src/index.jpg': 'content-["src/index.jpg"] content-["BAD"]', - 'src/nested/index.tar': 'content-["src/nested/index.tar"] content-["BAD"]', - 'src/ignored/index.html': 'content-["src/ignored/index.html"] content-["BAD"]', - - // (4) - 'pages/foo.html': 'content-["pages/foo.html"]', - 'pages/nested/foo.html': 'content-["pages/nested/foo.html"]', - 'pages/ignored.html': 'content-["pages/ignored.html"] content-["BAD"]', - 'pages/foo.jsx': 'content-["pages/foo.jsx"] content-["BAD"]', - 'pages/nested/foo.jsx': 'content-["pages/nested/foo.jsx"] content-["BAD"]', - }, - }, - async ({ fs, exec, expect }) => { - await exec('pnpm postcss index.css --output dist/out.css') - - expect(await fs.dumpFiles('./dist/*.css')).toMatchInlineSnapshot(` - " - --- ./dist/out.css --- - .content-\\[\\"pages\\/foo\\.html\\"\\] { - --tw-content: "pages/foo.html"; - content: var(--tw-content); - } - .content-\\[\\"pages\\/nested\\/foo\\.html\\"\\] { - --tw-content: "pages/nested/foo.html"; - content: var(--tw-content); - } - @property --tw-content { - syntax: "*"; - inherits: false; - initial-value: ""; - } - " - `) - }, -) diff --git a/integrations/postcss/source.test.ts b/integrations/postcss/source.test.ts new file mode 100644 index 000000000000..111a8156238c --- /dev/null +++ b/integrations/postcss/source.test.ts @@ -0,0 +1,799 @@ +import dedent from 'dedent' +import path from 'node:path' +import { candidate, css, html, js, json, test, yaml } from '../utils' + +test( + 'auto source detection kitchen sink', + { + fs: { + 'package.json': json` + { + "dependencies": { + "postcss": "^8", + "postcss-cli": "^10", + "tailwindcss": "workspace:^", + "@tailwindcss/postcss": "workspace:^" + } + } + `, + 'postcss.config.js': js` + module.exports = { + plugins: { + '@tailwindcss/postcss': {}, + }, + } + `, + 'index.css': css` + @reference 'tailwindcss/theme'; + + /* (1) */ + /* - Only './src' should be auto-scanned, not the current working directory */ + /* - .gitignore'd paths should be ignored (node_modules) */ + /* - Binary extensions 
should be ignored (jpg, zip) */ + @import 'tailwindcss/utilities' source('./src'); + + /* (2) */ + /* - All HTML and JSX files in 'ignored/components' should be scanned */ + /* - All other extensions should be ignored */ + @source "./ignored/components/*.{html,jsx}"; + + /* (3) */ + /* - './components' should be auto-scanned in addition to './src' */ + /* - './components/ignored.html' should still be ignored */ + /* - Binary extensions in './components' should be ignored */ + @source "./components"; + + /* (4) */ + /* - './pages' should be auto-scanned */ + /* - Only '.html' files should be included */ + /* - './page/ignored.html' will not be ignored because of the specific pattern */ + @source "./pages/**/*.html"; + `, + + '.gitignore': dedent` + /src/ignored + /ignored + /components/ignored.html + /pages/ignored.html + `, + + // (1) + 'index.html': 'content-["index.html"] content-["BAD"]', // "Root" source is in `./src` + 'src/index.html': 'content-["src/index.html"]', + 'src/nested/index.html': 'content-["src/nested/index.html"]', + 'src/index.jpg': 'content-["src/index.jpg"] content-["BAD"]', + 'src/nested/index.tar': 'content-["src/nested/index.tar"] content-["BAD"]', + 'src/ignored/index.html': 'content-["src/ignored/index.html"] content-["BAD"]', + + // (2) + 'ignored/components/my-component.html': 'content-["ignored/components/my-component.html"]', + 'ignored/components/my-component.jsx': 'content-["ignored/components/my-component.jsx"]', + + // Ignored and not explicitly listed by (2) + 'ignored/components/my-component.tsx': + 'content-["ignored/components/my-component.tsx"] content-["BAD"]', + 'ignored/components/nested/my-component.html': + 'content-["ignored/components/nested/my-component.html"] content-["BAD"]', + + // (3) + 'components/my-component.tsx': 'content-["components/my-component.tsx"]', + 'components/nested/my-component.tsx': 'content-["components/nested/my-component.tsx"]', + 'components/ignored.html': 'content-["components/ignored.html"] content-["BAD"]', + + // (4) + 'pages/foo.html': 'content-["pages/foo.html"]', + 'pages/nested/foo.html': 'content-["pages/nested/foo.html"]', + 'pages/ignored.html': 'content-["pages/ignored.html"]', + 'pages/foo.jsx': 'content-["pages/foo.jsx"] content-["BAD"]', + 'pages/nested/foo.jsx': 'content-["pages/nested/foo.jsx"] content-["BAD"]', + }, + }, + async ({ fs, exec, expect }) => { + await exec('pnpm postcss index.css --output dist/out.css') + + expect(await fs.dumpFiles('./dist/*.css')).toMatchInlineSnapshot(` + " + --- ./dist/out.css --- + .content-\\[\\"components\\/my-component\\.tsx\\"\\] { + --tw-content: "components/my-component.tsx"; + content: var(--tw-content); + } + .content-\\[\\"components\\/nested\\/my-component\\.tsx\\"\\] { + --tw-content: "components/nested/my-component.tsx"; + content: var(--tw-content); + } + .content-\\[\\"ignored\\/components\\/my-component\\.html\\"\\] { + --tw-content: "ignored/components/my-component.html"; + content: var(--tw-content); + } + .content-\\[\\"ignored\\/components\\/my-component\\.jsx\\"\\] { + --tw-content: "ignored/components/my-component.jsx"; + content: var(--tw-content); + } + .content-\\[\\"pages\\/foo\\.html\\"\\] { + --tw-content: "pages/foo.html"; + content: var(--tw-content); + } + .content-\\[\\"pages\\/ignored\\.html\\"\\] { + --tw-content: "pages/ignored.html"; + content: var(--tw-content); + } + .content-\\[\\"pages\\/nested\\/foo\\.html\\"\\] { + --tw-content: "pages/nested/foo.html"; + content: var(--tw-content); + } + 
.content-\\[\\"src\\/index\\.html\\"\\] { + --tw-content: "src/index.html"; + content: var(--tw-content); + } + .content-\\[\\"src\\/nested\\/index\\.html\\"\\] { + --tw-content: "src/nested/index.html"; + content: var(--tw-content); + } + @property --tw-content { + syntax: "*"; + inherits: false; + initial-value: ""; + } + " + `) + }, +) + +test( + 'auto source detection in depth, source(…) and `@source` can be configured to use auto source detection (build + watch mode)', + { + fs: { + 'package.json': json`{}`, + 'pnpm-workspace.yaml': yaml` + # + packages: + - project-a + `, + 'project-a/package.json': json` + { + "dependencies": { + "postcss": "^8", + "postcss-cli": "^10", + "tailwindcss": "workspace:^", + "@tailwindcss/postcss": "workspace:^" + } + } + `, + 'project-a/postcss.config.js': js` + module.exports = { + plugins: { + '@tailwindcss/postcss': {}, + }, + } + `, + 'project-a/src/index.css': css` + @reference 'tailwindcss/theme'; + + /* Run auto-content detection in ../../project-b */ + @import 'tailwindcss/utilities' source('../../project-b'); + + /* Explicitly using node_modules in the @source allows git ignored folders */ + @source '../node_modules/{my-lib-1,my-lib-2}/src/**/*.html'; + + /* We typically ignore these extensions, but now include them explicitly */ + @source './logo.{jpg,png}'; + + /* Project C should apply auto source detection */ + @source '../../project-c'; + + /* Project D should apply auto source detection rules, such as ignoring node_modules */ + @source '../../project-d/**/*.{html,js}'; + @source '../../project-d/**/*.bin'; + + /* Same as above, but my-lib-2 _should_ be includes */ + @source '../../project-d/node_modules/my-lib-2/src/*.{html,js}'; + + /* bar.html is git ignored, but explicitly listed here to scan */ + @source '../../project-d/src/bar.html'; + `, + + // Project A is the current folder, but we explicitly configured + // `source(project-b)`, therefore project-a should not be included in + // the output. + 'project-a/src/index.html': html` +
+        <div class="content-['project-a/src/index.html']"></div>
+      `,
+
+      // Project A explicitly includes an extension we usually ignore,
+      // therefore it should be included in the output.
+      'project-a/src/logo.jpg': html`
+        <div class="content-['project-a/src/logo.jpg']"></div>
+      `,
+
+      // Project A explicitly includes node_modules/{my-lib-1,my-lib-2},
+      // therefore these files should be included in the output.
+      'project-a/node_modules/my-lib-1/src/index.html': html`
+        <div class="content-['project-a/node_modules/my-lib-1/src/index.html']"></div>
+      `,
+      'project-a/node_modules/my-lib-2/src/index.html': html`
+        <div class="content-['project-a/node_modules/my-lib-2/src/index.html']"></div>
+      `,
+
+      // Project B is the configured `source(…)`, therefore auto source
+      // detection should include known extensions and folders in the output.
+      'project-b/src/index.html': html`
+        <div class="content-['project-b/src/index.html']"></div>
+      `,
+
+      // Project B is the configured `source(…)`, therefore auto source
+      // detection should apply and node_modules should not be included in the
+      // output.
+      'project-b/node_modules/my-lib-3/src/index.html': html`
+        <div class="content-['project-b/node_modules/my-lib-3/src/index.html']"></div>
+      `,
+
+      // Project C should apply auto source detection, therefore known
+      // extensions and folders should be included in the output.
+      'project-c/src/index.html': html`
+        <div class="content-['project-c/src/index.html']"></div>
+      `,
+
+      // Project C should apply auto source detection, therefore known ignored
+      // extensions should not be included in the output.
+      'project-c/src/logo.jpg': html`
+        <div class="content-['project-c/src/logo.jpg']"></div>
+      `,
+
+      // Project C should apply auto source detection, therefore node_modules
+      // should not be included in the output.
+      'project-c/node_modules/my-lib-1/src/index.html': html`
+        <div class="content-['project-c/node_modules/my-lib-1/src/index.html']"></div>
+      `,
+
+      // Project D should apply auto source detection rules, such as ignoring
+      // node_modules.
+      'project-d/node_modules/my-lib-1/src/index.html': html`
+        <div class="content-['project-d/node_modules/my-lib-1/src/index.html']"></div>
+      `,
+
+      // Project D has an explicit glob containing node_modules, thus should include the html file
+      'project-d/node_modules/my-lib-2/src/index.html': html`
+        <div class="content-['project-d/node_modules/my-lib-2/src/index.html']"></div>
+      `,
+
+      'project-d/src/.gitignore': dedent`
+        foo.html
+        bar.html
+      `,
+
+      // Project D, foo.html is ignored by the gitignore file but the source rule is explicit about
+      // adding all `.html` files.
+      'project-d/src/foo.html': html`
+        <div class="content-['project-d/src/foo.html']"></div>
+      `,
+
+      // Project D, bar.html is ignored by the gitignore file. But explicitly
+      // listed as a `@source` glob.
+      'project-d/src/bar.html': html`
+        <div class="content-['project-d/src/bar.html']"></div>
+      `,
+
+      // Project D should look for files with the extensions html and js.
+      'project-d/src/index.html': html`
+        <div class="content-['project-d/src/index.html']"></div>
+      `,
+
+      // Project D should have a binary file even though we ignore binary files
+      // by default, but it's explicitly listed.
+      'project-d/my-binary-file.bin': html`
+        <div class="content-['project-d/my-binary-file.bin']"></div>
+ `, + }, + }, + async ({ fs, exec, spawn, root, expect }) => { + await exec('pnpm postcss src/index.css --output dist/out.css --verbose', { + cwd: path.join(root, 'project-a'), + }) + + expect(await fs.dumpFiles('./project-a/dist/*.css')).toMatchInlineSnapshot(` + " + --- ./project-a/dist/out.css --- + .content-\\[\\'project-a\\/node_modules\\/my-lib-1\\/src\\/index\\.html\\'\\] { + --tw-content: 'project-a/node modules/my-lib-1/src/index.html'; + content: var(--tw-content); + } + .content-\\[\\'project-a\\/node_modules\\/my-lib-2\\/src\\/index\\.html\\'\\] { + --tw-content: 'project-a/node modules/my-lib-2/src/index.html'; + content: var(--tw-content); + } + .content-\\[\\'project-a\\/src\\/logo\\.jpg\\'\\] { + --tw-content: 'project-a/src/logo.jpg'; + content: var(--tw-content); + } + .content-\\[\\'project-b\\/src\\/index\\.html\\'\\] { + --tw-content: 'project-b/src/index.html'; + content: var(--tw-content); + } + .content-\\[\\'project-c\\/src\\/index\\.html\\'\\] { + --tw-content: 'project-c/src/index.html'; + content: var(--tw-content); + } + .content-\\[\\'project-d\\/my-binary-file\\.bin\\'\\] { + --tw-content: 'project-d/my-binary-file.bin'; + content: var(--tw-content); + } + .content-\\[\\'project-d\\/node_modules\\/my-lib-2\\/src\\/index\\.html\\'\\] { + --tw-content: 'project-d/node modules/my-lib-2/src/index.html'; + content: var(--tw-content); + } + .content-\\[\\'project-d\\/src\\/bar\\.html\\'\\] { + --tw-content: 'project-d/src/bar.html'; + content: var(--tw-content); + } + .content-\\[\\'project-d\\/src\\/foo\\.html\\'\\] { + --tw-content: 'project-d/src/foo.html'; + content: var(--tw-content); + } + .content-\\[\\'project-d\\/src\\/index\\.html\\'\\] { + --tw-content: 'project-d/src/index.html'; + content: var(--tw-content); + } + @property --tw-content { + syntax: "*"; + inherits: false; + initial-value: ""; + } + " + `) + + // Watch mode tests + let process = await spawn( + 'pnpm postcss src/index.css --output dist/out.css --watch --verbose', + { + cwd: path.join(root, 'project-a'), + }, + ) + await process.onStderr((message) => message.includes('Waiting for file changes...')) + + // Changes to project-a should not be included in the output, we changed the + // base folder to project-b. + await fs.write( + 'project-a/src/index.html', + html`
+        <div class="[.changed_&]:content-['project-a/src/index.html']"></div>
+      `,
+    )
+    await fs.expectFileNotToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-a/src/index.html']`,
+    ])
+
+    // Changes to this file should be included, because we explicitly listed
+    // them using `@source`.
+    await fs.write(
+      'project-a/src/logo.jpg',
+      html`<div class="[.changed_&]:content-['project-a/src/logo.jpg']"></div>`,
+    )
+    await fs.expectFileToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-a/src/logo.jpg']`,
+    ])
+
+    // Changes to these files should be included, because we explicitly listed
+    // them using `@source`.
+    await fs.write(
+      'project-a/node_modules/my-lib-1/src/index.html',
+      html`<div class="[.changed_&]:content-['project-a/node_modules/my-lib-1/src/index.html']"></div>`,
+    )
+    await fs.expectFileToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-a/node_modules/my-lib-1/src/index.html']`,
+    ])
+
+    await fs.write(
+      'project-a/node_modules/my-lib-2/src/index.html',
+      html`<div class="[.changed_&]:content-['project-a/node_modules/my-lib-2/src/index.html']"></div>`,
+    )
+    await fs.expectFileToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-a/node_modules/my-lib-2/src/index.html']`,
+    ])
+
+    // Changes to this file should be included, because we changed the base to
+    // `project-b`.
+    await fs.write(
+      'project-b/src/index.html',
+      html`<div class="[.changed_&]:content-['project-b/src/index.html']"></div>`,
+    )
+    await fs.expectFileToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-b/src/index.html']`,
+    ])
+
+    // Changes to this file should not be included. We did change the base to
+    // `project-b`, but we still apply the auto source detection rules which
+    // ignore `node_modules`.
+    await fs.write(
+      'project-b/node_modules/my-lib-3/src/index.html',
+      html`<div class="[.changed_&]:content-['project-b/node_modules/my-lib-3/src/index.html']"></div>`,
+    )
+    await fs.expectFileNotToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-b/node_modules/my-lib-3/src/index.html']`,
+    ])
+
+    // Project C was added explicitly via `@source`, therefore changes to these
+    // files should be included.
+    await fs.write(
+      'project-c/src/index.html',
+      html`<div class="[.changed_&]:content-['project-c/src/index.html']"></div>`,
+    )
+    await fs.expectFileToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-c/src/index.html']`,
+    ])
+
+    // Except for these files, since they are ignored by the default auto source
+    // detection rules.
+    await fs.write(
+      'project-c/src/logo.jpg',
+      html`<div class="[.changed_&]:content-['project-c/src/logo.jpg']"></div>`,
+    )
+    await fs.expectFileNotToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-c/src/logo.jpg']`,
+    ])
+    await fs.write(
+      'project-c/node_modules/my-lib-1/src/index.html',
+      html`<div class="[.changed_&]:content-['project-c/node_modules/my-lib-1/src/index.html']"></div>`,
+    )
+    await fs.expectFileNotToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-c/node_modules/my-lib-1/src/index.html']`,
+    ])
+
+    // Creating new files in the "root" of auto source detected folders
+    // We need to create the files and *then* update them because postcss-cli
+    // does not pick up new files — only changes to existing files.
+    await fs.create([
+      'project-b/new-file.html',
+      'project-b/new-folder/new-file.html',
+      'project-c/new-file.html',
+      'project-c/new-folder/new-file.html',
+    ])
+
+    // If we don't wait, writes will be coalesced into an "add" event which
+    // isn't picked up by postcss-cli.
+    await new Promise((resolve) => setTimeout(resolve, 100))
+
+    await fs.write(
+      'project-b/new-file.html',
+      html`<div class="[.created_&]:content-['project-b/new-file.html']"></div>`,
+    )
+    await fs.write(
+      'project-b/new-folder/new-file.html',
+      html`<div class="[.created_&]:content-['project-b/new-folder/new-file.html']"></div>`,
+    )
+    await fs.write(
+      'project-c/new-file.html',
+      html`<div class="[.created_&]:content-['project-c/new-file.html']"></div>`,
+    )
+    await fs.write(
+      'project-c/new-folder/new-file.html',
+      html`<div class="[.created_&]:content-['project-c/new-folder/new-file.html']"></div>
`,
+    )
+
+    await fs.expectFileToContain('./project-a/dist/out.css', [
+      candidate`[.created_&]:content-['project-b/new-file.html']`,
+      candidate`[.created_&]:content-['project-b/new-folder/new-file.html']`,
+      candidate`[.created_&]:content-['project-c/new-file.html']`,
+      candidate`[.created_&]:content-['project-c/new-folder/new-file.html']`,
+    ])
+  },
+)
+
+test(
+  '`@source not "…"`',
+  {
+    fs: {
+      'package.json': json`{}`,
+      'pnpm-workspace.yaml': yaml`
+        #
+        packages:
+          - project-a
+      `,
+      'project-a/package.json': json`
+        {
+          "dependencies": {
+            "postcss": "^8",
+            "postcss-cli": "^10",
+            "tailwindcss": "workspace:^",
+            "@tailwindcss/postcss": "workspace:^"
+          }
+        }
+      `,
+      'project-a/postcss.config.js': js`
+        module.exports = {
+          plugins: {
+            '@tailwindcss/postcss': {},
+          },
+        }
+      `,
+      'project-a/src/index.css': css`
+        @reference 'tailwindcss/theme';
+        @import 'tailwindcss/utilities';
+
+        /* Ignore a specific file */
+        @source not "./ignore-me-file.html";
+
+        /* Ignore an entire folder */
+        @source not "./ignore-me-folder";
+
+        /* Ignore an extension */
+        @source not "**/*.ts";
+
+        /* Explicit source detection for 'project-b' */
+        @source "../../project-b/**/*.html";
+
+        /* Explicitly ignoring a file in 'project-b' */
+        @source not "../../project-b/src/ignore-me.html";
+      `,
+      'project-a/src/ignore-me-file.html': html`
+        <div class="content-['project-a/src/ignore-me-file.html']"></div>
+      `,
+      'project-a/src/ignore-me-folder/index.html': html`
+        <div class="content-['project-a/src/ignore-me-folder/index.html']"></div>
+      `,
+      'project-a/src/keep-me.html': html`
+        <div class="content-['keep-me.html']"></div>
+      `,
+      'project-a/src/ignore-me-extension.ts': html`
+        <div class="content-['project-a/src/ignore-me-extension.ts']"></div>
+      `,
+      'project-b/src/ignore-me.html': html`
+        <div class="content-['project-b/src/ignore-me.html']"></div>
+      `,
+      'project-b/src/keep-me.html': html`
+        <div class="content-['project-b/src/keep-me.html']"></div>
+ `, + }, + }, + async ({ fs, exec, spawn, root, expect }) => { + await exec('pnpm postcss src/index.css --output dist/out.css --verbose', { + cwd: path.join(root, 'project-a'), + }) + + expect(await fs.dumpFiles('./project-a/dist/*.css')).toMatchInlineSnapshot(` + " + --- ./project-a/dist/out.css --- + .content-\\[\\'keep-me\\.html\\'\\] { + --tw-content: 'keep-me.html'; + content: var(--tw-content); + } + .content-\\[\\'project-b\\/src\\/keep-me\\.html\\'\\] { + --tw-content: 'project-b/src/keep-me.html'; + content: var(--tw-content); + } + @property --tw-content { + syntax: "*"; + inherits: false; + initial-value: ""; + } + " + `) + + // Watch mode tests + let process = await spawn( + 'pnpm postcss src/index.css --output dist/out.css --watch --verbose', + { + cwd: path.join(root, 'project-a'), + }, + ) + await process.onStderr((message) => message.includes('Waiting for file changes...')) + + fs.expectFileNotToContain('./project-a/dist/out.css', [ + candidate`content-['project-a/src/ignore-me-file.html']`, + candidate`content-['project-a/src/ignore-me-folder/index.html']`, + candidate`content-['project-b/src/ignore-me.html']`, + ]) + + // Changes to the keep-me files should be included + await fs.write( + 'project-a/src/keep-me.html', + html`
+        <div class="[.changed_&]:content-['project-a/src/keep-me.html']"></div>
+      `,
+    )
+    await fs.expectFileToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-a/src/keep-me.html']`,
+    ])
+
+    await fs.write(
+      'project-b/src/keep-me.html',
+      html`<div class="[.changed_&]:content-['project-b/src/keep-me.html']"></div>`,
+    )
+    await fs.expectFileToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-b/src/keep-me.html']`,
+    ])
+
+    // Changes to the ignored files should not be included
+    await fs.write(
+      'project-a/src/ignore-me.html',
+      html`<div class="[.changed_&]:content-['project-a/src/ignore-me.html']"></div>`,
+    )
+    await fs.expectFileNotToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-a/src/ignore-me.html']`,
+    ])
+
+    await fs.write(
+      'project-b/src/ignore-me.html',
+      html`<div class="[.changed_&]:content-['project-b/src/ignore-me.html']"></div>`,
+    )
+    await fs.expectFileNotToContain('./project-a/dist/out.css', [
+      candidate`[.changed_&]:content-['project-b/src/ignore-me.html']`,
+    ])
+
+    fs.expectFileNotToContain('./project-a/dist/out.css', [
+      candidate`content-['project-a/src/ignore-me-file.html']`,
+      candidate`content-['project-a/src/ignore-me-folder/index.html']`,
+      candidate`content-['project-b/src/ignore-me.html']`,
+    ])
+
+    // Creating new files that match the source patterns should be included.
+    await fs.create([
+      'project-a/src/new-file.html',
+      'project-a/src/new-folder/new-file.html',
+      'project-b/src/new-file.html',
+      'project-b/src/new-folder/new-file.html',
+    ])
+
+    await fs.write(
+      'project-a/src/new-file.html',
+      html`<div class="[.created_&]:content-['project-a/src/new-file.html']"></div>`,
+    )
+    await fs.write(
+      'project-a/src/new-folder/new-file.html',
+      html`<div class="[.created_&]:content-['project-a/src/new-folder/new-file.html']"></div>`,
+    )
+    await fs.write(
+      'project-b/src/new-file.html',
+      html`<div class="[.created_&]:content-['project-b/src/new-file.html']"></div>`,
+    )
+    await fs.write(
+      'project-b/src/new-folder/new-file.html',
+      html`<div class="[.created_&]:content-['project-b/src/new-folder/new-file.html']"></div>
`, + ) + + // If we don't wait writes will be coalesced into a "add" event which + // isn't picked up by postcss-cli. + await new Promise((resolve) => setTimeout(resolve, 100)) + + await fs.expectFileToContain('./project-a/dist/out.css', [ + candidate`[.created_&]:content-['project-a/src/new-file.html']`, + candidate`[.created_&]:content-['project-a/src/new-folder/new-file.html']`, + candidate`[.created_&]:content-['project-b/src/new-file.html']`, + candidate`[.created_&]:content-['project-b/src/new-folder/new-file.html']`, + ]) + + fs.expectFileNotToContain('./project-a/dist/out.css', [ + candidate`content-['project-a/src/ignore-me-file.html']`, + candidate`content-['project-a/src/ignore-me-folder/index.html']`, + candidate`content-['project-b/src/ignore-me.html']`, + ]) + }, +) diff --git a/packages/tailwindcss/src/index.test.ts b/packages/tailwindcss/src/index.test.ts index 619ed203715a..9a4279d546ee 100644 --- a/packages/tailwindcss/src/index.test.ts +++ b/packages/tailwindcss/src/index.test.ts @@ -3261,6 +3261,21 @@ describe('@source', () => { ]) }) + test('emits negated @source files', async () => { + let { sources } = await compile( + css` + @source not "./foo/**/*.ts"; + @source not "./php/secr3t/smarty.php"; + `, + { base: '/root' }, + ) + + expect(sources).toEqual([ + { pattern: './foo/**/*.ts', base: '/root', negated: true }, + { pattern: './php/secr3t/smarty.php', base: '/root', negated: true }, + ]) + }) + describe('@source inline(…)', () => { test('always includes the candidate', async () => { let { build } = await compile( From 878782ecacdc99924f03f52c4cc6e9fe605603fe Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 17:47:53 +0100 Subject: [PATCH 13/26] use `normalizedSources` in the CLI We still emit `globs` for `@tailwindcss/postcss`, but we also added better normalized globs based on your `@source` directives. We returned them as a new property to stay backward compatible, but this also means that the `@tailwindcss/cli` can make use of this. --- .../@tailwindcss-cli/src/commands/build/index.ts | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/packages/@tailwindcss-cli/src/commands/build/index.ts b/packages/@tailwindcss-cli/src/commands/build/index.ts index b2252afc50d9..8eb60d90286b 100644 --- a/packages/@tailwindcss-cli/src/commands/build/index.ts +++ b/packages/@tailwindcss-cli/src/commands/build/index.ts @@ -334,18 +334,6 @@ export async function handle(args: Result>) { eprintln(`Done in ${formatDuration(end - start)}`) } -function watchDirectories(scanner: Scanner) { - return scanner.globs.flatMap((globEntry) => { - // We don't want a watcher for negated globs. - if (globEntry.pattern[0] === '!') return [] - - // We don't want a watcher for files, only directories. - if (globEntry.pattern === '') return [] - - return globEntry.base - }) -} - async function createWatchers(dirs: string[], cb: (files: string[]) => void) { // Remove any directories that are children of an already watched directory. // If we don't we may not get notified of certain filesystem events regardless @@ -474,3 +462,7 @@ function optimizeCss( // nesting is applied. This creates a more optimized output. 
   return optimize(optimize(Buffer.from(input))).toString()
 }
+
+function watchDirectories(scanner: Scanner) {
+  return [...new Set(scanner.normalizedSources.flatMap((globEntry) => globEntry.base))]
+}

From a7bef4ec75040e1702f84764ca7e7d234a09acc8 Mon Sep 17 00:00:00 2001
From: Robin Malfait
Date: Fri, 21 Mar 2025 17:48:55 +0100
Subject: [PATCH 14/26] add `.gitignore` as a default ignored file

---
 crates/oxide/src/scanner/fixtures/ignored-files.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/oxide/src/scanner/fixtures/ignored-files.txt b/crates/oxide/src/scanner/fixtures/ignored-files.txt
index 45d4ced87afd..d2d231ec7b0d 100644
--- a/crates/oxide/src/scanner/fixtures/ignored-files.txt
+++ b/crates/oxide/src/scanner/fixtures/ignored-files.txt
@@ -1,3 +1,4 @@
 package-lock.json
 pnpm-lock.yaml
 bun.lockb
+.gitignore

From 8d154d112afc9bd902ded076ff9a30955cc16786 Mon Sep 17 00:00:00 2001
From: Robin Malfait
Date: Fri, 21 Mar 2025 18:07:20 +0100
Subject: [PATCH 15/26] add `node_modules` as a default ignored folder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some people don't have `node_modules` in their `.gitignore` file, so in some
scenarios it would always be scanned. Another example is deploying with the
Vercel CLI, because `.gitignore` files are not pushed there.

This is technically a breaking change, but now that we have `@source not '…'`,
you can still include files or folders from `node_modules` even though the
folder is ignored by default.

---
 crates/oxide/src/scanner/auto_source_detection.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/oxide/src/scanner/auto_source_detection.rs b/crates/oxide/src/scanner/auto_source_detection.rs
index 9d2e3705c6f8..62bccb5df792 100644
--- a/crates/oxide/src/scanner/auto_source_detection.rs
+++ b/crates/oxide/src/scanner/auto_source_detection.rs
@@ -51,4 +51,4 @@ static IGNORED_FILES: sync::LazyLock<Vec<&'static str>> = sync::LazyLock::new(||
 });
 
 static IGNORED_CONTENT_DIRS: sync::LazyLock<Vec<&'static str>> =
-    sync::LazyLock::new(|| vec![".git"]);
+    sync::LazyLock::new(|| vec![".git", "node_modules"]);

From 38ed37e0446306a7b6c7c013974991af37b9c3ee Mon Sep 17 00:00:00 2001
From: Robin Malfait
Date: Fri, 21 Mar 2025 18:19:15 +0100
Subject: [PATCH 16/26] add `enableSourceNot` feature flag

---
 packages/tailwindcss/src/feature-flags.ts |  1 +
 packages/tailwindcss/src/index.ts         | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/packages/tailwindcss/src/feature-flags.ts b/packages/tailwindcss/src/feature-flags.ts
index 203bfe335af8..d00d1d45b1ae 100644
--- a/packages/tailwindcss/src/feature-flags.ts
+++ b/packages/tailwindcss/src/feature-flags.ts
@@ -5,5 +5,6 @@ export const enablePointerVariants = process.env.FEATURES_ENV !== 'stable'
 export const enableSafeAlignment = process.env.FEATURES_ENV !== 'stable'
 export const enableScripting = process.env.FEATURES_ENV !== 'stable'
 export const enableSourceInline = process.env.FEATURES_ENV !== 'stable'
+export const enableSourceNot = process.env.FEATURES_ENV !== 'stable'
 export const enableUserValid = process.env.FEATURES_ENV !== 'stable'
 export const enableWrapAnywhere = process.env.FEATURES_ENV !== 'stable'

diff --git a/packages/tailwindcss/src/index.ts b/packages/tailwindcss/src/index.ts
index e1ac3dd8dc8b..d8d4f73315ea 100644
--- a/packages/tailwindcss/src/index.ts
+++ b/packages/tailwindcss/src/index.ts
@@ -26,7 +26,7 @@ import { applyVariant, compileCandidates } from './compile'
 import { substituteFunctions } from './css-functions'
import * as CSS from './css-parser' import { buildDesignSystem, type DesignSystem } from './design-system' -import { enableSourceInline } from './feature-flags' +import { enableSourceInline, enableSourceNot } from './feature-flags' import { Theme, ThemeOptions } from './theme' import { createCssUtility } from './utilities' import { expand } from './utils/brace-expansion' @@ -216,12 +216,14 @@ async function parseCss( let inline = false let path = node.params - if (enableSourceInline) { + if (enableSourceNot) { if (path[0] === 'n' && path.startsWith('not ')) { not = true path = path.slice(4) } + } + if (enableSourceInline) { if (path[0] === 'i' && path.startsWith('inline(')) { inline = true path = path.slice(7, -1) @@ -247,7 +249,11 @@ async function parseCss( } } } else { - sources.push({ base: context.base as string, pattern: source, negated: not }) + sources.push({ + base: context.base as string, + pattern: source, + negated: enableSourceNot ? not : false, + }) } replaceWith([]) return From 0f737971c88a3ab38556f7df6fcc78bbbe314cf4 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Fri, 21 Mar 2025 18:19:43 +0100 Subject: [PATCH 17/26] run prettier --- crates/ignore/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/ignore/README.md b/crates/ignore/README.md index 72258e6b5824..a4c34e505cf3 100644 --- a/crates/ignore/README.md +++ b/crates/ignore/README.md @@ -1,5 +1,5 @@ -ignore -====== +# ignore + The ignore crate provides a fast recursive directory iterator that respects various filters such as globs, file types and `.gitignore` files. This crate also provides lower level direct access to gitignore and file type matchers. @@ -29,7 +29,6 @@ recursively traverse the current directory while automatically filtering out files and directories according to ignore globs found in files like `.ignore` and `.gitignore`: - ```rust,no_run use ignore::Walk; From 97fc4bf0ea4f430bdae48eaf6b117178ee4c081a Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 11:50:57 +0100 Subject: [PATCH 18/26] Simplify gitignore order change --- crates/ignore/src/dir.rs | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/crates/ignore/src/dir.rs b/crates/ignore/src/dir.rs index 69eba476048c..9bbf1442b382 100644 --- a/crates/ignore/src/dir.rs +++ b/crates/ignore/src/dir.rs @@ -491,34 +491,25 @@ impl Ignore { // CHANGED: We added logic to configure an order in which the ignore files are respected and // allowed a whitelist in a later file to overrule a block on an earlier file. 
let order = [ - // Global gitignore - &m_global, - // .git/info/exclude - &m_gi_exclude, - // .gitignore - &m_gi, - // .ignore - &m_ignore, - // .custom-ignore - &m_custom_ignore, // Manually added ignores &m_explicit, + // .custom-ignore + &m_custom_ignore, + // .ignore + &m_ignore, + // .gitignore + &m_gi, + // .git/info/exclude + &m_gi_exclude, + // Global gitignore + &m_global, ]; - for (idx, check) in order.into_iter().enumerate() { + for check in order.into_iter() { if check.is_none() { continue; } - let remaining = &order[idx + 1..]; - if check.is_ignore() { - if remaining.iter().any(|other| other.is_whitelist()) { - continue; - } - } else if remaining.iter().any(|other| other.is_ignore()) { - continue; - } - return check.clone(); } From 94473d3bc98d80d76abe090f232ccb0f928ec836 Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 11:51:35 +0100 Subject: [PATCH 19/26] Rename GitHub CI var --- .github/workflows/ci.yml | 6 +++--- .github/workflows/integration-tests.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9544baa663bb..2ebe99d27c96 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,13 +25,13 @@ jobs: os: macos-14 # Exclude windows and macos from being built on feature branches - on-main-branch: + run-all: - ${{ github.ref == 'refs/heads/main' || contains(github.event.pull_request.body, '[ci-all]') }} exclude: - - on-main-branch: false + - run-all: false runner: name: Windows - - on-main-branch: false + - run-all: false runner: name: macOS diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 4fa0900ef9fd..761b769d75e2 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -33,13 +33,13 @@ jobs: - workers # Exclude windows and macos from being built on feature branches - on-main-branch: + run-all: - ${{ github.ref == 'refs/heads/main' || contains(github.event.pull_request.body, '[ci-all]') }} exclude: - - on-main-branch: false + - run-all: false runner: name: Windows - - on-main-branch: false + - run-all: false runner: name: macOS From 7442baaf63cf4a976598678a4fcf3ad1074ce7d0 Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 11:52:44 +0100 Subject: [PATCH 20/26] Explicitly mark test functions --- crates/oxide/tests/scanner.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/oxide/tests/scanner.rs b/crates/oxide/tests/scanner.rs index 47dceb5b5134..c4e4d7eff653 100644 --- a/crates/oxide/tests/scanner.rs +++ b/crates/oxide/tests/scanner.rs @@ -15,6 +15,7 @@ mod scanner { candidates: Vec, } + #[cfg(test)] fn create_files_in(dir: &path::Path, paths: &[(&str, &str)]) { // Create the necessary files for (path, contents) in paths { @@ -29,6 +30,7 @@ mod scanner { } } + #[cfg(test)] fn scan_with_globs( paths_with_content: &[(&str, &str)], source_directives: Vec<&str>, @@ -106,6 +108,7 @@ mod scanner { } } + #[cfg(test)] fn scan(paths_with_content: &[(&str, &str)]) -> ScanResult { scan_with_globs(paths_with_content, vec!["@source '**/*'"]) } From 2425aef95c9bccc9c9dae74ffa43745612fe13d0 Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 12:42:24 +0100 Subject: [PATCH 21/26] Handle source paths into ignored content dirs as "external" --- .../src/scanner/auto_source_detection.rs | 63 ++++++++++-------- .../scanner/fixtures/ignored-content-dirs.txt | 2 + crates/oxide/src/scanner/mod.rs | 52 ++++++++++----- 
 crates/oxide/src/scanner/sources.rs           | 60 +++++++++--------
 crates/oxide/tests/scanner.rs                 | 65 ++++++++++++-----
 5 files changed, 153 insertions(+), 89 deletions(-)
 create mode 100644 crates/oxide/src/scanner/fixtures/ignored-content-dirs.txt

diff --git a/crates/oxide/src/scanner/auto_source_detection.rs b/crates/oxide/src/scanner/auto_source_detection.rs
index 62bccb5df792..e9b7f64aacbc 100644
--- a/crates/oxide/src/scanner/auto_source_detection.rs
+++ b/crates/oxide/src/scanner/auto_source_detection.rs
@@ -13,42 +13,53 @@ use std::sync;
 pub static RULES: sync::LazyLock<Gitignore> = sync::LazyLock::new(|| {
     let mut builder = GitignoreBuilder::new("");

-    builder
-        .add_line(None, &format!("{{{}}}", IGNORED_CONTENT_DIRS.join(",")))
-        .unwrap();
-    builder
-        .add_line(None, &format!("*.{{{}}}", IGNORED_EXTENSIONS.join(",")))
-        .unwrap();
-    builder
-        .add_line(None, &format!("*.{{{}}}", BINARY_EXTENSIONS.join(",")))
-        .unwrap();
-    builder
-        .add_line(None, &format!("{{{}}}", IGNORED_FILES.join(",")))
-        .unwrap();
+    builder.add_line(None, &IGNORED_CONTENT_DIRS_GLOB).unwrap();
+    builder.add_line(None, &IGNORED_EXTENSIONS_GLOB).unwrap();
+    builder.add_line(None, &BINARY_EXTENSIONS_GLOB).unwrap();
+    builder.add_line(None, &IGNORED_FILES_GLOB).unwrap();

     builder.build().unwrap()
 });

-static BINARY_EXTENSIONS: sync::LazyLock<Vec<&'static str>> = sync::LazyLock::new(|| {
-    include_str!("fixtures/binary-extensions.txt")
+pub static IGNORED_CONTENT_DIRS: sync::LazyLock<Vec<&'static str>> = sync::LazyLock::new(|| {
+    include_str!("fixtures/ignored-content-dirs.txt")
         .trim()
         .lines()
         .collect()
 });

-static IGNORED_EXTENSIONS: sync::LazyLock<Vec<&'static str>> = sync::LazyLock::new(|| {
-    include_str!("fixtures/ignored-extensions.txt")
-        .trim()
-        .lines()
-        .collect()
+static IGNORED_CONTENT_DIRS_GLOB: sync::LazyLock<String> =
+    sync::LazyLock::new(|| format!("{{{}}}/", IGNORED_CONTENT_DIRS.join(",")));
+
+static IGNORED_EXTENSIONS_GLOB: sync::LazyLock<String> = sync::LazyLock::new(|| {
+    format!(
+        "*.{{{}}}",
+        include_str!("fixtures/ignored-extensions.txt")
+            .trim()
+            .lines()
+            .collect::<Vec<_>>()
+            .join(",")
+    )
 });

-static IGNORED_FILES: sync::LazyLock<Vec<&'static str>> = sync::LazyLock::new(|| {
-    include_str!("fixtures/ignored-files.txt")
-        .trim()
-        .lines()
-        .collect()
+pub static BINARY_EXTENSIONS_GLOB: sync::LazyLock<String> = sync::LazyLock::new(|| {
+    format!(
+        "*.{{{}}}",
+        include_str!("fixtures/binary-extensions.txt")
+            .trim()
+            .lines()
+            .collect::<Vec<_>>()
+            .join(",")
+    )
 });

-static IGNORED_CONTENT_DIRS: sync::LazyLock<Vec<&'static str>> =
-    sync::LazyLock::new(|| vec![".git", "node_modules"]);
+static IGNORED_FILES_GLOB: sync::LazyLock<String> = sync::LazyLock::new(|| {
+    format!(
+        "{{{}}}",
+        include_str!("fixtures/ignored-files.txt")
+            .trim()
+            .lines()
+            .collect::<Vec<_>>()
+            .join(",")
+    )
+});
diff --git a/crates/oxide/src/scanner/fixtures/ignored-content-dirs.txt b/crates/oxide/src/scanner/fixtures/ignored-content-dirs.txt
new file mode 100644
index 000000000000..85dcc16df69a
--- /dev/null
+++ b/crates/oxide/src/scanner/fixtures/ignored-content-dirs.txt
@@ -0,0 +1,2 @@
+.git
+node_modules
diff --git a/crates/oxide/src/scanner/mod.rs b/crates/oxide/src/scanner/mod.rs
index c5dd131a8f53..445dfc98372b 100644
--- a/crates/oxide/src/scanner/mod.rs
+++ b/crates/oxide/src/scanner/mod.rs
@@ -9,6 +9,7 @@ use crate::scanner::sources::{
     public_source_entries_to_private_source_entries, PublicSourceEntry, SourceEntry, Sources,
 };
 use crate::GlobEntry;
+use auto_source_detection::BINARY_EXTENSIONS_GLOB;
 use bstr::ByteSlice;
 use fast_glob::glob_match;
 use fxhash::{FxHashMap, FxHashSet};
@@ -273,14 +274,18 @@ impl Scanner {
         self.scan_sources();

         for source in self.sources.iter() {
-            if let SourceEntry::Auto { base } = source {
-                let globs = resolve_globs((base).to_path_buf(), &self.dirs, &self.extensions);
-                self.globs.extend(globs);
-            } else if let SourceEntry::Pattern { base, pattern } = source {
-                self.globs.push(GlobEntry {
-                    base: base.to_string_lossy().to_string(),
-                    pattern: pattern.to_string(),
-                });
+            match source {
+                SourceEntry::Auto { base } | SourceEntry::External { base } => {
+                    let globs = resolve_globs((base).to_path_buf(), &self.dirs, &self.extensions);
+                    self.globs.extend(globs);
+                }
+                SourceEntry::Pattern { base, pattern } => {
+                    self.globs.push(GlobEntry {
+                        base: base.to_string_lossy().to_string(),
+                        pattern: pattern.to_string(),
+                    });
+                }
+                _ => {}
             }
         }

@@ -295,7 +300,7 @@ impl Scanner {
         self.sources
             .iter()
             .filter_map(|source| match source {
-                SourceEntry::Auto { base } => Some(GlobEntry {
+                SourceEntry::Auto { base } | SourceEntry::External { base } => Some(GlobEntry {
                     base: base.to_string_lossy().to_string(),
                     pattern: "**/*".to_string(),
                 }),
@@ -445,21 +450,15 @@ fn create_walker(sources: Sources) -> Option<WalkBuilder> {
     let mut first_root: Option<&PathBuf> = None;
     let mut ignores: BTreeMap<&PathBuf, BTreeSet<String>> = Default::default();

-    let mut auto_content_roots = FxHashSet::default();
-
     for source in sources.iter() {
         match source {
             SourceEntry::Auto { base } => {
-                auto_content_roots.insert(base);
                 if first_root.is_none() {
                     first_root = Some(base);
                 } else {
                     other_roots.insert(base);
                 }
             }
-            SourceEntry::IgnoredAuto { base } => {
-                ignores.entry(base).or_default().insert("**/*".to_string());
-            }
             SourceEntry::Pattern { base, pattern } => {
                 let mut pattern = pattern.to_string();

@@ -492,7 +491,7 @@ fn create_walker(sources: Sources) -> Option<WalkBuilder> {
                     }
                 }
             }
-            SourceEntry::IgnoredPattern { base, pattern } => {
+            SourceEntry::Ignored { base, pattern } => {
                 let mut pattern = pattern.to_string();

                 // Ensure that the pattern is pinned to the base path.
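                // E.g. (illustrative): an ignore pattern like `dist` becomes `/dist`,
                // so it only applies relative to this source's base directory rather
                // than matching at any depth the way a bare gitignore-style pattern
                // would.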
                if !pattern.starts_with("/") {
                    pattern = format!("/{pattern}");
                }

                ignores.entry(base).or_default().insert(pattern);
            }
+            SourceEntry::External { base } => {
+                if first_root.is_none() {
+                    first_root = Some(base);
+                } else {
+                    other_roots.insert(base);
+                }
+
+                // External sources should take precedence even over git-ignored files:
+                ignores
+                    .entry(base)
+                    .or_default()
+                    .insert(format!("!{}", "/**/*"));
+
+                // External sources should still disallow binary extensions:
+                ignores
+                    .entry(base)
+                    .or_default()
+                    .insert(BINARY_EXTENSIONS_GLOB.clone());
+            }
         }
     }
@@ -580,7 +598,7 @@ fn create_walker(sources: Sources) -> Option<WalkBuilder> {
             let mut matches = false;
             for source in sources.iter() {
                 match source {
-                    SourceEntry::Auto { base } => {
+                    SourceEntry::Auto { base } | SourceEntry::External { base } => {
                         if path.starts_with(base) {
                             matches = true;
                             break;
diff --git a/crates/oxide/src/scanner/sources.rs b/crates/oxide/src/scanner/sources.rs
index cc0f8805a45b..56101b3f4f25 100644
--- a/crates/oxide/src/scanner/sources.rs
+++ b/crates/oxide/src/scanner/sources.rs
@@ -3,6 +3,8 @@
 use crate::GlobEntry;
 use bexpand::Expression;
 use std::path::PathBuf;
+
+use super::auto_source_detection::IGNORED_CONTENT_DIRS;

 #[derive(Debug, Clone)]
 pub struct PublicSourceEntry {
     /// Base path of the glob
@@ -27,33 +29,34 @@ pub enum SourceEntry {
     /// ```
     Auto { base: PathBuf },

-    /// Ignored auto source detection
+    /// Explicit source pattern regardless of any auto source detection rules
     ///
     /// Represented by:
     ///
     /// ```css
-    /// @source not "src";`
-    /// @source not "src/**/*";`
+    /// @source "src/**/*.html";
     /// ```
-    IgnoredAuto { base: PathBuf },
+    Pattern { base: PathBuf, pattern: String },

-    /// Explicit source pattern regardless of any auto source detection rules
+    /// Ignored pattern
     ///
     /// Represented by:
     ///
     /// ```css
-    /// @source "src/**/*.html";`
+    /// @source not "src";
+    /// @source not "src/**/*.html";
     /// ```
-    Pattern { base: PathBuf, pattern: String },
+    Ignored { base: PathBuf, pattern: String },

-    /// Explicit ignored source pattern regardless of any auto source detection rules
+    /// External sources are sources outside of your git root which should not
+    /// follow gitignore rules.
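+    /// (For example, a base that resolves into an ignored content directory
+    /// such as `node_modules` is classified as external, so the project's own
+    /// `.gitignore` rules will not exclude it.)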
     ///
     /// Represented by:
     ///
     /// ```css
-    /// @source not "src/**/*.html";`
+    /// @source "../node_modules/my-lib";
     /// ```
-    IgnoredPattern { base: PathBuf, pattern: String },
+    External { base: PathBuf },
 }

 #[derive(Debug, Clone, Default)]
@@ -153,18 +156,27 @@ impl From<PublicSourceEntry> for SourceEntry {
         let auto = value.pattern.ends_with("**/*")
             || PathBuf::from(&value.base).join(&value.pattern).is_dir();

-        match (value.negated, auto) {
-            (false, true) => SourceEntry::Auto {
+        let inside_ignored_content_dir = IGNORED_CONTENT_DIRS.iter().any(|dir| {
+            value.base.contains(&format!(
+                "{}{}{}",
+                std::path::MAIN_SEPARATOR,
+                dir,
+                std::path::MAIN_SEPARATOR
+            ))
+        });
+
+        match (value.negated, auto, inside_ignored_content_dir) {
+            (false, true, false) => SourceEntry::Auto {
                 base: value.base.into(),
             },
-            (false, false) => SourceEntry::Pattern {
+            (false, true, true) => SourceEntry::External {
                 base: value.base.into(),
-                pattern: value.pattern,
             },
-            (true, true) => SourceEntry::IgnoredAuto {
+            (false, false, _) => SourceEntry::Pattern {
                 base: value.base.into(),
+                pattern: value.pattern,
             },
-            (true, false) => SourceEntry::IgnoredPattern {
+            (true, _, _) => SourceEntry::Ignored {
                 base: value.base.into(),
                 pattern: value.pattern,
             },
@@ -184,7 +196,7 @@ impl From<PublicSourceEntry> for SourceEntry {
 impl From<SourceEntry> for GlobEntry {
     fn from(value: SourceEntry) -> Self {
         match value {
-            SourceEntry::Auto { base } => GlobEntry {
+            SourceEntry::Auto { base } | SourceEntry::External { base } => GlobEntry {
                 base: base.to_string_lossy().into(),
                 pattern: "**/*".into(),
             },
@@ -192,11 +204,7 @@ impl From<SourceEntry> for GlobEntry {
                 base: base.to_string_lossy().into(),
                 pattern: pattern.clone(),
             },
-            SourceEntry::IgnoredAuto { base } => GlobEntry {
-                base: base.to_string_lossy().into(),
-                pattern: "**/*".into(),
-            },
-            SourceEntry::IgnoredPattern { base, pattern } => GlobEntry {
+            SourceEntry::Ignored { base, pattern } => GlobEntry {
                 base: base.to_string_lossy().into(),
                 pattern: pattern.clone(),
             },
@@ -207,7 +215,7 @@
 impl From<&SourceEntry> for GlobEntry {
     fn from(value: &SourceEntry) -> Self {
         match value {
-            SourceEntry::Auto { base } => GlobEntry {
+            SourceEntry::Auto { base } | SourceEntry::External { base } => GlobEntry {
                 base: base.to_string_lossy().into(),
                 pattern: "**/*".into(),
             },
@@ -215,11 +223,7 @@
                 base: base.to_string_lossy().into(),
                 pattern: pattern.clone(),
             },
-            SourceEntry::IgnoredAuto { base } => GlobEntry {
-                base: base.to_string_lossy().into(),
-                pattern: "**/*".into(),
-            },
-            SourceEntry::IgnoredPattern { base, pattern } => GlobEntry {
+            SourceEntry::Ignored { base, pattern } => GlobEntry {
                 base: base.to_string_lossy().into(),
                 pattern: pattern.clone(),
             },
diff --git a/crates/oxide/tests/scanner.rs b/crates/oxide/tests/scanner.rs
index c4e4d7eff653..cd0946761565 100644
--- a/crates/oxide/tests/scanner.rs
+++ b/crates/oxide/tests/scanner.rs
@@ -944,7 +944,7 @@ mod scanner {
     }

     #[test]
-    fn it_respects_gitignore_in_workspace_root2() {
+    fn it_respects_gitignore_in_workspace_root() {
         let ScanResult {
             candidates,
             files,
@@ -1417,38 +1417,67 @@ mod scanner {
         );
     }

-    // TODO: external(…) so that `.gitignore` from main project doesn't apply to external projects
     #[test]
-    #[ignore]
-    fn test_ignore_files_in_node_modules() {
-        // Create a temporary working directory
-        let dir = tempdir().unwrap().into_path();
+    fn test_ignore_node_modules_without_gitignore() {
+        let ScanResult {
+            candidates,
+            files,
+            globs,
+            normalized_sources,
+        } = scan_with_globs(
+            &[
+                (
"packages/web/index.html", + "content-['packages/web/index.html']", + ), + ( + "node_modules/index.html", + "content-['node_modules/index.html']", + ), + ( + "packages/web/node_modules/index.html", + "content-['packages/web/node_modules/index.html']", + ), + ], + vec!["@source '**/*'"], + ); - // Create files - create_files_in( - &dir, + assert_eq!(candidates, vec!["content-['packages/web/index.html']"]); + + assert_eq!(files, vec!["packages/web/index.html",]); + assert_eq!(globs, vec!["*", "packages/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}", "packages/web/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}"]); + assert_eq!(normalized_sources, vec!["**/*"]); + } + + #[test] + fn test_ignore_gitignore_in_node_modules_source() { + let ScanResult { + candidates, + files, + globs, + normalized_sources, + } = scan_with_globs( &[ (".gitignore", "node_modules\ndist"), ( "node_modules/my-ui-lib/dist/index.html", "content-['node_modules/my-ui-lib/dist/index.html']", ), + ( + "node_modules/my-ui-lib/node.exe", + "content-['node_modules/my-ui-lib/node.exe']", + ), ], + vec!["@source 'node_modules/my-ui-lib'"], ); - // Explicitly listing all `*.html` files, should not include `node_modules` because it's - // ignored - let sources = vec![ - PublicSourceEntry::from_pattern(dir.clone(), "@source './'"), - PublicSourceEntry::from_pattern(dir.clone(), "@source './node_modules/my-ui-lib'"), - ]; - - let mut scanner = Scanner::new(sources.clone()); - let candidates = scanner.scan(); assert_eq!( candidates, vec!["content-['node_modules/my-ui-lib/dist/index.html']"] ); + assert_eq!(files, vec!["node_modules/my-ui-lib/dist/index.html"]); + assert_eq!(globs, vec!["node_modules/my-ui-lib/*", "node_modules/my-ui-lib/dist/**/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}"]); + assert_eq!(normalized_sources, vec!["node_modules/my-ui-lib/**/*"]); } #[test] From bb0610dd641e97bfb4425430fc8a73c9ec6eeca7 Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 12:42:47 +0100 Subject: [PATCH 22/26] Revert "Explicitly mark test functions" This reverts commit fb4d8d84505a5c25c3f1294dd4c093b8b208f586. 
--- crates/oxide/tests/scanner.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/oxide/tests/scanner.rs b/crates/oxide/tests/scanner.rs index cd0946761565..0125abca9c90 100644 --- a/crates/oxide/tests/scanner.rs +++ b/crates/oxide/tests/scanner.rs @@ -15,7 +15,6 @@ mod scanner { candidates: Vec, } - #[cfg(test)] fn create_files_in(dir: &path::Path, paths: &[(&str, &str)]) { // Create the necessary files for (path, contents) in paths { @@ -30,7 +29,6 @@ mod scanner { } } - #[cfg(test)] fn scan_with_globs( paths_with_content: &[(&str, &str)], source_directives: Vec<&str>, @@ -108,7 +106,6 @@ mod scanner { } } - #[cfg(test)] fn scan(paths_with_content: &[(&str, &str)]) -> ScanResult { scan_with_globs(paths_with_content, vec!["@source '**/*'"]) } From 6952871af2b54830ca479f43d4bd71c8e3c96902 Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 12:43:11 +0100 Subject: [PATCH 23/26] Explicitly mark test functions --- crates/oxide/src/scanner/sources.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/oxide/src/scanner/sources.rs b/crates/oxide/src/scanner/sources.rs index 56101b3f4f25..b1d3dc5be926 100644 --- a/crates/oxide/src/scanner/sources.rs +++ b/crates/oxide/src/scanner/sources.rs @@ -75,6 +75,7 @@ impl Sources { } impl PublicSourceEntry { + #[cfg(test)] pub fn from_pattern(dir: PathBuf, pattern: &str) -> Self { let mut parts = pattern.split_whitespace(); let _ = parts.next().unwrap_or_default(); From b6ab98f56db40ebcf7b71635e2e1e53d2dc3fcc8 Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 12:44:31 +0100 Subject: [PATCH 24/26] Cleanup comment --- crates/oxide/src/glob.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/oxide/src/glob.rs b/crates/oxide/src/glob.rs index 7f99631c35b4..90bd34755bb9 100644 --- a/crates/oxide/src/glob.rs +++ b/crates/oxide/src/glob.rs @@ -239,10 +239,7 @@ pub fn optimize_patterns(entries: &Vec) -> Vec { // // E.g.: // -// Original input: -// - `../project-b/**/*.{html,js}` -// -// Expanded input: +// Input: // - `../project-b/**/*.html` // - `../project-b/**/*.js` // From 5e7d03570f00a255c293ac8e7286004050ea6ac4 Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 12:54:07 +0100 Subject: [PATCH 25/26] Cleanup public source entry creation --- crates/oxide/src/glob.rs | 104 +------------------------ crates/oxide/src/scanner/sources.rs | 115 +++++++++++++++++++++++----- crates/oxide/tests/scanner.rs | 70 +++++++++++------ 3 files changed, 146 insertions(+), 143 deletions(-) diff --git a/crates/oxide/src/glob.rs b/crates/oxide/src/glob.rs index 90bd34755bb9..5b6e0715f2b4 100644 --- a/crates/oxide/src/glob.rs +++ b/crates/oxide/src/glob.rs @@ -1,7 +1,6 @@ -use crate::PublicSourceEntry; use fxhash::{FxHashMap, FxHashSet}; use std::path::PathBuf; -use tracing::{event, Level}; +use tracing::event; #[derive(Debug, Clone, PartialEq)] pub struct GlobEntry { @@ -12,105 +11,6 @@ pub struct GlobEntry { pub pattern: String, } -/// Optimize the PublicSourceEntry by trying to move all the static parts of the pattern to the -/// base of the PublicSourceEntry. -/// -/// ```diff -/// - { base: '/', pattern: 'src/**/*.html'} -/// + { base: '/src', pattern: '**/*.html'} -/// ``` -/// -/// A file stays in the `pattern` part, because the `base` should only be a directory. 
-///
-/// ```diff
-/// - { base: '/', pattern: 'src/examples/index.html'}
-/// + { base: '/src/examples', pattern: 'index.html'}
-/// ```
-///
-/// A folder will be moved to the `base` part, and the `pattern` will be set to `**/*`.
-///
-/// ```diff
-/// - { base: '/', pattern: 'src/examples'}
-/// + { base: '/src/examples', pattern: '**/*'}
-/// ```
-///
-/// In addition, we will canonicalize the base path so we always work with the correctly resolved
-/// path.
-pub fn optimize_public_source_entry(source: &mut PublicSourceEntry) {
-    // Resolve base path immediately
-    let Ok(base) = dunce::canonicalize(&source.base) else {
-        event!(Level::ERROR, "Failed to resolve base: {:?}", source.base);
-        return;
-    };
-    source.base = base.to_string_lossy().to_string();
-
-    // No dynamic part, figure out if we are dealing with a file or a directory.
-    if !source.pattern.contains('*') {
-        let combined_path = if source.pattern.starts_with("/") {
-            PathBuf::from(&source.pattern)
-        } else {
-            PathBuf::from(&source.base).join(&source.pattern)
-        };
-
-        match dunce::canonicalize(combined_path) {
-            Ok(resolved_path) if resolved_path.is_dir() => {
-                source.base = resolved_path.to_string_lossy().to_string();
-                source.pattern = "**/*".to_owned();
-            }
-            Ok(resolved_path) if resolved_path.is_file() => {
-                source.base = resolved_path
-                    .parent()
-                    .unwrap()
-                    .to_string_lossy()
-                    .to_string();
-                // Ensure leading slash, otherwise it will match against all files in all folders/
-                source.pattern = format!(
-                    "/{}",
-                    resolved_path
-                        .file_name()
-                        .unwrap()
-                        .to_string_lossy()
-                        .to_string()
-                );
-            }
-            _ => {}
-        }
-        return;
-    }
-
-    // Contains dynamic part
-    let (static_part, dynamic_part) = split_pattern(&source.pattern);
-
-    let base: PathBuf = source.base.clone().into();
-    let base = match static_part {
-        Some(static_part) => base.join(static_part),
-        None => base,
-    };
-
-    // TODO: If the base does not exist on disk, try removing the last slash and try again.
-    let base = match dunce::canonicalize(&base) {
-        Ok(base) => base,
-        Err(err) => {
-            event!(tracing::Level::ERROR, "Failed to resolve glob: {:?}", err);
-            return;
-        }
-    };
-
-    let pattern = match dynamic_part {
-        Some(dynamic_part) => dynamic_part,
-        None => {
-            if base.is_dir() {
-                "**/*".to_owned()
-            } else {
-                "".to_owned()
-            }
-        }
-    };
-
-    source.base = base.to_string_lossy().to_string();
-    source.pattern = pattern;
-}
-
 pub fn hoist_static_glob_parts(entries: &Vec<GlobEntry>, emit_parent_glob: bool) -> Vec<GlobEntry> {
     let mut result = vec![];

@@ -254,7 +154,7 @@ pub fn optimize_patterns(entries: &Vec<GlobEntry>) -> Vec<GlobEntry> {
 // Input: `../project-b/foo/bar.html`
 // Split results in: `("../project-b/foo", "bar.html")`
 //
-fn split_pattern(pattern: &str) -> (Option<String>, Option<String>) {
+pub fn split_pattern(pattern: &str) -> (Option<String>, Option<String>) {
     // No dynamic parts, so we can just return the input as-is.
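    // E.g. a fully static input such as `src/index.html` is returned as
    // `(Some("src/index.html"), None)`.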
     if !pattern.contains('*') {
         return (Some(pattern.to_owned()), None);
diff --git a/crates/oxide/src/scanner/sources.rs b/crates/oxide/src/scanner/sources.rs
index b1d3dc5be926..46430f47adf6 100644
--- a/crates/oxide/src/scanner/sources.rs
+++ b/crates/oxide/src/scanner/sources.rs
@@ -1,7 +1,8 @@
-use crate::glob::optimize_public_source_entry;
+use crate::glob::split_pattern;
 use crate::GlobEntry;
 use bexpand::Expression;
 use std::path::PathBuf;
+use tracing::{event, Level};

 use super::auto_source_detection::IGNORED_CONTENT_DIRS;

@@ -75,25 +76,103 @@ impl Sources {
 }

 impl PublicSourceEntry {
-    #[cfg(test)]
-    pub fn from_pattern(dir: PathBuf, pattern: &str) -> Self {
-        let mut parts = pattern.split_whitespace();
-        let _ = parts.next().unwrap_or_default();
-        let not_or_pattern = parts.next().unwrap_or_default();
-        if not_or_pattern == "not" {
-            let pattern = parts.next().unwrap_or_default();
-            return Self {
-                base: dir.to_string_lossy().into(),
-                pattern: pattern[1..pattern.len() - 1].to_string(),
-                negated: true,
+    /// Optimize the PublicSourceEntry by trying to move all the static parts of the pattern to the
+    /// base of the PublicSourceEntry.
+    ///
+    /// ```diff
+    /// - { base: '/', pattern: 'src/**/*.html'}
+    /// + { base: '/src', pattern: '**/*.html'}
+    /// ```
+    ///
+    /// A file stays in the `pattern` part, because the `base` should only be a directory.
+    ///
+    /// ```diff
+    /// - { base: '/', pattern: 'src/examples/index.html'}
+    /// + { base: '/src/examples', pattern: 'index.html'}
+    /// ```
+    ///
+    /// A folder will be moved to the `base` part, and the `pattern` will be set to `**/*`.
+    ///
+    /// ```diff
+    /// - { base: '/', pattern: 'src/examples'}
+    /// + { base: '/src/examples', pattern: '**/*'}
+    /// ```
+    ///
+    /// In addition, we will canonicalize the base path so we always work with the correctly
+    /// resolved path.
+    pub fn optimize(&mut self) {
+        // Resolve base path immediately
+        let Ok(base) = dunce::canonicalize(&self.base) else {
+            event!(Level::ERROR, "Failed to resolve base: {:?}", self.base);
+            return;
+        };
+        self.base = base.to_string_lossy().to_string();
+
+        // No dynamic part, figure out if we are dealing with a file or a directory.
+        if !self.pattern.contains('*') {
+            let combined_path = if self.pattern.starts_with("/") {
+                PathBuf::from(&self.pattern)
+            } else {
+                PathBuf::from(&self.base).join(&self.pattern)
             };

-        }
-
-        Self {
-            base: dir.to_string_lossy().into(),
-            pattern: not_or_pattern[1..not_or_pattern.len() - 1].to_string(),
-            negated: false,
+            match dunce::canonicalize(combined_path) {
+                Ok(resolved_path) if resolved_path.is_dir() => {
+                    self.base = resolved_path.to_string_lossy().to_string();
+                    self.pattern = "**/*".to_owned();
+                }
+                Ok(resolved_path) if resolved_path.is_file() => {
+                    self.base = resolved_path
+                        .parent()
+                        .unwrap()
+                        .to_string_lossy()
+                        .to_string();
+                    // Ensure leading slash, otherwise it will match against all files in all folders.
+                    self.pattern = format!(
+                        "/{}",
+                        resolved_path
+                            .file_name()
+                            .unwrap()
+                            .to_string_lossy()
+                            .to_string()
+                    );
+                }
+                _ => {}
+            }
+            return;
         }
+
+        // Contains dynamic part
+        let (static_part, dynamic_part) = split_pattern(&self.pattern);
+
+        let base: PathBuf = self.base.clone().into();
+        let base = match static_part {
+            Some(static_part) => base.join(static_part),
+            None => base,
+        };
+
+        // TODO: If the base does not exist on disk, try removing the last slash and try again.
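+        // E.g. (illustrative): for `src/**/*.html`, `split_pattern` yields
+        // `("src", "**/*.html")`; the static part is joined onto the base and
+        // canonicalized below, and only the dynamic part stays in `pattern`.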
+        let base = match dunce::canonicalize(&base) {
+            Ok(base) => base,
+            Err(err) => {
+                event!(tracing::Level::ERROR, "Failed to resolve glob: {:?}", err);
+                return;
+            }
+        };
+
+        let pattern = match dynamic_part {
+            Some(dynamic_part) => dynamic_part,
+            None => {
+                if base.is_dir() {
+                    "**/*".to_owned()
+                } else {
+                    "".to_owned()
+                }
+            }
+        };
+
+        self.base = base.to_string_lossy().to_string();
+        self.pattern = pattern;
     }
 }
@@ -139,7 +218,7 @@ pub fn public_source_entries_to_private_source_entries(
                 .collect::<Vec<_>>()
         })
         .map(|mut public_source| {
-            optimize_public_source_entry(&mut public_source);
+            public_source.optimize();
             public_source
         })
         .collect::<Vec<_>>();
diff --git a/crates/oxide/tests/scanner.rs b/crates/oxide/tests/scanner.rs
index 0125abca9c90..4a578a15a0bd 100644
--- a/crates/oxide/tests/scanner.rs
+++ b/crates/oxide/tests/scanner.rs
@@ -1,5 +1,6 @@
 #[cfg(test)]
 mod scanner {
+    use std::path::PathBuf;
     use std::process::Command;
     use std::thread::sleep;
     use std::time::Duration;
@@ -8,6 +9,26 @@ mod scanner {
     use tailwindcss_oxide::*;
     use tempfile::tempdir;

+    fn public_source_entry_from_pattern(dir: PathBuf, pattern: &str) -> PublicSourceEntry {
+        let mut parts = pattern.split_whitespace();
+        let _ = parts.next().unwrap_or_default();
+        let not_or_pattern = parts.next().unwrap_or_default();
+        if not_or_pattern == "not" {
+            let pattern = parts.next().unwrap_or_default();
+            return PublicSourceEntry {
+                base: dir.to_string_lossy().into(),
+                pattern: pattern[1..pattern.len() - 1].to_string(),
+                negated: true,
+            };
+        }
+
+        PublicSourceEntry {
+            base: dir.to_string_lossy().into(),
+            pattern: not_or_pattern[1..not_or_pattern.len() - 1].to_string(),
+            negated: false,
+        }
+    }
+
     struct ScanResult {
         files: Vec<String>,
         globs: Vec<String>,
@@ -47,7 +68,7 @@ mod scanner {
         // Resolve all content paths for the (temporary) current working directory
         let sources: Vec<PublicSourceEntry> = source_directives
             .iter()
-            .map(|str| PublicSourceEntry::from_pattern(base.clone().into(), str))
+            .map(|str| public_source_entry_from_pattern(base.clone().into(), str))
             .collect();

         let mut scanner = Scanner::new(sources);
@@ -670,8 +691,8 @@ mod scanner {
         );

         let sources = vec![
-            PublicSourceEntry::from_pattern(dir.join("project-a"), "@source '**/*'"),
-            PublicSourceEntry::from_pattern(dir.join("project-b"), "@source '**/*'"),
+            public_source_entry_from_pattern(dir.join("project-a"), "@source '**/*'"),
+            public_source_entry_from_pattern(dir.join("project-b"), "@source '**/*'"),
         ];

         let mut scanner = Scanner::new(sources);
@@ -1068,21 +1089,21 @@ mod scanner {
         );

         let sources = vec![
-            PublicSourceEntry::from_pattern(
+            public_source_entry_from_pattern(
                 dir.join("home/project/apps/web")
                     .to_string_lossy()
                     .to_string()
                     .into(),
                 "@source '**/*'",
             ),
-            PublicSourceEntry::from_pattern(
+            public_source_entry_from_pattern(
                 dir.join("home/project/apps/web")
                     .to_string_lossy()
                     .to_string()
                     .into(),
                 "@source '../admin'",
             ),
-            PublicSourceEntry::from_pattern(
+            public_source_entry_from_pattern(
                 dir.join("home/project/apps/web")
                     .to_string_lossy()
                     .to_string()
@@ -1200,8 +1221,8 @@ mod scanner {
         );

         let sources = vec![
-            PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*.html'"),
-            PublicSourceEntry::from_pattern(dir.clone(), "@source not 'src/ignore-me.html'"),
+            public_source_entry_from_pattern(dir.clone(), "@source '**/*.html'"),
+            public_source_entry_from_pattern(dir.clone(), "@source not 'src/ignore-me.html'"),
         ];

         let candidates = Scanner::new(sources.clone()).scan();
@@ -1224,8 +1245,8 @@
         );

         let sources = vec![
-
PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*'"), - PublicSourceEntry::from_pattern( + public_source_entry_from_pattern(dir.clone(), "@source '**/*'"), + public_source_entry_from_pattern( dir.clone(), "@source not 'src/app/[foo]/ignore*.html'", ), @@ -1248,8 +1269,8 @@ mod scanner { ); let mut scanner = Scanner::new(vec![ - PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*.html'"), - PublicSourceEntry::from_pattern( + public_source_entry_from_pattern(dir.clone(), "@source '**/*.html'"), + public_source_entry_from_pattern( dir.clone(), "@source not 'src/ignored-by-source-not.html'", ), @@ -1304,7 +1325,7 @@ mod scanner { // Create files create_files_in(&dir, &[("foo.styl", "content-['foo.styl']")]); - let sources = vec![PublicSourceEntry::from_pattern( + let sources = vec![public_source_entry_from_pattern( dir.clone(), "@source '**/*'", )]; @@ -1316,8 +1337,8 @@ mod scanner { // Explicitly allow `.styl` files let mut scanner = Scanner::new(vec![ - PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*'"), - PublicSourceEntry::from_pattern(dir.clone(), "@source '*.styl'"), + public_source_entry_from_pattern(dir.clone(), "@source '**/*'"), + public_source_entry_from_pattern(dir.clone(), "@source '*.styl'"), ]); let candidates = scanner.scan(); @@ -1338,7 +1359,7 @@ mod scanner { ], ); - let mut scanner = Scanner::new(vec![PublicSourceEntry::from_pattern( + let mut scanner = Scanner::new(vec![public_source_entry_from_pattern( dir.clone(), "@source '**/*'", )]); @@ -1347,8 +1368,8 @@ mod scanner { assert!(candidates.is_empty()); let mut scanner = Scanner::new(vec![ - PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*'"), - PublicSourceEntry::from_pattern(dir.clone(), "@source './*.html'"), + public_source_entry_from_pattern(dir.clone(), "@source '**/*'"), + public_source_entry_from_pattern(dir.clone(), "@source './*.html'"), ]); let candidates = scanner.scan(); @@ -1377,7 +1398,10 @@ mod scanner { ); // Default auto source detection - let sources = vec![PublicSourceEntry::from_pattern(dir.clone(), "@source './'")]; + let sources = vec![public_source_entry_from_pattern( + dir.clone(), + "@source './'", + )]; let mut scanner = Scanner::new(sources.clone()); @@ -1386,7 +1410,7 @@ mod scanner { // Explicitly listing all `*.html` files, should not include `node_modules` because it's // ignored - let sources = vec![PublicSourceEntry::from_pattern( + let sources = vec![public_source_entry_from_pattern( dir.clone(), "@source '**/*.html'", )]; @@ -1399,8 +1423,8 @@ mod scanner { // Explicitly list the `node_modules/my-ui-lib` // let sources = vec![ - PublicSourceEntry::from_pattern(dir.clone(), "@source '**/*.html'"), - PublicSourceEntry::from_pattern(dir.clone(), "@source 'node_modules/my-ui-lib'"), + public_source_entry_from_pattern(dir.clone(), "@source '**/*.html'"), + public_source_entry_from_pattern(dir.clone(), "@source 'node_modules/my-ui-lib'"), ]; let mut scanner = Scanner::new(sources.clone()); @@ -1501,7 +1525,7 @@ mod scanner { ], ); - let mut scanner = Scanner::new(vec![PublicSourceEntry::from_pattern( + let mut scanner = Scanner::new(vec![public_source_entry_from_pattern( dir.clone(), "@source '**/*'", )]); From 9d4c2af7f18119c707f446c032b01dc6391ec5ff Mon Sep 17 00:00:00 2001 From: Philipp Spiess Date: Tue, 25 Mar 2025 13:01:26 +0100 Subject: [PATCH 26/26] Cleanups and test utf8 special characters in paths --- .../src/extractor/pre_processors/ruby.rs | 2 +- crates/oxide/src/scanner/sources.rs | 21 ++++++++------- crates/oxide/tests/scanner.rs | 26 
+++++++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/crates/oxide/src/extractor/pre_processors/ruby.rs b/crates/oxide/src/extractor/pre_processors/ruby.rs index edc7be49d7d3..121af8e5cd3d 100644 --- a/crates/oxide/src/extractor/pre_processors/ruby.rs +++ b/crates/oxide/src/extractor/pre_processors/ruby.rs @@ -3,7 +3,7 @@ use crate::cursor; use crate::extractor::bracket_stack; use crate::extractor::pre_processors::pre_processor::PreProcessor; -use crate::pre_process_input; +use crate::scanner::pre_process_input; use bstr::ByteSlice; use fancy_regex::Regex; use std::sync; diff --git a/crates/oxide/src/scanner/sources.rs b/crates/oxide/src/scanner/sources.rs index 46430f47adf6..450fcb7c2873 100644 --- a/crates/oxide/src/scanner/sources.rs +++ b/crates/oxide/src/scanner/sources.rs @@ -147,17 +147,18 @@ impl PublicSourceEntry { let base: PathBuf = self.base.clone().into(); let base = match static_part { - Some(static_part) => base.join(static_part), - None => base, - }; - - // TODO: If the base does not exist on disk, try removing the last slash and try again. - let base = match dunce::canonicalize(&base) { - Ok(base) => base, - Err(err) => { - event!(tracing::Level::ERROR, "Failed to resolve glob: {:?}", err); - return; + Some(static_part) => { + // TODO: If the base does not exist on disk, try removing the last slash and try + // again. + match dunce::canonicalize(base.join(static_part)) { + Ok(base) => base, + Err(err) => { + event!(tracing::Level::ERROR, "Failed to resolve glob: {:?}", err); + return; + } + } } + None => base, }; let pattern = match dynamic_part { diff --git a/crates/oxide/tests/scanner.rs b/crates/oxide/tests/scanner.rs index 4a578a15a0bd..03daee0ebafb 100644 --- a/crates/oxide/tests/scanner.rs +++ b/crates/oxide/tests/scanner.rs @@ -1585,4 +1585,30 @@ mod scanner { assert!(candidates.is_empty()); } + + #[test] + fn test_works_with_utf8_special_character_paths() { + let ScanResult { + candidates, + files, + globs, + normalized_sources, + } = scan_with_globs( + &[ + ("src/💩.js", "content-['src/💩.js']"), + ("src/🤦‍♂️.tsx", "content-['src/🤦‍♂️.tsx']"), + ("src/🤦‍♂️/foo.tsx", "content-['src/🤦‍♂️/foo.tsx']"), + ], + vec!["@source '**/*'", "@source not 'src/🤦‍♂️'"], + ); + + assert_eq!( + candidates, + vec!["content-['src/💩.js']", "content-['src/🤦‍♂️.tsx']"] + ); + + assert_eq!(files, vec!["src/💩.js", "src/🤦‍♂️.tsx"]); + assert_eq!(globs, vec!["*", "src/*/*.{aspx,astro,cjs,cts,eex,erb,gjs,gts,haml,handlebars,hbs,heex,html,jade,js,jsx,liquid,md,mdx,mjs,mts,mustache,njk,nunjucks,php,pug,py,razor,rb,rhtml,rs,slim,svelte,tpl,ts,tsx,twig,vue}"]); + assert_eq!(normalized_sources, vec!["**/*"]); + } }