diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index c446df24..f337a1aa 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -32,7 +32,7 @@ import time import urllib.error import urllib.request from pathlib import Path -from typing import Optional +from typing import Dict, Optional SECRET_BASEURL_TPL = "{}/secrets/v1/secret/{{}}".format( os.environ.get("TASKCLUSTER_PROXY_URL", "http://taskcluster").rstrip("/") @@ -545,6 +545,52 @@ def configure_volume_posix(volume, user, group, running_as_root): set_dir_permissions(volume, user.pw_uid, group.gr_gid) +def git_fetch( + destination_path: str, + ref: str, + remote: str = "origin", + tags: bool = False, + shallow: bool = False, + env: Optional[Dict[str, str]] = None, +): + args = ["git", "fetch"] + if tags: + # `--force` is needed to be able to update an existing outdated tag. + args.extend(["--tags", "--force"]) + + args.extend([remote, ref]) + + if shallow: + # If we have a full sha, we can fetch it directly + if re.match(r"^[a-f0-9]{40}$", ref): + fetch_args = args[:2] + ["--depth=1"] + args[2:] + ret = run_command(b"vcs", fetch_args, cwd=destination_path, extra_env=env) + if ret == 0: + return + + # Otherwise we need to incrementally deepen the repo until we detect + # the ref. + for deepen in range(10, 100, 10): + fetch_args = args[:2] + [f"--deepen={deepen}"] + args[2:] + run_command(b"vcs", fetch_args, cwd=destination_path, extra_env=env) + + # Check if the target ref exists, if not deepen further. + ret = run_command( + b"vcs", + ["git", "cat-file", "-e", "FETCH_HEAD"], + cwd=destination_path, + extra_env=env, + ) + if ret == 0: + return + + print(f"unable to fetch {ref} from {remote} in shallow clone") + sys.exit(1) + + # Non-shallow repo + retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + + def _clean_git_checkout(destination_path): # Delete untracked files (i.e. build products) print_line(b"vcs", b"cleaning git checkout...\n") @@ -584,16 +630,29 @@ def _clean_git_checkout(destination_path): print_line(b"vcs", b"successfully cleaned git checkout!\n") +def shortref(ref: str) -> str: + """Normalize a git ref to its short form. + + Returns the ref unchanged if it's already in short form. + """ + # Strip common ref prefixes + for prefix in ("refs/heads/", "refs/tags/"): + if ref.startswith(prefix): + return ref[len(prefix) :] + + return ref + + def git_checkout( destination_path: str, head_repo: str, base_repo: Optional[str], - base_ref: Optional[str], base_rev: Optional[str], ref: Optional[str], commit: Optional[str], ssh_key_file: Optional[Path], ssh_known_hosts_file: Optional[Path], + shallow: bool = False, ): env = { # abort if transfer speed is lower than 1kB/s for 1 minute @@ -637,62 +696,50 @@ def git_checkout( args = [ "git", "clone", - base_repo if base_repo else head_repo, - destination_path, ] - retry_required_command(b"vcs", args, extra_env=env) - - if base_ref: - args = ["git", "fetch", "origin", base_ref] - - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + if shallow: + args.extend(["--depth=1", "--no-checkout"]) - # Create local branch so that taskgraph is able to compute differences - # between the head branch and the base one, if needed - args = ["git", "checkout", base_ref] + args.extend( + [ + base_repo if base_repo else head_repo, + destination_path, + ] + ) - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + retry_required_command(b"vcs", args, extra_env=env) - # When commits are force-pushed (like on a testing branch), base_rev doesn't - # exist on base_ref. Fetching it allows taskgraph to compute differences - # between the previous state before the force-push and the current state. - # - # Unlike base_ref just above, there is no need to checkout the revision: - # it's immediately available after the fetch. + # First fetch the base_rev. This allows Taskgraph to compute the files + # changed by the push. if base_rev and base_rev != NULL_REVISION: - args = ["git", "fetch", "origin", base_rev] + git_fetch(destination_path, base_rev, shallow=shallow, env=env) - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + # Next fetch the head ref. # If a ref was provided, it might be tag, so we need to make sure we fetch # those. This is explicitly only done when base and head repo match, # because it is the only scenario where tags could be present. (PRs, for # example, always include an explicit rev.) Failure to do this could result # in not having a tag, or worse: having an outdated version of one. - # `--force` is needed to be able to update an existing tag. - if ref and base_repo == head_repo: - args = [ - "git", - "fetch", - "--tags", - "--force", - base_repo, - ref, - ] - - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) - - # If a ref isn't provided, we fetch all refs from head_repo, which may be slow - args = [ - "git", - "fetch", - "--no-tags", - head_repo, - ref if ref else "+refs/heads/*:refs/remotes/work/*", - ] + tags = False + if ref and not ref.startswith("refs/heads/") and base_repo == head_repo: + tags = True + + # If a ref isn't provided, we fetch all refs from head_repo, which may be slow. + target = ref if ref else "+refs/heads/*:refs/remotes/work/*" + git_fetch( + destination_path, + target, + remote=head_repo, + tags=tags, + shallow=shallow, + env=env, + ) - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + # If we have a shallow clone and specific commit, we need to fetch it too. + if shallow and commit and commit != ref: + git_fetch(destination_path, commit, remote=head_repo, shallow=shallow, env=env) args = [ "git", @@ -700,8 +747,8 @@ def git_checkout( "-f", ] - if ref: - args.extend(["-B", ref]) + if ref and ref != commit: + args.extend(["-B", shortref(ref)]) # `git fetch` set `FETCH_HEAD` reference to the last commit of the desired branch args.append(commit if commit else "FETCH_HEAD") @@ -878,17 +925,22 @@ def add_vcs_arguments(parser, project, name): f"--{project}-sparse-profile", help=f"Path to sparse profile for {name} checkout", ) + parser.add_argument( + f"--{project}-shallow-clone", + action="store_true", + help=f"Use shallow clone for {name}", + ) def collect_vcs_options(args, project, name): checkout = getattr(args, f"{project}_checkout") sparse_profile = getattr(args, f"{project}_sparse_profile") + shallow_clone = getattr(args, f"{project}_shallow_clone") env_prefix = project.upper() repo_type = os.environ.get(f"{env_prefix}_REPOSITORY_TYPE") base_repo = os.environ.get(f"{env_prefix}_BASE_REPOSITORY") - base_ref = os.environ.get(f"{env_prefix}_BASE_REF") base_rev = os.environ.get(f"{env_prefix}_BASE_REV") head_repo = os.environ.get(f"{env_prefix}_HEAD_REPOSITORY") revision = os.environ.get(f"{env_prefix}_HEAD_REV") @@ -921,7 +973,6 @@ def collect_vcs_options(args, project, name): "checkout": checkout, "sparse-profile": sparse_profile, "base-repo": base_repo, - "base-ref": base_ref, "base-rev": base_rev, "head-repo": head_repo, "revision": revision, @@ -929,6 +980,7 @@ def collect_vcs_options(args, project, name): "repo-type": repo_type, "ssh-secret-name": private_key_secret, "pip-requirements": pip_requirements, + "shallow-clone": shallow_clone, } @@ -971,12 +1023,12 @@ def vcs_checkout_from_args(options): options["checkout"], options["head-repo"], options["base-repo"], - options["base-ref"], options["base-rev"], ref, revision, ssh_key_file, ssh_known_hosts_file, + shallow=options.get("shallow-clone", False), ) elif options["repo-type"] == "hg": if not revision and not ref: diff --git a/src/taskgraph/util/vcs.py b/src/taskgraph/util/vcs.py index 80f08e4d..65ed37a3 100644 --- a/src/taskgraph/util/vcs.py +++ b/src/taskgraph/util/vcs.py @@ -388,6 +388,10 @@ def base_rev(self): def branch(self): return self.run("branch", "--show-current").strip() or None + @property + def is_shallow(self): + return self.run("rev-parse", "--is-shallow-repository").strip() == "true" + @property def all_remote_names(self): remotes = self.run("remote").splitlines() @@ -546,10 +550,39 @@ def update(self, ref): self.run("checkout", ref) def find_latest_common_revision(self, base_ref_or_rev, head_rev): - try: - return self.run("merge-base", base_ref_or_rev, head_rev).strip() - except subprocess.CalledProcessError: - return self.NULL_REVISION + def run_merge_base(): + try: + return self.run("merge-base", base_ref_or_rev, head_rev).strip() + except subprocess.CalledProcessError: + return None + + # First try to find merge base + rev = run_merge_base() + if rev or not self.is_shallow: + return rev or self.NULL_REVISION + + # If we couldn't find a merge base, try deepening with both refs + for deepen in (10, 100, 500, 1000): + # Deepen and fetch both specific refs to ensure we get their history + self.run( + "fetch", + "--deepen", + str(deepen), + self.remote_name, + base_ref_or_rev, + head_rev, + return_codes=[128], + ) + + if rev := run_merge_base(): + break + else: + # If we still haven't found a merge base, unshallow the repo and + # try one last time. + self.run("fetch", "--unshallow", self.remote_name) + rev = run_merge_base() + + return rev or self.NULL_REVISION def does_revision_exist_locally(self, revision): try: diff --git a/test/test_scripts_run_task.py b/test/test_scripts_run_task.py index b3867d17..5ac6613b 100644 --- a/test/test_scripts_run_task.py +++ b/test/test_scripts_run_task.py @@ -151,9 +151,10 @@ def test_install_pip_requirements_with_uv( @pytest.mark.parametrize( - "env,extra_expected", + "args,env,extra_expected", [ pytest.param( + {}, { "REPOSITORY_TYPE": "hg", "BASE_REPOSITORY": "https://hg.mozilla.org/mozilla-central", @@ -164,10 +165,27 @@ def test_install_pip_requirements_with_uv( { "base-repo": "https://hg.mozilla.org/mozilla-unified", }, - ) + id="hg", + ), + pytest.param( + {"myrepo_shallow_clone": True}, + { + "REPOSITORY_TYPE": "git", + "HEAD_REPOSITORY": "https://github.com/test/repo.git", + "HEAD_REV": "abc123", + }, + {"shallow-clone": True}, + id="git_with_shallow_clone", + ), ], ) -def test_collect_vcs_options(monkeypatch, run_task_mod, env, extra_expected): +def test_collect_vcs_options( + monkeypatch, + run_task_mod, + args, + env, + extra_expected, +): name = "myrepo" checkout = "checkout" @@ -175,15 +193,15 @@ def test_collect_vcs_options(monkeypatch, run_task_mod, env, extra_expected): for k, v in env.items(): monkeypatch.setenv(f"{name.upper()}_{k.upper()}", v) - args = Namespace() - setattr(args, f"{name}_checkout", checkout) - setattr(args, f"{name}_sparse_profile", False) + args.setdefault(f"{name}_checkout", checkout) + args.setdefault(f"{name}_shallow_clone", False) + args.setdefault(f"{name}_sparse_profile", False) + args = Namespace(**args) result = run_task_mod.collect_vcs_options(args, name, name) expected = { "base-repo": env.get("BASE_REPOSITORY"), - "base-ref": env.get("BASE_REF"), "base-rev": env.get("BASE_REV"), "checkout": os.path.join(os.getcwd(), "checkout"), "env-prefix": name.upper(), @@ -194,6 +212,7 @@ def test_collect_vcs_options(monkeypatch, run_task_mod, env, extra_expected): "ref": env.get("HEAD_REF"), "repo-type": env.get("REPOSITORY_TYPE"), "revision": env.get("HEAD_REV"), + "shallow-clone": False, "ssh-secret-name": env.get("SSH_SECRET_NAME"), "sparse-profile": False, "store-path": env.get("HG_STORE_PATH"), @@ -334,7 +353,9 @@ def mock_git_repo(): ) def _commit_file(message, filename): - with open(os.path.join(repo, filename), "w") as fout: + filepath = os.path.join(repo, filename) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + with open(filepath, "w") as fout: fout.write("test file content") subprocess.check_call(["git", "add", filename], cwd=repo_path) subprocess.check_call(["git", "commit", "-m", message], cwd=repo_path) @@ -354,7 +375,7 @@ def _commit_file(message, filename): @pytest.mark.parametrize( - "base_ref,ref,files,hash_key", + "base_rev,ref,files,hash_key", [ (None, None, ["mainfile"], "main"), (None, "main", ["mainfile"], "main"), @@ -367,7 +388,7 @@ def test_git_checkout( mock_stdin, run_task_mod, mock_git_repo, - base_ref, + base_rev, ref, files, hash_key, @@ -378,8 +399,7 @@ def test_git_checkout( destination_path=destination, head_repo=mock_git_repo["path"], base_repo=mock_git_repo["path"], - base_ref=base_ref, - base_rev=None, + base_rev=base_rev, ref=ref, commit=None, ssh_key_file=None, @@ -407,20 +427,171 @@ def test_git_checkout_with_commit( mock_stdin, run_task_mod, mock_git_repo, + tmp_path, ): - with tempfile.TemporaryDirectory() as workdir: - destination = os.path.join(workdir, "destination") - run_task_mod.git_checkout( - destination_path=destination, - head_repo=mock_git_repo["path"], - base_repo=mock_git_repo["path"], - base_ref="mybranch", - base_rev=mock_git_repo["main"], - ref=mock_git_repo["branch"], - commit=mock_git_repo["branch"], - ssh_key_file=None, - ssh_known_hosts_file=None, - ) + destination = tmp_path / "destination" + + run_task_mod.git_checkout( + destination_path=str(destination), + head_repo=mock_git_repo["path"], + base_repo=mock_git_repo["path"], + base_rev=mock_git_repo["main"], + ref="mybranch", + commit=mock_git_repo["branch"], + ssh_key_file=None, + ssh_known_hosts_file=None, + shallow=False, + ) + + current_rev = subprocess.check_output( + args=["git", "rev-parse", "HEAD"], + cwd=str(destination), + universal_newlines=True, + ).strip() + assert current_rev == mock_git_repo["branch"] + + +def test_git_checkout_with_commit_shallow( + mock_stdin, + run_task_mod, + mock_git_repo, + tmp_path, + mocker, +): + destination = tmp_path / "destination" + + # Shallow clones don't work well with local repos, so mock git_fetch to + # track calls + original_git_fetch = run_task_mod.git_fetch + fetch_calls = [] + + def mock_git_fetch(*args, **kwargs): + fetch_calls.append((args, kwargs)) + return original_git_fetch(*args, **kwargs) + + mocker.patch.object(run_task_mod, "git_fetch", side_effect=mock_git_fetch) + + # Use shallow clone with ref != commit + run_task_mod.git_checkout( + destination_path=str(destination), + head_repo=mock_git_repo["path"], + base_repo=mock_git_repo["path"], + base_rev=mock_git_repo["main"], + ref="mybranch", # Branch name (different from commit) + commit=mock_git_repo["branch"], # Specific SHA + ssh_key_file=None, + ssh_known_hosts_file=None, + shallow=True, + ) + + # Verify that git_fetch was called for both the ref and the commit + # Should have at least 3 calls: base_rev, ref, and commit + assert len(fetch_calls) >= 3 + + # Verify base_rev fetch + base_call = fetch_calls[0] + assert base_call[0][1] == mock_git_repo["main"] # base_rev + assert base_call[1]["shallow"] is True + + # Verify ref fetch + ref_call = fetch_calls[1] + assert ref_call[1]["shallow"] is True + + # Verify commit fetch (our fix) + commit_call = fetch_calls[2] + assert commit_call[0][1] == mock_git_repo["branch"] # commit SHA + assert commit_call[1]["shallow"] is True + + # Verify final checkout worked + final_rev = subprocess.check_output( + args=["git", "rev-parse", "HEAD"], + cwd=str(destination), + universal_newlines=True, + ).strip() + assert final_rev == mock_git_repo["branch"] + + +def test_git_checkout_shallow_clone( + mock_stdin, + run_task_mod, + mock_git_repo, + tmp_path, +): + destination = tmp_path / "destination" + # Note: shallow clone with local repos doesn't work as expected due to git limitations + # The --depth flag is ignored in local clones + run_task_mod.git_checkout( + destination_path=str(destination), + head_repo=mock_git_repo["path"], + base_repo=mock_git_repo["path"], + base_rev=None, + ref="mybranch", + commit=None, + ssh_key_file=None, + ssh_known_hosts_file=None, + shallow=True, + ) + + # Check that files were checked out properly + assert (destination / "mainfile").exists() + assert (destination / "branchfile").exists() + + # Check repo is on the right branch + current_branch = subprocess.check_output( + args=["git", "rev-parse", "--abbrev-ref", "HEAD"], + cwd=str(destination), + universal_newlines=True, + ).strip() + assert current_branch == "mybranch" + + +def test_git_fetch_shallow_sha( + mock_stdin, + run_task_mod, + mock_git_repo, + tmp_path, + mocker, +): + destination = tmp_path / "destination" + + run_task_mod.run_command( + b"vcs", + [ + "git", + "clone", + "--depth=1", + "--no-checkout", + mock_git_repo["path"], + str(destination), + ], + ) + + # Mock run_command to track calls + original_run_command = run_task_mod.run_command + command_calls = [] + + def mock_run_command(*args, **kwargs): + command_calls.append(args[1]) # Store the command arguments + return original_run_command(*args, **kwargs) + + mocker.patch.object(run_task_mod, "run_command", side_effect=mock_run_command) + + # Test fetching with full SHA (should use optimized path) + full_sha = mock_git_repo["branch"] # This is a full 40-char SHA + run_task_mod.git_fetch( + str(destination), full_sha, remote=mock_git_repo["path"], shallow=True + ) + + # Verify that the optimized SHA fetch was attempted first + fetch_commands = [ + cmd for cmd in command_calls if cmd[0] == "git" and cmd[1] == "fetch" + ] + assert len(fetch_commands) >= 1 + + # First fetch command should be the optimized SHA fetch with --depth=1 + first_fetch = fetch_commands[0] + assert "--depth=1" in first_fetch + assert full_sha in first_fetch def test_display_python_version_should_output_python_versions_title( diff --git a/test/test_util_vcs.py b/test/test_util_vcs.py index 7585fe36..aaa5d672 100644 --- a/test/test_util_vcs.py +++ b/test/test_util_vcs.py @@ -3,6 +3,7 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. import os +import shutil import subprocess from pathlib import Path from textwrap import dedent @@ -495,6 +496,121 @@ def test_find_latest_common_revision(repo_with_remote): ) +def test_find_latest_common_revision_shallow_clone( + tmpdir, git_repo, default_git_branch +): + """Test finding common revision in a shallow clone that requires deepening.""" + remote_path = str(tmpdir / "remote_repo") + shutil.copytree(git_repo, remote_path) + + # Add several commits to the remote repository to create depth + remote_repo = get_repository(remote_path) + + # Create multiple commits to establish depth + for i in range(5): + test_file = os.path.join(remote_path, f"file_{i}.txt") + with open(test_file, "w") as f: + f.write(f"content {i}") + remote_repo.run("add", test_file) + remote_repo.run("commit", "-m", f"Commit {i}") + + # Store the head revision of remote for comparison + remote_head = remote_repo.head_rev + + # Create a shallow clone with depth 1 + # Need to use file:// protocol for --depth to work with local repos + shallow_clone_path = str(tmpdir / "shallow_clone") + subprocess.check_call( + ["git", "clone", "--depth", "1", f"file://{remote_path}", shallow_clone_path] + ) + + shallow_repo = get_repository(shallow_clone_path) + assert shallow_repo.is_shallow + + # Configure git user for the cloned repo (needed for commits in CI) + shallow_repo.run("config", "user.email", "test@example.com") + shallow_repo.run("config", "user.name", "Test User") + + remote_name = "origin" + + # Create a new commit in the shallow clone to diverge from remote + new_file = os.path.join(shallow_clone_path, "local_file.txt") + with open(new_file, "w") as f: + f.write("local content") + shallow_repo.run("add", new_file) + shallow_repo.run("commit", "-m", "Local commit") + + # Now try to find the common revision - this should trigger deepening + # because the shallow clone doesn't have enough history + base_ref = f"{remote_name}/{default_git_branch}" + result = shallow_repo.find_latest_common_revision(base_ref, shallow_repo.head_rev) + + # The result should be the remote's head (the common ancestor) + assert result == remote_head + + # Verify the repository has been deepened + assert shallow_repo.does_revision_exist_locally(result) + + +def test_find_latest_common_revision_shallow_clone_different_branches( + tmpdir, git_repo, default_git_branch +): + """Test finding common revision in a shallow clone with refs on different branches.""" + remote_path = str(tmpdir / "remote_repo") + shutil.copytree(git_repo, remote_path) + + remote_repo = get_repository(remote_path) + + # Create a feature branch with multiple commits + remote_repo.run("checkout", "-b", "feature-branch") + for i in range(3): + test_file = os.path.join(remote_path, f"feature_{i}.txt") + with open(test_file, "w") as f: + f.write(f"feature content {i}") + remote_repo.run("add", test_file) + remote_repo.run("commit", "-m", f"Feature commit {i}") + + # Go back to main and add more commits + remote_repo.run("checkout", default_git_branch) + for i in range(3): + test_file = os.path.join(remote_path, f"main_{i}.txt") + with open(test_file, "w") as f: + f.write(f"main content {i}") + remote_repo.run("add", test_file) + remote_repo.run("commit", "-m", f"Main commit {i}") + + main_head = remote_repo.head_rev + + # Create a shallow clone of just the main branch + shallow_clone_path = str(tmpdir / "shallow_clone") + subprocess.check_call( + [ + "git", + "clone", + "--depth", + "1", + "--single-branch", + "--branch", + default_git_branch, + f"file://{remote_path}", + shallow_clone_path, + ] + ) + + shallow_repo = get_repository(shallow_clone_path) + assert shallow_repo.is_shallow + + # Try to find common revision between main and feature branch + # This should fetch the feature branch and find the merge base + result = shallow_repo.find_latest_common_revision( + "origin/feature-branch", main_head + ) + + # The result should not be the null revision + assert result != Repository.NULL_REVISION + assert shallow_repo.does_revision_exist_locally(result) + + def test_does_revision_exist_locally(repo): first_revision = repo.head_rev