From ae328efa5ecf2628f3ffbf339a0c4caacc61d7f7 Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Wed, 15 Oct 2025 09:46:32 -0400 Subject: [PATCH 01/10] refactor(run-task): rename ref -> head_ref and {commit, revision} -> head_rev This makes the naming consistent with what we use in .taskcluster.yml and the rest of Taskgraph. Previously, I always had to look up where "ref" and "commit" / "revision" were coming from to double check they were the values I was expecting. This rename makes that much more obvious. --- src/taskgraph/run-task/run-task | 60 +++++++++++++++------------------ test/test_scripts_run_task.py | 20 +++++------ 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index 98f9882fe..0efd2ef17 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -611,8 +611,8 @@ def git_checkout( base_repo: Optional[str], base_ref: Optional[str], base_rev: Optional[str], - ref: Optional[str], - commit: Optional[str], + head_ref: Optional[str], + head_rev: Optional[str], ssh_key_file: Optional[Path], ssh_known_hosts_file: Optional[Path], ): @@ -686,31 +686,31 @@ def git_checkout( retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) - # If a ref was provided, it might be tag, so we need to make sure we fetch + # If a head_ref was provided, it might be tag, so we need to make sure we fetch # those. This is explicitly only done when base and head repo match, # because it is the only scenario where tags could be present. (PRs, for # example, always include an explicit rev.) Failure to do this could result # in not having a tag, or worse: having an outdated version of one. # `--force` is needed to be able to update an existing tag. - if ref and base_repo == head_repo: + if head_ref and base_repo == head_repo: args = [ "git", "fetch", "--tags", "--force", base_repo, - ref, + head_ref, ] retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) - # If a ref isn't provided, we fetch all refs from head_repo, which may be slow + # If a head_ref isn't provided, we fetch all refs from head_repo, which may be slow args = [ "git", "fetch", "--no-tags", head_repo, - ref if ref else "+refs/heads/*:refs/remotes/work/*", + head_ref if head_ref else "+refs/heads/*:refs/remotes/work/*", ] retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) @@ -721,11 +721,11 @@ def git_checkout( "-f", ] - if ref: - args.extend(["-B", ref]) + if head_ref: + args.extend(["-B", head_ref]) # `git fetch` set `FETCH_HEAD` reference to the last commit of the desired branch - args.append(commit if commit else "FETCH_HEAD") + args.append(head_rev if head_rev else "FETCH_HEAD") run_required_command(b"vcs", args, cwd=destination_path) @@ -912,8 +912,8 @@ def collect_vcs_options(args, project, name): base_ref = os.environ.get(f"{env_prefix}_BASE_REF") base_rev = os.environ.get(f"{env_prefix}_BASE_REV") head_repo = os.environ.get(f"{env_prefix}_HEAD_REPOSITORY") - revision = os.environ.get(f"{env_prefix}_HEAD_REV") - ref = os.environ.get(f"{env_prefix}_HEAD_REF") + head_ref = os.environ.get(f"{env_prefix}_HEAD_REF") + head_rev = os.environ.get(f"{env_prefix}_HEAD_REV") pip_requirements = os.environ.get(f"{env_prefix}_PIP_REQUIREMENTS") private_key_secret = os.environ.get(f"{env_prefix}_SSH_SECRET_NAME") @@ -945,8 +945,8 @@ def collect_vcs_options(args, project, name): "base-ref": base_ref, "base-rev": base_rev, "head-repo": head_repo, - "revision": revision, - "ref": ref, + "head-ref": head_ref, + "head-rev": head_rev, "repo-type": repo_type, "ssh-secret-name": private_key_secret, "pip-requirements": pip_requirements, @@ -955,13 +955,13 @@ def collect_vcs_options(args, project, name): def vcs_checkout_from_args(options): if not options["checkout"]: - if options["ref"] and not options["revision"]: + if options["head-ref"] and not options["head-rev"]: print("task should be defined in terms of non-symbolic revision") sys.exit(1) return - revision = options["revision"] - ref = options["ref"] + head_ref = options["head-ref"] + head_rev = options["head-rev"] ssh_key_file = None ssh_known_hosts_file = None ssh_dir = None @@ -979,13 +979,14 @@ def vcs_checkout_from_args(options): ssh_known_hosts_file = ssh_dir.joinpath("known_hosts") ssh_known_hosts_file.write_bytes(GITHUB_SSH_FINGERPRINT) - if options["repo-type"] == "git": - if not revision and not ref: - raise RuntimeError( - "Git requires that either a ref, a revision, or both are provided" - ) + if not head_rev and not head_ref: + raise RuntimeError( + f"{options['repo-type'].capitalize()} requires that either a " + "ref, a revision, or both are provided" + ) - if not ref: + if options["repo-type"] == "git": + if not head_ref: print("Providing a ref will improve the performance of this checkout") revision = git_checkout( @@ -994,25 +995,20 @@ def vcs_checkout_from_args(options): options["base-repo"], options["base-ref"], options["base-rev"], - ref, - revision, + head_ref, + head_rev, ssh_key_file, ssh_known_hosts_file, ) elif options["repo-type"] == "hg": - if not revision and not ref: - raise RuntimeError( - "Hg requires that at least one of a ref or revision is provided" - ) - revision = hg_checkout( options["checkout"], options["head-repo"], options["base-repo"], options["store-path"], options["sparse-profile"], - ref, - revision, + head_ref, + head_rev, ) else: raise RuntimeError('Type of VCS must be either "git" or "hg"') diff --git a/test/test_scripts_run_task.py b/test/test_scripts_run_task.py index b3867d17d..9a0439d44 100644 --- a/test/test_scripts_run_task.py +++ b/test/test_scripts_run_task.py @@ -191,9 +191,9 @@ def test_collect_vcs_options(monkeypatch, run_task_mod, env, extra_expected): "name": name, "pip-requirements": None, "project": name, - "ref": env.get("HEAD_REF"), + "head-ref": env.get("HEAD_REF"), + "head-rev": env.get("HEAD_REV"), "repo-type": env.get("REPOSITORY_TYPE"), - "revision": env.get("HEAD_REV"), "ssh-secret-name": env.get("SSH_SECRET_NAME"), "sparse-profile": False, "store-path": env.get("HG_STORE_PATH"), @@ -354,7 +354,7 @@ def _commit_file(message, filename): @pytest.mark.parametrize( - "base_ref,ref,files,hash_key", + "base_ref,head_ref,files,hash_key", [ (None, None, ["mainfile"], "main"), (None, "main", ["mainfile"], "main"), @@ -368,7 +368,7 @@ def test_git_checkout( run_task_mod, mock_git_repo, base_ref, - ref, + head_ref, files, hash_key, ): @@ -380,8 +380,8 @@ def test_git_checkout( base_repo=mock_git_repo["path"], base_ref=base_ref, base_rev=None, - ref=ref, - commit=None, + head_ref=head_ref, + head_rev=None, ssh_key_file=None, ssh_known_hosts_file=None, ) @@ -391,13 +391,13 @@ def test_git_checkout( assert os.path.exists(os.path.join(destination, filename)) # Check repo is on the right branch - if ref: + if head_ref: current_branch = subprocess.check_output( args=["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=destination, universal_newlines=True, ).strip() - assert current_branch == ref + assert current_branch == head_ref current_rev = git_current_rev(destination) assert current_rev == mock_git_repo[hash_key] @@ -416,8 +416,8 @@ def test_git_checkout_with_commit( base_repo=mock_git_repo["path"], base_ref="mybranch", base_rev=mock_git_repo["main"], - ref=mock_git_repo["branch"], - commit=mock_git_repo["branch"], + head_ref=mock_git_repo["branch"], + head_rev=mock_git_repo["branch"], ssh_key_file=None, ssh_known_hosts_file=None, ) From 756b50325a14be2d85b0c76025452d49c1338d9d Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Thu, 2 Oct 2025 14:13:29 -0400 Subject: [PATCH 02/10] fix(run-task): avoid unnecessary 'git fetch' call If the condition in the if statement is true, then we've already fetched ref from head_repo. There's no need to do so again. --- src/taskgraph/run-task/run-task | 39 +++++++++++++-------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index 0efd2ef17..650b620a5 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -686,33 +686,24 @@ def git_checkout( retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) - # If a head_ref was provided, it might be tag, so we need to make sure we fetch - # those. This is explicitly only done when base and head repo match, - # because it is the only scenario where tags could be present. (PRs, for - # example, always include an explicit rev.) Failure to do this could result - # in not having a tag, or worse: having an outdated version of one. - # `--force` is needed to be able to update an existing tag. + # Fetch head_ref + args = ["git", "fetch"] if head_ref and base_repo == head_repo: - args = [ - "git", - "fetch", - "--tags", - "--force", - base_repo, - head_ref, - ] + # If a head_ref was provided, it might be tag, so we need to make sure we fetch + # those. This is explicitly only done when base and head repo match, + # because it is the only scenario where tags could be present. (PRs, for + # example, always include an explicit rev.) Failure to do this could result + # in not having a tag, or worse: having an outdated version of one. + # `--force` is needed to be able to update an existing tag. + args.extend(["--tags", "--force"]) - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) - - # If a head_ref isn't provided, we fetch all refs from head_repo, which may be slow - args = [ - "git", - "fetch", - "--no-tags", - head_repo, - head_ref if head_ref else "+refs/heads/*:refs/remotes/work/*", - ] + else: + args.append("--no-tags") + # If a head_ref isn't provided, we fetch all refs from head_repo, which may be slow. + args.extend( + [head_repo, head_ref if head_ref else "+refs/heads/*:refs/remotes/work/*"] + ) retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) args = [ From 09754d92e6023b713878a1f408256eb462a6d643 Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Thu, 2 Oct 2025 14:47:14 -0400 Subject: [PATCH 03/10] fix(run-task)!: stop fetching / checking out 'base_ref' BREAKING CHANGE: `base_ref` will no longer be fetched or checked out by run-task Taskgraph uses base_rev anyway for computing files changed, so there's no need to additionally fetch base_ref. Some tasks may need to be updated to not rely on base_ref being present in the local clone. --- src/taskgraph/run-task/run-task | 26 +++++--------------------- test/test_scripts_run_task.py | 9 +++------ 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index 650b620a5..b1b6bb893 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -609,7 +609,6 @@ def git_checkout( destination_path: str, head_repo: str, base_repo: Optional[str], - base_ref: Optional[str], base_rev: Optional[str], head_ref: Optional[str], head_rev: Optional[str], @@ -664,23 +663,11 @@ def git_checkout( retry_required_command(b"vcs", args, extra_env=env) - if base_ref: - args = ["git", "fetch", "origin", base_ref] - - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) - - # Create local branch so that taskgraph is able to compute differences - # between the head branch and the base one, if needed - args = ["git", "checkout", base_ref] - - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) - - # When commits are force-pushed (like on a testing branch), base_rev doesn't - # exist on base_ref. Fetching it allows taskgraph to compute differences - # between the previous state before the force-push and the current state. - # - # Unlike base_ref just above, there is no need to checkout the revision: - # it's immediately available after the fetch. + # For Github based repos, base_rev often doesn't refer to an ancestor of + # head_rev simply due to Github not providing that information in their + # webhook events. Therefore we fetch it independently from `head_rev` so + # that consumers can compute the merge-base or files modified between the + # two as needed. if base_rev and base_rev != NULL_REVISION: args = ["git", "fetch", "origin", base_rev] @@ -900,7 +887,6 @@ def collect_vcs_options(args, project, name): repo_type = os.environ.get(f"{env_prefix}_REPOSITORY_TYPE") base_repo = os.environ.get(f"{env_prefix}_BASE_REPOSITORY") - base_ref = os.environ.get(f"{env_prefix}_BASE_REF") base_rev = os.environ.get(f"{env_prefix}_BASE_REV") head_repo = os.environ.get(f"{env_prefix}_HEAD_REPOSITORY") head_ref = os.environ.get(f"{env_prefix}_HEAD_REF") @@ -933,7 +919,6 @@ def collect_vcs_options(args, project, name): "checkout": checkout, "sparse-profile": sparse_profile, "base-repo": base_repo, - "base-ref": base_ref, "base-rev": base_rev, "head-repo": head_repo, "head-ref": head_ref, @@ -984,7 +969,6 @@ def vcs_checkout_from_args(options): options["checkout"], options["head-repo"], options["base-repo"], - options["base-ref"], options["base-rev"], head_ref, head_rev, diff --git a/test/test_scripts_run_task.py b/test/test_scripts_run_task.py index 9a0439d44..aa1c89a27 100644 --- a/test/test_scripts_run_task.py +++ b/test/test_scripts_run_task.py @@ -183,7 +183,6 @@ def test_collect_vcs_options(monkeypatch, run_task_mod, env, extra_expected): expected = { "base-repo": env.get("BASE_REPOSITORY"), - "base-ref": env.get("BASE_REF"), "base-rev": env.get("BASE_REV"), "checkout": os.path.join(os.getcwd(), "checkout"), "env-prefix": name.upper(), @@ -354,7 +353,7 @@ def _commit_file(message, filename): @pytest.mark.parametrize( - "base_ref,head_ref,files,hash_key", + "base_rev,head_ref,files,hash_key", [ (None, None, ["mainfile"], "main"), (None, "main", ["mainfile"], "main"), @@ -367,7 +366,7 @@ def test_git_checkout( mock_stdin, run_task_mod, mock_git_repo, - base_ref, + base_rev, head_ref, files, hash_key, @@ -378,8 +377,7 @@ def test_git_checkout( destination_path=destination, head_repo=mock_git_repo["path"], base_repo=mock_git_repo["path"], - base_ref=base_ref, - base_rev=None, + base_rev=base_rev, head_ref=head_ref, head_rev=None, ssh_key_file=None, @@ -414,7 +412,6 @@ def test_git_checkout_with_commit( destination_path=destination, head_repo=mock_git_repo["path"], base_repo=mock_git_repo["path"], - base_ref="mybranch", base_rev=mock_git_repo["main"], head_ref=mock_git_repo["branch"], head_rev=mock_git_repo["branch"], From fb2a0a007a0ef1d744c8a631aada8bdac189d056 Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Thu, 2 Oct 2025 14:45:53 -0400 Subject: [PATCH 04/10] refactor(run-task): move git fetch logic into helper function --- src/taskgraph/run-task/run-task | 45 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index b1b6bb893..f1434e7c4 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -566,6 +566,22 @@ def configure_volume_posix(volume, user, group, running_as_root): set_dir_permissions(volume, user.pw_uid, group.gr_gid) +def git_fetch( + destination_path: str, + ref: str, + remote: str = "origin", + tags: bool = False, + env: Optional[dict[str, str]] = None, +): + args = ["git", "fetch"] + if tags: + # `--force` is needed to be able to update an existing outdated tag. + args.extend(["--tags", "--force"]) + + args.extend([remote, ref]) + retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + + def _clean_git_checkout(destination_path): # Delete untracked files (i.e. build products) print_line(b"vcs", b"cleaning git checkout...\n") @@ -669,29 +685,18 @@ def git_checkout( # that consumers can compute the merge-base or files modified between the # two as needed. if base_rev and base_rev != NULL_REVISION: - args = ["git", "fetch", "origin", base_rev] + git_fetch(destination_path, base_rev, env=env) - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) - - # Fetch head_ref - args = ["git", "fetch"] - if head_ref and base_repo == head_repo: - # If a head_ref was provided, it might be tag, so we need to make sure we fetch - # those. This is explicitly only done when base and head repo match, - # because it is the only scenario where tags could be present. (PRs, for - # example, always include an explicit rev.) Failure to do this could result - # in not having a tag, or worse: having an outdated version of one. - # `--force` is needed to be able to update an existing tag. - args.extend(["--tags", "--force"]) - - else: - args.append("--no-tags") + # If a head_ref was provided, it might be tag, so we need to make sure we fetch + # those. This is explicitly only done when base and head repo match, + # because it is the only scenario where tags could be present. (PRs, for + # example, always include an explicit rev.) Failure to do this could result + # in not having a tag, or worse: having an outdated version of one. + tags = True if head_ref and base_repo == head_repo else False # If a head_ref isn't provided, we fetch all refs from head_repo, which may be slow. - args.extend( - [head_repo, head_ref if head_ref else "+refs/heads/*:refs/remotes/work/*"] - ) - retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) + target = head_ref if head_ref else "+refs/heads/*:refs/remotes/work/*" + git_fetch(destination_path, target, remote=head_repo, tags=tags, env=env) args = [ "git", From 7c6c19d34270403e1f96f441289f3be8fa8585f1 Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Fri, 3 Oct 2025 12:02:08 -0400 Subject: [PATCH 05/10] fix(run-task): don't fetch tags if target ref starts with 'refs/heads' --- src/taskgraph/run-task/run-task | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index f1434e7c4..a12016683 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -692,7 +692,9 @@ def git_checkout( # because it is the only scenario where tags could be present. (PRs, for # example, always include an explicit rev.) Failure to do this could result # in not having a tag, or worse: having an outdated version of one. - tags = True if head_ref and base_repo == head_repo else False + tags = False + if head_ref and not head_ref.startswith("refs/heads/") and base_repo == head_repo: + tags = True # If a head_ref isn't provided, we fetch all refs from head_repo, which may be slow. target = head_ref if head_ref else "+refs/heads/*:refs/remotes/work/*" From dbda62de736a0ca84ca61993391333068bc06ed6 Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Wed, 10 Sep 2025 14:25:41 -0400 Subject: [PATCH 06/10] test: use tmp_path fixture in test_git_checkout --- test/test_scripts_run_task.py | 81 +++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 37 deletions(-) diff --git a/test/test_scripts_run_task.py b/test/test_scripts_run_task.py index aa1c89a27..fcbf1f0dd 100644 --- a/test/test_scripts_run_task.py +++ b/test/test_scripts_run_task.py @@ -366,58 +366,65 @@ def test_git_checkout( mock_stdin, run_task_mod, mock_git_repo, + tmp_path, base_rev, head_ref, files, hash_key, ): - with tempfile.TemporaryDirectory() as workdir: - destination = os.path.join(workdir, "destination") - run_task_mod.git_checkout( - destination_path=destination, - head_repo=mock_git_repo["path"], - base_repo=mock_git_repo["path"], - base_rev=base_rev, - head_ref=head_ref, - head_rev=None, - ssh_key_file=None, - ssh_known_hosts_file=None, - ) + destination = tmp_path / "destination" + run_task_mod.git_checkout( + destination_path=destination, + head_repo=mock_git_repo["path"], + base_repo=mock_git_repo["path"], + base_rev=base_rev, + head_ref=head_ref, + head_rev=None, + ssh_key_file=None, + ssh_known_hosts_file=None, + ) - # Check desired files exist - for filename in files: - assert os.path.exists(os.path.join(destination, filename)) + # Check desired files exist + for filename in files: + assert os.path.exists(os.path.join(destination, filename)) - # Check repo is on the right branch - if head_ref: - current_branch = subprocess.check_output( - args=["git", "rev-parse", "--abbrev-ref", "HEAD"], - cwd=destination, - universal_newlines=True, - ).strip() - assert current_branch == head_ref + # Check repo is on the right branch + if head_ref: + current_branch = subprocess.check_output( + args=["git", "rev-parse", "--abbrev-ref", "HEAD"], + cwd=destination, + universal_newlines=True, + ).strip() + assert current_branch == head_ref - current_rev = git_current_rev(destination) - assert current_rev == mock_git_repo[hash_key] + current_rev = git_current_rev(destination) + assert current_rev == mock_git_repo[hash_key] def test_git_checkout_with_commit( mock_stdin, run_task_mod, mock_git_repo, + tmp_path, ): - with tempfile.TemporaryDirectory() as workdir: - destination = os.path.join(workdir, "destination") - run_task_mod.git_checkout( - destination_path=destination, - head_repo=mock_git_repo["path"], - base_repo=mock_git_repo["path"], - base_rev=mock_git_repo["main"], - head_ref=mock_git_repo["branch"], - head_rev=mock_git_repo["branch"], - ssh_key_file=None, - ssh_known_hosts_file=None, - ) + destination = tmp_path / "destination" + run_task_mod.git_checkout( + destination_path=str(destination), + head_repo=mock_git_repo["path"], + base_repo=mock_git_repo["path"], + base_rev=mock_git_repo["main"], + head_ref="mybranch", + head_rev=mock_git_repo["branch"], + ssh_key_file=None, + ssh_known_hosts_file=None, + ) + + current_rev = subprocess.check_output( + args=["git", "rev-parse", "HEAD"], + cwd=str(destination), + universal_newlines=True, + ).strip() + assert current_rev == mock_git_repo["branch"] def test_display_python_version_should_output_python_versions_title( From ea0b89b4165ca20c23b5bf1b4c9ffef3bf5dab5c Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Wed, 10 Sep 2025 14:25:41 -0400 Subject: [PATCH 07/10] fix(run-task)!: fetch head_rev if head_ref isn't provided BREAKING CHANGE: omitting `head_ref` no longer fetches all heads Previously we were fetching all heads in this case so that we could then run `git checkout ` successfully. But it's much faster to just explicitly fetch `` in the first place. This also refactors `git_fetch` to be able to fetch multiple targets at once. --- src/taskgraph/run-task/run-task | 24 +++++++++--- test/test_scripts_run_task.py | 67 +++++++++++++++++++++++---------- 2 files changed, 67 insertions(+), 24 deletions(-) diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index a12016683..9bf38baf4 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -568,7 +568,7 @@ def configure_volume_posix(volume, user, group, running_as_root): def git_fetch( destination_path: str, - ref: str, + *targets: str, remote: str = "origin", tags: bool = False, env: Optional[dict[str, str]] = None, @@ -578,7 +578,7 @@ def git_fetch( # `--force` is needed to be able to update an existing outdated tag. args.extend(["--tags", "--force"]) - args.extend([remote, ref]) + args.extend([remote, *set(targets)]) retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) @@ -631,6 +631,8 @@ def git_checkout( ssh_key_file: Optional[Path], ssh_known_hosts_file: Optional[Path], ): + assert head_ref or head_rev + env = { # abort if transfer speed is lower than 1kB/s for 1 minute "GIT_HTTP_LOW_SPEED_LIMIT": "1024", @@ -696,9 +698,21 @@ def git_checkout( if head_ref and not head_ref.startswith("refs/heads/") and base_repo == head_repo: tags = True - # If a head_ref isn't provided, we fetch all refs from head_repo, which may be slow. - target = head_ref if head_ref else "+refs/heads/*:refs/remotes/work/*" - git_fetch(destination_path, target, remote=head_repo, tags=tags, env=env) + # Fetch head_ref and/or head_rev + targets = [] + if head_ref: + targets.append(head_ref) + if not targets: + # If head_ref wasn't provided, we fallback to head_rev. + targets.append(head_rev) + + git_fetch( + destination_path, + *targets, + remote=head_repo, + tags=tags, + env=env, + ) args = [ "git", diff --git a/test/test_scripts_run_task.py b/test/test_scripts_run_task.py index fcbf1f0dd..d7589df31 100644 --- a/test/test_scripts_run_task.py +++ b/test/test_scripts_run_task.py @@ -1,3 +1,4 @@ +import functools import io import os import site @@ -332,34 +333,42 @@ def mock_git_repo(): ["git", "config", "user.email", "py@tes.t"], cwd=repo_path ) - def _commit_file(message, filename): - with open(os.path.join(repo, filename), "w") as fout: - fout.write("test file content") + def _commit_file(message, filename, content): + filepath = os.path.join(repo, filename) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + with open(filepath, "w") as fout: + fout.write(content) subprocess.check_call(["git", "add", filename], cwd=repo_path) subprocess.check_call(["git", "commit", "-m", message], cwd=repo_path) return git_current_rev(repo_path) # Commit mainfile (to main branch) - main_commit = _commit_file("Initial commit", "mainfile") + main_commits = [_commit_file("Initial commit", "mainfile", "foo")] # New branch mybranch subprocess.check_call(["git", "checkout", "-b", "mybranch"], cwd=repo_path) - # Commit branchfile to mybranch branch - branch_commit = _commit_file("File in mybranch", "branchfile") + # Create two commits to mybranch + branch_commits = [] + branch_commits.append( + _commit_file("Add file in mybranch2", "branchfile", "bar") + ) + branch_commits.append( + _commit_file("Update file in mybranch", "branchfile", "baz") + ) # Set current branch back to main subprocess.check_call(["git", "checkout", "main"], cwd=repo_path) - yield {"path": repo_path, "main": main_commit, "branch": branch_commit} + yield {"path": repo_path, "main": main_commits, "branch": branch_commits} @pytest.mark.parametrize( - "base_rev,head_ref,files,hash_key", + "base_rev,head_ref,files,hash_key,exc", [ - (None, None, ["mainfile"], "main"), - (None, "main", ["mainfile"], "main"), - (None, "mybranch", ["mainfile", "branchfile"], "branch"), - ("main", "main", ["mainfile"], "main"), - ("main", "mybranch", ["mainfile", "branchfile"], "branch"), + (None, None, ["mainfile"], "main", AssertionError), + (None, "main", ["mainfile"], "main", None), + (None, "mybranch", ["mainfile", "branchfile"], "branch", None), + ("main", "main", ["mainfile"], "main", None), + ("main", "mybranch", ["mainfile", "branchfile"], "branch", None), ], ) def test_git_checkout( @@ -371,9 +380,11 @@ def test_git_checkout( head_ref, files, hash_key, + exc, ): destination = tmp_path / "destination" - run_task_mod.git_checkout( + run_git_checkout = functools.partial( + run_task_mod.git_checkout, destination_path=destination, head_repo=mock_git_repo["path"], base_repo=mock_git_repo["path"], @@ -383,6 +394,12 @@ def test_git_checkout( ssh_key_file=None, ssh_known_hosts_file=None, ) + if exc: + with pytest.raises(exc): + run_git_checkout() + return + + run_git_checkout() # Check desired files exist for filename in files: @@ -398,23 +415,35 @@ def test_git_checkout( assert current_branch == head_ref current_rev = git_current_rev(destination) - assert current_rev == mock_git_repo[hash_key] + assert current_rev == mock_git_repo[hash_key][-1] +@pytest.mark.parametrize( + "head_ref,head_rev_index", + ( + pytest.param("mybranch", 1, id="head"), + pytest.param("mybranch", 0, id="non tip"), + pytest.param(None, 0, id="non tip without head_ref"), + ), +) def test_git_checkout_with_commit( mock_stdin, run_task_mod, mock_git_repo, tmp_path, + head_ref, + head_rev_index, ): destination = tmp_path / "destination" + base_rev = mock_git_repo["main"][-1] + head_rev = mock_git_repo["branch"][head_rev_index] run_task_mod.git_checkout( destination_path=str(destination), head_repo=mock_git_repo["path"], base_repo=mock_git_repo["path"], - base_rev=mock_git_repo["main"], - head_ref="mybranch", - head_rev=mock_git_repo["branch"], + base_rev=base_rev, + head_ref=head_ref, + head_rev=head_rev, ssh_key_file=None, ssh_known_hosts_file=None, ) @@ -424,7 +453,7 @@ def test_git_checkout_with_commit( cwd=str(destination), universal_newlines=True, ).strip() - assert current_rev == mock_git_repo["branch"] + assert current_rev == head_rev def test_display_python_version_should_output_python_versions_title( From 7bef1f92f1d4a88ba25a2a7476cb66dc7dd31b6f Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Wed, 10 Sep 2025 14:25:41 -0400 Subject: [PATCH 08/10] fix(run-task): normalize head_ref before checking it out This fixes the case where head_ref is passed in with a `refs/heads` prefix. --- src/taskgraph/run-task/run-task | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index 9bf38baf4..b89735012 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -621,6 +621,19 @@ def _clean_git_checkout(destination_path): print_line(b"vcs", b"successfully cleaned git checkout!\n") +def shortref(ref: str) -> str: + """Normalize a git ref to its short form. + + Returns the ref unchanged if it's already in short form. + """ + # Strip common ref prefixes + for prefix in ("refs/heads/", "refs/tags/"): + if ref.startswith(prefix): + return ref[len(prefix) :] + + return ref + + def git_checkout( destination_path: str, head_repo: str, @@ -721,7 +734,7 @@ def git_checkout( ] if head_ref: - args.extend(["-B", head_ref]) + args.extend(["-B", shortref(head_ref)]) # `git fetch` set `FETCH_HEAD` reference to the last commit of the desired branch args.append(head_rev if head_rev else "FETCH_HEAD") From 33eb91e4758a35a938016f602919e18834611c20 Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Wed, 10 Sep 2025 14:25:41 -0400 Subject: [PATCH 09/10] feat(run-task): implement shallow git clones Shallow clones yield a massive improvement to clone performance, at the expense of making it tricky to determine the files that were modified. --- src/taskgraph/run-task/run-task | 33 ++++++-- test/test_scripts_run_task.py | 130 ++++++++++++++++++++++++++++++-- 2 files changed, 152 insertions(+), 11 deletions(-) diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index b89735012..cc48b499e 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -571,6 +571,7 @@ def git_fetch( *targets: str, remote: str = "origin", tags: bool = False, + shallow: bool = False, env: Optional[dict[str, str]] = None, ): args = ["git", "fetch"] @@ -578,6 +579,9 @@ def git_fetch( # `--force` is needed to be able to update an existing outdated tag. args.extend(["--tags", "--force"]) + if shallow: + args.append("--depth=1") + args.extend([remote, *set(targets)]) retry_required_command(b"vcs", args, cwd=destination_path, extra_env=env) @@ -643,6 +647,7 @@ def git_checkout( head_rev: Optional[str], ssh_key_file: Optional[Path], ssh_known_hosts_file: Optional[Path], + shallow: bool = False, ): assert head_ref or head_rev @@ -688,10 +693,18 @@ def git_checkout( args = [ "git", "clone", - base_repo if base_repo else head_repo, - destination_path, ] + if shallow: + args.extend(["--depth=1", "--no-checkout"]) + + args.extend( + [ + base_repo if base_repo else head_repo, + destination_path, + ] + ) + retry_required_command(b"vcs", args, extra_env=env) # For Github based repos, base_rev often doesn't refer to an ancestor of @@ -700,7 +713,7 @@ def git_checkout( # that consumers can compute the merge-base or files modified between the # two as needed. if base_rev and base_rev != NULL_REVISION: - git_fetch(destination_path, base_rev, env=env) + git_fetch(destination_path, base_rev, shallow=shallow, env=env) # If a head_ref was provided, it might be tag, so we need to make sure we fetch # those. This is explicitly only done when base and head repo match, @@ -715,8 +728,9 @@ def git_checkout( targets = [] if head_ref: targets.append(head_ref) - if not targets: - # If head_ref wasn't provided, we fallback to head_rev. + if not head_ref or (shallow and head_rev): + # If head_ref wasn't provided, we fallback to head_rev. If we have a + # shallow clone, head_rev needs to be fetched independently regardless. targets.append(head_rev) git_fetch( @@ -724,6 +738,7 @@ def git_checkout( *targets, remote=head_repo, tags=tags, + shallow=shallow, env=env, ) @@ -911,11 +926,17 @@ def add_vcs_arguments(parser, project, name): f"--{project}-sparse-profile", help=f"Path to sparse profile for {name} checkout", ) + parser.add_argument( + f"--{project}-shallow-clone", + action="store_true", + help=f"Use shallow clone for {name}", + ) def collect_vcs_options(args, project, name): checkout = getattr(args, f"{project}_checkout") sparse_profile = getattr(args, f"{project}_sparse_profile") + shallow_clone = getattr(args, f"{project}_shallow_clone") env_prefix = project.upper() @@ -960,6 +981,7 @@ def collect_vcs_options(args, project, name): "repo-type": repo_type, "ssh-secret-name": private_key_secret, "pip-requirements": pip_requirements, + "shallow-clone": shallow_clone, } @@ -1008,6 +1030,7 @@ def vcs_checkout_from_args(options): head_rev, ssh_key_file, ssh_known_hosts_file, + shallow=options.get("shallow-clone", False), ) elif options["repo-type"] == "hg": revision = hg_checkout( diff --git a/test/test_scripts_run_task.py b/test/test_scripts_run_task.py index d7589df31..ccba1a440 100644 --- a/test/test_scripts_run_task.py +++ b/test/test_scripts_run_task.py @@ -152,9 +152,10 @@ def test_install_pip_requirements_with_uv( @pytest.mark.parametrize( - "env,extra_expected", + "args,env,extra_expected", [ pytest.param( + {}, { "REPOSITORY_TYPE": "hg", "BASE_REPOSITORY": "https://hg.mozilla.org/mozilla-central", @@ -165,10 +166,27 @@ def test_install_pip_requirements_with_uv( { "base-repo": "https://hg.mozilla.org/mozilla-unified", }, - ) + id="hg", + ), + pytest.param( + {"myrepo_shallow_clone": True}, + { + "REPOSITORY_TYPE": "git", + "HEAD_REPOSITORY": "https://github.com/test/repo.git", + "HEAD_REV": "abc123", + }, + {"shallow-clone": True}, + id="git_with_shallow_clone", + ), ], ) -def test_collect_vcs_options(monkeypatch, run_task_mod, env, extra_expected): +def test_collect_vcs_options( + monkeypatch, + run_task_mod, + args, + env, + extra_expected, +): name = "myrepo" checkout = "checkout" @@ -176,9 +194,10 @@ def test_collect_vcs_options(monkeypatch, run_task_mod, env, extra_expected): for k, v in env.items(): monkeypatch.setenv(f"{name.upper()}_{k.upper()}", v) - args = Namespace() - setattr(args, f"{name}_checkout", checkout) - setattr(args, f"{name}_sparse_profile", False) + args.setdefault(f"{name}_checkout", checkout) + args.setdefault(f"{name}_shallow_clone", False) + args.setdefault(f"{name}_sparse_profile", False) + args = Namespace(**args) result = run_task_mod.collect_vcs_options(args, name, name) @@ -194,6 +213,7 @@ def test_collect_vcs_options(monkeypatch, run_task_mod, env, extra_expected): "head-ref": env.get("HEAD_REF"), "head-rev": env.get("HEAD_REV"), "repo-type": env.get("REPOSITORY_TYPE"), + "shallow-clone": False, "ssh-secret-name": env.get("SSH_SECRET_NAME"), "sparse-profile": False, "store-path": env.get("HG_STORE_PATH"), @@ -456,6 +476,104 @@ def test_git_checkout_with_commit( assert current_rev == head_rev +def test_git_checkout_shallow( + mock_stdin, + run_task_mod, + mock_git_repo, + tmp_path, +): + destination = tmp_path / "destination" + + # Git ignores `--depth` when cloning from local directories, so use file:// + # protocol to force shallow clone. + repo_url = f"file://{mock_git_repo['path']}" + base_rev = mock_git_repo["main"][-1] + head_rev = mock_git_repo["branch"][-1] + + # Use shallow clone with head_ref != head_rev + run_task_mod.git_checkout( + destination_path=str(destination), + head_repo=repo_url, + base_repo=repo_url, + base_rev=base_rev, + head_ref="mybranch", + head_rev=head_rev, + ssh_key_file=None, + ssh_known_hosts_file=None, + shallow=True, + ) + shallow_file = destination / ".git" / "shallow" + assert shallow_file.exists() + + # Verify we're on the correct commit + final_rev = subprocess.check_output( + ["git", "rev-parse", "HEAD"], + cwd=str(destination), + universal_newlines=True, + ).strip() + assert final_rev == head_rev + + # Verify both base_rev and head_rev are available. + for sha in (base_rev, head_rev): + result = subprocess.run( + ["git", "cat-file", "-t", sha], + cwd=str(destination), + capture_output=True, + text=True, + ) + assert result.returncode == 0, f"Commit {sha} should be available" + assert result.stdout.strip() == "commit" + + +def test_git_fetch_shallow( + mock_stdin, + run_task_mod, + mock_git_repo, + tmp_path, +): + destination = tmp_path / "destination" + + # Git ignores `--depth` when cloning from local directories, so use file:// + # protocol to force shallow clone. + repo_url = f"file://{mock_git_repo['path']}" + + run_task_mod.run_command( + b"vcs", + [ + "git", + "clone", + "--depth=1", + "--no-checkout", + repo_url, + str(destination), + ], + ) + shallow_file = destination / ".git" / "shallow" + assert shallow_file.exists() + + # Verify base_rev doesn't exist yet + base_rev = mock_git_repo["branch"][-1] + result = subprocess.run( + ["git", "cat-file", "-t", base_rev], + cwd=str(destination), + capture_output=True, + text=True, + ) + assert result.returncode != 0 + + run_task_mod.git_fetch(str(destination), base_rev, remote=repo_url, shallow=True) + + # Verify base_rev is now available + result = subprocess.run( + ["git", "cat-file", "-t", base_rev], + cwd=str(destination), + capture_output=True, + text=True, + ) + assert result.returncode == 0 + assert result.stdout.strip() == "commit" + + def test_display_python_version_should_output_python_versions_title( run_task_mod, capsys ): From c8788ca49c6e7123e932786f2beb892813c67cd6 Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Wed, 15 Oct 2025 11:36:34 -0400 Subject: [PATCH 10/10] fix(vcs): better support for shallow clones in repo.get_changed_files() `git log BASE..HEAD` says, show me commits reachable from HEAD, but not reachable from BASE. In a shallow clone where we only fetch BASE and HEAD (which is what run-task does), this means the command will only return `HEAD`. In otherwords, we're only returning files changed by the tip commit of the push and ignoring everything else. By switching to `git diff BASE HEAD`, we're instead comparing the snapshots of both revisions. Sometimes this is what we want, e.g for force pushes, it'll be the interdiff of files modified between the two pushes (though some developers might expect it to contain the files modified since the merge base). Sometimes it's not what we want, e.g for PRs, it'll be the files changed between the PR and the latest commit on `main`. Either way, this behaviour is at least somewhat more accurate than git log when we don't have full history. Likely we'll need to fetch the proper changed files using the Github API in the future, but for now this is better than nothing. --- src/taskgraph/util/vcs.py | 22 +++++++++++ test/test_util_vcs.py | 82 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/src/taskgraph/util/vcs.py b/src/taskgraph/util/vcs.py index 75ff212dc..f189af544 100644 --- a/src/taskgraph/util/vcs.py +++ b/src/taskgraph/util/vcs.py @@ -57,6 +57,11 @@ def run(self, *args: str, **kwargs) -> str: def tool(self) -> str: """Version control system being used, either 'hg' or 'git'.""" + @property + @abstractmethod + def is_shallow(self) -> str: + """Whether this repo is a shallow clone.""" + @property @abstractmethod def head_rev(self) -> str: @@ -224,6 +229,10 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._env["HGPLAIN"] = "1" + @property + def is_shallow(self): + return False + @property def head_rev(self): return self.run("log", "-r", ".", "-T", "{node}").strip() @@ -371,6 +380,10 @@ def default_remote_name(self) -> str: _LS_REMOTE_PATTERN = re.compile(r"ref:\s+refs/heads/(?P\S+)\s+HEAD") + @property + def is_shallow(self): + return self.run("rev-parse", "--is-shallow-repository").strip() == "true" + @property def head_rev(self): return self.run("rev-parse", "--verify", "HEAD").strip() @@ -492,6 +505,15 @@ def get_changed_files(self, diff_filter=None, mode=None, rev=None, base=None): cmd.append("--cached") elif mode == "all": cmd.append("HEAD") + elif self.is_shallow: + # In shallow clones, `git log` won't have the history necessary to + # determine the files changed. Using `git diff` finds the + # differences between the two trees which is slightly more + # accurate. However, Github events often don't provide the true + # base revision so shallow Github clones will still return + # incorrect files changed in many cases, most notably pull + # requests that need rebasing. + cmd = ["diff", base, rev] else: revision_argument = f"{rev}~1..{rev}" if base is None else f"{base}..{rev}" cmd = ["log", "--format=format:", revision_argument] diff --git a/test/test_util_vcs.py b/test/test_util_vcs.py index 7585fe367..2e2327342 100644 --- a/test/test_util_vcs.py +++ b/test/test_util_vcs.py @@ -508,3 +508,85 @@ def test_does_revision_exist_locally(repo): assert repo.does_revision_exist_locally(first_revision) assert repo.does_revision_exist_locally(last_revision) assert not repo.does_revision_exist_locally("deadbeef") + + +def test_get_changed_files_shallow_clone(git_repo, tmp_path, default_git_branch): + tmp_repo = Path(git_repo) + + # Add initial files to the existing repo (which already has first_file from fixture) + (tmp_repo / "common.txt").write_text("common content") + (tmp_repo / "file_to_modify.txt").write_text("original content") + (tmp_repo / "file_to_delete.txt").write_text("will be deleted") + subprocess.check_call(["git", "add", "."], cwd=tmp_repo) + subprocess.check_call(["git", "commit", "-m", "Add test files"], cwd=tmp_repo) + + # Create feature branch and make changes + subprocess.check_call(["git", "checkout", "-b", "feature"], cwd=tmp_repo) + + # On feature branch: modify file, add new file, delete file + (tmp_repo / "file_to_modify.txt").write_text("modified in feature") + (tmp_repo / "feature_only.txt").write_text("feature specific file") + (tmp_repo / "file_to_delete.txt").unlink() + subprocess.check_call(["git", "rm", "file_to_delete.txt"], cwd=tmp_repo) + subprocess.check_call(["git", "add", "."], cwd=tmp_repo) + subprocess.check_call(["git", "commit", "-m", "Feature changes"], cwd=tmp_repo) + + feature_commit = subprocess.check_output( + ["git", "rev-parse", "HEAD"], cwd=tmp_repo, text=True + ).strip() + + # Switch back to main and make different changes + subprocess.check_call(["git", "checkout", default_git_branch], cwd=tmp_repo) + + # On main branch: different modifications + (tmp_repo / "file_to_modify.txt").write_text("modified in main") + (tmp_repo / "main_only.txt").write_text("main specific file") + subprocess.check_call(["git", "add", "."], cwd=tmp_repo) + subprocess.check_call(["git", "commit", "-m", "Main changes"], cwd=tmp_repo) + + main_commit = subprocess.check_output( + ["git", "rev-parse", "HEAD"], cwd=tmp_repo, text=True + ).strip() + + # Create a shallow clone with only the latest commit from each branch + shallow_path = tmp_path / "shallow" + subprocess.check_call( + [ + "git", + "clone", + "--depth=1", + "--no-single-branch", + f"file://{tmp_repo}", + str(shallow_path), + ] + ) + shallow_repo = get_repository(str(shallow_path)) + assert shallow_repo.is_shallow + + # Between main and feature: + # - file_to_modify.txt was modified differently (M) + # - file_to_delete.txt exists in main but not in feature (D) + # - feature_only.txt exists in feature but not in main (A) + # - main_only.txt exists in main but not in feature (D when comparing feature to main) + changed_files = shallow_repo.get_changed_files( + "AMD", "all", feature_commit, main_commit + ) + assert "file_to_modify.txt" in changed_files # Modified + assert "file_to_delete.txt" in changed_files # Deleted in feature + assert "feature_only.txt" in changed_files # Added in feature + assert ( + "main_only.txt" in changed_files + ) # Not in feature (shows as deleted from main's perspective) + + added = shallow_repo.get_changed_files("A", "all", feature_commit, main_commit) + assert "feature_only.txt" in added + assert "file_to_delete.txt" not in added + + deleted = shallow_repo.get_changed_files("D", "all", feature_commit, main_commit) + assert "file_to_delete.txt" in deleted + assert ( + "main_only.txt" in deleted + ) # From feature's perspective, main_only.txt doesn't exist + + modified = shallow_repo.get_changed_files("M", "all", feature_commit, main_commit) + assert "file_to_modify.txt" in modified