Skip to content

Commit 38c2317

Browse files
feat: include_submodules option (#313)
* feat: add optional --include-submodules flag to CLI and ingestion
  - Adds a --include-submodules CLI flag to control submodule analysis
  - Propagates include_submodules through ingestion, schemas, and clone logic
  - Updates tests to cover submodule inclusion
  - Adds a helper function (_checkout_partial_clone) to avoid repetition
  - Adds an include_submodules example to README.md
  - The Web UI for this option is not implemented for now (#313 (comment))
---------
Co-authored-by: Filip Christiansen <[email protected]>
1 parent 149c8e9 commit 38c2317

File tree

8 files changed

+92
-14
lines changed

8 files changed

+92
-14
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,9 @@ gitingest https://github.com/username/private-repo --token github_pat_...
122122
# Or set it as an environment variable
123123
export GITHUB_TOKEN=github_pat_...
124124
gitingest https://github.com/username/private-repo
125+
126+
# Include repository submodules
127+
gitingest https://github.com/username/repo-with-submodules --include-submodules
125128
```
126129

127130
By default, files listed in `.gitignore` are skipped. Use `--include-gitignored` if you
@@ -163,6 +166,9 @@ summary, tree, content = ingest("https://github.com/username/private-repo", toke
163166
import os
164167
os.environ["GITHUB_TOKEN"] = "github_pat_..."
165168
summary, tree, content = ingest("https://github.com/username/private-repo")
169+
170+
# Include repository submodules
171+
summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True)
166172
```
167173

168174
By default, this won't write a file but can be enabled with the `output` argument.

src/gitingest/cli.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class _CLIArgs(TypedDict):
2020
include_pattern: tuple[str, ...]
2121
branch: str | None
2222
include_gitignored: bool
23+
include_submodules: bool
2324
token: str | None
2425
output: str | None
2526

@@ -47,6 +48,12 @@ class _CLIArgs(TypedDict):
4748
default=False,
4849
help="Include files matched by .gitignore and .gitingestignore",
4950
)
51+
@click.option(
52+
"--include-submodules",
53+
is_flag=True,
54+
help="Include repository's submodules in the analysis",
55+
default=False,
56+
)
5057
@click.option(
5158
"--token",
5259
"-t",
@@ -94,6 +101,9 @@ def main(**cli_kwargs: Unpack[_CLIArgs]) -> None:
94101
$ gitingest https://github.com/user/private-repo -t ghp_token
95102
$ GITHUB_TOKEN=ghp_token gitingest https://github.com/user/private-repo
96103
104+
Include submodules:
105+
$ gitingest https://github.com/user/repo --include-submodules
106+
97107
"""
98108
asyncio.run(_async_main(**cli_kwargs))
99109

@@ -106,6 +116,7 @@ async def _async_main(
106116
include_pattern: tuple[str, ...] | None = None,
107117
branch: str | None = None,
108118
include_gitignored: bool = False,
119+
include_submodules: bool = False,
109120
token: str | None = None,
110121
output: str | None = None,
111122
) -> None:
@@ -129,6 +140,8 @@ async def _async_main(
129140
Git branch to ingest. If ``None``, the repository's default branch is used.
130141
include_gitignored : bool
131142
If ``True``, also ingest files matched by ``.gitignore`` or ``.gitingestignore`` (default: ``False``).
143+
include_submodules : bool
144+
If ``True``, recursively include all Git submodules within the repository (default: ``False``).
132145
token : str | None
133146
GitHub personal access token (PAT) for accessing private repositories.
134147
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -155,14 +168,15 @@ async def _async_main(
155168
click.echo(f"Analyzing source, output will be written to '{output_target}'...", err=True)
156169

157170
summary, _, _ = await ingest_async(
158-
source=source,
171+
source,
159172
max_file_size=max_size,
160173
include_patterns=include_patterns,
161174
exclude_patterns=exclude_patterns,
162175
branch=branch,
163-
output=output_target,
164176
include_gitignored=include_gitignored,
177+
include_submodules=include_submodules,
165178
token=token,
179+
output=output_target,
166180
)
167181
except Exception as exc:
168182
# Convert any exception into Click.Abort so that exit status is non-zero

src/gitingest/clone.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
6363
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
6464

6565
clone_cmd += ["clone", "--single-branch"]
66-
# TODO: Re-enable --recurse-submodules when submodule support is needed
66+
67+
if config.include_submodules:
68+
clone_cmd += ["--recurse-submodules"]
6769

6870
if partial_clone:
6971
clone_cmd += ["--filter=blob:none", "--sparse"]
@@ -86,15 +88,28 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
8688

8789
# Checkout the subpath if it is a partial clone
8890
if partial_clone:
89-
subpath = config.subpath.lstrip("/")
90-
if config.blob:
91-
# When ingesting from a file url (blob/branch/path/file.txt), we need to remove the file name.
92-
subpath = str(Path(subpath).parent.as_posix())
93-
94-
checkout_cmd = create_git_command(["git"], local_path, url, token)
95-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
91+
await _checkout_partial_clone(config, token)
9692

9793
# Checkout the commit if it is provided
9894
if commit:
9995
checkout_cmd = create_git_command(["git"], local_path, url, token)
10096
await run_command(*checkout_cmd, "checkout", commit)
97+
98+
99+
async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
100+
"""Configure sparse-checkout for a partially cloned repository.
101+
102+
Parameters
103+
----------
104+
config : CloneConfig
105+
The configuration for cloning the repository, including subpath and blob flag.
106+
token : str | None
107+
GitHub personal access token (PAT) for accessing private repositories.
108+
109+
"""
110+
subpath = config.subpath.lstrip("/")
111+
if config.blob:
112+
# Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
113+
subpath = str(Path(subpath).parent.as_posix())
114+
checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
115+
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)

src/gitingest/entrypoint.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ async def ingest_async(
2727
branch: str | None = None,
2828
tag: str | None = None,
2929
include_gitignored: bool = False,
30+
include_submodules: bool = False,
3031
token: str | None = None,
3132
output: str | None = None,
3233
) -> tuple[str, str, str]:
@@ -52,6 +53,8 @@ async def ingest_async(
5253
The tag to clone and ingest. If ``None``, no tag is used.
5354
include_gitignored : bool
5455
If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``).
56+
include_submodules : bool
57+
If ``True``, recursively include all Git submodules within the repository (default: ``False``).
5558
token : str | None
5659
GitHub personal access token (PAT) for accessing private repositories.
5760
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -86,6 +89,8 @@ async def ingest_async(
8689
if query.url:
8790
_override_branch_and_tag(query, branch=branch, tag=tag)
8891

92+
query.include_submodules = include_submodules
93+
8994
async with _clone_repo_if_remote(query, token=token):
9095
summary, tree, content = ingest_query(query)
9196
await _write_output(tree, content=content, target=output)
@@ -101,6 +106,7 @@ def ingest(
101106
branch: str | None = None,
102107
tag: str | None = None,
103108
include_gitignored: bool = False,
109+
include_submodules: bool = False,
104110
token: str | None = None,
105111
output: str | None = None,
106112
) -> tuple[str, str, str]:
@@ -126,6 +132,8 @@ def ingest(
126132
The tag to clone and ingest. If ``None``, no tag is used.
127133
include_gitignored : bool
128134
If ``True``, include files ignored by ``.gitignore`` and ``.gitingestignore`` (default: ``False``).
135+
include_submodules : bool
136+
If ``True``, recursively include all Git submodules within the repository (default: ``False``).
129137
token : str | None
130138
GitHub personal access token (PAT) for accessing private repositories.
131139
Can also be set via the ``GITHUB_TOKEN`` environment variable.
@@ -156,6 +164,7 @@ def ingest(
156164
branch=branch,
157165
tag=tag,
158166
include_gitignored=include_gitignored,
167+
include_submodules=include_submodules,
159168
token=token,
160169
output=output,
161170
),

src/gitingest/schemas/ingestion.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212

1313
@dataclass
14-
class CloneConfig:
14+
class CloneConfig: # pylint: disable=too-many-instance-attributes
1515
"""Configuration for cloning a Git repository.
1616
1717
This class holds the necessary parameters for cloning a repository to a local path, including
@@ -33,6 +33,8 @@ class CloneConfig:
3333
The subpath to clone from the repository (default: ``"/"``).
3434
blob: bool
3535
Whether the repository is a blob (default: ``False``).
36+
include_submodules: bool
37+
Whether to clone submodules (default: ``False``).
3638
3739
"""
3840

@@ -43,6 +45,7 @@ class CloneConfig:
4345
tag: str | None = None
4446
subpath: str = "/"
4547
blob: bool = False
48+
include_submodules: bool = False
4649

4750

4851
class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
@@ -78,6 +81,8 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
7881
The patterns to ignore (default: ``set()``).
7982
include_patterns : set[str] | None
8083
The patterns to include.
84+
include_submodules : bool
85+
Whether to include all Git submodules within the repository (default: ``False``).
8186
8287
"""
8388

@@ -95,6 +100,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
95100
max_file_size: int = Field(default=MAX_FILE_SIZE)
96101
ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type
97102
include_patterns: set[str] | None = None
103+
include_submodules: bool = False
98104

99105
def extract_clone_config(self) -> CloneConfig:
100106
"""Extract the relevant fields for the CloneConfig object.
@@ -122,6 +128,7 @@ def extract_clone_config(self) -> CloneConfig:
122128
tag=self.tag,
123129
subpath=self.subpath,
124130
blob=self.type == "blob",
131+
include_submodules=self.include_submodules,
125132
)
126133

127134
def ensure_url(self) -> None:

tests/query_parser/test_git_host_agnostic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ async def test_parse_query_without_host(
6868
"commit": None,
6969
"max_file_size": 50,
7070
"include_patterns": None,
71+
"include_submodules": False,
7172
}
7273

7374
assert actual == expected

tests/test_cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"tests/",
2828
"--include-pattern",
2929
"src/",
30+
"--include-submodules",
3031
],
3132
True,
3233
id="custom-options",

tests/test_clone.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,10 +181,10 @@ async def test_clone_default_shallow_clone(run_command_mock: AsyncMock) -> None:
181181

182182

183183
@pytest.mark.asyncio
184-
async def test_clone_commit_without_branch(run_command_mock: AsyncMock) -> None:
185-
"""Test cloning when a commit hash is provided but no branch is specified.
184+
async def test_clone_commit(run_command_mock: AsyncMock) -> None:
185+
"""Test cloning when a commit hash is provided.
186186
187-
Given a valid URL and a commit hash (but no branch):
187+
Given a valid URL and a commit hash:
188188
When ``clone_repo`` is called,
189189
Then the repository should be cloned and checked out at that commit.
190190
"""
@@ -414,3 +414,28 @@ async def test_clone_with_commit_and_subpath(run_command_mock: AsyncMock) -> Non
414414
)
415415

416416
assert run_command_mock.call_count == expected_call_count
417+
418+
419+
@pytest.mark.asyncio
420+
async def test_clone_with_include_submodules(run_command_mock: AsyncMock) -> None:
421+
"""Test cloning a repository with submodules included.
422+
423+
Given a valid URL and ``include_submodules=True``:
424+
When ``clone_repo`` is called,
425+
Then the repository should be cloned with ``--recurse-submodules`` in the git command.
426+
"""
427+
expected_call_count = 1 # No commit and no partial clone
428+
clone_config = CloneConfig(url=DEMO_URL, local_path=LOCAL_REPO_PATH, branch="main", include_submodules=True)
429+
430+
await clone_repo(clone_config)
431+
432+
assert run_command_mock.call_count == expected_call_count
433+
run_command_mock.assert_called_once_with(
434+
"git",
435+
"clone",
436+
"--single-branch",
437+
"--recurse-submodules",
438+
"--depth=1",
439+
clone_config.url,
440+
clone_config.local_path,
441+
)

0 commit comments

Comments
 (0)