Skip to content

Commit f4fd4bb

Browse files
feat: partial cloning (#188)
This commit introduces the `partial_clone_repo` function, which performs a sparse clone of a repository (`git clone --filter=blob:none --sparse`) based on query parameters from a `ParsedQuery` object.

- Add a new method (`extact_clone_config`) in `ParsedQuery` to encapsulate the creation of a `CloneConfig` from query parameters.
- Replace repeated `CloneConfig` instantiation in `repository_ingest.py` and `query_processor.py` with calls to the new method.
- Simplify code and improve maintainability by centralizing `CloneConfig` logic.

* Refactor cloning logic to support subpath-based partial clones
  - Add `repo_name` and `subpath` fields to `CloneConfig` for flexible cloning.
  - Split out `partial_clone_repo` and `full_clone_repo` to handle subpath vs. full clones.
  - Update `CloneConfig` to include `repo_name` and `subpath`.
  - Simplify query processing to always call `clone_repo`, which now delegates to partial or full clone.
  - Improve docstrings to reflect new parameters and return types.

---------

Co-authored-by: cyclotruc <[email protected]>
1 parent f90595d commit f4fd4bb

File tree

6 files changed

+200
-140
lines changed

6 files changed

+200
-140
lines changed

src/gitingest/query_parser.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from gitingest.config import MAX_FILE_SIZE, TMP_BASE_PATH
1414
from gitingest.exceptions import InvalidPatternError
1515
from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
16-
from gitingest.repository_clone import _check_repo_exists, fetch_remote_branch_list
16+
from gitingest.repository_clone import CloneConfig, _check_repo_exists, fetch_remote_branch_list
1717

1818
HEX_DIGITS: Set[str] = set(string.hexdigits)
1919

@@ -35,11 +35,11 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes
3535

3636
user_name: Optional[str]
3737
repo_name: Optional[str]
38-
subpath: str
3938
local_path: Path
4039
url: Optional[str]
4140
slug: str
4241
id: str
42+
subpath: str = "/"
4343
type: Optional[str] = None
4444
branch: Optional[str] = None
4545
commit: Optional[str] = None
@@ -48,6 +48,31 @@ class ParsedQuery: # pylint: disable=too-many-instance-attributes
4848
include_patterns: Optional[Set[str]] = None
4949
pattern_type: Optional[str] = None
5050

51+
def extact_clone_config(self) -> CloneConfig:
52+
"""
53+
Extract the relevant fields for the CloneConfig object.
54+
55+
Returns
56+
-------
57+
CloneConfig
58+
A CloneConfig object containing the relevant fields.
59+
60+
Raises
61+
------
62+
ValueError
63+
If the 'url' parameter is not provided.
64+
"""
65+
if not self.url:
66+
raise ValueError("The 'url' parameter is required.")
67+
68+
return CloneConfig(
69+
url=self.url,
70+
local_path=str(self.local_path),
71+
commit=self.commit,
72+
branch=self.branch,
73+
subpath=self.subpath,
74+
)
75+
5176

5277
async def parse_query(
5378
source: str,
@@ -171,7 +196,6 @@ async def _parse_repo_source(source: str) -> ParsedQuery:
171196
user_name=user_name,
172197
repo_name=repo_name,
173198
url=url,
174-
subpath="/",
175199
local_path=local_path,
176200
slug=slug,
177201
id=_id,
@@ -363,7 +387,6 @@ def _parse_path(path_str: str) -> ParsedQuery:
363387
user_name=None,
364388
repo_name=None,
365389
url=None,
366-
subpath="/",
367390
local_path=path_obj,
368391
slug=f"{path_obj.parent.name}/{path_obj.name}",
369392
id=str(uuid.uuid4()),

src/gitingest/repository_clone.py

Lines changed: 66 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,19 @@ class CloneConfig:
2929
The specific commit hash to check out after cloning (default is None).
3030
branch : str, optional
3131
The branch to clone (default is None).
32+
subpath : str
33+
The subpath to clone from the repository (default is "/").
3234
"""
3335

3436
url: str
3537
local_path: str
3638
commit: Optional[str] = None
3739
branch: Optional[str] = None
40+
subpath: str = "/"
3841

3942

4043
@async_timeout(TIMEOUT)
41-
async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]:
44+
async def clone_repo(config: CloneConfig) -> None:
4245
"""
4346
Clone a repository to a local path based on the provided configuration.
4447
@@ -49,35 +52,21 @@ async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]:
4952
Parameters
5053
----------
5154
config : CloneConfig
52-
A dictionary containing the following keys:
53-
- url (str): The URL of the repository.
54-
- local_path (str): The local path to clone the repository to.
55-
- commit (str, optional): The specific commit hash to checkout.
56-
- branch (str, optional): The branch to clone. Defaults to 'main' or 'master' if not provided.
57-
58-
Returns
59-
-------
60-
Tuple[bytes, bytes]
61-
A tuple containing the stdout and stderr of the Git commands executed.
55+
The configuration for cloning the repository.
6256
6357
Raises
6458
------
6559
ValueError
66-
If the 'url' or 'local_path' parameters are missing, or if the repository is not found.
60+
If the repository is not found or if the provided URL is invalid.
6761
OSError
68-
If there is an error creating the parent directory structure.
62+
If an error occurs while creating the parent directory for the repository.
6963
"""
7064
# Extract and validate query parameters
7165
url: str = config.url
7266
local_path: str = config.local_path
7367
commit: Optional[str] = config.commit
7468
branch: Optional[str] = config.branch
75-
76-
if not url:
77-
raise ValueError("The 'url' parameter is required.")
78-
79-
if not local_path:
80-
raise ValueError("The 'local_path' parameter is required.")
69+
partial_clone: bool = config.subpath != "/"
8170

8271
# Create parent directory if it doesn't exist
8372
parent_dir = Path(local_path).parent
@@ -90,34 +79,32 @@ async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]:
9079
if not await _check_repo_exists(url):
9180
raise ValueError("Repository not found, make sure it is public")
9281

93-
if commit:
94-
# Scenario 1: Clone and checkout a specific commit
95-
# Clone the repository without depth to ensure full history for checkout
96-
clone_cmd = ["git", "clone", "--recurse-submodules", "--single-branch", url, local_path]
97-
await _run_git_command(*clone_cmd)
82+
clone_cmd = ["git", "clone", "--recurse-submodules", "--single-branch"]
9883

99-
# Checkout the specific commit
100-
checkout_cmd = ["git", "-C", local_path, "checkout", commit]
101-
return await _run_git_command(*checkout_cmd)
84+
if partial_clone:
85+
clone_cmd += ["--filter=blob:none", "--sparse"]
10286

103-
if branch and branch.lower() not in ("main", "master"):
104-
# Scenario 2: Clone a specific branch with shallow depth
105-
clone_cmd = [
106-
"git",
107-
"clone",
108-
"--recurse-submodules",
109-
"--depth=1",
110-
"--single-branch",
111-
"--branch",
112-
branch,
113-
url,
114-
local_path,
115-
]
116-
return await _run_git_command(*clone_cmd)
117-
118-
# Scenario 3: Clone the default branch with shallow depth
119-
clone_cmd = ["git", "clone", "--recurse-submodules", "--depth=1", "--single-branch", url, local_path]
120-
return await _run_git_command(*clone_cmd)
87+
if not commit:
88+
clone_cmd += ["--depth=1"]
89+
if branch and branch.lower() not in ("main", "master"):
90+
clone_cmd += ["--branch", branch]
91+
92+
clone_cmd += [url, local_path]
93+
94+
# Clone the repository
95+
await _run_command(*clone_cmd)
96+
97+
if commit or partial_clone:
98+
checkout_cmd = ["git", "-C", local_path]
99+
100+
if partial_clone:
101+
checkout_cmd += ["sparse-checkout", "set", config.subpath.lstrip("/")]
102+
103+
if commit:
104+
checkout_cmd += ["checkout", commit]
105+
106+
# Check out the specific commit and/or subpath
107+
await _run_command(*checkout_cmd)
121108

122109

123110
async def _check_repo_exists(url: str) -> bool:
@@ -176,7 +163,7 @@ async def fetch_remote_branch_list(url: str) -> List[str]:
176163
A list of branch names available in the remote repository.
177164
"""
178165
fetch_branches_command = ["git", "ls-remote", "--heads", url]
179-
stdout, _ = await _run_git_command(*fetch_branches_command)
166+
stdout, _ = await _run_command(*fetch_branches_command)
180167
stdout_decoded = stdout.decode()
181168

182169
return [
@@ -186,41 +173,28 @@ async def fetch_remote_branch_list(url: str) -> List[str]:
186173
]
187174

188175

189-
async def _run_git_command(*args: str) -> Tuple[bytes, bytes]:
176+
async def _run_command(*args: str) -> Tuple[bytes, bytes]:
190177
"""
191-
Execute a Git command asynchronously and captures its output.
178+
Execute a command asynchronously and captures its output.
192179
193180
Parameters
194181
----------
195182
*args : str
196-
The Git command and its arguments to execute.
183+
The command and its arguments to execute.
197184
198185
Returns
199186
-------
200187
Tuple[bytes, bytes]
201-
A tuple containing the stdout and stderr of the Git command.
188+
A tuple containing the stdout and stderr of the command.
202189
203190
Raises
204191
------
205192
RuntimeError
206-
If Git is not installed or if the Git command exits with a non-zero status.
193+
If command exits with a non-zero status.
207194
"""
208-
# Check if Git is installed
209-
try:
210-
version_proc = await asyncio.create_subprocess_exec(
211-
"git",
212-
"--version",
213-
stdout=asyncio.subprocess.PIPE,
214-
stderr=asyncio.subprocess.PIPE,
215-
)
216-
_, stderr = await version_proc.communicate()
217-
if version_proc.returncode != 0:
218-
error_message = stderr.decode().strip() if stderr else "Git command not found"
219-
raise RuntimeError(f"Git is not installed or not accessible: {error_message}")
220-
except FileNotFoundError as exc:
221-
raise RuntimeError("Git is not installed. Please install Git before proceeding.") from exc
195+
await check_git_installed()
222196

223-
# Execute the requested Git command
197+
# Execute the requested command
224198
proc = await asyncio.create_subprocess_exec(
225199
*args,
226200
stdout=asyncio.subprocess.PIPE,
@@ -229,11 +203,36 @@ async def _run_git_command(*args: str) -> Tuple[bytes, bytes]:
229203
stdout, stderr = await proc.communicate()
230204
if proc.returncode != 0:
231205
error_message = stderr.decode().strip()
232-
raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}")
206+
raise RuntimeError(f"Command failed: {' '.join(args)}\nError: {error_message}")
233207

234208
return stdout, stderr
235209

236210

211+
async def check_git_installed() -> None:
212+
"""
213+
Check if Git is installed and accessible on the system.
214+
215+
Raises
216+
------
217+
RuntimeError
218+
If Git is not installed or if the Git command exits with a non-zero status.
219+
"""
220+
try:
221+
proc = await asyncio.create_subprocess_exec(
222+
"git",
223+
"--version",
224+
stdout=asyncio.subprocess.PIPE,
225+
stderr=asyncio.subprocess.PIPE,
226+
)
227+
_, stderr = await proc.communicate()
228+
if proc.returncode != 0:
229+
error_message = stderr.decode().strip() if stderr else "Git command not found"
230+
raise RuntimeError(f"Git is not installed or not accessible: {error_message}")
231+
232+
except FileNotFoundError as exc:
233+
raise RuntimeError("Git is not installed. Please install Git before proceeding.") from exc
234+
235+
237236
def _get_status_code(response: str) -> int:
238237
"""
239238
Extract the status code from an HTTP response.

src/gitingest/repository_ingest.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from gitingest.config import TMP_BASE_PATH
99
from gitingest.query_ingestion import run_ingest_query
1010
from gitingest.query_parser import ParsedQuery, parse_query
11-
from gitingest.repository_clone import CloneConfig, clone_repo
11+
from gitingest.repository_clone import clone_repo
1212

1313

1414
async def ingest_async(
@@ -70,13 +70,7 @@ async def ingest_async(
7070
selected_branch = branch if branch else parsed_query.branch # prioritize branch argument
7171
parsed_query.branch = selected_branch
7272

73-
# Extract relevant fields for CloneConfig
74-
clone_config = CloneConfig(
75-
url=parsed_query.url,
76-
local_path=str(parsed_query.local_path),
77-
commit=parsed_query.commit,
78-
branch=selected_branch,
79-
)
73+
clone_config = parsed_query.extact_clone_config()
8074
clone_coroutine = clone_repo(clone_config)
8175

8276
if inspect.iscoroutine(clone_coroutine):

src/server/query_processor.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from gitingest.query_ingestion import run_ingest_query
99
from gitingest.query_parser import ParsedQuery, parse_query
10-
from gitingest.repository_clone import CloneConfig, clone_repo
10+
from gitingest.repository_clone import clone_repo
1111
from server.server_config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE, templates
1212
from server.server_utils import Colors, log_slider_to_size
1313

@@ -84,15 +84,11 @@ async def process_query(
8484
if not parsed_query.url:
8585
raise ValueError("The 'url' parameter is required.")
8686

87-
clone_config = CloneConfig(
88-
url=parsed_query.url,
89-
local_path=str(parsed_query.local_path),
90-
commit=parsed_query.commit,
91-
branch=parsed_query.branch,
92-
)
87+
clone_config = parsed_query.extact_clone_config()
9388
await clone_repo(clone_config)
89+
9490
summary, tree, content = run_ingest_query(parsed_query)
95-
with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f:
91+
with open(f"{parsed_query.local_path}.txt", "w", encoding="utf-8") as f:
9692
f.write(tree + "\n" + content)
9793
except Exception as e:
9894
# hack to print error message when query is not defined

tests/query_parser/test_query_parser.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ async def test_parse_url_with_subpaths() -> None:
153153
Then user, repo, branch, and subpath should be identified correctly.
154154
"""
155155
url = "https://github.com/user/repo/tree/main/subdir/file"
156-
with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
156+
with patch("gitingest.repository_clone._run_command", new_callable=AsyncMock) as mock_run_git_command:
157157
mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"")
158158
with patch(
159159
"gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock
@@ -332,7 +332,7 @@ async def test_parse_url_branch_and_commit_distinction(url: str, expected_branch
332332
When `_parse_repo_source` is called with branch fetching,
333333
Then the function should correctly set `branch` or `commit` based on the URL content.
334334
"""
335-
with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
335+
with patch("gitingest.repository_clone._run_command", new_callable=AsyncMock) as mock_run_git_command:
336336
# Mocking the return value to include 'main' and some additional branches
337337
mock_run_git_command.return_value = (b"refs/heads/main\nrefs/heads/dev\nrefs/heads/feature-branch\n", b"")
338338
with patch(
@@ -439,7 +439,7 @@ async def test_parse_repo_source_with_failed_git_command(url, expected_branch, e
439439

440440
with pytest.warns(
441441
RuntimeWarning,
442-
match="Warning: Failed to fetch branch list: Git command failed: "
442+
match="Warning: Failed to fetch branch list: Command failed: "
443443
"git ls-remote --heads https://github.com/user/repo",
444444
):
445445

@@ -469,7 +469,7 @@ async def test_parse_repo_source_with_various_url_patterns(url, expected_branch,
469469
When `_parse_repo_source` is called with remote branch fetching,
470470
Then the correct branch/subpath should be set or None if unmatched.
471471
"""
472-
with patch("gitingest.repository_clone._run_git_command", new_callable=AsyncMock) as mock_run_git_command:
472+
with patch("gitingest.repository_clone._run_command", new_callable=AsyncMock) as mock_run_git_command:
473473
with patch(
474474
"gitingest.repository_clone.fetch_remote_branch_list", new_callable=AsyncMock
475475
) as mock_fetch_branches:

0 commit comments

Comments
 (0)