From 45da897821edddf11ce89ffd7f007a3c664f0143 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 17 Apr 2025 19:23:06 -0600 Subject: [PATCH 1/5] fix: traverse directories to allow pattern matching of files within them --- src/gitingest/cli.py | 35 ++++- src/gitingest/ingestion.py | 4 + src/gitingest/utils/ingestion_utils.py | 4 +- tests/test_ingestion.py | 185 ++++++++++++++++++++++++- 4 files changed, 221 insertions(+), 7 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index b691fd7f..78ddf4f1 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -13,10 +13,34 @@ @click.command() @click.argument("source", type=str, default=".") -@click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") -@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") -@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude") -@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include") +@click.option( + "--output", + "-o", + default=None, + help="Output file path (default: .txt in current directory)", +) +@click.option( + "--max-size", + "-s", + default=MAX_FILE_SIZE, + help="Maximum file size to process in bytes", +) +@click.option( + "--exclude-pattern", + "-e", + multiple=True, + help="""Patterns to exclude. Handles python's arbitrary subset of Unix + shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html""", +) +@click.option( + "--include-pattern", + "-i", + multiple=True, + help="""Patterns to include. Handles python's arbitrary subset of Unix + shell-style wildcards. See: + https://docs.python.org/3/library/fnmatch.html""", +) @click.option("--branch", "-b", default=None, help="Branch to clone and ingest") def main( source: str, @@ -27,10 +51,11 @@ def main( branch: Optional[str], ): """ - Main entry point for the CLI. This function is called when the CLI is run as a script. 
+ Main entry point for the CLI. This function is called when the CLI is run as a script. It calls the async main function to run the command. + \b Parameters ---------- source : str diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index d3005250..ec378978 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -202,6 +202,10 @@ def _process_node( query=query, stats=stats, ) + + if not child_directory_node.children: + continue + node.children.append(child_directory_node) node.size += child_directory_node.size node.file_count += child_directory_node.file_count diff --git a/src/gitingest/utils/ingestion_utils.py b/src/gitingest/utils/ingestion_utils.py index b4bb552c..9ce2ae72 100644 --- a/src/gitingest/utils/ingestion_utils.py +++ b/src/gitingest/utils/ingestion_utils.py @@ -33,8 +33,10 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> return False rel_str = str(rel_path) + + # if path is a directory, include it by default if path.is_dir(): - rel_str += "/" + return True for pattern in include_patterns: if fnmatch(rel_str, pattern): diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 3e991f8f..1ddac966 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -6,6 +6,9 @@ """ from pathlib import Path +from typing import TypedDict + +import pytest from gitingest.ingestion import ingest_query from gitingest.query_parsing import IngestionQuery @@ -42,5 +45,185 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> # TODO: Additional tests: # - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. # - Edge cases with weird file names or deep subdirectory structures. 
-# TODO : def test_include_txt_pattern # TODO : def test_include_nonexistent_extension + + +class PatternScenario(TypedDict): + include_patterns: set[str] + ignore_patterns: set[str] + expected_num_files: int + expected_content: set[str] + expected_structure: set[str] + expected_not_structure: set[str] + + +@pytest.mark.parametrize( + "pattern_scenario", + [ + pytest.param( + PatternScenario( + { + "include_patterns": {"file2.py", "dir2/file_dir2.txt"}, + "ignore_patterns": {*()}, + "expected_num_files": 2, + "expected_content": {"file2.py", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/", "dir1/"}, + } + ), + id="include-explicit-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": { + "file1.txt", + "file2.py", + "file_dir1.txt", + "*/file_dir2.txt", + }, + "ignore_patterns": {*()}, + "expected_num_files": 3, + "expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"}, + "expected_structure": {"test_repo/", "dir2/"}, + "expected_not_structure": {"src/", "subdir/", "dir1/"}, + } + ), + id="include-wildcard-directory", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {"*.py"}, + "ignore_patterns": {*()}, + "expected_num_files": 3, + "expected_content": { + "file2.py", + "src/subfile2.py", + "src/subdir/file_subdir.py", + }, + "expected_structure": {"test_repo/", "src/", "subdir/"}, + "expected_not_structure": {"dir1/", "dir2/"}, + } + ), + id="include-wildcard-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {"**/file_dir2.txt", "src/**/*.py"}, + "ignore_patterns": {*()}, + "expected_num_files": 2, + "expected_content": { + "dir2/file_dir2.txt", + "src/subdir/file_subdir.py", + }, + "expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"}, + "expected_not_structure": {"dir1/"}, + } + ), + id="include-recursive-wildcard", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": 
{"file2.py", "dir2/file_dir2.txt"}, + "expected_num_files": 6, + "expected_content": { + "file1.txt", + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "src/subdir/file_subdir.py", + "dir1/file_dir1.txt", + }, + "expected_structure": {"test_repo/", "src/", "subdir/", "dir1/"}, + "expected_not_structure": {"dir2/"}, + } + ), + id="exclude-explicit-files", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"file1.txt", "file2.py", "*/file_dir1.txt"}, + "expected_num_files": 5, + "expected_content": { + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "src/subdir/file_subdir.py", + "dir2/file_dir2.txt", + }, + "expected_structure": {"test_repo/", "src/", "subdir/", "dir2/"}, + "expected_not_structure": {"dir1/"}, + } + ), + id="exclude-wildcard-directory", + ), + pytest.param( + PatternScenario( + { + "include_patterns": {*()}, + "ignore_patterns": {"src/**/*.py"}, + "expected_num_files": 7, + "expected_content": { + "file1.txt", + "file2.py", + "src/subfile1.txt", + "src/subfile2.py", + "src/subdir/file_subdir.txt", + "dir1/file_dir1.txt", + "dir2/file_dir2.txt", + }, + "expected_structure": { + "test_repo/", + "dir1/", + "dir2/", + "src/", + "subdir/", + }, + "expected_not_structure": {*()}, + } + ), + id="exclude-recursive-wildcard", + ), + ], +) +def test_include_ignore_patterns( + temp_directory: Path, + sample_query: IngestionQuery, + pattern_scenario: PatternScenario, +) -> None: + """ + Test `ingest_query` to ensure included and ignored paths are included and ignored respectively. + + Given a directory with .txt and .py files, and a set of include patterns or a set of ignore patterns: + When `ingest_query` is invoked, + Then it should produce a summary string listing the files analyzed and a combined content string. 
+ """ + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = pattern_scenario["include_patterns"] or None + sample_query.ignore_patterns = pattern_scenario["ignore_patterns"] or None + + summary, structure, content = ingest_query(sample_query) + + assert "Repository: test_user/test_repo" in summary + assert f"Files analyzed: {pattern_scenario["expected_num_files"]}" in summary + + # Check presence of key files in the content + for expected_content_item in pattern_scenario["expected_content"]: + assert expected_content_item in content + + # check presence of included directories in structure + for expected_structure_item in pattern_scenario["expected_structure"]: + assert expected_structure_item in structure + + # check non-presence of non-included directories in structure + for expected_not_structure_item in pattern_scenario["expected_not_structure"]: + assert expected_not_structure_item not in structure From c1feab0c2b7d8b16598bbff9b8954084b2686ad0 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 17 Apr 2025 22:52:48 -0600 Subject: [PATCH 2/5] fix: make directory structure interaction add file path instead of just file name --- src/gitingest/ingestion.py | 14 ++++++------ src/gitingest/output_formatters.py | 23 +++++++++++--------- src/gitingest/schemas/ingestion_schema.py | 2 ++ src/server/query_processor.py | 9 ++++++-- src/server/templates/components/result.jinja | 17 +++++---------- 5 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index ec378978..cb272759 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -54,7 +54,7 @@ def ingest_query(query: IngestionQuery) -> Tuple[str, str, str]: relative_path = path.relative_to(query.local_path) - file_node = FileSystemNode( + query.root_node = FileSystemNode( name=path.name, type=FileSystemNodeType.FILE, size=path.stat().st_size, @@ -63,12 +63,12 @@ 
def ingest_query(query: IngestionQuery) -> Tuple[str, str, str]: path=path, ) - if not file_node.content: - raise ValueError(f"File {file_node.name} has no content") + if not query.root_node.content: + raise ValueError(f"File {query.root_node.name} has no content") - return format_node(file_node, query) + return format_node(query.root_node, query) - root_node = FileSystemNode( + query.root_node = FileSystemNode( name=path.name, type=FileSystemNodeType.DIRECTORY, path_str=str(path.relative_to(query.local_path)), @@ -78,12 +78,12 @@ def ingest_query(query: IngestionQuery) -> Tuple[str, str, str]: stats = FileSystemStats() _process_node( - node=root_node, + node=query.root_node, query=query, stats=stats, ) - return format_node(root_node, query) + return format_node(query.root_node, query) def apply_gitingest_file(path: Path, query: IngestionQuery) -> None: diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index 5bacba22..a8b6305b 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -1,6 +1,6 @@ """Functions to ingest and analyze a codebase directory or single file.""" -from typing import Optional, Tuple +from typing import List, Optional, Tuple import tiktoken @@ -35,8 +35,8 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str, summary += f"File: {node.name}\n" summary += f"Lines: {len(node.content.splitlines()):,}\n" - tree = "Directory structure:\n" + _create_tree_structure(query, node) - _create_tree_structure(query, node) + tree_patterns = [("Directory structure:\n", "")] + create_tree_structure(query, node) + tree = "".join([x[0] for x in tree_patterns]) content = _gather_file_contents(node) @@ -108,7 +108,9 @@ def _gather_file_contents(node: FileSystemNode) -> str: return "\n".join(_gather_file_contents(child) for child in node.children) -def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: +def 
create_tree_structure( + query: IngestionQuery, node: FileSystemNode, prefix: str = "", is_last: bool = True +) -> List[Tuple[str, str]]: """ Generate a tree-like string representation of the file structure. @@ -128,14 +130,15 @@ def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: Returns ------- - str - A string representing the directory structure formatted as a tree. + List[Tuple[str, str]] + A list of pairs of strings: the first is a line representing the directory structure formatted as a tree, + and the second is the corresponding filename with path. """ if not node.name: # If no name is present, use the slug as the top-level directory name node.name = query.slug - tree_str = "" + tree_items: List[Tuple[str, str]] = [] current_prefix = "└── " if is_last else "├── " # Indicate directories with a trailing slash @@ -145,13 +148,13 @@ def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix: elif node.type == FileSystemNodeType.SYMLINK: display_name += " -> " + node.path.readlink().name - tree_str += f"{prefix}{current_prefix}{display_name}\n" + tree_items.append((f"{prefix}{current_prefix}{display_name}\n", node.path_str)) if node.type == FileSystemNodeType.DIRECTORY and node.children: prefix += " " if is_last else "│ " for i, child in enumerate(node.children): - tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1) - return tree_str + tree_items += create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1) + return tree_items def _format_token_count(text: str) -> Optional[str]: diff --git a/src/gitingest/schemas/ingestion_schema.py b/src/gitingest/schemas/ingestion_schema.py index 02b1c678..c50ff5f0 100644 --- a/src/gitingest/schemas/ingestion_schema.py +++ b/src/gitingest/schemas/ingestion_schema.py @@ -7,6 +7,7 @@ from pydantic import BaseModel, ConfigDict, Field from gitingest.config import MAX_FILE_SIZE +from 
gitingest.schemas.filesystem_schema import FileSystemNode @dataclass @@ -57,6 +58,7 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes max_file_size: int = Field(default=MAX_FILE_SIZE) ignore_patterns: Optional[Set[str]] = None include_patterns: Optional[Set[str]] = None + root_node: Optional[FileSystemNode] = None model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 00b1c640..9cf11097 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -1,12 +1,14 @@ """Process a query by parsing input, cloning a repository, and generating a summary.""" from functools import partial +from typing import Any, Dict from fastapi import Request from starlette.templating import _TemplateResponse from gitingest.cloning import clone_repo from gitingest.ingestion import ingest_query +from gitingest.output_formatters import create_tree_structure from gitingest.query_parsing import IngestionQuery, parse_query from server.server_config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE, templates from server.server_utils import Colors, log_slider_to_size @@ -64,7 +66,7 @@ async def process_query( template_response = partial(templates.TemplateResponse, name=template) max_file_size = log_slider_to_size(slider_position) - context = { + context: Dict[str, Any] = { "request": request, "repo_url": input_text, "examples": EXAMPLE_REPOS if is_index else [], @@ -87,6 +89,9 @@ async def process_query( clone_config = query.extract_clone_config() await clone_repo(clone_config) summary, tree, content = ingest_query(query) + + tree_with_filenames = ((query.root_node is not None) and create_tree_structure(query, query.root_node)) or [] + with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: f.write(tree + "\n" + content) except Exception as exc: @@ -122,7 +127,7 @@ async def process_query( { "result": True, "summary": summary, - "tree": tree, + "tree": 
tree_with_filenames, "content": content, "ingest_id": query.id, } diff --git a/src/server/templates/components/result.jinja b/src/server/templates/components/result.jinja index 151bc02f..f9eed282 100644 --- a/src/server/templates/components/result.jinja +++ b/src/server/templates/components/result.jinja @@ -1,11 +1,5 @@