Skip to content

Commit 9996a06

Browse files
authored
Fix file management and permissions issues (#23)
1 parent 41a76f3 commit 9996a06

File tree

3 files changed

+118
-4
lines changed

3 files changed

+118
-4
lines changed

Dockerfile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,18 @@ FROM python:3.12
22

33
WORKDIR /app
44

5+
# Create a non-root user
6+
RUN useradd -m -u 1000 appuser
7+
58
COPY src/ ./
69
COPY requirements.txt ./
710

811
RUN pip install -r requirements.txt
912

13+
# Change ownership of the application files
14+
RUN chown -R appuser:appuser /app
15+
16+
# Switch to non-root user
17+
USER appuser
18+
1019
CMD ["uvicorn", "main:app", "--reload"]

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
fastapi[standard]
22
uvicorn
33
fastapi-analytics
4-
slowapi
4+
slowapi
5+
tokencost

src/ingest.py

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
from tokencost import count_string_tokens
55
from typing import Dict, List, Union
66

7+
MAX_DIRECTORY_DEPTH = 10 # Maximum depth of directory traversal
8+
MAX_FILES = 10000 # Maximum number of files to process
9+
MAX_TOTAL_SIZE_BYTES = 100 * 1024 * 1024 # 100MB total size limit
10+
711
def should_ignore(path: str, base_path: str, ignore_patterns: List[str]) -> bool:
812
"""Checks if a file or directory should be ignored based on patterns."""
913
name = os.path.basename(path)
@@ -15,6 +19,19 @@ def should_ignore(path: str, base_path: str, ignore_patterns: List[str]) -> bool
1519
return True
1620
return False
1721

22+
def is_safe_symlink(symlink_path: str, base_path: str) -> bool:
    """Return True if a symlink resolves to a location inside the base directory.

    Both paths are fully resolved with ``os.path.realpath`` before comparison,
    so chained symlinks and ``..`` components cannot be used to escape the
    base directory.

    Args:
        symlink_path: Path of the symlink (or any path) to check.
        base_path: Directory the resolved target must stay within.

    Returns:
        True when the resolved target is the base directory itself or lies
        beneath it; False when it lies outside or the paths cannot be
        resolved or compared.
    """
    try:
        # Resolve every symlink component so the comparison is made on real paths.
        target_path = os.path.realpath(symlink_path)
        resolved_base = os.path.realpath(base_path)
        # The common path of the pair equals the base only when the target is
        # the base itself or a descendant of it. Comparing against the target
        # instead would accept ancestors of the base (e.g. "/") as safe.
        return os.path.commonpath([target_path, resolved_base]) == resolved_base
    except (OSError, ValueError):
        # Unresolvable or incomparable paths (e.g. different drives) are
        # treated as unsafe.
        return False
34+
1835
def is_text_file(file_path: str) -> bool:
1936
"""Determines if a file is likely a text file based on its content."""
2037
try:
@@ -32,8 +49,34 @@ def read_file_content(file_path: str) -> str:
3249
except Exception as e:
3350
return f"Error reading file: {str(e)}"
3451

35-
def scan_directory(path: str, ignore_patterns: List[str], base_path: str) -> Dict:
36-
"""Recursively analyzes a directory and its contents."""
52+
def scan_directory(path: str, ignore_patterns: List[str], base_path: str, seen_paths: set = None, depth: int = 0, stats: Dict = None) -> Dict:
53+
"""Recursively analyzes a directory and its contents with safety limits."""
54+
if seen_paths is None:
55+
seen_paths = set()
56+
if stats is None:
57+
stats = {"total_files": 0, "total_size": 0}
58+
59+
# Check depth limit
60+
if depth > MAX_DIRECTORY_DEPTH:
61+
print(f"Skipping deep directory: {path} (max depth {MAX_DIRECTORY_DEPTH} reached)")
62+
return None
63+
64+
# Check total files limit
65+
if stats["total_files"] >= MAX_FILES:
66+
print(f"Skipping further processing: maximum file limit ({MAX_FILES}) reached")
67+
return None
68+
69+
# Check total size limit
70+
if stats["total_size"] >= MAX_TOTAL_SIZE_BYTES:
71+
print(f"Skipping further processing: maximum total size ({MAX_TOTAL_SIZE_BYTES/1024/1024:.1f}MB) reached")
72+
return None
73+
74+
real_path = os.path.realpath(path)
75+
if real_path in seen_paths:
76+
print(f"Skipping already visited path: {path}")
77+
return None
78+
seen_paths.add(real_path)
79+
3780
result = {
3881
"name": os.path.basename(path),
3982
"type": "directory",
@@ -51,8 +94,69 @@ def scan_directory(path: str, ignore_patterns: List[str], base_path: str) -> Dic
5194
if should_ignore(item_path, base_path, ignore_patterns):
5295
continue
5396

97+
# Handle symlinks
98+
if os.path.islink(item_path):
99+
if not is_safe_symlink(item_path, base_path):
100+
print(f"Skipping symlink that points outside base directory: {item_path}")
101+
continue
102+
real_path = os.path.realpath(item_path)
103+
if real_path in seen_paths:
104+
print(f"Skipping already visited symlink target: {item_path}")
105+
continue
106+
107+
if os.path.isfile(real_path):
108+
file_size = os.path.getsize(real_path)
109+
# Check if adding this file would exceed total size limit
110+
if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES:
111+
print(f"Skipping file {item_path}: would exceed total size limit")
112+
continue
113+
114+
stats["total_files"] += 1
115+
stats["total_size"] += file_size
116+
117+
if stats["total_files"] > MAX_FILES:
118+
print(f"Maximum file limit ({MAX_FILES}) reached")
119+
return result
120+
121+
is_text = is_text_file(real_path)
122+
content = read_file_content(real_path) if is_text else "[Non-text file]"
123+
124+
child = {
125+
"name": item,
126+
"type": "file",
127+
"size": file_size,
128+
"content": content,
129+
"path": item_path
130+
}
131+
result["children"].append(child)
132+
result["size"] += file_size
133+
result["file_count"] += 1
134+
135+
elif os.path.isdir(real_path):
136+
subdir = scan_directory(real_path, ignore_patterns, base_path, seen_paths, depth + 1, stats)
137+
if subdir:
138+
subdir["name"] = item
139+
subdir["path"] = item_path
140+
result["children"].append(subdir)
141+
result["size"] += subdir["size"]
142+
result["file_count"] += subdir["file_count"]
143+
result["dir_count"] += 1 + subdir["dir_count"]
144+
continue
145+
54146
if os.path.isfile(item_path):
55147
file_size = os.path.getsize(item_path)
148+
# Check if adding this file would exceed total size limit
149+
if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES:
150+
print(f"Skipping file {item_path}: would exceed total size limit")
151+
continue
152+
153+
stats["total_files"] += 1
154+
stats["total_size"] += file_size
155+
156+
if stats["total_files"] > MAX_FILES:
157+
print(f"Maximum file limit ({MAX_FILES}) reached")
158+
return result
159+
56160
is_text = is_text_file(item_path)
57161
content = read_file_content(item_path) if is_text else "[Non-text file]"
58162

@@ -68,7 +172,7 @@ def scan_directory(path: str, ignore_patterns: List[str], base_path: str) -> Dic
68172
result["file_count"] += 1
69173

70174
elif os.path.isdir(item_path):
71-
subdir = scan_directory(item_path, ignore_patterns, base_path)
175+
subdir = scan_directory(item_path, ignore_patterns, base_path, seen_paths, depth + 1, stats)
72176
if subdir:
73177
result["children"].append(subdir)
74178
result["size"] += subdir["size"]

0 commit comments

Comments
 (0)