Commit f1fdc6c

Merge pull request #2 from noob6t5/my-feature-branch
Add: More tigthen Dummy
2 parents: 56ec4bc + 1b6326f

2 files changed: +56 -21 lines


domain_filter.py

Lines changed: 40 additions & 10 deletions
@@ -1,7 +1,7 @@
 import re
 from urllib.parse import urlparse
 
-
+# Skip useless domains
 NOT2SAVE_DOMAINS = {
     "example",
     "example.com",
@@ -15,10 +15,11 @@
     "youtube.com",
     "stackoverflow.com",
     "bitly.com",
-    "en.wikipedia.org" ,
-    "apache.org/licenses"
+    "en.wikipedia.org",
+    "apache.org/licenses",
 }
-# can be added more further
+
+
 def normalize_hostname(hostname):
     if not hostname:
         return ""
@@ -29,21 +30,51 @@ def is_validurl(url: str) -> bool:
     try:
         parsed = urlparse(url)
         hostname = normalize_hostname(parsed.hostname)
+        path = parsed.path.lower()
+
         if not hostname:
             return False
+
         if hostname in NOT2SAVE_DOMAINS:
             return False
+
         if re.search(r"\{\{.*?\}\}", url) or "{" in url or "}" in url:
             return False
-        if re.match(
-            r"https?://[^/]+\.\w{1,6}[:/]*$", url
-        ): # overly short root or malformed
+
+        if re.match(r"https?://[^/]+\.\w{1,6}[:/]*$", url):
             return False
+
+        # ===== GitHub-specific filtering =====
+        if hostname == "github.com":
+            # Block PRs, issues, etc
+            if re.search(
+                r"/(pull|issues|commit|commits|discussions|blob|tree|compare|releases|actions)(/|$)",
+                path,
+            ):
+                return False
+
+        # Block GitHub actions or PR trash if somehow missed
+        if (
+            "/pull/" in path
+            or "/issues/" in path
+            or "/commit/" in path
+            or "/discussions/" in path
+        ):
             return False
+
+        # Still allow:
+        # - https://github.com/user
+        # - https://github.com/user/repo
+        # - https://raw.githubusercontent.com/...
+        # - https://gist.github.com/...
+
     except:
         return False
+
     return True
 
-# for false-positive package names
+
+# Common non-target packages (false-positive filter)
 NOT2SAVE_PACKAGES = {
     "host",
     "port",
@@ -75,7 +106,6 @@ def is_validurl(url: str) -> bool:
     "sample",
 }
 
-
 def is_valid_package(pkg: str) -> bool:
     if not pkg or len(pkg.strip()) < 2:
         return False
@@ -85,6 +115,6 @@ def is_valid_package(pkg: str) -> bool:
         return False
     if re.match(r"^[A-Z0-9_]{3,}$", pkg):
         return False
-    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg): # Allow dots for Java/Maven style
+    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg): # allow dot for Maven style
         return False
     return True
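
In the same spirit, the package-name guards can be exercised with a short sketch (again not part of the commit). Only rejections are asserted, since is_valid_package() contains additional checks that this diff does not show.

# Spot-checks for the guards visible in this hunk; only rejections are asserted
# because is_valid_package() has further checks not shown in the diff.
from domain_filter import is_valid_package

assert not is_valid_package("x")            # shorter than two characters
assert not is_valid_package("SOME_CONST")   # all-caps constant style is rejected
assert not is_valid_package("my pkg")       # whitespace is outside [a-zA-Z0-9_\-.]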

finder.py

Lines changed: 16 additions & 11 deletions
@@ -11,7 +11,6 @@
 from bs4 import BeautifulSoup
 from domain_filter import is_validurl, is_valid_package
 
-
 GITHUB_API = "https://api.github.com"
 GITHUB_TOKEN = os.getenv("GH_TOKEN")

@@ -68,10 +67,7 @@ def check_package_url(name, lang):
 def check_url_live(url):
     try:
         res = requests.get(url, timeout=5)
-        if res.status_code < 400:
-            return url, True
-        else:
-            return url, False
+        return url, res.status_code < 400
     except:
         return url, False
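
check_url_live() now returns its (url, bool) tuple directly. A small usage sketch (not part of the commit), assuming finder.py imports cleanly, the requests dependency is installed, and the network is reachable:

# Usage sketch for the simplified check_url_live().
from finder import check_url_live

url, is_live = check_url_live("https://github.com")
# is_live is True for any status code below 400, False for 4xx/5xx or request errors
print(url, is_live)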

@@ -131,33 +127,42 @@ def extract_urls_and_packages(repo_path):
                 ) as file:
                     content = file.read()
                 raw_urls = URL_REGEX.findall(content)
+
                 if full_path.endswith(".md"):
                     raw_urls += re.findall(
                         r"\[.*?\]\((https?://[^\s\)]+)\)", content
                     )
+
                 if full_path.endswith((".html", ".htm")):
                     soup = BeautifulSoup(content, "html.parser")
                     raw_urls += [
                         a["href"]
                         for a in soup.find_all("a", href=True)
                         if a["href"].startswith("http")
                     ]
+
                 if full_path.endswith(".json"):
                     content = content.replace("\\/", "/")
-                if raw_urls:
-                    print(f"[+] Found {len(raw_urls)} URLs in {full_path}")
-                    for u in raw_urls:
-                        print(f" URL: {u}")
-                filtered = [u for u in raw_urls if is_validurl(u)]
+
+                filtered = [u for u in raw_urls if is_validurl(u)]
+                if filtered:
+                    cleaned = [u for u in filtered if is_validurl(u)]
+                    if cleaned:
+                        print(f"[+] {len(cleaned)} useful URLs in {full_path}")
+                        for u in cleaned:
+                            print(f" URL: {u}")
                 findings["urls"].update(filtered)
+
                 declared = extract_declared_packages(full_path)
                 for k in declared:
                     cleaned = {p for p in declared[k] if is_valid_package(p)}
                     findings["packages"][k].update(cleaned)
+
             except Exception as e:
                 print(f"[!] Error reading {full_path}: {e}")
     return findings
-# org from here
+
+
 def get_repos(org):
     repos = []
     page = 1
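
As a closing illustration of the extract_urls_and_packages() changes, the Markdown link regex added for .md files can be run on its own. A standalone snippet (not part of the commit) using only the standard library:

# Standalone demo of the Markdown link pattern used above.
import re

MD_LINK = r"\[.*?\]\((https?://[^\s\)]+)\)"
text = "See [docs](https://github.com/user/repo) and [tracker](https://example.com/board)."
print(re.findall(MD_LINK, text))
# ['https://github.com/user/repo', 'https://example.com/board']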
