Skip to content

Commit 56ec4bc

Browse files
authored
Merge pull request #1 from noob6t5/my-feature-branch
Fix: Avoiding Dummy URLS
2 parents 2430dbe + 12b4497 commit 56ec4bc

File tree

4 files changed

+123
-46
lines changed

4 files changed

+123
-46
lines changed

domain_filter.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import re
2+
from urllib.parse import urlparse
3+
4+
5+
# Hosts that is_validurl() rejects when a URL's normalized hostname is found
# in this set.
NOT2SAVE_DOMAINS = {
    # Placeholder-style names.
    "example", "example.com", "example.org", "example.net",
    "test.com", "dummy.com",
    # Local / unspecified addresses.
    "localhost", "127.0.0.1", "0.0.0.0",
    # Well-known third-party sites.
    "youtube.com", "stackoverflow.com", "bitly.com", "en.wikipedia.org",
    # NOTE(review): this entry contains a path, so it can only match when it
    # is compared against host + path rather than against a bare hostname.
    "apache.org/licenses",
}
21+
# More domains can be added to this set as needed.
22+
def normalize_hostname(hostname):
    """Lower-case a hostname and drop one leading ``www.`` label.

    Args:
        hostname: Host string, e.g. ``urlparse(url).hostname``. May be
            ``None`` or empty.

    Returns:
        The normalized hostname, or ``""`` when ``hostname`` is falsy.
    """
    if not hostname:
        return ""
    hostname = hostname.lower()
    # str.lstrip("www.") removes any leading run of the characters "w" and
    # ".", which mangles hosts such as "web.dev" or "wikipedia.org". Remove
    # the exact "www." prefix instead.
    if hostname.startswith("www."):
        hostname = hostname[len("www."):]
    return hostname
26+
27+
28+
# Matches URLs that are only a scheme plus a dotted host (optionally followed
# by ":" / "/" characters), i.e. a bare site root with no path.
_ROOT_ONLY_URL = re.compile(r"https?://[^/]+\.\w{1,6}[:/]*$")


def _is_blocked_location(hostname, path):
    """Return True if the host, a parent domain, or host + path is blocklisted."""
    # Check the hostname and then each parent domain, so that
    # "home.example.org" is caught by the "example.org" entry.
    candidate = hostname
    while candidate:
        if candidate in NOT2SAVE_DOMAINS:
            return True
        candidate = candidate.partition(".")[2]

    # Entries containing "/" name a host plus a path prefix (for example
    # "apache.org/licenses"); a bare hostname lookup can never match them.
    location = hostname + path
    for entry in NOT2SAVE_DOMAINS:
        if "/" not in entry:
            continue
        prefix = entry.rstrip("/")
        if location == prefix or location.startswith(prefix + "/"):
            return True
    return False


def is_validurl(url: str) -> bool:
    """Decide whether ``url`` is worth keeping.

    A URL is rejected when any of the following holds:

    * it is not a string, or it cannot be parsed / has no hostname;
    * it contains ``{`` or ``}`` (unrendered template placeholder);
    * it is only a site root such as ``https://host.tld`` or
      ``https://host.tld/``;
    * its hostname, one of its parent domains, or its host + path prefix is
      listed in ``NOT2SAVE_DOMAINS``.

    Args:
        url: The candidate URL.

    Returns:
        ``True`` if the URL passes every filter, ``False`` otherwise.
    """
    if not isinstance(url, str):
        return False
    # A single brace test also covers "{{ ... }}" style placeholders.
    if "{" in url or "}" in url:
        return False
    if _ROOT_ONLY_URL.match(url):  # overly short root or malformed
        return False
    try:
        parsed = urlparse(url)
        hostname = normalize_hostname(parsed.hostname)
    except ValueError:
        # urllib raises ValueError for malformed network locations
        # (e.g. an unbalanced "[" in an IPv6 literal).
        return False
    if not hostname:
        return False
    return not _is_blocked_location(hostname, parsed.path)
45+
46+
# for false-positive package names
47+
# Lower-case words that is_valid_package() refuses to treat as package names.
NOT2SAVE_PACKAGES = {
    "host", "port", "design", "pretty", "performance", "value",
    "index", "main", "default", "debug", "error", "message",
    "json", "config", "release", "object", "input", "output",
    "none", "true", "false", "null",
    "env", "test", "data", "code", "temp", "sample",
}
77+
78+
79+
def is_valid_package(pkg: str) -> bool:
    """Return True when ``pkg`` looks like a genuine package name.

    A name is rejected when it is empty, shorter than two characters after
    stripping whitespace, purely numeric, made only of ``-``/``_``/``.``,
    an all-caps/digit/underscore token of three or more characters,
    contains any character outside ``[a-zA-Z0-9_.-]``, or (case-insensitively)
    appears in ``NOT2SAVE_PACKAGES``.
    """
    if not pkg:
        return False
    if len(pkg.strip()) < 2:
        return False

    bad_shape = (
        pkg.isdigit()
        or re.fullmatch(r"[-_.]+", pkg)
        or re.match(r"^[A-Z0-9_]{3,}$", pkg)
        # Dots are allowed for Java/Maven style coordinates.
        or re.search(r"[^a-zA-Z0-9_\-\.]", pkg)
    )
    if bad_shape:
        return False

    return pkg.lower() not in NOT2SAVE_PACKAGES

finder.py

Lines changed: 10 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
from git import Repo
1010
from urllib.parse import urlparse
1111
from bs4 import BeautifulSoup
12+
from domain_filter import is_validurl, is_valid_package
13+
1214

1315
GITHUB_API = "https://api.github.com"
1416
GITHUB_TOKEN = os.getenv("GH_TOKEN")
@@ -50,45 +52,6 @@
5052

5153

5254
def check_package_url(name, lang):
53-
name = name.strip()
54-
ban_list = {
55-
"host",
56-
"port",
57-
"design",
58-
"pretty",
59-
"performance",
60-
"value",
61-
"index",
62-
"main",
63-
"default",
64-
"debug",
65-
"error",
66-
"message",
67-
"json",
68-
"config",
69-
"release",
70-
"object",
71-
"input",
72-
"output",
73-
"none",
74-
"true",
75-
"false",
76-
"null",
77-
}
78-
if (
79-
not name
80-
or len(name) < 2
81-
or name.lower() in ban_list
82-
or name.strip().isdigit()
83-
or re.fullmatch(r"[-_.]+", name)
84-
or re.match(r"^[A-Z0-9_]{3,}$", name)
85-
or re.search(r"[^a-zA-Z0-9_\-]", name)
86-
or name.startswith("-")
87-
or name.endswith("-")
88-
or name.count("-") > 3
89-
):
90-
return name, "INVALID"
91-
9255
url = PACKAGE_REGISTRIES[lang](name)
9356
try:
9457
r = requests.get(url, timeout=6)
@@ -185,15 +148,16 @@ def extract_urls_and_packages(repo_path):
185148
print(f"[+] Found {len(raw_urls)} URLs in {full_path}")
186149
for u in raw_urls:
187150
print(f" URL: {u}")
188-
findings["urls"].update(raw_urls)
151+
filtered = [u for u in raw_urls if is_validurl(u)]
152+
findings["urls"].update(filtered)
189153
declared = extract_declared_packages(full_path)
190154
for k in declared:
191-
findings["packages"][k].update(declared[k])
155+
cleaned = {p for p in declared[k] if is_valid_package(p)}
156+
findings["packages"][k].update(cleaned)
192157
except Exception as e:
193158
print(f"[!] Error reading {full_path}: {e}")
194159
return findings
195-
196-
160+
# org from here
197161
def get_repos(org):
198162
repos = []
199163
page = 1
@@ -252,7 +216,9 @@ def write_output(org, findings):
252216
for lang, pkgs in findings["packages"].items():
253217
f.write(f"\n==== {lang.upper()} Packages (Status) ====\n")
254218
pkg_futures = [
255-
executor.submit(check_package_url, p, lang) for p in pkgs
219+
executor.submit(check_package_url, p, lang)
220+
for p in pkgs
221+
if is_valid_package(p)
256222
]
257223
for pf in concurrent.futures.as_completed(pkg_futures):
258224
name, status = pf.result()

test_filter.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from domain_filter import is_validurl
2+
3+
# Manual smoke check: print how is_validurl() classifies each sample URL.
test_urls = [
    "http://example.com",
    "https://example.org/",
    "http://localhost:8080",
    "http://127.0.0.1:5000",
    "https://t.co/abc123",
    "https://stackoverflow.com/questions/123",
    "https://real-domain.com/page",
    "https://nonexistent.vulntrap.xyz/",
    "https://youtube.com/watch?v=abc",
    "https://github.com/noob6t5/repo",
]

print("Filtered Results:\n")
for url in test_urls:
    verdict = "✔️ VALID" if is_validurl(url) else "❌ FILTERED"
    print(f"{url} => {verdict}")

workflow.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,8 @@ Live/Broken Link Checker
1010

1111
Reporting Engine (TXT, JSON, etc.)
1212

13-
## Todo
14-
Make separate code and regexes for filtering dummy and test domain's.
13+
todo: remove example-based domains (including subdomains) from URLs of the type shown below
14+
15+
16+
http://home.example.org:8888/cookie-parser-result?domain0019
17+

0 commit comments

Comments
 (0)