90 changes: 90 additions & 0 deletions domain_filter.py
@@ -0,0 +1,90 @@
import re
from urllib.parse import urlparse


NOT2SAVE_DOMAINS = {
    "example",
    "example.com",
    "example.org",
    "example.net",
    "localhost",
    "127.0.0.1",
    "0.0.0.0",
    "test.com",
    "dummy.com",
    "youtube.com",
    "stackoverflow.com",
    "bitly.com",
    "en.wikipedia.org",
    "apache.org",  # hostname only: a path like "apache.org/licenses" can never match a hostname
}
# more entries can be added as needed
def normalize_hostname(hostname):
    if not hostname:
        return ""
    hostname = hostname.lower()
    # lstrip("www.") would strip characters rather than the prefix,
    # mangling hosts like "web.example.com"; removeprefix (Python 3.9+) is safe
    return hostname.removeprefix("www.")


def is_validurl(url: str) -> bool:
    try:
        parsed = urlparse(url)
        hostname = normalize_hostname(parsed.hostname)
        if not hostname:
            return False
        if hostname in NOT2SAVE_DOMAINS:
            return False
        if "{" in url or "}" in url:  # template placeholders such as {{var}}
            return False
        if re.match(
            r"https?://[^/]+\.\w{1,6}[:/]*$", url
        ):  # overly short root or malformed
            return False
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        return False
    return True

# for false-positive package names
NOT2SAVE_PACKAGES = {
    "host",
    "port",
    "design",
    "pretty",
    "performance",
    "value",
    "index",
    "main",
    "default",
    "debug",
    "error",
    "message",
    "json",
    "config",
    "release",
    "object",
    "input",
    "output",
    "none",
    "true",
    "false",
    "null",
    "env",
    "test",
    "data",
    "code",
    "temp",
    "sample",
}


def is_valid_package(pkg: str) -> bool:
    if not pkg or len(pkg.strip()) < 2:
        return False
    if pkg.lower() in NOT2SAVE_PACKAGES:
        return False
    if pkg.isdigit() or re.fullmatch(r"[-_.]+", pkg):
        return False
    if re.match(r"^[A-Z0-9_]{3,}$", pkg):
        return False
    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg):  # allow dots for Java/Maven-style names
        return False
    return True
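
For a quick sanity check of the package filter (test_filter.py below only exercises the URL filter), a minimal sketch; the sample names are hypothetical:

from domain_filter import is_valid_package

# Expected: "requests" and "left-pad" pass; "json" (ban list), "A_CONSTANT"
# (all-caps constant), "--" (punctuation only) and "42" (digits only) are filtered
for pkg in ["requests", "left-pad", "json", "A_CONSTANT", "--", "42"]:
    print(f"{pkg} => {'VALID' if is_valid_package(pkg) else 'FILTERED'}")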
54 changes: 10 additions & 44 deletions finder.py
@@ -9,6 +9,8 @@
from git import Repo
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from domain_filter import is_validurl, is_valid_package


GITHUB_API = "https://api.github.com"
GITHUB_TOKEN = os.getenv("GH_TOKEN")
@@ -50,45 +52,6 @@


def check_package_url(name, lang):
    name = name.strip()
    ban_list = {
        "host",
        "port",
        "design",
        "pretty",
        "performance",
        "value",
        "index",
        "main",
        "default",
        "debug",
        "error",
        "message",
        "json",
        "config",
        "release",
        "object",
        "input",
        "output",
        "none",
        "true",
        "false",
        "null",
    }
    if (
        not name
        or len(name) < 2
        or name.lower() in ban_list
        or name.strip().isdigit()
        or re.fullmatch(r"[-_.]+", name)
        or re.match(r"^[A-Z0-9_]{3,}$", name)
        or re.search(r"[^a-zA-Z0-9_\-]", name)
        or name.startswith("-")
        or name.endswith("-")
        or name.count("-") > 3
    ):
        return name, "INVALID"

    url = PACKAGE_REGISTRIES[lang](name)
    try:
        r = requests.get(url, timeout=6)
@@ -185,15 +148,16 @@ def extract_urls_and_packages(repo_path):
print(f"[+] Found {len(raw_urls)} URLs in {full_path}")
for u in raw_urls:
print(f" URL: {u}")
findings["urls"].update(raw_urls)
filtered = [u for u in raw_urls if is_validurl(u)]
findings["urls"].update(filtered)
declared = extract_declared_packages(full_path)
for k in declared:
findings["packages"][k].update(declared[k])
cleaned = {p for p in declared[k] if is_valid_package(p)}
findings["packages"][k].update(cleaned)
except Exception as e:
print(f"[!] Error reading {full_path}: {e}")
return findings


# org enumeration starts here
def get_repos(org):
    repos = []
    page = 1
@@ -252,7 +216,9 @@ def write_output(org, findings):
        for lang, pkgs in findings["packages"].items():
            f.write(f"\n==== {lang.upper()} Packages (Status) ====\n")
            pkg_futures = [
                executor.submit(check_package_url, p, lang) for p in pkgs
                executor.submit(check_package_url, p, lang)
                for p in pkgs
                if is_valid_package(p)
            ]
            for pf in concurrent.futures.as_completed(pkg_futures):
                name, status = pf.result()
18 changes: 18 additions & 0 deletions test_filter.py
@@ -0,0 +1,18 @@
from domain_filter import is_validurl

test_urls = [
    "http://example.com",
    "https://example.org/",
    "http://localhost:8080",
    "http://127.0.0.1:5000",
    "https://t.co/abc123",
    "https://stackoverflow.com/questions/123",
    "https://real-domain.com/page",
    "https://nonexistent.vulntrap.xyz/",
    "https://youtube.com/watch?v=abc",
    "https://github.com/noob6t5/repo",
]

print("Filtered Results:\n")
for url in test_urls:
    print(f"{url} => {'✔️ VALID' if is_validurl(url) else '❌ FILTERED'}")
7 changes: 5 additions & 2 deletions workflow.txt
@@ -10,5 +10,8 @@ Live/Broken Link Checker

Reporting Engine (TXT, JSON, etc.)

## Todo
Make separate code and regexes for filtering dummy and test domains.
Todo: remove example-based domains from URLs of the type below (a sketch follows the example URL).


http://home.example.org:8888/cookie-parser-result?domain0019
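
A minimal sketch for this todo, assuming the intended fix is suffix matching: treat any subdomain of a blocked domain as blocked, so home.example.org is caught by the example.org entry already in NOT2SAVE_DOMAINS. The helper name is_blocked_host is hypothetical:

from domain_filter import NOT2SAVE_DOMAINS

def is_blocked_host(hostname: str) -> bool:
    # Check every dot-suffix of the hostname:
    # home.example.org -> example.org -> org
    parts = hostname.split(".")
    return any(".".join(parts[i:]) in NOT2SAVE_DOMAINS for i in range(len(parts)))

Replacing the exact-match lookup in is_validurl with this check would filter the URL above, since home.example.org ends with example.org.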