diff --git a/domain_filter.py b/domain_filter.py
new file mode 100644
index 0000000..ba2b3f6
--- /dev/null
+++ b/domain_filter.py
@@ -0,0 +1,90 @@
+import re
+from urllib.parse import urlparse
+
+
+NOT2SAVE_DOMAINS = {
+    "example",
+    "example.com",
+    "example.org",
+    "example.net",
+    "localhost",
+    "127.0.0.1",
+    "0.0.0.0",
+    "test.com",
+    "dummy.com",
+    "youtube.com",
+    "stackoverflow.com",
+    "bitly.com",
+    "en.wikipedia.org",
+    "apache.org/licenses",
+}
+# more entries can be added later
+
+
+def normalize_hostname(hostname):
+    if not hostname:
+        return ""
+    hostname = hostname.lower()
+    # str.lstrip("www.") strips *characters*, not a prefix, and would mangle
+    # hosts such as "wikipedia.org"; strip the prefix explicitly instead.
+    if hostname.startswith("www."):
+        hostname = hostname[4:]
+    return hostname
+
+
+def is_validurl(url: str) -> bool:
+    try:
+        parsed = urlparse(url)
+        hostname = normalize_hostname(parsed.hostname)
+        if not hostname:
+            return False
+        if hostname in NOT2SAVE_DOMAINS:
+            return False
+        if re.search(r"\{\{.*?\}\}", url) or "{" in url or "}" in url:
+            return False
+        if re.match(
+            r"https?://[^/]+\.\w{1,6}[:/]*$", url
+        ):  # overly short root or malformed
+            return False
+    except Exception:
+        return False
+    return True
+
+
+# common false-positive package names
+NOT2SAVE_PACKAGES = {
+    "host",
+    "port",
+    "design",
+    "pretty",
+    "performance",
+    "value",
+    "index",
+    "main",
+    "default",
+    "debug",
+    "error",
+    "message",
+    "json",
+    "config",
+    "release",
+    "object",
+    "input",
+    "output",
+    "none",
+    "true",
+    "false",
+    "null",
+    "env",
+    "test",
+    "data",
+    "code",
+    "temp",
+    "sample",
+}
+
+
+def is_valid_package(pkg: str) -> bool:
+    if not pkg or len(pkg.strip()) < 2:
+        return False
+    if pkg.lower() in NOT2SAVE_PACKAGES:
+        return False
+    if pkg.isdigit() or re.fullmatch(r"[-_.]+", pkg):
+        return False
+    if re.match(r"^[A-Z0-9_]{3,}$", pkg):
+        return False
+    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg):  # allow dots for Java/Maven-style names
+        return False
+    return True
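+
+
+# Quick smoke test (illustrative only; the URLs and package names below
+# are made-up examples, not fixtures from this repo):
+if __name__ == "__main__":
+    for url in ("https://example.com/x", "https://real-site.io/docs"):
+        print(url, is_validurl(url))
+    for pkg in ("requests", "TRUE", "a", "com.fasterxml.jackson"):
+        print(pkg, is_valid_package(pkg))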
diff --git a/finder.py b/finder.py
index ea56bfc..741b0d1 100644
--- a/finder.py
+++ b/finder.py
@@ -9,6 +9,8 @@ from git import Repo
 from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 
+from domain_filter import is_validurl, is_valid_package
+
 GITHUB_API = "https://api.github.com"
 GITHUB_TOKEN = os.getenv("GH_TOKEN")
@@ -50,45 +52,6 @@ def check_package_url(name, lang):
-    name = name.strip()
-    ban_list = {
-        "host",
-        "port",
-        "design",
-        "pretty",
-        "performance",
-        "value",
-        "index",
-        "main",
-        "default",
-        "debug",
-        "error",
-        "message",
-        "json",
-        "config",
-        "release",
-        "object",
-        "input",
-        "output",
-        "none",
-        "true",
-        "false",
-        "null",
-    }
-    if (
-        not name
-        or len(name) < 2
-        or name.lower() in ban_list
-        or name.strip().isdigit()
-        or re.fullmatch(r"[-_.]+", name)
-        or re.match(r"^[A-Z0-9_]{3,}$", name)
-        or re.search(r"[^a-zA-Z0-9_\-]", name)
-        or name.startswith("-")
-        or name.endswith("-")
-        or name.count("-") > 3
-    ):
-        return name, "INVALID"
     url = PACKAGE_REGISTRIES[lang](name)
     try:
         r = requests.get(url, timeout=6)
@@ -185,15 +148,16 @@ def extract_urls_and_packages(repo_path):
                 print(f"[+] Found {len(raw_urls)} URLs in {full_path}")
                 for u in raw_urls:
                     print(f"    URL: {u}")
-            findings["urls"].update(raw_urls)
+            filtered = [u for u in raw_urls if is_validurl(u)]
+            findings["urls"].update(filtered)
             declared = extract_declared_packages(full_path)
             for k in declared:
-                findings["packages"][k].update(declared[k])
+                cleaned = {p for p in declared[k] if is_valid_package(p)}
+                findings["packages"][k].update(cleaned)
         except Exception as e:
             print(f"[!] Error reading {full_path}: {e}")
     return findings
-
-
+# org-level repo enumeration starts here
 def get_repos(org):
     repos = []
     page = 1
@@ -252,7 +216,9 @@ def write_output(org, findings):
         for lang, pkgs in findings["packages"].items():
             f.write(f"\n==== {lang.upper()} Packages (Status) ====\n")
             pkg_futures = [
-                executor.submit(check_package_url, p, lang) for p in pkgs
+                executor.submit(check_package_url, p, lang)
+                for p in pkgs
+                if is_valid_package(p)
             ]
             for pf in concurrent.futures.as_completed(pkg_futures):
                 name, status = pf.result()
diff --git a/test_filter.py b/test_filter.py
new file mode 100644
index 0000000..9d9f5d4
--- /dev/null
+++ b/test_filter.py
@@ -0,0 +1,18 @@
+from domain_filter import is_validurl
+
+test_urls = [
+    "http://example.com",
+    "https://example.org/",
+    "http://localhost:8080",
+    "http://127.0.0.1:5000",
+    "https://t.co/abc123",
+    "https://stackoverflow.com/questions/123",
+    "https://real-domain.com/page",
+    "https://nonexistent.vulntrap.xyz/",
+    "https://youtube.com/watch?v=abc",
+    "https://github.com/noob6t5/repo",
+]
+
+print("Filtered Results:\n")
+for url in test_urls:
+    print(f"{url} => {'✔️ VALID' if is_validurl(url) else '❌ FILTERED'}")
diff --git a/workflow.txt b/workflow.txt
index 814c5e7..06277a6 100644
--- a/workflow.txt
+++ b/workflow.txt
@@ -10,5 +10,8 @@ Live/Broken Link Checker
 Reporting Engine (TXT, JSON, etc.)
 
-## Todo
-Make separate code and regexes for filtering dummy and test domain's.
+Todo: remove example-based domains from URLs like the one below
+
+http://home.example.org:8888/cookie-parser-result?domain0019
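+
+Possible approach (rough, untested sketch): match blocked domains by suffix
+so that subdomains such as home.example.org are caught as well:
+
+from domain_filter import NOT2SAVE_DOMAINS
+
+def is_blocked_host(hostname):
+    hostname = hostname.lower()
+    return any(hostname == d or hostname.endswith("." + d)
+               for d in NOT2SAVE_DOMAINS)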