diff --git a/domain_filter.py b/domain_filter.py
index ba2b3f6..78afc97 100644
--- a/domain_filter.py
+++ b/domain_filter.py
@@ -1,7 +1,7 @@
 import re
 from urllib.parse import urlparse
-
+# Skip domains that are never worth saving
 NOT2SAVE_DOMAINS = {
     "example",
     "example.com",
@@ -15,10 +15,11 @@
     "youtube.com",
     "stackoverflow.com",
     "bitly.com",
-    "en.wikipedia.org" ,
-    "apache.org/licenses"
+    "en.wikipedia.org",
+    "apache.org/licenses",
 }
-# can be added more further
+
+
 def normalize_hostname(hostname):
     if not hostname:
         return ""
@@ -29,21 +30,51 @@ def is_validurl(url: str) -> bool:
     try:
         parsed = urlparse(url)
         hostname = normalize_hostname(parsed.hostname)
+        path = parsed.path.lower()
+
         if not hostname:
             return False
+
         if hostname in NOT2SAVE_DOMAINS:
             return False
+
         if re.search(r"\{\{.*?\}\}", url) or "{" in url or "}" in url:
             return False
-        if re.match(
-            r"https?://[^/]+\.\w{1,6}[:/]*$", url
-        ):  # overly short root or malformed
-            return False
+
+        if re.match(r"https?://[^/]+\.\w{1,6}[:/]*$", url):  # overly short root or malformed
+            return False
+
+        # ===== GitHub-specific filtering =====
+        if hostname == "github.com":
+            # Block PR, issue, commit, and other repo sub-pages
+            if re.search(
+                r"/(pull|issues|commit|commits|discussions|blob|tree|compare|releases|actions)(/|$)",
+                path,
+            ):
+                return False
+
+            # Extra guard in case the regex above ever misses one
+            if (
+                "/pull/" in path
+                or "/issues/" in path
+                or "/commit/" in path
+                or "/discussions/" in path
+            ):
+                return False
+
+            # Still allowed:
+            # - https://github.com/user
+            # - https://github.com/user/repo
+            # - https://raw.githubusercontent.com/...
+            # - https://gist.github.com/...
+
     except:
         return False
+
     return True
-# for false-positive package names
+
+# Common non-target packages (false-positive filter)
 NOT2SAVE_PACKAGES = {
     "host",
     "port",
@@ -75,7 +106,6 @@ def is_validurl(url: str) -> bool:
     "sample",
 }

-
 def is_valid_package(pkg: str) -> bool:
     if not pkg or len(pkg.strip()) < 2:
         return False
@@ -85,6 +115,6 @@ def is_valid_package(pkg: str) -> bool:
         return False
     if re.match(r"^[A-Z0-9_]{3,}$", pkg):
         return False
-    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg):  # Allow dots for Java/Maven style
+    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg):  # allow dots for Maven-style names
         return False
     return True
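
As a quick sanity check of the new rules, the expectations below should hold. This is a minimal sketch, assuming domain_filter.py is importable from the working directory; all sample URLs and package names are illustrative, not taken from the repo:

# Illustrative expectations for the tightened filters
from domain_filter import is_validurl, is_valid_package

# Repo and user roots survive; PR/issue/blob pages and bare domains do not
assert is_validurl("https://github.com/user/repo")
assert not is_validurl("https://github.com/user/repo/pull/42")    # PR page
assert not is_validurl("https://github.com/u/r/blob/main/x.py")   # blob view
assert not is_validurl("https://github.com")                      # bare root
assert not is_validurl("https://en.wikipedia.org/wiki/URL")       # blocked domain
assert not is_validurl("https://{{base_url}}/api")                # template placeholder

# Dotted Maven-style names pass; all-caps constants are rejected
assert is_valid_package("com.google.guava")
assert not is_valid_package("HOST")
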
diff --git a/finder.py b/finder.py
index 741b0d1..657586b 100644
--- a/finder.py
+++ b/finder.py
@@ -11,7 +11,6 @@
 from bs4 import BeautifulSoup

 from domain_filter import is_validurl, is_valid_package

-
 GITHUB_API = "https://api.github.com"
 GITHUB_TOKEN = os.getenv("GH_TOKEN")
@@ -68,10 +67,7 @@ def check_package_url(name, lang):
 def check_url_live(url):
     try:
         res = requests.get(url, timeout=5)
-        if res.status_code < 400:
-            return url, True
-        else:
-            return url, False
+        return url, res.status_code < 400
     except:
         return url, False

@@ -131,10 +127,12 @@ def extract_urls_and_packages(repo_path):
             ) as file:
                 content = file.read()
             raw_urls = URL_REGEX.findall(content)
+
             if full_path.endswith(".md"):
                 raw_urls += re.findall(
                     r"\[.*?\]\((https?://[^\s\)]+)\)", content
                 )
+
             if full_path.endswith((".html", ".htm")):
                 soup = BeautifulSoup(content, "html.parser")
                 raw_urls += [
@@ -142,22 +140,28 @@
                 a["href"]
                 for a in soup.find_all("a", href=True)
                 if a["href"].startswith("http")
             ]
+
             if full_path.endswith(".json"):
                 content = content.replace("\\/", "/")
-            if raw_urls:
-                print(f"[+] Found {len(raw_urls)} URLs in {full_path}")
-                for u in raw_urls:
-                    print(f"    URL: {u}")
-            filtered = [u for u in raw_urls if is_validurl(u)]
+
+            # Filter first so only useful URLs are reported
+            filtered = [u for u in raw_urls if is_validurl(u)]
+            if filtered:
+                print(f"[+] {len(filtered)} useful URLs in {full_path}")
+                for u in filtered:
+                    print(f"    URL: {u}")
             findings["urls"].update(filtered)
+
             declared = extract_declared_packages(full_path)
             for k in declared:
                 cleaned = {p for p in declared[k] if is_valid_package(p)}
                 findings["packages"][k].update(cleaned)
+
         except Exception as e:
             print(f"[!] Error reading {full_path}: {e}")
     return findings

-# org from here
+
+
 def get_repos(org):
     repos = []
     page = 1
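
The simplified check_url_live still returns a (url, is_live) tuple, so existing call sites keep working. Below is a minimal usage sketch; the URL list and worker count are hypothetical, and it assumes finder.py and its dependencies are importable:

from concurrent.futures import ThreadPoolExecutor

from finder import check_url_live

candidates = [
    "https://github.com/user/repo",
    "https://definitely-dead.example/x",  # hypothetical dead host
]
with ThreadPoolExecutor(max_workers=8) as pool:
    for url, alive in pool.map(check_url_live, candidates):
        print(f"{'LIVE' if alive else 'DEAD'}  {url}")

Returning the URL alongside the boolean keeps each result self-describing, so a caller fanning out over a pool never has to match results back to inputs by index.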