50 changes: 40 additions & 10 deletions domain_filter.py
@@ -1,7 +1,7 @@
import re
from urllib.parse import urlparse


# Skip useless domains
NOT2SAVE_DOMAINS = {
"example",
"example.com",
@@ -15,10 +15,11 @@
"youtube.com",
"stackoverflow.com",
"bitly.com",
"en.wikipedia.org" ,
"apache.org/licenses"
"en.wikipedia.org",
"apache.org/licenses",
}
# more domains can be added later


def normalize_hostname(hostname):
    if not hostname:
        return ""
@@ -29,21 +30,51 @@ def is_validurl(url: str) -> bool:
    try:
        parsed = urlparse(url)
        hostname = normalize_hostname(parsed.hostname)
        path = parsed.path.lower()

        if not hostname:
            return False

        if hostname in NOT2SAVE_DOMAINS:
            return False

        if re.search(r"\{\{.*?\}\}", url) or "{" in url or "}" in url:
            return False
-        if re.match(
-            r"https?://[^/]+\.\w{1,6}[:/]*$", url
-        ):  # overly short root or malformed

+        if re.match(r"https?://[^/]+\.\w{1,6}[:/]*$", url):
            return False

        # ===== GitHub-specific filtering =====
        if hostname == "github.com":
            # Block PRs, issues, etc
            if re.search(
                r"/(pull|issues|commit|commits|discussions|blob|tree|compare|releases|actions)(/|$)",
                path,
            ):
                return False

            # Block GitHub actions or PR trash if somehow missed
            if (
                "/pull/" in path
                or "/issues/" in path
                or "/commit/" in path
                or "/discussions/" in path
            ):
                return False

            # Still allow:
            # - https://github.com/user
            # - https://github.com/user/repo
            # - https://raw.githubusercontent.com/...
            # - https://gist.github.com/...

    except:
        return False

    return True

-# for false-positive package names

+# Common non-target packages (false-positive filter)
NOT2SAVE_PACKAGES = {
"host",
"port",
@@ -75,7 +106,6 @@ def is_validurl(url: str) -> bool:
"sample",
}


def is_valid_package(pkg: str) -> bool:
    if not pkg or len(pkg.strip()) < 2:
        return False
@@ -85,6 +115,6 @@ def is_valid_package(pkg: str) -> bool:
        return False
    if re.match(r"^[A-Z0-9_]{3,}$", pkg):
        return False
-    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg): # Allow dots for Java/Maven style
+    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg): # allow dot for Maven style
        return False
    return True
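
A quick way to sanity-check the new filtering rules is to import the module and feed it a few URLs. This is a minimal sketch, not part of the PR: the sample URLs are illustrative, and it assumes the unchanged (collapsed) parts of domain_filter.py keep their current behavior, including the NOT2SAVE_PACKAGES lookup inside is_valid_package.

```python
# Minimal local sanity check of the rules added in this PR (not part of the diff).
from domain_filter import is_validurl, is_valid_package

samples = [
    ("https://github.com/user/repo", True),               # plain repo pages still pass
    ("https://github.com/user/repo/pull/42", False),      # PR pages are now blocked
    ("https://en.wikipedia.org/wiki/URL", False),           # hostname is in NOT2SAVE_DOMAINS
    ("https://docs.mysite.io/{{version}}/install", False),  # template braces are rejected
]
for url, expected in samples:
    print(f"{url} -> {is_validurl(url)} (expected {expected})")

# "host" is listed in NOT2SAVE_PACKAGES, so it should be rejected;
# an ordinary package name should pass.
print(is_valid_package("host"), is_valid_package("left-pad"))
```
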
27 changes: 16 additions & 11 deletions finder.py
@@ -11,7 +11,6 @@
from bs4 import BeautifulSoup
from domain_filter import is_validurl, is_valid_package


GITHUB_API = "https://api.github.com"
GITHUB_TOKEN = os.getenv("GH_TOKEN")

@@ -68,10 +67,7 @@ def check_package_url(name, lang):
def check_url_live(url):
    try:
        res = requests.get(url, timeout=5)
-        if res.status_code < 400:
-            return url, True
-        else:
-            return url, False
+        return url, res.status_code < 400
    except:
        return url, False

@@ -131,33 +127,42 @@ def extract_urls_and_packages(repo_path):
            ) as file:
                content = file.read()
                raw_urls = URL_REGEX.findall(content)

                if full_path.endswith(".md"):
                    raw_urls += re.findall(
                        r"\[.*?\]\((https?://[^\s\)]+)\)", content
                    )

                if full_path.endswith((".html", ".htm")):
                    soup = BeautifulSoup(content, "html.parser")
                    raw_urls += [
                        a["href"]
                        for a in soup.find_all("a", href=True)
                        if a["href"].startswith("http")
                    ]

                if full_path.endswith(".json"):
                    content = content.replace("\\/", "/")
-                if raw_urls:
-                    print(f"[+] Found {len(raw_urls)} URLs in {full_path}")
-                    for u in raw_urls:
-                        print(f" URL: {u}")
-                filtered = [u for u in raw_urls if is_validurl(u)]

+                filtered = [u for u in raw_urls if is_validurl(u)]
+                if filtered:
+                    cleaned = [u for u in filtered if is_validurl(u)]
+                    if cleaned:
+                        print(f"[+] {len(cleaned)} useful URLs in {full_path}")
+                        for u in cleaned:
+                            print(f" URL: {u}")
                findings["urls"].update(filtered)

                declared = extract_declared_packages(full_path)
                for k in declared:
                    cleaned = {p for p in declared[k] if is_valid_package(p)}
                    findings["packages"][k].update(cleaned)

        except Exception as e:
            print(f"[!] Error reading {full_path}: {e}")
    return findings
# org-level repo collection from here


def get_repos(org):
    repos = []
    page = 1
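
The diff is collapsed past this point, so the body of get_repos is not shown. For orientation only, here is a hypothetical sketch of how org-repo pagination against the GitHub REST API is typically written with the GITHUB_API and GH_TOKEN values declared above; the function name get_repos_sketch, the per_page value, and the error handling are assumptions, not the PR's actual code.

```python
import os

import requests

GITHUB_API = "https://api.github.com"
GITHUB_TOKEN = os.getenv("GH_TOKEN")


def get_repos_sketch(org):
    """Hypothetical pagination loop; the real get_repos in this PR is collapsed above."""
    headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}
    repos = []
    page = 1
    while True:
        res = requests.get(
            f"{GITHUB_API}/orgs/{org}/repos",
            headers=headers,
            params={"per_page": 100, "page": page},
            timeout=10,
        )
        if res.status_code >= 400:
            break
        batch = res.json()
        if not batch:
            break
        repos.extend(repo["full_name"] for repo in batch)
        page += 1
    return repos
```
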