Merge pull request #1 from noob6t5/my-feature-branch

noob6t5 · web-flow · commit 56ec4bcf243b · 2025-07-31T22:35:25.000+05:45
Fix: Avoid saving dummy URLs
diff --git a/domain_filter.py b/domain_filter.py
@@ -0,0 +1,90 @@
+import re
+from urllib.parse import urlparse
+
+
+# Hosts whose URLs are never saved. is_validurl() compares each entry
+# against the normalized hostname of the URL being checked.
+NOT2SAVE_DOMAINS = {
+    # Placeholder / example hosts
+    "example",
+    "example.com",
+    "example.net",
+    "example.org",
+    # Local addresses
+    "localhost",
+    "127.0.0.1",
+    "0.0.0.0",
+    # Generic dummy hosts
+    "dummy.com",
+    "test.com",
+    # Well-known sites excluded from the results
+    "bitly.com",
+    "en.wikipedia.org",
+    "stackoverflow.com",
+    "youtube.com",
+    # NOTE(review): this entry contains a path, so it can never equal a bare
+    # hostname -- confirm how it is meant to be matched.
+    "apache.org/licenses",
+}
+# More domains can be added to the set above as needed.
+def normalize_hostname(hostname):
+    """Return *hostname* lowercased with one leading ``www.`` removed.
+
+    Args:
+        hostname: Host component of a URL, or a falsy value (``None`` or
+            ``""``) when the URL had no host.
+
+    Returns:
+        The normalized hostname, or ``""`` when *hostname* is falsy.
+    """
+    if not hostname:
+        return ""
+    host = hostname.lower()
+    # str.lstrip("www.") strips any leading run of the characters "w" and
+    # ".", which mangles hosts such as "web.dev" or "www.wikipedia.org";
+    # remove only the literal prefix instead.
+    if host.startswith("www."):
+        host = host[len("www."):]
+    return host
+
+
+def _is_blocked_host(hostname, path):
+    """Return True when *hostname* (with *path*) matches NOT2SAVE_DOMAINS.
+
+    An entry matches the listed host itself and any of its subdomains.  An
+    entry containing a ``/`` (e.g. ``"apache.org/licenses"``) additionally
+    requires the URL path to begin with the part after the slash.
+    """
+    for entry in NOT2SAVE_DOMAINS:
+        domain, _, path_prefix = entry.partition("/")
+        if hostname != domain and not hostname.endswith("." + domain):
+            continue
+        if not path_prefix:
+            return True
+        trimmed = path.lstrip("/")
+        if trimmed == path_prefix or trimmed.startswith(path_prefix + "/"):
+            return True
+    return False
+
+
+def is_validurl(url: str) -> bool:
+    """Return True when *url* looks like a real URL worth saving.
+
+    A URL is rejected when it has no hostname, when its host is listed in
+    NOT2SAVE_DOMAINS (or is a subdomain of a listed host), when it contains
+    template braces, or when it is only a bare site root such as
+    ``https://site.tld/``.
+
+    Args:
+        url: The URL string to check.
+
+    Returns:
+        False for rejected or unparsable URLs, True otherwise.
+    """
+    try:
+        parsed = urlparse(url)
+        hostname = normalize_hostname(parsed.hostname)
+        if not hostname:
+            return False
+        if _is_blocked_host(hostname, parsed.path):
+            return False
+        # Any brace marks an unrendered template such as "{{ base_url }}".
+        if "{" in url or "}" in url:
+            return False
+        # Scheme + host only (optionally followed by ":" or "/") is treated
+        # as an overly short root or malformed URL.
+        if re.match(r"https?://[^/]+\.\w{1,6}[:/]*$", url):
+            return False
+    except (ValueError, TypeError, AttributeError):
+        # urlparse raises ValueError on malformed input (e.g. a bad IPv6
+        # literal); TypeError/AttributeError cover non-string input.  A bare
+        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
+        return False
+    return True
+
+# Common words that show up as false-positive package names.
+# is_valid_package() rejects any name found here (case-insensitively).
+# Kept in alphabetical order.
+NOT2SAVE_PACKAGES = {
+    "code",
+    "config",
+    "data",
+    "debug",
+    "default",
+    "design",
+    "env",
+    "error",
+    "false",
+    "host",
+    "index",
+    "input",
+    "json",
+    "main",
+    "message",
+    "none",
+    "null",
+    "object",
+    "output",
+    "performance",
+    "port",
+    "pretty",
+    "release",
+    "sample",
+    "temp",
+    "test",
+    "true",
+    "value",
+}
+
+
+def is_valid_package(pkg: str) -> bool:
+    """Return True when *pkg* is a plausible package name.
+
+    A name is rejected when it is empty or shorter than two characters
+    after trimming, is listed in NOT2SAVE_PACKAGES (case-insensitive), is
+    all digits, consists only of ``-``, ``_`` and ``.``, is three or more
+    characters of uppercase letters/digits/underscores, or contains any
+    character other than letters, digits, ``_``, ``-`` and ``.``.
+    """
+    if not pkg:
+        return False
+    if len(pkg.strip()) < 2:
+        return False
+
+    # Checks run in order and stop at the first hit.
+    rejected = (
+        pkg.lower() in NOT2SAVE_PACKAGES
+        or pkg.isdigit()
+        or re.fullmatch(r"[-_.]+", pkg) is not None
+        or re.match(r"^[A-Z0-9_]{3,}$", pkg) is not None
+        # Dots stay allowed for Java/Maven style names.
+        or re.search(r"[^a-zA-Z0-9_\-\.]", pkg) is not None
+    )
+    return not rejected
diff --git a/finder.py b/finder.py
@@ -9,6 +9,8 @@
 from git import Repo
 from urllib.parse import urlparse
 from bs4 import BeautifulSoup
+from domain_filter import is_validurl, is_valid_package
+
 
 GITHUB_API = "https://api.github.com"
 GITHUB_TOKEN = os.getenv("GH_TOKEN")
@@ -50,45 +52,6 @@
 
 
 def check_package_url(name, lang):
-    name = name.strip()
-    ban_list = {
-        "host",
-        "port",
-        "design",
-        "pretty",
-        "performance",
-        "value",
-        "index",
-        "main",
-        "default",
-        "debug",
-        "error",
-        "message",
-        "json",
-        "config",
-        "release",
-        "object",
-        "input",
-        "output",
-        "none",
-        "true",
-        "false",
-        "null",
-    }
-    if (
-        not name
-        or len(name) < 2
-        or name.lower() in ban_list
-        or name.strip().isdigit()
-        or re.fullmatch(r"[-_.]+", name)
-        or re.match(r"^[A-Z0-9_]{3,}$", name)
-        or re.search(r"[^a-zA-Z0-9_\-]", name)
-        or name.startswith("-")
-        or name.endswith("-")
-        or name.count("-") > 3
-    ):
-        return name, "INVALID"
-
     url = PACKAGE_REGISTRIES[lang](name)
     try:
         r = requests.get(url, timeout=6)
@@ -185,15 +148,16 @@ def extract_urls_and_packages(repo_path):
                             print(f"[+] Found {len(raw_urls)} URLs in {full_path}")
                             for u in raw_urls:
                                 print(f"    URL: {u}")
-                            findings["urls"].update(raw_urls)
+                            filtered = [u for u in raw_urls if  is_validurl(u)]
+                            findings["urls"].update(filtered)
                         declared = extract_declared_packages(full_path)
                         for k in declared:
-                            findings["packages"][k].update(declared[k])
+                            cleaned = {p for p in declared[k] if is_valid_package(p)}
+                            findings["packages"][k].update(cleaned)
                 except Exception as e:
                     print(f"[!] Error reading {full_path}: {e}")
     return findings
-
-
+# Organization-level helpers start here.
 def get_repos(org):
     repos = []
     page = 1
@@ -252,7 +216,9 @@ def write_output(org, findings):
             for lang, pkgs in findings["packages"].items():
                 f.write(f"\n==== {lang.upper()} Packages (Status) ====\n")
                 pkg_futures = [
-                    executor.submit(check_package_url, p, lang) for p in pkgs
+                    executor.submit(check_package_url, p, lang)
+                    for p in pkgs
+                    if is_valid_package(p)
                 ]
                 for pf in concurrent.futures.as_completed(pkg_futures):
                     name, status = pf.result()
diff --git a/test_filter.py b/test_filter.py
@@ -0,0 +1,18 @@
+from domain_filter import is_validurl
+
+# Sample URLs fed to is_validurl(); the verdict for each is printed below.
+test_urls = [
+    "http://example.com",
+    "https://example.org/",
+    "http://localhost:8080",
+    "http://127.0.0.1:5000",
+    "https://t.co/abc123",
+    "https://stackoverflow.com/questions/123",
+    "https://real-domain.com/page",
+    "https://nonexistent.vulntrap.xyz/",
+    "https://youtube.com/watch?v=abc",
+    "https://github.com/noob6t5/repo",
+]
+
+print("Filtered Results:\n")
+for url in test_urls:
+    # Resolve the label first so the print line stays simple.
+    if is_validurl(url):
+        verdict = "✔️ VALID"
+    else:
+        verdict = "❌ FILTERED"
+    print(f"{url} => {verdict}")
diff --git a/workflow.txt b/workflow.txt
@@ -10,5 +10,8 @@ Live/Broken Link Checker
 
 Reporting Engine (TXT, JSON, etc.)
 
-##  Todo
-Make separate code and regexes for filtering dummy and test domain's.
+TODO: also filter URLs whose host is a subdomain of an example domain, such as the URL below:
+
+
+http://home.example.org:8888/cookie-parser-result?domain0019
+