50 changes: 40 additions & 10 deletions domain_filter.py
@@ -1,7 +1,7 @@
import re
from urllib.parse import urlparse


# Skip useless domains
NOT2SAVE_DOMAINS = {
"example",
"example.com",
@@ -15,10 +15,11 @@
"youtube.com",
"stackoverflow.com",
"bitly.com",
"en.wikipedia.org" ,
"apache.org/licenses"
"en.wikipedia.org",
"apache.org/licenses",
}
# more domains can be added later


def normalize_hostname(hostname):
    if not hostname:
        return ""
@@ -29,21 +30,51 @@ def is_validurl(url: str) -> bool:
    try:
        parsed = urlparse(url)
        hostname = normalize_hostname(parsed.hostname)
        path = parsed.path.lower()

        if not hostname:
            return False

        if hostname in NOT2SAVE_DOMAINS:
            return False

        if re.search(r"\{\{.*?\}\}", url) or "{" in url or "}" in url:
            return False
-        if re.match(
-            r"https?://[^/]+\.\w{1,6}[:/]*$", url
-        ):  # overly short root or malformed

+        if re.match(r"https?://[^/]+\.\w{1,6}[:/]*$", url):
            return False

        # ===== GitHub-specific filtering =====
        if hostname == "github.com":
            # Block PRs, issues, etc
            if re.search(
                r"/(pull|issues|commit|commits|discussions|blob|tree|compare|releases|actions)(/|$)",
                path,
            ):
                return False

            # Block GitHub actions or PR trash if somehow missed
            if (
                "/pull/" in path
                or "/issues/" in path
                or "/commit/" in path
                or "/discussions/" in path
            ):
                return False

            # Still allow:
            # - https://github.com/user
            # - https://github.com/user/repo
            # - https://raw.githubusercontent.com/...
            # - https://gist.github.com/...

    except:
        return False

    return True

-# for false-positive package names

+# Common non-target packages (false-positive filter)
NOT2SAVE_PACKAGES = {
"host",
"port",
@@ -75,7 +106,6 @@ def is_validurl(url: str) -> bool:
"sample",
}


def is_valid_package(pkg: str) -> bool:
    if not pkg or len(pkg.strip()) < 2:
        return False
@@ -85,6 +115,6 @@ def is_valid_package(pkg: str) -> bool:
        return False
    if re.match(r"^[A-Z0-9_]{3,}$", pkg):
        return False
-    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg): # Allow dots for Java/Maven style
+    if re.search(r"[^a-zA-Z0-9_\-\.]", pkg): # allow dot for Maven style
        return False
    return True
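
A quick way to sanity-check the new filtering rules is to import the module and feed it a few URLs. This is a minimal sketch, not part of the PR: the sample URLs are illustrative, and it assumes the unchanged (collapsed) parts of domain_filter.py keep their current behavior, including the NOT2SAVE_PACKAGES lookup inside is_valid_package.

```python
# Minimal local sanity check of the rules added in this PR (not part of the diff).
from domain_filter import is_validurl, is_valid_package

samples = [
    ("https://github.com/user/repo", True),               # plain repo pages still pass
    ("https://github.com/user/repo/pull/42", False),      # PR pages are now blocked
    ("https://en.wikipedia.org/wiki/URL", False),           # hostname is in NOT2SAVE_DOMAINS
    ("https://docs.mysite.io/{{version}}/install", False),  # template braces are rejected
]
for url, expected in samples:
    print(f"{url} -> {is_validurl(url)} (expected {expected})")

# "host" is listed in NOT2SAVE_PACKAGES, so it should be rejected;
# an ordinary package name should pass.
print(is_valid_package("host"), is_valid_package("left-pad"))
```
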
27 changes: 16 additions & 11 deletions finder.py
@@ -11,7 +11,6 @@
from bs4 import BeautifulSoup
from domain_filter import is_validurl, is_valid_package


GITHUB_API = "https://api.github.com"
GITHUB_TOKEN = os.getenv("GH_TOKEN")

@@ -68,10 +67,7 @@ def check_package_url(name, lang):
def check_url_live(url):
    try:
        res = requests.get(url, timeout=5)
-        if res.status_code < 400:
-            return url, True
-        else:
-            return url, False
+        return url, res.status_code < 400
    except:
        return url, False

@@ -131,33 +127,42 @@ def extract_urls_and_packages(repo_path):
            ) as file:
                content = file.read()
                raw_urls = URL_REGEX.findall(content)

                if full_path.endswith(".md"):
                    raw_urls += re.findall(
                        r"\[.*?\]\((https?://[^\s\)]+)\)", content
                    )

                if full_path.endswith((".html", ".htm")):
                    soup = BeautifulSoup(content, "html.parser")
                    raw_urls += [
                        a["href"]
                        for a in soup.find_all("a", href=True)
                        if a["href"].startswith("http")
                    ]

                if full_path.endswith(".json"):
                    content = content.replace("\\/", "/")
-                if raw_urls:
-                    print(f"[+] Found {len(raw_urls)} URLs in {full_path}")
-                    for u in raw_urls:
-                        print(f" URL: {u}")
-                filtered = [u for u in raw_urls if is_validurl(u)]

+                filtered = [u for u in raw_urls if is_validurl(u)]
+                if filtered:
+                    cleaned = [u for u in filtered if is_validurl(u)]
+                    if cleaned:
+                        print(f"[+] {len(cleaned)} useful URLs in {full_path}")
+                        for u in cleaned:
+                            print(f" URL: {u}")
                findings["urls"].update(filtered)

                declared = extract_declared_packages(full_path)
                for k in declared:
                    cleaned = {p for p in declared[k] if is_valid_package(p)}
                    findings["packages"][k].update(cleaned)

        except Exception as e:
            print(f"[!] Error reading {full_path}: {e}")
    return findings
# org-level repo collection from here


def get_repos(org):
    repos = []
    page = 1
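
The diff is collapsed past this point, so the body of get_repos is not shown. For orientation only, here is a hypothetical sketch of how org-repo pagination against the GitHub REST API is typically written with the GITHUB_API and GH_TOKEN values declared above; the function name get_repos_sketch, the per_page value, and the error handling are assumptions, not the PR's actual code.

```python
import os

import requests

GITHUB_API = "https://api.github.com"
GITHUB_TOKEN = os.getenv("GH_TOKEN")


def get_repos_sketch(org):
    """Hypothetical pagination loop; the real get_repos in this PR is collapsed above."""
    headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}
    repos = []
    page = 1
    while True:
        res = requests.get(
            f"{GITHUB_API}/orgs/{org}/repos",
            headers=headers,
            params={"per_page": 100, "page": page},
            timeout=10,
        )
        if res.status_code >= 400:
            break
        batch = res.json()
        if not batch:
            break
        repos.extend(repo["full_name"] for repo in batch)
        page += 1
    return repos
```
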