11import re
22from urllib .parse import urlparse
33
4-
4+ # Skip useless domains
55NOT2SAVE_DOMAINS = {
66 "example" ,
77 "example.com" ,
1515 "youtube.com" ,
1616 "stackoverflow.com" ,
1717 "bitly.com" ,
18- "en.wikipedia.org" ,
19- "apache.org/licenses"
18+ "en.wikipedia.org" ,
19+ "apache.org/licenses" ,
2020}
21- # can be added more further
21+
22+
2223def normalize_hostname (hostname ):
2324 if not hostname :
2425 return ""
# GitHub paths that point at repository activity or file views (pull
# requests, issues, commits, ...) rather than at a user or repository page.
_GITHUB_ACTIVITY_PATH = re.compile(
    r"/(pull|issues|commit|commits|discussions|blob|tree|compare|releases|actions)"
    r"(/|$)"
)

# A bare site root such as "https://host.tld" or "https://host.tld/".
_BARE_ROOT_URL = re.compile(r"https?://[^/]+\.\w{1,6}[:/]*$")


def is_validurl(url: str) -> bool:
    """Return True if *url* looks like a link worth saving.

    The URL is rejected (False) when any of the following holds:

    - it is not a string, cannot be parsed, or has no hostname;
    - its normalized hostname is listed in ``NOT2SAVE_DOMAINS``;
    - its hostname + path falls under a ``NOT2SAVE_DOMAINS`` entry that
      itself contains a path (e.g. ``"apache.org/licenses"``);
    - it contains ``{`` or ``}`` (unrendered template placeholders);
    - it is only a site root with no meaningful path;
    - it is a ``github.com`` link to a pull request, issue, commit,
      discussion, blob, tree, compare, releases or actions page.

    Args:
        url: The candidate URL.

    Returns:
        True if none of the rejection rules apply, otherwise False.
    """
    # Non-string input was always rejected (it failed inside the old
    # catch-all handler); make that explicit.
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
        hostname = normalize_hostname(parsed.hostname)
        path = parsed.path.lower()
    except Exception:
        # Best-effort filter: an unparsable URL is simply not valid. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit
        # propagating.
        return False

    if not hostname:
        return False

    if hostname in NOT2SAVE_DOMAINS:
        return False

    # Blocklist entries that include a path can never equal a bare hostname,
    # so compare them against hostname + path on a path-segment boundary.
    # NOTE(review): assumes blocklist entries are lowercase and that
    # normalize_hostname yields the same host form used in the entries.
    location = f"{hostname}{path}"
    if any(
        "/" in entry
        and (location == entry or location.startswith(entry + "/"))
        for entry in NOT2SAVE_DOMAINS
    ):
        return False

    # Any brace means an unrendered placeholder such as "{{ var }}" or "{id}".
    if "{" in url or "}" in url:
        return False

    if _BARE_ROOT_URL.match(url):
        return False

    # GitHub: only the exact "github.com" host is filtered here, so user and
    # repository pages (https://github.com/user, https://github.com/user/repo)
    # still pass because their paths contain none of the blocked segments.
    if hostname == "github.com" and _GITHUB_ACTIVITY_PATH.search(path):
        return False

    return True
4575
46- # for false-positive package names
76+
77+ # Common non-target packages (false-positive filter)
4778NOT2SAVE_PACKAGES = {
4879 "host" ,
4980 "port" ,
@@ -75,7 +106,6 @@ def is_validurl(url: str) -> bool:
75106 "sample" ,
76107}
77108
78-
79109def is_valid_package (pkg : str ) -> bool :
80110 if not pkg or len (pkg .strip ()) < 2 :
81111 return False
@@ -85,6 +115,6 @@ def is_valid_package(pkg: str) -> bool:
85115 return False
86116 if re .match (r"^[A-Z0-9_]{3,}$" , pkg ):
87117 return False
88- if re .search (r"[^a-zA-Z0-9_\-\.]" , pkg ): # Allow dots for Java/ Maven style
118+ if re .search (r"[^a-zA-Z0-9_\-\.]" , pkg ): # allow dot for Maven style
89119 return False
90120 return True
0 commit comments