Skip to content

Commit fae73ec

Browse files
committed
feat: added in support for different data representations, thresholds updated
1 parent ba5f2d9 commit fae73ec

File tree

5 files changed

+443
-0
lines changed

5 files changed

+443
-0
lines changed

src/macaron/config/defaults.ini

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,3 +581,10 @@ releases_frequency_threshold = 2
581581
# The gap threshold.
582582
# The timedelta indicate the gap between the date maintainer registers their pypi's account and the date of latest release.
583583
timedelta_threshold_of_join_release = 5
584+
585+
# Any major version above this value is detected as anomalistic and marked as suspicious
586+
major_threshold = 20
587+
# Any epoch number above this value is detected as anomalistic and marked as suspicious
588+
epoch_threshold = 3
589+
# The number of days (+/-) by which a calendar-versioning day may differ from the actual publish day
590+
day_publish_error = 4

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ class Heuristics(str, Enum):
3434
#: Indicates that the package does not include a .whl file
3535
WHEEL_ABSENCE = "wheel_absence"
3636

37+
#: Indicates that the package has an unusually large version number for a single release
38+
ANOMALISTIC_VERSION = "anomalistic_version"
39+
3740

3841
class HeuristicResult(str, Enum):
3942
"""Result type indicating the outcome of a heuristic."""
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""The heuristic analyzer to check for an anomalistic package version."""
5+
6+
import logging
7+
from enum import Enum
8+
9+
from packaging.version import InvalidVersion, parse
10+
11+
from macaron.config.defaults import defaults
12+
from macaron.errors import HeuristicAnalyzerValueError
13+
from macaron.json_tools import JsonType, json_extract
14+
from macaron.malware_analyzer.datetime_parser import parse_datetime
15+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
16+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
17+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
18+
19+
logger: logging.Logger = logging.getLogger(__name__)
20+
21+
22+
class AnomalisticVersionAnalyzer(BaseHeuristicAnalyzer):
    """
    Analyze the version number (if there is only a single release) to detect if it is anomalistic.

    A version number is anomalistic if its epoch is above the epoch threshold, or if its major value
    (the minor value for calendar-semantic versioning) is above the major threshold.
    If the version does not adhere to PyPI standards (PEP 440, as per the 'packaging' module), this heuristic
    cannot analyze it and the result is SKIP.

    Calendar versioning is detected as version numbers whose first three values spell out a date within
    +/- ``day_publish_error`` days of the upload date(s): the year (either yyyy or yy) followed by the month
    and day in either order, or the month and day in either order followed by the year.
    Only the epoch is checked for calendar versions.

    Calendar-semantic versioning is detected as version numbers with the major value as the upload year
    (either yyyy or yy), and any other series of numbers following it. The epoch and the minor value
    (which takes the role of the major value) are checked.

    All other versionings are detected as semantic versioning; the epoch and the major value are checked.
    """

    DETAIL_INFO_KEY: str = "versioning"

    # Fallback values used when defaults.ini does not provide the corresponding option.
    DEFAULT_MAJOR_THRESHOLD: int = 20
    DEFAULT_EPOCH_THRESHOLD: int = 3
    DEFAULT_DAY_PUBLISH_ERROR: int = 4

    def __init__(self) -> None:
        super().__init__(
            name="anomalistic_version_analyzer",
            heuristic=Heuristics.ANOMALISTIC_VERSION,
            depends_on=[(Heuristics.ONE_RELEASE, HeuristicResult.FAIL)],
        )
        self.major_threshold, self.epoch_threshold, self.day_publish_error = self._load_defaults()

    def _load_defaults(self) -> tuple[int, int, int]:
        """Load default settings from defaults.ini.

        Returns
        -------
        tuple[int, int, int]:
            The major threshold, the epoch threshold, and the day publish error, in that order.
            Any option missing from the configuration falls back to the built-in default.
        """
        section_name = "heuristic.pypi"
        if defaults.has_section(section_name):
            section = defaults[section_name]
            # Without an explicit fallback, getint() returns None for a missing option, which would
            # later break the integer comparisons in analyze().
            return (
                section.getint("major_threshold", fallback=self.DEFAULT_MAJOR_THRESHOLD),
                section.getint("epoch_threshold", fallback=self.DEFAULT_EPOCH_THRESHOLD),
                section.getint("day_publish_error", fallback=self.DEFAULT_DAY_PUBLISH_ERROR),
            )
        # Major threshold, Epoch threshold, Day publish error
        return (
            self.DEFAULT_MAJOR_THRESHOLD,
            self.DEFAULT_EPOCH_THRESHOLD,
            self.DEFAULT_DAY_PUBLISH_ERROR,
        )

    def _publish_window(self, upload_dates: list[tuple[int, int, int]]) -> set[tuple[int, int, int]]:
        """Return every (year, month, day) from the earliest to the latest upload date, widened by the day error.

        The window is computed on real calendar dates, so it wraps correctly over month and year
        boundaries and never contains impossible day numbers such as 0 or 32.

        Parameters
        ----------
        upload_dates: list[tuple[int, int, int]]
            The non-empty list of (year, month, day) upload dates of the release's distributions.

        Returns
        -------
        set[tuple[int, int, int]]:
            The (year, month, day) triples accepted as "the publish date" for calendar versioning.
        """
        # Local import so that this class does not depend on a change to the module's import block.
        from datetime import date  # pylint: disable=import-outside-toplevel

        ordinals = [date(year, month, day).toordinal() for year, month, day in upload_dates]
        first = min(ordinals) - self.day_publish_error
        last = max(ordinals) + self.day_publish_error

        window: set[tuple[int, int, int]] = set()
        for ordinal in range(first, last + 1):
            candidate = date.fromordinal(ordinal)
            window.add((candidate.year, candidate.month, candidate.day))
        return window

    @staticmethod
    def _is_calendar_date(year_part: int, first: int, second: int, window: set[tuple[int, int, int]]) -> bool:
        """Return True if the three version values spell out a date contained in the window.

        ``year_part`` may be the full year or its last two digits; ``first`` and ``second`` may be
        month and day in either order.
        """
        for year, month, day in window:
            if year_part not in (year, year % 100):
                continue
            if (first, second) in ((month, day), (day, month)):
                return True
        return False

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis.

        Raises
        ------
        HeuristicAnalyzerValueError
            if there is no release information available, the release has no distributions,
            or an upload time is missing or malformed.
        """
        releases = pypi_package_json.get_releases()
        if releases is None:  # no release information
            error_msg = "There is no information for any release of this package."
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        if len(releases) != 1:
            error_msg = (
                "This heuristic depends on a single release, but somehow there are multiple when the one release"
                " heuristic failed."
            )
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        # Since there is only one release, the latest version should be that release
        release = pypi_package_json.get_latest_version()
        if release is None:
            error_msg = "No latest version information available"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        try:
            release_metadata = releases[release]
        except KeyError as release_error:
            error_msg = "The latest release is not available in the list of releases"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg) from release_error

        try:
            version = parse(release)
        except InvalidVersion:
            return HeuristicResult.SKIP, {self.DETAIL_INFO_KEY: Versioning.INVALID.value}

        upload_dates: list[tuple[int, int, int]] = []
        for distribution in release_metadata:
            upload_time = json_extract(distribution, ["upload_time"], str)
            if upload_time is None:
                error_msg = "Missing upload time from release information"
                logger.debug(error_msg)
                raise HeuristicAnalyzerValueError(error_msg)

            parsed_time = parse_datetime(upload_time)
            if parsed_time is None:
                error_msg = "Upload time is not of the expected PyPI format"
                logger.debug(error_msg)
                raise HeuristicAnalyzerValueError(error_msg)

            upload_dates.append((parsed_time.year, parsed_time.month, parsed_time.day))

        if not upload_dates:
            # A release without any uploaded file has no publish date to compare against; report it
            # through the analyzer's own error type instead of letting min()/max() raise ValueError.
            error_msg = "There are no distributions available for the release"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        window = self._publish_window(upload_dates)

        # The year may be written in full or as its last 2 digits.
        upload_years: set[int] = set()
        for year, _, _ in upload_dates:
            upload_years.add(year)
            upload_years.add(year % 100)

        # calendar versioning: YY[YY].M[M].D[D], YY[YY].D[D].M[M], M[M].D[D].YY[YY] or D[D].M[M].YY[YY]
        is_calendar = self._is_calendar_date(
            version.major, version.minor, version.micro, window
        ) or self._is_calendar_date(version.micro, version.major, version.minor, window)

        # Select the versioning scheme and the value that plays the role of "major" for it.
        checked_major: int | None
        if is_calendar:  # just check epoch
            versioning = Versioning.CALENDAR
            checked_major = None
        elif version.major in upload_years:  # check minor (as major) and epoch
            versioning = Versioning.CALENDAR_SEMANTIC
            checked_major = version.minor
        else:  # semantic versioning
            versioning = Versioning.SEMANTIC
            checked_major = version.major

        detail_info: dict[str, JsonType] = {self.DETAIL_INFO_KEY: versioning.value}

        if version.epoch > self.epoch_threshold:
            return HeuristicResult.FAIL, detail_info
        if checked_major is not None and checked_major > self.major_threshold:
            return HeuristicResult.FAIL, detail_info

        return HeuristicResult.PASS, detail_info
182+
183+
184+
class Versioning(Enum):
    """The versioning scheme a release's version string was classified as."""

    #: The version string could not be parsed by the 'packaging' module.
    INVALID = "invalid"

    #: The version encodes the publish date (year, month and day).
    CALENDAR = "calendar"

    #: The version starts with the publish year, followed by other numbers.
    CALENDAR_SEMANTIC = "calendar_semantic"

    #: Any other parsable version.
    SEMANTIC = "semantic"

src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from macaron.json_tools import JsonType, json_extract
1616
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
1717
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
18+
from macaron.malware_analyzer.pypi_heuristics.metadata.anomalistic_version import AnomalisticVersionAnalyzer
1819
from macaron.malware_analyzer.pypi_heuristics.metadata.closer_release_join_date import CloserReleaseJoinDateAnalyzer
1920
from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer
2021
from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer
@@ -73,6 +74,7 @@ class MaliciousMetadataFacts(CheckFacts):
7374
CloserReleaseJoinDateAnalyzer,
7475
SuspiciousSetupAnalyzer,
7576
WheelAbsenceAnalyzer,
77+
AnomalisticVersionAnalyzer,
7678
]
7779

7880
# The HeuristicResult sequence is aligned with the sequence of ANALYZERS list
@@ -86,6 +88,7 @@ class MaliciousMetadataFacts(CheckFacts):
8688
HeuristicResult,
8789
HeuristicResult,
8890
HeuristicResult,
91+
HeuristicResult,
8992
],
9093
float,
9194
] = {
@@ -98,9 +101,26 @@ class MaliciousMetadataFacts(CheckFacts):
98101
HeuristicResult.FAIL, # Closer Release Join Date
99102
HeuristicResult.FAIL, # Suspicious Setup
100103
HeuristicResult.FAIL, # Wheel Absence
104+
HeuristicResult.FAIL, # Anomalistic Version
101105
# No project link, only one release, and the maintainer released it shortly
102106
# after account registration.
103107
# The setup.py file contains suspicious imports and .whl file isn't present.
108+
# Anomalistic version has no effect.
109+
): Confidence.HIGH,
110+
(
111+
HeuristicResult.FAIL, # Empty Project
112+
HeuristicResult.SKIP, # Unreachable Project Links
113+
HeuristicResult.FAIL, # One Release
114+
HeuristicResult.SKIP, # High Release Frequency
115+
HeuristicResult.SKIP, # Unchanged Release
116+
HeuristicResult.FAIL, # Closer Release Join Date
117+
HeuristicResult.FAIL, # Suspicious Setup
118+
HeuristicResult.FAIL, # Wheel Absence
119+
HeuristicResult.PASS, # Anomalistic Version
120+
# No project link, only one release, and the maintainer released it shortly
121+
# after account registration.
122+
# The setup.py file contains suspicious imports and .whl file isn't present.
123+
# Anomalistic version has no effect.
104124
): Confidence.HIGH,
105125
(
106126
HeuristicResult.FAIL, # Empty Project
@@ -111,6 +131,7 @@ class MaliciousMetadataFacts(CheckFacts):
111131
HeuristicResult.FAIL, # Closer Release Join Date
112132
HeuristicResult.FAIL, # Suspicious Setup
113133
HeuristicResult.FAIL, # Wheel Absence
134+
HeuristicResult.SKIP, # Anomalistic Version
114135
# No project link, frequent releases of multiple versions without modifying the content,
115136
# and the maintainer released it shortly after account registration.
116137
# The setup.py file contains suspicious imports and .whl file isn't present.
@@ -124,6 +145,7 @@ class MaliciousMetadataFacts(CheckFacts):
124145
HeuristicResult.FAIL, # Closer Release Join Date
125146
HeuristicResult.FAIL, # Suspicious Setup
126147
HeuristicResult.FAIL, # Wheel Absence
148+
HeuristicResult.SKIP, # Anomalistic Version
127149
# No project link, frequent releases of multiple versions,
128150
# and the maintainer released it shortly after account registration.
129151
# The setup.py file contains suspicious imports and .whl file isn't present.
@@ -137,6 +159,7 @@ class MaliciousMetadataFacts(CheckFacts):
137159
HeuristicResult.FAIL, # Closer Release Join Date
138160
HeuristicResult.PASS, # Suspicious Setup
139161
HeuristicResult.PASS, # Wheel Absence
162+
HeuristicResult.SKIP, # Anomalistic Version
140163
# No project link, frequent releases of multiple versions without modifying the content,
141164
# and the maintainer released it shortly after account registration. Presence/Absence of
142165
# .whl file has no effect
@@ -150,6 +173,7 @@ class MaliciousMetadataFacts(CheckFacts):
150173
HeuristicResult.FAIL, # Closer Release Join Date
151174
HeuristicResult.PASS, # Suspicious Setup
152175
HeuristicResult.FAIL, # Wheel Absence
176+
HeuristicResult.SKIP, # Anomalistic Version
153177
# No project link, frequent releases of multiple versions without modifying the content,
154178
# and the maintainer released it shortly after account registration. Presence/Absence of
155179
# .whl file has no effect
@@ -163,10 +187,56 @@ class MaliciousMetadataFacts(CheckFacts):
163187
HeuristicResult.FAIL, # Closer Release Join Date
164188
HeuristicResult.FAIL, # Suspicious Setup
165189
HeuristicResult.FAIL, # Wheel Absence
190+
HeuristicResult.SKIP, # Anomalistic Version
166191
# All project links are unreachable, frequent releases of multiple versions,
167192
# and the maintainer released it shortly after account registration.
168193
# The setup.py file contains suspicious imports and .whl file isn't present.
169194
): Confidence.HIGH,
195+
(
196+
HeuristicResult.FAIL, # Empty Project
197+
HeuristicResult.SKIP, # Unreachable Project Links
198+
HeuristicResult.FAIL, # One Release
199+
HeuristicResult.SKIP, # High Release Frequency
200+
HeuristicResult.SKIP, # Unchanged Release
201+
HeuristicResult.FAIL, # Closer Release Join Date
202+
HeuristicResult.PASS, # Suspicious Setup
203+
HeuristicResult.PASS, # Wheel Absence
204+
HeuristicResult.FAIL, # Anomalistic Version
205+
# No project link, only one release, and the maintainer released it shortly
206+
# after account registration.
207+
# The setup.py file has no effect and .whl file is present.
208+
# The version number is anomalistic.
209+
): Confidence.MEDIUM,
210+
(
211+
HeuristicResult.FAIL, # Empty Project
212+
HeuristicResult.SKIP, # Unreachable Project Links
213+
HeuristicResult.FAIL, # One Release
214+
HeuristicResult.SKIP, # High Release Frequency
215+
HeuristicResult.SKIP, # Unchanged Release
216+
HeuristicResult.FAIL, # Closer Release Join Date
217+
HeuristicResult.FAIL, # Suspicious Setup
218+
HeuristicResult.PASS, # Wheel Absence
219+
HeuristicResult.FAIL, # Anomalistic Version
220+
# No project link, only one release, and the maintainer released it shortly
221+
# after account registration.
222+
# The setup.py file has no effect and .whl file is present.
223+
# The version number is anomalistic.
224+
): Confidence.MEDIUM,
225+
(
226+
HeuristicResult.FAIL, # Empty Project
227+
HeuristicResult.SKIP, # Unreachable Project Links
228+
HeuristicResult.FAIL, # One Release
229+
HeuristicResult.SKIP, # High Release Frequency
230+
HeuristicResult.SKIP, # Unchanged Release
231+
HeuristicResult.FAIL, # Closer Release Join Date
232+
HeuristicResult.SKIP, # Suspicious Setup
233+
HeuristicResult.PASS, # Wheel Absence
234+
HeuristicResult.FAIL, # Anomalistic Version
235+
# No project link, only one release, and the maintainer released it shortly
236+
# after account registration.
237+
# The setup.py file has no effect and .whl file is present.
238+
# The version number is anomalistic.
239+
): Confidence.MEDIUM,
170240
}
171241

172242

0 commit comments

Comments
 (0)