From 434e322501f7f220f67c4ca4c18357393f75fe38 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Fri, 10 Jan 2025 09:57:10 +1000 Subject: [PATCH 1/3] feat: add in new metadata-based heuristic to pypi malware analyzer --- src/macaron/config/defaults.ini | 7 + .../pypi_heuristics/heuristics.py | 5 +- .../metadata/anomalistic_version.py | 223 +++++++++++++ .../checks/detect_malicious_metadata_check.py | 70 ++++ .../pypi/test_anomalistic_version.py | 299 ++++++++++++++++++ 5 files changed, 603 insertions(+), 1 deletion(-) create mode 100644 src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalistic_version.py create mode 100644 tests/malware_analyzer/pypi/test_anomalistic_version.py diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index a6b13a80c..65ff37d80 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -586,3 +586,10 @@ releases_frequency_threshold = 2 # The gap threshold. # The timedelta indicate the gap between the date maintainer registers their pypi's account and the date of latest release. timedelta_threshold_of_join_release = 5 + +# Any major version above this value is detected as anomalistic and marked as suspicious +major_threshold = 20 +# Any epoch number avove this value is detected as anomalistic and marked as suspicious +epoch_threshold = 3 +# The number of days +/- the day of publish the calendar versioning day may be +day_publish_error = 4 diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index d3e574027..6c7a3f61d 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Define the heuristic enum.""" @@ -34,6 +34,9 @@ class Heuristics(str, Enum): #: Indicates that the package does not include a .whl file WHEEL_ABSENCE = "wheel_absence" + #: Indicates that the package has an unusually large version number for a single release + ANOMALISTIC_VERSION = "anomalistic_version" + class HeuristicResult(str, Enum): """Result type indicating the outcome of a heuristic.""" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalistic_version.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalistic_version.py new file mode 100644 index 000000000..0307ae205 --- /dev/null +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalistic_version.py @@ -0,0 +1,223 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""The heuristic analyzer to check for an anomalistic package version.""" + +import logging +from enum import Enum + +from packaging.version import InvalidVersion, parse + +from macaron.config.defaults import defaults +from macaron.errors import HeuristicAnalyzerValueError +from macaron.json_tools import JsonType, json_extract +from macaron.malware_analyzer.datetime_parser import parse_datetime +from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics +from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset + +logger: logging.Logger = logging.getLogger(__name__) + + +class AnomalisticVersionAnalyzer(BaseHeuristicAnalyzer): + """ + Analyze the version number (if there is only a single release) to detect if it is anomalistic. 
+ + A version number is anomalistic if it is above the thresholds for an epoch, major, or minor value. + If the version does not adhere to PyPI standards (PEP 440, as per the 'packaging' module), this heuristic + cannot analyze it. + + Calendar versioning is detected as version numbers with the year, month and day present in the following combinations: + (using the example 11th October 2016) + - YYYY.MM.DD, e.g. 2016.10.11 + - YYYY.DD.MM, e.g. 2016.11.10 + - YY.DD.MM, e.g. 16.11.10 + - YY.MM.DD, e.g. 16.10.11 + - MM.DD.YYYY, e.g. 10.11.2016 + - DD.MM.YYYY, e.g. 11.10.2016 + - DD.MM.YY, e.g. 11.10.16 + - MM.DD.YY, e.g. 10.11.16 + - YYYYMMDD, e.g. 20161011 + - YYYYDDMM, e.g. 20161110 + - YYDDMM, e.g. 161110 + - YYMMDD, e.g. 161011 + - MMDDYYYY, e.g. 10112016 + - DDMMYYYY, e.g. 11102016 + - DDMMYY, e.g. 111016 + - MMDDYY, e.g. 101116 + This may be followed by further versioning (e.g. 2016.10.11.5.6.2). This type of versioning is detected based on the + date of the upload time for the release within a threshold of a number of days (in the defaults file). + + Calendar-semantic versioning is detected as version numbers with the major value as the year (either yyyy or yy), + and any other series of numbers following it: + - 2016.7.1 would be version 7.1 of 2016 + - 16.1.4 would be version 1.4 of 2016 + This type of versioning is detected based on the exact year of the upload time for the release. + + All other versionings are detected as semantic versioning. 
+ """ + + DETAIL_INFO_KEY: str = "versioning" + DIGIT_DATE_FORMATS: list[str] = ["%Y%m%d", "%Y%d%m", "%d%m%Y", "%m%d%Y", "%y%m%d", "%y%d%m", "%d%m%y", "%m%d%y"] + + def __init__(self) -> None: + super().__init__( + name="anomalistic_version_analyzer", + heuristic=Heuristics.ANOMALISTIC_VERSION, + depends_on=[(Heuristics.ONE_RELEASE, HeuristicResult.FAIL)], + ) + self.major_threshold, self.epoch_threshold, self.day_publish_error = self._load_defaults() + + def _load_defaults(self) -> tuple[int, int, int]: + """Load default settings from defaults.ini.""" + section_name = "heuristic.pypi" + if defaults.has_section(section_name): + section = defaults[section_name] + return ( + section.getint("major_threshold"), + section.getint("epoch_threshold"), + section.getint("day_publish_error"), + ) + # Major threshold, Epoch threshold, Day publish error + return 20, 3, 4 + + def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + """Analyze the package. + + Parameters + ---------- + pypi_package_json: PyPIPackageJsonAsset + The PyPI package JSON asset object. + + Returns + ------- + tuple[HeuristicResult, dict[str, JsonType]]: + The result and related information collected during the analysis. + + Raises + ------ + HeuristicAnalyzerValueError + if there is no release information available. + """ + releases = pypi_package_json.get_releases() + if releases is None: # no release information + error_msg = "There is no information for any release of this package." + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + if len(releases) != 1: + error_msg = ( + "This heuristic depends on a single release, but somehow there are multiple when the one release" + + " heuristic failed." 
+ ) + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + # Since there is only one release, the latest version should be that release + release = pypi_package_json.get_latest_version() + if release is None: + error_msg = "No latest version information available" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + try: + release_metadata = releases[release] + except KeyError as release_error: + error_msg = "The latest release is not available in the list of releases" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) from release_error + + try: + version = parse(release) + except InvalidVersion: + return HeuristicResult.SKIP, {self.DETAIL_INFO_KEY: Versioning.INVALID.value} + + years = [] + months = [] + publish_days = [] + + for distribution in release_metadata: + upload_time = json_extract(distribution, ["upload_time"], str) + if upload_time is None: + error_msg = "Missing upload time from release information" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + parsed_time = parse_datetime(upload_time) + if parsed_time is None: + error_msg = "Upload time is not of the expected PyPI format" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + years.append(parsed_time.year) + years.append(parsed_time.year % 100) # last 2 digits + months.append(parsed_time.month) + publish_days.append(parsed_time.day) + + days = list(range(min(publish_days) - self.day_publish_error, max(publish_days) + self.day_publish_error + 1)) + + calendar = False + calendar_semantic = False + + # check for year YY[YY]... + if version.major in years: + # calendar versioning: YY[YY].(M[M].D[D])(D[D].M[M])... + if (version.minor in months and version.micro in days) or ( + version.minor in days and version.micro in months + ): + calendar = True + else: + calendar_semantic = True + # check for calendar versioning: M[M].D[D].YY[YY]... or D[D].M[M].YY[YY]... 
or the whole digit representing a datetime + elif ( + ((version.major in months and version.minor in days) or (version.major in days and version.minor in months)) + and version.micro in years + ) or self.__integer_date(version.major, years, months, days): + # must include day and year for this to be calendar + calendar = True + + if calendar: # just check epoch + detail_info: dict[str, JsonType] = {self.DETAIL_INFO_KEY: Versioning.CALENDAR.value} + if version.epoch > self.epoch_threshold: + return HeuristicResult.FAIL, detail_info + + return HeuristicResult.PASS, detail_info + + if calendar_semantic: # check minor (as major) and epoch + detail_info = {self.DETAIL_INFO_KEY: Versioning.CALENDAR_SEMANTIC.value} + + if version.epoch > self.epoch_threshold: + return HeuristicResult.FAIL, detail_info + if version.minor > self.major_threshold: + return HeuristicResult.FAIL, detail_info + + return HeuristicResult.PASS, detail_info + + # semantic versioning + detail_info = {self.DETAIL_INFO_KEY: Versioning.SEMANTIC.value} + + if version.epoch > self.epoch_threshold: + return HeuristicResult.FAIL, detail_info + if version.major > self.major_threshold: + return HeuristicResult.FAIL, detail_info + + return HeuristicResult.PASS, detail_info + + def __integer_date(self, value: int, years: list[int], months: list[int], days: list[int]) -> bool: + for date_format in self.DIGIT_DATE_FORMATS: + if (date := parse_datetime(str(value), date_format)) is None: + continue + + if date.year in years and date.month in months and date.day in days: + return True + + return False + + +class Versioning(Enum): + """Enum used to assign different versioning methods.""" + + INVALID = "invalid" + CALENDAR = "calendar" + CALENDAR_SEMANTIC = "calendar_semantic" + SEMANTIC = "semantic" diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index f9c75d64f..181f27600 100644 --- 
a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -15,6 +15,7 @@ from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics +from macaron.malware_analyzer.pypi_heuristics.metadata.anomalistic_version import AnomalisticVersionAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.closer_release_join_date import CloserReleaseJoinDateAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer @@ -75,6 +76,7 @@ class MaliciousMetadataFacts(CheckFacts): CloserReleaseJoinDateAnalyzer, SuspiciousSetupAnalyzer, WheelAbsenceAnalyzer, + AnomalisticVersionAnalyzer, ] @@ -89,6 +91,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult, HeuristicResult, HeuristicResult, + HeuristicResult, ], float, ] = { @@ -101,9 +104,26 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence + HeuristicResult.FAIL, # Anomalistic Version # No project link, only one release, and the maintainer released it shortly # after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. + # Anomalistic version has no effect. 
+ ): Confidence.HIGH, + ( + HeuristicResult.FAIL, # Empty Project + HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.FAIL, # One Release + HeuristicResult.SKIP, # High Release Frequency + HeuristicResult.SKIP, # Unchanged Release + HeuristicResult.FAIL, # Closer Release Join Date + HeuristicResult.FAIL, # Suspicious Setup + HeuristicResult.FAIL, # Wheel Absence + HeuristicResult.PASS, # Anomalistic Version + # No project link, only one release, and the maintainer released it shortly + # after account registration. + # The setup.py file contains suspicious imports and .whl file isn't present. + # Anomalistic version has no effect. ): Confidence.HIGH, ( HeuristicResult.FAIL, # Empty Project @@ -114,6 +134,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence + HeuristicResult.SKIP, # Anomalistic Version # No project link, frequent releases of multiple versions without modifying the content, # and the maintainer released it shortly after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. @@ -127,6 +148,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence + HeuristicResult.SKIP, # Anomalistic Version # No project link, frequent releases of multiple versions, # and the maintainer released it shortly after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. 
@@ -140,6 +162,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.PASS, # Suspicious Setup HeuristicResult.PASS, # Wheel Absence + HeuristicResult.SKIP, # Anomalistic Version # No project link, frequent releases of multiple versions without modifying the content, # and the maintainer released it shortly after account registration. Presence/Absence of # .whl file has no effect @@ -153,6 +176,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.PASS, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence + HeuristicResult.SKIP, # Anomalistic Version # No project link, frequent releases of multiple versions without modifying the content, # and the maintainer released it shortly after account registration. Presence/Absence of # .whl file has no effect @@ -166,10 +190,56 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence + HeuristicResult.SKIP, # Anomalistic Version # All project links are unreachable, frequent releases of multiple versions, # and the maintainer released it shortly after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. ): Confidence.HIGH, + ( + HeuristicResult.FAIL, # Empty Project + HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.FAIL, # One Release + HeuristicResult.SKIP, # High Release Frequency + HeuristicResult.SKIP, # Unchanged Release + HeuristicResult.FAIL, # Closer Release Join Date + HeuristicResult.PASS, # Suspicious Setup + HeuristicResult.PASS, # Wheel Absence + HeuristicResult.FAIL, # Anomalistic Version + # No project link, only one release, and the maintainer released it shortly + # after account registration. + # The setup.py file has no effect and .whl file is present. + # The version number is anomalistic. 
+ ): Confidence.MEDIUM, + ( + HeuristicResult.FAIL, # Empty Project + HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.FAIL, # One Release + HeuristicResult.SKIP, # High Release Frequency + HeuristicResult.SKIP, # Unchanged Release + HeuristicResult.FAIL, # Closer Release Join Date + HeuristicResult.FAIL, # Suspicious Setup + HeuristicResult.PASS, # Wheel Absence + HeuristicResult.FAIL, # Anomalistic Version + # No project link, only one release, and the maintainer released it shortly + # after account registration. + # The setup.py file has no effect and .whl file is present. + # The version number is anomalistic. + ): Confidence.MEDIUM, + ( + HeuristicResult.FAIL, # Empty Project + HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.FAIL, # One Release + HeuristicResult.SKIP, # High Release Frequency + HeuristicResult.SKIP, # Unchanged Release + HeuristicResult.FAIL, # Closer Release Join Date + HeuristicResult.SKIP, # Suspicious Setup + HeuristicResult.PASS, # Wheel Absence + HeuristicResult.FAIL, # Anomalistic Version + # No project link, only one release, and the maintainer released it shortly + # after account registration. + # The setup.py file has no effect and .whl file is present. + # The version number is anomalistic. + ): Confidence.MEDIUM, } diff --git a/tests/malware_analyzer/pypi/test_anomalistic_version.py b/tests/malware_analyzer/pypi/test_anomalistic_version.py new file mode 100644 index 000000000..9f24bd07c --- /dev/null +++ b/tests/malware_analyzer/pypi/test_anomalistic_version.py @@ -0,0 +1,299 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+ +"""Tests for heuristic detecting anomalistic version numbers""" +from unittest.mock import MagicMock + +import pytest + +from macaron.errors import HeuristicAnalyzerValueError +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.metadata.anomalistic_version import AnomalisticVersionAnalyzer, Versioning + + +def test_analyze_no_information(pypi_package_json: MagicMock) -> None: + """Test for when there is no release information, so error""" + analyzer = AnomalisticVersionAnalyzer() + + pypi_package_json.get_releases.return_value = None + + with pytest.raises(HeuristicAnalyzerValueError): + analyzer.analyze(pypi_package_json) + + +def test_analyze_invalid_time(pypi_package_json: MagicMock) -> None: + """Test for when the supplied upload time does not conform with PEP 440, so error.""" + analyzer = AnomalisticVersionAnalyzer() + version = "1.1" + release = { + version: [ + { + "comment_text": "", + "digests": { + "blake2b_256": "defa2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3", + "md5": "9203bbb130f8ddb38269f4861c170d04", + "sha256": "168bcccbf5106132e90b85659297700194369b8f6b3e5a03769614f0d200e370", + }, + "downloads": -1, + "filename": "ttttttttest_nester.py-0.1.0.tar.gz", + "has_sig": False, + "md5_digest": "9203bbb130f8ddb38269f4861c170d04", + "packagetype": "sdist", + "python_version": "source", + "requires_python": None, + "size": 546, + "upload_time": "September 2 2022 5:42pm 27s", + "upload_time_iso_8601": "2022-09-02T05:42:27.073842Z", + "url": "https://files.pythonhosted.org/packages/de/fa/" + + "2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3/ttttttttest_nester.py-0.1.0.tar.gz", + "yanked": False, + "yanked_reason": None, + } + ] + } + + pypi_package_json.get_releases.return_value = release + pypi_package_json.get_latest_version.return_value = version + + with pytest.raises(HeuristicAnalyzerValueError): + analyzer.analyze(pypi_package_json) + + +def 
test_analyze_no_time(pypi_package_json: MagicMock) -> None: + """Test for when there is no supplied upload time, so error.""" + analyzer = AnomalisticVersionAnalyzer() + version = "1.1" + release = { + version: [ + { + "comment_text": "", + "digests": { + "blake2b_256": "defa2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3", + "md5": "9203bbb130f8ddb38269f4861c170d04", + "sha256": "168bcccbf5106132e90b85659297700194369b8f6b3e5a03769614f0d200e370", + }, + "downloads": -1, + "filename": "ttttttttest_nester.py-0.1.0.tar.gz", + "has_sig": False, + "md5_digest": "9203bbb130f8ddb38269f4861c170d04", + "packagetype": "sdist", + "python_version": "source", + "requires_python": None, + "size": 546, + "url": "https://files.pythonhosted.org/packages/de/fa/" + + "2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3/ttttttttest_nester.py-0.1.0.tar.gz", + "yanked": False, + "yanked_reason": None, + } + ] + } + + pypi_package_json.get_releases.return_value = release + pypi_package_json.get_latest_version.return_value = version + + with pytest.raises(HeuristicAnalyzerValueError): + analyzer.analyze(pypi_package_json) + + +@pytest.mark.parametrize( + ("version", "upload_date", "result", "versioning"), + [ + pytest.param( + "2016-10-13", "2016-10-13", HeuristicResult.SKIP, Versioning.INVALID.value, id="test_invalid_version" + ), + pytest.param( + "2016.10.11", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_YYYY.MM.DD_pass", + ), + pytest.param( + "2016.12.10", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_YYYY.DD.MM_pass", + ), + pytest.param( + "16.10.13", "2016-10-13", HeuristicResult.PASS, Versioning.CALENDAR.value, id="test_calendar_YY.DD.MM_pass" + ), + pytest.param( + "16.14.10", "2016-10-13", HeuristicResult.PASS, Versioning.CALENDAR.value, id="test_calendar_YY.MM.DD_pass" + ), + pytest.param( + "10.10.2016", + "2016-10-13", + HeuristicResult.PASS, + 
Versioning.CALENDAR.value, + id="test_calendar_MM.DD.YYYY_pass", + ), + pytest.param( + "9.10.2016", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_DD.MM.YYYY_pass", + ), + pytest.param( + "10.15.16", "2016-10-13", HeuristicResult.PASS, Versioning.CALENDAR.value, id="test_calendar_DD.MM.YY_pass" + ), + pytest.param( + "16.10.16", "2016-10-13", HeuristicResult.PASS, Versioning.CALENDAR.value, id="test_calendar_MM.DD.YY_pass" + ), + pytest.param( + "20161011.0", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_YYYYMMDD_pass", + ), + pytest.param( + "20161210.6.1", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_YYYYDDMM_pass", + ), + pytest.param( + "161013.9.0.5", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_YYDDMM_pass", + ), + pytest.param( + "161410.2.5.7", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_YYMMDD_pass", + ), + pytest.param( + "10102016.0", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_MMDDYYYY_pass", + ), + pytest.param( + "09102016", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_DDMMYYYY_pass", + ), + pytest.param( + "101516.5.7", "2016-10-13", HeuristicResult.PASS, Versioning.CALENDAR.value, id="test_calendar_DDMMYY_pass" + ), + pytest.param( + "161016.0.0.0.0", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_MMDDYY_pass", + ), + pytest.param( + "2!16.10.17.2.5.3", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR.value, + id="test_calendar_epoch_pass", + ), + pytest.param( + "100!2016.10.14", + "2016-10-13", + HeuristicResult.FAIL, + Versioning.CALENDAR.value, + id="test_calendar_epoch_fail", + ), + pytest.param( + "2016.7.2", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR_SEMANTIC.value, + 
id="test_calendar_semantic_pass", + ), + pytest.param( + "2016.100.0", + "2016-10-13", + HeuristicResult.FAIL, + Versioning.CALENDAR_SEMANTIC.value, + id="test_calendar_semantic_fail", + ), + pytest.param( + "2!2016.1.5.6", + "2016-10-13", + HeuristicResult.PASS, + Versioning.CALENDAR_SEMANTIC.value, + id="test_calendar_semantic_epoch_pass", + ), + pytest.param( + "100!2016.1", + "2016-10-13", + HeuristicResult.FAIL, + Versioning.CALENDAR_SEMANTIC.value, + id="test_calendar_semantic_epoch_fail", + ), + pytest.param("3.1", "2016-10-13", HeuristicResult.PASS, Versioning.SEMANTIC.value, id="test_semantic_pass"), + pytest.param("999", "2016-10-13", HeuristicResult.FAIL, Versioning.SEMANTIC.value, id="test_semantic_fail"), + pytest.param( + "3!0.1.9999", "2016-10-13", HeuristicResult.PASS, Versioning.SEMANTIC.value, id="test_semantic_epoch_pass" + ), + pytest.param( + "999!0.0.0", "2016-10-13", HeuristicResult.FAIL, Versioning.SEMANTIC.value, id="test_semantic_epoch_fail" + ), + ], +) +def test_analyze( + pypi_package_json: MagicMock, version: str, upload_date: str, result: HeuristicResult, versioning: str +) -> None: + """ + Generic test for the expected return value of the anomalistic version heuristic. + + Parameters + ---------- + version : str + the version number for the test package. + upload_date : str + the date of when the test package was uploaded. + result : HeuristicResult + the expected result the heuristic should arrive at. + versioning : str + which versioning system the heuristic should have identified. 
+ """ + analyzer = AnomalisticVersionAnalyzer() + release = { + version: [ + { + "comment_text": "", + "digests": { + "blake2b_256": "defa2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3", + "md5": "9203bbb130f8ddb38269f4861c170d04", + "sha256": "168bcccbf5106132e90b85659297700194369b8f6b3e5a03769614f0d200e370", + }, + "downloads": -1, + "filename": "ttttttttest_nester.py-0.1.0.tar.gz", + "has_sig": False, + "md5_digest": "9203bbb130f8ddb38269f4861c170d04", + "packagetype": "sdist", + "python_version": "source", + "requires_python": None, + "size": 546, + "upload_time": f"{upload_date}T05:42:27", + "upload_time_iso_8601": f"{upload_date}T05:42:27.073842Z", + "url": "https://files.pythonhosted.org/packages/de/fa/" + + "2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3/ttttttttest_nester.py-0.1.0.tar.gz", + "yanked": False, + "yanked_reason": None, + } + ] + } + + pypi_package_json.get_releases.return_value = release + pypi_package_json.get_latest_version.return_value = version + expected_result: tuple[HeuristicResult, dict] = (result, {AnomalisticVersionAnalyzer.DETAIL_INFO_KEY: versioning}) + + actual_result = analyzer.analyze(pypi_package_json) + + assert actual_result == expected_result From ad04616d3e3dac7e6a7e4cc1440ebc90b1844183 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 13 Jan 2025 10:42:28 +1000 Subject: [PATCH 2/3] refactor: reworded 'anomalistic' to 'anomalous' --- src/macaron/config/defaults.ini | 9 ++--- .../pypi_heuristics/heuristics.py | 6 ++-- ...listic_version.py => anomalous_version.py} | 12 +++---- .../checks/detect_malicious_metadata_check.py | 34 +++++++++---------- ...c_version.py => test_anomalous_version.py} | 16 ++++----- 5 files changed, 39 insertions(+), 38 deletions(-) rename src/macaron/malware_analyzer/pypi_heuristics/metadata/{anomalistic_version.py => anomalous_version.py} (95%) rename tests/malware_analyzer/pypi/{test_anomalistic_version.py => test_anomalous_version.py} (95%) diff --git 
a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 65ff37d80..f895c20aa 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -584,12 +584,13 @@ include = * [heuristic.pypi] releases_frequency_threshold = 2 # The gap threshold. -# The timedelta indicate the gap between the date maintainer registers their pypi's account and the date of latest release. +# The timedelta represents the gap between the date the maintainer registers their pypi account and the +# date of the latest release. timedelta_threshold_of_join_release = 5 -# Any major version above this value is detected as anomalistic and marked as suspicious +# Any major version above this value is detected as anomalous and marked as suspicious. major_threshold = 20 -# Any epoch number avove this value is detected as anomalistic and marked as suspicious +# Any epoch number above this value is detected as anomalous and marked as suspicious. epoch_threshold = 3 -# The number of days +/- the day of publish the calendar versioning day may be +# The number of days (+/-) by which the calendar versioning day may differ from the day of publication. day_publish_error = 4 diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index 6c7a3f61d..1bd724fad 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -31,11 +31,11 @@ class Heuristics(str, Enum): #: Indicates that the setup.py file contains suspicious imports, such as base64 and requests. SUSPICIOUS_SETUP = "suspicious_setup" - #: Indicates that the package does not include a .whl file + #: Indicates that the package does not include a .whl file. 
WHEEL_ABSENCE = "wheel_absence" - #: Indicates that the package has an unusually large version number for a single release - ANOMALISTIC_VERSION = "anomalistic_version" + #: Indicates that the package has an unusually large version number for a single release. + ANOMALOUS_VERSION = "anomalous_version" class HeuristicResult(str, Enum): diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalistic_version.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py similarity index 95% rename from src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalistic_version.py rename to src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py index 0307ae205..2f39d6f70 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalistic_version.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. -"""The heuristic analyzer to check for an anomalistic package version.""" +"""The heuristic analyzer to check for an anomalous package version.""" import logging from enum import Enum @@ -19,11 +19,11 @@ logger: logging.Logger = logging.getLogger(__name__) -class AnomalisticVersionAnalyzer(BaseHeuristicAnalyzer): +class AnomalousVersionAnalyzer(BaseHeuristicAnalyzer): """ - Analyze the version number (if there is only a single release) to detect if it is anomalistic. + Analyze the version number (if there is only a single release) to detect if it is anomalous. - A version number is anomalistic if it is above the thresholds for an epoch, major, or minor value. + A version number is anomalous if any of its values are greater than the epoch, major, or minor threshold values. 
If the version does not adhere to PyPI standards (PEP 440, as per the 'packaging' module), this heuristic cannot analyze it. @@ -62,8 +62,8 @@ class AnomalisticVersionAnalyzer(BaseHeuristicAnalyzer): def __init__(self) -> None: super().__init__( - name="anomalistic_version_analyzer", - heuristic=Heuristics.ANOMALISTIC_VERSION, + name="anomalous_version_analyzer", + heuristic=Heuristics.ANOMALOUS_VERSION, depends_on=[(Heuristics.ONE_RELEASE, HeuristicResult.FAIL)], ) self.major_threshold, self.epoch_threshold, self.day_publish_error = self._load_defaults() diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 181f27600..0e2fe0039 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -15,7 +15,7 @@ from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics -from macaron.malware_analyzer.pypi_heuristics.metadata.anomalistic_version import AnomalisticVersionAnalyzer +from macaron.malware_analyzer.pypi_heuristics.metadata.anomalous_version import AnomalousVersionAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.closer_release_join_date import CloserReleaseJoinDateAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer @@ -76,7 +76,7 @@ class MaliciousMetadataFacts(CheckFacts): CloserReleaseJoinDateAnalyzer, SuspiciousSetupAnalyzer, WheelAbsenceAnalyzer, - AnomalisticVersionAnalyzer, + AnomalousVersionAnalyzer, ] @@ -104,11 +104,11 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer 
Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.FAIL, # Anomalistic Version + HeuristicResult.FAIL, # Anomalous Version # No project link, only one release, and the maintainer released it shortly # after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. - # Anomalistic version has no effect. + # Anomalous version has no effect. ): Confidence.HIGH, ( HeuristicResult.FAIL, # Empty Project @@ -119,11 +119,11 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.PASS, # Anomalistic Version + HeuristicResult.PASS, # Anomalous Version # No project link, only one release, and the maintainer released it shortly # after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. - # Anomalistic version has no effect. + # Anomalous version has no effect. ): Confidence.HIGH, ( HeuristicResult.FAIL, # Empty Project @@ -134,7 +134,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalistic Version + HeuristicResult.SKIP, # Anomalous Version # No project link, frequent releases of multiple versions without modifying the content, # and the maintainer released it shortly after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. 
@@ -148,7 +148,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalistic Version + HeuristicResult.SKIP, # Anomalous Version # No project link, frequent releases of multiple versions, # and the maintainer released it shortly after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. @@ -162,7 +162,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.PASS, # Suspicious Setup HeuristicResult.PASS, # Wheel Absence - HeuristicResult.SKIP, # Anomalistic Version + HeuristicResult.SKIP, # Anomalous Version # No project link, frequent releases of multiple versions without modifying the content, # and the maintainer released it shortly after account registration. Presence/Absence of # .whl file has no effect @@ -176,7 +176,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.PASS, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalistic Version + HeuristicResult.SKIP, # Anomalous Version # No project link, frequent releases of multiple versions without modifying the content, # and the maintainer released it shortly after account registration. Presence/Absence of # .whl file has no effect @@ -190,7 +190,7 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalistic Version + HeuristicResult.SKIP, # Anomalous Version # All project links are unreachable, frequent releases of multiple versions, # and the maintainer released it shortly after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. 
@@ -204,11 +204,11 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.PASS, # Suspicious Setup HeuristicResult.PASS, # Wheel Absence - HeuristicResult.FAIL, # Anomalistic Version + HeuristicResult.FAIL, # Anomalous Version # No project link, only one release, and the maintainer released it shortly # after account registration. # The setup.py file has no effect and .whl file is present. - # The version number is anomalistic. + # The version number is anomalous. ): Confidence.MEDIUM, ( HeuristicResult.FAIL, # Empty Project @@ -219,11 +219,11 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.PASS, # Wheel Absence - HeuristicResult.FAIL, # Anomalistic Version + HeuristicResult.FAIL, # Anomalous Version # No project link, only one release, and the maintainer released it shortly # after account registration. # The setup.py file has no effect and .whl file is present. - # The version number is anomalistic. + # The version number is anomalous. ): Confidence.MEDIUM, ( HeuristicResult.FAIL, # Empty Project @@ -234,11 +234,11 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Closer Release Join Date HeuristicResult.SKIP, # Suspicious Setup HeuristicResult.PASS, # Wheel Absence - HeuristicResult.FAIL, # Anomalistic Version + HeuristicResult.FAIL, # Anomalous Version # No project link, only one release, and the maintainer released it shortly # after account registration. # The setup.py file has no effect and .whl file is present. - # The version number is anomalistic. + # The version number is anomalous. 
): Confidence.MEDIUM, } diff --git a/tests/malware_analyzer/pypi/test_anomalistic_version.py b/tests/malware_analyzer/pypi/test_anomalous_version.py similarity index 95% rename from tests/malware_analyzer/pypi/test_anomalistic_version.py rename to tests/malware_analyzer/pypi/test_anomalous_version.py index 9f24bd07c..3edd73d0c 100644 --- a/tests/malware_analyzer/pypi/test_anomalistic_version.py +++ b/tests/malware_analyzer/pypi/test_anomalous_version.py @@ -1,19 +1,19 @@ # Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. -"""Tests for heuristic detecting anomalistic version numbers""" +"""Tests for heuristic detecting anomalous version numbers""" from unittest.mock import MagicMock import pytest from macaron.errors import HeuristicAnalyzerValueError from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult -from macaron.malware_analyzer.pypi_heuristics.metadata.anomalistic_version import AnomalisticVersionAnalyzer, Versioning +from macaron.malware_analyzer.pypi_heuristics.metadata.anomalous_version import AnomalousVersionAnalyzer, Versioning def test_analyze_no_information(pypi_package_json: MagicMock) -> None: """Test for when there is no release information, so error""" - analyzer = AnomalisticVersionAnalyzer() + analyzer = AnomalousVersionAnalyzer() pypi_package_json.get_releases.return_value = None @@ -23,7 +23,7 @@ def test_analyze_no_information(pypi_package_json: MagicMock) -> None: def test_analyze_invalid_time(pypi_package_json: MagicMock) -> None: """Test for when the supplied upload time does not conform with PEP 440, so error.""" - analyzer = AnomalisticVersionAnalyzer() + analyzer = AnomalousVersionAnalyzer() version = "1.1" release = { version: [ @@ -61,7 +61,7 @@ def test_analyze_invalid_time(pypi_package_json: MagicMock) -> None: def test_analyze_no_time(pypi_package_json: MagicMock) -> None: 
"""Test for when there is no supplied upload time, so error.""" - analyzer = AnomalisticVersionAnalyzer() + analyzer = AnomalousVersionAnalyzer() version = "1.1" release = { version: [ @@ -249,7 +249,7 @@ def test_analyze( pypi_package_json: MagicMock, version: str, upload_date: str, result: HeuristicResult, versioning: str ) -> None: """ - Generic test for the expected return value of the anomalistic version heuristic. + Generic test for the expected return value of the anomalous version heuristic. Parameters ---------- @@ -262,7 +262,7 @@ def test_analyze( versioning : str which versioning system the heuristic should have identified. """ - analyzer = AnomalisticVersionAnalyzer() + analyzer = AnomalousVersionAnalyzer() release = { version: [ { @@ -292,7 +292,7 @@ def test_analyze( pypi_package_json.get_releases.return_value = release pypi_package_json.get_latest_version.return_value = version - expected_result: tuple[HeuristicResult, dict] = (result, {AnomalisticVersionAnalyzer.DETAIL_INFO_KEY: versioning}) + expected_result: tuple[HeuristicResult, dict] = (result, {AnomalousVersionAnalyzer.DETAIL_INFO_KEY: versioning}) actual_result = analyzer.analyze(pypi_package_json) From a829ce4b737cb303381ca28a10340482c1bc4ba9 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 13 Jan 2025 16:50:47 +1000 Subject: [PATCH 3/3] chore: included docstrings for integer date function --- .../metadata/anomalous_version.py | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py index 2f39d6f70..f02c4f595 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py @@ -69,7 +69,13 @@ def __init__(self) -> None: self.major_threshold, self.epoch_threshold, self.day_publish_error = 
self._load_defaults() def _load_defaults(self) -> tuple[int, int, int]: - """Load default settings from defaults.ini.""" + """Load default settings from defaults.ini. + + Returns + ------- + tuple[int, int, int]: + The Major threshold, Epoch threshold, and Day published error. + """ section_name = "heuristic.pypi" if defaults.has_section(section_name): section = defaults[section_name] @@ -78,7 +84,6 @@ def _load_defaults(self) -> tuple[int, int, int]: section.getint("epoch_threshold"), section.getint("day_publish_error"), ) - # Major threshold, Epoch threshold, Day pushlish error return 20, 3, 4 def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: @@ -172,7 +177,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes elif ( ((version.major in months and version.minor in days) or (version.major in days and version.minor in months)) and version.micro in years - ) or self.__integer_date(version.major, years, months, days): + ) or self._integer_date(version.major, years, months, days): # must include day and year for this to be calendar calendar = True @@ -203,7 +208,36 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes return HeuristicResult.PASS, detail_info - def __integer_date(self, value: int, years: list[int], months: list[int], days: list[int]) -> bool: + def _integer_date(self, value: int, years: list[int], months: list[int], days: list[int]) -> bool: + """Check whether the provided integer represents a date. + + Valid representations are: + - YYYYMMDD + - YYYYDDMM + - YYDDMM + - YYMMDD + - MMDDYYYY + - DDMMYYYY + - DDMMYY + - MMDDYY + + Parameters + ---------- + value: int + The integer to check. + years: list[int] + A list of integers representing valid years for components of value to represent. + months: list[int] + A list of integers representing valid months for components of value to represent. 
+ days: list[int] + A list of integers representing valid days for components of value to represent. + + Returns + ------- + bool: + True if the integer may represent a date present in the list of valid years, months and days. + False otherwise. + """ for date_format in self.DIGIT_DATE_FORMATS: if (date := parse_datetime(str(value), date_format)) is None: continue