From 6e40ec938d66a68a9ea824be03d56e8c9b5df94d Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 3 Mar 2025 17:01:43 +1000 Subject: [PATCH 1/6] refactor: switched out suspicious combo dictionary for problog logical statements Signed-off-by: Carl Flottmann --- pyproject.toml | 2 + .../checks/detect_malicious_metadata_check.py | 254 +++++++----------- 2 files changed, 92 insertions(+), 164 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index feafa2b42..d46835842 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "cyclonedx-bom >=4.0.0,<5.0.0", "cyclonedx-python-lib[validation] >=7.3.4,<8.0.0", "beautifulsoup4 >= 4.12.0,<5.0.0", + "problog >= 2.2.6,<3.0.0", ] keywords = [] # https://pypi.org/classifiers/ @@ -203,6 +204,7 @@ module = [ "gitdb.*", "yamale.*", "defusedxml.*", + "problog.*", ] ignore_missing_imports = true diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 5cb058ca0..a5737f67d 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -6,6 +6,9 @@ import logging import requests +from problog import get_evaluatable +from problog.logic import Term +from problog.program import PrologString from sqlalchemy import ForeignKey, String from sqlalchemy.orm import Mapped, mapped_column @@ -79,168 +82,58 @@ class MaliciousMetadataFacts(CheckFacts): AnomalousVersionAnalyzer, ] - -# The HeuristicResult sequence is aligned with the sequence of ANALYZERS list -SUSPICIOUS_COMBO: dict[ - tuple[ - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - ], - float, -] = { - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.FAIL, # One Release - 
HeuristicResult.SKIP, # High Release Frequency - HeuristicResult.SKIP, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.FAIL, # Anomalous Version - # No project link, only one release, and the maintainer released it shortly - # after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - # Anomalous version has no effect. - ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.FAIL, # One Release - HeuristicResult.SKIP, # High Release Frequency - HeuristicResult.SKIP, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.PASS, # Anomalous Version - # No project link, only one release, and the maintainer released it shortly - # after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - # Anomalous version has no effect. - ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.FAIL, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No project link, frequent releases of multiple versions without modifying the content, - # and the maintainer released it shortly after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. 
- ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.PASS, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No project link, frequent releases of multiple versions, - # and the maintainer released it shortly after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.FAIL, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.PASS, # Suspicious Setup - HeuristicResult.PASS, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No project link, frequent releases of multiple versions without modifying the content, - # and the maintainer released it shortly after account registration. Presence/Absence of - # .whl file has no effect - ): Confidence.MEDIUM, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.FAIL, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.PASS, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No project link, frequent releases of multiple versions without modifying the content, - # and the maintainer released it shortly after account registration. 
Presence/Absence of - # .whl file has no effect - ): Confidence.MEDIUM, - ( - HeuristicResult.PASS, # Empty Project - HeuristicResult.FAIL, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.PASS, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No source code repo, frequent releases of multiple versions, - # and the maintainer released it shortly after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.FAIL, # One Release - HeuristicResult.SKIP, # High Release Frequency - HeuristicResult.SKIP, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.PASS, # Suspicious Setup - HeuristicResult.PASS, # Wheel Absence - HeuristicResult.FAIL, # Anomalous Version - # No project link, only one release, and the maintainer released it shortly - # after account registration. - # The setup.py file has no effect and .whl file is present. - # The version number is anomalous. - ): Confidence.MEDIUM, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.FAIL, # One Release - HeuristicResult.SKIP, # High Release Frequency - HeuristicResult.SKIP, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.PASS, # Wheel Absence - HeuristicResult.FAIL, # Anomalous Version - # No project link, only one release, and the maintainer released it shortly - # after account registration. - # The setup.py file has no effect and .whl file is present. - # The version number is anomalous. 
- ): Confidence.MEDIUM, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.FAIL, # One Release - HeuristicResult.SKIP, # High Release Frequency - HeuristicResult.SKIP, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.SKIP, # Suspicious Setup - HeuristicResult.PASS, # Wheel Absence - HeuristicResult.FAIL, # Anomalous Version - # No project link, only one release, and the maintainer released it shortly - # after account registration. - # The setup.py file has no effect and .whl file is present. - # The version number is anomalous. - ): Confidence.MEDIUM, -} +RESULT = "result" + +STATIC_PROBLOG_MODEL = f""" +{Confidence.HIGH.value}::high :- + not {Heuristics.EMPTY_PROJECT_LINK.value}, + not {Heuristics.ONE_RELEASE.value}, + not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, + not {Heuristics.SUSPICIOUS_SETUP.value}, + not {Heuristics.WHEEL_ABSENCE.value}. +{Confidence.HIGH.value}::high :- + not {Heuristics.EMPTY_PROJECT_LINK.value}, + {Heuristics.ONE_RELEASE.value}, + not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, + not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, + not {Heuristics.SUSPICIOUS_SETUP.value}, + not {Heuristics.WHEEL_ABSENCE.value}. +{Confidence.HIGH.value}::high :- + {Heuristics.EMPTY_PROJECT_LINK.value}, + not {Heuristics.SOURCE_CODE_REPO.value}, + {Heuristics.ONE_RELEASE.value}, + not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, + {Heuristics.UNCHANGED_RELEASE.value}, + not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, + not {Heuristics.SUSPICIOUS_SETUP.value}, + not {Heuristics.WHEEL_ABSENCE.value}. + +{Confidence.MEDIUM.value}::medium :- + not {Heuristics.EMPTY_PROJECT_LINK.value}, + {Heuristics.ONE_RELEASE.value}, + not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, + not {Heuristics.UNCHANGED_RELEASE.value}, + not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, + {Heuristics.SUSPICIOUS_SETUP.value}. 
+{Confidence.MEDIUM.value}::medium :- + not {Heuristics.EMPTY_PROJECT_LINK.value}, + not {Heuristics.ONE_RELEASE.value}, + not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, + {Heuristics.SUSPICIOUS_SETUP.value}, + {Heuristics.WHEEL_ABSENCE.value}, + not {Heuristics.ANOMALOUS_VERSION.value}. +{Confidence.MEDIUM.value}::medium :- + not {Heuristics.EMPTY_PROJECT_LINK.value}, + not {Heuristics.ONE_RELEASE.value}, + not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, + {Heuristics.WHEEL_ABSENCE.value}, + not {Heuristics.ANOMALOUS_VERSION.value}. + +{RESULT} :- high. +{RESULT} :- medium. + +query({RESULT}). +""" class DetectMaliciousMetadataCheck(BaseCheck): @@ -303,6 +196,40 @@ def validate_malware(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[boo is_malware, detail_info = sourcecode_analyzer.analyze() return is_malware, detail_info + def evaluate_heuristic_results(self, heuristic_results: dict[Heuristics, HeuristicResult]) -> float | None: + """Analyse the heuristic results to determine the maliciousness of the package. + + Parameters + ---------- + heuristic_results: dict[Heuristics, HeuristicResult] + Dictionary of Heuristic keys with HeuristicResult values, results of each heuristic scan. + + Returns + ------- + float | None + Returns the confidence associated with the detected malicious combination, otherwise None if no associated + malicious combination was triggered. 
+ """ + facts_list: list[str] = [] + for heuristic, result in heuristic_results.items(): + if result == HeuristicResult.SKIP: + facts_list.append(f"0.0::{heuristic.value}.") + elif result == HeuristicResult.PASS: + facts_list.append(f"{heuristic.value} :- true.") + else: # HeuristicResult.FAIL + facts_list.append(f"{heuristic.value} :- false.") + + facts = "\n".join(facts_list) + problog_code = f"{facts}\n\n{STATIC_PROBLOG_MODEL}" + + problog_model = PrologString(problog_code) + problog_results: dict[Term, float] = get_evaluatable().create_from(problog_model).evaluate() + + confidence = problog_results.get(Term(RESULT)) + if confidence == 0.0: + return None # no rules were triggered + return confidence + def run_heuristics( self, pypi_package_json: PyPIPackageJsonAsset ) -> tuple[dict[Heuristics, HeuristicResult], dict[str, JsonType]]: @@ -418,8 +345,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: except HeuristicAnalyzerValueError: return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN) - result_combo: tuple = tuple(result.values()) - confidence: float | None = SUSPICIOUS_COMBO.get(result_combo, None) + confidence = self.evaluate_heuristic_results(result) result_type = CheckResultType.FAILED if confidence is None: confidence = Confidence.HIGH From ce0faf2cac63510dc286c6e314aecb0ffa9c5a07 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 6 Mar 2025 12:55:03 +1000 Subject: [PATCH 2/6] refactor: simplified logic statements with comments Signed-off-by: Carl Flottmann --- .../checks/detect_malicious_metadata_check.py | 57 ++++++++----------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index a5737f67d..7cb9bdd4b 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ 
-85,47 +85,38 @@ class MaliciousMetadataFacts(CheckFacts): RESULT = "result" STATIC_PROBLOG_MODEL = f""" -{Confidence.HIGH.value}::high :- - not {Heuristics.EMPTY_PROJECT_LINK.value}, - not {Heuristics.ONE_RELEASE.value}, - not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, - not {Heuristics.SUSPICIOUS_SETUP.value}, - not {Heuristics.WHEEL_ABSENCE.value}. -{Confidence.HIGH.value}::high :- - not {Heuristics.EMPTY_PROJECT_LINK.value}, - {Heuristics.ONE_RELEASE.value}, +% Heuristic groupings + +% Maintainer has recently joined, publishing an undetailed page with no links. +quickUndetailed :- not {Heuristics.EMPTY_PROJECT_LINK.value}, not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}. + +% Maintainer releases a suspicious setup.py and forces it to run by omitting a .whl file. +forceSetup :- not {Heuristics.SUSPICIOUS_SETUP.value}, {Heuristics.WHEEL_ABSENCE.value}. + +% Suspicious Combinations + +% Package released recently with little detail, forcing the setup.py to run. +{Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.ONE_RELEASE.value}. +{Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.HIGH_RELEASE_FREQUENCY.value}. + +% Package released recently with little detail, with some more refined trust markers introduced: project links, +% multiple different releases, but there is no source code repository matching it and the setup is suspicious. +{Confidence.HIGH.value}::high :- not {Heuristics.SOURCE_CODE_REPO.value}, not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, - not {Heuristics.SUSPICIOUS_SETUP.value}, - not {Heuristics.WHEEL_ABSENCE.value}. 
-{Confidence.HIGH.value}::high :- - {Heuristics.EMPTY_PROJECT_LINK.value}, - not {Heuristics.SOURCE_CODE_REPO.value}, - {Heuristics.ONE_RELEASE.value}, - not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, {Heuristics.UNCHANGED_RELEASE.value}, - not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, - not {Heuristics.SUSPICIOUS_SETUP.value}, - not {Heuristics.WHEEL_ABSENCE.value}. + forceSetup. -{Confidence.MEDIUM.value}::medium :- - not {Heuristics.EMPTY_PROJECT_LINK.value}, - {Heuristics.ONE_RELEASE.value}, +% Package released recently with little detail, with multiple releases as a trust marker, but frequent and with +% the same code. +{Confidence.MEDIUM.value}::medium :- quickUndetailed, not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, not {Heuristics.UNCHANGED_RELEASE.value}, - not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, {Heuristics.SUSPICIOUS_SETUP.value}. -{Confidence.MEDIUM.value}::medium :- - not {Heuristics.EMPTY_PROJECT_LINK.value}, - not {Heuristics.ONE_RELEASE.value}, - not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, - {Heuristics.SUSPICIOUS_SETUP.value}, - {Heuristics.WHEEL_ABSENCE.value}, - not {Heuristics.ANOMALOUS_VERSION.value}. -{Confidence.MEDIUM.value}::medium :- - not {Heuristics.EMPTY_PROJECT_LINK.value}, + +% Package released recently with little detail and an anomalous version number for a single-release package. +{Confidence.MEDIUM.value}::medium :- quickUndetailed, not {Heuristics.ONE_RELEASE.value}, - not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, {Heuristics.WHEEL_ABSENCE.value}, not {Heuristics.ANOMALOUS_VERSION.value}. From fc809653947e0cecf3ec84a128f7168761a1129b Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Tue, 11 Mar 2025 10:45:03 +1000 Subject: [PATCH 3/6] fix: forceSetup rule was not defined properly for wheel absence, causing the CI to fail. 
Signed-off-by: Carl Flottmann --- .../slsa_analyzer/checks/detect_malicious_metadata_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 7cb9bdd4b..b8f924c7f 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -91,7 +91,7 @@ class MaliciousMetadataFacts(CheckFacts): quickUndetailed :- not {Heuristics.EMPTY_PROJECT_LINK.value}, not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}. % Maintainer releases a suspicious setup.py and forces it to run by omitting a .whl file. -forceSetup :- not {Heuristics.SUSPICIOUS_SETUP.value}, {Heuristics.WHEEL_ABSENCE.value}. +forceSetup :- not {Heuristics.SUSPICIOUS_SETUP.value}, not {Heuristics.WHEEL_ABSENCE.value}. % Suspicious Combinations From 72e2608bbe2cb17d2fe93e293cf400738f5b22b6 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Tue, 11 Mar 2025 17:20:10 +1000 Subject: [PATCH 4/6] chore: updated expected type for confidence Signed-off-by: Carl Flottmann --- .../slsa_analyzer/checks/detect_malicious_metadata_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index b8f924c7f..50acc20d6 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -216,7 +216,7 @@ def evaluate_heuristic_results(self, heuristic_results: dict[Heuristics, Heurist problog_model = PrologString(problog_code) problog_results: dict[Term, float] = get_evaluatable().create_from(problog_model).evaluate() - confidence = problog_results.get(Term(RESULT)) + confidence: float | None = problog_results.get(Term(RESULT)) if confidence == 0.0: 
return None # no rules were triggered return confidence From 52aaecfc57755b6af93a9d2d2fd76450813285cc Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 12 Mar 2025 11:24:01 +1000 Subject: [PATCH 5/6] docs: updated README for contributing to the problog model Signed-off-by: Carl Flottmann --- src/macaron/malware_analyzer/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index 6fe93d89a..e70a4a1bd 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -52,6 +52,19 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b - **Rule**: Return `HeuristicResult.FAIL` if the major or epoch is abnormally high; otherwise, return `HeuristicResult.PASS`. - **Dependency**: Will be run if the One Release heuristic fails. +### Contributing + +When contributing an analyzer, it must meet the following requirements: + +- The analyzer must be implemented in a separate file, placed in the relevant folder based on what it analyzes ([metadata](./pypi_heuristics/metadata/) or [sourcecode](./pypi_heuristics/sourcecode/)). +- The analyzer must inherit from the `BaseHeuristicAnalyzer` class and implement the `analyze` function, returning relevant information specific to the analysis. +- The analyzer name must be added to [heuristics.py](./pypi_heuristics/heuristics.py) file so it can be used for rule combinations in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) +- Update the `STATIC_PROBLOG_MODEL` in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) with logical statements where the heuristic should be included. When adding new rules, please follow the following guidelines: + - Provide a [confidence value](../slsa_analyzer/checks/check_result.py) using the `Confidence` enum. 
+ - Provide a name based on this confidence value (i.e. `high`, `medium`, or `low`) + - If it does not already exist, make sure to assign this to the result variable (`RESULT`) + - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples). + ### Confidence Score Motivation The original seven heuristics which started this work were Empty Project Link, Unreachable Project Links, One Release, High Release Frequency, Unchange Release, Closer Release Join Date, and Suspicious Setup. These heuristics (excluding those with a dependency) were run on 1167 packages from trusted organizations, with the following results: From 0cb379cf67b6df4a164365f1355b9846f3ad846f Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 12 Mar 2025 11:49:40 +1000 Subject: [PATCH 6/6] refactor: problog model is now at the bottom of the file for readability, with better comments Signed-off-by: Carl Flottmann --- src/macaron/malware_analyzer/README.md | 4 +- .../checks/detect_malicious_metadata_check.py | 127 +++++++++--------- 2 files changed, 67 insertions(+), 64 deletions(-) diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index e70a4a1bd..7617e4156 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -59,10 +59,10 @@ When contributing an analyzer, it must meet the following requirements: - The analyzer must be implemented in a separate file, placed in the relevant folder based on what it analyzes ([metadata](./pypi_heuristics/metadata/) or [sourcecode](./pypi_heuristics/sourcecode/)). - The analyzer must inherit from the `BaseHeuristicAnalyzer` class and implement the `analyze` function, returning relevant information specific to the analysis. 
- The analyzer name must be added to [heuristics.py](./pypi_heuristics/heuristics.py) file so it can be used for rule combinations in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) -- Update the `STATIC_PROBLOG_MODEL` in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) with logical statements where the heuristic should be included. When adding new rules, please follow the following guidelines: +- Update the `malware_rules_problog_model` in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) with logical statements where the heuristic should be included. When adding new rules, please follow the following guidelines: - Provide a [confidence value](../slsa_analyzer/checks/check_result.py) using the `Confidence` enum. - Provide a name based on this confidence value (i.e. `high`, `medium`, or `low`) - - If it does not already exist, make sure to assign this to the result variable (`RESULT`) + - If it does not already exist, make sure to assign this to the result variable (`problog_result_access`) - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples). 
### Confidence Score Motivation diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 50acc20d6..80439bb79 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -68,65 +68,6 @@ class MaliciousMetadataFacts(CheckFacts): } -# This list contains the heuristic analyzer classes -# When implementing new analyzer, appending the classes to this list -ANALYZERS: list = [ - EmptyProjectLinkAnalyzer, - SourceCodeRepoAnalyzer, - OneReleaseAnalyzer, - HighReleaseFrequencyAnalyzer, - UnchangedReleaseAnalyzer, - CloserReleaseJoinDateAnalyzer, - SuspiciousSetupAnalyzer, - WheelAbsenceAnalyzer, - AnomalousVersionAnalyzer, -] - -RESULT = "result" - -STATIC_PROBLOG_MODEL = f""" -% Heuristic groupings - -% Maintainer has recently joined, publishing an undetailed page with no links. -quickUndetailed :- not {Heuristics.EMPTY_PROJECT_LINK.value}, not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}. - -% Maintainer releases a suspicious setup.py and forces it to run by omitting a .whl file. -forceSetup :- not {Heuristics.SUSPICIOUS_SETUP.value}, not {Heuristics.WHEEL_ABSENCE.value}. - -% Suspicious Combinations - -% Package released recently with little detail, forcing the setup.py to run. -{Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.ONE_RELEASE.value}. -{Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.HIGH_RELEASE_FREQUENCY.value}. - -% Package released recently with little detail, with some more refined trust markers introduced: project links, -% multiple different releases, but there is no source code repository matching it and the setup is suspicious. 
-{Confidence.HIGH.value}::high :- not {Heuristics.SOURCE_CODE_REPO.value}, - not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, - not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, - {Heuristics.UNCHANGED_RELEASE.value}, - forceSetup. - -% Package released recently with little detail, with multiple releases as a trust marker, but frequent and with -% the same code. -{Confidence.MEDIUM.value}::medium :- quickUndetailed, - not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, - not {Heuristics.UNCHANGED_RELEASE.value}, - {Heuristics.SUSPICIOUS_SETUP.value}. - -% Package released recently with little detail and an anomalous version number for a single-release package. -{Confidence.MEDIUM.value}::medium :- quickUndetailed, - not {Heuristics.ONE_RELEASE.value}, - {Heuristics.WHEEL_ABSENCE.value}, - not {Heuristics.ANOMALOUS_VERSION.value}. - -{RESULT} :- high. -{RESULT} :- medium. - -query({RESULT}). -""" - - class DetectMaliciousMetadataCheck(BaseCheck): """This check analyzes the metadata of a package for malicious behavior.""" @@ -211,12 +152,13 @@ def evaluate_heuristic_results(self, heuristic_results: dict[Heuristics, Heurist facts_list.append(f"{heuristic.value} :- false.") facts = "\n".join(facts_list) - problog_code = f"{facts}\n\n{STATIC_PROBLOG_MODEL}" + problog_code = f"{facts}\n\n{self.malware_rules_problog_model}" + logger.debug("Problog model used for evaluation:\n %s", problog_code) problog_model = PrologString(problog_code) problog_results: dict[Term, float] = get_evaluatable().create_from(problog_model).evaluate() - confidence: float | None = problog_results.get(Term(RESULT)) + confidence: float | None = problog_results.get(Term(self.problog_result_access)) if confidence == 0.0: return None # no rules were triggered return confidence @@ -244,7 +186,7 @@ def run_heuristics( results: dict[Heuristics, HeuristicResult] = {} detail_info: dict[str, JsonType] = {} - for _analyzer in ANALYZERS: + for _analyzer in self.analyzers: analyzer: BaseHeuristicAnalyzer = _analyzer() 
 logger.debug("Instantiating %s", _analyzer.__name__) @@ -365,5 +307,66 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # Return UNKNOWN result for unsupported ecosystems. return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN) + # This list contains the heuristic analyzer classes + # When implementing a new analyzer, append its class to this list + analyzers: list = [ + EmptyProjectLinkAnalyzer, + SourceCodeRepoAnalyzer, + OneReleaseAnalyzer, + HighReleaseFrequencyAnalyzer, + UnchangedReleaseAnalyzer, + CloserReleaseJoinDateAnalyzer, + SuspiciousSetupAnalyzer, + WheelAbsenceAnalyzer, + AnomalousVersionAnalyzer, + ] + + problog_result_access = "result" + + malware_rules_problog_model = f""" + % Heuristic groupings + % These are common combinations of heuristics that are used in many of the rules, thus themselves representing + % certain behaviors. When changing or adding rules here, if there are frequent combinations of particular + % heuristics, group them together here. + + % Maintainer has recently joined, publishing an undetailed page with no links. + quickUndetailed :- not {Heuristics.EMPTY_PROJECT_LINK.value}, not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}. + + % Maintainer releases a suspicious setup.py and forces it to run by omitting a .whl file. + forceSetup :- not {Heuristics.SUSPICIOUS_SETUP.value}, not {Heuristics.WHEEL_ABSENCE.value}. + + % Suspicious Combinations + + % Package released recently with little detail, forcing the setup.py to run. + {Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.ONE_RELEASE.value}. + {Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.HIGH_RELEASE_FREQUENCY.value}. + + % Package released recently with little detail, with some more refined trust markers introduced: project links, + % multiple different releases, but there is no source code repository matching it and the setup is suspicious. 
+ {Confidence.HIGH.value}::high :- not {Heuristics.SOURCE_CODE_REPO.value}, + not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, + not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}, + {Heuristics.UNCHANGED_RELEASE.value}, + forceSetup. + + % Package released recently with little detail, with multiple releases as a trust marker, but frequent and with + % the same code. + {Confidence.MEDIUM.value}::medium :- quickUndetailed, + not {Heuristics.HIGH_RELEASE_FREQUENCY.value}, + not {Heuristics.UNCHANGED_RELEASE.value}, + {Heuristics.SUSPICIOUS_SETUP.value}. + + % Package released recently with little detail and an anomalous version number for a single-release package. + {Confidence.MEDIUM.value}::medium :- quickUndetailed, + not {Heuristics.ONE_RELEASE.value}, + {Heuristics.WHEEL_ABSENCE.value}, + not {Heuristics.ANOMALOUS_VERSION.value}. + + {problog_result_access} :- high. + {problog_result_access} :- medium. + + query({problog_result_access}). + """ + registry.register(DetectMaliciousMetadataCheck())