diff --git a/pyproject.toml b/pyproject.toml index feafa2b42..d46835842 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "cyclonedx-bom >=4.0.0,<5.0.0", "cyclonedx-python-lib[validation] >=7.3.4,<8.0.0", "beautifulsoup4 >= 4.12.0,<5.0.0", + "problog >= 2.2.6,<3.0.0", ] keywords = [] # https://pypi.org/classifiers/ @@ -203,6 +204,7 @@ module = [ "gitdb.*", "yamale.*", "defusedxml.*", + "problog.*", ] ignore_missing_imports = true diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index 6fe93d89a..7617e4156 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -52,6 +52,19 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b - **Rule**: Return `HeuristicResult.FAIL` if the major or epoch is abnormally high; otherwise, return `HeuristicResult.PASS`. - **Dependency**: Will be run if the One Release heuristic fails. +### Contributing + +A contributed analyzer must meet the following requirements: + +- The analyzer must be implemented in a separate file, placed in the relevant folder based on what it analyzes ([metadata](./pypi_heuristics/metadata/) or [sourcecode](./pypi_heuristics/sourcecode/)). +- The analyzer must inherit from the `BaseHeuristicAnalyzer` class and implement the `analyze` function, returning relevant information specific to the analysis. +- The analyzer name must be added to the [heuristics.py](./pypi_heuristics/heuristics.py) file so it can be used for rule combinations in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py). +- Update the `malware_rules_problog_model` in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) with logical statements where the heuristic should be included. 
When adding new rules, please follow the following guidelines: + - Provide a [confidence value](../slsa_analyzer/checks/check_result.py) using the `Confidence` enum. + - Provide a name based on this confidence value (i.e. `high`, `medium`, or `low`) + - If it does not already exist, make sure to assign this to the result variable (`problog_result_access`) + - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples). + ### Confidence Score Motivation The original seven heuristics which started this work were Empty Project Link, Unreachable Project Links, One Release, High Release Frequency, Unchange Release, Closer Release Join Date, and Suspicious Setup. These heuristics (excluding those with a dependency) were run on 1167 packages from trusted organizations, with the following results: diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 5cb058ca0..80439bb79 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -6,6 +6,9 @@ import logging import requests +from problog import get_evaluatable +from problog.logic import Term +from problog.program import PrologString from sqlalchemy import ForeignKey, String from sqlalchemy.orm import Mapped, mapped_column @@ -65,184 +68,6 @@ class MaliciousMetadataFacts(CheckFacts): } -# This list contains the heuristic analyzer classes -# When implementing new analyzer, appending the classes to this list -ANALYZERS: list = [ - EmptyProjectLinkAnalyzer, - SourceCodeRepoAnalyzer, - OneReleaseAnalyzer, - HighReleaseFrequencyAnalyzer, - UnchangedReleaseAnalyzer, - CloserReleaseJoinDateAnalyzer, - SuspiciousSetupAnalyzer, - WheelAbsenceAnalyzer, - AnomalousVersionAnalyzer, -] - - -# The 
HeuristicResult sequence is aligned with the sequence of ANALYZERS list -SUSPICIOUS_COMBO: dict[ - tuple[ - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - HeuristicResult, - ], - float, -] = { - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.FAIL, # One Release - HeuristicResult.SKIP, # High Release Frequency - HeuristicResult.SKIP, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.FAIL, # Anomalous Version - # No project link, only one release, and the maintainer released it shortly - # after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - # Anomalous version has no effect. - ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.FAIL, # One Release - HeuristicResult.SKIP, # High Release Frequency - HeuristicResult.SKIP, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.PASS, # Anomalous Version - # No project link, only one release, and the maintainer released it shortly - # after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - # Anomalous version has no effect. 
- ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.FAIL, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No project link, frequent releases of multiple versions without modifying the content, - # and the maintainer released it shortly after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.PASS, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No project link, frequent releases of multiple versions, - # and the maintainer released it shortly after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.FAIL, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.PASS, # Suspicious Setup - HeuristicResult.PASS, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No project link, frequent releases of multiple versions without modifying the content, - # and the maintainer released it shortly after account registration. 
Presence/Absence of - # .whl file has no effect - ): Confidence.MEDIUM, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.FAIL, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.PASS, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No project link, frequent releases of multiple versions without modifying the content, - # and the maintainer released it shortly after account registration. Presence/Absence of - # .whl file has no effect - ): Confidence.MEDIUM, - ( - HeuristicResult.PASS, # Empty Project - HeuristicResult.FAIL, # Source Code Repo - HeuristicResult.PASS, # One Release - HeuristicResult.FAIL, # High Release Frequency - HeuristicResult.PASS, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.FAIL, # Suspicious Setup - HeuristicResult.FAIL, # Wheel Absence - HeuristicResult.SKIP, # Anomalous Version - # No source code repo, frequent releases of multiple versions, - # and the maintainer released it shortly after account registration. - # The setup.py file contains suspicious imports and .whl file isn't present. - ): Confidence.HIGH, - ( - HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Source Code Repo - HeuristicResult.FAIL, # One Release - HeuristicResult.SKIP, # High Release Frequency - HeuristicResult.SKIP, # Unchanged Release - HeuristicResult.FAIL, # Closer Release Join Date - HeuristicResult.PASS, # Suspicious Setup - HeuristicResult.PASS, # Wheel Absence - HeuristicResult.FAIL, # Anomalous Version - # No project link, only one release, and the maintainer released it shortly - # after account registration. - # The setup.py file has no effect and .whl file is present. - # The version number is anomalous. 
def evaluate_heuristic_results(self, heuristic_results: dict[Heuristics, HeuristicResult]) -> float | None:
    """Score the heuristic outcomes against the ProbLog malware rule model.

    Every heuristic outcome is turned into one ProbLog statement, the statements are
    placed in front of ``self.malware_rules_problog_model``, and the combined program
    is evaluated. The value reported for the ``self.problog_result_access`` term is
    the confidence.

    Parameters
    ----------
    heuristic_results: dict[Heuristics, HeuristicResult]
        Outcome of each heuristic scan, keyed by the heuristic that produced it.

    Returns
    -------
    float | None
        The value ProbLog reports for the result term, or None when that value is
        0.0 (no rule was triggered) or the term is missing from the evaluation output.
    """

    def as_statement(name: str, outcome: HeuristicResult) -> str:
        # SKIP is written as a probabilistic fact with probability 0.0.
        # NOTE(review): such a fact appears to make ``not <heuristic>`` hold just as
        # a FAIL does, so skipped heuristics may satisfy negated rule bodies -
        # confirm this is the intended treatment of SKIP.
        if outcome == HeuristicResult.SKIP:
            return f"0.0::{name}."
        if outcome == HeuristicResult.PASS:
            return f"{name} :- true."
        # Anything else is treated as HeuristicResult.FAIL.
        return f"{name} :- false."

    statements = "\n".join(as_statement(heuristic.value, outcome) for heuristic, outcome in heuristic_results.items())
    program_text = f"{statements}\n\n{self.malware_rules_problog_model}"
    logger.debug("Problog model used for evaluation:\n %s", program_text)

    program = PrologString(program_text)
    scores: dict[Term, float] = get_evaluatable().create_from(program).evaluate()

    score: float | None = scores.get(Term(self.problog_result_access))
    # A score of exactly 0.0 means no rule in the model was triggered.
    return None if score == 0.0 else score
# Heuristic analyzer classes used by this check. run_heuristics walks this list in
# order and instantiates each class, so the order here is the execution order.
# When implementing a new analyzer, append its class to this list.
analyzers: list = [
    EmptyProjectLinkAnalyzer,
    SourceCodeRepoAnalyzer,
    OneReleaseAnalyzer,
    HighReleaseFrequencyAnalyzer,
    UnchangedReleaseAnalyzer,
    CloserReleaseJoinDateAnalyzer,
    SuspiciousSetupAnalyzer,
    WheelAbsenceAnalyzer,
    AnomalousVersionAnalyzer,
]

# Name of the ProbLog term that the model below defines and queries;
# evaluate_heuristic_results looks up this same term in the evaluation output.
problog_result_access: str = "result"

# Static part of the ProbLog program. evaluate_heuristic_results prepends one
# statement per heuristic (named by its Heuristics value) before evaluating it.
# The f-string is rendered once, when this assignment executes, so it must stay
# below problog_result_access, which it interpolates. Confidence values are
# interpolated as the probability annotation of the `high` / `medium` rule heads.
# The text inside the string is ProbLog source: `%` starts a ProbLog comment that
# runs to the end of the line, so line breaks inside the string are significant.
malware_rules_problog_model: str = f"""
    % Heuristic groupings
    % These are common combinations of heuristics that are used in many of the rules, thus themselves representing
    % certain behaviors. When changing or adding rules here, if there are frequent combinations of particular
    % heuristics, group them together here.

    % Maintainer has recently joined, publishing an undetailed page with no links.
    quickUndetailed :- not {Heuristics.EMPTY_PROJECT_LINK.value}, not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value}.

    % Maintainer releases a suspicious setup.py and forces it to run by omitting a .whl file.
    forceSetup :- not {Heuristics.SUSPICIOUS_SETUP.value}, not {Heuristics.WHEEL_ABSENCE.value}.

    % Suspicious Combinations

    % Package released recently with little detail, forcing the setup.py to run.
    {Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.ONE_RELEASE.value}.
    {Confidence.HIGH.value}::high :- quickUndetailed, forceSetup, not {Heuristics.HIGH_RELEASE_FREQUENCY.value}.

    % Package released recently with little detail, with some more refined trust markers introduced: project links,
    % multiple different releases, but there is no source code repository matching it and the setup is suspicious.
    {Confidence.HIGH.value}::high :- not {Heuristics.SOURCE_CODE_REPO.value},
        not {Heuristics.HIGH_RELEASE_FREQUENCY.value},
        not {Heuristics.CLOSER_RELEASE_JOIN_DATE.value},
        {Heuristics.UNCHANGED_RELEASE.value},
        forceSetup.

    % Package released recently with little detail, with multiple releases as a trust marker, but frequent and with
    % the same code.
    {Confidence.MEDIUM.value}::medium :- quickUndetailed,
        not {Heuristics.HIGH_RELEASE_FREQUENCY.value},
        not {Heuristics.UNCHANGED_RELEASE.value},
        {Heuristics.SUSPICIOUS_SETUP.value}.

    % Package released recently with little detail and an anomalous version number for a single-release package.
    {Confidence.MEDIUM.value}::medium :- quickUndetailed,
        not {Heuristics.ONE_RELEASE.value},
        {Heuristics.WHEEL_ABSENCE.value},
        not {Heuristics.ANOMALOUS_VERSION.value}.

    {problog_result_access} :- high.
    {problog_result_access} :- medium.

    query({problog_result_access}).
    """