datahub-project · DucNgoQuang · Oct 13, 2025 · Oct 21, 2025 · Nov 20, 2025 · Nov 20, 2025
diff --git a/.github/workflows/gx-plugin.yml b/.github/workflows/gx-plugin.yml
@@ -38,7 +38,7 @@ jobs:
         python-version: ["3.9", "3.11"]
         include:
           - python-version: "3.9"
-            extraPythonRequirement: "great-expectations~=0.17.0"
+            extraPythonRequirement: "great-expectations~=0.18.0"
           - python-version: "3.11"
             extraPythonRequirement: "great-expectations~=0.18.0"
       fail-fast: false

diff --git a/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py b/metadata-ingestion-modules/gx-plugin/src/datahub_gx_plugin/action.py
@@ -25,7 +25,10 @@
     ExpectationSuiteIdentifier,
     ValidationResultIdentifier,
 )
-from great_expectations.execution_engine import PandasExecutionEngine
+from great_expectations.execution_engine import (
+    PandasExecutionEngine,
+    SparkDFExecutionEngine,
+)
 from great_expectations.execution_engine.sqlalchemy_execution_engine import (
     SqlAlchemyExecutionEngine,
 )
@@ -586,16 +589,66 @@ def get_min_max(kwargs, type=AssertionStdParameterType.UNKNOWN):
         )
 
     def get_dataset_partitions(self, batch_identifier, data_asset):
-        dataset_partitions = []
+        dataset_partitions: List[
+            Dict[str, Union[PartitionSpecClass, BatchSpec, str, None]]
+        ] = []
 
         logger.debug("Finding datasets being validated")
 
-        # for now, we support only v3-api and sqlalchemy execution engine and Pandas engine
+        # for now, we support only v3-api and the sqlalchemy, Pandas and Spark execution engines
         is_sql_alchemy = isinstance(data_asset, Validator) and (
             isinstance(data_asset.execution_engine, SqlAlchemyExecutionEngine)
         )
         is_pandas = isinstance(data_asset.execution_engine, PandasExecutionEngine)
-        if is_sql_alchemy or is_pandas:
+
+        is_spark = isinstance(data_asset.execution_engine, SparkDFExecutionEngine)
+
+        if is_spark:
+            ge_batch_spec = data_asset.active_batch_spec
+            partitionSpec = None
+            batchSpecProperties = {
+                "data_asset_name": str(
+                    data_asset.active_batch_definition.data_asset_name
+                ),
+                "datasource_name": str(
+                    data_asset.active_batch_definition.datasource_name
+                ),
+            }
+
+            if isinstance(ge_batch_spec, RuntimeDataBatchSpec):
+                data_platform = self.get_platform_instance(
+                    data_asset.active_batch_definition.datasource_name
+                )
+                dataset_urn = builder.make_dataset_urn_with_platform_instance(
+                    platform=(
+                        data_platform
+                        if self.platform_alias is None
+                        else self.platform_alias
+                    ),
+                    name=data_asset.active_batch_definition.data_asset_name,
+                    platform_instance="",
+                    env=self.env,
+                )
+
+                batchSpec = BatchSpec(
+                    nativeBatchId=batch_identifier,
+                    query="",
+                    customProperties=batchSpecProperties,
+                )
+                dataset_partitions.append(
+                    {
+                        "dataset_urn": dataset_urn,
+                        "partitionSpec": partitionSpec,
+                        "batchSpec": batchSpec,
+                    }
+                )
+            else:
+                warn(
+                    "DataHubValidationAction does not recognize this GE batch spec type- {batch_spec_type}.".format(
+                        batch_spec_type=type(ge_batch_spec)
+                    )
+                )
+        elif is_sql_alchemy or is_pandas:
             ge_batch_spec = data_asset.active_batch_spec
             partitionSpec = None
             batchSpecProperties = {
@@ -607,6 +660,7 @@ def get_dataset_partitions(self, batch_identifier, data_asset):
                 ),
             }
             sqlalchemy_uri = None
+
             if is_sql_alchemy and isinstance(
                 data_asset.execution_engine.engine, Engine
             ):
@@ -758,7 +812,7 @@ def get_dataset_partitions(self, batch_identifier, data_asset):
         else:
             # TODO - v2-spec - SqlAlchemyDataset support
             warn(
-                "DataHubValidationAction does not recognize this GE data asset type - {asset_type}. This is either using v2-api or execution engine other than sqlalchemy.".format(
+                "DataHubValidationAction does not recognize this GE data asset type - {asset_type}.".format(
                     asset_type=type(data_asset)
                 )
             )

diff --git a/metadata-ingestion-modules/gx-plugin/tests/unit/test_great_expectations_action.py b/metadata-ingestion-modules/gx-plugin/tests/unit/test_great_expectations_action.py
@@ -29,6 +29,7 @@
     SqlAlchemyExecutionEngine,
 )
 from great_expectations.validator.validator import Validator
+from pyspark.sql import SparkSession
 
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.metadata.schema_classes import (
@@ -56,6 +57,17 @@ def ge_data_context(tmp_path: str) -> FileDataContext:
     return FileDataContext.create(tmp_path)
 
 
+@pytest.fixture(scope="function")
+def spark_session() -> SparkSession:  # NOTE(review): this yields, so Iterator[SparkSession] would be the precise annotation
+    spark = (
+        SparkSession.builder.master("local")  # run Spark locally rather than against a cluster
+        .appName("pytest-pyspark-local-testing")
+        .getOrCreate()  # reuses an existing session if there is one, otherwise creates it
+    )
+    yield spark  # hand the session to the requesting test
+    spark.stop()  # teardown: runs after the test has finished with the session
+
+
 @pytest.fixture(scope="function")
 def ge_validator_sqlalchemy() -> Validator:
     validator = Validator(
@@ -92,11 +104,112 @@ def ge_validator_sqlalchemy() -> Validator:
 
 
 @pytest.fixture(scope="function")
-def ge_validator_spark() -> Validator:
-    validator = Validator(execution_engine=SparkDFExecutionEngine())
+def ge_validator_spark(  # Validator on a Spark engine carrying one in-memory batch
+    spark_session: SparkSession,
+) -> Validator:
+    validator = Validator(
+        execution_engine=SparkDFExecutionEngine(spark=spark_session),  # reuse the fixture's session
+        batches=[
+            Batch(
+                data=spark_session.createDataFrame(  # two rows with columns foo and bar
+                    [{"foo": 10, "bar": 100}, {"foo": 20, "bar": 200}]
+                ),
+                batch_request=BatchRequest(
+                    datasource_name="my_sparkdf_datasource",
+                    data_connector_name="spark_df",
+                    data_asset_name="foobar_spark_df",
+                ),
+                batch_definition=BatchDefinition(  # same three names as the batch request above
+                    datasource_name="my_sparkdf_datasource",
+                    data_connector_name="spark_df",
+                    data_asset_name="foobar_spark_df",
+                    batch_identifiers=IDDict(),
+                ),
+                batch_spec=RuntimeDataBatchSpec(
+                    {
+                        "data_asset_name": "foobar_spark_df",
+                        "batch_identifiers": {},
+                        "batch_data": {},  # empty dict here; the DataFrame itself is passed via data= above
+                        "type": "spark_dataframe",
+                    }
+                ),
+            )
+        ],
+    )
     return validator
 
 
+@pytest.fixture(scope="function")
+def ge_validation_result_suite_spark() -> ExpectationSuiteValidationResult:  # canned suite result with one passing expectation
+    validation_result_suite = ExpectationSuiteValidationResult(
+        results=[
+            {
+                "success": True,
+                "expectation_config": {
+                    "expectation_type": "expect_column_values_to_not_be_null",
+                    "kwargs": {"column": "foo", "batch_id": "hive-default.menu_silver"},
+                    "meta": {},
+                },
+                "result": {
+                    "element_count": 2,  # two elements checked, none unexpected
+                    "unexpected_count": 0,
+                    "unexpected_percent": 0.0,
+                    "partial_unexpected_list": [],
+                    "partial_unexpected_counts": [],
+                },
+                "meta": {},
+                "exception_info": {
+                    "raised_exception": False,
+                    "exception_traceback": None,
+                    "exception_message": None,
+                },
+            }
+        ],
+        success=True,
+        statistics={  # totals consistent with the single result above
+            "evaluated_expectations": 1,
+            "successful_expectations": 1,
+            "unsuccessful_expectations": 0,
+            "success_percent": 100.0,
+        },
+        meta={
+            "great_expectations_version": "0.18.21",
+            "expectation_suite_name": "test_suite",
+            "run_id": {
+                "run_name": None,
+                "run_time": "2025-11-20T00:11:40.027152+07:00",
+            },
+            "batch_spec": {"batch_data": "SparkDataFrame"},
+            "batch_markers": {"ge_load_time": "20251119T171140.030260Z"},
+            "active_batch_definition": {  # datasource/asset names recorded in the result metadata
+                "datasource_name": "hive",
+                "data_connector_name": "fluent",
+                "data_asset_name": "default.menu_silver",
+                "batch_identifiers": {},
+            },
+            "validation_time": "20251119T171140.035732Z",
+            "checkpoint_name": "test_checkpoint",
+            "validation_id": None,
+            "checkpoint_id": None,
+        },
+    )
+    return validation_result_suite
+
+
+@pytest.fixture(scope="function")
+def ge_validation_result_suite_id_spark() -> ValidationResultIdentifier:  # identifier for suite "test_suite" with a fixed run time
+    validation_result_suite_id = ValidationResultIdentifier(
+        expectation_suite_identifier=ExpectationSuiteIdentifier("test_suite"),
+        run_id=RunIdentifier(
+            run_name=None,
+            run_time=datetime.fromtimestamp(1731981100.027152, tz=timezone.utc),  # NOTE(review): epoch appears to fall in Nov 2024 - confirm the year is intended
+        ),
+        batch_identifier="hive-default.menu_silver",
+    )
+
+    return validation_result_suite_id
+
+
 @pytest.fixture(scope="function")
 def ge_validator_pandas() -> Validator:
     validator = Validator(
@@ -398,32 +511,120 @@ def test_DataHubValidationAction_pandas(
     )
 
 
-def test_DataHubValidationAction_graceful_failure(
+@mock.patch("datahub.emitter.rest_emitter.DatahubRestEmitter.emit_mcp", autospec=True)  # emit_mcp is replaced by a mock; its calls are recorded on mock_emitter
+def test_DataHubValidationAction_spark(
+    mock_emitter: mock.MagicMock,
     ge_data_context: FileDataContext,
-    ge_validator_sqlalchemy: Validator,
-    ge_validation_result_suite: ExpectationSuiteValidationResult,
-    ge_validation_result_suite_id: ValidationResultIdentifier,
+    ge_validator_spark: Validator,
+    ge_validation_result_suite_spark: ExpectationSuiteValidationResult,
+    ge_validation_result_suite_id_spark: ValidationResultIdentifier,
 ) -> None:
     server_url = "http://localhost:9999"
 
     datahub_action = DataHubValidationAction(
-        data_context=ge_data_context, server_url=server_url
+        data_context=ge_data_context,
+        server_url=server_url,
+        platform_instance_map={"my_sparkdf_datasource": "custom_platefrom_spark"},  # NOTE(review): the expected URNs below use this value as the dataPlatform name
     )
 
     assert datahub_action.run(
-        validation_result_suite_identifier=ge_validation_result_suite_id,
-        validation_result_suite=ge_validation_result_suite,
-        data_asset=ge_validator_sqlalchemy,
-    ) == {"datahub_notification_result": "DataHub notification failed"}
+        validation_result_suite_identifier=ge_validation_result_suite_id_spark,
+        validation_result_suite=ge_validation_result_suite_spark,
+        data_asset=ge_validator_spark,
+    ) == {"datahub_notification_result": "DataHub notification succeeded"}
 
+    mock_emitter.assert_has_calls(  # expects assertionInfo, dataPlatformInstance, assertionRunEvent, in this order
+        [
+            mock.call(
+                mock.ANY,
+                MetadataChangeProposalWrapper(
+                    entityType="assertion",
+                    changeType="UPSERT",
+                    entityUrn="urn:li:assertion:5f0a1761a5e0d1b7acb7ec622f778ebc",
+                    aspectName="assertionInfo",
+                    aspect=AssertionInfoClass(
+                        type=AssertionTypeClass.DATASET,
+                        customProperties={"expectation_suite_name": "test_suite"},
+                        datasetAssertion=DatasetAssertionInfoClass(
+                            dataset=(
+                                "urn:li:dataset:(urn:li:dataPlatform:custom_platefrom_spark,"
+                                "foobar_spark_df,PROD)"
+                            ),
+                            scope=DatasetAssertionScopeClass.DATASET_COLUMN,
+                            fields=[
+                                "urn:li:schemaField:("
+                                "urn:li:dataset:(urn:li:dataPlatform:custom_platefrom_spark,"
+                                "foobar_spark_df,PROD),foo)"
+                            ],
+                            aggregation="IDENTITY",
+                            operator="NOT_NULL",
+                            nativeType="expect_column_values_to_not_be_null",
+                            nativeParameters={"column": "foo"},
+                        ),
+                    ),
+                ),
+            ),
+            mock.call(
+                mock.ANY,
+                MetadataChangeProposalWrapper(
+                    entityType="assertion",
+                    changeType="UPSERT",
+                    entityUrn="urn:li:assertion:5f0a1761a5e0d1b7acb7ec622f778ebc",
+                    aspectName="dataPlatformInstance",
+                    aspect=DataPlatformInstanceClass(
+                        platform="urn:li:dataPlatform:great-expectations"
+                    ),
+                ),
+            ),
+            mock.call(
+                mock.ANY,
+                MetadataChangeProposalWrapper(
+                    entityType="assertion",
+                    changeType="UPSERT",
+                    entityUrn="urn:li:assertion:5f0a1761a5e0d1b7acb7ec622f778ebc",
+                    aspectName="assertionRunEvent",
+                    aspect=AssertionRunEventClass(
+                        timestampMillis=mock.ANY,  # any value is accepted
+                        runId=mock.ANY,  # any value is accepted
+                        assertionUrn="urn:li:assertion:5f0a1761a5e0d1b7acb7ec622f778ebc",
+                        asserteeUrn=(
+                            "urn:li:dataset:(urn:li:dataPlatform:custom_platefrom_spark,"
+                            "foobar_spark_df,PROD)"
+                        ),
+                        status=AssertionRunStatusClass.COMPLETE,
+                        result=AssertionResultClass(
+                            type=AssertionResultTypeClass.SUCCESS,
+                            rowCount=2,
+                            unexpectedCount=0,
+                            nativeResults={},
+                        ),
+                        batchSpec=BatchSpecClass(
+                            customProperties={
+                                "data_asset_name": "foobar_spark_df",
+                                "datasource_name": "my_sparkdf_datasource",
+                            },
+                            nativeBatchId="hive-default.menu_silver",
+                            query="",
+                        ),
+                        partitionSpec=PartitionSpecClass(
+                            type="FULL_TABLE",
+                            partition="FULL_TABLE_SNAPSHOT",
+                            timePartition=None,
+                        ),
+                    ),
+                ),
+            ),
+        ]
+    )
 
-def test_DataHubValidationAction_not_supported(
+
+def test_DataHubValidationAction_graceful_failure(
     ge_data_context: FileDataContext,
-    ge_validator_spark: Validator,
+    ge_validator_sqlalchemy: Validator,
     ge_validation_result_suite: ExpectationSuiteValidationResult,
     ge_validation_result_suite_id: ValidationResultIdentifier,
 ) -> None:
-    server_url = "http://localhost:99199"
+    server_url = "http://localhost:9999"
 
     datahub_action = DataHubValidationAction(
         data_context=ge_data_context, server_url=server_url
@@ -432,5 +633,5 @@ def test_DataHubValidationAction_not_supported(
     assert datahub_action.run(
         validation_result_suite_identifier=ge_validation_result_suite_id,
         validation_result_suite=ge_validation_result_suite,
-        data_asset=ge_validator_spark,
-    ) == {"datahub_notification_result": "none required"}
+        data_asset=ge_validator_sqlalchemy,
+    ) == {"datahub_notification_result": "DataHub notification failed"}