Skip to content

Commit c74d9da

Browse files
Edwardvaneechoud and Bernardo Fernandes
authored and committed
Feature/unify execution methods (Edwardvaneechoud#110)
* Removing auto to improve maintainability
* Ensure the offload per worker is determined per graph and there is no dependency on a global variable.
* Small improvement in logging
* Removing global change in tests
* Skipping test in docker
1 parent dc0e50b commit c74d9da

File tree

6 files changed

+82
-44
lines changed

6 files changed

+82
-44
lines changed

flowfile_core/flowfile_core/flowfile/flow_data_engine/flow_data_engine.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1937,7 +1937,6 @@ def get_number_of_records(self, warn: bool = False, force_calculate: bool = Fals
19371937
"""
19381938
if self.is_future and not self.is_collected:
19391939
return -1
1940-
calculate_in_worker_process = False if not OFFLOAD_TO_WORKER else calculate_in_worker_process
19411940
if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
19421941
if self._number_of_records_callback is not None:
19431942
self._number_of_records_callback(self)

flowfile_core/flowfile_core/flowfile/flow_graph.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,18 @@ def __init__(self,
229229
elif input_flow is not None:
230230
self.add_datasource(input_file=input_flow)
231231

232-
skip_nodes, execution_order = compute_execution_plan(nodes=self.nodes,flow_starts=self._flow_starts+self.get_implicit_starter_nodes())
232+
@property
233+
def flow_settings(self) -> schemas.FlowSettings:
234+
return self._flow_settings
235+
236+
@flow_settings.setter
237+
def flow_settings(self, flow_settings: schemas.FlowSettings):
238+
if (
239+
(self._flow_settings.execution_location != flow_settings.execution_location) or
240+
(self._flow_settings.execution_mode != flow_settings.execution_mode)
241+
):
242+
self.reset()
243+
self._flow_settings = flow_settings
233244

234245
def add_node_promise(self, node_promise: input_schema.NodePromise):
235246
"""Adds a placeholder node to the graph that is not yet fully configured.
@@ -320,6 +331,7 @@ def print_tree(self):
320331
if not self._node_db:
321332
self.flow_logger.info("Empty flow graph")
322333
return
334+
323335
# Build node information
324336
node_info = build_node_info(self.nodes)
325337

@@ -339,7 +351,7 @@ def print_tree(self):
339351

340352
# Track which nodes connect to what
341353
merge_points = define_node_connections(node_info)
342-
354+
343355
# Build the flow paths
344356

345357
# Find the maximum label length for each depth level
@@ -348,7 +360,7 @@ def print_tree(self):
348360
if depth in depth_groups:
349361
max_len = max(len(node_info[nid].label) for nid in depth_groups[depth])
350362
max_label_length[depth] = max_len
351-
363+
352364
# Draw the paths
353365
drawn_nodes = set()
354366
merge_drawn = set()

flowfile_core/flowfile_core/flowfile/flow_node/flow_node.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from flowfile_core.utils.arrow_reader import get_read_top_n
66
from flowfile_core.schemas import input_schema, schemas
77
from flowfile_core.configs.flow_logger import NodeLogger
8-
from flowfile_core.configs.settings import SINGLE_FILE_MODE, OFFLOAD_TO_WORKER
98

109
from flowfile_core.schemas.output_model import TableExample, FileColumn, NodeData
1110
from flowfile_core.flowfile.utils import get_hash
@@ -921,8 +920,14 @@ def execute_node(self, run_location: schemas.ExecutionLocationsLiteral, reset_ca
921920
node_logger=node_logger)
922921
else:
923922
self.results.errors = str(e)
924-
node_logger.error(f'Error with running the node: {e}')
925-
elif ((run_location == 'local' or SINGLE_FILE_MODE) and
923+
if "Connection refused" in str(e) and "/submit_query/" in str(e):
924+
node_logger.warning("There was an issue connecting to the remote worker, "
925+
"ensure the worker process is running, "
926+
"or change the settings to, so it executes locally")
927+
node_logger.error("Could not execute in the remote worker. (Re)start the worker service, or change settings to local settings.")
928+
else:
929+
node_logger.error(f'Error with running the node: {e}')
930+
elif ((run_location == 'local') and
926931
(not self.node_stats.has_run_with_current_setup or self.node_template.node_group == "output")):
927932
try:
928933
node_logger.info('Executing fully locally')

flowfile_core/tests/flowfile/analytics/test_analytics_processor.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -545,9 +545,6 @@ def test_analytics_processor_from_parquet_file_run_performance():
545545

546546

547547
def test_analytics_processor_from_parquet_file_run_in_one_local_process():
548-
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
549-
OFFLOAD_TO_WORKER.value = False
550-
551548
graph = create_graph()
552549

553550
graph.flow_settings.execution_location = "local"
@@ -564,4 +561,3 @@ def test_analytics_processor_from_parquet_file_run_in_one_local_process():
564561
graph.run_graph()
565562
assert node_step.results.analysis_data_generator, 'The node should have to run'
566563
assert node_step.results.analysis_data_generator().__len__() == 10_000, 'There should be 1000 rows in the data'
567-
OFFLOAD_TO_WORKER.value = True

flowfile_core/tests/flowfile/test_flowfile.py

Lines changed: 59 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,25 @@
1414
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
1515
from flowfile_core.flowfile.schema_callbacks import pre_calculate_pivot_schema
1616

17-
1817
import pytest
1918
from pathlib import Path
2019
from typing import List, Dict, Literal
2120
from copy import deepcopy
2221
from time import sleep
2322

23+
def find_parent_directory(target_dir_name,):
24+
"""Navigate up directories until finding the target directory"""
25+
current_path = Path(__file__)
26+
27+
while current_path != current_path.parent:
28+
if current_path.name == target_dir_name:
29+
return current_path
30+
if current_path.name == target_dir_name:
31+
return current_path
32+
current_path = current_path.parent
33+
34+
raise FileNotFoundError(f"Directory '{target_dir_name}' not found")
35+
2436
try:
2537
from tests.flowfile_core_test_utils import (is_docker_available, ensure_password_is_available)
2638
from tests.utils import ensure_cloud_storage_connection_is_available_and_get_connection
@@ -254,15 +266,22 @@ def test_opening_parquet_file(flow_logger: FlowLogger):
254266
def test_running_performance_mode():
255267
graph = create_graph()
256268
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
269+
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
257270
add_node_promise_on_type(graph, 'read', 1, 1)
258271
from flowfile_core.configs.flow_logger import main_logger
272+
received_table = input_schema.ReceivedTable(
273+
file_type='parquet', name='table.parquet',
274+
path=str(find_parent_directory("Flowfile")/'flowfile_core/tests/support_files/data/table.parquet'))
275+
from flowfile_core.configs.flow_logger import main_logger
259276
received_table = input_schema.ReceivedTable(
260277
file_type='parquet', name='table.parquet',
261278
path=str(find_parent_directory("Flowfile")/'flowfile_core/tests/support_files/data/table.parquet'))
262279
node_read = input_schema.NodeRead(flow_id=1, node_id=1, cache_data=False, received_file=received_table)
263280
graph.add_read(node_read)
264281
main_logger.warning(str(graph))
265282
main_logger.warning(OFFLOAD_TO_WORKER)
283+
main_logger.warning(str(graph))
284+
main_logger.warning(OFFLOAD_TO_WORKER)
266285
add_node_promise_on_type(graph, 'record_count', 2)
267286
connection = input_schema.NodeConnection.create_from_simple_input(1, 2)
268287
add_connection(graph, connection)
@@ -274,6 +293,7 @@ def test_running_performance_mode():
274293
graph.flow_settings.execution_mode = 'Development'
275294
slow = graph.run_graph()
276295

296+
277297
assert slow.node_step_result[1].run_time > fast.node_step_result[1].run_time, 'Performance mode should be faster'
278298

279299

@@ -325,11 +345,8 @@ def test_add_fuzzy_match():
325345

326346

327347
def test_add_fuzzy_match_lcoal():
328-
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
329-
330348
graph = create_graph()
331349
graph.flow_settings.execution_location = "local"
332-
OFFLOAD_TO_WORKER.value = False
333350
input_data = [{'name': 'eduward'},
334351
{'name': 'edward'},
335352
{'name': 'courtney'}]
@@ -356,7 +373,6 @@ def test_add_fuzzy_match_lcoal():
356373
'name_vs_name_right_levenshtein': [1.0, 0.8571428571428572, 1.0, 0.8571428571428572, 1.0]}
357374
)
358375
output_data.assert_equal(expected_data)
359-
OFFLOAD_TO_WORKER.value = True
360376

361377

362378
def test_add_record_count():
@@ -1164,12 +1180,16 @@ def tracking_method(*args, **kwargs):
11641180
handle_run_info(result)
11651181

11661182

1183+
@pytest.mark.skipif(not is_docker_available(), reason="Docker is not available or not running so database reader cannot be tested")
11671184
@pytest.mark.skipif(not is_docker_available(), reason="Docker is not available or not running so database reader cannot be tested")
11681185
def test_complex_cloud_write_scenario():
11691186

1187+
11701188
ensure_cloud_storage_connection_is_available_and_get_connection()
11711189
handler = FlowfileHandler()
11721190

1191+
flow_id = handler.import_flow(find_parent_directory("Flowfile") / "flowfile_core/tests/support_files/flows/test_cloud_local.flowfile")
1192+
11731193
flow_id = handler.import_flow(find_parent_directory("Flowfile") / "flowfile_core/tests/support_files/flows/test_cloud_local.flowfile")
11741194
graph = handler.get_flow(flow_id)
11751195
node= graph.get_node(3)
@@ -1194,26 +1214,6 @@ def test_no_re_calculate_example_data_after_change_no_run():
11941214
add_connection(graph, input_schema.NodeConnection.create_from_simple_input(from_id=1, to_id=3))
11951215
graph.run_graph()
11961216

1197-
1198-
def test_no_re_calculate_example_data_after_change_no_run():
1199-
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
1200-
1201-
OFFLOAD_TO_WORKER.value = False
1202-
1203-
graph = get_dependency_example()
1204-
graph.flow_settings.execution_location = "local"
1205-
graph.run_graph()
1206-
graph.add_formula(
1207-
input_schema.NodeFormula(
1208-
flow_id=1,
1209-
node_id=3,
1210-
function=transform_schema.FunctionInput(transform_schema.FieldInput(name="titleCity"),
1211-
function="titlecase([city])"),
1212-
)
1213-
)
1214-
add_connection(graph, input_schema.NodeConnection.create_from_simple_input(from_id=1, to_id=3))
1215-
graph.run_graph()
1216-
12171217
first_data = [row["titleCity"] for row in graph.get_node_data(3, True).main_output.data]
12181218
assert len(first_data) > 0, 'Data should be present'
12191219
graph.add_formula(
@@ -1235,12 +1235,8 @@ def test_no_re_calculate_example_data_after_change_no_run():
12351235

12361236
assert after_change_data_after_run != first_data, 'Data should be different after run'
12371237

1238-
OFFLOAD_TO_WORKER.value = True
1239-
12401238

12411239
def test_add_fuzzy_match_only_local():
1242-
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
1243-
OFFLOAD_TO_WORKER.value = False
12441240
graph = create_graph()
12451241
graph.flow_settings.execution_location = "local"
12461242
input_data = [{'name': 'eduward'},
@@ -1269,7 +1265,40 @@ def test_add_fuzzy_match_only_local():
12691265
'name_vs_name_right_levenshtein': [1.0, 0.8571428571428572, 1.0, 1.0, 0.8571428571428572]}
12701266
)
12711267
output_data.assert_equal(expected_data)
1272-
OFFLOAD_TO_WORKER.value = True
1268+
1269+
1270+
def test_changes_execution_mode(flow_logger):
1271+
settings = {'flow_id': 1, 'node_id': 1, 'pos_x': 304.8727272727273,
1272+
'pos_y': 549.5272727272727, 'is_setup': True, 'description': 'Test csv',
1273+
'received_file': {'id': None, 'name': 'fake_data.csv',
1274+
'path': str(find_parent_directory("Flowfile")/'flowfile_core/tests/support_files/data/fake_data.csv'),
1275+
'directory': None, 'analysis_file_available': False, 'status': None,
1276+
'file_type': 'csv', 'fields': [], 'reference': '', 'starting_from_line': 0,
1277+
'delimiter': ',', 'has_headers': True, 'encoding': 'utf-8', 'parquet_ref': None,
1278+
'row_delimiter': '', 'quote_char': '', 'infer_schema_length': 20000,
1279+
'truncate_ragged_lines': False, 'ignore_errors': False, 'sheet_name': None,
1280+
'start_row': 0, 'start_column': 0, 'end_row': 0, 'end_column': 0,
1281+
'type_inference': False}}
1282+
graph = create_graph()
1283+
flow_logger.warning(str(graph))
1284+
add_node_promise_on_type(graph, 'read', 1)
1285+
input_file = input_schema.NodeRead(**settings)
1286+
graph.add_read(input_file)
1287+
run_info = graph.run_graph()
1288+
handle_run_info(run_info)
1289+
graph.add_select(select_settings=input_schema.NodeSelect(flow_id=1, node_id=2,
1290+
select_input=[transform_schema.SelectInput("City")],
1291+
keep_missing=True))
1292+
add_connection(graph, input_schema.NodeConnection.create_from_simple_input(1, 2))
1293+
explain_node_2 = graph.get_node(2).get_resulting_data().data_frame.explain()
1294+
assert "flowfile_core/tests/support_files/data/fake_data.csv" not in explain_node_2
1295+
graph.execution_location = "local"
1296+
1297+
explain_node_2 = graph.get_node(2).get_resulting_data().data_frame.explain()
1298+
# now it should read from the actual source, since we do not cache the data with the external worker
1299+
1300+
assert "flowfile_core/tests/support_files/data/fake_data.csv" in explain_node_2
1301+
12731302

12741303

12751304
def test_fuzzy_match_schema_predict(flow_logger):

flowfile_frame/flowfile_frame/__init__.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
# flowframe/__init__.py
22
"""A Polars-like API for building ETL graphs."""
33

4-
from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
5-
OFFLOAD_TO_WORKER.value = False
6-
74
# Core classes
85
from flowfile_frame.flow_frame import FlowFrame # noqa: F401
96
from pl_fuzzy_frame_match.models import FuzzyMapping # noqa: F401

0 commit comments

Comments
 (0)