93 changes: 50 additions & 43 deletions autoPyTorch/api/base_task.py
@@ -27,7 +27,7 @@

import pandas as pd

-from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue
+from smac.runhistory.runhistory import DataOrigin, RunHistory
from smac.stats.stats import Stats
from smac.tae import StatusType

@@ -238,7 +238,7 @@ def __init__(
" HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates)))

@abstractmethod
-def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline:
+def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> BasePipeline:
"""
Build pipeline according to current task
and for the passed dataset properties
@@ -486,11 +486,16 @@ def _load_models(self) -> bool:
raise ValueError("Resampling strategy is needed to determine what models to load")
self.ensemble_ = self._backend.load_ensemble(self.seed)

-        if isinstance(self._disable_file_output, List):
-            disabled_file_outputs = self._disable_file_output
-            disable_file_output = False
-        elif isinstance(self._disable_file_output, bool):
-            disable_file_output = self._disable_file_output
-            disabled_file_outputs = []
+        # TODO: remove this code after `fit_pipeline` is rebased.
+        if hasattr(self, '_disable_file_output'):
+            if isinstance(self._disable_file_output, List):
+                disabled_file_outputs = self._disable_file_output
+                disable_file_output = False
+            elif isinstance(self._disable_file_output, bool):
+                disable_file_output = self._disable_file_output
+                disabled_file_outputs = []
+        else:
+            disable_file_output = False
+            disabled_file_outputs = []

# If no ensemble is loaded, try to get the best performing model
@@ -794,18 +799,15 @@ def run_traditional_ml(
learning algorithm runs over the time limit.
"""
assert self._logger is not None # for mypy compliancy
-        if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
-            self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
-        else:
-            traditional_task_name = 'runTraditional'
-            self._stopwatch.start_task(traditional_task_name)
-            elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
-            time_for_traditional = int(runtime_limit - elapsed_time)
-            self._do_traditional_prediction(
-                func_eval_time_limit_secs=func_eval_time_limit_secs,
-                time_left=time_for_traditional,
-            )
-            self._stopwatch.stop_task(traditional_task_name)
+        traditional_task_name = 'runTraditional'
+        self._stopwatch.start_task(traditional_task_name)
+        elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
+        time_for_traditional = int(runtime_limit - elapsed_time)
+        self._do_traditional_prediction(
+            func_eval_time_limit_secs=func_eval_time_limit_secs,
+            time_left=time_for_traditional,
+        )
+        self._stopwatch.stop_task(traditional_task_name)

def _search(
self,
@@ -1165,22 +1167,7 @@ def _search(
self._logger.info("Starting Shutdown")

if proc_ensemble is not None:
-            self._results_manager.ensemble_performance_history = list(proc_ensemble.history)
-
-            if len(proc_ensemble.futures) > 0:
-                # Also add ensemble runs that did not finish within smac time
-                # and add them into the ensemble history
-                self._logger.info("Ensemble script still running, waiting for it to finish.")
-                result = proc_ensemble.futures.pop().result()
-                if result:
-                    ensemble_history, _, _, _ = result
-                    self._results_manager.ensemble_performance_history.extend(ensemble_history)
-                self._logger.info("Ensemble script finished, continue shutdown.")
-
-            # save the ensemble performance history file
-            if len(self.ensemble_performance_history) > 0:
-                pd.DataFrame(self.ensemble_performance_history).to_json(
-                    os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+            self._collect_results_ensemble(proc_ensemble)

if load_models:
self._logger.info("Loading models...")
@@ -1321,7 +1308,7 @@ def fit(self,
exclude=self.exclude_components,
search_space_updates=self.search_space_updates)
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
-        self._backend.replace_datamanager(dataset)
+        self._backend.save_datamanager(dataset)

# build pipeline
pipeline = self.build_pipeline(dataset_properties)
@@ -1339,7 +1326,6 @@ def fit_ensemble(
self._clean_logger()
return pipeline

-
def fit_ensemble(
self,
optimize_metric: Optional[str] = None,
@@ -1418,7 +1404,7 @@ def fit_ensemble(
ensemble_fit_task_name = 'EnsembleFit'
self._stopwatch.start_task(ensemble_fit_task_name)
if enable_traditional_pipeline:
-if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task:
+if func_eval_time_limit_secs > time_for_task:
self._logger.warning(
'Time limit for a single run is higher than total time '
'limit. Capping the limit for a single run to the total '
@@ -1459,12 +1445,8 @@
)

manager.build_ensemble(self._dask_client)
-        future = manager.futures.pop()
-        result = future.result()
-        if result is None:
-            raise ValueError("Errors occurred while building the ensemble - please"
-                             " check the log file and command line output for error messages.")
-        self.ensemble_performance_history, _, _, _ = result
+        if manager is not None:
+            self._collect_results_ensemble(manager)

if load_models:
self._load_models()
@@ -1542,6 +1524,31 @@ def _init_ensemble_builder(

return proc_ensemble

+    def _collect_results_ensemble(
+        self,
+        manager: EnsembleBuilderManager
+    ) -> None:
+
+        if self._logger is None:
+            raise ValueError("logger should be initialized to fit ensemble")
+
+        self._results_manager.ensemble_performance_history = list(manager.history)
+
+        if len(manager.futures) > 0:
+            # Also add ensemble runs that did not finish within smac time
+            # and add them into the ensemble history
+            self._logger.info("Ensemble script still running, waiting for it to finish.")
+            result = manager.futures.pop().result()
+            if result:
+                ensemble_history, _, _, _ = result
+                self._results_manager.ensemble_performance_history.extend(ensemble_history)
+            self._logger.info("Ensemble script finished, continue shutdown.")
+
+        # save the ensemble performance history file
+        if len(self.ensemble_performance_history) > 0:
+            pd.DataFrame(self.ensemble_performance_history).to_json(
+                os.path.join(self._backend.internals_directory, 'ensemble_history.json'))

def predict(
self,
X_test: np.ndarray,
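Taken together, the base_task.py changes deduplicate the ensemble-result collection from _search and fit_ensemble into the new _collect_results_ensemble helper, and guard _load_models against a missing _disable_file_output attribute. That guard is easiest to read in isolation; below is a standalone sketch of the same list/bool/missing normalization. The helper name and the None-for-missing convention are hypothetical, not part of the PR:

from typing import Any, List, Tuple


def normalise_disable_file_output(value: Any = None) -> Tuple[bool, List[str]]:
    # Hypothetical helper restating the guarded block in _load_models;
    # value=None stands in for the attribute being absent (the hasattr case).
    if value is None:
        return False, []     # attribute missing: keep all file output enabled
    if isinstance(value, list):
        return False, value  # a list disables individual outputs by name
    if isinstance(value, bool):
        return value, []     # a bool toggles all file output at once
    raise TypeError("Expected list or bool, got {}".format(type(value)))


assert normalise_disable_file_output() == (False, [])
assert normalise_disable_file_output(True) == (True, [])
assert normalise_disable_file_output(['model']) == (False, ['model'])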
38 changes: 21 additions & 17 deletions autoPyTorch/api/tabular_classification.py
@@ -13,6 +13,7 @@
TASK_TYPES_TO_STRING,
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
@@ -109,7 +110,7 @@ def __init__(
task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
)

-def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
+def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> TabularClassificationPipeline:
"""
Build pipeline according to current task and for the passed dataset properties

@@ -120,16 +121,7 @@ def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassific
TabularClassificationPipeline:
Pipeline compatible with the given dataset properties.
"""
-
-    def build_pipeline(self, dataset_properties: Dict[str, Any],
-                       include_components: Optional[Dict] = None,
-                       exclude_components: Optional[Dict] = None,
-                       search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
-                       ) -> TabularClassificationPipeline:
-        return TabularClassificationPipeline(dataset_properties=dataset_properties,
-                                             include=include_components,
-                                             exclude=exclude_components,
-                                             search_space_updates=search_space_updates)
+        return TabularClassificationPipeline(dataset_properties=dataset_properties)

def search(
self,
@@ -281,6 +273,18 @@ def search(
self

"""
+        if dataset_name is None:
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        # we have to create a logger for at this point for the validator
+        self._logger = self._get_logger(dataset_name)
+
+        # Create a validator object to make sure that the data provided by
+        # the user matches the autopytorch requirements
+        self.InputValidator = TabularInputValidator(
+            is_classification=True,
+            logger_port=self._logger_port,
+        )

# Fit a input validator to check the provided data
# Also, an encoder is fit to both train and test data,
@@ -303,9 +307,9 @@
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
)


if self.dataset is None:
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))

return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
@@ -345,24 +349,24 @@ def predict(
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator fit() method.")

-X_test = self.input_validator.feature_validator.transform(X_test)
+X_test = self.InputValidator.feature_validator.transform(X_test)
predicted_probabilities = super().predict(X_test, batch_size=batch_size,
n_jobs=n_jobs)

-if self.input_validator.target_validator.is_single_column_target():
+if self.InputValidator.target_validator.is_single_column_target():
predicted_indexes = np.argmax(predicted_probabilities, axis=1)
else:
predicted_indexes = (predicted_probabilities > 0.5).astype(int)

# Allow to predict in the original domain -- that is, the user is not interested
# in our encoded values
-return self.input_validator.target_validator.inverse_transform(predicted_indexes)
+return self.InputValidator.target_validator.inverse_transform(predicted_indexes)

def predict_proba(self,
X_test: Union[np.ndarray, pd.DataFrame, List],
batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
-if self.input_validator is None or not self.input_validator._is_fitted:
+if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator fit() method.")
-X_test = self.input_validator.feature_validator.transform(X_test)
+X_test = self.InputValidator.feature_validator.transform(X_test)
return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)
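The predict() and predict_proba() changes above route every input through the fitted InputValidator before delegating to the ensemble. A minimal end-to-end sketch, assuming the public TabularClassificationTask API; the data and time limits are illustrative only:

import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Toy data; any array-like the validator accepts would work here.
X_train = np.random.rand(100, 4)
y_train = np.random.randint(0, 2, size=100)
X_test = np.random.rand(20, 4)

api = TabularClassificationTask()
api.search(
    X_train=X_train,
    y_train=y_train,
    optimize_metric='accuracy',
    total_walltime_limit=300,
    func_eval_time_limit_secs=50,
)

# Both calls transform X_test through self.InputValidator; predict()
# additionally inverse-transforms class indexes to the original labels.
probabilities = api.predict_proba(X_test)
labels = api.predict(X_test)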
22 changes: 14 additions & 8 deletions autoPyTorch/api/tabular_regression.py
@@ -13,6 +13,7 @@
TASK_TYPES_TO_STRING
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
@@ -81,9 +82,9 @@ def __init__(
delete_output_folder_after_terminate: bool = True,
include_components: Optional[Dict] = None,
exclude_components: Optional[Dict] = None,
-        resampling_strategy:Union[CrossValTypes,
-                                  HoldoutValTypes,
-                                  NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
+        resampling_strategy: Union[CrossValTypes,
+                                   HoldoutValTypes,
+                                   NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
backend: Optional[Backend] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
@@ -109,7 +110,7 @@
task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION],
)

-def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline:
+def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> TabularRegressionPipeline:
"""
Build pipeline according to current task and for the passed dataset properties

@@ -272,6 +273,11 @@ def search(
self

"""
+        if dataset_name is None:
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        # we have to create a logger for at this point for the validator
+        self._logger = self._get_logger(dataset_name)

# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
Expand Down Expand Up @@ -301,9 +307,9 @@ def search(
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
)


if self.dataset is None:
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))

return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
Expand All @@ -329,14 +335,14 @@ def predict(
batch_size: Optional[int] = None,
n_jobs: int = 1
) -> np.ndarray:
-if self.input_validator is None or not self.input_validator._is_fitted:
+if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator fit() method.")

-X_test = self.input_validator.feature_validator.transform(X_test)
+X_test = self.InputValidator.feature_validator.transform(X_test)
predicted_values = super().predict(X_test, batch_size=batch_size,
n_jobs=n_jobs)

# Allow to predict in the original domain -- that is, the user is not interested
# in our encoded values
-return self.input_validator.target_validator.inverse_transform(predicted_values)
+return self.InputValidator.target_validator.inverse_transform(predicted_values)
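Both search() implementations now default dataset_name the same way; the snippet below shows just that default, using only the standard library:

import os
import uuid

# uuid1 embeds a timestamp; passing the process id as the clock sequence
# keeps default dataset names distinct across workers on the same host.
dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))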
1 change: 0 additions & 1 deletion autoPyTorch/data/base_target_validator.py
@@ -98,7 +98,6 @@ def fit(
np.shape(y_test)
))
if isinstance(y_train, pd.DataFrame):
-y_train = cast(pd.DataFrame, y_train)
y_test = cast(pd.DataFrame, y_test)
if y_train.columns.tolist() != y_test.columns.tolist():
raise ValueError(
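The deleted line is safe to drop because typing.cast is a runtime no-op; a quick demonstration:

from typing import cast

import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
# cast() returns its argument unchanged at runtime, so removing the
# y_train cast inside the isinstance() branch changes no behaviour;
# only the hint given to the type checker goes away.
assert cast(pd.DataFrame, df) is df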
7 changes: 2 additions & 5 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -1,5 +1,5 @@
import functools
-from typing import Dict, List, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, Type, Union, cast

import numpy as np

@@ -263,7 +263,7 @@ def transform(
X = self.numpy_to_pandas(X)

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
-X = cast(pd.DataFrame, X)
+X = cast(Type[pd.DataFrame], X)

# Check the data here so we catch problems on new test data
self._check_data(X)
@@ -391,9 +391,6 @@ def _get_columns_info(
Type of each column numerical/categorical
"""
-
-        if len(self.transformed_columns) > 0 and self.feat_type is not None:
-            return self.transformed_columns, self.feat_type

# Register if a column needs encoding
numerical_columns = []
categorical_columns = []
2 changes: 1 addition & 1 deletion autoPyTorch/data/tabular_target_validator.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, cast
+from typing import List, Optional, cast

import numpy as np

7 changes: 5 additions & 2 deletions autoPyTorch/datasets/resampling_strategy.py
@@ -92,7 +92,10 @@ class NoResamplingStrategyTypes(IntEnum):
RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]


-DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[HoldoutValTypes, CrossValTypes], Dict[str, Any]] = {
+DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[CrossValTypes,
+                                          HoldoutValTypes,
+                                          NoResamplingStrategyTypes],
+                                    Dict[str, Any]] = {
HoldoutValTypes.holdout_validation: {
'val_share': 0.33,
},
@@ -117,7 +120,7 @@ class NoResamplingStrategyTypes(IntEnum):
NoResamplingStrategyTypes.shuffle_no_resampling: {
'shuffle': True
}
-} # type: Dict[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes], Dict[str, Any]]
+}


class HoldOutFuncs():
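With the widened annotation, defaults for all three strategy enums resolve through the same mapping. A small sketch, using the import path of the file above and the values visible in the diff:

from autoPyTorch.datasets.resampling_strategy import (
    DEFAULT_RESAMPLING_PARAMETERS,
    HoldoutValTypes,
    NoResamplingStrategyTypes,
)

# Both lookups now type-check against the same Dict[Union[...], ...] key.
holdout = DEFAULT_RESAMPLING_PARAMETERS[HoldoutValTypes.holdout_validation]
print(holdout['val_share'])  # 0.33, per the mapping above

no_resampling = DEFAULT_RESAMPLING_PARAMETERS[
    NoResamplingStrategyTypes.shuffle_no_resampling]
print(no_resampling['shuffle'])  # True, per the mapping above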