93 changes: 50 additions & 43 deletions autoPyTorch/api/base_task.py
@@ -27,7 +27,7 @@

import pandas as pd

-from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue
+from smac.runhistory.runhistory import DataOrigin, RunHistory
from smac.stats.stats import Stats
from smac.tae import StatusType

@@ -238,7 +238,7 @@ def __init__(
" HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates)))

@abstractmethod
-def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline:
+def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> BasePipeline:
"""
Build pipeline according to current task
and for the passed dataset properties
@@ -486,11 +486,16 @@ def _load_models(self) -> bool:
raise ValueError("Resampling strategy is needed to determine what models to load")
self.ensemble_ = self._backend.load_ensemble(self.seed)

-        if isinstance(self._disable_file_output, List):
-            disabled_file_outputs = self._disable_file_output
-            disable_file_output = False
-        elif isinstance(self._disable_file_output, bool):
-            disable_file_output = self._disable_file_output
-            disabled_file_outputs = []
+        # TODO: remove this code after `fit_pipeline` is rebased.
+        if hasattr(self, '_disable_file_output'):
+            if isinstance(self._disable_file_output, List):
+                disabled_file_outputs = self._disable_file_output
+                disable_file_output = False
+            elif isinstance(self._disable_file_output, bool):
+                disable_file_output = self._disable_file_output
+                disabled_file_outputs = []
+        else:
+            disable_file_output = False
+            disabled_file_outputs = []

# If no ensemble is loaded, try to get the best performing model
@@ -794,18 +799,15 @@ def run_traditional_ml(
learning algorithm runs over the time limit.
"""
assert self._logger is not None # for mypy compliancy
-        if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
-            self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
-        else:
-            traditional_task_name = 'runTraditional'
-            self._stopwatch.start_task(traditional_task_name)
-            elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
-            time_for_traditional = int(runtime_limit - elapsed_time)
-            self._do_traditional_prediction(
-                func_eval_time_limit_secs=func_eval_time_limit_secs,
-                time_left=time_for_traditional,
-            )
-            self._stopwatch.stop_task(traditional_task_name)
+        traditional_task_name = 'runTraditional'
+        self._stopwatch.start_task(traditional_task_name)
+        elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
+        time_for_traditional = int(runtime_limit - elapsed_time)
+        self._do_traditional_prediction(
+            func_eval_time_limit_secs=func_eval_time_limit_secs,
+            time_left=time_for_traditional,
+        )
+        self._stopwatch.stop_task(traditional_task_name)

def _search(
self,
@@ -1165,22 +1167,7 @@ def _search(
self._logger.info("Starting Shutdown")

if proc_ensemble is not None:
-            self._results_manager.ensemble_performance_history = list(proc_ensemble.history)
-
-            if len(proc_ensemble.futures) > 0:
-                # Also add ensemble runs that did not finish within smac time
-                # and add them into the ensemble history
-                self._logger.info("Ensemble script still running, waiting for it to finish.")
-                result = proc_ensemble.futures.pop().result()
-                if result:
-                    ensemble_history, _, _, _ = result
-                    self._results_manager.ensemble_performance_history.extend(ensemble_history)
-                self._logger.info("Ensemble script finished, continue shutdown.")
-
-            # save the ensemble performance history file
-            if len(self.ensemble_performance_history) > 0:
-                pd.DataFrame(self.ensemble_performance_history).to_json(
-                    os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+            self._collect_results_ensemble(proc_ensemble)

if load_models:
self._logger.info("Loading models...")
@@ -1321,7 +1308,7 @@ def fit(self,
exclude=self.exclude_components,
search_space_updates=self.search_space_updates)
dataset_properties = dataset.get_dataset_properties(dataset_requirements)
-        self._backend.replace_datamanager(dataset)
+        self._backend.save_datamanager(dataset)

# build pipeline
pipeline = self.build_pipeline(dataset_properties)
@@ -1339,7 +1326,6 @@ def fit_ensemble(
self._clean_logger()
return pipeline

-
def fit_ensemble(
self,
optimize_metric: Optional[str] = None,
@@ -1418,7 +1404,7 @@ def fit_ensemble(
ensemble_fit_task_name = 'EnsembleFit'
self._stopwatch.start_task(ensemble_fit_task_name)
if enable_traditional_pipeline:
-if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task:
+if func_eval_time_limit_secs > time_for_task:
self._logger.warning(
'Time limit for a single run is higher than total time '
'limit. Capping the limit for a single run to the total '
@@ -1459,12 +1445,8 @@
)

manager.build_ensemble(self._dask_client)
-        future = manager.futures.pop()
-        result = future.result()
-        if result is None:
-            raise ValueError("Errors occurred while building the ensemble - please"
-                             " check the log file and command line output for error messages.")
-        self.ensemble_performance_history, _, _, _ = result
+        if manager is not None:
+            self._collect_results_ensemble(manager)

if load_models:
self._load_models()
@@ -1542,6 +1524,31 @@ def _init_ensemble_builder(

return proc_ensemble

+    def _collect_results_ensemble(
+        self,
+        manager: EnsembleBuilderManager
+    ) -> None:
+
+        if self._logger is None:
+            raise ValueError("logger should be initialized to fit ensemble")
+
+        self._results_manager.ensemble_performance_history = list(manager.history)
+
+        if len(manager.futures) > 0:
+            # Also add ensemble runs that did not finish within smac time
+            # and add them into the ensemble history
+            self._logger.info("Ensemble script still running, waiting for it to finish.")
+            result = manager.futures.pop().result()
+            if result:
+                ensemble_history, _, _, _ = result
+                self._results_manager.ensemble_performance_history.extend(ensemble_history)
+            self._logger.info("Ensemble script finished, continue shutdown.")
+
+        # save the ensemble performance history file
+        if len(self.ensemble_performance_history) > 0:
+            pd.DataFrame(self.ensemble_performance_history).to_json(
+                os.path.join(self._backend.internals_directory, 'ensemble_history.json'))

def predict(
self,
X_test: np.ndarray,
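Taken together, the base_task.py changes deduplicate the ensemble-result collection from _search and fit_ensemble into the new _collect_results_ensemble helper, and guard _load_models against a missing _disable_file_output attribute. That guard is easiest to read in isolation; below is a standalone sketch of the same list/bool/missing normalization. The helper name and the None-for-missing convention are hypothetical, not part of the PR:

from typing import Any, List, Tuple


def normalise_disable_file_output(value: Any = None) -> Tuple[bool, List[str]]:
    # Hypothetical helper restating the guarded block in _load_models;
    # value=None stands in for the attribute being absent (the hasattr case).
    if value is None:
        return False, []     # attribute missing: keep all file output enabled
    if isinstance(value, list):
        return False, value  # a list disables individual outputs by name
    if isinstance(value, bool):
        return value, []     # a bool toggles all file output at once
    raise TypeError("Expected list or bool, got {}".format(type(value)))


assert normalise_disable_file_output() == (False, [])
assert normalise_disable_file_output(True) == (True, [])
assert normalise_disable_file_output(['model']) == (False, ['model'])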
38 changes: 21 additions & 17 deletions autoPyTorch/api/tabular_classification.py
@@ -13,6 +13,7 @@
TASK_TYPES_TO_STRING,
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
@@ -109,7 +110,7 @@ def __init__(
task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
)

-def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline:
+def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> TabularClassificationPipeline:
"""
Build pipeline according to current task and for the passed dataset properties

@@ -120,16 +121,7 @@ def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassific
TabularClassificationPipeline:
Pipeline compatible with the given dataset properties.
"""
-
-    def build_pipeline(self, dataset_properties: Dict[str, Any],
-                       include_components: Optional[Dict] = None,
-                       exclude_components: Optional[Dict] = None,
-                       search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
-                       ) -> TabularClassificationPipeline:
-        return TabularClassificationPipeline(dataset_properties=dataset_properties,
-                                             include=include_components,
-                                             exclude=exclude_components,
-                                             search_space_updates=search_space_updates)
+        return TabularClassificationPipeline(dataset_properties=dataset_properties)

def search(
self,
@@ -281,6 +273,18 @@ def search(
self

"""
+        if dataset_name is None:
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        # we have to create a logger for at this point for the validator
+        self._logger = self._get_logger(dataset_name)
+
+        # Create a validator object to make sure that the data provided by
+        # the user matches the autopytorch requirements
+        self.InputValidator = TabularInputValidator(
+            is_classification=True,
+            logger_port=self._logger_port,
+        )

# Fit a input validator to check the provided data
# Also, an encoder is fit to both train and test data,
@@ -303,9 +307,9 @@
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
)


if self.dataset is None:
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))

return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
@@ -345,24 +349,24 @@ def predict(
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator fit() method.")

-X_test = self.input_validator.feature_validator.transform(X_test)
+X_test = self.InputValidator.feature_validator.transform(X_test)
predicted_probabilities = super().predict(X_test, batch_size=batch_size,
n_jobs=n_jobs)

-if self.input_validator.target_validator.is_single_column_target():
+if self.InputValidator.target_validator.is_single_column_target():
predicted_indexes = np.argmax(predicted_probabilities, axis=1)
else:
predicted_indexes = (predicted_probabilities > 0.5).astype(int)

# Allow to predict in the original domain -- that is, the user is not interested
# in our encoded values
-return self.input_validator.target_validator.inverse_transform(predicted_indexes)
+return self.InputValidator.target_validator.inverse_transform(predicted_indexes)

def predict_proba(self,
X_test: Union[np.ndarray, pd.DataFrame, List],
batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
-if self.input_validator is None or not self.input_validator._is_fitted:
+if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator fit() method.")
-X_test = self.input_validator.feature_validator.transform(X_test)
+X_test = self.InputValidator.feature_validator.transform(X_test)
return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs)
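The predict() and predict_proba() changes above route every input through the fitted InputValidator before delegating to the ensemble. A minimal end-to-end sketch, assuming the public TabularClassificationTask API; the data and time limits are illustrative only:

import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

# Toy data; any array-like the validator accepts would work here.
X_train = np.random.rand(100, 4)
y_train = np.random.randint(0, 2, size=100)
X_test = np.random.rand(20, 4)

api = TabularClassificationTask()
api.search(
    X_train=X_train,
    y_train=y_train,
    optimize_metric='accuracy',
    total_walltime_limit=300,
    func_eval_time_limit_secs=50,
)

# Both calls transform X_test through self.InputValidator; predict()
# additionally inverse-transforms class indexes to the original labels.
probabilities = api.predict_proba(X_test)
labels = api.predict(X_test)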
22 changes: 14 additions & 8 deletions autoPyTorch/api/tabular_regression.py
@@ -13,6 +13,7 @@
TASK_TYPES_TO_STRING
)
from autoPyTorch.data.tabular_validator import TabularInputValidator
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.datasets.resampling_strategy import (
CrossValTypes,
HoldoutValTypes,
@@ -81,9 +82,9 @@ def __init__(
delete_output_folder_after_terminate: bool = True,
include_components: Optional[Dict] = None,
exclude_components: Optional[Dict] = None,
-        resampling_strategy:Union[CrossValTypes,
-                                  HoldoutValTypes,
-                                  NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
+        resampling_strategy: Union[CrossValTypes,
+                                   HoldoutValTypes,
+                                   NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
backend: Optional[Backend] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
@@ -109,7 +110,7 @@
task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION],
)

-def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline:
+def build_pipeline(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> TabularRegressionPipeline:
"""
Build pipeline according to current task and for the passed dataset properties

@@ -272,6 +273,11 @@ def search(
self

"""
+        if dataset_name is None:
+            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        # we have to create a logger for at this point for the validator
+        self._logger = self._get_logger(dataset_name)

# Create a validator object to make sure that the data provided by
# the user matches the autopytorch requirements
Expand Down Expand Up @@ -301,9 +307,9 @@ def search(
'(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
)


if self.dataset is None:
raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))

return self._search(
dataset=self.dataset,
optimize_metric=optimize_metric,
Expand All @@ -329,14 +335,14 @@ def predict(
batch_size: Optional[int] = None,
n_jobs: int = 1
) -> np.ndarray:
-if self.input_validator is None or not self.input_validator._is_fitted:
+if self.InputValidator is None or not self.InputValidator._is_fitted:
raise ValueError("predict() is only supported after calling search. Kindly call first "
"the estimator fit() method.")

-X_test = self.input_validator.feature_validator.transform(X_test)
+X_test = self.InputValidator.feature_validator.transform(X_test)
predicted_values = super().predict(X_test, batch_size=batch_size,
n_jobs=n_jobs)

# Allow to predict in the original domain -- that is, the user is not interested
# in our encoded values
-return self.input_validator.target_validator.inverse_transform(predicted_values)
+return self.InputValidator.target_validator.inverse_transform(predicted_values)
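Both search() implementations now default dataset_name the same way; the snippet below shows just that default, using only the standard library:

import os
import uuid

# uuid1 embeds a timestamp; passing the process id as the clock sequence
# keeps default dataset names distinct across workers on the same host.
dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))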
1 change: 0 additions & 1 deletion autoPyTorch/data/base_target_validator.py
@@ -98,7 +98,6 @@ def fit(
np.shape(y_test)
))
if isinstance(y_train, pd.DataFrame):
-y_train = cast(pd.DataFrame, y_train)
y_test = cast(pd.DataFrame, y_test)
if y_train.columns.tolist() != y_test.columns.tolist():
raise ValueError(
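The deleted line is safe to drop because typing.cast is a runtime no-op; a quick demonstration:

from typing import cast

import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
# cast() returns its argument unchanged at runtime, so removing the
# y_train cast inside the isinstance() branch changes no behaviour;
# only the hint given to the type checker goes away.
assert cast(pd.DataFrame, df) is df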
7 changes: 2 additions & 5 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -1,5 +1,5 @@
import functools
-from typing import Dict, List, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, Type, Union, cast

import numpy as np

@@ -263,7 +263,7 @@ def transform(
X = self.numpy_to_pandas(X)

if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
-X = cast(pd.DataFrame, X)
+X = cast(Type[pd.DataFrame], X)

# Check the data here so we catch problems on new test data
self._check_data(X)
@@ -391,9 +391,6 @@ def _get_columns_info(
Type of each column numerical/categorical
"""
-
-        if len(self.transformed_columns) > 0 and self.feat_type is not None:
-            return self.transformed_columns, self.feat_type

# Register if a column needs encoding
numerical_columns = []
categorical_columns = []
2 changes: 1 addition & 1 deletion autoPyTorch/data/tabular_target_validator.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, cast
+from typing import List, Optional, cast

import numpy as np

7 changes: 5 additions & 2 deletions autoPyTorch/datasets/resampling_strategy.py
@@ -92,7 +92,10 @@ class NoResamplingStrategyTypes(IntEnum):
RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]


-DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[HoldoutValTypes, CrossValTypes], Dict[str, Any]] = {
+DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[CrossValTypes,
+                                          HoldoutValTypes,
+                                          NoResamplingStrategyTypes],
+                                    Dict[str, Any]] = {
HoldoutValTypes.holdout_validation: {
'val_share': 0.33,
},
@@ -117,7 +120,7 @@ class NoResamplingStrategyTypes(IntEnum):
NoResamplingStrategyTypes.shuffle_no_resampling: {
'shuffle': True
}
-} # type: Dict[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes], Dict[str, Any]]
+}


class HoldOutFuncs():
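With the widened annotation, defaults for all three strategy enums resolve through the same mapping. A small sketch, using the import path of the file above and the values visible in the diff:

from autoPyTorch.datasets.resampling_strategy import (
    DEFAULT_RESAMPLING_PARAMETERS,
    HoldoutValTypes,
    NoResamplingStrategyTypes,
)

# Both lookups now type-check against the same Dict[Union[...], ...] key.
holdout = DEFAULT_RESAMPLING_PARAMETERS[HoldoutValTypes.holdout_validation]
print(holdout['val_share'])  # 0.33, per the mapping above

no_resampling = DEFAULT_RESAMPLING_PARAMETERS[
    NoResamplingStrategyTypes.shuffle_no_resampling]
print(no_resampling['shuffle'])  # True, per the mapping above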