diff --git a/build.cmd b/build.cmd index d2c350ba..c34f8e77 100644 --- a/build.cmd +++ b/build.cmd @@ -21,6 +21,7 @@ set BoostRoot=%DependenciesDir%BoostDbg3.7 set PythonVersion=3.7 set PythonTag=cp37 set RunTests=False +set InstallPythonPackages=False set RunExtendedTests=False set BuildDotNetBridgeOnly=False set SkipDotNetBridge=False @@ -33,6 +34,11 @@ if /i [%1] == [--configuration] ( ) if /i [%1] == [--runTests] ( set RunTests=True + set InstallPythonPackages=True + shift && goto :Arg_Loop +) +if /i [%1] == [--installPythonPackages] ( + set InstallPythonPackages=True shift && goto :Arg_Loop ) if /i [%1] == [--includeExtendedTests] ( @@ -58,6 +64,7 @@ echo "" echo "Options:" echo " --configuration Build Configuration (DbgWinPy3.7,DbgWinPy3.6,DbgWinPy3.5,DbgWinPy2.7,RlsWinPy3.7,RlsWinPy3.6,RlsWinPy3.5,RlsWinPy2.7)" echo " --runTests Run tests after build" +echo " --installPythonPackages Install python packages after build" echo " --includeExtendedTests Include the extended tests if the tests are run" echo " --buildDotNetBridgeOnly Build only DotNetBridge" echo " --skipDotNetBridge Build everything except DotNetBridge" @@ -157,7 +164,7 @@ if /i [%1] == [DbgWinPy2.7] ( :Build :: Install dotnet SDK version, see https://docs.microsoft.com/en-us/dotnet/core/tools/dotnet-install-script echo Installing dotnet SDK ... -powershell -NoProfile -ExecutionPolicy unrestricted -Command "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; &([scriptblock]::Create((Invoke-WebRequest -useb 'https://dot.net/v1/dotnet-install.ps1'))) -Version 2.1.200 -InstallDir ./cli" +powershell -NoProfile -ExecutionPolicy unrestricted -Command "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; &([scriptblock]::Create((Invoke-WebRequest -useb 'https://dot.net/v1/dotnet-install.ps1'))) -Version 2.1.701 -InstallDir ./cli" set _dotnetRoot=%__currentScriptDir%cli @@ -328,6 +335,19 @@ md "%__currentScriptDir%target" copy "%__currentScriptDir%src\python\dist\%WheelFile%" "%__currentScriptDir%target\%WheelFile%" echo Python package successfully created: %__currentScriptDir%target\%WheelFile% +if "%InstallPythonPackages%" == "True" ( + echo "" + echo "#################################" + echo "Installing python packages ... " + echo "#################################" + call "%PythonExe%" -m pip install --upgrade nose pytest graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0" + if %PythonVersion% == 2.7 ( call "%PythonExe%" -m pip install --upgrade pyzmq ) + :: Run azureml-dataprep tests only in python 3.7 as it's an optional dependency + if %PythonVersion% == 3.7 ( call "%PythonExe%" -m pip install --upgrade azureml-dataprep ) + call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%" + call "%PythonExe%" -m pip install "scikit-learn==0.19.2" +) + if "%RunTests%" == "False" ( goto :Exit_Success ) @@ -337,11 +357,6 @@ echo "" echo "#################################" echo "Running tests ...
" echo "#################################" -call "%PythonExe%" -m pip install --upgrade nose pytest graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0" -if %PythonVersion% == 2.7 ( call "%PythonExe%" -m pip install --upgrade pyzmq ) -call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%" -call "%PythonExe%" -m pip install "scikit-learn==0.19.2" - set PackagePath=%PythonRoot%\Lib\site-packages\nimbusml set TestsPath1=%PackagePath%\tests set TestsPath2=%__currentScriptDir%src\python\tests diff --git a/build.sh b/build.sh index 78de9ff8..2689f233 100755 --- a/build.sh +++ b/build.sh @@ -16,6 +16,7 @@ usage() echo "Options:" echo " --configuration Build Configuration (DbgLinPy3.7,DbgLinPy3.6,DbgLinPy3.5,DbgLinPy2.7,RlsLinPy3.7,RlsLinPy3.6,RlsLinPy3.5,RlsLinPy2.7,DbgMacPy3.7,DbgMacPy3.6,DbgMacPy3.5,DbgMacPy2.7,RlsMacPy3.7,RlsMacPy3.6,RlsMacPy3.5,RlsMacPy2.7)" echo " --runTests Run tests after build" + echo " --installPythonPackages Install python packages after build" echo " --runTestsOnly Run tests on a wheel file in default build location (/target/)" echo " --includeExtendedTests Include the extended tests if the tests are run" echo " --buildNativeBridgeOnly Build only the native bridge code" @@ -31,6 +32,7 @@ else __configuration=DbgLinPy3.7 fi __runTests=false +__installPythonPackages=false __runExtendedTests=false __buildNativeBridge=true __buildDotNetBridge=true @@ -48,6 +50,10 @@ while [ "$1" != "" ]; do ;; --runtests) __runTests=true + __installPythonPackages=true + ;; + --installPythonPackages) + __installPythonPackages=true ;; --includeextendedtests) __runExtendedTests=true @@ -56,6 +62,7 @@ while [ "$1" != "" ]; do __buildNativeBridge=false __buildDotNetBridge=false __runTests=true + __installPythonPackages=true ;; --buildnativebridgeonly) __buildDotNetBridge=false @@ -166,7 +173,7 @@ if [ ${__buildDotNetBridge} = true ] then # Install dotnet SDK version, see https://docs.microsoft.com/en-us/dotnet/core/tools/dotnet-install-script echo "Installing dotnet SDK ... " - curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.200 -InstallDir ./cli + curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.701 -InstallDir ./cli # Build managed code echo "Building managed code ... " @@ -199,6 +206,7 @@ then cp "${BuildOutputDir}/${__configuration}"/DotNetBridge.dll "${__currentScriptDir}/src/python/nimbusml/internal/libs/" cp "${BuildOutputDir}/${__configuration}"/pybridge.so "${__currentScriptDir}/src/python/nimbusml/internal/libs/" + # ls "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/ if [ ${PythonVersion} = 2.7 ] then cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/*.dll "${__currentScriptDir}/src/python/nimbusml/internal/libs/" @@ -246,11 +254,11 @@ then echo Python package successfully created: ${__currentScriptDir}/target/${WheelFile} fi -if [ ${__runTests} = true ] -then +if [ ${__installPythonPackages} = true ] +then echo "" echo "#################################" - echo "Running tests ... " + echo "Installing Python packages ... " echo "#################################" Wheel=${__currentScriptDir}/target/nimbusml-${ProductVersion}-${PythonTag}-none-${PlatName}.whl if [ ! 
-f ${Wheel} ] @@ -266,10 +274,20 @@ then elif [ ${PythonVersion} = 3.6 ] && [ "$(uname -s)" = "Darwin" ] then "${PythonExe}" -m pip install --upgrade pytest-remotedata - fi + elif [ ${PythonVersion} = 3.7 ] + then + "${PythonExe}" -m pip install --upgrade azureml-dataprep + fi "${PythonExe}" -m pip install --upgrade "${Wheel}" "${PythonExe}" -m pip install "scikit-learn==0.19.2" +fi +if [ ${__runTests} = true ] +then + echo "" + echo "#################################" + echo "Running tests ... " + echo "#################################" PackagePath=${PythonRoot}/lib/python${PythonVersion}/site-packages/nimbusml TestsPath1=${PackagePath}/tests TestsPath2=${__currentScriptDir}/src/python/tests diff --git a/build/libs_linux.txt b/build/libs_linux.txt index 15c3395e..d53a5a84 100644 --- a/build/libs_linux.txt +++ b/build/libs_linux.txt @@ -1,12 +1,17 @@ Newtonsoft.Json.dll libCpuMathNative.so -libFactorizationMachineNative.so libFastTreeNative.so libLdaNative.so libMklImports.so +libMklProxyNative.so libSymSgdNative.so lib_lightgbm.so libtensorflow.so -libtensorflow_framework.so +libtensorflow_framework.so.1 +libonnxruntime.so System.Drawing.Common.dll +TensorFlow.NET.dll +NumSharp.Core.dll +Microsoft.DataPrep.dll +Microsoft.DPrep.* Microsoft.ML.* diff --git a/build/libs_mac.txt b/build/libs_mac.txt index 2be6a809..de7e27b3 100644 --- a/build/libs_mac.txt +++ b/build/libs_mac.txt @@ -1,12 +1,17 @@ Newtonsoft.Json.dll libCpuMathNative.dylib -libFactorizationMachineNative.dylib libFastTreeNative.dylib libLdaNative.dylib libMklImports.dylib +libMklProxyNative.dylib libSymSgdNative.dylib lib_lightgbm.dylib libtensorflow.dylib -libtensorflow_framework.dylib +libonnxruntime.dylib +libtensorflow_framework.1.dylib System.Drawing.Common.dll +TensorFlow.NET.dll +NumSharp.Core.dll +Microsoft.DataPrep.dll +Microsoft.DPrep.* Microsoft.ML.* diff --git a/build/libs_win.txt b/build/libs_win.txt index dda6dcd6..62c1bab0 100644 --- a/build/libs_win.txt +++ b/build/libs_win.txt @@ -1,13 +1,17 @@ Google.Protobuf.dll Newtonsoft.Json.dll CpuMathNative.dll -FactorizationMachineNative.dll FastTreeNative.dll LdaNative.dll lib_lightgbm.dll libiomp5md.dll MklImports.dll +MklProxyNative.dll SymSgdNative.dll tensorflow.dll +TensorFlow.NET.dll +NumSharp.Core.dll System.Drawing.Common.dll +Microsoft.DataPrep.dll +Microsoft.DPrep.* Microsoft.ML.* diff --git a/build/sign.csproj b/build/sign.csproj index fa055f4d..2b26a8bd 100644 --- a/build/sign.csproj +++ b/build/sign.csproj @@ -1,7 +1,7 @@ - netstandard2.0 + net461 ../x64/ @@ -15,7 +15,7 @@ - Microsoft + Microsoft400 diff --git a/docs/release-notes/release-1.3.0.md b/docs/release-notes/release-1.3.0.md new file mode 100644 index 00000000..b704a08a --- /dev/null +++ b/docs/release-notes/release-1.3.0.md @@ -0,0 +1,101 @@ +# [NimbusML](https://docs.microsoft.com/en-us/nimbusml/overview) 1.3.0 + +## **New Features** + +- **Save/Restore model when pickling Pipeline** + + [PR#189](https://github.com/microsoft/NimbusML/pull/189) Save and restore + the underlying model file when pickling a nimbusml Pipeline. + +- **Feature Contributions** + + [PR#196](https://github.com/microsoft/NimbusML/pull/196) Added support for + observation level feature contributions. Exposes an API + `Pipeline.get_feature_contributions()` that provides scores for how much + each feature influenced a particular prediction, thereby allowing users to + inspect which features were most important in making the prediction. 
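A minimal usage sketch of the new API (the file name and the `'target'` label column are illustrative placeholders, not part of the PR):

```python
from nimbusml import Pipeline, FileDataStream
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor

# 'train.csv' and the 'target' label are hypothetical; any learner listed
# as supported by get_feature_contributions() works here.
data = FileDataStream.read_csv('train.csv')
pipeline = Pipeline([OrdinaryLeastSquaresRegressor(label='target')])
pipeline.fit(data)

# Returns the raw data plus score and per-feature contribution columns,
# keeping the 10 largest positive and 10 largest negative contributions.
contributions = pipeline.get_feature_contributions(data, top=10, bottom=10)
```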
+ +- **Add `classes_` to Pipeline** + + [PR#200](https://github.com/microsoft/NimbusML/pull/200) Add a `classes_` + attribute to a Pipeline and/or predictor instance when calling + `Pipeline.predict_proba()`. + +- **Automatically Convert Input Of Handler, Filter and Indicator** + + [PR#204](https://github.com/microsoft/NimbusML/pull/204) Update Handler, + Filter, and Indicator to automatically convert the input columns to float + before performing the transform. + +- **Combine Models** + + [PR#208](https://github.com/microsoft/NimbusML/pull/208) Add support for + combining models from transforms, predictors and pipelines into one model + (a usage sketch follows the Enhancements section below). + +- **Azureml-Dataprep integration** + + [PR#181](https://github.com/microsoft/NimbusML/pull/181) Added support for + dataflow objects as a datasource for pipeline training/testing. + +- **Linear SVM Binary Classifier** + + [PR#180](https://github.com/microsoft/NimbusML/pull/180) Added + `LinearSvmBinaryClassifier` in `nimbusml.linear_model`. + +- **Ensemble Training** + + [PR#207](https://github.com/microsoft/NimbusML/pull/207) Enabled training of + Ensemble models by adding `nimbusml.ensemble.EnsembleRegressor` and + `nimbusml.ensemble.EnsembleClassifier`. Added components needed + to create ensemble models as new modules in `nimbusml.ensemble`. These + components are passed as arguments to the ensemble trainers. + - Preprocessing components for training multiple models to ensemble in + `nimbusml.ensemble.subset_selector` and `nimbusml.ensemble.feature_selector`. + - Post training components to create the ensemble from the trained models in + `nimbusml.ensemble.sub_model_selector` and `nimbusml.ensemble.output_combiner`. + +## **Bug Fixes** + +- **Fixed memory leak** + + [PR#184](https://github.com/microsoft/NimbusML/pull/184) Fixed a potentially + large memory leak when transforming a pandas dataframe. + +- **Remove Stored References To `X` and `y`** + + [PR#195](https://github.com/microsoft/NimbusML/pull/195) Remove the stored + references to X and y in BasePredictor. + +- **Fixed Explicit `evaltype`** + + The [issue](https://github.com/microsoft/NimbusML/issues/193) where passing + in an explicit `evaltype` to `_predict` in a NimbusML pipeline causes errors + has been fixed with this + [commit](https://github.com/microsoft/NimbusML/commit/1f97c9ef55f5e257f989db5f375cca5c55880258). + +## **Breaking Changes** + +None. + +## **Enhancements** + +None.
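As referenced in the Combine Models note above, a hedged sketch of `Pipeline.combine_models` (the transform, learner, column, and data names are illustrative placeholders):

```python
from nimbusml import Pipeline
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

# train_df / test_df and the 'education' / 'label' columns are hypothetical.
transform = OneHotVectorizer() << 'education'
features = transform.fit_transform(train_df)

predictor = LogisticRegressionBinaryClassifier()
predictor.fit(features, train_df['label'])

# Fuse the fitted transform and predictor into one model-backed Pipeline.
combined = Pipeline.combine_models(transform, predictor)
predictions = combined.predict(test_df)
```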
+ +## **Documentation and Samples** + +[Feature Contributions Example](https://github.com/microsoft/NimbusML/blob/master/src/python/nimbusml/examples/PipelineWithFeatureContributions.py) + +LinearSvmBinaryClassifier Examples: +- [FileDataStream example](https://github.com/microsoft/NimbusML/blob/master/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py) +- [DataFrame example](https://github.com/microsoft/NimbusML/blob/master/src/python/nimbusml/examples/examples_from_dataframe/LinearSvmBinaryClassifier_df.py) + +EnsembleClassifier Examples: +- [FileDataStream example](https://github.com/microsoft/NimbusML/blob/master/src/python/nimbusml/examples/EnsembleClassifier.py) +- [DataFrame example](https://github.com/microsoft/NimbusML/blob/master/src/python/nimbusml/examples/examples_from_dataframe/EnsembleClassifier_iris_df.py) + +EnsembleRegressor Examples: +- [FileDataStream example](https://github.com/microsoft/NimbusML/blob/master/src/python/nimbusml/examples/EnsembleRegressor.py) +- [DataFrame example](https://github.com/microsoft/NimbusML/blob/master/src/python/nimbusml/examples/examples_from_dataframe/EnsembleRegressor_airquality_df.py) + +## **Remarks** + +None. diff --git a/release-next.md b/release-next.md index bed2f26f..68bfa7ef 100644 --- a/release-next.md +++ b/release-next.md @@ -18,7 +18,7 @@ None. ## **Documentation and Samples** -None. +None. ## **Remarks** diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs index 96100247..302a9426 100644 --- a/src/DotNetBridge/Bridge.cs +++ b/src/DotNetBridge/Bridge.cs @@ -314,7 +314,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd env.ComponentCatalog.RegisterAssembly(typeof(CategoricalCatalog).Assembly); // ML.Transforms env.ComponentCatalog.RegisterAssembly(typeof(FastTreeRegressionTrainer).Assembly); // ML.FastTree - //env.ComponentCatalog.RegisterAssembly(typeof(EnsembleModelParameters).Assembly); // ML.Ensemble + env.ComponentCatalog.RegisterAssembly(typeof(EnsembleModelParameters).Assembly); // ML.Ensemble env.ComponentCatalog.RegisterAssembly(typeof(KMeansModelParameters).Assembly); // ML.KMeansClustering env.ComponentCatalog.RegisterAssembly(typeof(PcaModelParameters).Assembly); // ML.PCA env.ComponentCatalog.RegisterAssembly(typeof(CVSplit).Assembly); // ML.EntryPoints diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj index 9e89c206..d2a95a7c 100644 --- a/src/DotNetBridge/DotNetBridge.csproj +++ b/src/DotNetBridge/DotNetBridge.csproj @@ -1,6 +1,6 @@  - netstandard2.0 + netcoreapp2.1 true x64 CORECLR @@ -31,16 +31,19 @@ all runtime; build; native; contentfiles; analyzers - - - - - - - - - - - + + + + + + + + + + + + + + diff --git a/src/DotNetBridge/RunGraph.cs b/src/DotNetBridge/RunGraph.cs index 09617aa6..f79cff9d 100644 --- a/src/DotNetBridge/RunGraph.cs +++ b/src/DotNetBridge/RunGraph.cs @@ -8,6 +8,7 @@ using System.Globalization; using System.IO; using System.Linq; +using Microsoft.DataPrep.Common; using Microsoft.ML; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; @@ -146,7 +147,8 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s var extension = Path.GetExtension(path); if (extension == ".txt") dv = TextLoader.LoadFile(host, new TextLoader.Options(), new MultiFileSource(path)); - + else if(extension == ".dprep") + dv = DataFlow.FromDPrepFile(path).ToDataView(); else dv = new BinaryLoader(host, new BinaryLoader.Arguments(), path); } diff --git a/src/NativeBridge/ManagedInterop.cpp 
b/src/NativeBridge/ManagedInterop.cpp index 074492d8..497d5d0f 100644 --- a/src/NativeBridge/ManagedInterop.cpp +++ b/src/NativeBridge/ManagedInterop.cpp @@ -6,16 +6,22 @@ #include "DataViewInterop.h" #include "ManagedInterop.h" +inline void destroyManagerCObject(PyObject* obj) { + auto* b = static_cast<PythonObjectBase*>(PyCapsule_GetPointer(obj, NULL)); + if (b) { delete b; } +} + #define SetDict2(cpptype, nptype); \ {\ PythonObject<cpptype>* col = dynamic_cast<PythonObject<cpptype>*>(column);\ auto shrd = col->GetData();\ auto* data = shrd->data();\ + bp::handle<> h(::PyCapsule_New((void*)column, NULL, (PyCapsule_Destructor)&destroyManagerCObject));\ dict[_names[i]] = np::from_data(\ data,\ np::dtype::get_builtin<nptype>(),\ bp::make_tuple(shrd->size()),\ - bp::make_tuple(sizeof(nptype)), bp::object());\ + bp::make_tuple(sizeof(nptype)), bp::object(h));\ } #define SetDict1(type) SetDict2(type, type) @@ -25,11 +31,12 @@ PythonObject<float>* col = dynamic_cast<PythonObject<float>*>(column);\ auto shrd = col->GetData();\ auto* data = shrd->data();\ + bp::handle<> h(::PyCapsule_New((void*)column, NULL, (PyCapsule_Destructor)&destroyManagerCObject));\ np::ndarray npdata = np::from_data(\ data,\ np::dtype::get_builtin<float>(),\ bp::make_tuple(shrd->size()),\ - bp::make_tuple(sizeof(float)), bp::object());\ + bp::make_tuple(sizeof(float)), bp::object(h));\ if (keyNames == nullptr)\ {\ dict[_names[i]] = npdata;\ @@ -305,6 +312,7 @@ bp::dict EnvironmentBlock::GetData() list.append(obj); } dict[_names[i]] = list; + delete column; } break; case TS: diff --git a/src/NativeBridge/PythonInterop.h b/src/NativeBridge/PythonInterop.h index 7ed17a99..9654476a 100644 --- a/src/NativeBridge/PythonInterop.h +++ b/src/NativeBridge/PythonInterop.h @@ -62,7 +62,7 @@ template <class T> class PythonObject : public PythonObjectBase { protected: - std::shared_ptr<std::vector<T>> _pData; + std::vector<T>* _pData; size_t _numRows; size_t _numCols; @@ -71,7 +71,7 @@ class PythonObject : public PythonObjectBase PythonObject(const int& kind, size_t numRows = 1, size_t numCols = 1); virtual ~PythonObject(); void SetAt(size_t nRow, size_t nCol, const T& value); - const std::shared_ptr<std::vector<T>>& GetData() const; + const std::vector<T>* GetData() const; }; template <class T> @@ -81,7 +81,7 @@ inline PythonObject<T>::PythonObject(const int& kind, size_t numRows, size_t num _numRows = numRows; _numCols = numCols; - _pData = std::make_shared<std::vector<T>>(); + _pData = new std::vector<T>(); if (_numRows > 0) _pData->reserve(_numRows*_numCols); } @@ -89,6 +89,7 @@ inline PythonObject<T>::PythonObject(const int& kind, size_t numRows, size_t num template <class T> inline PythonObject<T>::~PythonObject() { + delete _pData; } template <class T> @@ -101,7 +102,7 @@ inline void PythonObject<T>::SetAt(size_t nRow, size_t nCol, const T& value) } template <class T> -inline const std::shared_ptr<std::vector<T>>& PythonObject<T>::GetData() const +inline const std::vector<T>* PythonObject<T>::GetData() const { return _pData; } \ No newline at end of file diff --git a/src/NativeBridge/UnixInterface.h b/src/NativeBridge/UnixInterface.h index ab53f428..0a7c1155 100644 --- a/src/NativeBridge/UnixInterface.h +++ b/src/NativeBridge/UnixInterface.h @@ -144,15 +144,20 @@ class UnixMlNetInterface { } - FNGETTER EnsureGetter(const char *nimbuslibspath, const char *coreclrpath) + FNGETTER EnsureGetter(const char *mlnetpath, const char *coreclrpath, const char *dpreppath) { if (_getter != nullptr) return _getter; - std::string libsroot(nimbuslibspath); + std::string libsroot(mlnetpath); std::string coreclrdir(coreclrpath); + if (strlen(dpreppath) == 0) + { + dpreppath = mlnetpath; + } + std::string dprepdir(dpreppath); - ICLRRuntimeHost2* host =
EnsureClrHost(libsroot.c_str(), coreclrdir.c_str()); + ICLRRuntimeHost2* host = EnsureClrHost(libsroot.c_str(), coreclrdir.c_str(), dprepdir.c_str()); if (host == nullptr) return nullptr; @@ -246,7 +251,7 @@ class UnixMlNetInterface closedir(dir); } - ICLRRuntimeHost2* EnsureClrHost(const char * libsRoot, const char * coreclrDirRoot) + ICLRRuntimeHost2* EnsureClrHost(const char * libsRoot, const char * coreclrDirRoot, const char * dprepDirRoot) { if (_host != nullptr) return _host; @@ -284,7 +289,7 @@ class UnixMlNetInterface // TRUSTED_PLATFORM_ASSEMBLIES tpaList.c_str(), // APP_PATHS - libsRoot, + dprepDirRoot, // AppDomainCompatSwitch W("UseLatestBehaviorWhenTFMNotSpecified") }; diff --git a/src/NativeBridge/WinInterface.h b/src/NativeBridge/WinInterface.h index 2fecf434..4f5238db 100644 --- a/src/NativeBridge/WinInterface.h +++ b/src/NativeBridge/WinInterface.h @@ -161,7 +161,7 @@ class WinMlNetInterface FindClose(findHandle); } - ICLRRuntimeHost2* EnsureClrHost(const wchar_t * libsRoot, const wchar_t * coreclrDirRoot) + ICLRRuntimeHost2* EnsureClrHost(const wchar_t * libsRoot, const wchar_t * coreclrDirRoot, const wchar_t * dprepDirRoot) { if (_host != nullptr) return _host; @@ -228,7 +228,7 @@ class WinMlNetInterface // TRUSTED_PLATFORM_ASSEMBLIES tpaList.c_str(), // APP_PATHS - libsRoot, + dprepDirRoot, // AppDomainCompatSwitch W("UseLatestBehaviorWhenTFMNotSpecified") }; @@ -267,26 +267,29 @@ class WinMlNetInterface } public: - FNGETTER EnsureGetter(const char *nimbuslibspath, const char *coreclrpath) + FNGETTER EnsureGetter(const char *mlnetpath, const char *coreclrpath, const char *dpreppath) { if (_getter != nullptr) return _getter; - std::wstring libsdir = Utf8ToUtf16le(nimbuslibspath); + std::wstring libsdir = Utf8ToUtf16le(mlnetpath); ConvertToWinPath(libsdir); - std::wstring coreclrdir; - if (strlen(coreclrpath) != 0) + std::wstring coreclrdir = Utf8ToUtf16le(coreclrpath); + ConvertToWinPath(coreclrdir); + + std::wstring dprepdir; + if (strlen(dpreppath) != 0) { - coreclrdir = Utf8ToUtf16le(coreclrpath); - ConvertToWinPath(coreclrdir); + dprepdir = Utf8ToUtf16le(dpreppath); + ConvertToWinPath(dprepdir); } else { - coreclrdir = libsdir; + dprepdir = libsdir; } - ICLRRuntimeHost2* host = EnsureClrHost(libsdir.c_str(), coreclrdir.c_str()); + ICLRRuntimeHost2* host = EnsureClrHost(libsdir.c_str(), coreclrdir.c_str(), dprepdir.c_str()); if (host == nullptr) return nullptr; diff --git a/src/NativeBridge/dllmain.cpp b/src/NativeBridge/dllmain.cpp index 3f521f87..c656f6d0 100644 --- a/src/NativeBridge/dllmain.cpp +++ b/src/NativeBridge/dllmain.cpp @@ -9,8 +9,9 @@ #define PARAM_SEED "seed" #define PARAM_GRAPH "graph" #define PARAM_VERBOSE "verbose" -#define PARAM_NIMBUSML_PATH "nimbusmlPath" +#define PARAM_MLNET_PATH "mlnetPath" #define PARAM_DOTNETCLR_PATH "dotnetClrPath" +#define PARAM_DPREP_PATH "dprepPath" #define PARAM_DATA "data" @@ -44,14 +45,14 @@ static MlNetInterface *g_mlnetInterface = nullptr; static GENERICEXEC g_exec = nullptr; // Ensure that we have the DotNetBridge managed code entry point. 
-GENERICEXEC EnsureExec(const char *nimbuslibspath, const char *coreclrpath) +GENERICEXEC EnsureExec(const char *mlnetpath, const char *coreclrpath, const char *dpreppath) { if (g_mlnetInterface == nullptr) g_mlnetInterface = new MlNetInterface(); if (g_exec == nullptr) { - FNGETTER getter = g_mlnetInterface->EnsureGetter(nimbuslibspath, coreclrpath); + FNGETTER getter = g_mlnetInterface->EnsureGetter(mlnetpath, coreclrpath, dpreppath); if (getter != nullptr) g_exec = (GENERICEXEC)getter(FnIdGenericExec); } @@ -70,20 +71,23 @@ bp::dict pxCall(bp::dict& params) try { bp::extract graph(params[PARAM_GRAPH]); - bp::extract nimbusmlPath(params[PARAM_NIMBUSML_PATH]); + bp::extract mlnetPath(params[PARAM_MLNET_PATH]); bp::extract dotnetClrPath(params[PARAM_DOTNETCLR_PATH]); + bp::extract dprepPath(params[PARAM_DPREP_PATH]); bp::extract verbose(params[PARAM_VERBOSE]); std::int32_t i_verbose = std::int32_t(verbose); - std::string s_nimbusmlPath = std::string(nimbusmlPath); + std::string s_mlnetPath = std::string(mlnetPath); std::string s_dotnetClrPath = std::string(dotnetClrPath); + std::string s_dprepPath = std::string(dprepPath); std::string s_graph = std::string(graph); - const char *nimbuslibspath = s_nimbusmlPath.c_str(); + const char *mlnetpath = s_mlnetPath.c_str(); const char *coreclrpath = s_dotnetClrPath.c_str(); + const char *dpreppath = s_dprepPath.c_str(); - GENERICEXEC exec = EnsureExec(nimbuslibspath, coreclrpath); + GENERICEXEC exec = EnsureExec(mlnetpath, coreclrpath, dpreppath); if (exec == nullptr) - throw std::invalid_argument("Failed to communicate with the managed library. Path searched: " - + s_nimbusmlPath + " and " + s_dotnetClrPath); + throw std::invalid_argument("Failed to communicate with the managed library. Paths searched: " + + s_mlnetPath + " and " + s_dotnetClrPath); int seed = 42; if (params.has_key(PARAM_SEED)) diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj index 99150a1e..4cd284fc 100644 --- a/src/Platforms/build.csproj +++ b/src/Platforms/build.csproj @@ -3,7 +3,7 @@ dummy Exe - netcoreapp2.0 + netcoreapp2.1 x64 DbgWinPy3.7;DbgWinPy3.6;DbgWinPy3.5;DbgWinPy2.7;RlsWinPy3.7;RlsWinPy3.6;RlsWinPy3.5;RlsWinPy2.7;DbgLinPy3.7;DbgLinPy3.6;DbgLinPy3.5;DbgLinPy2.7;RlsLinPy3.7;RlsLinPy3.6;RlsLinPy3.5;RlsLinPy2.7;RlsMacPy3.7;RlsMacPy3.6 $(ProjectDir)..\..\x64\$(Configuration)\Platform\ @@ -11,16 +11,20 @@ - - - - - - - - - - + + + + + + + + + + + + + + diff --git a/src/python/docs/docstrings/ClassifierBestPerformanceSelector.txt b/src/python/docs/docstrings/ClassifierBestPerformanceSelector.txt new file mode 100644 index 00000000..c7f2449a --- /dev/null +++ b/src/python/docs/docstrings/ClassifierBestPerformanceSelector.txt @@ -0,0 +1,38 @@ + """ + + **Description** + Combines only the models with the best performance. + + + :param metric_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicro"``, ``"AccuracyMacro"``, + ``"LogLoss"``, or ``"LogLossReduction"``. + + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. 
literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ \ No newline at end of file diff --git a/src/python/docs/docstrings/ClassifierWeightedAverage.txt b/src/python/docs/docstrings/ClassifierWeightedAverage.txt new file mode 100644 index 00000000..2c3a77b0 --- /dev/null +++ b/src/python/docs/docstrings/ClassifierWeightedAverage.txt @@ -0,0 +1,61 @@ + """ + + **Description** + Computes the weighted average of the outputs of the trained models. + + + :param weightage_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ \ No newline at end of file diff --git a/src/python/docs/docstrings/EnsembleClassifier.txt b/src/python/docs/docstrings/EnsembleClassifier.txt new file mode 100644 index 00000000..3301f266 --- /dev/null +++ b/src/python/docs/docstrings/EnsembleClassifier.txt @@ -0,0 +1,144 @@ + """ + + **Description** + Train a multiclass ensemble model + + .. remarks:: + An Ensemble is a set of models, each trained on a sample of the + training set. Training an ensemble instead of a single model can boost + the accuracy of a given algorithm. + + The quality of an Ensemble depends on two factors: Accuracy and + Diversity. An Ensemble can be analogous to teamwork. If every team member + is diverse and competent, then the team can perform very well. Here a + team member is a base learner and the team is the Ensemble. In the case + of classification ensembles, the base learner is a + ``LogisticRegressionClassifier``. + + + :param sampling_type: Specifies how the training samples are created: + + * ``BootstrapSelector``: takes a bootstrap sample of the training set + (sampling with replacement). This is the default method. + * ``RandomPartitionSelector``: randomly partitions the training set + into subsets. + * ``AllSelector``: every model is trained using the whole training set. + + Each of these Subset Selectors has two options for selecting features: + * ``AllFeatureSelector``: selects all the features. This is the default + method.
+ * ``RandomFeatureSelector``: selects a random subset of the features + for each model. + + :param num_models: indicates the number of models to train, i.e. the number of + subsets of the training set to sample. The default value is 50. If + batches are used then this indicates the number of models per batch. + + :param sub_model_selector_type: Determines the efficient set of models the + ``output_combiner`` uses, and removes the least significant models. This is + used to improve the accuracy and reduce the model size. This is also called + pruning. + + * ``ClassifierAllSelector``: does not perform any pruning and selects + all models in the ensemble to combine to create the output. This is + the default submodel selector. + * ``ClassifierBestDiverseSelector``: combines models whose predictions + are as diverse as possible. Currently, only disagreement diversity is + supported. + * ``ClassifierBestPerformanceSelector``: combines only the models with + the best performance according to some metric. The metric can be + ``"AccuracyMicro"``, ``"AccuracyMacro"``, ``"LogLoss"``, + or ``"LogLossReduction"``. + + + :param output_combiner: indicates how to combine the predictions of the different + models into a single prediction. There are five available output + combiners for classification: + + * ``ClassifierAverage``: computes the average of the scores produced by + the trained models. + * ``ClassifierMedian``: computes the median of the scores produced by + the trained models. + * ``ClassifierStacking``: computes the output by training a model on a + training set where each instance is a vector containing the outputs + of the different models on a training instance, and the instance's + label. + * ``ClassifierVoting``: computes the fraction of positive predictions + for each class from all the trained models, and outputs the class + with the largest number. + * ``ClassifierWeightedAverage``: computes the weighted average of the + outputs of the trained models, weighted by the specified metric. The + metric can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param batch_size: train the models iteratively on subsets of the training + set of this size. When using this option, it is assumed that the + training set is randomized enough so that every batch is a random + sample of instances. The default value is -1, indicating using the + whole training set. If the value is changed to an integer greater than + 0, the number of trained models is the number of batches (the size of + the training set divided by the batch size), times ``num_models``. + + ..
seealso:: + * Subset selectors: + :py:class:`AllInstanceSelector + `, + :py:class:`BootstrapSelector + `, + :py:class:`RandomPartitionSelector + ` + + * Feature selectors: + :py:class:`AllFeatureSelector + `, + :py:class:`RandomFeatureSelector + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + `, + :py:class:`ClassifierWeightedAverage + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ \ No newline at end of file diff --git a/src/python/docs/docstrings/EnsembleRegressor.txt b/src/python/docs/docstrings/EnsembleRegressor.txt new file mode 100644 index 00000000..e185307c --- /dev/null +++ b/src/python/docs/docstrings/EnsembleRegressor.txt @@ -0,0 +1,134 @@ + """ + + **Description** + Train a regression ensemble model + + .. remarks:: + An Ensemble is a set of models, each trained on a sample of the + training set. Training an ensemble instead of a single model can boost + the accuracy of a given algorithm. + + The quality of an Ensemble depends on two factors: Accuracy and + Diversity. An Ensemble can be analogous to teamwork. If every team member + is diverse and competent, then the team can perform very well. Here a + team member is a base learner and the team is the Ensemble. In the case + of regression ensembles, the base learner is an + ``OnlineGradientDescentRegressor``. + + + :param sampling_type: Specifies how the training samples are created: + + * ``BootstrapSelector``: takes a bootstrap sample of the training set + (sampling with replacement). This is the default method. + * ``RandomPartitionSelector``: randomly partitions the training set + into subsets. + * ``AllSelector``: every model is trained using the whole training set. + + Each of these Subset Selectors has two options for selecting features: + * ``AllFeatureSelector``: selects all the features. This is the default + method. + * ``RandomFeatureSelector``: selects a random subset of the features + for each model. + + :param num_models: indicates the number of models to train, i.e. the number of + subsets of the training set to sample. The default value is 50. If + batches are used then this indicates the number of models per batch. + + :param sub_model_selector_type: Determines the efficient set of models the + ``output_combiner`` uses, and removes the least significant models. This is + used to improve the accuracy and reduce the model size. This is also called + pruning. + + * ``RegressorAllSelector``: does not perform any pruning and selects + all models in the ensemble to combine to create the output. This is + the default submodel selector. + * ``RegressorBestDiverseSelector``: combines models whose predictions + are as diverse as possible. Currently, only disagreement diversity is + supported. + * ``RegressorBestPerformanceSelector``: combines only the models with + the best performance according to the specified metric. The metric + can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or + ``"RSquared"``. + + + :param output_combiner: indicates how to combine the predictions of the different + models into a single prediction.
There are three available output + combiners for regression: + + * ``RegressorAverage``: computes the average of the scores produced by + the trained models. + * ``RegressorMedian``: computes the median of the scores produced by + the trained models. + * ``RegressorStacking``: computes the output by training a model on a + training set where each instance is a vector containing the outputs + of the different models on a training instance, and the instance's + label. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param batch_size: train the models iteratively on subsets of the training + set of this size. When using this option, it is assumed that the + training set is randomized enough so that every batch is a random + sample of instances. The default value is -1, indicating using the + whole training set. If the value is changed to an integer greater than + 0, the number of trained models is the number of batches (the size of + the training set divided by the batch size), times ``num_models``. + + .. seealso:: + * Subset selectors: + :py:class:`AllInstanceSelector + `, + :py:class:`BootstrapSelector + `, + :py:class:`RandomPartitionSelector + ` + + * Feature selectors: + :py:class:`AllFeatureSelector + `, + :py:class:`RandomFeatureSelector + ` + + * Submodel selectors: + :py:class:`RegressorAllSelector + `, + :py:class:`RegressorBestDiverseSelector + `, + :py:class:`RegressorBestPerformanceSelector + ` + + * Output combiners: + :py:class:`RegressorAverage + `, + :py:class:`RegressorMedian + `, + :py:class:`RegressorStacking + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py + :language: python + """ \ No newline at end of file diff --git a/src/python/docs/docstrings/LinearSvmBinaryClassifier.txt b/src/python/docs/docstrings/LinearSvmBinaryClassifier.txt new file mode 100644 index 00000000..df805518 --- /dev/null +++ b/src/python/docs/docstrings/LinearSvmBinaryClassifier.txt @@ -0,0 +1,57 @@ + """ + + Linear Support Vector Machine (SVM) Binary Classifier + + .. remarks:: + Linear SVM implements an algorithm that finds a hyperplane in the + feature space for binary classification, by solving an SVM problem. + For instance, with feature values *f_0, f_1,..., f_{D-1}*, the + prediction is given by determining what side of the hyperplane the + point falls into. That is the same as the sign of the features' + weighted sum, i.e. *\sum_{i = 0}^{D-1} \left(w_i * f_i \right) + b*, + where *w_0, w_1,..., w_{D-1}* are the weights computed by the + algorithm, and *b* is the bias computed by the algorithm.
+ + The algorithm implemented is the PEGASOS method, which alternates + between stochastic gradient descent steps and projection steps, + introduced by Shalev-Shwartz, Singer and Srebro. + + + **Reference** + + `Wikipedia entry for Support Vector Machine + `_ + + `Pegasos: Primal Estimated sub-GrAdient SOlver for SVM + `_ + + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + + .. index:: models, classification, svm + + Example: + .. literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py + :language: python + """ diff --git a/src/python/docs/docstrings/RegressorBestPerformanceSelector.txt b/src/python/docs/docstrings/RegressorBestPerformanceSelector.txt new file mode 100644 index 00000000..83ba0116 --- /dev/null +++ b/src/python/docs/docstrings/RegressorBestPerformanceSelector.txt @@ -0,0 +1,36 @@ + """ + + **Description** + Combines only the models with the best performance. + + + :param metric_name: the metric type to be used to find the weights for + each model. Can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or + ``"RSquared"``. + + + .. seealso:: + :py:class:`EnsembleRegressor + ` + + * Submodel selectors: + :py:class:`RegressorAllSelector + `, + :py:class:`RegressorBestDiverseSelector + ` + + * Output combiners: + :py:class:`RegressorAverage + `, + :py:class:`RegressorMedian + `, + :py:class:`RegressorStacking + ` + + + .. index:: models, ensemble, classification + + Example: + ..
literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ \ No newline at end of file diff --git a/src/python/docs/sphinx/apiguide.rst b/src/python/docs/sphinx/apiguide.rst index 300330af..7f4a964b 100644 --- a/src/python/docs/sphinx/apiguide.rst +++ b/src/python/docs/sphinx/apiguide.rst @@ -50,7 +50,7 @@ Multiclass Classifiers ,, :py:class:`OneVsRestClassifier` ,, Yes ,, Yes ,, -Regressors +Regressors Classifiers """""""""""""""""""""" ,, Trainer ,, diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index fdad1666..a0ac2115 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -12,9 +12,10 @@ {888888a0-9f3d-457c-b088-3a5042f75d52} Standard Python launcher nimbusml - Global|VisualStudio|Py3.7 + Global|VisualStudio|MinePy37 ..\..\dependencies\Python3.7\python.exe False + nimbusml\tests\dprep\test_dprep.py @@ -45,6 +46,36 @@ + + + + + + + + + + + + + + + + + + + + + + + Code + + + + + + + @@ -67,6 +98,7 @@ + @@ -74,6 +106,8 @@ + + @@ -89,6 +123,7 @@ + @@ -124,6 +159,7 @@ + @@ -140,6 +176,7 @@ + @@ -174,6 +211,7 @@ + @@ -191,6 +229,7 @@ + @@ -207,6 +246,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -228,27 +295,17 @@ - - - - - - + - - - - - @@ -292,6 +349,8 @@ + + @@ -416,13 +475,18 @@ + + + + + @@ -498,6 +562,7 @@ + @@ -519,7 +584,6 @@ - @@ -541,6 +605,7 @@ + @@ -585,6 +650,10 @@ + + + + @@ -595,6 +664,8 @@ + + @@ -606,8 +677,8 @@ - + @@ -738,6 +809,7 @@ + @@ -749,6 +821,11 @@ + + + + + @@ -764,6 +841,11 @@ + + + + + @@ -830,6 +912,8 @@ + + @@ -839,6 +923,8 @@ + + @@ -868,6 +954,7 @@ + @@ -904,6 +991,7 @@ + @@ -1139,7 +1227,7 @@ - + \ No newline at end of file diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py index 170d503f..734b0333 100644 --- a/src/python/nimbusml/__init__.py +++ b/src/python/nimbusml/__init__.py @@ -2,7 +2,7 @@ Microsoft Machine Learning for Python """ -__version__ = '1.2.0' +__version__ = '1.3.0' # CoreCLR version of MicrosoftML is built on Windows. # But file permissions are not preserved when it's copied to Linux. 
@@ -20,6 +20,7 @@ from .internal.utils.data_roles import Role from .internal.utils.data_schema import DataSchema from .internal.utils.data_stream import BinaryDataStream +from .internal.utils.data_stream import DprepDataStream from .internal.utils.data_stream import FileDataStream from .internal.utils.utils import run_tests from ._pipeline import Pipeline diff --git a/src/python/nimbusml/_pipeline.py b/src/python/nimbusml/_pipeline.py index 5a15bac4..692e1dea 100644 --- a/src/python/nimbusml/_pipeline.py +++ b/src/python/nimbusml/_pipeline.py @@ -5,6 +5,7 @@ import inspect import itertools import os +import tempfile import time import warnings from collections import OrderedDict, namedtuple, defaultdict @@ -17,6 +18,7 @@ from pandas import DataFrame, Series from scipy.sparse import csr_matrix from sklearn.utils.validation import check_X_y, check_array +from sklearn.utils.multiclass import unique_labels from .internal.core.base_pipeline_item import BasePipelineItem from .internal.entrypoints.data_customtextloader import \ @@ -40,6 +42,8 @@ transforms_datasetscorer from .internal.entrypoints.transforms_featurecombiner import \ transforms_featurecombiner +from .internal.entrypoints.transforms_featurecontributioncalculationtransformer import \ + transforms_featurecontributioncalculationtransformer from .internal.entrypoints.transforms_labelcolumnkeybooleanconverter \ import \ transforms_labelcolumnkeybooleanconverter @@ -1108,6 +1112,8 @@ def fit(self, X, y=None, verbose=1, **params): i, n.__class__.__name__), TrainedWarning) break + self._extract_classes(y) + graph, X, y, weights, start_time, schema, telemetry_info, \ learner_features, _, max_slots = self._fit_graph( X, y, verbose, **params) @@ -1436,31 +1442,63 @@ def _fix_ranking_metrics_schema(self, out_metrics): 'DCG@1', 'DCG@2', 'DCG@3', ] return out_metrics - @trace - def _evaluation(self, evaltype, group_id, **params): + def _evaluation_infer(self, evaltype, label_column, group_id, + **params): all_nodes = [] + if not self.steps: + if evaltype == 'auto': + raise ValueError( + "need to specify 'evaltype' explicitly if model is " + "loaded") + common_eval_args = OrderedDict(data="$scoredVectorData", + overall_metrics="$output_metrics", + score_column="Score", + label_column=label_column) + params.update(common_eval_args) - if evaltype == 'binary': - all_nodes.extend([ - models_binaryclassificationevaluator(**params) - ]) - elif evaltype == 'multiclass': - all_nodes.extend([ - models_classificationevaluator(**params) - ]) - elif evaltype == 'regression': - all_nodes.extend([ - models_regressionevaluator(**params) - ]) - elif evaltype == 'cluster': - all_nodes.extend([ - models_clusterevaluator(**params) - ]) - elif evaltype == 'anomaly': - all_nodes.extend([ - models_anomalydetectionevaluator(**params) - ]) - elif evaltype == 'ranking': + type_ = self._last_node_type() if evaltype == 'auto' else evaltype + + if type_ == 'binary': + all_nodes.extend( + [models_binaryclassificationevaluator(**params)]) + + elif type_ == 'multiclass': + all_nodes.extend( + [models_classificationevaluator(**params)]) + + elif type_ in ['regressor', 'regression']: + all_nodes.extend([models_regressionevaluator(**params)]) + + elif type_ in ['clusterer', 'cluster']: + label_node = transforms_labelcolumnkeybooleanconverter( + data="$scoredVectorData", label_column=label_column, + output_data="$label_data") + clustering_eval_args = OrderedDict( + data="$label_data", + overall_metrics="$output_metrics", + score_column="Score", + label_column=label_column) + 
params.update(clustering_eval_args) + all_nodes.extend([label_node, + models_clusterevaluator(**params) + ]) + + elif type_ == 'anomaly': + label_node = transforms_labelcolumnkeybooleanconverter( + data="$scoredVectorData", label_column=label_column, + output_data="$label_data") + anom_eval_args = OrderedDict( + data="$label_data", + overall_metrics="$output_metrics", + score_column="Score", + label_column=label_column + ) + params.update(anom_eval_args) + all_nodes.extend( + [label_node, + models_anomalydetectionevaluator(**params)]) + + elif type_ == 'ranking': svd = "$scoredVectorData" column = [OrderedDict(Source=group_id, Name=group_id)] algo_args = dict(data=svd, output_data=svd, column=column) @@ -1471,6 +1509,7 @@ def _evaluation(self, evaltype, group_id, **params): key_node, evaluate_node ]) + else: raise ValueError( "%s is not a valid type for evaluation." % @@ -1478,70 +1517,6 @@ return all_nodes - def _evaluation_infer(self, evaltype, label_column, group_id, - **params): - all_nodes = [] - if len(self.steps) == 0: - if evaltype == 'auto': - raise ValueError( - "need to specify 'evaltype' explicitly if model is " - "loaded") - common_eval_args = OrderedDict(data="$scoredVectorData", - overall_metrics="$output_metrics", - score_column="Score", - label_column=label_column) - params.update(common_eval_args) - if evaltype == 'auto': - last_node_type = self._last_node_type() - if last_node_type == 'binary': - all_nodes.extend( - [models_binaryclassificationevaluator(**params)]) - - elif last_node_type == 'multiclass': - all_nodes.extend( - [models_classificationevaluator(**params)]) - - elif last_node_type == 'regressor': - all_nodes.extend([models_regressionevaluator(**params)]) - - elif last_node_type == 'clusterer': - label_node = transforms_labelcolumnkeybooleanconverter( - data="$scoredVectorData", label_column=label_column, - output_data="$label_data") - clustering_eval_args = OrderedDict( - data="$label_data", - overall_metrics="$output_metrics", - score_column="Score", - label_column=label_column) - params.update(clustering_eval_args) - all_nodes.extend([label_node, - models_clusterevaluator(**params) - ]) - - elif last_node_type == 'anomaly': - label_node = transforms_labelcolumnkeybooleanconverter( - data="$scoredVectorData", label_column=label_column, - output_data="$label_data") - anom_eval_args = OrderedDict( - data="$label_data", - overall_metrics="$output_metrics", - score_column="Score", - label_column=label_column - ) - params.update(anom_eval_args) - all_nodes.extend( - [label_node, - models_anomalydetectionevaluator(**params)]) - - else: - raise ValueError( - "evaltype is %s. Last node type is %s" % - evaltype, last_node_type) - else: - return self._evaluation(evaltype, group_id, **params) - - return all_nodes - def _last_node_type(self): last_node = self.last_node @@ -1693,6 +1668,120 @@ def getn(n): "only fit(X) is allowed or the training becomes " "ambiguous.") + @trace + def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, + as_binary_data_stream=False, **params): + """ + Calculates observation level feature contributions. Returns dataframe + with raw data, predictions, and feature contributions for each + prediction. Feature contributions are not supported for transforms, so + make sure that the last step in a pipeline is a model. Feature + contributions are supported for the following models: + + * Regression: + + * OrdinaryLeastSquaresRegressor + * FastLinearRegressor + * OnlineGradientDescentRegressor + * PoissonRegressionRegressor + * GamRegressor + * LightGbmRegressor + * FastTreesRegressor + * FastForestRegressor + * FastTreesTweedieRegressor + + * Binary Classification: + + * AveragedPerceptronBinaryClassifier + * LinearSvmBinaryClassifier + * LogisticRegressionBinaryClassifier + * FastLinearBinaryClassifier + * SgdBinaryClassifier + * SymSgdBinaryClassifier + * GamBinaryClassifier + * FastForestBinaryClassifier + * FastTreesBinaryClassifier + * LightGbmBinaryClassifier + + * Ranking: + + * LightGbmRanker + + :param X: {array-like [n_samples, n_features], + :py:class:`nimbusml.FileDataStream` } + :param top: the number of positive contributions with highest magnitude + to report. + :param bottom: The number of negative contributions with highest + magnitude to report. + :return: dataframe containing the raw data, predicted label, score, + probabilities, and feature contributions. + """ + self.verbose = verbose + + if not self._is_fitted: + raise ValueError( + "Model is not fitted. Train or load a model before test().") + + if len(self.steps) > 0: + last_node = self.last_node + if last_node.type == 'transform': + raise ValueError( + "Pipeline needs a trainer as last step for test()") + + X, y_temp, columns_renamed, feature_columns, label_column, \ + schema, weights, weight_column = self._preprocess_X_y(X) + + all_nodes = [] + inputs = dict([('data', ''), ('predictor_model', self.model)]) + if isinstance(X, FileDataStream): + importtext_node = data_customtextloader( + input_file="$file", + data="$data", + custom_schema=schema.to_string( + add_sep=True)) + all_nodes = [importtext_node] + inputs = dict([('file', ''), ('predictor_model', self.model)]) + + score_node = transforms_datasetscorer( + data="$data", + predictor_model="$predictor_model", + scored_data="$scoredvectordata") + + fcc_node = transforms_featurecontributioncalculationtransformer( + data="$scoredvectordata", + predictor_model="$predictor_model", + output_data="$output_data", + top=top, + bottom=bottom, + normalize=True) + + all_nodes.extend([score_node, fcc_node]) + + outputs = dict(output_data="") + + graph = Graph( + inputs, + outputs, + as_binary_data_stream, + *all_nodes) + + class_name = type(self).__name__ + method_name = inspect.currentframe().f_code.co_name + telemetry_info = ".".join([class_name, method_name]) + + try: + (out_model, out_data, out_metrics) = graph.run( + X=X, + random_state=self.random_state, + model=self.model, + verbose=verbose, + telemetry_info=telemetry_info, + **params) + except RuntimeError as e: + raise e + + return out_data + @trace + def _predict(self, X, y=None, evaltype='auto', group_id=None, @@ -1744,8 +1833,12 @@ def _predict(self, X, y=None, scored_data="$scoredVectorData") all_nodes.extend([score_node]) - if hasattr(self, 'steps') and len(self.steps) > 0 \ - and self.last_node.type == 'classifier': + if (evaltype in ['binary', 'multiclass']) or \ + (hasattr(self, 'steps') + and self.steps is not None + and len(self.steps) > 0 + and self.last_node.type == 'classifier'): + select_node = transforms_scorecolumnselector( data="$scoredVectorData", output_data="$scoreColumnsOnlyData", score_column="Score") @@ -1806,6 +1899,25 @@ def _predict(self, X, y=None, self._write_csv_time = graph._write_csv_time return out_data, out_metrics + def _extract_classes(self, y): + if ((len(self.steps) > 0) and +
(self.last_node.type in ['classifier', 'anomaly']) and + (y is not None) and + (not isinstance(y, (str, tuple)))): + + unique_classes = unique_labels(y) + if len(unique_classes) < 2: + raise ValueError( + "Classifier can't train when only one class is " + "present.") + self._add_classes(unique_classes) + + def _extract_classes_from_headers(self, headers): + if hasattr(self.last_node, 'classes_'): + classes = [x.replace('Score.', '') for x in headers] + classes = np.array(classes).astype(self.last_node.classes_.dtype) + self._add_classes(classes) + def _add_classes(self, classes): # Create classes_ attribute similar to scikit # Add both to pipeline and ending classifier @@ -1857,11 +1969,7 @@ def predict_proba(self, X, verbose=0, **params): # for multiclass, scores are probabilities pcols = [i for i in scores.columns if i.startswith('Score.')] if len(pcols) > 0: - # [todo]: this is a bug, predict_proba should not change - # internal state of pipeline. - # test check_dict_unchanged() detects that, commenting line - # for now - # self._add_classes([x.replace('Score.', '') for x in pcols]) + self._extract_classes_from_headers(pcols) return scores.loc[:, pcols].values raise ValueError( @@ -1902,7 +2010,7 @@ def decision_function(self, X, verbose=0, **params): # for multiclass with n_classes > 2 if len(scols) > 2: - self._add_classes([x.replace('Score.', '') for x in scols]) + self._extract_classes_from_headers(scols) return scores.loc[:, scols].values raise ValueError( @@ -1942,7 +2050,7 @@ def test( otherwise None in the returned tuple. :return: tuple (dataframe of evaluation metrics, dataframe of - scores). Is scores are + scores). If scores are required, set `output_scores`=True, otherwise None is returned by default. """ @@ -2265,6 +2373,38 @@ def load_model(self, src): self.model = src self.steps = [] + def __getstate__(self): + odict = {'export_version': 1} + + if hasattr(self, 'steps'): + odict['steps'] = self.steps + + if (hasattr(self, 'model') and + self.model is not None and + os.path.isfile(self.model)): + + with open(self.model, "rb") as f: + odict['modelbytes'] = f.read() + + return odict + + def __setstate__(self, state): + self.steps = [] + self.model = None + self.random_state = None + + for k, v in state.items(): + if k not in {'modelbytes', 'export_version'}: + setattr(self, k, v) + + if state.get('export_version', 0) == 1: + if 'modelbytes' in state: + (fd, modelfile) = tempfile.mkstemp() + fl = os.fdopen(fd, "wb") + fl.write(state['modelbytes']) + fl.close() + self.model = modelfile + @trace def score( self, @@ -2310,3 +2450,107 @@ def score( else: raise ValueError( "cannot generate score for {0}).".format(task_type)) + + + @classmethod + def combine_models(cls, *items, **params): + """ + Combine the models of multiple pipelines, transforms + and/or predictors in to a single model. The models are + combined in the order they are seen. + + :param items: the fitted pipelines, transforms and/or + predictors which contain the models to join. + + :param contains_predictor: Set to `True` if the + last item contains or is a predictor. Set to + `False` if `items` only contains transforms. + The default is True. + + :return: A new Pipeline which is backed by a model that + is the combination of all the models passed in + through `items`. 
+ """ + if len(items) == 0: + raise RuntimeError( + 'At least one transform, predictor' + 'or pipeline must be specified.') + + for item in items: + if not item._is_fitted: + raise RuntimeError( + 'Item must be fitted before' + 'models can be combined.') + + contains_predictor = params.get('contains_predictor', True) + verbose = params.get('verbose', 0) + + get_model = lambda x: x.model if hasattr(x, 'model') else x.model_ + + if len(items) == 1: + return Pipeline(model=get_model(items[0])) + + start_time = time.time() + + nodes = [] + inputs = {} + transform_models = [] + + for index, item in enumerate(items[:-1], start=1): + var_name = 'transform_model' + str(index) + inputs[var_name] = get_model(item) + transform_models.append("$" + var_name) + + if contains_predictor: + inputs['predictor_model'] = get_model(items[-1]) + + combine_models_node = transforms_manyheterogeneousmodelcombiner( + transform_models=transform_models, + predictor_model='$predictor_model', + model='$output_model') + nodes.append(combine_models_node) + + else: + var_name = 'transform_model' + str(len(items)) + inputs[var_name] = get_model(items[-1]) + transform_models.append("$" + var_name) + + combine_models_node = transforms_modelcombiner( + models=transform_models, + output_model='$output_model') + nodes.append(combine_models_node) + + outputs = dict(output_model="") + + graph = Graph( + inputs, + outputs, + False, + *nodes) + + class_name = cls.__name__ + method_name = inspect.currentframe().f_code.co_name + telemetry_info = ".".join([class_name, method_name]) + + try: + (out_model, _, _) = graph.run( + X=None, + y=None, + random_state=None, + model=None, + verbose=verbose, + is_summary=False, + telemetry_info=telemetry_info, + no_input_data=True, + **params) + except RuntimeError as e: + raise e + + pipeline = Pipeline(model=out_model) + + # stop the clock + pipeline._run_time = time.time() - start_time + pipeline._write_csv_time = graph._write_csv_time + + return pipeline + diff --git a/src/python/nimbusml/base_predictor.py b/src/python/nimbusml/base_predictor.py index bfa2813f..f33f746c 100644 --- a/src/python/nimbusml/base_predictor.py +++ b/src/python/nimbusml/base_predictor.py @@ -11,8 +11,6 @@ import os from sklearn.base import BaseEstimator -from sklearn.utils.multiclass import unique_labels -from sklearn.utils.validation import check_is_fitted from . import Pipeline from .internal.core.base_pipeline_item import BasePipelineItem @@ -40,18 +38,6 @@ def fit(self, X, y=None, **params): :param y: array-like with shape=[n_samples] :return: self """ - if y is not None and not isinstance(y, ( - str, tuple)) and self.type in set( - ['classifier', 'anomaly']): - unique_classes = unique_labels(y) - if len(unique_classes) < 2: - raise ValueError( - "Classifier can't train when only one class is " - "present.") - self.classes_ = unique_classes - self.X_ = X - self.y_ = y - # Clear cached summary since it should not # retain its value after a new call to fit if hasattr(self, 'model_summary_'): @@ -69,13 +55,24 @@ def fit(self, X, y=None, **params): set_shape(self, X) return self + @property + def _is_fitted(self): + """ + Tells if the predictor was trained. + """ + return (hasattr(self, 'model_') and + self.model_ and + os.path.isfile(self.model_)) + @trace def _invoke_inference_method(self, method, X, **params): """ Returns predictions. Can be predicted labels, probabilities or else decision values. """ - check_is_fitted(self, ["X_", "y_"]) + if not self._is_fitted: + raise ValueError("Model is not fitted. 
" + "fit() must be called before {}.".format(method)) # Check that the input is of the same shape as the one passed # during @@ -89,6 +86,10 @@ def _invoke_inference_method(self, method, X, **params): data = getattr(pipeline, method)(X, **params) return data + @trace + def get_feature_contributions(self, X, **params): + return self._invoke_inference_method('get_feature_contributions', X, **params) + @trace def predict(self, X, **params): """ diff --git a/src/python/nimbusml/base_transform.py b/src/python/nimbusml/base_transform.py index 73e54abf..393c3655 100644 --- a/src/python/nimbusml/base_transform.py +++ b/src/python/nimbusml/base_transform.py @@ -8,6 +8,8 @@ __all__ = ["BaseTransform"] +import os + from sklearn.base import BaseEstimator from . import Pipeline @@ -71,6 +73,15 @@ def fit(self, X, y=None, **params): set_shape(self, X) return self + @property + def _is_fitted(self): + """ + Tells if the transform was trained. + """ + return (hasattr(self, 'model_') and + self.model_ and + os.path.isfile(self.model_)) + @trace def transform(self, X, as_binary_data_stream=False, **params): """ diff --git a/src/python/nimbusml/ensemble/__init__.py b/src/python/nimbusml/ensemble/__init__.py index 04c555f8..c37f0b6a 100644 --- a/src/python/nimbusml/ensemble/__init__.py +++ b/src/python/nimbusml/ensemble/__init__.py @@ -1,3 +1,5 @@ +from ._ensembleclassifier import EnsembleClassifier +from ._ensembleregressor import EnsembleRegressor from ._fastforestbinaryclassifier import FastForestBinaryClassifier from ._fastforestregressor import FastForestRegressor from ._fasttreesbinaryclassifier import FastTreesBinaryClassifier @@ -11,6 +13,8 @@ from ._lightgbmregressor import LightGbmRegressor __all__ = [ + 'EnsembleClassifier', + 'EnsembleRegressor', 'FastForestBinaryClassifier', 'FastForestRegressor', 'FastTreesBinaryClassifier', diff --git a/src/python/nimbusml/ensemble/_ensembleclassifier.py b/src/python/nimbusml/ensemble/_ensembleclassifier.py new file mode 100644 index 00000000..371f4703 --- /dev/null +++ b/src/python/nimbusml/ensemble/_ensembleclassifier.py @@ -0,0 +1,247 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +EnsembleClassifier +""" + +__all__ = ["EnsembleClassifier"] + + +from sklearn.base import ClassifierMixin + +from ..base_predictor import BasePredictor +from ..internal.core.ensemble._ensembleclassifier import \ + EnsembleClassifier as core +from ..internal.utils.utils import trace +from .feature_selector import AllFeatureSelector +from .subset_selector import BootstrapSelector + + +class EnsembleClassifier(core, BasePredictor, ClassifierMixin): + """ + + **Description** + Train a multi class ensemble model + + .. remarks:: + An Ensemble is a set of models, each trained on a sample of the + training set. Training an ensemble instead of a single model can boost + the accuracy of a given algorithm. + + The quality of an Ensemble depends on two factors; Accuracy and + Diversity. Ensemble can be analogous to Teamwork. If every team member + is diverse and competent, then the team can perform very well. Here a + team member is a base learner and the team is the Ensemble. In the case + of classification ensembles, the base learner is a + ``LogisticRegressionClassifier``. 
+
+
+    :param feature: see `Columns `_.
+
+    :param label: see `Columns `_.
+
+    :param sampling_type: Specifies how the training samples are created:
+
+        * ``BootstrapSelector``: takes a bootstrap sample of the training set
+          (sampling with replacement). This is the default method.
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these Subset Selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the default
+          method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the
+        number of subsets of the training set to sample. The default value is
+        50. If batches are used then this indicates the number of models per
+        batch.
+
+    :param sub_model_selector_type: Determines the efficient set of models the
+        ``output_combiner`` uses, and removes the least significant models.
+        This is used to improve the accuracy and reduce the model size. This
+        is also called pruning.
+
+        * ``ClassifierAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``ClassifierBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity
+          is supported.
+        * ``ClassifierBestPerformanceSelector``: combines only the models with
+          the best performance according to some metric. The metric can be
+          ``"AccuracyMicro"``, ``"AccuracyMacro"``, ``"LogLoss"``,
+          or ``"LogLossReduction"``.
+
+
+    :output_combiner: indicates how to combine the predictions of the
+        different models into a single prediction. There are five available
+        output combiners for classification:
+
+        * ``ClassifierAverage``: computes the average of the scores produced
+          by the trained models.
+        * ``ClassifierMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``ClassifierStacking``: computes the output by training a model on a
+          training set where each instance is a vector containing the outputs
+          of the different models on a training instance, and the instance's
+          label.
+        * ``ClassifierVoting``: computes the fraction of positive predictions
+          for each class from all the trained models, and outputs the class
+          with the largest number.
+        * ``ClassifierWeightedAverage``: computes the weighted average of the
+          outputs of the trained models, weighted by the specified metric. The
+          metric can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``.
+
+    :param output_combiner: Output combiner.
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning
+          message is displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature scaling ensures the distances between data points are
+        proportional and enables various optimization methods such as
+        gradient descent to converge much faster. If normalization is
+        performed, a ``MinMax`` normalizer is used. It normalizes values in
+        an interval [a, b] where ``-1 <= a <= 0``
+        and ``0 <= b <= 1`` and ``b - a = 1``.
This normalizer preserves sparsity by mapping zero to zero.
+
+    :param caching: Whether the trainer should cache input training data.
+
+    :param train_parallel: All the base learners will run asynchronously if
+        the value is true.
+
+    :param batch_size: Train the models iteratively on subsets of the training
+        set of this size. When using this option, it is assumed that the
+        training set is randomized enough so that every batch is a random
+        sample of instances. The default value is -1, indicating using the
+        whole training set. If the value is changed to an integer greater than
+        0, the number of trained models is the number of batches (the size of
+        the training set divided by the batch size), times ``num_models``.
+
+    :param show_metrics: True, if metrics for each model need to be evaluated
+        and shown in a comparison table. This is done by using the validation
+        set if available, or else the training set.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. seealso::
+        * Subset selectors:
+        :py:class:`AllInstanceSelector
+        `,
+        :py:class:`BootstrapSelector
+        `,
+        :py:class:`RandomPartitionSelector
+        `
+
+        * Feature selectors:
+        :py:class:`AllFeatureSelector
+        `,
+        :py:class:`RandomFeatureSelector
+        `
+
+        * Submodel selectors:
+        :py:class:`ClassifierAllSelector
+        `,
+        :py:class:`ClassifierBestDiverseSelector
+        `,
+        :py:class:`ClassifierBestPerformanceSelector
+        `
+
+        * Output combiners:
+        :py:class:`ClassifierAverage
+        `,
+        :py:class:`ClassifierMedian
+        `,
+        :py:class:`ClassifierStacking
+        `,
+        :py:class:`ClassifierVoting
+        `,
+        :py:class:`ClassifierWeightedAverage
+        `
+
+
+    .. index:: models, ensemble, classification
+
+    Example:
+        .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py
+              :language: python
+    """
+
+    @trace
+    def __init__(
+            self,
+            sampling_type=BootstrapSelector(
+                feature_selector=AllFeatureSelector()),
+            num_models=None,
+            sub_model_selector_type=None,
+            output_combiner=None,
+            normalize='Auto',
+            caching='Auto',
+            train_parallel=False,
+            batch_size=-1,
+            show_metrics=False,
+            feature=None,
+            label=None,
+            **params):
+
+        if 'feature_column_name' in params:
+            raise NameError(
+                "'feature_column_name' must be renamed to 'feature'")
+        if feature:
+            params['feature_column_name'] = feature
+        if 'label_column_name' in params:
+            raise NameError(
+                "'label_column_name' must be renamed to 'label'")
+        if label:
+            params['label_column_name'] = label
+        BasePredictor.__init__(self, type='classifier', **params)
+        core.__init__(
+            self,
+            sampling_type=sampling_type,
+            num_models=num_models,
+            sub_model_selector_type=sub_model_selector_type,
+            output_combiner=output_combiner,
+            normalize=normalize,
+            caching=caching,
+            train_parallel=train_parallel,
+            batch_size=batch_size,
+            show_metrics=show_metrics,
+            **params)
+        self.feature = feature
+        self.label = label
+
+    @trace
+    def predict_proba(self, X, **params):
+        '''
+        Returns probabilities
+        '''
+        return self._predict_proba(X, **params)
+
+    @trace
+    def decision_function(self, X, **params):
+        '''
+        Returns score values
+        '''
+        return self._decision_function(X, **params)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/_ensembleregressor.py b/src/python/nimbusml/ensemble/_ensembleregressor.py new file mode 100644 index 00000000..a253829f --- /dev/null +++ b/src/python/nimbusml/ensemble/_ensembleregressor.py @@ -0,0 +1,223 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +EnsembleRegressor +""" + +__all__ = ["EnsembleRegressor"] + + +from sklearn.base import RegressorMixin + +from ..base_predictor import BasePredictor +from ..internal.core.ensemble._ensembleregressor import \ + EnsembleRegressor as core +from ..internal.utils.utils import trace +from .feature_selector import AllFeatureSelector +from .subset_selector import BootstrapSelector + + +class EnsembleRegressor(core, BasePredictor, RegressorMixin): + """ + + **Description** + Train a regression ensemble model + + .. remarks:: + An Ensemble is a set of models, each trained on a sample of the + training set. Training an ensemble instead of a single model can boost + the accuracy of a given algorithm. + + The quality of an Ensemble depends on two factors; Accuracy and + Diversity. Ensemble can be analogous to Teamwork. If every team member + is diverse and competent, then the team can perform very well. Here a + team member is a base learner and the team is the Ensemble. In the case + of regression ensembles, the base learner is an + ``OnlineGradientDescentRegressor``. + + + :param feature: see `Columns `_. + + :param label: see `Columns `_. + + :param sampling_type: Specifies how the training samples are created: + + * ``BootstrapSelector``: takes a bootstrap sample of the training set + (sampling with replacement). This is the default method. + * ``RandomPartitionSelector``: randomly partitions the training set + into subsets. + * ``AllSelector``: every model is trained using the whole training set. + + Each of these Subset Selectors has two options for selecting features: + * ``AllFeatureSelector``: selects all the features. This is the default + method. + * ``RandomFeatureSelector``: selects a random subset of the features + for each model. + + :param num_models: indicates the number models to train, i.e. the number of + subsets of the training set to sample. The default value is 50. If + batches are used then this indicates the number of models per batch. + + :param sub_model_selector_type: Determines the efficient set of models the + ``output_combiner`` uses, and removes the least significant models. This is + used to improve the accuracy and reduce the model size. This is also called + pruning. + + * ``RegressorAllSelector``: does not perform any pruning and selects + all models in the ensemble to combine to create the output. This is + the default submodel selector. + * ``RegressorBestDiverseSelector``: combines models whose predictions + are as diverse as possible. Currently, only diagreement diversity is + supported. + * ``RegressorBestPerformanceSelector``: combines only the models with + the best performance according to the specified metric. The metric + can be ``"L1"``, ``"L2"``, ``"Rms"``, or ``"Loss"``, or + ``"RSquared"``. + + + :output_combiner: indicates how to combine the predictions of the different + models into a single prediction. 
There are three available output
+        combiners for regression:
+
+        * ``RegressorAverage``: computes the average of the scores produced
+          by the trained models.
+        * ``RegressorMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``RegressorStacking``: computes the output by training a model on a
+          training set where each instance is a vector containing the outputs
+          of the different models on a training instance, and the instance's
+          label.
+
+    :param output_combiner: Output combiner.
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning
+          message is displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature scaling ensures the distances between data points are
+        proportional and enables various optimization methods such as
+        gradient descent to converge much faster. If normalization is
+        performed, a ``MinMax`` normalizer is used. It normalizes values in
+        an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
+        ``b - a = 1``. This normalizer preserves sparsity by mapping zero
+        to zero.
+
+    :param caching: Whether the trainer should cache input training data.
+
+    :param train_parallel: All the base learners will run asynchronously if
+        the value is true.
+
+    :param batch_size: Train the models iteratively on subsets of the training
+        set of this size. When using this option, it is assumed that the
+        training set is randomized enough so that every batch is a random
+        sample of instances. The default value is -1, indicating using the
+        whole training set. If the value is changed to an integer greater than
+        0, the number of trained models is the number of batches (the size of
+        the training set divided by the batch size), times ``num_models``.
+
+    :param show_metrics: True, if metrics for each model need to be evaluated
+        and shown in a comparison table. This is done by using the validation
+        set if available, or else the training set.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. seealso::
+        * Subset selectors:
+        :py:class:`AllInstanceSelector
+        `,
+        :py:class:`BootstrapSelector
+        `,
+        :py:class:`RandomPartitionSelector
+        `
+
+        * Feature selectors:
+        :py:class:`AllFeatureSelector
+        `,
+        :py:class:`RandomFeatureSelector
+        `
+
+        * Submodel selectors:
+        :py:class:`RegressorAllSelector
+        `,
+        :py:class:`RegressorBestDiverseSelector
+        `,
+        :py:class:`RegressorBestPerformanceSelector
+        `
+
+        * Output combiners:
+        :py:class:`RegressorAverage
+        `,
+        :py:class:`RegressorMedian
+        `,
+        :py:class:`RegressorStacking
+        `
+
+
+    .. index:: models, ensemble, regression
+
+    Example:
+        ..
literalinclude:: /../nimbusml/examples/EnsembleRegressor.py + :language: python + """ + + @trace + def __init__( + self, + sampling_type=BootstrapSelector( + feature_selector=AllFeatureSelector()), + num_models=None, + sub_model_selector_type=None, + output_combiner=None, + normalize='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + feature=None, + label=None, + **params): + + if 'feature_column_name' in params: + raise NameError( + "'feature_column_name' must be renamed to 'feature'") + if feature: + params['feature_column_name'] = feature + if 'label_column_name' in params: + raise NameError( + "'label_column_name' must be renamed to 'label'") + if label: + params['label_column_name'] = label + BasePredictor.__init__(self, type='regressor', **params) + core.__init__( + self, + sampling_type=sampling_type, + num_models=num_models, + sub_model_selector_type=sub_model_selector_type, + output_combiner=output_combiner, + normalize=normalize, + caching=caching, + train_parallel=train_parallel, + batch_size=batch_size, + show_metrics=show_metrics, + **params) + self.feature = feature + self.label = label + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/_lightgbmbinaryclassifier.py b/src/python/nimbusml/ensemble/_lightgbmbinaryclassifier.py index e1f2ffdc..d4574094 100644 --- a/src/python/nimbusml/ensemble/_lightgbmbinaryclassifier.py +++ b/src/python/nimbusml/ensemble/_lightgbmbinaryclassifier.py @@ -156,7 +156,7 @@ def __init__( unbalanced_sets=False, weight_of_positive_examples=1.0, sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='Logloss', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/ensemble/_lightgbmclassifier.py b/src/python/nimbusml/ensemble/_lightgbmclassifier.py index e8f1ed8c..4fafbb41 100644 --- a/src/python/nimbusml/ensemble/_lightgbmclassifier.py +++ b/src/python/nimbusml/ensemble/_lightgbmclassifier.py @@ -151,7 +151,7 @@ def __init__( unbalanced_sets=False, use_softmax=None, sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='Error', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/ensemble/_lightgbmranker.py b/src/python/nimbusml/ensemble/_lightgbmranker.py index f040e89f..f9098ea7 100644 --- a/src/python/nimbusml/ensemble/_lightgbmranker.py +++ b/src/python/nimbusml/ensemble/_lightgbmranker.py @@ -150,7 +150,7 @@ def __init__( caching='Auto', custom_gains=[0, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095], sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='NormalizedDiscountedCumulativeGain', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/ensemble/_lightgbmregressor.py b/src/python/nimbusml/ensemble/_lightgbmregressor.py index 780532d0..0da14bac 100644 --- a/src/python/nimbusml/ensemble/_lightgbmregressor.py +++ b/src/python/nimbusml/ensemble/_lightgbmregressor.py @@ -141,7 +141,7 @@ def __init__( booster=None, normalize='Auto', caching='Auto', - evaluation_metric='Default', + evaluation_metric='RootMeanSquaredError', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/ensemble/feature_selector/__init__.py b/src/python/nimbusml/ensemble/feature_selector/__init__.py new file mode 100644 index 00000000..b01d1829 --- /dev/null +++ b/src/python/nimbusml/ensemble/feature_selector/__init__.py @@ 
-0,0 +1,7 @@ +from ._allfeatureselector import AllFeatureSelector +from ._randomfeatureselector import RandomFeatureSelector + +__all__ = [ + 'AllFeatureSelector', + 'RandomFeatureSelector' +] diff --git a/src/python/nimbusml/ensemble/feature_selector/_allfeatureselector.py b/src/python/nimbusml/ensemble/feature_selector/_allfeatureselector.py new file mode 100644 index 00000000..0aefa8f8 --- /dev/null +++ b/src/python/nimbusml/ensemble/feature_selector/_allfeatureselector.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +AllFeatureSelector +""" + +__all__ = ["AllFeatureSelector"] + + +from ...internal.core.ensemble.feature_selector._allfeatureselector import \ + AllFeatureSelector as core +from ...internal.utils.utils import trace + + +class AllFeatureSelector(core): + """ + **Description** + Selects all features for each trainer in the ensemble + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/feature_selector/_randomfeatureselector.py b/src/python/nimbusml/ensemble/feature_selector/_randomfeatureselector.py new file mode 100644 index 00000000..60178864 --- /dev/null +++ b/src/python/nimbusml/ensemble/feature_selector/_randomfeatureselector.py @@ -0,0 +1,44 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomFeatureSelector +""" + +__all__ = ["RandomFeatureSelector"] + + +from ...internal.core.ensemble.feature_selector._randomfeatureselector import \ + RandomFeatureSelector as core +from ...internal.utils.utils import trace + + +class RandomFeatureSelector(core): + """ + **Description** + Selects a random subset of features for each trainer in the ensemble + + :param features_selection_proportion: The proportion of features to be + selected. The range is 0.0-1.0. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + features_selection_proportion=0.8, + **params): + core.__init__( + self, + features_selection_proportion=features_selection_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/__init__.py b/src/python/nimbusml/ensemble/output_combiner/__init__.py new file mode 100644 index 00000000..fdc08b5d --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/__init__.py @@ -0,0 +1,19 @@ +from ._classifieraverage import ClassifierAverage +from ._classifiermedian import ClassifierMedian +from ._classifierstacking import ClassifierStacking +from ._classifiervoting import ClassifierVoting +from ._classifierweightedaverage import ClassifierWeightedAverage +from ._regressoraverage import RegressorAverage +from ._regressormedian import RegressorMedian +from ._regressorstacking import RegressorStacking + +__all__ = [ + 'ClassifierAverage', + 'ClassifierMedian', + 'ClassifierStacking', + 'ClassifierVoting', + 'ClassifierWeightedAverage', + 'ClassifierAverage', + 'ClassifierMedian', + 'ClassifierStacking' +] diff --git a/src/python/nimbusml/ensemble/output_combiner/_classifieraverage.py b/src/python/nimbusml/ensemble/output_combiner/_classifieraverage.py new file mode 100644 index 00000000..68721b5e --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/_classifieraverage.py @@ -0,0 +1,44 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierAverage +""" + +__all__ = ["ClassifierAverage"] + + +from ...internal.core.ensemble.output_combiner._classifieraverage import \ + ClassifierAverage as core +from ...internal.utils.utils import trace + + +class ClassifierAverage(core): + """ + **Description** + Computes the average of the outputs of the trained models + + :param normalize: Whether to normalize the output of base models before + combining them. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + normalize=True, + **params): + core.__init__( + self, + normalize=normalize, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/_classifiermedian.py b/src/python/nimbusml/ensemble/output_combiner/_classifiermedian.py new file mode 100644 index 00000000..35fcb4fa --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/_classifiermedian.py @@ -0,0 +1,44 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierMedian +""" + +__all__ = ["ClassifierMedian"] + + +from ...internal.core.ensemble.output_combiner._classifiermedian import \ + ClassifierMedian as core +from ...internal.utils.utils import trace + + +class ClassifierMedian(core): + """ + **Description** + Computes the median of the outputs of the trained models + + :param normalize: Whether to normalize the output of base models before + combining them. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + normalize=True, + **params): + core.__init__( + self, + normalize=normalize, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/_classifierstacking.py b/src/python/nimbusml/ensemble/output_combiner/_classifierstacking.py new file mode 100644 index 00000000..2783a7de --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/_classifierstacking.py @@ -0,0 +1,45 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierStacking +""" + +__all__ = ["ClassifierStacking"] + + +from ...internal.core.ensemble.output_combiner._classifierstacking import \ + ClassifierStacking as core +from ...internal.utils.utils import trace + + +class ClassifierStacking(core): + """ + **Description** + Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + validation_dataset_proportion=0.3, + **params): + core.__init__( + self, + validation_dataset_proportion=validation_dataset_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/_classifiervoting.py b/src/python/nimbusml/ensemble/output_combiner/_classifiervoting.py new file mode 100644 index 00000000..ae7f5ae0 --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/_classifiervoting.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierVoting +""" + +__all__ = ["ClassifierVoting"] + + +from ...internal.core.ensemble.output_combiner._classifiervoting import \ + ClassifierVoting as core +from ...internal.utils.utils import trace + + +class ClassifierVoting(core): + """ + **Description** + Computes the fraction of positive predictions for each class from all the trained models, and outputs the class with the largest number + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/_classifierweightedaverage.py b/src/python/nimbusml/ensemble/output_combiner/_classifierweightedaverage.py new file mode 100644 index 00000000..01c8bd55 --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/_classifierweightedaverage.py @@ -0,0 +1,98 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierWeightedAverage +""" + +__all__ = ["ClassifierWeightedAverage"] + + +from ...internal.core.ensemble.output_combiner._classifierweightedaverage import \ + ClassifierWeightedAverage as core +from ...internal.utils.utils import trace + + +class ClassifierWeightedAverage(core): + """ + + **Description** + Computes the weighted average of the outputs of the trained models + + + :param weightage_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + weightage_name='AccuracyMicroAvg', + normalize=True, + **params): + core.__init__( + self, + weightage_name=weightage_name, + normalize=normalize, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/_regressoraverage.py b/src/python/nimbusml/ensemble/output_combiner/_regressoraverage.py new file mode 100644 index 00000000..e7fa15cb --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/_regressoraverage.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorAverage +""" + +__all__ = ["RegressorAverage"] + + +from ...internal.core.ensemble.output_combiner._regressoraverage import \ + RegressorAverage as core +from ...internal.utils.utils import trace + + +class RegressorAverage(core): + """ + **Description** + Computes the average of the outputs of the trained models + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/_regressormedian.py b/src/python/nimbusml/ensemble/output_combiner/_regressormedian.py new file mode 100644 index 00000000..4cab3ae4 --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/_regressormedian.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorMedian +""" + +__all__ = ["RegressorMedian"] + + +from ...internal.core.ensemble.output_combiner._regressormedian import \ + RegressorMedian as core +from ...internal.utils.utils import trace + + +class RegressorMedian(core): + """ + **Description** + Computes the median of the outputs of the trained models + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/_regressorstacking.py b/src/python/nimbusml/ensemble/output_combiner/_regressorstacking.py new file mode 100644 index 00000000..20af86dc --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/_regressorstacking.py @@ -0,0 +1,45 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorStacking +""" + +__all__ = ["RegressorStacking"] + + +from ...internal.core.ensemble.output_combiner._regressorstacking import \ + RegressorStacking as core +from ...internal.utils.utils import trace + + +class RegressorStacking(core): + """ + **Description** + Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. 
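+
+    A usage sketch (hypothetical values; the combiner is passed to an
+    ensemble through ``output_combiner``):
+
+    .. code-block:: python
+
+        from nimbusml.ensemble import EnsembleRegressor
+        from nimbusml.ensemble.output_combiner import RegressorStacking
+
+        model = EnsembleRegressor(num_models=3,
+                                  output_combiner=RegressorStacking())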
+ + """ + + @trace + def __init__( + self, + validation_dataset_proportion=0.3, + **params): + core.__init__( + self, + validation_dataset_proportion=validation_dataset_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/__init__.py b/src/python/nimbusml/ensemble/sub_model_selector/__init__.py new file mode 100644 index 00000000..2ab42128 --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/__init__.py @@ -0,0 +1,15 @@ +from ._classifierallselector import ClassifierAllSelector +from ._classifierbestdiverseselector import ClassifierBestDiverseSelector +from ._classifierbestperformanceselector import ClassifierBestPerformanceSelector +from ._regressorallselector import RegressorAllSelector +from ._regressorbestdiverseselector import RegressorBestDiverseSelector +from ._regressorbestperformanceselector import RegressorBestPerformanceSelector + +__all__ = [ + 'ClassifierAllSelector', + 'ClassifierBestDiverseSelector', + 'ClassifierBestPerformanceSelector', + 'RegressorAllSelector', + 'RegressorBestDiverseSelector', + 'RegressorBestPerformanceSelector' +] \ No newline at end of file diff --git a/src/python/nimbusml/ensemble/sub_model_selector/_classifierallselector.py b/src/python/nimbusml/ensemble/sub_model_selector/_classifierallselector.py new file mode 100644 index 00000000..a80d823b --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/_classifierallselector.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierAllSelector +""" + +__all__ = ["ClassifierAllSelector"] + + +from ...internal.core.ensemble.sub_model_selector._classifierallselector import \ + ClassifierAllSelector as core +from ...internal.utils.utils import trace + + +class ClassifierAllSelector(core): + """ + **Description** + Combines all the models to create the output. This is the default submodel selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/_classifierbestdiverseselector.py b/src/python/nimbusml/ensemble/sub_model_selector/_classifierbestdiverseselector.py new file mode 100644 index 00000000..c3409a0c --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/_classifierbestdiverseselector.py @@ -0,0 +1,56 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierBestDiverseSelector +""" + +__all__ = ["ClassifierBestDiverseSelector"] + + +from ...internal.core.ensemble.sub_model_selector._classifierbestdiverseselector import \ + ClassifierBestDiverseSelector as core +from ...internal.utils.utils import trace +from .diversity_measure import ClassifierDisagreement + + +class ClassifierBestDiverseSelector(core): + """ + **Description** + Combines the models whose predictions are as diverse as possible. + + :param diversity_metric_type: The metric type to be used to find the + diversity among base learners. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + diversity_metric_type=ClassifierDisagreement(), + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + core.__init__( + self, + diversity_metric_type=diversity_metric_type, + learners_selection_proportion=learners_selection_proportion, + validation_dataset_proportion=validation_dataset_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/_classifierbestperformanceselector.py b/src/python/nimbusml/ensemble/sub_model_selector/_classifierbestperformanceselector.py new file mode 100644 index 00000000..8c2d5d94 --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/_classifierbestperformanceselector.py @@ -0,0 +1,84 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierBestPerformanceSelector +""" + +__all__ = ["ClassifierBestPerformanceSelector"] + + +from ...internal.core.ensemble.sub_model_selector._classifierbestperformanceselector import \ + ClassifierBestPerformanceSelector as core +from ...internal.utils.utils import trace + + +class ClassifierBestPerformanceSelector(core): + """ + + **Description** + Combines only the models with the best performance. + + + :param metric_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicro"``, ``"AccuracyMacro"``, + ``"LogLoss"``, or ``"LogLossReduction"``. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + .. 
seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + metric_name='AccuracyMicro', + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + core.__init__( + self, + metric_name=metric_name, + learners_selection_proportion=learners_selection_proportion, + validation_dataset_proportion=validation_dataset_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/_regressorallselector.py b/src/python/nimbusml/ensemble/sub_model_selector/_regressorallselector.py new file mode 100644 index 00000000..9df81f9c --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/_regressorallselector.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorAllSelector +""" + +__all__ = ["RegressorAllSelector"] + + +from ...internal.core.ensemble.sub_model_selector._regressorallselector import \ + RegressorAllSelector as core +from ...internal.utils.utils import trace + + +class RegressorAllSelector(core): + """ + **Description** + Combines all the models to create the output. This is the default submodel selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/_regressorbestdiverseselector.py b/src/python/nimbusml/ensemble/sub_model_selector/_regressorbestdiverseselector.py new file mode 100644 index 00000000..b6e5b0aa --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/_regressorbestdiverseselector.py @@ -0,0 +1,56 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorBestDiverseSelector +""" + +__all__ = ["RegressorBestDiverseSelector"] + + +from ...internal.core.ensemble.sub_model_selector._regressorbestdiverseselector import \ + RegressorBestDiverseSelector as core +from ...internal.utils.utils import trace +from .diversity_measure import RegressorDisagreement + + +class RegressorBestDiverseSelector(core): + """ + **Description** + Combines the models whose predictions are as diverse as possible. + + :param diversity_metric_type: The metric type to be used to find the + diversity among base learners. 
+
+    :param learners_selection_proportion: The proportion of best base learners
+        to be selected. The range is 0.0-1.0.
+
+    :param validation_dataset_proportion: The proportion of instances to be
+        selected to test the individual base learner. If it is 0, it uses
+        training set.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            diversity_metric_type=RegressorDisagreement(),
+            learners_selection_proportion=0.5,
+            validation_dataset_proportion=0.3,
+            **params):
+        core.__init__(
+            self,
+            diversity_metric_type=diversity_metric_type,
+            learners_selection_proportion=learners_selection_proportion,
+            validation_dataset_proportion=validation_dataset_proportion,
+            **params)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+        """
+        return core.get_params(self)
diff --git a/src/python/nimbusml/ensemble/sub_model_selector/_regressorbestperformanceselector.py b/src/python/nimbusml/ensemble/sub_model_selector/_regressorbestperformanceselector.py
new file mode 100644
index 00000000..1191f3fc
--- /dev/null
+++ b/src/python/nimbusml/ensemble/sub_model_selector/_regressorbestperformanceselector.py
@@ -0,0 +1,82 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RegressorBestPerformanceSelector
+"""
+
+__all__ = ["RegressorBestPerformanceSelector"]
+
+
+from ...internal.core.ensemble.sub_model_selector._regressorbestperformanceselector import \
+    RegressorBestPerformanceSelector as core
+from ...internal.utils.utils import trace
+
+
+class RegressorBestPerformanceSelector(core):
+    """
+
+    **Description**
+        Combines only the models with the best performance.
+
+
+    :param metric_name: the metric type to be used to find the weights for
+        each model. Can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or
+        ``"RSquared"``.
+
+    :param learners_selection_proportion: The proportion of best base learners
+        to be selected. The range is 0.0-1.0.
+
+    :param validation_dataset_proportion: The proportion of instances to be
+        selected to test the individual base learner. If it is 0, it uses
+        training set.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. seealso::
+        :py:class:`EnsembleRegressor
+        `
+
+        * Submodel selectors:
+        :py:class:`RegressorAllSelector
+        `,
+        :py:class:`RegressorBestDiverseSelector
+        `
+
+        * Output combiners:
+        :py:class:`RegressorAverage
+        `,
+        :py:class:`RegressorMedian
+        `,
+        :py:class:`RegressorStacking
+        `
+
+
+    .. index:: models, ensemble, regression
+
+    Example:
+        .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py
+              :language: python
+    """
+
+    @trace
+    def __init__(
+            self,
+            metric_name='L1',
+            learners_selection_proportion=0.5,
+            validation_dataset_proportion=0.3,
+            **params):
+        core.__init__(
+            self,
+            metric_name=metric_name,
+            learners_selection_proportion=learners_selection_proportion,
+            validation_dataset_proportion=validation_dataset_proportion,
+            **params)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/__init__.py b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/__init__.py new file mode 100644 index 00000000..90fd8cb0 --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/__init__.py @@ -0,0 +1,7 @@ +from ._classifierdisagreement import ClassifierDisagreement +from ._regressordisagreement import RegressorDisagreement + +__all__ = [ + 'ClassifierDisagreement', + 'RegressorDisagreement' +] diff --git a/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/_classifierdisagreement.py b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/_classifierdisagreement.py new file mode 100644 index 00000000..ed28329c --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/_classifierdisagreement.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierDisagreement +""" + +__all__ = ["ClassifierDisagreement"] + + +from ....internal.core.ensemble.sub_model_selector.diversity_measure._classifierdisagreement import \ + ClassifierDisagreement as core +from ....internal.utils.utils import trace + + +class ClassifierDisagreement(core): + """ + **Description** + A measure of disagreement in predictions between a pair of classifiers, averaged over all pairs + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/_regressordisagreement.py b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/_regressordisagreement.py new file mode 100644 index 00000000..91661ac1 --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/_regressordisagreement.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorDisagreement +""" + +__all__ = ["RegressorDisagreement"] + + +from ....internal.core.ensemble.sub_model_selector.diversity_measure._regressordisagreement import \ + RegressorDisagreement as core +from ....internal.utils.utils import trace + + +class RegressorDisagreement(core): + """ + **Description** + A measure of absolute value of disagreement in predictions between a pair of regressors, averaged over all pairs + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/subset_selector/__init__.py b/src/python/nimbusml/ensemble/subset_selector/__init__.py new file mode 100644 index 00000000..32539e74 --- /dev/null +++ b/src/python/nimbusml/ensemble/subset_selector/__init__.py @@ -0,0 +1,9 @@ +from ._allinstanceselector import AllInstanceSelector +from ._bootstrapselector import BootstrapSelector +from ._randompartitionselector import RandomPartitionSelector + +__all__ = [ + 'AllInstanceSelector', + 'BootstrapSelector', + 'RandomPartitionSelector' +] diff --git a/src/python/nimbusml/ensemble/subset_selector/_allinstanceselector.py b/src/python/nimbusml/ensemble/subset_selector/_allinstanceselector.py new file mode 100644 index 00000000..d3d7dbcb --- /dev/null +++ b/src/python/nimbusml/ensemble/subset_selector/_allinstanceselector.py @@ -0,0 +1,43 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +AllInstanceSelector +""" + +__all__ = ["AllInstanceSelector"] + + +from ...internal.core.ensemble.subset_selector._allinstanceselector import \ + AllInstanceSelector as core +from ...internal.utils.utils import trace + + +class AllInstanceSelector(core): + """ + **Description** + Selects all rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + core.__init__( + self, + feature_selector=feature_selector, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/subset_selector/_bootstrapselector.py b/src/python/nimbusml/ensemble/subset_selector/_bootstrapselector.py new file mode 100644 index 00000000..90b79e00 --- /dev/null +++ b/src/python/nimbusml/ensemble/subset_selector/_bootstrapselector.py @@ -0,0 +1,43 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BootstrapSelector +""" + +__all__ = ["BootstrapSelector"] + + +from ...internal.core.ensemble.subset_selector._bootstrapselector import \ + BootstrapSelector as core +from ...internal.utils.utils import trace + + +class BootstrapSelector(core): + """ + **Description** + Selects a bootstrapped sample of the rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + core.__init__( + self, + feature_selector=feature_selector, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/subset_selector/_randompartitionselector.py b/src/python/nimbusml/ensemble/subset_selector/_randompartitionselector.py new file mode 100644 index 00000000..10572b92 --- /dev/null +++ b/src/python/nimbusml/ensemble/subset_selector/_randompartitionselector.py @@ -0,0 +1,43 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomPartitionSelector +""" + +__all__ = ["RandomPartitionSelector"] + + +from ...internal.core.ensemble.subset_selector._randompartitionselector import \ + RandomPartitionSelector as core +from ...internal.utils.utils import trace + + +class RandomPartitionSelector(core): + """ + **Description** + Randomly partitions the rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + core.__init__( + self, + feature_selector=feature_selector, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/examples/EnsembleClassifier.py b/src/python/nimbusml/examples/EnsembleClassifier.py new file mode 100644 index 00000000..1ebb0e2a --- /dev/null +++ b/src/python/nimbusml/examples/EnsembleClassifier.py @@ -0,0 +1,85 @@ +############################################################################### +# EnsembleClassifier +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.ensemble import EnsembleClassifier +from nimbusml.ensemble.feature_selector import RandomFeatureSelector +from nimbusml.ensemble.output_combiner import ClassifierVoting +from nimbusml.ensemble.subset_selector import RandomPartitionSelector +from nimbusml.ensemble.sub_model_selector import ClassifierBestDiverseSelector + +# data input (as a FileDataStream) +path = get_dataset('infert').as_filepath() +data = FileDataStream.read_csv(path) +print(data.head()) +# age case education induced parity ... row_num spontaneous ... +# 0 26 1 0-5yrs 1 6 ... 1 2 ... +# 1 42 1 0-5yrs 1 1 ... 2 0 ... +# 2 39 1 0-5yrs 2 6 ... 3 0 ... +# 3 34 1 0-5yrs 2 4 ... 4 0 ... +# 4 35 1 6-11yrs 1 3 ... 5 1 ... + + +# define the training pipeline using default sampling and ensembling parameters +pipeline_with_defaults = Pipeline([ + OneHotVectorizer(columns={'edu': 'education'}), + EnsembleClassifier(feature=['age', 'edu', 'parity'], + label='induced', + num_models=3) +]) + +# train, predict, and evaluate +metrics, predictions = pipeline_with_defaults.fit(data).test(data, output_scores=True) + +# print predictions +print(predictions.head()) +# PredictedLabel Score.0 Score.1 Score.2 +# 0 2 0.202721 0.186598 0.628115 +# 1 0 0.716737 0.190289 0.092974 +# 2 2 0.201026 0.185602 0.624761 +# 3 0 0.423328 0.235074 0.365649 +# 4 0 0.577509 0.220827 0.201664 + +# print evaluation metrics +print(metrics) +# Accuracy(micro-avg) Accuracy(macro-avg) Log-loss ... (class 0) ... +# 0 0.612903 0.417519 0.846467 ... 0.504007 ... 
+#  (class 1)  (class 2)
+#   1.244033   1.439364
+
+
+# define the training pipeline with specific sampling and ensembling options
+pipeline_with_options = Pipeline([
+    OneHotVectorizer(columns={'edu': 'education'}),
+    EnsembleClassifier(feature=['age', 'edu', 'parity'],
+                       label='induced',
+                       num_models=3,
+                       sampling_type=RandomPartitionSelector(
+                           feature_selector=RandomFeatureSelector(
+                               features_selection_proportion=0.7)),
+                       sub_model_selector_type=ClassifierBestDiverseSelector(),
+                       output_combiner=ClassifierVoting())
+])
+
+# train, predict, and evaluate
+metrics, predictions = pipeline_with_options.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#    PredictedLabel  Score.0  Score.1  Score.2
+# 0               2      0.0      0.0      1.0
+# 1               0      1.0      0.0      0.0
+# 2               2      0.0      0.0      1.0
+# 3               0      1.0      0.0      0.0
+# 4               0      1.0      0.0      0.0
+
+# print evaluation metrics
+# note that accuracy metrics are lower than with defaults as this is a small
+# dataset that we partition into 3 chunks for each classifier, which decreases
+# model quality.
+print(metrics)
+#    Accuracy(micro-avg)  Accuracy(macro-avg)   Log-loss  ... (class 0) ...
+# 0             0.596774              0.38352  13.926926  ...   0.48306 ...
+#  (class 1)  (class 2)
+#   33.52293  29.871374
\ No newline at end of file
diff --git a/src/python/nimbusml/examples/EnsembleRegressor.py b/src/python/nimbusml/examples/EnsembleRegressor.py
new file mode 100644
index 00000000..e5f73c97
--- /dev/null
+++ b/src/python/nimbusml/examples/EnsembleRegressor.py
@@ -0,0 +1,78 @@
+###############################################################################
+# EnsembleRegressor
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.ensemble import EnsembleRegressor
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import RegressorMedian
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import RegressorBestDiverseSelector
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path)
+print(data.head())
+#    age  case education  induced  parity  ... row_num  spontaneous  ...
+# 0   26     1    0-5yrs        1       6  ...       1            2  ...
+# 1   42     1    0-5yrs        1       1  ...       2            0  ...
+# 2   39     1    0-5yrs        2       6  ...       3            0  ...
+# 3   34     1    0-5yrs        2       4  ...       4            0  ...
+# 4   35     1   6-11yrs        1       3  ...       5            1  ...
+
+# define the training pipeline using default sampling and ensembling parameters
+pipeline_with_defaults = Pipeline([
+    OneHotVectorizer(columns={'edu': 'education'}),
+    EnsembleRegressor(feature=['induced', 'edu'], label='age', num_models=3)
+])
+
+# train, predict, and evaluate
+metrics, predictions = pipeline_with_defaults.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#        Score
+# 0  26.046741
+# 1  26.046741
+# 2  29.225840
+# 3  29.225840
+# 4  33.849384
+
+# print evaluation metrics
+print(metrics)
+#    L1(avg)    L2(avg)  RMS(avg)  Loss-fn(avg)  R Squared
+# 0  4.69884  33.346123   5.77461     33.346124  -0.214011
+
+
+# define the training pipeline with specific sampling and ensembling options
+pipeline_with_options = Pipeline([
+    OneHotVectorizer(columns={'edu': 'education'}),
+    EnsembleRegressor(feature=['induced', 'edu'],
+                      label='age',
+                      num_models=3,
+                      sampling_type=RandomPartitionSelector(
+                          feature_selector=RandomFeatureSelector(
+                              features_selection_proportion=0.7)),
+                      sub_model_selector_type=RegressorBestDiverseSelector(),
+                      output_combiner=RegressorMedian())
+])
+
+# train, predict, and evaluate
+metrics, predictions = pipeline_with_options.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#        Score
+# 0  37.122200
+# 1  37.122200
+# 2  41.296204
+# 3  41.296204
+# 4  33.591423
+
+# print evaluation metrics
+# note that the converged loss function values are worse than with defaults as
+# this is a small dataset that we partition into 3 chunks for each regressor,
+# which decreases model quality
+print(metrics)
+#     L1(avg)    L2(avg)  RMS(avg)  Loss-fn(avg)  R Squared
+# 0  5.481676  44.924838  6.702599     44.924838   -0.63555
diff --git a/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py b/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py
new file mode 100644
index 00000000..1a2d70e6
--- /dev/null
+++ b/src/python/nimbusml/examples/LinearSvmBinaryClassifier.py
@@ -0,0 +1,37 @@
+###############################################################################
+# LinearSvmBinaryClassifier
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.linear_model import LinearSvmBinaryClassifier
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+
+data = FileDataStream.read_csv(path)
+print(data.head())
+#    age  case education  induced  parity  ... row_num  spontaneous  ...
+# 0   26     1    0-5yrs        1       6  ...       1            2  ...
+# 1   42     1    0-5yrs        1       1  ...       2            0  ...
+# 2   39     1    0-5yrs        2       6  ...       3            0  ...
+# 3   34     1    0-5yrs        2       4  ...       4            0  ...
+# 4   35     1   6-11yrs        1       3  ...       5            1  ...
+# define the training pipeline
+pipeline = Pipeline([LinearSvmBinaryClassifier(
+    feature=['age', 'parity', 'spontaneous'], label='case')])
+
+# train, predict, and evaluate
+# TODO: Replace with CV
+metrics, predictions = pipeline.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#    PredictedLabel     Score  Probability
+# 0               1  0.688481     0.607060
+# 1               0 -2.514992     0.203312
+# 2               0 -3.479344     0.129230
+# 3               0 -3.016621     0.161422
+# 4               0 -0.825512     0.397461
+# print evaluation metrics
+print(metrics)
+#         AUC  Accuracy  Positive precision  Positive recall  ...
+# 0  0.705476   0.71371            0.666667         0.289157  ...
diff --git a/src/python/nimbusml/examples/PipelineWithFeatureContributions.py b/src/python/nimbusml/examples/PipelineWithFeatureContributions.py new file mode 100644 index 00000000..8d31decb --- /dev/null +++ b/src/python/nimbusml/examples/PipelineWithFeatureContributions.py @@ -0,0 +1,86 @@ +############################################################################### +# Pipeline with observation level feature contributions + +# Scoring a dataset with a trained model produces a score, or prediction, for +# each example. To understand and explain these predictions it can be useful to +# inspect which features influenced them most significantly. This function +# computes a model-specific list of per-feature contributions to the score for +# each example. These contributions can be positive (they make the score +# higher) or negative (they make the score lower). + +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.ensemble import FastTreesBinaryClassifier +from nimbusml.linear_model import LogisticRegressionBinaryClassifier + +# data input (as a FileDataStream) +path = get_dataset('uciadult_train').as_filepath() + +data = FileDataStream.read_csv(path) +print(data.head()) +# label workclass education ... capital-loss hours-per-week +# 0 0 Private 11th ... 0 40 +# 1 0 Private HS-grad ... 0 50 +# 2 1 Local-gov Assoc-acdm ... 0 40 +# 3 1 Private Some-college ... 0 40 +# 4 0 ? Some-college ... 0 30 + +# define the training pipeline with a linear model +lr_pipeline = Pipeline([LogisticRegressionBinaryClassifier( + feature=['age', 'education-num', 'hours-per-week'], label='label')]) + +# train the model +lr_model = lr_pipeline.fit(data) + +# For linear models, the contribution of a given feature is equal to the +# product of feature value times the corresponding weight. Similarly, for +# Generalized Additive Models (GAM), the contribution of a feature is equal to +# the shape function for the given feature evaluated at the feature value. +lr_feature_contributions = lr_model.get_feature_contributions(data) + +# Print predictions with feature contributions, which give a relative measure +# of how much each feature impacted the Score. +print("========== Feature Contributions for Linear Model ==========") +print(lr_feature_contributions.head()) +# label ... PredictedLabel Score ... FeatureContributions.hours-per-week +# 0 0 ... 0 -2.010687 ... 0.833069 +# 1 0 ... 0 -1.216163 ... 0.809928 +# 2 1 ... 0 -1.248412 ... 0.485957 +# 3 1 ... 0 -1.132419 ... 0.583148 +# 4 0 ... 0 -1.969522 ... 0.437361 + +# define the training pipeline with a tree model +tree_pipeline = Pipeline([FastTreesBinaryClassifier( + feature=['age', 'education-num', 'hours-per-week'], label='label')]) + +# train the model +tree_model = tree_pipeline.fit(data) + +# For tree-based models, the calculation of feature contribution essentially +# consists in determining which splits in the tree have the most impact on the +# final score and assigning the value of the impact to the features determining +# the split. More precisely, the contribution of a feature is equal to the +# change in score produced by exploring the opposite sub-tree every time a +# decision node for the given feature is encountered. +# +# Consider a simple case with a single decision tree that has a decision node +# for the binary feature F1. 
Given an example that has feature F1 equal to
+# true, we can calculate the score it would have obtained if we chose the
+# subtree corresponding to the feature F1 being equal to false while keeping
+# the other features constant. The contribution of feature F1 for the given
+# example is the difference between the original score and the score obtained
+# by taking the opposite decision at the node corresponding to feature F1.
+# This algorithm extends naturally to models with many decision trees.
+tree_feature_contributions = tree_model.get_feature_contributions(data)

+# Print predictions with feature contributions, which give a relative measure
+# of how much each feature impacted the Score.
+print("========== Feature Contributions for Tree Model ==========")
+print(tree_feature_contributions.head())
+#    label ... PredictedLabel      Score ... FeatureContributions.hours-per-week
+# 0      0 ...              0 -16.717360 ...                           -0.608664
+# 1      0 ...              0  -7.688200 ...                           -0.541213
+# 2      1 ...              1   1.571164 ...                            0.032862
+# 3      1 ...              1   2.115638 ...                            0.537077
+# 4      0 ...              0 -23.038410 ...                           -0.682764
+
diff --git a/src/python/nimbusml/examples/SsaForecaster.py b/src/python/nimbusml/examples/SsaForecaster.py
index a7884c7c..1662a8ff 100644
--- a/src/python/nimbusml/examples/SsaForecaster.py
+++ b/src/python/nimbusml/examples/SsaForecaster.py
@@ -43,4 +43,5 @@
 # 6    0.05     0.07     0.05       0.09       0.12
 # 7    0.07     0.09     0.09       0.12       0.16
 # 8    0.09    99.00    99.00      57.92      82.88
-# 9    1.10     0.10     0.10      60.50      77.18
\ No newline at end of file
+# 9    1.10     0.10     0.10      60.50      77.18
+
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/EnsembleClassifier_iris_df.py b/src/python/nimbusml/examples/examples_from_dataframe/EnsembleClassifier_iris_df.py
new file mode 100644
index 00000000..4f1e2108
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/EnsembleClassifier_iris_df.py
@@ -0,0 +1,53 @@
+###############################################################################
+# EnsembleClassifier
+import numpy as np
+import pandas as pd
+from nimbusml.datasets import get_dataset
+from nimbusml.ensemble import EnsembleClassifier
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import ClassifierVoting
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import ClassifierBestDiverseSelector
+from sklearn.model_selection import train_test_split
+
+# use 'iris' data set to create test and train data
+#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
+# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
+# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
+np.random.seed(0)
+
+df = get_dataset("iris").as_df()
+df.drop(['Species'], inplace=True, axis=1)
+
+X_train, X_test, y_train, y_test = \
+    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
+
+# train a model with default sampling and ensembling parameters and score
+ensemble_with_defaults = EnsembleClassifier(num_models=3).fit(X_train, y_train)
+
+scores = ensemble_with_defaults.predict(X_test)
+scores = pd.to_numeric(scores)
+
+# evaluate the model
+print('Accuracy:', np.mean(y_test == [i for i in scores]))
+# Accuracy: 0.9473684210526315
+
+
+# train a model with specific sampling and ensembling options and score
+ensemble_with_options = EnsembleClassifier(
+    num_models=3,
+    sampling_type=RandomPartitionSelector(
+        feature_selector=RandomFeatureSelector(
+            features_selection_proportion=0.7)),
+    sub_model_selector_type=ClassifierBestDiverseSelector(),
    output_combiner=ClassifierVoting()).fit(X_train, y_train)
+
+scores = ensemble_with_options.predict(X_test)
+scores = pd.to_numeric(scores)
+
+# evaluate the model
+# note that accuracy is lower than with defaults as this is a small dataset
+# that we partition into 3 chunks for each classifier, which decreases model
+# quality.
+print('Accuracy:', np.mean(y_test == [i for i in scores]))
+# Accuracy: 0.5789473684210527
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/EnsembleRegressor_airquality_df.py b/src/python/nimbusml/examples/examples_from_dataframe/EnsembleRegressor_airquality_df.py
new file mode 100644
index 00000000..b80df686
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/EnsembleRegressor_airquality_df.py
@@ -0,0 +1,49 @@
+###############################################################################
+# EnsembleRegressor
+import numpy as np
+from nimbusml.datasets import get_dataset
+from nimbusml.ensemble import EnsembleRegressor
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import RegressorMedian
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import RegressorBestDiverseSelector
+from sklearn.metrics import r2_score
+from sklearn.model_selection import train_test_split
+
+# use the built-in data set 'airquality' to create test and train data
+#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
+# 0           1   41.0    190.0   7.4    67      5    1
+# 1           2   36.0    118.0   8.0    72      5    2
+np.random.seed(0)
+
+df = get_dataset("airquality").as_df().fillna(0)
+df = df[df.Ozone.notnull()]
+
+X_train, X_test, y_train, y_test = train_test_split(
+    df.loc[:, df.columns != 'Ozone'], df['Ozone'])
+
+# train a model with default sampling and ensembling parameters and score
+ensemble_with_defaults = EnsembleRegressor(num_models=3).fit(X_train, y_train)
+scores = ensemble_with_defaults.predict(X_test)
+
+# evaluate the model
+print('R-squared fit:', r2_score(y_test, scores))
+# R-squared fit: 0.12144964995862884
+
+
+# train a model with specific sampling and ensembling options and score
+ensemble_with_options = EnsembleRegressor(
+    num_models=3,
+    sampling_type=RandomPartitionSelector(
+        feature_selector=RandomFeatureSelector(
+            features_selection_proportion=0.7)),
+    sub_model_selector_type=RegressorBestDiverseSelector(),
+    output_combiner=RegressorMedian()).fit(X_train, y_train)
+scores = ensemble_with_options.predict(X_test)
+
+# evaluate the model
+# note that this is a worse fit than with defaults as this is a small dataset
+# that we partition into 3 chunks for each regressor, which decreases model
+# quality
+print('R-squared fit:', r2_score(y_test, scores))
+# R-squared fit: 0.027908675807698735
\ No newline at end of file
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/LinearSvmBinaryClassifier_df.py b/src/python/nimbusml/examples/examples_from_dataframe/LinearSvmBinaryClassifier_df.py
new file mode 100644
index 00000000..a421dae8
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/LinearSvmBinaryClassifier_df.py
@@ -0,0 +1,31 @@
+###############################################################################
+# LinearSvmBinaryClassifier
+import numpy as np
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.linear_model import LinearSvmBinaryClassifier
+from sklearn.model_selection import train_test_split
+
+# use the
built-in data set 'infert' to create test and train data +# Unnamed: 0 education age parity induced case spontaneous stratum \ +# 0 1 0.0 26.0 6.0 1.0 1.0 2.0 1.0 +# 1 2 0.0 42.0 1.0 1.0 1.0 0.0 2.0 +# pooled.stratum education_str +# 0 3.0 0-5yrs +# 1 1.0 0-5yrs +np.random.seed(0) + +df = get_dataset("infert").as_df() + +# remove : and ' ' from column names, and encode categorical column +df.columns = [i.replace(': ', '') for i in df.columns] +df = (OneHotVectorizer() << 'education_str').fit_transform(df) + +X_train, X_test, y_train, y_test = \ + train_test_split(df.loc[:, df.columns != 'case'], df['case']) + +lr = LinearSvmBinaryClassifier().fit(X_train, y_train) +scores = lr.predict(X_test) + +# Evaluate the model +print('Accuracy:', np.mean(y_test == [i for i in scores])) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/SsaForecaster_df.py b/src/python/nimbusml/examples/examples_from_dataframe/SsaForecaster_df.py index d4138b31..c93636fc 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/SsaForecaster_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/SsaForecaster_df.py @@ -83,4 +83,4 @@ # 11 1 2.00 3.00 4.00 -0.00 # 12 100 3.00 4.00 0.00 1.00 # 13 110 4.00 -0.00 1.00 75.50 -# 14 120 -0.00 1.00 83.67 83.25 \ No newline at end of file +# 14 120 -0.00 1.00 83.67 83.25 diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py index c99d7401..56a00809 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/WordEmbedding_df.py @@ -3,7 +3,7 @@ import pandas from nimbusml import Pipeline from nimbusml.feature_extraction.text import WordEmbedding -from nimbusml.feature_extraction.text import NGramFeaturizer +from nimbusml.feature_extraction.text._ngramfeaturizer import NGramFeaturizer from nimbusml.feature_extraction.text.extractor import Ngram # create the data diff --git a/src/python/nimbusml/internal/core/base_pipeline_item.py b/src/python/nimbusml/internal/core/base_pipeline_item.py index e45b3d0c..8700da1a 100644 --- a/src/python/nimbusml/internal/core/base_pipeline_item.py +++ b/src/python/nimbusml/internal/core/base_pipeline_item.py @@ -15,6 +15,7 @@ from abc import ABCMeta, abstractmethod from collections import OrderedDict from itertools import chain +from shutil import copyfile from textwrap import wrap import six @@ -375,6 +376,8 @@ def _get_node(self, **params): def __getstate__(self): "Selects what to pickle." odict = self.__dict__.copy() + odict['export_version'] = 1 + if hasattr(self, 'model_') and \ self.model_ is not None and os.path.isfile(self.model_): with open(self.model_, "rb") as mfile: @@ -387,8 +390,11 @@ def __getstate__(self): def __setstate__(self, state): "Restore a pickled object." for k, v in state.items(): - if k not in {'modelbytes', 'type'}: + if k not in {'modelbytes', 'type', 'export_version'}: setattr(self, k, v) + + # Note: modelbytes and type were + # added before export_version 1 if 'modelbytes' in state: (fd, modelfile) = tempfile.mkstemp() fl = os.fdopen(fd, "wb") @@ -442,6 +448,19 @@ def get_roles_params(self): res["columns"] = pars return res + @trace + def save_model(self, dst): + """ + Save model to file. 
For more details, please refer to
+        `load/save model `_
+
+        :param dst: the filename to save the model to
+
+        """
+        if self.model_ is not None:
+            if os.path.isfile(self.model_):
+                copyfile(self.model_, dst)
+
     def __getitem__(self, cols):
         """
         Returns a View on this element restricted to the selected column.
@@ -937,8 +956,10 @@ def _steal_io(self, node):
         """
         if hasattr(node, '_columns') and node._columns is not None:
             self << node._columns
-            setattr(node, node._attr_input,
-                    getattr(node, node._attr_output))
+
+            if hasattr(node, '_attr_output'):
+                setattr(node, node._attr_input,
+                        getattr(node, node._attr_output))
         else:
             # No columns specified. The user plans to fit the pipeline as
             # fit(X, y).
diff --git a/src/python/nimbusml/internal/core/ensemble/_ensembleclassifier.py b/src/python/nimbusml/internal/core/ensemble/_ensembleclassifier.py
new file mode 100644
index 00000000..13850fc8
--- /dev/null
+++ b/src/python/nimbusml/internal/core/ensemble/_ensembleclassifier.py
@@ -0,0 +1,238 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+EnsembleClassifier
+"""
+
+__all__ = ["EnsembleClassifier"]
+
+
+from ...entrypoints._ensemblemulticlassoutputcombiner_multimedian import \
+    multi_median
+from ...entrypoints._ensemblemulticlasssubmodelselector_allselectormulticlass import \
+    all_selector_multi_class
+from ...entrypoints._ensemblesubsetselector_bootstrapselector import \
+    bootstrap_selector
+from ...entrypoints.trainers_ensembleclassification import \
+    trainers_ensembleclassification
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles
+
+
+class EnsembleClassifier(
+        BasePipelineItem,
+        DefaultSignatureWithRoles):
+    """
+
+    **Description**
+        Train a multi class ensemble model
+
+    .. remarks::
+        An Ensemble is a set of models, each trained on a sample of the
+        training set. Training an ensemble instead of a single model can boost
+        the accuracy of a given algorithm.
+
+        The quality of an Ensemble depends on two factors: accuracy and
+        diversity. An Ensemble can be thought of as teamwork: if every team
+        member is diverse and competent, then the team can perform very well.
+        Here a team member is a base learner and the team is the Ensemble. In
+        the case of classification ensembles, the base learner is a
+        ``LogisticRegressionClassifier``.
+
+
+    :param sampling_type: Specifies how the training samples are created:
+
+        * ``BootstrapSelector``: takes a bootstrap sample of the training set
+          (sampling with replacement). This is the default method.
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these Subset Selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the
+          default method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the
+        number of subsets of the training set to sample. The default value is
+        50. If batches are used then this indicates the number of models per
+        batch.
+
+    :param sub_model_selector_type: Determines the set of models the
+        ``output_combiner`` uses, removing the least significant models. This
+        is used to improve the accuracy and reduce the model size, and is
+        also called pruning.
+
+        * ``ClassifierAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``ClassifierBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity
+          is supported.
+        * ``ClassifierBestPerformanceSelector``: combines only the models with
+          the best performance according to some metric. The metric can be
+          ``"AccuracyMicro"``, ``"AccuracyMacro"``, ``"LogLoss"``, or
+          ``"LogLossReduction"``.
+
+
+    :param output_combiner: indicates how to combine the predictions of the
+        different models into a single prediction. There are five available
+        output combiners for classification:
+
+        * ``ClassifierAverage``: computes the average of the scores produced
+          by the trained models.
+        * ``ClassifierMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``ClassifierStacking``: computes the output by training a model on
+          a training set where each instance is a vector containing the
+          outputs of the different models on a training instance, and the
+          instance's label.
+        * ``ClassifierVoting``: computes the fraction of positive predictions
+          for each class from all the trained models, and outputs the class
+          with the largest number.
+        * ``ClassifierWeightedAverage``: computes the weighted average of the
+          outputs of the trained models, weighted by the specified metric.
+          The metric can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``.
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning
+          message is displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature scaling ensures the distances between data points are
+        proportional and enables various optimization methods such as
+        gradient descent to converge much faster. If normalization is
+        performed, a ``MinMax`` normalizer is used. It normalizes values in
+        an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
+        ``b - a = 1``. This normalizer preserves sparsity by mapping zero to
+        zero.
+
+    :param caching: Whether trainer should cache input training data.
+
+    :param train_parallel: All the base learners will run asynchronously if
+        the value is true.
+
+    :param batch_size: train the models iteratively on subsets of the
+        training set of this size. When using this option, it is assumed that
+        the training set is randomized enough so that every batch is a random
+        sample of instances. The default value is -1, indicating using the
+        whole training set. If the value is changed to an integer greater
+        than 0, the number of trained models is the number of batches (the
+        size of the training set divided by the batch size), times
+        ``num_models``.
+
+    :param show_metrics: True, if metrics for each model need to be evaluated
+        and shown in a comparison table. This is done using the validation
+        set if available, or the training set otherwise.
+ + :param params: Additional arguments sent to compute engine. + + .. seealso:: + * Subset selectors: + :py:class:`AllInstanceSelector + `, + :py:class:`BootstrapSelector + `, + :py:class:`RandomPartitionSelector + ` + + * Feature selectors: + :py:class:`AllFeatureSelector + `, + :py:class:`RandomFeatureSelector + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + `, + :py:class:`ClassifierWeightedAverage + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + sampling_type=bootstrap_selector( + feature_selector={ + 'Name': 'AllFeatureSelector'}), + num_models=None, + sub_model_selector_type=None, + output_combiner=None, + normalize='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + **params): + BasePipelineItem.__init__( + self, type='classifier', **params) + + self.sampling_type = sampling_type + self.num_models = num_models + self.sub_model_selector_type = sub_model_selector_type + self.output_combiner = output_combiner + self.normalize = normalize + self.caching = caching + self.train_parallel = train_parallel + self.batch_size = batch_size + self.show_metrics = show_metrics + + @property + def _entrypoint(self): + return trainers_ensembleclassification + + @trace + def _get_node(self, **all_args): + algo_args = dict( + feature_column_name=self._getattr_role( + 'feature_column_name', + all_args), + label_column_name=self._getattr_role( + 'label_column_name', + all_args), + sampling_type=self.sampling_type, + num_models=self.num_models, + sub_model_selector_type=self.sub_model_selector_type, + output_combiner=self.output_combiner, + normalize_features=self.normalize, + caching=self.caching, + train_parallel=self.train_parallel, + batch_size=self.batch_size, + show_metrics=self.show_metrics) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/ensemble/_ensembleregressor.py b/src/python/nimbusml/internal/core/ensemble/_ensembleregressor.py new file mode 100644 index 00000000..408b607d --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/_ensembleregressor.py @@ -0,0 +1,226 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +EnsembleRegressor +""" + +__all__ = ["EnsembleRegressor"] + + +from ...entrypoints._ensembleregressionsubmodelselector_allselector import \ + all_selector +from ...entrypoints._ensemblesubsetselector_bootstrapselector import \ + bootstrap_selector +from ...entrypoints.trainers_ensembleregression import \ + trainers_ensembleregression +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles + + +class EnsembleRegressor( + BasePipelineItem, + DefaultSignatureWithRoles): + """ + + **Description** + Train a regression ensemble model + + .. 
remarks::
+        An Ensemble is a set of models, each trained on a sample of the
+        training set. Training an ensemble instead of a single model can boost
+        the accuracy of a given algorithm.
+
+        The quality of an Ensemble depends on two factors: accuracy and
+        diversity. An Ensemble can be thought of as teamwork: if every team
+        member is diverse and competent, then the team can perform very well.
+        Here a team member is a base learner and the team is the Ensemble. In
+        the case of regression ensembles, the base learner is an
+        ``OnlineGradientDescentRegressor``.
+
+
+    :param sampling_type: Specifies how the training samples are created:
+
+        * ``BootstrapSelector``: takes a bootstrap sample of the training set
+          (sampling with replacement). This is the default method.
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these Subset Selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the
+          default method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the
+        number of subsets of the training set to sample. The default value is
+        50. If batches are used then this indicates the number of models per
+        batch.
+
+    :param sub_model_selector_type: Determines the set of models the
+        ``output_combiner`` uses, removing the least significant models. This
+        is used to improve the accuracy and reduce the model size, and is
+        also called pruning.
+
+        * ``RegressorAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``RegressorBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity
+          is supported.
+        * ``RegressorBestPerformanceSelector``: combines only the models with
+          the best performance according to the specified metric. The metric
+          can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or ``"RSquared"``.
+
+
+    :param output_combiner: indicates how to combine the predictions of the
+        different models into a single prediction. There are three available
+        output combiners for regression:
+
+        * ``RegressorAverage``: computes the average of the scores produced
+          by the trained models.
+        * ``RegressorMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``RegressorStacking``: computes the output by training a model on
+          a training set where each instance is a vector containing the
+          outputs of the different models on a training instance, and the
+          instance's label.
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning
+          message is displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature scaling ensures the distances between data points are
+        proportional and enables various optimization methods such as
+        gradient descent to converge much faster. If normalization is
+        performed, a ``MinMax`` normalizer is used.
It normalizes values in an
+        interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
+        ``b - a = 1``. This normalizer preserves sparsity by mapping zero to
+        zero.
+
+    :param caching: Whether trainer should cache input training data.
+
+    :param train_parallel: All the base learners will run asynchronously if
+        the value is true.
+
+    :param batch_size: train the models iteratively on subsets of the
+        training set of this size. When using this option, it is assumed that
+        the training set is randomized enough so that every batch is a random
+        sample of instances. The default value is -1, indicating using the
+        whole training set. If the value is changed to an integer greater
+        than 0, the number of trained models is the number of batches (the
+        size of the training set divided by the batch size), times
+        ``num_models``.
+
+    :param show_metrics: True, if metrics for each model need to be evaluated
+        and shown in a comparison table. This is done using the validation
+        set if available, or the training set otherwise.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. seealso::
+        * Subset selectors:
+        :py:class:`AllInstanceSelector
+        <nimbusml.ensemble.subset_selector.AllInstanceSelector>`,
+        :py:class:`BootstrapSelector
+        <nimbusml.ensemble.subset_selector.BootstrapSelector>`,
+        :py:class:`RandomPartitionSelector
+        <nimbusml.ensemble.subset_selector.RandomPartitionSelector>`
+
+        * Feature selectors:
+        :py:class:`AllFeatureSelector
+        <nimbusml.ensemble.feature_selector.AllFeatureSelector>`,
+        :py:class:`RandomFeatureSelector
+        <nimbusml.ensemble.feature_selector.RandomFeatureSelector>`
+
+        * Submodel selectors:
+        :py:class:`RegressorAllSelector
+        <nimbusml.ensemble.sub_model_selector.RegressorAllSelector>`,
+        :py:class:`RegressorBestDiverseSelector
+        <nimbusml.ensemble.sub_model_selector.RegressorBestDiverseSelector>`,
+        :py:class:`RegressorBestPerformanceSelector
+        <nimbusml.ensemble.sub_model_selector.RegressorBestPerformanceSelector>`
+
+        * Output combiners:
+        :py:class:`RegressorAverage
+        <nimbusml.ensemble.output_combiner.RegressorAverage>`,
+        :py:class:`RegressorMedian
+        <nimbusml.ensemble.output_combiner.RegressorMedian>`,
+        :py:class:`RegressorStacking
+        <nimbusml.ensemble.output_combiner.RegressorStacking>`
+
+
+    .. index:: models, ensemble, regression
+
+    Example:
+       .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py
+              :language: python
+    """
+
+    @trace
+    def __init__(
+            self,
+            sampling_type=bootstrap_selector(
+                feature_selector={
+                    'Name': 'AllFeatureSelector'}),
+            num_models=None,
+            sub_model_selector_type=None,
+            output_combiner=None,
+            normalize='Auto',
+            caching='Auto',
+            train_parallel=False,
+            batch_size=-1,
+            show_metrics=False,
+            **params):
+        BasePipelineItem.__init__(
+            self, type='regressor', **params)
+
+        self.sampling_type = sampling_type
+        self.num_models = num_models
+        self.sub_model_selector_type = sub_model_selector_type
+        self.output_combiner = output_combiner
+        self.normalize = normalize
+        self.caching = caching
+        self.train_parallel = train_parallel
+        self.batch_size = batch_size
+        self.show_metrics = show_metrics
+
+    @property
+    def _entrypoint(self):
+        return trainers_ensembleregression
+
+    @trace
+    def _get_node(self, **all_args):
+        algo_args = dict(
+            feature_column_name=self._getattr_role(
+                'feature_column_name',
+                all_args),
+            label_column_name=self._getattr_role(
+                'label_column_name',
+                all_args),
+            sampling_type=self.sampling_type,
+            num_models=self.num_models,
+            sub_model_selector_type=self.sub_model_selector_type,
+            output_combiner=self.output_combiner,
+            normalize_features=self.normalize,
+            caching=self.caching,
+            train_parallel=self.train_parallel,
+            batch_size=self.batch_size,
+            show_metrics=self.show_metrics)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/ensemble/_lightgbmbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/_lightgbmbinaryclassifier.py
index f44c5841..2bf8468b 100644
--- a/src/python/nimbusml/internal/core/ensemble/_lightgbmbinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/ensemble/_lightgbmbinaryclassifier.py
@@ -145,7 +145,7 @@ def
__init__( unbalanced_sets=False, weight_of_positive_examples=1.0, sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='Logloss', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/internal/core/ensemble/_lightgbmclassifier.py b/src/python/nimbusml/internal/core/ensemble/_lightgbmclassifier.py index 865587fd..5feace13 100644 --- a/src/python/nimbusml/internal/core/ensemble/_lightgbmclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/_lightgbmclassifier.py @@ -143,7 +143,7 @@ def __init__( unbalanced_sets=False, use_softmax=None, sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='Error', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/internal/core/ensemble/_lightgbmranker.py b/src/python/nimbusml/internal/core/ensemble/_lightgbmranker.py index 17d23591..6c06148d 100644 --- a/src/python/nimbusml/internal/core/ensemble/_lightgbmranker.py +++ b/src/python/nimbusml/internal/core/ensemble/_lightgbmranker.py @@ -140,7 +140,7 @@ def __init__( caching='Auto', custom_gains=[0, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095], sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='NormalizedDiscountedCumulativeGain', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/internal/core/ensemble/_lightgbmregressor.py b/src/python/nimbusml/internal/core/ensemble/_lightgbmregressor.py index 6d053a54..20fe5e57 100644 --- a/src/python/nimbusml/internal/core/ensemble/_lightgbmregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/_lightgbmregressor.py @@ -133,7 +133,7 @@ def __init__( booster=None, normalize='Auto', caching='Auto', - evaluation_metric='Default', + evaluation_metric='RootMeanSquaredError', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/internal/core/ensemble/feature_selector/__init__.py b/src/python/nimbusml/internal/core/ensemble/feature_selector/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/feature_selector/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/internal/core/ensemble/feature_selector/_allfeatureselector.py b/src/python/nimbusml/internal/core/ensemble/feature_selector/_allfeatureselector.py new file mode 100644 index 00000000..d2fab736 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/feature_selector/_allfeatureselector.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +AllFeatureSelector +""" + +__all__ = ["AllFeatureSelector"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class AllFeatureSelector(Component): + """ + **Description** + Selects all features for each trainer in the ensemble + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleFeatureSelector' + self.name = 'AllFeatureSelector' + self.settings = {} + + super( + AllFeatureSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/feature_selector/_randomfeatureselector.py b/src/python/nimbusml/internal/core/ensemble/feature_selector/_randomfeatureselector.py new file mode 100644 index 00000000..9933864a --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/feature_selector/_randomfeatureselector.py @@ -0,0 +1,52 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomFeatureSelector +""" + +__all__ = ["RandomFeatureSelector"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RandomFeatureSelector(Component): + """ + **Description** + Selects a random subset of features for each trainer in the ensemble + + :param features_selection_proportion: The proportion of features to be + selected. The range is 0.0-1.0. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + features_selection_proportion=0.8, + **params): + + self.features_selection_proportion = features_selection_proportion + self.kind = 'EnsembleFeatureSelector' + self.name = 'RandomFeatureSelector' + self.settings = {} + + if features_selection_proportion is not None: + self.settings['FeaturesSelectionProportion'] = try_set( + obj=features_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + RandomFeatureSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/__init__.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifieraverage.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifieraverage.py new file mode 100644 index 00000000..2b683f05 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifieraverage.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierAverage +""" + +__all__ = ["ClassifierAverage"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierAverage(Component): + """ + **Description** + Computes the average of the outputs of the trained models + + :param normalize: Whether to normalize the output of base models before + combining them. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + normalize=True, + **params): + + self.normalize = normalize + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiAverage' + self.settings = {} + + if normalize is not None: + self.settings['Normalize'] = try_set( + obj=normalize, none_acceptable=True, is_of_type=bool) + + super( + ClassifierAverage, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifiermedian.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifiermedian.py new file mode 100644 index 00000000..e984a7ff --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifiermedian.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierMedian +""" + +__all__ = ["ClassifierMedian"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierMedian(Component): + """ + **Description** + Computes the median of the outputs of the trained models + + :param normalize: Whether to normalize the output of base models before + combining them. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + normalize=True, + **params): + + self.normalize = normalize + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiMedian' + self.settings = {} + + if normalize is not None: + self.settings['Normalize'] = try_set( + obj=normalize, none_acceptable=True, is_of_type=bool) + + super( + ClassifierMedian, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifierstacking.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifierstacking.py new file mode 100644 index 00000000..0f6aef2f --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifierstacking.py @@ -0,0 +1,53 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierStacking +""" + +__all__ = ["ClassifierStacking"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierStacking(Component): + """ + **Description** + Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + validation_dataset_proportion=0.3, + **params): + + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiStacking' + self.settings = {} + + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + ClassifierStacking, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifiervoting.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifiervoting.py new file mode 100644 index 00000000..582a4df0 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifiervoting.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierVoting +""" + +__all__ = ["ClassifierVoting"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class ClassifierVoting(Component): + """ + **Description** + Computes the fraction of positive predictions for each class from all the trained models, and outputs the class with the largest number + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiVoting' + self.settings = {} + + super( + ClassifierVoting, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifierweightedaverage.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifierweightedaverage.py new file mode 100644 index 00000000..7459a947 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/_classifierweightedaverage.py @@ -0,0 +1,107 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierWeightedAverage +""" + +__all__ = ["ClassifierWeightedAverage"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierWeightedAverage(Component): + """ + + **Description** + Computes the weighted average of the outputs of the trained models + + + :param weightage_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. 
+ + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + weightage_name='AccuracyMicroAvg', + normalize=True, + **params): + + self.weightage_name = weightage_name + self.normalize = normalize + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiWeightedAverage' + self.settings = {} + + if weightage_name is not None: + self.settings['WeightageName'] = try_set( + obj=weightage_name, none_acceptable=True, is_of_type=str, values=[ + 'AccuracyMicroAvg', 'AccuracyMacroAvg']) + if normalize is not None: + self.settings['Normalize'] = try_set( + obj=normalize, none_acceptable=True, is_of_type=bool) + + super( + ClassifierWeightedAverage, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressoraverage.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressoraverage.py new file mode 100644 index 00000000..b129d20a --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressoraverage.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorAverage +""" + +__all__ = ["RegressorAverage"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class RegressorAverage(Component): + """ + **Description** + Computes the average of the outputs of the trained models + + :param params: Additional arguments sent to compute engine. 
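The weighted-average combiner defined above only records configuration; the per-model weights are computed inside ML.NET from the chosen metric. A minimal sketch, reusing the allowed WeightageName values from the __init__ above:

    from nimbusml.internal.core.ensemble.output_combiner._classifierweightedaverage import \
        ClassifierWeightedAverage

    combiner = ClassifierWeightedAverage(
        weightage_name='AccuracyMacroAvg', normalize=True)
    # Expected: combiner.settings ==
    #   {'WeightageName': 'AccuracyMacroAvg', 'Normalize': True};
    # try_set(values=[...]) should reject any weightage_name outside
    # ['AccuracyMicroAvg', 'AccuracyMacroAvg'].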
+ + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleRegressionOutputCombiner' + self.name = 'Average' + self.settings = {} + + super( + RegressorAverage, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressormedian.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressormedian.py new file mode 100644 index 00000000..113a5a60 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressormedian.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorMedian +""" + +__all__ = ["RegressorMedian"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class RegressorMedian(Component): + """ + **Description** + Computes the median of the outputs of the trained models + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleRegressionOutputCombiner' + self.name = 'Median' + self.settings = {} + + super( + RegressorMedian, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressorstacking.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressorstacking.py new file mode 100644 index 00000000..68bbe05f --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/_regressorstacking.py @@ -0,0 +1,53 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorStacking +""" + +__all__ = ["RegressorStacking"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RegressorStacking(Component): + """ + **Description** + Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + validation_dataset_proportion=0.3, + **params): + + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleRegressionOutputCombiner' + self.name = 'RegressionStacking' + self.settings = {} + + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + RegressorStacking, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/__init__.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierallselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierallselector.py new file mode 100644 index 00000000..a5890ded --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierallselector.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierAllSelector +""" + +__all__ = ["ClassifierAllSelector"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class ClassifierAllSelector(Component): + """ + **Description** + Combines all the models to create the output. This is the default submodel selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleMulticlassSubModelSelector' + self.name = 'AllSelectorMultiClass' + self.settings = {} + + super( + ClassifierAllSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierbestdiverseselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierbestdiverseselector.py new file mode 100644 index 00000000..a4c3e22c --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierbestdiverseselector.py @@ -0,0 +1,73 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierBestDiverseSelector +""" + +__all__ = ["ClassifierBestDiverseSelector"] + +import numbers + +from ....entrypoints._ensemblemulticlassdiversitymeasure_multidisagreementdiversitymeasure import \ + multi_disagreement_diversity_measure +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierBestDiverseSelector(Component): + """ + **Description** + Combines the models whose predictions are as diverse as possible. 
+ + :param diversity_metric_type: The metric type to be used to find the + diversity among base learners. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + diversity_metric_type=None, + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + + self.diversity_metric_type = diversity_metric_type + self.learners_selection_proportion = learners_selection_proportion + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleMulticlassSubModelSelector' + self.name = 'BestDiverseSelectorMultiClass' + self.settings = {} + + if diversity_metric_type is not None: + self.settings['DiversityMetricType'] = try_set( + obj=diversity_metric_type, none_acceptable=True, is_of_type=dict) + if learners_selection_proportion is not None: + self.settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + ClassifierBestDiverseSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierbestperformanceselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierbestperformanceselector.py new file mode 100644 index 00000000..baf96b79 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_classifierbestperformanceselector.py @@ -0,0 +1,101 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierBestPerformanceSelector +""" + +__all__ = ["ClassifierBestPerformanceSelector"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierBestPerformanceSelector(Component): + """ + + **Description** + Combines only the models with the best performance. + + + :param metric_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicro"``, ``"AccuracyMacro"``, + ``"LogLoss"``, or ``"LogLossReduction"``. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + .. 
seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + metric_name='AccuracyMicro', + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + + self.metric_name = metric_name + self.learners_selection_proportion = learners_selection_proportion + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleMulticlassSubModelSelector' + self.name = 'BestPerformanceSelectorMultiClass' + self.settings = {} + + if metric_name is not None: + self.settings['MetricName'] = try_set( + obj=metric_name, none_acceptable=True, is_of_type=str, values=[ + 'AccuracyMicro', 'AccuracyMacro', 'LogLoss', 'LogLossReduction']) + if learners_selection_proportion is not None: + self.settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + ClassifierBestPerformanceSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorallselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorallselector.py new file mode 100644 index 00000000..0aa9d614 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorallselector.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorAllSelector +""" + +__all__ = ["RegressorAllSelector"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class RegressorAllSelector(Component): + """ + **Description** + Combines all the models to create the output. This is the default submodel selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleRegressionSubModelSelector' + self.name = 'AllSelector' + self.settings = {} + + super( + RegressorAllSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorbestdiverseselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorbestdiverseselector.py new file mode 100644 index 00000000..4691dd21 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorbestdiverseselector.py @@ -0,0 +1,73 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorBestDiverseSelector +""" + +__all__ = ["RegressorBestDiverseSelector"] + +import numbers + +from ....entrypoints._ensembleregressiondiversitymeasure_regressiondisagreementdiversitymeasure import \ + regression_disagreement_diversity_measure +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RegressorBestDiverseSelector(Component): + """ + **Description** + Combines the models whose predictions are as diverse as possible. + + :param diversity_metric_type: The metric type to be used to find the + diversity among base learners. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + diversity_metric_type=None, + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + + self.diversity_metric_type = diversity_metric_type + self.learners_selection_proportion = learners_selection_proportion + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleRegressionSubModelSelector' + self.name = 'BestDiverseSelectorRegression' + self.settings = {} + + if diversity_metric_type is not None: + self.settings['DiversityMetricType'] = try_set( + obj=diversity_metric_type, none_acceptable=True, is_of_type=dict) + if learners_selection_proportion is not None: + self.settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + RegressorBestDiverseSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorbestperformanceselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorbestperformanceselector.py new file mode 100644 index 00000000..51b07c66 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/_regressorbestperformanceselector.py @@ -0,0 +1,99 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorBestPerformanceSelector +""" + +__all__ = ["RegressorBestPerformanceSelector"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RegressorBestPerformanceSelector(Component): + """ + + **Description** + Combines only the models with the best performance. + + + :param metric_name: the metric type to be used to find the weights for + each model. Can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or + ``"RSquared"``.
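A minimal sketch of configuring this selector; the import path and the MetricName whitelist come from the __init__ shown below:

    from nimbusml.internal.core.ensemble.sub_model_selector._regressorbestperformanceselector import \
        RegressorBestPerformanceSelector

    selector = RegressorBestPerformanceSelector(metric_name='Rms')
    # Expected: selector.settings['MetricName'] == 'Rms'; try_set restricts
    # the value to ['L1', 'L2', 'Rms', 'Loss', 'RSquared'].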
+ + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`EnsembleRegressor + ` + + * Submodel selectors: + :py:class:`RegressorAllSelector + `, + :py:class:`RegressorBestDiverseSelector + ` + + * Output combiners: + :py:class:`RegressorAverage + `, + :py:class:`RegressorMedian + `, + :py:class:`RegressorStacking + ` + + + .. index:: models, ensemble, regression + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + metric_name='L1', + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + + self.metric_name = metric_name + self.learners_selection_proportion = learners_selection_proportion + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleRegressionSubModelSelector' + self.name = 'BestPerformanceRegressionSelector' + self.settings = {} + + if metric_name is not None: + self.settings['MetricName'] = try_set( + obj=metric_name, none_acceptable=True, is_of_type=str, values=[ + 'L1', 'L2', 'Rms', 'Loss', 'RSquared']) + if learners_selection_proportion is not None: + self.settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + RegressorBestPerformanceSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/__init__.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/_classifierdisagreement.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/_classifierdisagreement.py new file mode 100644 index 00000000..d5fa370e --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/_classifierdisagreement.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierDisagreement +""" + +__all__ = ["ClassifierDisagreement"] + + +from .....utils.entrypoints import Component +from .....utils.utils import trace + + +class ClassifierDisagreement(Component): + """ + **Description** + A measure of disagreement in predictions between a pair of classifiers, averaged over all pairs + + :param params: Additional arguments sent to compute engine.
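To make "averaged over all pairs" concrete, here is an illustrative NumPy sketch of the statistic this measure represents (the real computation happens inside ML.NET; the helper name is local to this example):

    import itertools
    import numpy as np

    def mean_pairwise_disagreement(predictions):
        # predictions: one 1-D array of predicted labels per trained model
        pairs = itertools.combinations(predictions, 2)
        return float(np.mean([np.mean(a != b) for a, b in pairs]))

    preds = [np.array([0, 1, 1, 2]),
             np.array([0, 2, 1, 2]),
             np.array([1, 1, 1, 2])]
    print(mean_pairwise_disagreement(preds))  # 0.333...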
+ + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleMulticlassDiversityMeasure' + self.name = 'MultiDisagreementDiversityMeasure' + self.settings = {} + + super( + ClassifierDisagreement, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/_regressordisagreement.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/_regressordisagreement.py new file mode 100644 index 00000000..f0f3d95b --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/_regressordisagreement.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorDisagreement +""" + +__all__ = ["RegressorDisagreement"] + + +from .....utils.entrypoints import Component +from .....utils.utils import trace + + +class RegressorDisagreement(Component): + """ + **Description** + A measure of absolute value of disagreement in predictions between a pair of regressors, averaged over all pairs + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleRegressionDiversityMeasure' + self.name = 'RegressionDisagreementDiversityMeasure' + self.settings = {} + + super( + RegressorDisagreement, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/subset_selector/__init__.py b/src/python/nimbusml/internal/core/ensemble/subset_selector/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/subset_selector/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/internal/core/ensemble/subset_selector/_allinstanceselector.py b/src/python/nimbusml/internal/core/ensemble/subset_selector/_allinstanceselector.py new file mode 100644 index 00000000..75d86728 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/subset_selector/_allinstanceselector.py @@ -0,0 +1,50 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +AllInstanceSelector +""" + +__all__ = ["AllInstanceSelector"] + + +from ....entrypoints._ensemblefeatureselector_allfeatureselector import \ + all_feature_selector +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class AllInstanceSelector(Component): + """ + **Description** + Selects all rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + + self.feature_selector = feature_selector + self.kind = 'EnsembleSubsetSelector' + self.name = 'AllInstanceSelector' + self.settings = {} + + if feature_selector is not None: + self.settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + super( + AllInstanceSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/subset_selector/_bootstrapselector.py b/src/python/nimbusml/internal/core/ensemble/subset_selector/_bootstrapselector.py new file mode 100644 index 00000000..f394418f --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/subset_selector/_bootstrapselector.py @@ -0,0 +1,50 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BootstrapSelector +""" + +__all__ = ["BootstrapSelector"] + + +from ....entrypoints._ensemblefeatureselector_allfeatureselector import \ + all_feature_selector +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class BootstrapSelector(Component): + """ + **Description** + Selects a bootstrapped sample of the rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + + self.feature_selector = feature_selector + self.kind = 'EnsembleSubsetSelector' + self.name = 'BootstrapSelector' + self.settings = {} + + if feature_selector is not None: + self.settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + super( + BootstrapSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/subset_selector/_randompartitionselector.py b/src/python/nimbusml/internal/core/ensemble/subset_selector/_randompartitionselector.py new file mode 100644 index 00000000..a1f1e451 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/subset_selector/_randompartitionselector.py @@ -0,0 +1,50 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomPartitionSelector +""" + +__all__ = ["RandomPartitionSelector"] + + +from ....entrypoints._ensemblefeatureselector_allfeatureselector import \ + all_feature_selector +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RandomPartitionSelector(Component): + """ + **Description** + Randomly partitions the rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + + self.feature_selector = feature_selector + self.kind = 'EnsembleSubsetSelector' + self.name = 'RandomPartitionSelector' + self.settings = {} + + if feature_selector is not None: + self.settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + super( + RandomPartitionSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/linear_model/_linearsvmbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/_linearsvmbinaryclassifier.py new file mode 100644 index 00000000..54bff625 --- /dev/null +++ b/src/python/nimbusml/internal/core/linear_model/_linearsvmbinaryclassifier.py @@ -0,0 +1,161 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +LinearSvmBinaryClassifier +""" + +__all__ = ["LinearSvmBinaryClassifier"] + + +from ...entrypoints.trainers_linearsvmbinaryclassifier import \ + trainers_linearsvmbinaryclassifier +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles + + +class LinearSvmBinaryClassifier( + BasePipelineItem, + DefaultSignatureWithRoles): + """ + + Linear Support Vector Machine (SVM) Binary Classifier + + .. remarks:: + Linear SVM implements an algorithm that finds a hyperplane in the + feature space for binary classification, by solving an SVM problem. + For instance, with feature values *f_0, f_1,..., f_{D-1}*, the + prediction is given by determining what side of the hyperplane the + point falls into. That is the same as the sign of the feautures' + weighted sum, i.e. *\sum_{i = 0}^{D-1} \left(w_i * f_i \right) + b*, + where *w_0, w_1,..., w_{D-1}* are the weights computed by the + algorithm, and *b* is the bias computed by the algorithm. + + This algorithm implemented is the PEGASOS method, which alternates + between stochastic gradient descent steps and projection steps, + introduced by Shalev-Shwartz, Singer and Srebro. + + + **Reference** + + `Wikipedia entry for Support Vector Machine + `_ + + `Pegasos: Primal Estimated sub-GrAdient SOlver for SVM + `_ + + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param caching: Whether trainer should cache input training data. + + :param lambda_: Regularizer constant. + + :param perform_projection: Perform projection to unit-ball? 
Typically used + with batch size > 1. + + :param number_of_iterations: Number of iterations. + + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. + + :param no_bias: No bias. + + :param initial_weights: Initial Weights and bias, comma-separated. + + :param shuffle: Whether to shuffle for each training iteration. + + :param batch_size: Batch size. + + :param params: Additional arguments sent to compute engine. + + .. index:: models, classification, svm + + Example: + .. literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py + :language: python + """ + + @trace + def __init__( + self, + normalize='Auto', + caching='Auto', + lambda_=0.001, + perform_projection=False, + number_of_iterations=1, + initial_weights_diameter=0.0, + no_bias=False, + initial_weights=None, + shuffle=True, + batch_size=1, + **params): + BasePipelineItem.__init__( + self, type='classifier', **params) + + self.normalize = normalize + self.caching = caching + self.lambda_ = lambda_ + self.perform_projection = perform_projection + self.number_of_iterations = number_of_iterations + self.initial_weights_diameter = initial_weights_diameter + self.no_bias = no_bias + self.initial_weights = initial_weights + self.shuffle = shuffle + self.batch_size = batch_size + + @property + def _entrypoint(self): + return trainers_linearsvmbinaryclassifier + + @trace + def _get_node(self, **all_args): + algo_args = dict( + feature_column_name=self._getattr_role( + 'feature_column_name', + all_args), + label_column_name=self._getattr_role( + 'label_column_name', + all_args), + example_weight_column_name=self._getattr_role( + 'example_weight_column_name', + all_args), + normalize_features=self.normalize, + caching=self.caching, + lambda_=self.lambda_, + perform_projection=self.perform_projection, + number_of_iterations=self.number_of_iterations, + initial_weights_diameter=self.initial_weights_diameter, + no_bias=self.no_bias, + initial_weights=self.initial_weights, + shuffle=self.shuffle, + batch_size=self.batch_size) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/loss/loss_factory.py b/src/python/nimbusml/internal/core/loss/loss_factory.py index c34b809d..ab097b3a 100644 --- a/src/python/nimbusml/internal/core/loss/loss_factory.py +++ b/src/python/nimbusml/internal/core/loss/loss_factory.py @@ -100,7 +100,7 @@ def create_loss(cls, component_kind, learner, api_loss): api_loss_name = getattr(api_loss, '_string_name') api_loss_params = getattr(api_loss, '_params') except BaseException: - # The given object is not a pytlc loss object + # The given object is not a nimbusml loss object raise TypeError(error_msg) if api_loss_name not in valid_str_losses: diff --git a/src/python/nimbusml/internal/core/preprocessing/_tensorflowscorer.py b/src/python/nimbusml/internal/core/preprocessing/_tensorflowscorer.py index 29a82109..9b6bb4d3 100644 --- a/src/python/nimbusml/internal/core/preprocessing/_tensorflowscorer.py +++ b/src/python/nimbusml/internal/core/preprocessing/_tensorflowscorer.py @@ -52,41 +52,8 @@ class TensorFlowScorer(BasePipelineItem, DefaultSignature): :param output_columns: The name of 
the outputs. - :param label_column: Training labels. - - :param tensor_flow_label: TensorFlow label node. - - :param optimization_operation: The name of the optimization operation in - the TensorFlow graph. - - :param loss_operation: The name of the operation in the TensorFlow graph to - compute training loss (Optional). - - :param metric_operation: The name of the operation in the TensorFlow graph - to compute performance metric during training (Optional). - :param batch_size: Number of samples to use for mini-batch training. - :param epoch: Number of training iterations. - - :param learning_rate_operation: The name of the operation in the TensorFlow - graph which sets optimizer learning rate (Optional). - - :param learning_rate: Determines the size of the step taken in the - direction of the gradient in each step of the learning process. This - determines how fast or slow the learner converges on the optimal - solution. If the step size is too big, you might overshoot the optimal - solution. If the step size is too small, training takes longer to - converge to the best solution. - - :param save_location_operation: Name of the input in TensorFlow graph that - specifiy the location for saving/restoring models from disk. - - :param save_operation: Name of the input in TensorFlow graph that specifiy - the location for saving/restoring models from disk. - - :param re_train: Retrain TensorFlow model. - :param add_batch_dimension_inputs: Add a batch dimension to the input e.g. input = [224, 224, 3] => [-1, 224, 224, 3]. @@ -105,18 +72,7 @@ def __init__( model_location, input_columns=None, output_columns=None, - label_column=None, - tensor_flow_label=None, - optimization_operation=None, - loss_operation=None, - metric_operation=None, batch_size=64, - epoch=5, - learning_rate_operation=None, - learning_rate=0.01, - save_location_operation='save/Const', - save_operation='save/control_dependency', - re_train=False, add_batch_dimension_inputs=False, **params): BasePipelineItem.__init__( @@ -125,18 +81,7 @@ def __init__( self.model_location = model_location self.input_columns = input_columns self.output_columns = output_columns - self.label_column = label_column - self.tensor_flow_label = tensor_flow_label - self.optimization_operation = optimization_operation - self.loss_operation = loss_operation - self.metric_operation = metric_operation self.batch_size = batch_size - self.epoch = epoch - self.learning_rate_operation = learning_rate_operation - self.learning_rate = learning_rate - self.save_location_operation = save_location_operation - self.save_operation = save_operation - self.re_train = re_train self.add_batch_dimension_inputs = add_batch_dimension_inputs @property @@ -149,18 +94,7 @@ def _get_node(self, **all_args): model_location=self.model_location, input_columns=self.input_columns, output_columns=self.output_columns, - label_column=self.label_column, - tensor_flow_label=self.tensor_flow_label, - optimization_operation=self.optimization_operation, - loss_operation=self.loss_operation, - metric_operation=self.metric_operation, batch_size=self.batch_size, - epoch=self.epoch, - learning_rate_operation=self.learning_rate_operation, - learning_rate=self.learning_rate, - save_location_operation=self.save_location_operation, - save_operation=self.save_operation, - re_train=self.re_train, add_batch_dimension_inputs=self.add_batch_dimension_inputs) all_args.update(algo_args) diff --git a/src/python/nimbusml/internal/entrypoints/_ensemblemulticlasssubmodelselector_bestdiverseselectormulticlass.py 
b/src/python/nimbusml/internal/entrypoints/_ensemblemulticlasssubmodelselector_bestdiverseselectormulticlass.py new file mode 100644 index 00000000..213322e5 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensemblemulticlasssubmodelselector_bestdiverseselectormulticlass.py @@ -0,0 +1,53 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BestDiverseSelectorMultiClass +""" + +import numbers + +from ..utils.entrypoints import Component +from ..utils.utils import try_set +from ._ensemblemulticlassdiversitymeasure_multidisagreementdiversitymeasure import \ + multi_disagreement_diversity_measure + + +def best_diverse_selector_multi_class( + diversity_metric_type=None, + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + """ + **Description** + None + + :param diversity_metric_type: The metric type to be used to find + the diversity among base learners (settings). + :param learners_selection_proportion: The proportion of best base + learners to be selected. The range is 0.0-1.0 (settings). + :param validation_dataset_proportion: The proportion of instances + to be selected to test the individual base learner. If it is + 0, it uses training set (settings). + """ + + entrypoint_name = 'BestDiverseSelectorMultiClass' + settings = {} + + if diversity_metric_type is not None: + settings['DiversityMetricType'] = try_set( + obj=diversity_metric_type, none_acceptable=True, is_of_type=dict) + if learners_selection_proportion is not None: + settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleMulticlassSubModelSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/_ensembleregressionsubmodelselector_bestdiverseselectorregression.py b/src/python/nimbusml/internal/entrypoints/_ensembleregressionsubmodelselector_bestdiverseselectorregression.py new file mode 100644 index 00000000..fccf36b2 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensembleregressionsubmodelselector_bestdiverseselectorregression.py @@ -0,0 +1,53 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BestDiverseSelectorRegression +""" + +import numbers + +from ..utils.entrypoints import Component +from ..utils.utils import try_set +from ._ensembleregressiondiversitymeasure_regressiondisagreementdiversitymeasure import \ + regression_disagreement_diversity_measure + + +def best_diverse_selector_regression( + diversity_metric_type=None, + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + """ + **Description** + None + + :param diversity_metric_type: The metric type to be used to find + the diversity among base learners (settings). + :param learners_selection_proportion: The proportion of best base + learners to be selected. The range is 0.0-1.0 (settings). + :param validation_dataset_proportion: The proportion of instances + to be selected to test the individual base learner. If it is + 0, it uses training set (settings). 
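These entrypoint-level helpers mirror the core classes but return the Component directly; a minimal sketch using the helper added here:

    from nimbusml.internal.entrypoints._ensembleregressionsubmodelselector_bestdiverseselectorregression import \
        best_diverse_selector_regression

    comp = best_diverse_selector_regression(validation_dataset_proportion=0.3)
    # Expected: comp serializes with name='BestDiverseSelectorRegression',
    # kind='EnsembleRegressionSubModelSelector', and
    # settings={'ValidationDatasetProportion': 0.3}.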
+ """ + + entrypoint_name = 'BestDiverseSelectorRegression' + settings = {} + + if diversity_metric_type is not None: + settings['DiversityMetricType'] = try_set( + obj=diversity_metric_type, none_acceptable=True, is_of_type=dict) + if learners_selection_proportion is not None: + settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleRegressionSubModelSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_allinstanceselector.py b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_allinstanceselector.py new file mode 100644 index 00000000..4f8f83b8 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_allinstanceselector.py @@ -0,0 +1,32 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +AllInstanceSelector +""" + + +from ..utils.entrypoints import Component +from ..utils.utils import try_set + + +def all_instance_selector( + feature_selector=None, + **params): + """ + **Description** + None + + :param feature_selector: The Feature selector (settings). + """ + + entrypoint_name = 'AllInstanceSelector' + settings = {} + + if feature_selector is not None: + settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleSubsetSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_bootstrapselector.py b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_bootstrapselector.py new file mode 100644 index 00000000..c6d7868b --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_bootstrapselector.py @@ -0,0 +1,32 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BootstrapSelector +""" + + +from ..utils.entrypoints import Component +from ..utils.utils import try_set + + +def bootstrap_selector( + feature_selector=None, + **params): + """ + **Description** + None + + :param feature_selector: The Feature selector (settings). + """ + + entrypoint_name = 'BootstrapSelector' + settings = {} + + if feature_selector is not None: + settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleSubsetSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_randompartitionselector.py b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_randompartitionselector.py new file mode 100644 index 00000000..6b36937c --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_randompartitionselector.py @@ -0,0 +1,32 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomPartitionSelector +""" + + +from ..utils.entrypoints import Component +from ..utils.utils import try_set + + +def random_partition_selector( + feature_selector=None, + **params): + """ + **Description** + None + + :param feature_selector: The Feature selector (settings). 
+ """ + + entrypoint_name = 'RandomPartitionSelector' + settings = {} + + if feature_selector is not None: + settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleSubsetSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py b/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py index 70bef2a8..3c080eb6 100644 --- a/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py +++ b/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py @@ -10,14 +10,15 @@ def models_onnxconverter( onnx, - model, data_file=None, json=None, name=None, domain=None, inputs_to_drop=None, outputs_to_drop=None, + model=None, onnx_version='Stable', + predictive_model=None, **params): """ **Description** @@ -40,6 +41,8 @@ def models_onnxconverter( "Stable" or "Experimental". If "Experimental" is used, produced model can contain components that is not officially supported in ONNX standard. (inputs). + :param predictive_model: Predictor model that needs to be + converted to ONNX format. (inputs). """ entrypoint_name = 'Models.OnnxConverter' @@ -85,7 +88,7 @@ def models_onnxconverter( if model is not None: inputs['Model'] = try_set( obj=model, - none_acceptable=False, + none_acceptable=True, is_of_type=str) if onnx_version is not None: inputs['OnnxVersion'] = try_set( @@ -95,6 +98,9 @@ def models_onnxconverter( values=[ 'Stable', 'Experimental']) + if predictive_model is not None: + inputs['PredictiveModel'] = try_set( + obj=predictive_model, none_acceptable=True, is_of_type=str) input_variables = { x for x in unlist(inputs.values()) diff --git a/src/python/nimbusml/internal/entrypoints/trainers_ensembleclassification.py b/src/python/nimbusml/internal/entrypoints/trainers_ensembleclassification.py new file mode 100644 index 00000000..aa87e677 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/trainers_ensembleclassification.py @@ -0,0 +1,150 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Trainers.EnsembleClassification +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist +from ._ensemblemulticlasssubmodelselector_allselectormulticlass import \ + all_selector_multi_class +from ._ensemblesubsetselector_bootstrapselector import bootstrap_selector + + +def trainers_ensembleclassification( + training_data, + predictor_model=None, + sampling_type=bootstrap_selector( + feature_selector={ + 'Name': 'AllFeatureSelector'}), + feature_column_name='Features', + num_models=None, + label_column_name='Label', + sub_model_selector_type=None, + output_combiner=None, + normalize_features='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + **params): + """ + **Description** + Train multiclass ensemble. + + :param training_data: The data to be used for training (inputs). + :param sampling_type: Sampling Type (inputs). + :param feature_column_name: Column to use for features (inputs). + :param num_models: Number of models per batch. If not specified, + will default to 50 if there is only one base predictor, or + the number of base predictors otherwise. (inputs). + :param label_column_name: Column to use for labels (inputs). + :param sub_model_selector_type: Algorithm to prune the base + learners for selective Ensemble (inputs). + :param output_combiner: Output combiner (inputs). 
+ :param normalize_features: Normalize option for the feature + column (inputs). + :param caching: Whether trainer should cache input training data + (inputs). + :param train_parallel: All the base learners will run + asynchronously if the value is true (inputs). + :param batch_size: Batch size (inputs). + :param show_metrics: True, if metrics for each model need to be + evaluated and shown in comparison table. This is done by + using validation set if available or the training set + (inputs). + :param predictor_model: The trained model (outputs). + """ + + entrypoint_name = 'Trainers.EnsembleClassification' + inputs = {} + outputs = {} + + if training_data is not None: + inputs['TrainingData'] = try_set( + obj=training_data, + none_acceptable=False, + is_of_type=str) + if sampling_type is not None: + inputs['SamplingType'] = try_set( + obj=sampling_type, + none_acceptable=True, + is_of_type=dict) + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if num_models is not None: + inputs['NumModels'] = try_set( + obj=num_models, + none_acceptable=True, + is_of_type=numbers.Real) + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if sub_model_selector_type is not None: + inputs['SubModelSelectorType'] = try_set( + obj=sub_model_selector_type, + none_acceptable=True, + is_of_type=dict) + if output_combiner is not None: + inputs['OutputCombiner'] = try_set( + obj=output_combiner, + none_acceptable=True, + is_of_type=dict) + if normalize_features is not None: + inputs['NormalizeFeatures'] = try_set( + obj=normalize_features, + none_acceptable=True, + is_of_type=str, + values=[ + 'No', + 'Warn', + 'Auto', + 'Yes']) + if caching is not None: + inputs['Caching'] = try_set( + obj=caching, + none_acceptable=True, + is_of_type=str, + values=[ + 'Auto', + 'Memory', + 'None']) + if train_parallel is not None: + inputs['TrainParallel'] = try_set( + obj=train_parallel, + none_acceptable=True, + is_of_type=bool) + if batch_size is not None: + inputs['BatchSize'] = try_set( + obj=batch_size, + none_acceptable=True, + is_of_type=numbers.Real) + if show_metrics is not None: + inputs['ShowMetrics'] = try_set( + obj=show_metrics, + none_acceptable=True, + is_of_type=bool) + if predictor_model is not None: + outputs['PredictorModel'] = try_set( + obj=predictor_model, none_acceptable=False, is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/trainers_ensembleregression.py b/src/python/nimbusml/internal/entrypoints/trainers_ensembleregression.py new file mode 100644 index 00000000..e07b2ce8 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/trainers_ensembleregression.py @@ -0,0 +1,148 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Trainers.EnsembleRegression +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist +from ._ensemblesubsetselector_bootstrapselector import bootstrap_selector + + +def 
trainers_ensembleregression( + training_data, + predictor_model=None, + sampling_type=bootstrap_selector( + feature_selector={ + 'Name': 'AllFeatureSelector'}), + feature_column_name='Features', + num_models=None, + label_column_name='Label', + sub_model_selector_type=None, + output_combiner=None, + normalize_features='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + **params): + """ + **Description** + Train regression ensemble. + + :param training_data: The data to be used for training (inputs). + :param sampling_type: Sampling Type (inputs). + :param feature_column_name: Column to use for features (inputs). + :param num_models: Number of models per batch. If not specified, + will default to 50 if there is only one base predictor, or + the number of base predictors otherwise. (inputs). + :param label_column_name: Column to use for labels (inputs). + :param sub_model_selector_type: Algorithm to prune the base + learners for selective Ensemble (inputs). + :param output_combiner: Output combiner (inputs). + :param normalize_features: Normalize option for the feature + column (inputs). + :param caching: Whether trainer should cache input training data + (inputs). + :param train_parallel: All the base learners will run + asynchronously if the value is true (inputs). + :param batch_size: Batch size (inputs). + :param show_metrics: True, if metrics for each model need to be + evaluated and shown in comparison table. This is done by + using validation set if available or the training set + (inputs). + :param predictor_model: The trained model (outputs). + """ + + entrypoint_name = 'Trainers.EnsembleRegression' + inputs = {} + outputs = {} + + if training_data is not None: + inputs['TrainingData'] = try_set( + obj=training_data, + none_acceptable=False, + is_of_type=str) + if sampling_type is not None: + inputs['SamplingType'] = try_set( + obj=sampling_type, + none_acceptable=True, + is_of_type=dict) + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if num_models is not None: + inputs['NumModels'] = try_set( + obj=num_models, + none_acceptable=True, + is_of_type=numbers.Real) + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if sub_model_selector_type is not None: + inputs['SubModelSelectorType'] = try_set( + obj=sub_model_selector_type, + none_acceptable=True, + is_of_type=dict) + if output_combiner is not None: + inputs['OutputCombiner'] = try_set( + obj=output_combiner, + none_acceptable=True, + is_of_type=dict) + if normalize_features is not None: + inputs['NormalizeFeatures'] = try_set( + obj=normalize_features, + none_acceptable=True, + is_of_type=str, + values=[ + 'No', + 'Warn', + 'Auto', + 'Yes']) + if caching is not None: + inputs['Caching'] = try_set( + obj=caching, + none_acceptable=True, + is_of_type=str, + values=[ + 'Auto', + 'Memory', + 'None']) + if train_parallel is not None: + inputs['TrainParallel'] = try_set( + obj=train_parallel, + none_acceptable=True, + is_of_type=bool) + if batch_size is not None: + inputs['BatchSize'] = try_set( + obj=batch_size, + none_acceptable=True, + is_of_type=numbers.Real) + if show_metrics is not None: + inputs['ShowMetrics'] = try_set( + obj=show_metrics, + none_acceptable=True, + is_of_type=bool) + if predictor_model is not None: + outputs['PredictorModel'] = try_set( + 
obj=predictor_model, none_acceptable=False, is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py index 472fe605..5a54c69f 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py @@ -26,7 +26,7 @@ def trainers_lightgbmbinaryclassifier( unbalanced_sets=False, weight_of_positive_examples=1.0, sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='Logloss', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py index d1ac99df..28f13e0a 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py @@ -26,7 +26,7 @@ def trainers_lightgbmclassifier( unbalanced_sets=False, use_softmax=None, sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='Error', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py index 1da8408d..5a3a44fd 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py @@ -25,7 +25,7 @@ def trainers_lightgbmranker( caching='Auto', custom_gains=[0, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095], sigmoid=0.5, - evaluation_metric='Default', + evaluation_metric='NormalizedDiscountedCumulativeGain', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py index ffd022cd..32260ebe 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py @@ -23,7 +23,7 @@ def trainers_lightgbmregressor( row_group_column_name=None, normalize_features='Auto', caching='Auto', - evaluation_metric='Default', + evaluation_metric='RootMeanSquaredError', maximum_bin_count_per_feature=255, verbose=False, silent=True, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_tensorflowscorer.py b/src/python/nimbusml/internal/entrypoints/transforms_tensorflowscorer.py index 73dc2ebe..1a95687d 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_tensorflowscorer.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_tensorflowscorer.py @@ -16,18 +16,7 @@ def transforms_tensorflowscorer( output_columns, output_data=None, model=None, - label_column=None, - tensor_flow_label=None, - optimization_operation=None, - loss_operation=None, - metric_operation=None, batch_size=64, - epoch=5, - learning_rate_operation=None, - learning_rate=0.01, - 
save_location_operation='save/Const', - save_operation='save/control_dependency', - re_train=False, add_batch_dimension_inputs=False, **params): """ @@ -40,31 +29,8 @@ def transforms_tensorflowscorer( :param input_columns: The names of the model inputs (inputs). :param data: Input dataset (inputs). :param output_columns: The name of the outputs (inputs). - :param label_column: Training labels. (inputs). - :param tensor_flow_label: TensorFlow label node. (inputs). - :param optimization_operation: The name of the optimization - operation in the TensorFlow graph. (inputs). - :param loss_operation: The name of the operation in the - TensorFlow graph to compute training loss (Optional) - (inputs). - :param metric_operation: The name of the operation in the - TensorFlow graph to compute performance metric during - training (Optional) (inputs). :param batch_size: Number of samples to use for mini-batch training. (inputs). - :param epoch: Number of training iterations. (inputs). - :param learning_rate_operation: The name of the operation in the - TensorFlow graph which sets optimizer learning rate - (Optional). (inputs). - :param learning_rate: Learning rate to use during optimization. - (inputs). - :param save_location_operation: Name of the input in TensorFlow - graph that specifiy the location for saving/restoring models - from disk. (inputs). - :param save_operation: Name of the input in TensorFlow graph that - specifiy the location for saving/restoring models from disk. - (inputs). - :param re_train: Retrain TensorFlow model. (inputs). :param add_batch_dimension_inputs: Add a batch dimension to the input e.g. input = [224, 224, 3] => [-1, 224, 224, 3]. (inputs). @@ -96,58 +62,11 @@ def transforms_tensorflowscorer( obj=output_columns, none_acceptable=False, is_of_type=list) - if label_column is not None: - inputs['LabelColumn'] = try_set( - obj=label_column, - none_acceptable=True, - is_of_type=str) - if tensor_flow_label is not None: - inputs['TensorFlowLabel'] = try_set( - obj=tensor_flow_label, - none_acceptable=True, - is_of_type=str) - if optimization_operation is not None: - inputs['OptimizationOperation'] = try_set( - obj=optimization_operation, none_acceptable=True, is_of_type=str) - if loss_operation is not None: - inputs['LossOperation'] = try_set( - obj=loss_operation, - none_acceptable=True, - is_of_type=str) - if metric_operation is not None: - inputs['MetricOperation'] = try_set( - obj=metric_operation, none_acceptable=True, is_of_type=str) if batch_size is not None: inputs['BatchSize'] = try_set( obj=batch_size, none_acceptable=True, is_of_type=numbers.Real) - if epoch is not None: - inputs['Epoch'] = try_set( - obj=epoch, - none_acceptable=True, - is_of_type=numbers.Real) - if learning_rate_operation is not None: - inputs['LearningRateOperation'] = try_set( - obj=learning_rate_operation, none_acceptable=True, is_of_type=str) - if learning_rate is not None: - inputs['LearningRate'] = try_set( - obj=learning_rate, - none_acceptable=True, - is_of_type=numbers.Real) - if save_location_operation is not None: - inputs['SaveLocationOperation'] = try_set( - obj=save_location_operation, none_acceptable=True, is_of_type=str) - if save_operation is not None: - inputs['SaveOperation'] = try_set( - obj=save_operation, - none_acceptable=True, - is_of_type=str) - if re_train is not None: - inputs['ReTrain'] = try_set( - obj=re_train, - none_acceptable=True, - is_of_type=bool) if add_batch_dimension_inputs is not None: inputs['AddBatchDimensionInputs'] = try_set( 
             obj=add_batch_dimension_inputs,
diff --git a/src/python/nimbusml/internal/utils/data_stream.py b/src/python/nimbusml/internal/utils/data_stream.py
index e4d51ba0..7d490bc6 100644
--- a/src/python/nimbusml/internal/utils/data_stream.py
+++ b/src/python/nimbusml/internal/utils/data_stream.py
@@ -3,8 +3,10 @@
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------------------------
 """
-Owns pytlc's containers.
+Owns nimbusml's containers.
 """
+import os
+import tempfile
 from shutil import copyfile
 
 from .data_roles import DataRoles
@@ -467,3 +469,34 @@ def clone(self):
                 "Method clone was not overwritten for class '{0}'".format(
                     type(self)))
         return BinaryDataStream(self._filename)
+
+
+class DprepDataStream(BinaryDataStream):
+    """
+    Defines a data view over a dprep file.
+    """
+
+    def __init__(self, dataflow=None, filename=None):
+        if dataflow is None and filename is None:
+            raise ValueError('Both dataflow object and filename are None')
+        super(DprepDataStream, self).__init__(DataSchema(""))
+        if dataflow is not None:
+            (fd, filename) = tempfile.mkstemp(suffix='.dprep')
+            fl = os.fdopen(fd, "wt")
+            fl.write(dataflow.to_json())
+            fl.close()
+        self._filename = filename
+
+    def __repr__(self):
+        return "DprepDataStream('{2}',\n    '{0}',\n    {1})".format(
+            self._schema, self._roles, self._filename.replace('\\', '\\\\'))
+
+    def clone(self):
+        """
+        Copy/clone the object.
+        """
+        if not isinstance(self, DprepDataStream):
+            raise NotImplementedError(
+                "Method clone was not overwritten for class '{0}'".format(
+                    type(self)))
+        return DprepDataStream(filename=self._filename)
\ No newline at end of file
diff --git a/src/python/nimbusml/internal/utils/dataframes.py b/src/python/nimbusml/internal/utils/dataframes.py
index cca54698..fe46ac20 100644
--- a/src/python/nimbusml/internal/utils/dataframes.py
+++ b/src/python/nimbusml/internal/utils/dataframes.py
@@ -189,9 +189,9 @@ def get_obj(el):
             "of the input columns has name 'F?'.\n" +
             "This happens for example when X and y contain the "
             "same column name.\n" +
-            "pytlc cannot distinguish between the label in X and "
+            "nimbusml cannot distinguish between the label in X and "
             "the label in Y.\n" +
-            "pytlc generates intermediate columns with this kind "
+            "nimbusml generates intermediate columns with this kind "
             "of name. Issue with column '{0}' among "
             "columns\n{1}".format(
                 i,
diff --git a/src/python/nimbusml/internal/utils/entrypoints.py b/src/python/nimbusml/internal/utils/entrypoints.py
index 8d9ef085..6d985eaa 100644
--- a/src/python/nimbusml/internal/utils/entrypoints.py
+++ b/src/python/nimbusml/internal/utils/entrypoints.py
@@ -18,12 +18,13 @@
 from scipy.sparse import csr_matrix
 
 from nimbusml.utils import signature
+from .data_stream import DprepDataStream
 from .data_stream import BinaryDataStream
 from .data_stream import FileDataStream
 from .dataframes import resolve_dataframe, resolve_csr_matrix, pd_concat, \
     resolve_output
 from .utils import try_set, set_clr_environment_vars, get_clr_path, \
-    get_nimbusml_libs
+    get_mlnet_path, get_dprep_path
 from ..libs.pybridge import px_call
 
 
@@ -399,12 +400,12 @@ def remove_multi_level_index(c):
             concatenated = True
         elif isinstance(X, FileDataStream):
             self.inputs['file'] = X.filename
-        elif isinstance(X, BinaryDataStream):
+        elif isinstance(X, (BinaryDataStream, DprepDataStream)):
             if 'input_data' in self.inputs:
                 self.inputs['input_data'] = X._filename
             elif 'data' in self.inputs:
                 self.inputs['data'] = X._filename
-        elif not summary:
+        elif not (summary or params.get('no_input_data')):
             raise RuntimeError(
                 "data should be a dataframe, FileDataStream or DataView")
 
@@ -439,17 +440,16 @@ def remove_multi_level_index(c):
         with open(input_graphfilename, 'w') as f:
             f.write(self.nimbusml_runnable_graph)
 
-        nimbusml_path = os.path.join(os.path.dirname(__file__), "..", "libs")
-        nimbusml_path = os.path.abspath(nimbusml_path)
         call_parameters['verbose'] = try_set(verbose, False, six.integer_types)
         call_parameters['graph'] = try_set(
             'graph = {%s} %s' % (str(self), code), False, str)
-        # Set paths to ML.NET libs (in nimbusml) and to .NET Core CLR libs
-        call_parameters['nimbusmlPath'] = try_set(get_nimbusml_libs(), True, str)
+        # Set paths to .NET Core CLR, ML.NET and DataPrep libs
        set_clr_environment_vars()
-        call_parameters['dotnetClrPath'] = try_set(get_clr_path(), True, str)
+        call_parameters['dotnetClrPath'] = try_set(get_clr_path(), False, str)
+        call_parameters['mlnetPath'] = try_set(get_mlnet_path(), False, str)
+        call_parameters['dprepPath'] = try_set(get_dprep_path(), False, str)
         if random_state:
             call_parameters['seed'] = try_set(random_state, False, six.integer_types)
diff --git a/src/python/nimbusml/internal/utils/utils.py b/src/python/nimbusml/internal/utils/utils.py
index a63452b6..b8aab001 100644
--- a/src/python/nimbusml/internal/utils/utils.py
+++ b/src/python/nimbusml/internal/utils/utils.py
@@ -301,12 +301,12 @@ def set_clr_environment_vars():
 
 def get_clr_path():
     """
-    Return path to .NET CLR libs.
+    Return path to .NET CLR binaries.
     Use dotnetcore2 package if Python 3.x,
     otherwise look for libs bundled with NimbusML.
     """
     if six.PY2:
-        return get_nimbusml_libs()
+        return get_mlnet_path()
     else:
         from dotnetcore2 import runtime as clr_runtime
         libs_root = os.path.join(clr_runtime._get_bin_folder(), 'shared',
@@ -329,9 +329,20 @@ def get_clr_path():
                 "found in {}.".format(libs_root))
     return clr_path
 
-def get_nimbusml_libs():
+def get_dprep_path():
     """
-    Return path to NimbusML libs (the ML.NET binaries).
+    Return path to the DataPrep binaries if the package is installed, otherwise an empty string.
+    """
+    try:
+        from azureml.dataprep.api.engineapi.engine import _get_engine_path
+        return os.path.dirname(_get_engine_path())
+    except ImportError:
+        pass
+    return ''
+
+def get_mlnet_path():
+    """
+    Return path to ML.NET binaries.
     """
     return os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'libs'))
diff --git a/src/python/nimbusml/linear_model/__init__.py b/src/python/nimbusml/linear_model/__init__.py
index 21ffb89c..35332f88 100644
--- a/src/python/nimbusml/linear_model/__init__.py
+++ b/src/python/nimbusml/linear_model/__init__.py
@@ -1,8 +1,11 @@
-from ._averagedperceptronbinaryclassifier import AveragedPerceptronBinaryClassifier
+from ._averagedperceptronbinaryclassifier import \
+    AveragedPerceptronBinaryClassifier
 from ._fastlinearbinaryclassifier import FastLinearBinaryClassifier
 from ._fastlinearclassifier import FastLinearClassifier
 from ._fastlinearregressor import FastLinearRegressor
-from ._logisticregressionbinaryclassifier import LogisticRegressionBinaryClassifier
+from ._linearsvmbinaryclassifier import LinearSvmBinaryClassifier
+from ._logisticregressionbinaryclassifier import \
+    LogisticRegressionBinaryClassifier
 from ._logisticregressionclassifier import LogisticRegressionClassifier
 from ._onlinegradientdescentregressor import OnlineGradientDescentRegressor
 from ._ordinaryleastsquaresregressor import OrdinaryLeastSquaresRegressor
@@ -15,12 +18,13 @@
     'FastLinearBinaryClassifier',
     'FastLinearClassifier',
     'FastLinearRegressor',
+    'LinearSvmBinaryClassifier',
     'LogisticRegressionBinaryClassifier',
     'LogisticRegressionClassifier',
     'OnlineGradientDescentRegressor',
     'OrdinaryLeastSquaresRegressor',
     'PoissonRegressionRegressor',
     'SgdBinaryClassifier',
-    'SymSgdBinaryClassifier',
+    'SymSgdBinaryClassifier'
 ]
diff --git a/src/python/nimbusml/linear_model/_linearsvmbinaryclassifier.py b/src/python/nimbusml/linear_model/_linearsvmbinaryclassifier.py
new file mode 100644
index 00000000..556e621a
--- /dev/null
+++ b/src/python/nimbusml/linear_model/_linearsvmbinaryclassifier.py
@@ -0,0 +1,183 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+LinearSvmBinaryClassifier
+"""
+
+__all__ = ["LinearSvmBinaryClassifier"]
+
+
+from sklearn.base import ClassifierMixin
+
+from ..base_predictor import BasePredictor
+from ..internal.core.linear_model._linearsvmbinaryclassifier import \
+    LinearSvmBinaryClassifier as core
+from ..internal.utils.utils import trace
+
+
+class LinearSvmBinaryClassifier(
+        core,
+        BasePredictor,
+        ClassifierMixin):
+    """
+
+    Linear Support Vector Machine (SVM) Binary Classifier
+
+    .. remarks::
+        Linear SVM implements an algorithm that finds a hyperplane in the
+        feature space for binary classification, by solving an SVM problem.
+        For instance, with feature values *f_0, f_1,..., f_{D-1}*, the
+        prediction is given by determining what side of the hyperplane the
+        point falls into. That is the same as the sign of the features'
+        weighted sum, i.e. *\sum_{i = 0}^{D-1} \left(w_i * f_i \right) + b*,
+        where *w_0, w_1,..., w_{D-1}* are the weights computed by the
+        algorithm, and *b* is the bias computed by the algorithm.
+
+        The algorithm implemented is the PEGASOS method, which alternates
+        between stochastic gradient descent steps and projection steps,
+        introduced by Shalev-Shwartz, Singer and Srebro.
+
+
+        **Reference**
+
+        `Wikipedia entry for Support Vector Machine
+        `_
+
+        `Pegasos: Primal Estimated sub-GrAdient SOlver for SVM
+        `_
+
+
+    :param feature: see `Columns `_.
+ + :param label: see `Columns `_. + + :param weight: see `Columns `_. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param caching: Whether trainer should cache input training data. + + :param lambda_: Regularizer constant. + + :param perform_projection: Perform projection to unit-ball? Typically used + with batch size > 1. + + :param number_of_iterations: Number of iterations. + + :param initial_weights_diameter: Sets the initial weights diameter that + specifies the range from which values are drawn for the initial + weights. These weights are initialized randomly from within this range. + For example, if the diameter is specified to be ``d``, then the weights + are uniformly distributed between ``-d/2`` and ``d/2``. The default + value is ``0``, which specifies that all the weights are set to zero. + + :param no_bias: No bias. + + :param initial_weights: Initial Weights and bias, comma-separated. + + :param shuffle: Whether to shuffle for each training iteration. + + :param batch_size: Batch size. + + :param params: Additional arguments sent to compute engine. + + .. index:: models, classification, svm + + Example: + .. 
literalinclude:: /../nimbusml/examples/LinearSvmBinaryClassifier.py + :language: python + """ + + @trace + def __init__( + self, + normalize='Auto', + caching='Auto', + lambda_=0.001, + perform_projection=False, + number_of_iterations=1, + initial_weights_diameter=0.0, + no_bias=False, + initial_weights=None, + shuffle=True, + batch_size=1, + feature=None, + label=None, + weight=None, + **params): + + if 'feature_column_name' in params: + raise NameError( + "'feature_column_name' must be renamed to 'feature'") + if feature: + params['feature_column_name'] = feature + if 'label_column_name' in params: + raise NameError( + "'label_column_name' must be renamed to 'label'") + if label: + params['label_column_name'] = label + if 'example_weight_column_name' in params: + raise NameError( + "'example_weight_column_name' must be renamed to 'weight'") + if weight: + params['example_weight_column_name'] = weight + BasePredictor.__init__(self, type='classifier', **params) + core.__init__( + self, + normalize=normalize, + caching=caching, + lambda_=lambda_, + perform_projection=perform_projection, + number_of_iterations=number_of_iterations, + initial_weights_diameter=initial_weights_diameter, + no_bias=no_bias, + initial_weights=initial_weights, + shuffle=shuffle, + batch_size=batch_size, + **params) + self.feature = feature + self.label = label + self.weight = weight + + @trace + def predict_proba(self, X, **params): + ''' + Returns probabilities + ''' + return self._predict_proba(X, **params) + + @trace + def decision_function(self, X, **params): + ''' + Returns score values + ''' + return self._decision_function(X, **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/_tensorflowscorer.py b/src/python/nimbusml/preprocessing/_tensorflowscorer.py index 9dceab2a..528174d2 100644 --- a/src/python/nimbusml/preprocessing/_tensorflowscorer.py +++ b/src/python/nimbusml/preprocessing/_tensorflowscorer.py @@ -56,41 +56,8 @@ class TensorFlowScorer(core, BaseTransform, TransformerMixin): :param output_columns: The name of the outputs. - :param label_column: Training labels. - - :param tensor_flow_label: TensorFlow label node. - - :param optimization_operation: The name of the optimization operation in - the TensorFlow graph. - - :param loss_operation: The name of the operation in the TensorFlow graph to - compute training loss (Optional). - - :param metric_operation: The name of the operation in the TensorFlow graph - to compute performance metric during training (Optional). - :param batch_size: Number of samples to use for mini-batch training. - :param epoch: Number of training iterations. - - :param learning_rate_operation: The name of the operation in the TensorFlow - graph which sets optimizer learning rate (Optional). - - :param learning_rate: Determines the size of the step taken in the - direction of the gradient in each step of the learning process. This - determines how fast or slow the learner converges on the optimal - solution. If the step size is too big, you might overshoot the optimal - solution. If the step size is too small, training takes longer to - converge to the best solution. - - :param save_location_operation: Name of the input in TensorFlow graph that - specifiy the location for saving/restoring models from disk. - - :param save_operation: Name of the input in TensorFlow graph that specifiy - the location for saving/restoring models from disk. 
- - :param re_train: Retrain TensorFlow model. - :param add_batch_dimension_inputs: Add a batch dimension to the input e.g. input = [224, 224, 3] => [-1, 224, 224, 3]. @@ -109,18 +76,7 @@ def __init__( model_location, input_columns=None, output_columns=None, - label_column=None, - tensor_flow_label=None, - optimization_operation=None, - loss_operation=None, - metric_operation=None, batch_size=64, - epoch=5, - learning_rate_operation=None, - learning_rate=0.01, - save_location_operation='save/Const', - save_operation='save/control_dependency', - re_train=False, add_batch_dimension_inputs=False, columns=None, **params): @@ -144,18 +100,7 @@ def __init__( model_location=model_location, input_columns=input_columns, output_columns=output_columns, - label_column=label_column, - tensor_flow_label=tensor_flow_label, - optimization_operation=optimization_operation, - loss_operation=loss_operation, - metric_operation=metric_operation, batch_size=batch_size, - epoch=epoch, - learning_rate_operation=learning_rate_operation, - learning_rate=learning_rate, - save_location_operation=save_location_operation, - save_operation=save_operation, - re_train=re_train, add_batch_dimension_inputs=add_batch_dimension_inputs, **params) self._columns = columns diff --git a/src/python/nimbusml/preprocessing/missing_values/_filter.py b/src/python/nimbusml/preprocessing/missing_values/_filter.py index 4b8e294d..eb239e34 100644 --- a/src/python/nimbusml/preprocessing/missing_values/_filter.py +++ b/src/python/nimbusml/preprocessing/missing_values/_filter.py @@ -78,3 +78,13 @@ def get_params(self, deep=False): Get the parameters for this operator. """ return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/preprocessing/missing_values/_handler.py b/src/python/nimbusml/preprocessing/missing_values/_handler.py index de776ca7..095ddb36 100644 --- a/src/python/nimbusml/preprocessing/missing_values/_handler.py +++ b/src/python/nimbusml/preprocessing/missing_values/_handler.py @@ -106,3 +106,13 @@ def get_params(self, deep=False): Get the parameters for this operator. """ return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/preprocessing/missing_values/_indicator.py b/src/python/nimbusml/preprocessing/missing_values/_indicator.py index 5299523c..41709148 100644 --- a/src/python/nimbusml/preprocessing/missing_values/_indicator.py +++ b/src/python/nimbusml/preprocessing/missing_values/_indicator.py @@ -79,3 +79,13 @@ def get_params(self, deep=False): Get the parameters for this operator. """ return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. 
+ """ + from ..schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/tests/dprep/__init__.py b/src/python/nimbusml/tests/dprep/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/python/nimbusml/tests/dprep/test_dprep.py b/src/python/nimbusml/tests/dprep/test_dprep.py new file mode 100644 index 00000000..c8ebbbdb --- /dev/null +++ b/src/python/nimbusml/tests/dprep/test_dprep.py @@ -0,0 +1,55 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import os +import sys +from nimbusml import Pipeline, FileDataStream, BinaryDataStream, DprepDataStream +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing.normalization import MinMaxScaler +from sklearn.utils.testing import assert_true, assert_array_equal + +def is_nan(x): + return (x is np.nan or x != x) + +def assert_2d_array_equal(actual, desired): + if len(actual) != len(desired): + assert_true(False, "arrays are of different lengths.") + + for i in range(len(actual)): + if len(actual[i]) != len(desired[i]): + assert_true(False, "arrays are of different lengths.") + for y in range(len(actual[i])): + if is_nan(actual[i][y]) and is_nan(desired[i][y]): + continue + assert_true(actual[i][y] == desired[i][y]) + +@unittest.skipIf(os.name == "posix" or sys.version_info[:2] != (3, 7), "azureml-dataprep is not installed.") +class TestDprep(unittest.TestCase): + + def test_fit_transform(self): + import azureml.dataprep as dprep + + path = get_dataset('infert').as_filepath() + dflow = dprep.auto_read_file(path=path) + dprep_data = DprepDataStream(dflow) + file_data = FileDataStream.read_csv(path) + + xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'}) + pipe = Pipeline([xf]) + transformed_data = pipe.fit_transform(file_data) + transformed_data1 = pipe.fit_transform(dprep_data) + + assert_array_equal( + transformed_data.columns, + transformed_data1.columns) + assert_2d_array_equal( + transformed_data.values, + transformed_data1.values) + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/ensemble/test_ensembleclassifier.py b/src/python/nimbusml/tests/ensemble/test_ensembleclassifier.py new file mode 100644 index 00000000..53bbab22 --- /dev/null +++ b/src/python/nimbusml/tests/ensemble/test_ensembleclassifier.py @@ -0,0 +1,62 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import numpy as np
+import pandas as pd
+import six
+from nimbusml.datasets import get_dataset
+from nimbusml.ensemble import EnsembleClassifier
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import ClassifierVoting
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import ClassifierBestDiverseSelector
+from sklearn.model_selection import train_test_split
+from sklearn.utils.testing import assert_greater
+
+
+class TestEnsembleClassifier(unittest.TestCase):
+
+    def test_ensembleclassifier(self):
+        np.random.seed(0)
+        df = get_dataset("iris").as_df()
+        df.drop(['Species'], inplace=True, axis=1)
+
+        X_train, X_test, y_train, y_test = \
+            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
+
+        ensemble = EnsembleClassifier(num_models=3).fit(X_train, y_train, verbose=0)
+        scores = ensemble.predict(X_test)
+        scores = pd.to_numeric(scores)
+        accuracy = np.mean(y_test == [i for i in scores])
+        assert_greater(
+            accuracy,
+            0.947,
+            "accuracy should be greater than %s" %
+            0.947)
+
+        ensemble_with_options = EnsembleClassifier(
+            num_models=3,
+            sampling_type=RandomPartitionSelector(
+                feature_selector=RandomFeatureSelector(
+                    features_selection_proportion=0.7)),
+            sub_model_selector_type=ClassifierBestDiverseSelector(),
+            output_combiner=ClassifierVoting()).fit(X_train, y_train)
+
+        scores = ensemble_with_options.predict(X_test)
+        scores = pd.to_numeric(scores)
+        accuracy = np.mean(y_test == [i for i in scores])
+        assert_greater(
+            accuracy,
+            0.578,
+            "accuracy should be greater than %s" %
+            0.578)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py b/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py
new file mode 100644
index 00000000..5c61d9b2
--- /dev/null
+++ b/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
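
# Illustrative sketch (editorial addition, not part of the patch): the
# pluggable pieces the ensemble test above exercises, gathered in one place.
# Each component of an EnsembleClassifier can be swapped independently: how
# rows and features are sampled per base learner, how trained models are
# pruned, and how their outputs are combined at prediction time.
from nimbusml.ensemble import EnsembleClassifier
from nimbusml.ensemble.feature_selector import RandomFeatureSelector
from nimbusml.ensemble.output_combiner import ClassifierVoting
from nimbusml.ensemble.subset_selector import RandomPartitionSelector
from nimbusml.ensemble.sub_model_selector import ClassifierBestDiverseSelector

ensemble = EnsembleClassifier(
    num_models=3,                                 # number of base learners
    sampling_type=RandomPartitionSelector(        # row sampling per model
        feature_selector=RandomFeatureSelector(   # feature sampling per model
            features_selection_proportion=0.7)),  # (name as used in the test)
    sub_model_selector_type=ClassifierBestDiverseSelector(),  # prune learners
    output_combiner=ClassifierVoting())           # vote to produce the label
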
+# --------------------------------------------------------------------------------------------
+import platform
+import unittest
+
+import numpy as np
+from nimbusml.datasets import get_dataset
+from nimbusml.ensemble import EnsembleRegressor
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import RegressorMedian
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import RegressorBestDiverseSelector
+from sklearn.metrics import r2_score
+from sklearn.model_selection import train_test_split
+from sklearn.utils.testing import assert_greater, assert_less
+
+
+class TestEnsembleRegressor(unittest.TestCase):
+
+    def test_ensembleregressor(self):
+        np.random.seed(0)
+
+        df = get_dataset("airquality").as_df().fillna(0)
+        df = df[df.Ozone.notnull()]
+
+        X_train, X_test, y_train, y_test = train_test_split(
+            df.loc[:, df.columns != 'Ozone'], df['Ozone'])
+
+        # Train a model and score
+        ensemble = EnsembleRegressor(num_models=3).fit(X_train, y_train)
+        scores = ensemble.predict(X_test)
+
+        r2 = r2_score(y_test, scores)
+        assert_greater(r2, 0.12, "R-Squared should be greater than %s" % 0.12)
+        assert_less(r2, 0.13, "R-Squared should be less than %s" % 0.13)
+
+        ensemble_with_options = EnsembleRegressor(
+            num_models=3,
+            sampling_type=RandomPartitionSelector(
+                feature_selector=RandomFeatureSelector(
+                    features_selection_proportion=0.7)),
+            sub_model_selector_type=RegressorBestDiverseSelector(),
+            output_combiner=RegressorMedian()).fit(X_train, y_train)
+        scores = ensemble_with_options.predict(X_test)
+
+        r2 = r2_score(y_test, scores)
+        assert_greater(r2, 0.0279, "R-Squared should be greater than %s" % 0.0279)
+        assert_less(r2, 0.03, "R-Squared should be less than %s" % 0.03)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/src/python/nimbusml/tests/ensemble/test_lightgbmregressor.py b/src/python/nimbusml/tests/ensemble/test_lightgbmregressor.py
index e252ed7c..691307d3 100644
--- a/src/python/nimbusml/tests/ensemble/test_lightgbmregressor.py
+++ b/src/python/nimbusml/tests/ensemble/test_lightgbmregressor.py
@@ -13,7 +13,7 @@
 from sklearn.utils.testing import assert_greater, assert_less
 
 
-class TestFastLinearRegressor(unittest.TestCase):
+class TestLightGbmRegressor(unittest.TestCase):
 
     def test_lightgbmregressor(self):
         np.random.seed(0)
diff --git a/src/python/nimbusml/tests/linear_model/test_linearsvmbinaryclassifier.py b/src/python/nimbusml/tests/linear_model/test_linearsvmbinaryclassifier.py
new file mode 100644
index 00000000..c144c377
--- /dev/null
+++ b/src/python/nimbusml/tests/linear_model/test_linearsvmbinaryclassifier.py
@@ -0,0 +1,64 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
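
# Illustrative sketch (editorial addition, not part of the patch): the
# generated trainers_* entrypoints added earlier in this patch are thin
# graph-node builders. Strings beginning with '$' are graph variables that
# the runner wires between nodes; everything else is a literal setting.
# The '$' variable names below are illustrative only.
from nimbusml.internal.entrypoints.trainers_ensembleregression import \
    trainers_ensembleregression

node = trainers_ensembleregression(
    training_data='$training_data',       # graph input: the data view
    predictor_model='$predictor_model',   # graph output: the trained model
    num_models=3)                         # literal setting, not a variable
# The builder collects the '$' strings into the node's input_variables
# ({'$training_data'}) and output_variables ({'$predictor_model'}).
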
+# --------------------------------------------------------------------------------------------
+
+import unittest
+
+try:
+    # pandas 0.20.0+
+    from pandas.api.types import is_string_dtype
+except ImportError:
+    def is_string_dtype(dt):
+        return 'object' in str(dt) or "dtype('O')" in str(dt)
+
+import numpy as np
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.linear_model import LinearSvmBinaryClassifier
+from nimbusml.datasets import get_dataset
+from nimbusml import Pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.utils.testing import assert_almost_equal, assert_greater
+
+
+class TestLinearSvmBinaryClassifier(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        np.random.seed(0)
+        df = get_dataset("infert").as_df()
+        # remove : and ' ' from column names, and encode categorical column
+        df.columns = [i.replace(': ', '') for i in df.columns]
+        assert is_string_dtype(df['education_str'].dtype)
+        df = (OneHotVectorizer() << ['education_str']).fit_transform(df)
+        assert 'education_str' not in df.columns
+        cls.X_train, cls.X_test, cls.y_train, cls.y_test = \
+            train_test_split(df.loc[:, df.columns != 'case'],
+                             df['case'],
+                             random_state=0)
+        cls.svm = LinearSvmBinaryClassifier(shuffle=False).fit(cls.X_train,
+                                                               cls.y_train)
+        cls.predictions = cls.svm.predict(cls.X_test)
+        cls.accuracy = np.mean(cls.y_test == [i for i in cls.predictions])
+
+    def test_linearsvm(self):
+        assert_greater(self.accuracy, 0.96, "accuracy should be greater than %s" % 0.96)
+
+    def test_linearsvm_predict_proba(self):
+        probabilities = self.svm.predict_proba(self.X_test)
+        # Test that the class probabilities for each instance add up to 1
+        for i in range(probabilities.shape[0]):
+            assert_almost_equal(probabilities[i][0] + probabilities[i][1], 1)
+
+    def test_linearsvm_decision_function(self):
+        fn = self.svm.decision_function(self.X_test)
+        predictions_from_fn = [fn[i] >= 0 for i in range(len(fn))]
+        assert all(predictions_from_fn[i] == self.predictions[i]
+                   for i in range(len(self.predictions)))
+        accuracy_from_fn = np.mean(
+            self.y_test == [i for i in predictions_from_fn])
+        assert accuracy_from_fn == self.accuracy
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py
index 309650b5..fc112fe5 100644
--- a/src/python/nimbusml/tests/pipeline/test_load_save.py
+++ b/src/python/nimbusml/tests/pipeline/test_load_save.py
@@ -3,6 +3,7 @@
 # Licensed under the MIT License.
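
# Illustrative sketch (editorial addition, not part of the patch): the
# invariants the LinearSvm test above relies on. For a binary linear SVM the
# predicted label is the sign of the decision value, and predict_proba
# calibrates that margin into two per-class columns that sum to one. The
# tiny dataset here is made up purely for illustration.
import numpy as np
from nimbusml.linear_model import LinearSvmBinaryClassifier

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array([1, 0, 1, 0])

svm = LinearSvmBinaryClassifier(shuffle=False).fit(X, y)
margins = svm.decision_function(X)    # signed distance from the hyperplane
labels = svm.predict(X)               # agrees with (margins >= 0)
probabilities = svm.predict_proba(X)  # each row sums to 1
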
# -------------------------------------------------------------------------------------------- +import os import pickle import unittest @@ -44,8 +45,14 @@ def test_model_dataframe(self): model_nimbusml.fit(train, label) # Save with pickle - pickle.dump(model_nimbusml, open('nimbusml_model.p', 'wb')) - model_nimbusml_pickle = pickle.load(open("nimbusml_model.p", "rb")) + pickle_filename = 'nimbusml_model.p' + with open(pickle_filename, 'wb') as f: + pickle.dump(model_nimbusml, f) + + with open(pickle_filename, "rb") as f: + model_nimbusml_pickle = pickle.load(f) + + os.remove(pickle_filename) score1 = model_nimbusml.predict(test).head(5) score2 = model_nimbusml_pickle.predict(test).head(5) @@ -72,6 +79,8 @@ def test_model_dataframe(self): model_nimbusml_load.sum().sum(), decimal=2) + os.remove('model.nimbusml.m') + def test_model_datastream(self): model_nimbusml = Pipeline( steps=[ @@ -85,8 +94,14 @@ def test_model_datastream(self): model_nimbusml.fit(train, label) # Save with pickle - pickle.dump(model_nimbusml, open('nimbusml_model.p', 'wb')) - model_nimbusml_pickle = pickle.load(open("nimbusml_model.p", "rb")) + pickle_filename = 'nimbusml_model.p' + with open(pickle_filename, 'wb') as f: + pickle.dump(model_nimbusml, f) + + with open(pickle_filename, "rb") as f: + model_nimbusml_pickle = pickle.load(f) + + os.remove(pickle_filename) score1 = model_nimbusml.predict(test).head(5) score2 = model_nimbusml_pickle.predict(test).head(5) @@ -119,6 +134,197 @@ def test_model_datastream(self): model_nimbusml_load.sum().sum(), decimal=2) + os.remove('model.nimbusml.m') + + def test_pipeline_saves_complete_model_file_when_pickled(self): + model_nimbusml = Pipeline( + steps=[ + ('cat', + OneHotVectorizer() << categorical_columns), + ('linear', + FastLinearBinaryClassifier( + shuffle=False, + number_of_threads=1))]) + + model_nimbusml.fit(train, label) + metrics, score = model_nimbusml.test(test, test_label, output_scores=True) + + pickle_filename = 'nimbusml_model.p' + + # Save with pickle + with open(pickle_filename, 'wb') as f: + pickle.dump(model_nimbusml, f) + + # Remove the pipeline model from disk so + # that the unpickled pipeline is forced + # to get its model from the pickled file. 
+        os.remove(model_nimbusml.model)
+
+        with open(pickle_filename, "rb") as f:
+            model_nimbusml_pickle = pickle.load(f)
+
+        os.remove(pickle_filename)
+
+        metrics_pickle, score_pickle = model_nimbusml_pickle.test(
+            test, test_label, output_scores=True)
+
+        assert_almost_equal(score.sum().sum(),
+                            score_pickle.sum().sum(),
+                            decimal=2)
+
+        assert_almost_equal(metrics.sum().sum(),
+                            metrics_pickle.sum().sum(),
+                            decimal=2)
+
+    def test_unfitted_pickled_pipeline_can_be_fit(self):
+        pipeline = Pipeline(
+            steps=[
+                ('cat',
+                 OneHotVectorizer() << categorical_columns),
+                ('linear',
+                 FastLinearBinaryClassifier(
+                     shuffle=False,
+                     number_of_threads=1))])
+
+        pipeline.fit(train, label)
+        metrics, score = pipeline.test(test, test_label, output_scores=True)
+
+        # Create a new unfitted pipeline
+        pipeline = Pipeline(
+            steps=[
+                ('cat',
+                 OneHotVectorizer() << categorical_columns),
+                ('linear',
+                 FastLinearBinaryClassifier(
+                     shuffle=False,
+                     number_of_threads=1))])
+
+        pickle_filename = 'nimbusml_model.p'
+
+        # Save with pickle
+        with open(pickle_filename, 'wb') as f:
+            pickle.dump(pipeline, f)
+
+        with open(pickle_filename, "rb") as f:
+            pipeline_pickle = pickle.load(f)
+
+        os.remove(pickle_filename)
+
+        pipeline_pickle.fit(train, label)
+        metrics_pickle, score_pickle = pipeline_pickle.test(
+            test, test_label, output_scores=True)
+
+        assert_almost_equal(score.sum().sum(),
+                            score_pickle.sum().sum(),
+                            decimal=2)
+
+        assert_almost_equal(metrics.sum().sum(),
+                            metrics_pickle.sum().sum(),
+                            decimal=2)
+
+    def test_unpickled_pipeline_has_feature_contributions(self):
+        features = ['age', 'education-num', 'hours-per-week']
+
+        model_nimbusml = Pipeline(
+            steps=[FastLinearBinaryClassifier(feature=features)])
+        model_nimbusml.fit(train, label)
+        fc = model_nimbusml.get_feature_contributions(test)
+
+        # Save with pickle
+        pickle_filename = 'nimbusml_model.p'
+        with open(pickle_filename, 'wb') as f:
+            pickle.dump(model_nimbusml, f)
+        # Unpickle model
+        with open(pickle_filename, "rb") as f:
+            model_nimbusml_pickle = pickle.load(f)
+
+        fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)
+
+        assert all('FeatureContributions.' + feature in fc_pickle.columns
+                   for feature in features)
+
+        assert all(fc['FeatureContributions.' + feature].equals(
+            fc_pickle['FeatureContributions.' + feature])
+            for feature in features)
+
+        os.remove(pickle_filename)
+
+    def test_unpickled_predictor_has_feature_contributions(self):
+        features = ['age', 'education-num', 'hours-per-week']
+
+        model_nimbusml = FastLinearBinaryClassifier(feature=features)
+        model_nimbusml.fit(train, label)
+        fc = model_nimbusml.get_feature_contributions(test)
+
+        # Save with pickle
+        pickle_filename = 'nimbusml_model.p'
+        with open(pickle_filename, 'wb') as f:
+            pickle.dump(model_nimbusml, f)
+        # Unpickle model
+        with open(pickle_filename, "rb") as f:
+            model_nimbusml_pickle = pickle.load(f)
+
+        fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)
+
+        assert all('FeatureContributions.' + feature in fc_pickle.columns
+                   for feature in features)
+
+        assert all(fc['FeatureContributions.' + feature].equals(
+            fc_pickle['FeatureContributions.' + feature])
+            for feature in features)
+
+        os.remove(pickle_filename)
+
+    def test_pipeline_loaded_from_zip_has_feature_contributions(self):
+        features = ['age', 'education-num', 'hours-per-week']
+
+        model_nimbusml = Pipeline(
+            steps=[FastLinearBinaryClassifier(feature=features)])
+        model_nimbusml.fit(train, label)
+        fc = model_nimbusml.get_feature_contributions(test)
+
+        # Save the model to zip
+        model_filename = 'nimbusml_model.zip'
+        model_nimbusml.save_model(model_filename)
+        # Load the model from zip
+        model_nimbusml_zip = Pipeline()
+        model_nimbusml_zip.load_model(model_filename)
+
+        fc_zip = model_nimbusml_zip.get_feature_contributions(test)
+
+        assert all('FeatureContributions.' + feature in fc_zip.columns
+                   for feature in features)
+
+        assert all(fc['FeatureContributions.' + feature].equals(
+            fc_zip['FeatureContributions.' + feature])
+            for feature in features)
+
+        os.remove(model_filename)
+
+    def test_predictor_loaded_from_zip_has_feature_contributions(self):
+        features = ['age', 'education-num', 'hours-per-week']
+
+        model_nimbusml = FastLinearBinaryClassifier(feature=features)
+        model_nimbusml.fit(train, label)
+        fc = model_nimbusml.get_feature_contributions(test)
+
+        # Save the model to zip
+        model_filename = 'nimbusml_model.zip'
+        model_nimbusml.save_model(model_filename)
+        # Load the model from zip
+        model_nimbusml_zip = Pipeline()
+        model_nimbusml_zip.load_model(model_filename)
+
+        fc_zip = model_nimbusml_zip.get_feature_contributions(test)
+
+        assert all('FeatureContributions.' + feature in fc_zip.columns
+                   for feature in features)
+
+        assert all(fc['FeatureContributions.' + feature].equals(
+            fc_zip['FeatureContributions.' + feature])
+            for feature in features)
+
+        os.remove(model_filename)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_combining.py b/src/python/nimbusml/tests/pipeline/test_pipeline_combining.py
new file mode 100644
index 00000000..f16e43aa
--- /dev/null
+++ b/src/python/nimbusml/tests/pipeline/test_pipeline_combining.py
@@ -0,0 +1,411 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import os
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.linear_model import LogisticRegressionBinaryClassifier, OnlineGradientDescentRegressor
+from nimbusml.preprocessing.filter import RangeFilter
+
+seed = 0
+
+train_data = {'c0': ['a', 'b', 'a', 'b'],
+              'c1': [1, 2, 3, 4],
+              'c2': [2, 3, 4, 5]}
+train_df = pd.DataFrame(train_data).astype({'c1': np.float64,
+                                            'c2': np.float64})
+
+test_data = {'c0': ['a', 'b', 'b'],
+             'c1': [1.5, 2.3, 3.7],
+             'c2': [2.2, 4.9, 2.7]}
+test_df = pd.DataFrame(test_data).astype({'c1': np.float64,
+                                          'c2': np.float64})
+
+
+class TestPipelineCombining(unittest.TestCase):
+
+    def test_two_pipelines_created_using_dataframes_can_not_be_combined_when_the_schemas_are_different(self):
+        """
+        This test verifies that two models created using DataFrames
+        cannot be combined if the output schema of the first is
+        different than the input schema of the second.
+        NOTE: This issue only happens with Pipelines created and fit
+        using dataframes.
+        Pipelines created and fit using IDV binary
+        streams do not have this issue (see the tests below).
+        """
+        # Create and fit a OneHotVectorizer transform using the
+        # training data and use it to transform the training data.
+        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
+        transform_pipeline.fit(train_df)
+        df = transform_pipeline.transform(train_df)
+
+        # Create and fit an OnlineGradientDescentRegressor using
+        # the transformed training data from the previous step.
+        predictor_pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')], random_state=seed)
+        predictor_pipeline.fit(df)
+
+        # Perform a prediction given the test data using
+        # the transform and predictor defined previously.
+        df = transform_pipeline.transform(test_df)
+        result_1 = predictor_pipeline.predict(df)
+
+        try:
+            # This does not work because the output schema of the first pipeline does not match the input schema of the second.
+            combined_pipeline = Pipeline.combine_models(transform_pipeline,
+                                                        predictor_pipeline)
+        except Exception:
+            pass
+        else:
+            self.fail('combine_models should have raised an exception')
+
+
+    def test_two_pipelines_created_using_dataframes_can_be_combined_when_the_schemas_are_the_same(self):
+        """
+        This test verifies that two models created using DataFrames
+        can be combined if the output schema of the first is the same
+        as the input schema of the second.
+        """
+        df = train_df.drop(['c0'], axis=1)
+
+        # Create and fit a RangeFilter transform using the training
+        # data and use it to transform the training data.
+        transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'], random_state=seed)
+        transform_pipeline.fit(df)
+        df = transform_pipeline.transform(df)
+
+        # Create and fit an OnlineGradientDescentRegressor using
+        # the transformed training data from the previous step.
+        predictor_pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')],
+                                      random_state=seed)
+        predictor_pipeline.fit(df)
+
+        # Perform a prediction given the test data using
+        # the transform and predictor defined previously.
+        df = transform_pipeline.transform(test_df)
+        result_1 = predictor_pipeline.predict(df)
+
+        df = test_df.drop(['c0'], axis=1)
+
+        # Combine the above Pipelines in to one Pipeline and use
+        # the new Pipeline to get predictions given the test data.
+        combined_pipeline = Pipeline.combine_models(transform_pipeline,
+                                                    predictor_pipeline)
+        result_2 = combined_pipeline.predict(df)
+
+        # Verify that the prediction from the combined Pipeline
+        # matches the prediction from the original two Pipelines.
+        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
+        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
+
+
+    def test_two_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(self):
+        """
+        This test verifies that two models can be combined
+        even if the transform increases the number of columns.
+        """
+        # Create and fit a OneHotVectorizer transform using the
+        # training data and use it to transform the training data.
+        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
+        transform_pipeline.fit(train_df)
+        df = transform_pipeline.transform(train_df, as_binary_data_stream=True)
+
+        # Create and fit an OnlineGradientDescentRegressor using
+        # the transformed training data from the previous step.
+        predictor_pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])],
+                                      random_state=seed)
+        predictor_pipeline.fit(df)
+
+        # Perform a prediction given the test data using
+        # the transform and predictor defined previously.
+ df = transform_pipeline.transform(test_df, as_binary_data_stream=True) + result_1 = predictor_pipeline.predict(df) + + # Combine the above Pipelines in to one Pipeline and use + # the new Pipeline to get predictions given the test data. + combined_pipeline = Pipeline.combine_models(transform_pipeline, + predictor_pipeline) + result_2 = combined_pipeline.predict(test_df) + + # Verify that the prediction from the combined Pipeline + # matches the prediction from the original two Pipelines. + self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score']) + self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score']) + + + def test_three_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(self): + """ + This test verifies that three models can be combined + even if the transform increases the number of columns. + """ + # Create and fit a RangeFilter transform using the training + # data and use it to transform the training data. + transform_pipeline_1 = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2']) + df = transform_pipeline_1.fit_transform(train_df, as_binary_data_stream=True) + + # Create and fit a OneHotVectorizer transform using + # the transformed data from the previous step and use it + # to transform the data from the previous step. + transform_pipeline_2 = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed) + transform_pipeline_2.fit(df) + df = transform_pipeline_2.transform(df, as_binary_data_stream=True) + + # Create and fit an OnlineGradientDescentRegressor using + # the transformed training data from the previous step. + predictor_pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])], + random_state=seed) + predictor_pipeline.fit(df) + + # Perform a prediction given the test data using + # the transforms and predictor defined previously. + df = transform_pipeline_1.transform(test_df, as_binary_data_stream=True) + df = transform_pipeline_2.transform(df, as_binary_data_stream=True) + result_1 = predictor_pipeline.predict(df) + + # Combine the above Pipelines in to one Pipeline and use + # the new Pipeline to get predictions given the test data. + combined_pipeline = Pipeline.combine_models(transform_pipeline_1, + transform_pipeline_2, + predictor_pipeline) + result_2 = combined_pipeline.predict(test_df) + + # Verify that the prediction from the combined Pipeline + # matches the prediction from the original two Pipelines. + self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score']) + self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score']) + + + def test_combine_two_pipelines_created_from_model_files(self): + """ + This test verifies that two models can be combined + after they are loaded from disk in to new Pipelines. + """ + # Create and fit a OneHotVectorizer transform using the + # training data and use it to transform the training data. + transform_pipeline_1 = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed) + transform_pipeline_1.fit(train_df) + df = transform_pipeline_1.transform(train_df, as_binary_data_stream=True) + + # Create and fit an OnlineGradientDescentRegressor using + # the transformed training data from the previous step. + predictor_pipeline_1 = Pipeline([OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])], + random_state=seed) + predictor_pipeline_1.fit(df) + + # Perform a prediction given the test data using + # the transform and predictor defined previously. 
+ df = transform_pipeline_1.transform(test_df, as_binary_data_stream=True) + result_1 = predictor_pipeline_1.predict(df) + + # Use the model files stored in the Pipelines + # to create new Pipelines (aka. create new Pipelines + # using the model files stored on disk). + transform_pipeline_2 = Pipeline() + transform_pipeline_2.load_model(transform_pipeline_1.model) + predictor_pipeline_2 = Pipeline() + predictor_pipeline_2.load_model(predictor_pipeline_1.model) + + # Combine the newly created Pipelines in to one Pipeline + # and use it to get predictions given the test data. + combined_pipeline = Pipeline.combine_models(transform_pipeline_2, + predictor_pipeline_2) + result_2 = combined_pipeline.predict(test_df) + + # Verify that the prediction from the combined Pipeline + # matches the prediction from the original two Pipelines. + self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score']) + self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score']) + + + def test_passing_in_a_single_transform_returns_new_pipeline(self): + transform = OneHotVectorizer() << 'c0' + transform.fit(train_df) + + combined_pipeline = Pipeline.combine_models(transform, + contains_predictor=False) + result = combined_pipeline.transform(test_df) + + self.assertEqual(len(result), 3) + self.assertEqual(len(result.columns), 4) + self.assertTrue(result.columns[0].startswith('c0.')) + self.assertTrue(result.columns[1].startswith('c0.')) + self.assertTrue(isinstance(combined_pipeline, Pipeline)) + + + def test_passing_in_a_single_predictor_returns_new_pipeline(self): + train_dropped_df = train_df.drop(['c0'], axis=1) + test_dropped_df = test_df.drop(['c0'], axis=1) + + predictor = OnlineGradientDescentRegressor(label='c2', feature=['c1']) + predictor.fit(train_dropped_df) + result_1 = predictor.predict(test_dropped_df) + + combined_pipeline = Pipeline.combine_models(predictor) + result_2 = combined_pipeline.predict(test_dropped_df) + + self.assertEqual(result_1[0], result_2.loc[0, 'Score']) + self.assertEqual(result_1[1], result_2.loc[1, 'Score']) + self.assertTrue(isinstance(combined_pipeline, Pipeline)) + + + def test_passing_in_a_single_pipeline_returns_new_pipeline(self): + pipeline = Pipeline([ + OneHotVectorizer() << 'c0', + OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + ]) + pipeline.fit(train_df) + result_1 = pipeline.predict(test_df) + + combined_pipeline = Pipeline.combine_models(pipeline) + result_2 = combined_pipeline.predict(test_df) + + self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score']) + self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score']) + self.assertTrue(isinstance(combined_pipeline, Pipeline)) + + + def test_combine_transform_and_transform(self): + transform_1 = RangeFilter(min=0.0, max=4.5) << 'c2' + df = transform_1.fit_transform(train_df) + + transform_2 = OneHotVectorizer() << 'c0' + transform_2.fit(df) + + df = transform_1.transform(test_df) + result_1 = transform_2.transform(df) + + combined_pipeline = Pipeline.combine_models(transform_1, + transform_2, + contains_predictor=False) + result_2 = combined_pipeline.transform(test_df) + + self.assertTrue(result_1.equals(result_2)) + + + def test_combine_transform_and_predictor(self): + transform = OneHotVectorizer() << 'c0' + df = transform.fit_transform(train_df, as_binary_data_stream=True) + + predictor = OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + predictor.fit(df) + + df = transform.transform(test_df, as_binary_data_stream=True) + result_1 = 
predictor.predict(df) + + combined_pipeline = Pipeline.combine_models(transform, predictor) + result_2 = combined_pipeline.predict(test_df) + + self.assertEqual(result_1[0], result_2.loc[0, 'Score']) + self.assertEqual(result_1[1], result_2.loc[1, 'Score']) + + + def test_combine_transform_and_pipeline(self): + transform = RangeFilter(min=0.0, max=4.5) << 'c2' + df = transform.fit_transform(train_df, as_binary_data_stream=True) + + pipeline = Pipeline([ + OneHotVectorizer() << 'c0', + OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + ]) + pipeline.fit(df) + + df = transform.transform(test_df, as_binary_data_stream=True) + result_1 = pipeline.predict(df) + + combined_pipeline = Pipeline.combine_models(transform, pipeline) + result_2 = combined_pipeline.predict(test_df) + + self.assertTrue(result_1.equals(result_2)) + + + def test_combine_with_classifier_trained_with_y_arg(self): + """ + Tests a sequence where the initial transform is computed + using both X and y input args. Note, any steps after the + initial transform will be operating on data where the X + and y have been combined in to one dataset. + """ + np.random.seed(0) + + df = get_dataset("infert").as_df() + + X = df.loc[:, df.columns != 'case'] + y = df['case'] + + transform = OneHotVectorizer() << 'education_str' + + # Passing in both X and y + df = transform.fit_transform(X, y, as_binary_data_stream=True) + + # NOTE: need to specify the label column here because the + # feature and label data was joined in the last step. + predictor = LogisticRegressionBinaryClassifier(label='case', feature=list(X.columns)) + predictor.fit(df) + + df = transform.transform(X, as_binary_data_stream=True) + result_1 = predictor.predict(df) + + # Combine the models and perform a prediction + combined_pipeline = Pipeline.combine_models(transform, predictor) + result_2 = combined_pipeline.predict(X) + + result_2 = result_2['PredictedLabel'].astype(np.float64) + self.assertTrue(result_1.equals(result_2)) + + + def test_combine_with_classifier_trained_with_joined_X_and_y(self): + np.random.seed(0) + + infert_df = get_dataset("infert").as_df() + feature_cols = [c for c in infert_df.columns if c != 'case'] + + transform = OneHotVectorizer() << 'education_str' + df = transform.fit_transform(infert_df, as_binary_data_stream=True) + + predictor = LogisticRegressionBinaryClassifier(label='case', feature=feature_cols) + predictor.fit(df) + + df = transform.transform(infert_df, as_binary_data_stream=True) + result_1 = predictor.predict(df) + + # Combine the models and perform a prediction + combined_pipeline = Pipeline.combine_models(transform, predictor) + result_2 = combined_pipeline.predict(infert_df) + + result_2 = result_2['PredictedLabel'].astype(np.float64) + self.assertTrue(result_1.equals(result_2)) + + + def test_combine_with_classifier_trained_with_filedatastream(self): + path = get_dataset('infert').as_filepath() + + data = FileDataStream.read_csv(path) + + transform = OneHotVectorizer(columns={'edu': 'education'}) + df = transform.fit_transform(data, as_binary_data_stream=True) + + feature_cols = ['parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum', 'pooled.stratum'] + predictor = LogisticRegressionBinaryClassifier(feature=feature_cols, label='case') + predictor.fit(df) + + data = FileDataStream.read_csv(path) + df = transform.transform(data, as_binary_data_stream=True) + result_1 = predictor.predict(df) + + data = FileDataStream.read_csv(path) + combined_pipeline = Pipeline.combine_models(transform, predictor) + result_2 
= combined_pipeline.predict(data) + + result_1 = result_1.astype(np.int32) + result_2 = result_2['PredictedLabel'].astype(np.int32) + self.assertTrue(result_1.equals(result_2)) + + +if __name__ == '__main__': + unittest.main() + diff --git a/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py b/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py index 21aa24a0..f6cc1c70 100644 --- a/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py +++ b/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py @@ -28,6 +28,14 @@ X_train, X_test, y_train, y_test = \ train_test_split(features, labels) +# 3 class dataset with integer labels +np.random.seed(0) +df = get_dataset("iris").as_df() +df.drop(['Species'], inplace=True, axis=1) +features_3class_int, labels_3class_int = split_features_and_label(df, 'Label') +X_train_3class_int, X_test_3class_int, y_train_3class_int, y_test_3class_int = \ + train_test_split(features_3class_int, labels_3class_int) + # 3 class dataset with string labels np.random.seed(0) df = get_dataset("iris").as_df() @@ -112,9 +120,64 @@ def test_pass_predict_proba_multiclass_3class(self): s, 38.0, decimal=4, - err_msg=invalid_decision_function_output) + err_msg=invalid_predict_proba_output) assert_equal(set(clf.classes_), {'Blue', 'Green', 'Red'}) + def test_pass_predict_proba_multiclass_with_pipeline_adds_classes(self): + clf = FastLinearClassifier(number_of_threads=1) + pipeline = Pipeline([clf]) + pipeline.fit(X_train_3class, y_train_3class) + + expected_classes = {'Blue', 'Green', 'Red'} + assert_equal(set(clf.classes_), expected_classes) + assert_equal(set(pipeline.classes_), expected_classes) + + s = pipeline.predict_proba(X_test_3class).sum() + assert_almost_equal( + s, + 38.0, + decimal=4, + err_msg=invalid_predict_proba_output) + + assert_equal(set(clf.classes_), expected_classes) + assert_equal(set(pipeline.classes_), expected_classes) + + def test_pass_predict_proba_multiclass_3class_retains_classes_type(self): + clf = FastLinearClassifier(number_of_threads=1) + clf.fit(X_train_3class_int, y_train_3class_int) + s = clf.predict_proba(X_test_3class_int).sum() + assert_almost_equal( + s, + 38.0, + decimal=4, + err_msg=invalid_predict_proba_output) + assert_equal(set(clf.classes_), {0, 1, 2}) + + def test_predict_proba_multiclass_3class_no_y_input_implies_no_classes_attribute(self): + X_train = X_train_3class_int.join(y_train_3class_int) + X_test = X_test_3class_int.join(y_test_3class_int) + + clf = FastLinearClassifier(number_of_threads=1, label='Label') + clf.fit(X_train) + + if hasattr(clf, 'classes_'): + # The classes_ attribute is currently not supported + # when fitting without a y input specified. + self.fail("classes_ attribute not expected.") + + s = clf.predict_proba(X_test).sum() + assert_almost_equal( + s, + 38.0, + decimal=4, + err_msg=invalid_predict_proba_output) + + if hasattr(clf, 'classes_'): + # The classes_ attribute is currently not supported + # when predicting if no y input was specified + # during fitting.
+ self.fail("classes_ attribute not expected.") + def test_fail_predict_proba_multiclass_with_pipeline(self): check_unsupported_predict_proba(self, Pipeline( [NaiveBayesClassifier()]), X_train, y_train, X_test) @@ -174,6 +237,61 @@ def test_pass_decision_function_multiclass_3class(self): err_msg=invalid_decision_function_output) assert_equal(set(clf.classes_), {'Blue', 'Green', 'Red'}) + def test_pass_decision_function_multiclass_with_pipeline_adds_classes(self): + clf = FastLinearClassifier(number_of_threads=1) + pipeline = Pipeline([clf]) + pipeline.fit(X_train_3class, y_train_3class) + + expected_classes = {'Blue', 'Green', 'Red'} + assert_equal(set(clf.classes_), expected_classes) + assert_equal(set(pipeline.classes_), expected_classes) + + s = pipeline.decision_function(X_test_3class).sum() + assert_almost_equal( + s, + 38.0, + decimal=4, + err_msg=invalid_decision_function_output) + + assert_equal(set(clf.classes_), expected_classes) + assert_equal(set(pipeline.classes_), expected_classes) + + def test_pass_decision_function_multiclass_3class_retains_classes_type(self): + clf = FastLinearClassifier(number_of_threads=1) + clf.fit(X_train_3class_int, y_train_3class_int) + s = clf.decision_function(X_test_3class_int).sum() + assert_almost_equal( + s, + 38.0, + decimal=4, + err_msg=invalid_decision_function_output) + assert_equal(set(clf.classes_), {0, 1, 2}) + + def test_decision_function_multiclass_3class_no_y_input_implies_no_classes_attribute(self): + X_train = X_train_3class_int.join(y_train_3class_int) + X_test = X_test_3class_int.join(y_test_3class_int) + + clf = FastLinearClassifier(number_of_threads=1, label='Label') + clf.fit(X_train) + + if hasattr(clf, 'classes_'): + # The classes_ attribute is currently not supported + # when fitting when there is no y input specified. + self.fail("classes_ attribute not expected.") + + s = clf.decision_function(X_test).sum() + assert_almost_equal( + s, + 38.0, + decimal=4, + err_msg=invalid_decision_function_output) + + if hasattr(clf, 'classes_'): + # The classes_ attribute is currently not supported + # when predicting when there was no y input specified + # during fitting. 
+ self.fail("classes_ attribute not expected.") + def test_fail_decision_function_multiclass(self): check_unsupported_decision_function( self, LogisticRegressionClassifier(), X_train, y_train, X_test) diff --git a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py index 9b072af4..fb0bdc79 100644 --- a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py +++ b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py @@ -9,7 +9,7 @@ from math import isnan from nimbusml import Pipeline from nimbusml.linear_model import FastLinearRegressor -from nimbusml.preprocessing.missing_values import Filter, Handler +from nimbusml.preprocessing.missing_values import Filter, Handler, Indicator from pandas import DataFrame from sklearn.utils.testing import assert_equal, assert_true, \ assert_allclose @@ -75,6 +75,90 @@ def test_input_types(self): res['Score'].values, [ 4.965541, 0.519701, 4.992831, 3.877400, 5.020121], rtol=1e-4) + def test_input_conversion_to_float(self): + data={'f0': [0, 1, 2, 3], + 'f1': [1, 2, 3, 4], + 'f2': [1, 2, 3, 4], + 'f3': [1, 2, 3, 4], + 'f4': ['2', '3', '4', '5'], + 'f5': [4, 5, np.nan, 9]} + + data = DataFrame(data).astype({ + 'f0': np.int8, + 'f1': np.int16, + 'f2': np.int32, + 'f3': np.int64, + 'f4': str, + 'f5': np.float64}) + + # Check Indicator + xf = Indicator() + result = xf.fit_transform(data) + + assert_equal(result.loc[2, 'f5'], True) + result.loc[2, 'f5'] = False + result = ~result + self.assertTrue(result.all(axis=None)) + + # Check Filter + xf = Filter() + result = xf.fit_transform(data) + assert_equal(len(result), 3) + assert_equal(result.loc[2, 'f5'], 9.0) + + # Check Handler + xf = Handler(replace_with='Mean') + result = xf.fit_transform(data) + assert_equal(len(result), 4) + assert_equal(result.loc[2, 'f5.f5'], 6.0) + assert_equal(result.loc[2, 'f5.IsMissing.f5'], 1.0) + + def test_input_conversion_to_float_retains_other_column_types(self): + data={'f0': [0, 1, 2, 3], + 'f1': ['2', '3', '4', '5'], + 'f2': [4, 5, np.nan, 9]} + + data = DataFrame(data).astype({ + 'f0': np.int32, + 'f1': str, + 'f2': np.float64}) + + # Check Indicator + xf = Indicator(columns={'f2.ind': 'f2'}) + result = xf.fit_transform(data) + assert_equal(result.dtypes['f0'], np.int32) + assert_equal(result.dtypes['f1'], np.object) + assert_equal(result.dtypes['f2'], np.float64) + assert_equal(result.dtypes['f2.ind'], np.bool) + assert_equal(result.loc[2, 'f2.ind'], True) + assert_equal(len(result), 4) + + # Check Filter + xf = Filter(columns=['f2']) + result = xf.fit_transform(data) + assert_equal(len(result), 3) + assert_equal(result.loc[2, 'f2'], 9.0) + assert_equal(result.dtypes['f0'], np.int32) + assert_equal(result.dtypes['f1'], np.object) + assert_equal(result.dtypes['f2'], np.float32) + + xf = Filter(columns=['f1']) + result = xf.fit_transform(data) + assert_equal(len(result), 4) + assert_equal(result.loc[3, 'f2'], 9.0) + assert_equal(result.dtypes['f0'], np.int32) + assert_equal(result.dtypes['f1'], np.float32) + assert_equal(result.dtypes['f2'], np.float64) + + # Check Handler + xf = Handler(columns=['f2'], replace_with='Mean') + result = xf.fit_transform(data) + assert_equal(len(result), 4) + assert_equal(result.loc[2, 'f2.f2'], 6.0) + assert_equal(result.dtypes['f0'], np.int32) + assert_equal(result.dtypes['f1'], np.object) + assert_equal(result.dtypes['f2.f2'], np.float32) + if __name__ == '__main__': unittest.main() 
diff --git a/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py b/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py index 503c21a6..a08ce3b9 100644 --- a/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py +++ b/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------------------------- +import os import pickle import unittest @@ -111,6 +112,7 @@ def test_pickle_predictor(self): # Unpickle model and score. We should get the exact same accuracy as # above s = pickle.dumps(ftree) + os.remove(ftree.model_) ftree2 = pickle.loads(s) scores2 = ftree2.predict(X_test) accu2 = np.mean(y_test.values.ravel() == scores2.values) @@ -130,6 +132,7 @@ def test_pickle_transform(self): # Unpickle transform and generate output. # We should get the exact same output as above s = pickle.dumps(cat) + os.remove(cat.model_) cat2 = pickle.loads(s) out2 = cat2.transform(X_train) assert_equal( @@ -158,7 +161,10 @@ def test_pickle_pipeline(self): # Unpickle model and score. We should get the exact same accuracy as # above s = pickle.dumps(pipe) + os.remove(cat.model_) + os.remove(ftree.model_) pipe2 = pickle.loads(s) + scores2 = pipe2.predict(X_test) accu2 = np.mean(y_test.values.ravel() == scores2.values) assert_equal( diff --git a/src/python/nimbusml/tests/utils/test_exports.py b/src/python/nimbusml/tests/utils/test_exports.py index 96d1ddfa..26800725 100644 --- a/src/python/nimbusml/tests/utils/test_exports.py +++ b/src/python/nimbusml/tests/utils/test_exports.py @@ -492,6 +492,17 @@ def test_get_fit_info_fastl(self): 'Month', 'Day'], 'type': 'start'}, + {'inputs': ['Ozone'], + 'name': 'TypeConverter', + 'outputs': ['Ozone'], + 'schema_after': ['Unnamed0', + 'Ozone', + 'Solar_R', + 'Wind', + 'Temp', + 'Month', + 'Day'], + 'type': 'transform'}, {'inputs': ['Ozone'], 'name': 'Filter', 'outputs': ['Ozone'], @@ -506,7 +517,7 @@ def test_get_fit_info_fastl(self): for el in info[0]: if 'operator' in el: del el['operator'] - self.assertEqual(exp, info[0][:2]) + self.assertEqual(exp, info[0][:3]) def test_word_embedding(self): diff --git a/src/python/setup.py b/src/python/setup.py index 80f47621..3a19fc47 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -45,7 +45,7 @@ # Versions should comply with PEP440. 
For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.2.0', + version='1.3.0', description='NimbusML', long_description=long_description, @@ -115,6 +115,7 @@ 'nose>=1.3', 'pytest>=4.4.0', 'graphviz', 'imageio', ], + 'dprep': ['azureml-dataprep'], 'utils': ['graphviz', 'imageio'], }, diff --git a/src/python/setup.py.in b/src/python/setup.py.in index 07f92fe1..3ddce586 100644 --- a/src/python/setup.py.in +++ b/src/python/setup.py.in @@ -115,6 +115,7 @@ setup( 'nose>=1.3', 'pytest>=4.4.0', 'graphviz', 'imageio', ], + 'dprep': ['azureml-dataprep'], 'utils': ['graphviz', 'imageio'], }, diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index f101b1ec..9cbc09d0 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -8,6 +8,8 @@ import json import os +from nimbusml.ensemble import EnsembleClassifier +from nimbusml.ensemble import EnsembleRegressor from nimbusml.ensemble import LightGbmBinaryClassifier from nimbusml.ensemble import LightGbmClassifier from nimbusml.ensemble import LightGbmRanker @@ -73,6 +75,10 @@ # dimensional arrays, tolerance 'FastLinearClassifier': 'check_classifiers_train', 'FastForestRegressor': 'check_fit_score_takes_y', # bug + 'EnsembleClassifier': 'check_supervised_y_2d, ' + 'check_classifiers_train', + 'EnsembleRegressor': 'check_supervised_y_2d, ' + 'check_regressors_train', # bug in decision_function 'FastTreesBinaryClassifier': 'check_decision_proba_consistency', @@ -181,6 +187,8 @@ 'check_classifiers_train'] INSTANCES = { + 'EnsembleClassifier': EnsembleClassifier(num_models=3), + 'EnsembleRegressor': EnsembleRegressor(num_models=3), 'LightGbmBinaryClassifier': LightGbmBinaryClassifier( minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier( @@ -290,6 +298,14 @@ def load_json(file_path): estimator = estimator << 'F0' for check in _yield_all_checks(class_name, estimator): + # Skip check_dict_unchanged for estimators which + # update the classes_ attribute. 
For more details + # see https://github.com/microsoft/NimbusML/pull/200 + if (check.__name__ == 'check_dict_unchanged') and \ + (hasattr(estimator, 'predict_proba') or + hasattr(estimator, 'decision_function')): + continue + if check.__name__ in OMITTED_CHECKS_ALWAYS: continue if 'Binary' in class_name and check.__name__ in NOBINARY_CHECKS: diff --git a/src/python/tools/code_fixer.py b/src/python/tools/code_fixer.py index 8be5a7c1..21b6d1f4 100644 --- a/src/python/tools/code_fixer.py +++ b/src/python/tools/code_fixer.py @@ -20,6 +20,17 @@ NG_1_correct = """from ...base_transform import BaseTransform from .extractor import Ngram""" +ensemble = """from ..base_predictor import BasePredictor""" +ensemble_correct = """from ..base_predictor import BasePredictor +from .subset_selector import BootstrapSelector +from .feature_selector import AllFeatureSelector""" + +diverse_selector = """from ...internal.utils.utils import trace""" +classifier_diverse_selector = """from ...internal.utils.utils import trace +from .diversity_measure import ClassifierDisagreement""" +regressor_diverse_selector = """from ...internal.utils.utils import trace +from .diversity_measure import RegressorDisagreement""" + FM = \ """import numbers from sklearn.base import ClassifierMixin @@ -91,7 +102,25 @@ 'FactorizationMachineBinaryClassifier': (FM, FM_correct), 'OneHotHashVectorizer': (OHE, OHE_correct), 'CustomStopWordsRemover': (cust_stop, cust_stop_correct), - 'PredefinedStopWordsRemover': (pred_stop, pred_stop_correct) + 'PredefinedStopWordsRemover': (pred_stop, pred_stop_correct), + 'EnsembleClassifier': [(ensemble, ensemble_correct), + ('sampling_type = bootstrap_selector', + 'sampling_type = BootstrapSelector'), + ("feature_selector = {'Name': 'AllFeatureSelector'}", + "feature_selector = AllFeatureSelector()")], + 'EnsembleRegressor': [(ensemble, ensemble_correct), + ('sampling_type = bootstrap_selector', + 'sampling_type = BootstrapSelector'), + ("feature_selector = {'Name': 'AllFeatureSelector'}", + "feature_selector = AllFeatureSelector()")], + 'ClassifierBestDiverseSelector': [(diverse_selector, + classifier_diverse_selector), + ('diversity_metric_type = None', + 'diversity_metric_type = ClassifierDisagreement()')], + 'RegressorBestDiverseSelector': [(diverse_selector, + regressor_diverse_selector), + ('diversity_metric_type = None', + 'diversity_metric_type = RegressorDisagreement()')] } diff --git a/src/python/tools/compiler_utils.py b/src/python/tools/compiler_utils.py index c64f5af3..7771ed6c 100644 --- a/src/python/tools/compiler_utils.py +++ b/src/python/tools/compiler_utils.py @@ -129,6 +129,9 @@ def _nodes_with_presteps(self): 'MeanVarianceScaler': int_to_r4_converter, 'LogMeanVarianceScaler': int_to_r4_converter, 'Binner': int_to_r4_converter, + 'Filter': int_to_r4_converter, + 'Handler': int_to_r4_converter, + 'Indicator': int_to_r4_converter, # 'SupervisedBinner': int_to_r4_converter, # not exist in nimbusml 'IidSpikeDetector': timeseries_to_r4_converter, diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py index 80d46d22..b34bf66f 100644 --- a/src/python/tools/entrypoint_compiler.py +++ b/src/python/tools/entrypoint_compiler.py @@ -302,24 +302,13 @@ def write_api(entrypoint, kind="node", pkg_path=None, overwrite=False): dots = "..." if "." in class_dir: dots = "...." - class_imports = [ + imports = [ arg.get_import( prefix=( "%sentrypoints." 
% dots)) for arg in visible_args if arg.get_import() is not None] - class_imports = '\n'.join(class_imports) - - dots = "..." - if "." in class_dir: - dots = "...." - core_class_imports = [ - arg.get_import( - prefix=( - "%sentrypoints." % - dots)) for arg in visible_args if - arg.get_import() is not None] - core_class_imports = '\n'.join(core_class_imports) + imports = '\n'.join(imports) # write the class to a file py_path = module_to_path(class_dir, pkg_path) @@ -377,7 +366,7 @@ def write_api(entrypoint, kind="node", pkg_path=None, overwrite=False): class_file, class_dir, banner, - core_class_imports, + imports, class_args, core_args_map, entrypoint_args_map, @@ -1503,7 +1492,15 @@ def parse_arg(argument, inout): "BoosterParameterFunction", "ParallelLightGBM", "AutoMlEngine", - "SearchTerminator"]: + "SearchTerminator", + "EnsembleSubsetSelector", + "EnsembleFeatureSelector", + "EnsembleMulticlassSubModelSelector", + "EnsembleMulticlassDiversityMeasure", + "EnsembleMulticlassOutputCombiner", + "EnsembleRegressionSubModelSelector", + "EnsembleRegressionDiversityMeasure", + "EnsembleRegressionOutputCombiner"]: arg_obj = ComponentArg(argument, inout) elif componentKind in ["ClassificationLossFunction", "RegressionLossFunction", diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json index 35ebb09d..60bd5321 100644 --- a/src/python/tools/manifest.json +++ b/src/python/tools/manifest.json @@ -2275,9 +2275,10 @@ "Name": "Model", "Type": "TransformModel", "Desc": "Model that needs to be converted to ONNX format.", - "Required": true, + "Required": false, "SortOrder": 10.0, - "IsNullable": false + "IsNullable": false, + "Default": null }, { "Name": "OnnxVersion", @@ -2293,6 +2294,15 @@ "SortOrder": 11.0, "IsNullable": false, "Default": "Stable" + }, + { + "Name": "PredictiveModel", + "Type": "PredictorModel", + "Desc": "Predictor model that needs to be converted to ONNX format.", + "Required": false, + "SortOrder": 12.0, + "IsNullable": false, + "Default": null } ], "Outputs": [] @@ -11535,7 +11545,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Default" + "Default": "Logloss" }, { "Name": "MaximumBinCountPerFeature", @@ -12032,7 +12042,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Default" + "Default": "Error" }, { "Name": "MaximumBinCountPerFeature", @@ -12529,7 +12539,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Default" + "Default": "NormalizedDiscountedCumulativeGain" }, { "Name": "MaximumBinCountPerFeature", @@ -12987,7 +12997,7 @@ "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": "Default" + "Default": "RootMeanSquaredError" }, { "Name": "MaximumBinCountPerFeature", @@ -22338,66 +22348,6 @@ "SortOrder": 2.0, "IsNullable": false }, - { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Training labels.", - "Aliases": [ - "label" - ], - "Required": false, - "SortOrder": 4.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "TensorFlowLabel", - "Type": "String", - "Desc": "TensorFlow label node.", - "Aliases": [ - "TFLabel" - ], - "Required": false, - "SortOrder": 5.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "OptimizationOperation", - "Type": "String", - "Desc": "The name of the optimization operation in the TensorFlow graph.", - "Aliases": [ - "OptimizationOp" - ], - "Required": false, - "SortOrder": 6.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "LossOperation", - "Type": "String", - "Desc": "The 
name of the operation in the TensorFlow graph to compute training loss (Optional)", - "Aliases": [ - "LossOp" - ], - "Required": false, - "SortOrder": 7.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "MetricOperation", - "Type": "String", - "Desc": "The name of the operation in the TensorFlow graph to compute performance metric during training (Optional)", - "Aliases": [ - "MetricOp" - ], - "Required": false, - "SortOrder": 8.0, - "IsNullable": false, - "Default": null - }, { "Name": "BatchSize", "Type": "Int", @@ -22407,60 +22357,6 @@ "IsNullable": false, "Default": 64 }, - { - "Name": "Epoch", - "Type": "Int", - "Desc": "Number of training iterations.", - "Required": false, - "SortOrder": 10.0, - "IsNullable": false, - "Default": 5 - }, - { - "Name": "LearningRateOperation", - "Type": "String", - "Desc": "The name of the operation in the TensorFlow graph which sets optimizer learning rate (Optional).", - "Required": false, - "SortOrder": 11.0, - "IsNullable": false, - "Default": null - }, - { - "Name": "LearningRate", - "Type": "Float", - "Desc": "Learning rate to use during optimization.", - "Required": false, - "SortOrder": 12.0, - "IsNullable": false, - "Default": 0.01 - }, - { - "Name": "SaveLocationOperation", - "Type": "String", - "Desc": "Name of the input in TensorFlow graph that specifiy the location for saving/restoring models from disk.", - "Required": false, - "SortOrder": 13.0, - "IsNullable": false, - "Default": "save/Const" - }, - { - "Name": "SaveOperation", - "Type": "String", - "Desc": "Name of the input in TensorFlow graph that specifiy the location for saving/restoring models from disk.", - "Required": false, - "SortOrder": 14.0, - "IsNullable": false, - "Default": "save/control_dependency" - }, - { - "Name": "ReTrain", - "Type": "Bool", - "Desc": "Retrain TensorFlow model.", - "Required": false, - "SortOrder": 15.0, - "IsNullable": false, - "Default": false - }, { "Name": "AddBatchDimensionInputs", "Type": "Bool", diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json index 58d6b3a5..acff52df 100644 --- a/src/python/tools/manifest_diff.json +++ b/src/python/tools/manifest_diff.json @@ -235,6 +235,28 @@ "Predict_Proba" : true, "Decision_Function" : true }, + { + "Name": "Trainers.LinearSvmBinaryClassifier", + "NewName": "LinearSvmBinaryClassifier", + "Module": "linear_model", + "Type": "Classifier", + "Predict_Proba" : true, + "Decision_Function" : true + }, + { + "Name": "Trainers.EnsembleClassification", + "NewName": "EnsembleClassifier", + "Module": "ensemble", + "Type": "Classifier", + "Predict_Proba" : true, + "Decision_Function" : true + }, + { + "Name": "Trainers.EnsembleRegression", + "NewName": "EnsembleRegressor", + "Module": "ensemble", + "Type": "Regressor" + }, { "Name": "Transforms.ApproximateBootstrapSampler", "NewName": "BootstrapSampler", @@ -770,6 +792,193 @@ } ] }, + { + "Kind": "EnsembleSubsetSelector", + "Components": [ + { + "Name": "AllInstanceSelector", + "NewName": "AllInstanceSelector", + "Desc": "Selects all rows for each trainer in the ensemble", + "Module": "ensemble.subset_selector", + "Type": "Component" + }, + { + "Name": "BootstrapSelector", + "NewName": "BootstrapSelector", + "Desc": "Selects a bootstrapped sample of the rows for each trainer in the ensemble", + "Module": "ensemble.subset_selector", + "Type": "Component" + }, + { + "Name": "RandomPartitionSelector", + "NewName": "RandomPartitionSelector", + "Desc": "Randomly partitions the rows for each trainer in the ensemble", + 
"Module": "ensemble.subset_selector", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleFeatureSelector", + "Components": [ + { + "Name": "AllFeatureSelector", + "NewName": "AllFeatureSelector", + "Desc": "Selects all features for each trainer in the ensemble", + "Module": "ensemble.feature_selector", + "Type": "Component" + }, + { + "Name": "RandomFeatureSelector", + "NewName": "RandomFeatureSelector", + "Desc": "Selects a random subset of features for each trainer in the ensemble", + "Module": "ensemble.feature_selector", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleMulticlassOutputCombiner", + "Components": [ + { + "Name": "MultiAverage", + "NewName": "ClassifierAverage", + "Desc": "Computes the average of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "MultiMedian", + "NewName": "ClassifierMedian", + "Desc": "Computes the median of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "MultiStacking", + "NewName": "ClassifierStacking", + "Desc": "Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "MultiVoting", + "NewName": "ClassifierVoting", + "Desc": "Computes the fraction of positive predictions for each class from all the trained models, and outputs the class with the largest number", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "MultiWeightedAverage", + "NewName": "ClassifierWeightedAverage", + "Desc": "Computes the weighted average of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleRegressionOutputCombiner", + "Components": [ + { + "Name": "Average", + "NewName": "RegressorAverage", + "Desc": "Computes the average of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "Median", + "NewName": "RegressorMedian", + "Desc": "Computes the median of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "RegressionStacking", + "NewName": "RegressorStacking", + "Desc": "Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label", + "Module": "ensemble.output_combiner", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleMulticlassSubModelSelector", + "Components": [ + { + "Name": "AllSelectorMultiClass", + "NewName": "ClassifierAllSelector", + "Desc": "Combines all the models to create the output. 
This is the default submodel selector.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + }, + { + "Name": "BestDiverseSelectorMultiClass", + "NewName": "ClassifierBestDiverseSelector", + "Desc": "Combines the models whose predictions are as diverse as possible.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + }, + { + "Name": "BestPerformanceSelectorMultiClass", + "NewName": "ClassifierBestPerformanceSelector", + "Desc": "Combines only the models with the best performance.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleRegressionSubModelSelector", + "Components": [ + { + "Name": "AllSelector", + "NewName": "RegressorAllSelector", + "Desc": "Combines all the models to create the output. This is the default submodel selector.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + }, + { + "Name": "BestDiverseSelectorRegression", + "NewName": "RegressorBestDiverseSelector", + "Desc": "Combines the models whose predictions are as diverse as possible.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + }, + { + "Name": "BestPerformanceRegressionSelector", + "NewName": "RegressorBestPerformanceSelector", + "Desc": "Combines only the models with the best performance.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleMulticlassDiversityMeasure", + "Components": [ + { + "Name": "MultiDisagreementDiversityMeasure", + "NewName": "ClassifierDisagreement", + "Desc": "A measure of disagreement in predictions between a pair of classifiers, averaged over all pairs", + "Module": "ensemble.sub_model_selector.diversity_measure", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleRegressionDiversityMeasure", + "Components": [ + { + "Name": "RegressionDisagreementDiversityMeasure", + "NewName": "RegressorDisagreement", + "Desc": "A measure of absolute value of disagreement in predictions between a pair of regressors, averaged over all pairs", + "Module": "ensemble.sub_model_selector.diversity_measure", + "Type": "Component" + } + ] + }, { "Kind": "NgramExtractor", "Components": [ diff --git a/version.txt b/version.txt index 26aaba0e..f0bb29e7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.2.0 +1.3.0
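
[Reviewer note] A short sketch of how the pieces introduced in this change fit together, grounded only in what the diff itself establishes: EnsembleClassifier/EnsembleRegressor are exposed from nimbusml.ensemble, num_models=3 mirrors the INSTANCES entries in test_estimator_checks.py, and predict_proba/decision_function are enabled for EnsembleClassifier in manifest_diff.json. The dataset below is illustrative, not from the PR; the new 'dprep' extra separately makes the optional dependency installable via pip install nimbusml[dprep].

from pandas import DataFrame
from nimbusml import Pipeline
from nimbusml.ensemble import EnsembleClassifier

# Tiny illustrative dataset (hypothetical values).
X = DataFrame({'f0': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
               'f1': [1.0, 0.9, 0.8, 0.2, 0.1, 0.0]})
y = DataFrame({'Label': [0, 0, 0, 1, 1, 1]})

# num_models=3 matches the instances registered for the estimator checks.
pipeline = Pipeline([EnsembleClassifier(num_models=3)])
pipeline.fit(X, y)

# classes_ is populated on both the estimator and the Pipeline after fit
# with an explicit y (see the predict_proba/decision_function tests above).
print(pipeline.classes_)
print(pipeline.predict_proba(X))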